diff --git a/.devops/cann.Dockerfile b/.devops/cann.Dockerfile
index acd1e26bcec..9df86d0489b 100644
--- a/.devops/cann.Dockerfile
+++ b/.devops/cann.Dockerfile
@@ -58,6 +58,7 @@ RUN mkdir -p /app/lib && \
 RUN mkdir -p /app/full && \
     cp build/bin/* /app/full/ && \
     cp *.py /app/full/ && \
+    cp -r conversion /app/full/ && \
     cp -r gguf-py /app/full/ && \
     cp -r requirements /app/full/ && \
     cp requirements.txt /app/full/
diff --git a/.devops/cpu.Dockerfile b/.devops/cpu.Dockerfile
index c8f32235d16..c19b7038bbe 100644
--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -30,6 +30,7 @@ RUN mkdir -p /app/lib && \
 RUN mkdir -p /app/full \
     && cp build/bin/* /app/full \
     && cp *.py /app/full \
+    && cp -r conversion /app/full \
     && cp -r gguf-py /app/full \
     && cp -r requirements /app/full \
     && cp requirements.txt /app/full \
diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile
index 3805ea3a009..621fe8b6a97 100644
--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -36,6 +36,7 @@ RUN mkdir -p /app/lib && \
 RUN mkdir -p /app/full \
     && cp build/bin/* /app/full \
     && cp *.py /app/full \
+    && cp -r conversion /app/full \
     && cp -r gguf-py /app/full \
     && cp -r requirements /app/full \
     && cp requirements.txt /app/full \
diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile
index 218418b80b6..b127c5cec46 100644
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -36,6 +36,7 @@ RUN mkdir -p /app/lib && \
 RUN mkdir -p /app/full \
     && cp build/bin/* /app/full \
     && cp *.py /app/full \
+    && cp -r conversion /app/full \
     && cp -r gguf-py /app/full \
     && cp -r requirements /app/full \
     && cp requirements.txt /app/full \
diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile
index a7f70b5f0df..3194294b36d 100644
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -41,6 +41,7 @@ RUN mkdir -p /app/lib && \
 RUN mkdir -p /app/full \
     && cp build/bin/* /app/full \
     && cp *.py /app/full \
+    && cp -r conversion /app/full \
     && cp -r gguf-py /app/full \
     && cp -r requirements /app/full \
     && cp requirements.txt /app/full \
diff --git a/.devops/openvino.Dockerfile b/.devops/openvino.Dockerfile
index 1266713f378..6dabdb323ca 100644
--- a/.devops/openvino.Dockerfile
+++ b/.devops/openvino.Dockerfile
@@ -81,6 +81,7 @@ RUN mkdir -p /app/lib && \
 RUN mkdir -p /app/full \
     && cp build/ReleaseOV/bin/* /app/full/ \
     && cp *.py /app/full \
+    && cp -r conversion /app/full \
     && cp -r gguf-py /app/full \
     && cp -r requirements /app/full \
     && cp requirements.txt /app/full \
diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile
index 2da15975d13..3fdf7a8e487 100644
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -53,6 +53,7 @@ RUN mkdir -p /app/lib \
 RUN mkdir -p /app/full \
     && cp build/bin/* /app/full \
     && cp *.py /app/full \
+    && cp -r conversion /app/full \
     && cp -r gguf-py /app/full \
     && cp -r requirements /app/full \
     && cp requirements.txt /app/full \
diff --git a/.devops/s390x.Dockerfile b/.devops/s390x.Dockerfile
index d36f5f3ccc5..31c2fa902d4 100644
--- a/.devops/s390x.Dockerfile
+++ b/.devops/s390x.Dockerfile
@@ -37,6 +37,7 @@ RUN --mount=type=cache,target=/root/.ccache \
 
 COPY *.py             /opt/llama.cpp/bin
 COPY .devops/tools.sh /opt/llama.cpp/bin
+COPY conversion       /opt/llama.cpp/conversion
 
 COPY gguf-py          /opt/llama.cpp/gguf-py
 COPY requirements.txt /opt/llama.cpp/gguf-py
@@ -47,9 +48,10 @@ COPY requirements     /opt/llama.cpp/gguf-py/requirements
 FROM scratch AS collector
 
 # Copy llama.cpp binaries and libraries
-COPY --from=build /opt/llama.cpp/bin     /llama.cpp/bin
-COPY --from=build /opt/llama.cpp/lib     /llama.cpp/lib
-COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
+COPY --from=build /opt/llama.cpp/bin        /llama.cpp/bin
+COPY --from=build /opt/llama.cpp/lib        /llama.cpp/lib
+COPY --from=build /opt/llama.cpp/gguf-py    /llama.cpp/gguf-py
+COPY --from=build /opt/llama.cpp/conversion /llama.cpp/conversion
 
 
 ### Base image
@@ -107,6 +109,7 @@ RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
 
 COPY --from=collector /llama.cpp/bin /app
 COPY --from=collector /llama.cpp/gguf-py /app/gguf-py
+COPY --from=collector /llama.cpp/conversion /app/conversion
 
 RUN pip install --no-cache-dir --break-system-packages \
         -r /app/gguf-py/requirements.txt
diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile
index 464ccfef1ce..138a50d7da5 100644
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -26,6 +26,7 @@ RUN mkdir -p /app/lib && \
 RUN mkdir -p /app/full \
     && cp build/bin/* /app/full \
     && cp *.py /app/full \
+    && cp -r conversion /app/full \
     && cp -r gguf-py /app/full \
     && cp -r requirements /app/full \
     && cp requirements.txt /app/full \
diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml
index c7001edf050..23150d0b619 100644
--- a/.github/ISSUE_TEMPLATE/011-bug-results.yml
+++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml
@@ -100,8 +100,8 @@ body:
       label: Relevant log output
       description: >
           Please copy and paste any relevant log output, including the command that you entered and any generated text.
-          For very long logs (thousands of lines), preferably upload them as files instead.
-          On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
+          For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose.
+          On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command.
       value: |
         <details>
         <summary>Logs</summary>
diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
index 831c98eb637..041a7cdb2ee 100644
--- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml
+++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
@@ -88,8 +88,8 @@ body:
       description: >
           If applicable, please copy and paste any relevant log output, including any generated text.
           If you are encountering problems specifically with the `llama_params_fit` module, always upload `--verbose` logs as well.
-          For very long logs (thousands of lines), please upload them as files instead.
-          On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
+          For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose.
+          On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command.
       value: |
         <details>
         <summary>Logs</summary>
diff --git a/.github/actions/linux-setup-spacemit/action.yml b/.github/actions/linux-setup-spacemit/action.yml
index e2193e8931d..39e405b6779 100644
--- a/.github/actions/linux-setup-spacemit/action.yml
+++ b/.github/actions/linux-setup-spacemit/action.yml
@@ -15,6 +15,6 @@ runs:
       id: setup
       uses: ./.github/actions/unarchive-tar
       with:
-        url: https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v${{ inputs.version }}.tar.xz
+        url: https://github.com/spacemit-com/toolchain/releases/download/v${{ inputs.version }}/spacemit-toolchain-linux-glibc-x86_64-v${{ inputs.version }}.tar.xz
         path: ${{ inputs.path }}
         strip: 1
diff --git a/.github/actions/unarchive-tar/action.yml b/.github/actions/unarchive-tar/action.yml
index b97e402f46a..3d2f9be7bdd 100644
--- a/.github/actions/unarchive-tar/action.yml
+++ b/.github/actions/unarchive-tar/action.yml
@@ -24,4 +24,4 @@ runs:
       run: |
         mkdir -p ${{ inputs.path }}
         cd ${{ inputs.path }}
-        curl --no-progress-meter ${{ inputs.url }} | tar -${{ inputs.type }}x --strip-components=${{ inputs.strip }}
+        curl --no-progress-meter -L ${{ inputs.url }} | tar -${{ inputs.type }}x --strip-components=${{ inputs.strip }}
diff --git a/.github/actions/windows-setup-cuda/action.yml b/.github/actions/windows-setup-cuda/action.yml
index 6ad61582a50..43c63ce44f0 100644
--- a/.github/actions/windows-setup-cuda/action.yml
+++ b/.github/actions/windows-setup-cuda/action.yml
@@ -96,3 +96,34 @@ runs:
           echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
           echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
           echo "CUDA_PATH_V13_1=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+
+    - name: Install Cuda Toolkit 13.3
+      if: ${{ inputs.cuda_version == '13.3' }}
+      shell: pwsh
+      run: |
+          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3"
+          choco install unzip -y
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_crt/windows-x86_64/cuda_crt-windows-x86_64-13.3.33-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-13.3.29-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-13.3.33-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-13.3.33-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-13.5.1.27-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libnvvm/windows-x86_64/libnvvm-windows-x86_64-13.3.33-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-13.3.29-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-13.3.27-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-13.3.27-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cccl/windows-x86_64/cccl-windows-x86_64-13.3.3.3.1-archive.zip"
+          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3"
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_crt-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_cudart-windows-x86_64-13.3.29-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_nvcc-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_nvrtc-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\libcublas-windows-x86_64-13.5.1.27-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\libnvvm-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_nvtx-windows-x86_64-13.3.29-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_profiler_api-windows-x86_64-13.3.27-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\visual_studio_integration-windows-x86_64-13.3.27-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cccl-windows-x86_64-13.3.3.3.1-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+          echo "CUDA_PATH_V13_3=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
diff --git a/.github/workflows/build-3rd-party.yml b/.github/workflows/build-3rd-party.yml
index 642d9786441..82e53dbafb3 100644
--- a/.github/workflows/build-3rd-party.yml
+++ b/.github/workflows/build-3rd-party.yml
@@ -22,9 +22,9 @@ concurrency:
 env:
   GGML_NLOOP: 3
   GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
 
 jobs:
   ubuntu-24-llguidance:
diff --git a/.github/workflows/build-and-test-snapdragon.yml b/.github/workflows/build-and-test-snapdragon.yml
index ef3fe502fa7..3e857d48e39 100644
--- a/.github/workflows/build-and-test-snapdragon.yml
+++ b/.github/workflows/build-and-test-snapdragon.yml
@@ -31,7 +31,7 @@ jobs:
   android-ndk-snapdragon:
     runs-on: ubuntu-latest
     container:
-      image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
+      image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.7'
     defaults:
       run:
         shell: bash
@@ -61,7 +61,7 @@ jobs:
   linux-iot-snapdragon:
     runs-on: ubuntu-latest
     container:
-      image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.1'
+      image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.7'
     defaults:
       run:
         shell: bash
diff --git a/.github/workflows/build-android.yml b/.github/workflows/build-android.yml
index 5d88305a4f0..713ccdc7ffb 100644
--- a/.github/workflows/build-android.yml
+++ b/.github/workflows/build-android.yml
@@ -27,9 +27,9 @@ concurrency:
 env:
   GGML_NLOOP: 3
   GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
 
 jobs:
   android:
@@ -73,6 +73,11 @@ jobs:
           fetch-depth: 0
           lfs: false
 
+      - name: Dependencies
+        run: |
+          apt-get update
+          apt-get install -y build-essential
+
       - name: Build
         id: ndk_build
         run: |
@@ -86,3 +91,53 @@ jobs:
         with:
           name: llama-cpp-android-arm64-cpu
           path: pkg-adb/llama.cpp
+
+  android-arm64:
+    runs-on: ubuntu-latest
+
+    env:
+      NDK_VERSION: "29.0.14206865"
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: android-arm64
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Set up JDK
+        uses: actions/setup-java@v5
+        with:
+          java-version: 17
+          distribution: temurin
+
+      - name: Setup Android SDK
+        uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
+        with:
+          log-accepted-android-sdk-licenses: false
+
+      - name: Install NDK
+        run: |
+          sdkmanager "ndk;${{ env.NDK_VERSION }}"
+          echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
+            -DANDROID_ABI=arm64-v8a \
+            -DANDROID_PLATFORM=android-28 \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DGGML_BACKEND_DL=ON \
+            -DGGML_NATIVE=OFF \
+            -DGGML_CPU_ALL_VARIANTS=ON \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_BORINGSSL=ON \
+            -DGGML_RPC=ON
+          time cmake --build build --config Release -j $(nproc)
diff --git a/.github/workflows/build-apple.yml b/.github/workflows/build-apple.yml
index b99e614666e..d2c99d0d5af 100644
--- a/.github/workflows/build-apple.yml
+++ b/.github/workflows/build-apple.yml
@@ -32,12 +32,12 @@ concurrency:
 env:
   GGML_NLOOP: 3
   GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
 
 jobs:
-  macOS-latest-ios:
+  macos-latest-arm64:
     runs-on: macos-latest
 
     steps:
@@ -48,7 +48,79 @@ jobs:
       - name: ccache
         uses: ggml-org/ccache-action@v1.2.21
         with:
-          key: macOS-latest-ios
+          key: macos-latest-arm64
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          cmake -B build \
+            -DCMAKE_BUILD_RPATH="@loader_path" \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_BUILD_BORINGSSL=ON \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=OFF \
+            -DGGML_METAL_SHADER_DEBUG=ON \
+            -DGGML_RPC=ON
+          time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+          leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main -E "test-llama-archs" --verbose --timeout 900
+
+  macos-latest-x64:
+    runs-on: macos-15-intel
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: macos-latest-x64
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
+          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+          cmake -B build \
+            -DCMAKE_BUILD_RPATH="@loader_path" \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_BUILD_BORINGSSL=ON \
+            -DGGML_METAL=OFF \
+            -DGGML_RPC=ON \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
+          time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+  macos-latest-ios:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: macos-latest-ios
           evict-old-files: 1d
           save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
 
@@ -59,6 +131,7 @@ jobs:
           cmake -B build -G Xcode \
             -DGGML_METAL_USE_BF16=ON \
             -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_BUILD_APP=OFF \
             -DLLAMA_BUILD_COMMON=OFF \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TOOLS=OFF \
@@ -89,6 +162,7 @@ jobs:
             -DGGML_METAL_USE_BF16=ON \
             -DGGML_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_OPENSSL=OFF \
+            -DLLAMA_BUILD_APP=OFF \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TOOLS=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
@@ -115,7 +189,7 @@ jobs:
           xcodebuild -downloadPlatform iOS
           xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
 
-  macOS-latest-tvos:
+  macos-latest-tvos:
     runs-on: macos-latest
 
     steps:
@@ -126,7 +200,7 @@ jobs:
       - name: ccache
         uses: ggml-org/ccache-action@v1.2.21
         with:
-          key: macOS-latest-tvos
+          key: macos-latest-tvos
           evict-old-files: 1d
           save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
 
@@ -138,6 +212,7 @@ jobs:
             -DGGML_METAL_USE_BF16=ON \
             -DGGML_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_BUILD_COMMON=OFF \
+            -DLLAMA_BUILD_APP=OFF \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TOOLS=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
@@ -147,7 +222,7 @@ jobs:
             -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
 
-  macOS-latest-visionos:
+  macos-latest-visionos:
     runs-on: macos-latest
 
     steps:
@@ -163,6 +238,7 @@ jobs:
             -DGGML_METAL_USE_BF16=ON \
             -DGGML_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_BUILD_COMMON=OFF \
+            -DLLAMA_BUILD_APP=OFF \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TOOLS=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
@@ -172,7 +248,7 @@ jobs:
             -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
 
-  macOS-latest-swift:
+  macos-latest-swift:
     runs-on: macos-latest
     needs: macos-latest-ios-xcode
 
@@ -188,7 +264,7 @@ jobs:
       - name: ccache
         uses: ggml-org/ccache-action@v1.2.21
         with:
-          key: macOS-latest-swift
+          key: macos-latest-swift
           evict-old-files: 1d
           save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
 
@@ -206,6 +282,7 @@ jobs:
             -DGGML_METAL_USE_BF16=ON \
             -DGGML_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_OPENSSL=OFF \
+            -DLLAMA_BUILD_APP=OFF \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TOOLS=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
diff --git a/.github/workflows/build-cache.yml b/.github/workflows/build-cache.yml
index bc0a92c7fcf..53d65f3768b 100644
--- a/.github/workflows/build-cache.yml
+++ b/.github/workflows/build-cache.yml
@@ -28,7 +28,7 @@ jobs:
         id: cache-sdk
         with:
           path: ./vulkan_sdk
-          key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
+          key: cache-gha-vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
 
       - name: Setup Vulkan SDK
         if: steps.cache-sdk.outputs.cache-hit != 'true'
@@ -54,7 +54,7 @@ jobs:
   #      id: cache-toolchain
   #      with:
   #        path: ./spacemit_toolchain
-  #        key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
+  #        key: cache-gha-spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
 
   #    - name: Setup SpacemiT Toolchain
   #      if: steps.cache-toolchain.outputs.cache-hit != 'true'
@@ -81,7 +81,7 @@ jobs:
         id: cache-openvino
         with:
           path: ./openvino_toolkit
-          key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
+          key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
 
       - name: Setup OpenVINO Toolkit
         if: steps.cache-openvino.outputs.cache-hit != 'true'
@@ -108,7 +108,7 @@ jobs:
         id: cache-rocm
         with:
           path: C:\Program Files\AMD\ROCm
-          key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
+          key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
 
       - name: Setup ROCm
         if: steps.cache-rocm.outputs.cache-hit != 'true'
diff --git a/.github/workflows/build-cann.yml b/.github/workflows/build-cann.yml
index d39b8763733..6d76ed49992 100644
--- a/.github/workflows/build-cann.yml
+++ b/.github/workflows/build-cann.yml
@@ -29,74 +29,76 @@ concurrency:
 env:
   GGML_NLOOP: 3
   GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
 
 jobs:
-  openEuler-latest-cann:
-    defaults:
-      run:
-        shell: bash -el {0}
-    strategy:
-      matrix:
-        arch: [x86, aarch64]
-        chip_type: ['910b', '310p']
-        build: ['Release']
-        use_acl_graph: ['on', 'off']
-        exclude:
-          # 310P does not support USE_ACL_GRAPH=on
-          - chip_type: '310p'
-            use_acl_graph: 'on'
-    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Free up disk space
-        uses: ggml-org/free-disk-space@v1.3.1
-        with:
-          tool-cache: true
-
-      - name: Set container image
-        id: cann-image
-        run: |
-          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
-          echo "image=${image}" >> "${GITHUB_OUTPUT}"
-
-      - name: Pull container image
-        run: docker pull "${{ steps.cann-image.outputs.image }}"
-
-      - name: Build
-        env:
-          BUILD_TYPE: ${{ matrix.build }}
-          SOC_TYPE: ascend${{ matrix.chip_type }}
-          USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
-        run: |
-          HOST_UID=$(id -u)
-          HOST_GID=$(id -g)
-
-          docker run --rm \
-            -v "${PWD}:/workspace" \
-            -w /workspace \
-            -e SOC_TYPE=${SOC_TYPE} \
-            -e BUILD_TYPE=${BUILD_TYPE} \
-            -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
-            "${{ steps.cann-image.outputs.image }}" \
-            bash -lc '
-              set -e
-              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
-              yum clean all && rm -rf /var/cache/yum
-              git config --global --add safe.directory "/workspace"
-              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
-              cmake -S . -B build \
-                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-                  -DGGML_CANN=on \
-                  -DSOC_TYPE=${SOC_TYPE} \
-                  -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
-              cmake --build build -j $(nproc)
-
-              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
-            '
+# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
+#       in order to enable it again, we have to provision dedicated runners  to run it
+#  openEuler-latest-cann:
+#    defaults:
+#      run:
+#        shell: bash -el {0}
+#    strategy:
+#      matrix:
+#        arch: [x86, aarch64]
+#        chip_type: ['910b', '310p']
+#        build: ['Release']
+#        use_acl_graph: ['on', 'off']
+#        exclude:
+#          # 310P does not support USE_ACL_GRAPH=on
+#          - chip_type: '310p'
+#            use_acl_graph: 'on'
+#    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+#    steps:
+#      - name: Checkout
+#        uses: actions/checkout@v6
+#        with:
+#          fetch-depth: 0
+#
+#      - name: Free up disk space
+#        uses: ggml-org/free-disk-space@v1.3.1
+#        with:
+#          tool-cache: true
+#
+#      - name: Set container image
+#        id: cann-image
+#        run: |
+#          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
+#          echo "image=${image}" >> "${GITHUB_OUTPUT}"
+#
+#      - name: Pull container image
+#        run: docker pull "${{ steps.cann-image.outputs.image }}"
+#
+#      - name: Build
+#        env:
+#          BUILD_TYPE: ${{ matrix.build }}
+#          SOC_TYPE: ascend${{ matrix.chip_type }}
+#          USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
+#        run: |
+#          HOST_UID=$(id -u)
+#          HOST_GID=$(id -g)
+#
+#          docker run --rm \
+#            -v "${PWD}:/workspace" \
+#            -w /workspace \
+#            -e SOC_TYPE=${SOC_TYPE} \
+#            -e BUILD_TYPE=${BUILD_TYPE} \
+#            -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
+#            "${{ steps.cann-image.outputs.image }}" \
+#            bash -lc '
+#              set -e
+#              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
+#              yum clean all && rm -rf /var/cache/yum
+#              git config --global --add safe.directory "/workspace"
+#              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+#              cmake -S . -B build \
+#                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+#                  -DGGML_CANN=on \
+#                  -DSOC_TYPE=${SOC_TYPE} \
+#                  -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
+#              cmake --build build -j $(nproc)
+#
+#              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
+#            '
diff --git a/.github/workflows/build-cmake-pkg.yml b/.github/workflows/build-cmake-pkg.yml
index 84cf8ddf48a..5becff09c1b 100644
--- a/.github/workflows/build-cmake-pkg.yml
+++ b/.github/workflows/build-cmake-pkg.yml
@@ -5,23 +5,23 @@ on:
 
 jobs:
   linux:
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, Linux, CPU]
     steps:
       - uses: actions/checkout@v6
         with:
           fetch-depth: 0
 
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y build-essential tcl cmake
-
       - name: Build
         run: |
           PREFIX="$(pwd)"/inst
-          cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
-                -DLLAMA_OPENSSL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
-                -DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
+          cmake -S . -B build \
+                -DCMAKE_PREFIX_PATH="$PREFIX" \
+                -DLLAMA_OPENSSL=OFF \
+                -DLLAMA_BUILD_TESTS=OFF \
+                -DLLAMA_BUILD_TOOLS=OFF \
+                -DLLAMA_BUILD_EXAMPLES=OFF \
+                -DLLAMA_BUILD_APP=OFF \
+                -DCMAKE_BUILD_TYPE=Release
           cmake --build build --config Release
           cmake --install build --prefix "$PREFIX" --config Release
 
diff --git a/.github/workflows/build-cross.yml b/.github/workflows/build-cross.yml
index 97c7368387e..eef78b67417 100644
--- a/.github/workflows/build-cross.yml
+++ b/.github/workflows/build-cross.yml
@@ -277,7 +277,7 @@ jobs:
 
     env:
       # Make sure this is in sync with build-cache.yml
-      SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
+      SPACEMIT_IME_TOOLCHAIN_VERSION: "1.2.4"
 
     steps:
       - uses: actions/checkout@v6
@@ -287,7 +287,7 @@ jobs:
       #  id: cache-toolchain
       #  with:
       #    path: ./spacemit_toolchain
-      #    key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
+      #    key: cache-gha-spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
 
       - name: Setup SpacemiT Toolchain
         #if: steps.cache-toolchain.outputs.cache-hit != 'true'
diff --git a/.github/workflows/build-hip.yml b/.github/workflows/build-hip.yml
new file mode 100644
index 00000000000..ff8283ae7b2
--- /dev/null
+++ b/.github/workflows/build-hip.yml
@@ -0,0 +1,167 @@
+name: CI (hip)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-hip.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+      '**/*.cu',
+      '**/*.cuh'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-hip.yml',
+      'ggml/src/ggml-cuda/**'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+
+jobs:
+
+  ubuntu-22-hip:
+    runs-on: ubuntu-22.04
+    container: rocm/dev-ubuntu-22.04:6.1.2
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: ubuntu-22-hip
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build with native CMake HIP support
+        id: cmake_build
+        run: |
+          cmake -B build -S . \
+            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
+            -DGGML_HIP_ROCWMMA_FATTN=ON \
+            -DGPU_TARGETS="gfx1030" \
+            -DGGML_HIP=ON
+          cmake --build build --config Release -j $(nproc)
+
+  windows-latest-hip:
+    runs-on: windows-2022
+
+    env:
+      # Make sure this is in sync with build-cache.yml
+      HIPSDK_INSTALLER_VERSION: "26.Q1"
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Grab rocWMMA package
+        id: grab_rocwmma
+        run: |
+          curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb"
+          7z x rocwmma.deb
+          7z x data.tar
+
+      - name: Use ROCm Installation Cache
+        uses: actions/cache@v5
+        id: cache-rocm
+        with:
+          path: C:\Program Files\AMD\ROCm
+          key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
+
+      - name: Setup ROCm
+        if: steps.cache-rocm.outputs.cache-hit != 'true'
+        uses: ./.github/actions/windows-setup-rocm
+        with:
+          version: ${{ env.HIPSDK_INSTALLER_VERSION }}
+
+      - name: Verify ROCm
+        id: verify
+        run: |
+          # Find and test ROCm installation
+          $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
+          if (-not $clangPath) {
+            Write-Error "ROCm installation not found"
+            exit 1
+          }
+          & $clangPath.FullName --version
+
+      - name: Install ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: ${{ github.job }}
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
+          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+          cmake -G "Unix Makefiles" -B build -S . `
+            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
+            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
+            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/" `
+            -DCMAKE_BUILD_TYPE=Release `
+            -DLLAMA_BUILD_BORINGSSL=ON `
+            -DROCM_DIR="${env:HIP_PATH}" `
+            -DGGML_HIP=ON `
+            -DGGML_HIP_ROCWMMA_FATTN=ON `
+            -DGPU_TARGETS="gfx1100"  `
+            -DGGML_RPC=ON
+          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
+
+  ubuntu-22-musa:
+    runs-on: ubuntu-22.04
+    container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dependencies
+        id: depends
+        run: |
+          apt-get update
+          apt-get install -y build-essential git cmake libssl-dev
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: ubuntu-22-musa
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build with native CMake MUSA support
+        id: cmake_build
+        run: |
+          cmake -B build -S . \
+            -DGGML_MUSA=ON
+          time cmake --build build --config Release -j $(nproc)
diff --git a/.github/workflows/build-ibm.yml b/.github/workflows/build-ibm.yml
new file mode 100644
index 00000000000..d2e4f3cdaeb
--- /dev/null
+++ b/.github/workflows/build-ibm.yml
@@ -0,0 +1,150 @@
+name: CI (ibm)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-ibm.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-ibm.yml',
+      'ggml/src/ggml-cpu/**'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+
+jobs:
+
+  ubuntu-24-s390x:
+    runs-on: ubuntu-24.04-s390x
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Build Dependencies
+        id: build_depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends \
+            python3 python3-pip python3-dev python3-wheel \
+            libjpeg-dev build-essential libssl-dev \
+            git-lfs
+
+      - name: Toolchain workaround (GCC 14)
+        run: |
+          sudo apt-get install -y gcc-14 g++-14
+          echo "CC=gcc-14" >> "$GITHUB_ENV"
+          echo "CXX=g++-14" >> "$GITHUB_ENV"
+
+      - name: Python Dependencies
+        id: python_depends
+        run: |
+          export PIP_BREAK_SYSTEM_PACKAGES="1"
+          python3 -m pip install --upgrade pip setuptools
+          pip3 install ./gguf-py
+
+      - name: Swap Endianness
+        id: endianness
+        run: |
+          for f in models/*.gguf; do
+            echo YES | python3 gguf-py/gguf/scripts/gguf_convert_endian.py $f big
+          done
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DGGML_RPC=ON
+          time cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+      - name: Test llama2c (s390x)
+        id: llama2c_test_s390x
+        run: |
+          cd build
+          echo "Fetch llama2c big-endian model"
+          wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
+          ./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+
+  ubuntu-24-ppc64le:
+    runs-on: ubuntu-24.04-ppc64le
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Build Dependencies
+        id: build_depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends \
+            python3 python3-pip python3-dev python3-wheel \
+            libjpeg-dev build-essential libssl-dev \
+            git-lfs
+
+      - name: Toolchain workaround (GCC 14)
+        run: |
+          sudo apt-get install -y gcc-14 g++-14
+          echo "CC=gcc-14" >> "$GITHUB_ENV"
+          echo "CXX=g++-14" >> "$GITHUB_ENV"
+
+      - name: Python Dependencies
+        id: python_depends
+        run: |
+          export PIP_BREAK_SYSTEM_PACKAGES="1"
+          python3 -m pip install --upgrade pip setuptools
+          pip3 install ./gguf-py
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DGGML_RPC=ON
+          time cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+      - name: Test llama2c conversion
+        id: llama2c_test
+        run: |
+          cd build
+          echo "Fetch tokenizer"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
+          echo "Fetch llama2c model"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
+          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
diff --git a/.github/workflows/build-msys.yml b/.github/workflows/build-msys.yml
index 57cec7c166c..8214f2b8da8 100644
--- a/.github/workflows/build-msys.yml
+++ b/.github/workflows/build-msys.yml
@@ -15,9 +15,9 @@ concurrency:
 env:
   GGML_NLOOP: 3
   GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
 
 jobs:
   windows-msys2:
diff --git a/.github/workflows/build-opencl.yml b/.github/workflows/build-opencl.yml
new file mode 100644
index 00000000000..fccb06b8821
--- /dev/null
+++ b/.github/workflows/build-opencl.yml
@@ -0,0 +1,83 @@
+name: CI (opencl)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-opencl.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+      '**/*.cl'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-opencl.yml',
+      'ggml/src/ggml-opencl/**'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+
+jobs:
+
+  windows-latest-opencl-adreno:
+    runs-on: windows-2025
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: windows-latest-llvm-arm64-opencl-adreno
+          variant: ccache
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
+
+      - name: Install OpenCL Headers and Libs
+        id: install_opencl
+        run: |
+          git clone https://github.com/KhronosGroup/OpenCL-Headers
+          cd OpenCL-Headers
+          cmake -B build `
+            -DBUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build build --target install
+          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
+          cd OpenCL-ICD-Loader
+          cmake -B build-arm64-release `
+            -A arm64 `
+            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build build-arm64-release --target install --config release
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -S . -B build -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON -DLLAMA_BUILD_BORINGSSL=ON
+          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
diff --git a/.github/workflows/build-openvino.yml b/.github/workflows/build-openvino.yml
index f7177f6be37..47e04869ca9 100644
--- a/.github/workflows/build-openvino.yml
+++ b/.github/workflows/build-openvino.yml
@@ -29,9 +29,9 @@ concurrency:
 env:
   GGML_NLOOP: 3
   GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
 
 jobs:
   ubuntu-24-openvino:
@@ -84,7 +84,7 @@ jobs:
         id: cache-openvino
         with:
           path: ./openvino_toolkit
-          key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
+          key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
 
       - name: Setup OpenVINO Toolkit
         if: steps.cache-openvino.outputs.cache-hit != 'true'
diff --git a/.github/workflows/build-riscv.yml b/.github/workflows/build-riscv.yml
index b78b13140e5..c12aaa61fec 100644
--- a/.github/workflows/build-riscv.yml
+++ b/.github/workflows/build-riscv.yml
@@ -29,11 +29,84 @@ concurrency:
 env:
   GGML_NLOOP: 3
   GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
 
 jobs:
+  ubuntu-cpu-riscv64-native:
+    runs-on: ubuntu-24.04-riscv
+
+    steps:
+      - name: Install dependencies
+        run: |
+          # Install necessary packages
+          sudo apt-get update
+          sudo apt-get install -y libssl-dev
+
+          # Set gcc-14 and g++-14 as the default compilers
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+
+          git lfs install
+
+      - name: Check environment
+        run: |
+          uname -a
+          gcc --version
+          g++ --version
+          ldd --version
+          cmake --version
+          rustc --version
+          env
+          echo "nproc=$(nproc)"
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      # note: sparing some ccache since these jobs run on dedicated runners that are not part of the organitzation
+      #- name: ccache
+      #  uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
+      #  with:
+      #    key: ubuntu-cpu-riscv64-native
+      #    evict-old-files: 1d
+      #    save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=ON \
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DGGML_RPC=ON \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+          time cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+      - name: Test llama2c conversion
+        id: llama2c_test
+        run: |
+          cd build
+          echo "Fetch tokenizer"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
+          echo "Fetch llama2c model"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
+          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+
   ubuntu-riscv64-native-sanitizer:
     runs-on: ubuntu-24.04-riscv
 
@@ -62,12 +135,13 @@ jobs:
         id: checkout
         uses: actions/checkout@v6
 
-      - name: ccache
-        uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
-        with:
-          key: ubuntu-riscv64-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+      # note: sparing some ccache since these jobs run on dedicated runners that are not part of the organitzation
+      #- name: ccache
+      #  uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
+      #  with:
+      #    key: ubuntu-riscv64-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}
+      #    evict-old-files: 1d
+      #    save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
 
       - name: Build
         id: cmake_build
diff --git a/.github/workflows/build-rpc.yml b/.github/workflows/build-rpc.yml
new file mode 100644
index 00000000000..c1ff98770e9
--- /dev/null
+++ b/.github/workflows/build-rpc.yml
@@ -0,0 +1,67 @@
+name: CI (rpc)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-rpc.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-rpc.yml',
+      'ggml/src/ggml-rpc/**'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+
+jobs:
+
+  ubuntu-latest-rpc:
+    runs-on: ubuntu-latest
+
+    continue-on-error: true
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential libssl-dev ninja-build
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -G "Ninja" \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_RPC=ON
+          time cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose
diff --git a/.github/workflows/build-sanitize.yml b/.github/workflows/build-sanitize.yml
index c7b73d1dd0d..29f7a2922b2 100644
--- a/.github/workflows/build-sanitize.yml
+++ b/.github/workflows/build-sanitize.yml
@@ -22,66 +22,78 @@ concurrency:
 env:
   GGML_NLOOP: 3
   GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
 
 jobs:
-  ubuntu-latest-sanitizer:
-    runs-on: ubuntu-latest
+  ctest:
+    runs-on: [self-hosted, X64, CPU, Linux]
 
     continue-on-error: true
 
     strategy:
       matrix:
         sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug]
 
     steps:
       - name: Clone
         id: checkout
         uses: actions/checkout@v6
 
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-latest-sanitizer-${{ matrix.sanitizer }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+      #- name: ccache
+      #  uses: ggml-org/ccache-action@v1.2.21
+      #  with:
+      #    key: ubuntu-latest-sanitizer-${{ matrix.sanitizer }}
+      #    evict-old-files: 1d
+      #    save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
 
-      - name: Dependencies
-        id: depends
+      #- name: Dependencies
+      #  id: depends
+      #  run: |
+      #    sudo apt-get update
+      #    sudo apt-get install build-essential libssl-dev
+
+      # with UNDEFINED sanitizer, we have to build in Debug to avoid GCC 13 false-positive warnings
+      - name: Build (undefined)
+        id: cmake_build_undefined
+        if: ${{ matrix.sanitizer == 'UNDEFINED' }}
         run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libssl-dev
+          cmake -B build \
+            -DCMAKE_BUILD_TYPE=Debug \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+            -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON
+
+          cmake --build build --config Debug -j $(nproc)
 
       - name: Build
         id: cmake_build
-        if: ${{ matrix.sanitizer != 'THREAD' }}
+        if: ${{ matrix.sanitizer == 'ADDRESS' }}
         run: |
           cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
+            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
             -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+            -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON
 
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+          cmake --build build --config RelWithDebInfo -j $(nproc)
 
       - name: Build (no OpenMP)
         id: cmake_build_no_openmp
         if: ${{ matrix.sanitizer == 'THREAD' }}
         run: |
           cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
+            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
             -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
             -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
             -DGGML_OPENMP=OFF
 
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+          cmake --build build --config RelWithDebInfo -j $(nproc)
 
       - name: Test
         id: cmake_test
+        # skip run in Debug - very slow
+        if: ${{ matrix.sanitizer != 'UNDEFINED' }}
         run: |
           cd build
-          ctest -L main --verbose --timeout 900
+          ctest -L main -E tokenizer --verbose --timeout 900
diff --git a/.github/workflows/build-self-hosted.yml b/.github/workflows/build-self-hosted.yml
index 2851c45601f..9ff470ea3b7 100644
--- a/.github/workflows/build-self-hosted.yml
+++ b/.github/workflows/build-self-hosted.yml
@@ -50,29 +50,12 @@ concurrency:
 env:
   GGML_NLOOP: 3
   GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
 
 jobs:
-  determine-tag:
-    name: Determine tag name
-    runs-on: ubuntu-slim
-    outputs:
-      tag_name: ${{ steps.tag.outputs.name }}
-    steps:
-      - name: Clone
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-        env:
-          BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-
-  ggml-ci-nvidia-cuda:
-    needs: determine-tag
+  gpu-cuda:
     runs-on: [self-hosted, Linux, NVIDIA]
 
     steps:
@@ -82,14 +65,11 @@ jobs:
 
       - name: Test
         id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           nvidia-smi
-          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
-  ggml-ci-nvidia-vulkan-cm:
-    needs: determine-tag
+  gpu-vulkan-nvidia-cm:
     runs-on: [self-hosted, Linux, NVIDIA]
 
     steps:
@@ -99,14 +79,11 @@ jobs:
 
       - name: Test
         id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           vulkaninfo --summary
-          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
-  ggml-ci-nvidia-vulkan-cm2:
-    needs: determine-tag
+  gpu-vulkan-nvidia-cm2:
     runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2]
 
     steps:
@@ -116,14 +93,12 @@ jobs:
 
       - name: Test
         id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           vulkaninfo --summary
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
-  ggml-ci-nvidia-webgpu:
-    runs-on: [self-hosted, Linux, NVIDIA]
+  gpu-webgpu-nvidia:
+    runs-on: [self-hosted, Linux, NVIDIA, X64]
 
     steps:
       - name: Clone
@@ -149,10 +124,10 @@ jobs:
           GG_BUILD_WEBGPU=1 \
           GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
           GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \
-            bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+            bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   # TODO: provision AMX-compatible machine
-  #ggml-ci-cpu-amx:
+  #cpu-amx:
   #  runs-on: [self-hosted, Linux, CPU, AMX]
 
   #  steps:
@@ -163,10 +138,10 @@ jobs:
   #    - name: Test
   #      id: ggml-ci
   #      run: |
-  #        bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+  #        bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   # TODO: provision AMD GPU machine
-  # ggml-ci-amd-vulkan:
+  # amd-vulkan:
   #   runs-on: [self-hosted, Linux, AMD]
 
   #   steps:
@@ -178,10 +153,10 @@ jobs:
   #       id: ggml-ci
   #       run: |
   #         vulkaninfo --summary
-  #         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+  #         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   # TODO: provision AMD GPU machine
-  # ggml-ci-amd-rocm:
+  # amd-rocm:
   #   runs-on: [self-hosted, Linux, AMD]
 
   #   steps:
@@ -193,10 +168,9 @@ jobs:
   #       id: ggml-ci
   #       run: |
   #         amd-smi static
-  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
-  ggml-ci-mac-metal:
-    needs: determine-tag
+  gpu-metal:
     runs-on: [self-hosted, macOS, ARM64]
 
     steps:
@@ -206,13 +180,10 @@ jobs:
 
       - name: Test
         id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
-  ggml-ci-mac-webgpu:
-    needs: determine-tag
+  gpu-webgpu-apple:
     runs-on: [self-hosted, macOS, ARM64]
 
     steps:
@@ -235,14 +206,11 @@ jobs:
 
       - name: Test
         id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
             bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
-  ggml-ci-mac-vulkan:
-    needs: determine-tag
+  gpu-vulkan:
     runs-on: [self-hosted, macOS, ARM64]
 
     steps:
@@ -252,14 +220,11 @@ jobs:
 
       - name: Test
         id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           vulkaninfo --summary
           GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
-  ggml-ci-linux-intel-vulkan:
-    needs: determine-tag
+  gpu-vulkan-intel-linux:
     runs-on: [self-hosted, Linux, Intel]
 
     steps:
@@ -271,14 +236,11 @@ jobs:
 
       - name: Test
         id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           vulkaninfo --summary
           GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
-  ggml-ci-win-intel-vulkan:
-    needs: determine-tag
+  gpu-vulkan-intel-windows:
     runs-on: [self-hosted, Windows, X64, Intel]
 
     steps:
@@ -293,15 +255,13 @@ jobs:
           MSYSTEM: UCRT64
           CHERE_INVOKING: 1
           PATH: C:\msys64\ucrt64\bin;C:\msys64\usr\bin;C:\Windows\System32;${{ env.PATH }}
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           vulkaninfo --summary
           # Skip python related tests with GG_BUILD_LOW_PERF=1 since Windows MSYS2 UCRT64 currently fails to create
           # a valid python environment for testing
           LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp
 
-  ggml-ci-intel-openvino-gpu-low-perf:
-    needs: determine-tag
+  cpu-openvino-low-perf:
     runs-on: [self-hosted, Linux, Intel, OpenVINO]
 
     concurrency:
@@ -333,8 +293,120 @@ jobs:
 
       - name: Test
         id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           source ./openvino_toolkit/setupvars.sh
-          GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+          GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+  cpu-any-low-perf:
+    runs-on: [self-hosted, CPU]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+  cpu-any-high-perf:
+    runs-on: [self-hosted, CPU]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+  cpu-arm64-graviton4:
+    runs-on: ah-ubuntu_22_04-c8g_8x
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dependencies
+        id: depends
+        run: |
+          set -euxo pipefail
+          sudo apt-get update
+          sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
+          apt-get install -y \
+          build-essential \
+          python3-venv \
+          gpg \
+          wget \
+          time \
+          git-lfs
+
+          git lfs install
+
+          # install the latest cmake
+          sudo install -d /usr/share/keyrings
+          wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
+            | gpg --dearmor \
+            | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
+          echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
+            | sudo tee /etc/apt/sources.list.d/kitware.list
+          sudo apt-get update
+          sudo apt-get install -y cmake
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+  cpu-arm64-graviton4-kleidiai:
+    runs-on: ah-ubuntu_22_04-c8g_8x
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dependencies
+        id: depends
+        run: |
+          set -euxo pipefail
+          sudo apt-get update
+          sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
+          apt-get install -y \
+          build-essential \
+          python3-venv \
+          gpg \
+          wget \
+          time \
+          git-lfs
+
+          git lfs install
+
+          # install the latest cmake
+          sudo install -d /usr/share/keyrings
+          wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
+            | gpg --dearmor \
+            | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
+          echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
+            | sudo tee /etc/apt/sources.list.d/kitware.list
+          sudo apt-get update
+          sudo apt-get install -y cmake
+
+      # note: sparing some ccache since these jobs run on dedicated runners that are not part of the organitzation
+      #- name: ccache
+      #  uses: ggml-org/ccache-action@v1.2.21
+      #  with:
+      #    key: arm64-cpu-kleidiai-graviton4
+      #    evict-old-files: 1d
+      #    save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          GG_BUILD_KLEIDIAI=1 \
+          GG_BUILD_EXTRA_TESTS_0=1 \
+          bash ./ci/run.sh ./tmp/results ./tmp/mnt
diff --git a/.github/workflows/build-sycl.yml b/.github/workflows/build-sycl.yml
index 09635f64edb..b0697f2f2b9 100644
--- a/.github/workflows/build-sycl.yml
+++ b/.github/workflows/build-sycl.yml
@@ -29,132 +29,134 @@ concurrency:
 env:
   GGML_NLOOP: 3
   GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
 
 jobs:
 
-  ubuntu-24-sycl:
-    strategy:
-      matrix:
-        build: [fp32, fp16]
-        include:
-          - build: fp32
-            fp16: OFF
-          - build: fp16
-            fp16: ON
-
-    runs-on: ubuntu-24.04
-
-    env:
-      ONEAPI_ROOT: /opt/intel/oneapi/
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-      LEVEL_ZERO_VERSION: "1.28.2"
-      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
-
-    continue-on-error: true
-
-    steps:
-      - uses: actions/checkout@v6
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          cd /tmp
-          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
-          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
-
-      - name: Install Level Zero SDK
-        shell: bash
-        run: |
-          cd /tmp
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
-          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-24-sycl-${{ matrix.build }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx \
-            -DLLAMA_OPENSSL=OFF \
-            -DGGML_NATIVE=OFF \
-            -DGGML_SYCL_F16=${{ matrix.fp16 }}
-          time cmake --build build --config Release -j $(nproc)
-
-  windows-latest-sycl:
-    runs-on: windows-2022
-
-    defaults:
-      run:
-        shell: bash
-
-    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
-      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
-      - name: Install Level Zero SDK
-        shell: pwsh
-        run: |
-          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
-          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
-          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: windows-latest-sycl
-          variant: ccache
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
-
-      - name: Build
-        id: cmake_build
-        run:  examples/sycl/win-build-sycl.bat
+# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
+#       in order to enable it again, we have to provision dedicated runners  to run it
+#  ubuntu-24-sycl:
+#    strategy:
+#      matrix:
+#        build: [fp32]
+#        include:
+#          - build: fp32
+#            fp16: OFF
+#
+#    runs-on: ubuntu-24.04
+#
+#    env:
+#      ONEAPI_ROOT: /opt/intel/oneapi/
+#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+#      LEVEL_ZERO_VERSION: "1.28.2"
+#      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
+#
+#    continue-on-error: true
+#
+#    steps:
+#      - uses: actions/checkout@v6
+#
+#      - name: Use oneAPI Installation Cache
+#        uses: actions/cache@v5
+#        id: cache-sycl
+#        with:
+#          path: ${{ env.ONEAPI_ROOT }}
+#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+#
+#      - name: Download & Install oneAPI
+#        shell: bash
+#        if: steps.cache-sycl.outputs.cache-hit != 'true'
+#        run: |
+#          cd /tmp
+#          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
+#          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
+#
+#      - name: Install Level Zero SDK
+#        shell: bash
+#        run: |
+#          cd /tmp
+#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
+#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
+#          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
+#
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#
+#      - name: ccache
+#        uses: ggml-org/ccache-action@v1.2.21
+#        with:
+#          key: ubuntu-24-sycl-${{ matrix.build }}
+#          evict-old-files: 1d
+#          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+#
+#      - name: Build
+#        id: cmake_build
+#        run: |
+#          source /opt/intel/oneapi/setvars.sh
+#          cmake -B build \
+#            -G "Ninja" \
+#            -DCMAKE_BUILD_TYPE=Release \
+#            -DGGML_SYCL=ON \
+#            -DCMAKE_C_COMPILER=icx \
+#            -DCMAKE_CXX_COMPILER=icpx \
+#            -DLLAMA_OPENSSL=OFF \
+#            -DGGML_NATIVE=OFF \
+#            -DGGML_SYCL_F16=${{ matrix.fp16 }}
+#          time cmake --build build --config Release -j $(nproc)
+
+# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
+#       in order to enable it again, we have to provision dedicated runners  to run it
+#  windows-latest-sycl:
+#    runs-on: windows-2022
+#
+#    defaults:
+#      run:
+#        shell: bash
+#
+#    env:
+#      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
+#      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+#      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
+#      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#
+#      - name: Use oneAPI Installation Cache
+#        uses: actions/cache@v5
+#        id: cache-sycl
+#        with:
+#          path: ${{ env.ONEAPI_ROOT }}
+#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+#
+#      - name: Download & Install oneAPI
+#        shell: bash
+#        if: steps.cache-sycl.outputs.cache-hit != 'true'
+#        run: |
+#          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+#
+#      - name: Install Level Zero SDK
+#        shell: pwsh
+#        run: |
+#          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
+#          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
+#          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
+#
+#      - name: ccache
+#        uses: ggml-org/ccache-action@v1.2.21
+#        with:
+#          key: windows-latest-sycl
+#          variant: ccache
+#          evict-old-files: 1d
+#          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+#
+#      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
+#
+#      - name: Build
+#        id: cmake_build
+#        run:  examples/sycl/win-build-sycl.bat
diff --git a/.github/workflows/build-vulkan.yml b/.github/workflows/build-vulkan.yml
index ab32b6525ba..b44f08c6e97 100644
--- a/.github/workflows/build-vulkan.yml
+++ b/.github/workflows/build-vulkan.yml
@@ -31,9 +31,9 @@ concurrency:
 env:
   GGML_NLOOP: 3
   GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
 
 jobs:
   ubuntu-24-vulkan-llvmpipe:
@@ -68,7 +68,7 @@ jobs:
         id: cache-sdk
         with:
           path: ./vulkan_sdk
-          key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
+          key: cache-gha-vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
 
       - name: Setup Vulkan SDK
         if: steps.cache-sdk.outputs.cache-hit != 'true'
diff --git a/.github/workflows/build-webgpu.yml b/.github/workflows/build-webgpu.yml
new file mode 100644
index 00000000000..c7056358cc1
--- /dev/null
+++ b/.github/workflows/build-webgpu.yml
@@ -0,0 +1,173 @@
+name: CI (webgpu)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-webgpu.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+      '**/*.wgsl'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-webgpu.yml',
+      'ggml/src/ggml-webgpu/**'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+
+jobs:
+  macos-latest-webgpu:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: macos-latest-webgpu
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Dawn Dependency
+        id: dawn-depends
+        run: |
+          DAWN_VERSION="v20260317.182325"
+          DAWN_OWNER="google"
+          DAWN_REPO="dawn"
+          DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release"
+          echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+          curl -L -o artifact.tar.gz \
+            "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+          mkdir dawn
+          tar -xvf artifact.tar.gz -C dawn --strip-components=1
+
+      - name: Build
+        id: cmake_build
+        run: |
+          export CMAKE_PREFIX_PATH=dawn
+          cmake -B build -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
+          time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+  ubuntu-24-webgpu:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: ubuntu-24-webgpu
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo add-apt-repository -y ppa:kisak/kisak-mesa
+          sudo apt-get update -y
+          sudo apt-get install -y build-essential mesa-vulkan-drivers \
+            libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
+
+      - name: Dawn Dependency
+        id: dawn-depends
+        run: |
+          sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
+          DAWN_VERSION="v20260317.182325"
+          DAWN_OWNER="google"
+          DAWN_REPO="dawn"
+          DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
+          echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+          curl -L -o artifact.tar.gz \
+            "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+          mkdir dawn
+          tar -xvf artifact.tar.gz -C dawn --strip-components=1
+
+      - name: Build
+        id: cmake_build
+        run: |
+          export Dawn_DIR=dawn/lib64/cmake/Dawn
+          cmake -B build \
+            -DGGML_WEBGPU=ON
+          time cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          # This is using llvmpipe and runs slower than other backends
+          # test-backend-ops is too slow on llvmpipe, skip it
+          ctest -L main -E test-backend-ops --verbose --timeout 900
+
+  ubuntu-24-webgpu-wasm:
+    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: ubuntu-24-webgpu-wasm
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Install Emscripten
+        run: |
+          git clone https://github.com/emscripten-core/emsdk.git
+          cd emsdk
+          ./emsdk install latest
+          ./emsdk activate latest
+
+      - name: Fetch emdawnwebgpu
+        run: |
+          DAWN_TAG="v20260317.182325"
+          EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
+          echo "Downloading ${EMDAWN_PKG}"
+          curl -L -o emdawn.zip \
+            "https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}"
+          unzip emdawn.zip
+
+      - name: Build WASM WebGPU
+        run: |
+          source emsdk/emsdk_env.sh
+          emcmake cmake -B build-wasm \
+            -G "Ninja" \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_WEBGPU=ON \
+            -DLLAMA_OPENSSL=OFF \
+            -DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
+
+          time cmake --build build-wasm --config Release --target test-backend-ops -j $(nproc)
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 65fa24f4468..1deab2315e4 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -52,127 +52,14 @@ concurrency:
 env:
   GGML_NLOOP: 3
   GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
 
 jobs:
   build-cmake-pkg:
     uses: ./.github/workflows/build-cmake-pkg.yml
 
-  macOS-latest-arm64:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: macOS-latest-arm64
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build \
-            -DCMAKE_BUILD_RPATH="@loader_path" \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_BUILD_BORINGSSL=ON \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=OFF \
-            -DGGML_METAL_SHADER_DEBUG=ON \
-            -DGGML_RPC=ON
-          time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-          leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main -E "test-llama-archs" --verbose --timeout 900
-
-  macOS-latest-x64:
-    runs-on: macos-15-intel
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: macOS-latest-x64
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
-          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build \
-            -DCMAKE_BUILD_RPATH="@loader_path" \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_BUILD_BORINGSSL=ON \
-            -DGGML_METAL=OFF \
-            -DGGML_RPC=ON \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
-          time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-  macOS-latest-arm64-webgpu:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: macOS-latest-arm64-webgpu
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Dawn Dependency
-        id: dawn-depends
-        run: |
-          DAWN_VERSION="v20260317.182325"
-          DAWN_OWNER="google"
-          DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release"
-          echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-          curl -L -o artifact.tar.gz \
-            "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-          mkdir dawn
-          tar -xvf artifact.tar.gz -C dawn --strip-components=1
-
-      - name: Build
-        id: cmake_build
-        run: |
-          export CMAKE_PREFIX_PATH=dawn
-          cmake -B build -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
-          time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
   ubuntu-cpu:
     strategy:
       matrix:
@@ -181,10 +68,6 @@ jobs:
             os: ubuntu-22.04
           - build: 'arm64'
             os: ubuntu-24.04-arm
-          - build: 's390x'
-            os: ubuntu-24.04-s390x
-          - build: 'ppc64le'
-            os: ubuntu-24.04-ppc64le
 
     runs-on: ${{ matrix.os }}
 
@@ -194,7 +77,6 @@ jobs:
         uses: actions/checkout@v6
 
       - name: ccache
-        if: ${{ matrix.build != 's390x' && matrix.build != 'ppc64le' }}
         uses: ggml-org/ccache-action@v1.2.21
         with:
           key: ubuntu-cpu-${{ matrix.build }}
@@ -224,14 +106,6 @@ jobs:
           python3 -m pip install --upgrade pip setuptools
           pip3 install ./gguf-py
 
-      - name: Swap Endianness
-        id: endianness
-        if: ${{ matrix.build == 's390x' }}
-        run: |
-          for f in models/*.gguf; do
-            echo YES | python3 gguf-py/gguf/scripts/gguf_convert_endian.py $f big
-          done
-
       - name: Build
         id: cmake_build
         run: |
@@ -248,7 +122,6 @@ jobs:
 
       - name: Test llama2c conversion
         id: llama2c_test
-        if: ${{ matrix.build != 's390x' }}
         run: |
           cd build
           echo "Fetch tokenizer"
@@ -258,96 +131,6 @@ jobs:
           ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
           ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
 
-      - name: Test llama2c (s390x)
-        id: llama2c_test_s390x
-        if: ${{ matrix.build == 's390x' }}
-        run: |
-          cd build
-          echo "Fetch llama2c big-endian model"
-          wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
-          ./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
-
-  android-arm64:
-    runs-on: ubuntu-latest
-
-    env:
-      NDK_VERSION: "29.0.14206865"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: android-arm64
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Set up JDK
-        uses: actions/setup-java@v5
-        with:
-          java-version: 17
-          distribution: temurin
-
-      - name: Setup Android SDK
-        uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
-        with:
-          log-accepted-android-sdk-licenses: false
-
-      - name: Install NDK
-        run: |
-          sdkmanager "ndk;${{ env.NDK_VERSION }}"
-          echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
-            -DANDROID_ABI=arm64-v8a \
-            -DANDROID_PLATFORM=android-28 \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DGGML_BACKEND_DL=ON \
-            -DGGML_NATIVE=OFF \
-            -DGGML_CPU_ALL_VARIANTS=ON \
-            -DGGML_OPENMP=OFF \
-            -DLLAMA_BUILD_BORINGSSL=ON \
-            -DGGML_RPC=ON
-          time cmake --build build --config Release -j $(nproc)
-
-  ubuntu-latest-rpc:
-    runs-on: ubuntu-latest
-
-    continue-on-error: true
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libssl-dev ninja-build
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_RPC=ON
-          time cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose
-
   ubuntu-24-vulkan:
     strategy:
       matrix:
@@ -387,176 +170,6 @@ jobs:
         run: |
           time cmake --build build -j $(nproc)
 
-  ubuntu-24-webgpu:
-    runs-on: ubuntu-24.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-24-webgpu
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo add-apt-repository -y ppa:kisak/kisak-mesa
-          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers \
-            libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
-
-      - name: Get latest Vulkan SDK version
-        id: vulkan_sdk_version
-        run: |
-          echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
-
-      - name: Use Vulkan SDK Cache
-        uses: actions/cache@v5
-        id: cache-sdk
-        with:
-          path: ./vulkan_sdk
-          key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
-
-      - name: Setup Vulkan SDK
-        if: steps.cache-sdk.outputs.cache-hit != 'true'
-        uses: ./.github/actions/linux-setup-vulkan
-        with:
-          path: ./vulkan_sdk
-          version: ${{ env.VULKAN_SDK_VERSION }}
-
-      - name: Dawn Dependency
-        id: dawn-depends
-        run: |
-          sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
-          DAWN_VERSION="v20260317.182325"
-          DAWN_OWNER="google"
-          DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
-          echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-          curl -L -o artifact.tar.gz \
-            "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-          mkdir dawn
-          tar -xvf artifact.tar.gz -C dawn --strip-components=1
-
-      - name: Build
-        id: cmake_build
-        run: |
-          export Dawn_DIR=dawn/lib64/cmake/Dawn
-          cmake -B build \
-            -DGGML_WEBGPU=ON
-          time cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          # This is using llvmpipe and runs slower than other backends
-          # test-backend-ops is too slow on llvmpipe, skip it
-          ctest -L main -E test-backend-ops --verbose --timeout 900
-
-  ubuntu-24-webgpu-wasm:
-    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Install Emscripten
-        run: |
-          git clone https://github.com/emscripten-core/emsdk.git
-          cd emsdk
-          ./emsdk install latest
-          ./emsdk activate latest
-
-      - name: Fetch emdawnwebgpu
-        run: |
-          DAWN_TAG="v20260317.182325"
-          EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
-          echo "Downloading ${EMDAWN_PKG}"
-          curl -L -o emdawn.zip \
-            "https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}"
-          unzip emdawn.zip
-
-      - name: Build WASM WebGPU
-        run: |
-          source emsdk/emsdk_env.sh
-          emcmake cmake -B build-wasm \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_WEBGPU=ON \
-            -DLLAMA_OPENSSL=OFF \
-            -DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
-
-          time cmake --build build-wasm --config Release --target test-backend-ops -j $(nproc)
-
-  ubuntu-22-hip:
-    runs-on: ubuntu-22.04
-    container: rocm/dev-ubuntu-22.04:6.1.2
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-22-hip
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build with native CMake HIP support
-        id: cmake_build
-        run: |
-          cmake -B build -S . \
-            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-            -DGGML_HIP_ROCWMMA_FATTN=ON \
-            -DGPU_TARGETS="gfx1030" \
-            -DGGML_HIP=ON
-          cmake --build build --config Release -j $(nproc)
-
-  ubuntu-22-musa:
-    runs-on: ubuntu-22.04
-    container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dependencies
-        id: depends
-        run: |
-          apt-get update
-          apt-get install -y build-essential git cmake libssl-dev
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-22-musa
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build with native CMake MUSA support
-        id: cmake_build
-        run: |
-          cmake -B build -S . \
-            -DGGML_MUSA=ON
-          time cmake --build build --config Release -j $(nproc)
-
-
   windows-latest:
     runs-on: windows-2025
 
@@ -568,21 +181,18 @@ jobs:
     strategy:
       matrix:
         include:
-          - build: 'cpu-x64 (static)'
+          - build: 'x64-cpu-static'
             arch: 'x64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF'
-          - build: 'openblas-x64'
+          - build: 'x64-openblas'
             arch: 'x64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
-          - build: 'vulkan-x64'
+          - build: 'x64-vulkan'
             arch: 'x64'
-            defines: '-DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
-          - build: 'llvm-arm64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
+          - build: 'arm64'
             arch: 'arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
-          - build: 'llvm-arm64-opencl-adreno'
-            arch: 'arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
 
     steps:
       - name: Clone
@@ -599,7 +209,7 @@ jobs:
 
       - name: Download OpenBLAS
         id: get_openblas
-        if: ${{ matrix.build == 'openblas-x64' }}
+        if: ${{ matrix.build == 'x64-openblas' }}
         run: |
           curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
           curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
@@ -612,7 +222,7 @@ jobs:
 
       - name: Install Vulkan SDK
         id: get_vulkan
-        if: ${{ matrix.build == 'vulkan-x64' }}
+        if: ${{ matrix.build == 'x64-vulkan' }}
         run: |
           curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
           & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
@@ -624,26 +234,6 @@ jobs:
         run: |
           choco install ninja
 
-      - name: Install OpenCL Headers and Libs
-        id: install_opencl
-        if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
-        run: |
-          git clone https://github.com/KhronosGroup/OpenCL-Headers
-          cd OpenCL-Headers
-          cmake -B build `
-            -DBUILD_TESTING=OFF `
-            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
-            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
-          cmake --build build --target install
-          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
-          cd OpenCL-ICD-Loader
-          cmake -B build-arm64-release `
-            -A arm64 `
-            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
-            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
-          cmake --build build-arm64-release --target install --config release
-
       - name: Build
         id: cmake_build
         run: |
@@ -653,7 +243,7 @@ jobs:
 
       - name: Add libopenblas.dll
         id: add_libopenblas_dll
-        if: ${{ matrix.build == 'openblas-x64' }}
+        if: ${{ matrix.build == 'x64-openblas' }}
         run: |
           cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
           cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
@@ -763,348 +353,3 @@ jobs:
           set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
           cmake --build build --config Release -j %NINJA_JOBS% -t ggml
           cmake --build build --config Release
-
-
-  windows-latest-hip:
-    runs-on: windows-2022
-
-    env:
-      # Make sure this is in sync with build-cache.yml
-      HIPSDK_INSTALLER_VERSION: "26.Q1"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Grab rocWMMA package
-        id: grab_rocwmma
-        run: |
-          curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb"
-          7z x rocwmma.deb
-          7z x data.tar
-
-      - name: Use ROCm Installation Cache
-        uses: actions/cache@v5
-        id: cache-rocm
-        with:
-          path: C:\Program Files\AMD\ROCm
-          key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Setup ROCm
-        if: steps.cache-rocm.outputs.cache-hit != 'true'
-        uses: ./.github/actions/windows-setup-rocm
-        with:
-          version: ${{ env.HIPSDK_INSTALLER_VERSION }}
-
-      - name: Verify ROCm
-        id: verify
-        run: |
-          # Find and test ROCm installation
-          $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
-          if (-not $clangPath) {
-            Write-Error "ROCm installation not found"
-            exit 1
-          }
-          & $clangPath.FullName --version
-
-      - name: Install ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ${{ github.job }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . `
-            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/" `
-            -DCMAKE_BUILD_TYPE=Release `
-            -DLLAMA_BUILD_BORINGSSL=ON `
-            -DROCM_DIR="${env:HIP_PATH}" `
-            -DGGML_HIP=ON `
-            -DGGML_HIP_ROCWMMA_FATTN=ON `
-            -DGPU_TARGETS="gfx1100"  `
-            -DGGML_RPC=ON
-          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
-
-  ubuntu-cpu-riscv64-native:
-    runs-on: ubuntu-24.04-riscv
-
-    steps:
-      - name: Install dependencies
-        run: |
-          # Install necessary packages
-          sudo apt-get update
-          sudo apt-get install -y libssl-dev
-
-          # Set gcc-14 and g++-14 as the default compilers
-          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
-          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
-
-          git lfs install
-
-      - name: Check environment
-        run: |
-          uname -a
-          gcc --version
-          g++ --version
-          ldd --version
-          cmake --version
-          rustc --version
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
-        with:
-          key: ubuntu-cpu-riscv64-native
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_OPENMP=OFF \
-            -DLLAMA_BUILD_EXAMPLES=ON \
-            -DLLAMA_BUILD_TOOLS=ON \
-            -DLLAMA_BUILD_TESTS=ON \
-            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-            -DGGML_RPC=ON \
-            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
-
-          time cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-      - name: Test llama2c conversion
-        id: llama2c_test
-        run: |
-          cd build
-          echo "Fetch tokenizer"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
-          echo "Fetch llama2c model"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
-          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
-
-# TODO: simplify the following workflows using a matrix
-# TODO: run lighter CI on PRs and the full CI only on master (if needed)
-  ggml-ci-x64-cpu-low-perf:
-    runs-on: ubuntu-22.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ggml-ci-x64-cpu-low-perf
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-
-  ggml-ci-arm64-cpu-low-perf:
-    runs-on: ubuntu-22.04-arm
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ggml-ci-arm64-cpu-low-perf
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-
-  ggml-ci-x64-cpu-high-perf:
-    runs-on: ubuntu-22.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ggml-ci-x64-cpu-high-perf
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-
-  ggml-ci-arm64-cpu-high-perf:
-    runs-on: ubuntu-22.04-arm
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ggml-ci-arm64-cpu-high-perf
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-
-  ggml-ci-arm64-cpu-high-perf-sve:
-    runs-on: ubuntu-22.04-arm
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ggml-ci-arm64-cpu-high-perf-sve
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-
-  ggml-ci-arm64-cpu-kleidiai:
-     runs-on: ubuntu-22.04-arm
-
-     steps:
-       - name: Clone
-         id: checkout
-         uses: actions/checkout@v6
-
-       - name: ccache
-         uses: ggml-org/ccache-action@v1.2.21
-         with:
-           key: ggml-ci-arm64-cpu-kleidiai
-           evict-old-files: 1d
-           save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-       - name: Dependencies
-         id: depends
-         run: |
-           sudo apt-get update
-           sudo apt-get install -y build-essential
-
-       - name: Test
-         id: ggml-ci
-         run: |
-           GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-
-  ggml-ci-arm64-cpu-kleidiai-graviton4:
-     runs-on: ah-ubuntu_22_04-c8g_8x
-
-     steps:
-       - name: Clone
-         id: checkout
-         uses: actions/checkout@v6
-
-       - name: Dependencies
-         id: depends
-         run: |
-           set -euxo pipefail
-           sudo apt-get update
-           sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
-           apt-get install -y \
-            build-essential \
-            python3-venv \
-            gpg \
-            wget \
-            time \
-            git-lfs
-
-           git lfs install
-
-           # install the latest cmake
-           sudo install -d /usr/share/keyrings
-           wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
-            | gpg --dearmor \
-            | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
-           echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
-            | sudo tee /etc/apt/sources.list.d/kitware.list
-           sudo apt-get update
-           sudo apt-get install -y cmake
-
-       - name: ccache
-         uses: ggml-org/ccache-action@v1.2.21
-         with:
-           key: ggml-ci-arm64-cpu-kleidiai-graviton4
-           evict-old-files: 1d
-           save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-       - name: Test
-         id: ggml-ci
-         run: |
-           GG_BUILD_KLEIDIAI=1 \
-           GG_BUILD_EXTRA_TESTS_0=1 \
-           bash ./ci/run.sh ./tmp/results ./tmp/mnt
diff --git a/.github/workflows/check-vendor.yml b/.github/workflows/check-vendor.yml
index 1671ed7b8bd..015629f380c 100644
--- a/.github/workflows/check-vendor.yml
+++ b/.github/workflows/check-vendor.yml
@@ -19,7 +19,7 @@ on:
 
 jobs:
   check-vendor:
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, fast]
 
     steps:
       - name: Checkout
diff --git a/.github/workflows/code-style.yml b/.github/workflows/code-style.yml
index c88396c0a7d..50b598b84dd 100644
--- a/.github/workflows/code-style.yml
+++ b/.github/workflows/code-style.yml
@@ -15,7 +15,7 @@ concurrency:
 
 jobs:
   model-naming:
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, fast]
     steps:
       - uses: actions/checkout@v6
       - name: Check model naming conventions
diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml
index 53f6a0ccfda..59159cd4144 100644
--- a/.github/workflows/editorconfig.yml
+++ b/.github/workflows/editorconfig.yml
@@ -15,7 +15,7 @@ concurrency:
 
 jobs:
   editorconfig:
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, fast]
     steps:
       - uses: actions/checkout@v6
       - uses: editorconfig-checker/action-editorconfig-checker@840e866d93b8e032123c23bac69dece044d4d84c # v2.2.0
diff --git a/.github/workflows/hip-quality-check.yml b/.github/workflows/hip-quality-check.yml
index d00d30ed65c..5d03b177293 100644
--- a/.github/workflows/hip-quality-check.yml
+++ b/.github/workflows/hip-quality-check.yml
@@ -28,9 +28,9 @@ concurrency:
 env:
   GGML_NLOOP: 3
   GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
 
 jobs:
   ubuntu-22-hip-quality-check:
diff --git a/.github/workflows/pre-tokenizer-hashes.yml b/.github/workflows/pre-tokenizer-hashes.yml
index 7126b62b690..3e440b67d9b 100644
--- a/.github/workflows/pre-tokenizer-hashes.yml
+++ b/.github/workflows/pre-tokenizer-hashes.yml
@@ -3,16 +3,16 @@ name: Check Pre-Tokenizer Hashes
 on:
     push:
         paths:
-            - 'convert_hf_to_gguf.py'
+            - 'conversion/base.py'
             - 'convert_hf_to_gguf_update.py'
     pull_request:
         paths:
-            - 'convert_hf_to_gguf.py'
+            - 'conversion/base.py'
             - 'convert_hf_to_gguf_update.py'
 
 jobs:
     pre-tokenizer-hashes:
-        runs-on: ubuntu-slim
+        runs-on: [self-hosted, fast]
 
         steps:
         - name: Checkout repository
@@ -30,16 +30,16 @@ jobs:
 
         - name: Update pre-tokenizer hashes
           run: |
-              cp convert_hf_to_gguf.py /tmp
+              cp conversion/base.py /tmp
               .venv/bin/python convert_hf_to_gguf_update.py --check-missing
 
         - name: Check if committed pre-tokenizer hashes matches generated version
           run: |
-              if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
-                  echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
-                  echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
+              if ! diff -q conversion/base.py /tmp/base.py; then
+                  echo "Model pre-tokenizer hashes (in conversion/base.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
+                  echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated conversion/base.py along with your changes"
                   echo "Differences found:"
-                  diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
+                  diff conversion/base.py /tmp/base.py || true
                   exit 1
               fi
               echo "Model pre-tokenizer hashes are up to date."
diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml
index 1219b874592..2c7fab40b44 100644
--- a/.github/workflows/python-check-requirements.yml
+++ b/.github/workflows/python-check-requirements.yml
@@ -20,7 +20,7 @@ concurrency:
 
 jobs:
   python-check-requirements:
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, CPU, fast]
     name: check-requirements
     steps:
       - name: Check out source repository
diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml
index 1e5d64c1aee..0424f372a14 100644
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@@ -21,7 +21,7 @@ concurrency:
 
 jobs:
   flake8-lint:
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, fast]
     name: Lint
     steps:
       - name: Check out source repository
diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml
index cbeeb39d05b..14edb1a9d17 100644
--- a/.github/workflows/python-type-check.yml
+++ b/.github/workflows/python-type-check.yml
@@ -22,7 +22,7 @@ concurrency:
 
 jobs:
   python-type-check:
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, fast]
     name: python type-check
     steps:
       - name: Check out source repository
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 1880c155c85..20b2dc915a1 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -37,8 +37,30 @@ env:
 
 jobs:
 
-  macOS-cpu:
+  check_release:
+    runs-on: [self-hosted, fast]
 
+    outputs:
+      should_release: ${{ steps.check.outputs.should_release }}
+
+    steps:
+      - id: check
+        run: |
+          if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+            echo "should_release=true" >> $GITHUB_OUTPUT
+          elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/master" ]]; then
+            if echo "${{ github.event.head_commit.message }}" | grep -q '\[no release\]'; then
+              echo "should_release=false" >> $GITHUB_OUTPUT
+            else
+              echo "should_release=true" >> $GITHUB_OUTPUT
+            fi
+          else
+            echo "should_release=false" >> $GITHUB_OUTPUT
+          fi
+
+  macos-cpu:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}
     strategy:
       matrix:
         include:
@@ -46,10 +68,12 @@ jobs:
             arch: 'arm64'
             os: macos-14
             defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON"
-          - build: 'arm64-kleidiai'
-            arch: 'arm64'
-            os: macos-14
-            defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON -DGGML_CPU_KLEIDIAI=ON"
+          # TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23780)
+          #       in order to enable it again, we have to provision dedicated runners  to run it
+          #- build: 'arm64-kleidiai'
+          #  arch: 'arm64'
+          #  os: macos-14
+          #  defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON -DGGML_CPU_KLEIDIAI=ON"
           - build: 'x64'
             arch: 'x64'
             os: macos-15-intel
@@ -76,7 +100,7 @@ jobs:
       - name: ccache
         uses: ggml-org/ccache-action@v1.2.21
         with:
-          key: macOS-latest-${{ matrix.arch }}
+          key: macos-latest-${{ matrix.arch }}
           evict-old-files: 1d
 
       - name: Build
@@ -109,7 +133,8 @@ jobs:
           name: llama-bin-macos-${{ matrix.build }}.tar.gz
 
   ubuntu-cpu:
-
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}
     strategy:
       matrix:
         include:
@@ -186,6 +211,8 @@ jobs:
           name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
 
   ubuntu-vulkan:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}
 
     strategy:
       matrix:
@@ -262,6 +289,8 @@ jobs:
           name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
 
   android-arm64:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}
 
     runs-on: ubuntu-latest
 
@@ -339,6 +368,8 @@ jobs:
           name: llama-bin-android-arm64.tar.gz
 
   ubuntu-24-openvino:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}
 
     runs-on: ubuntu-24.04
 
@@ -385,7 +416,7 @@ jobs:
         id: cache-openvino
         with:
           path: ./openvino_toolkit
-          key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
+          key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
 
       - name: Setup OpenVINO Toolkit
         if: steps.cache-openvino.outputs.cache-hit != 'true'
@@ -427,6 +458,8 @@ jobs:
           name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
 
   windows-cpu:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}
 
     runs-on: windows-2025
 
@@ -487,6 +520,8 @@ jobs:
           name: llama-bin-win-cpu-${{ matrix.arch }}.zip
 
   windows:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}
 
     runs-on: windows-2025
 
@@ -577,12 +612,14 @@ jobs:
           name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
 
   windows-cuda:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}
 
     runs-on: windows-2022
 
     strategy:
       matrix:
-        cuda: ['12.4', '13.1']
+        cuda: ['12.4', '13.3']
 
     steps:
       - name: Clone
@@ -655,214 +692,218 @@ jobs:
           path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
           name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
 
-  windows-sycl:
-
-    runs-on: windows-2022
-
-    defaults:
-      run:
-        shell: bash
-
-    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
-      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
-      - name: Install Level Zero SDK
-        shell: pwsh
-        run: |
-          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
-          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
-          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: windows-latest-sycl
-          variant: ccache
-          evict-old-files: 1d
-
-      - name: Build
-        id: cmake_build
-        shell: cmd
-        run: |
-          call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
-          cmake -G "Ninja" -B build ^
-            -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
-            -DCMAKE_BUILD_TYPE=Release ^
-            -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
-            -DGGML_CPU=OFF -DGGML_SYCL=ON ^
-            -DLLAMA_BUILD_BORINGSSL=ON
-          cmake --build build --target ggml-sycl -j
-
-      - name: Build the release package
-        id: pack_artifacts
-        run: |
-          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
-
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
-          ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
-          if [ -n "$ZE_LOADER_DLL" ]; then
-            echo "Using Level Zero loader: $ZE_LOADER_DLL"
-            cp "$ZE_LOADER_DLL" ./build/bin
-          else
-            echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
-          fi
-
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
-
-          echo "cp oneAPI running time dll files to ./build/bin done"
-          7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
-
-      - name: Upload the release package
-        uses: actions/upload-artifact@v6
-        with:
-          path: llama-bin-win-sycl-x64.zip
-          name: llama-bin-win-sycl-x64.zip
-
-  ubuntu-24-sycl:
-
-    strategy:
-      matrix:
-        build: [fp32, fp16]
-        include:
-          - build: fp32
-            fp16: OFF
-          - build: fp16
-            fp16: ON
-
-    runs-on: ubuntu-24.04
-
-    env:
-      ONEAPI_ROOT: /opt/intel/oneapi/
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-      LEVEL_ZERO_VERSION: "1.28.2"
-      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          cd /tmp
-          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
-          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
-
-      - name: Install Level Zero SDK
-        shell: bash
-        run: |
-          cd /tmp
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
-          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-24-sycl-${{ matrix.build }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx \
-            -DLLAMA_OPENSSL=OFF \
-            -DGGML_NATIVE=OFF \
-            -DGGML_SYCL_F16=${{ matrix.fp16 }}
-          time cmake --build build --config Release -j $(nproc)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
-          name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
+# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
+#       in order to enable it again, we have to provision dedicated runners  to run it
+#  windows-sycl:
+#
+#    runs-on: windows-2022
+#
+#    defaults:
+#      run:
+#        shell: bash
+#
+#    env:
+#      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
+#      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+#      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
+#      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#
+#      - name: Use oneAPI Installation Cache
+#        uses: actions/cache@v5
+#        id: cache-sycl
+#        with:
+#          path: ${{ env.ONEAPI_ROOT }}
+#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+#
+#      - name: Download & Install oneAPI
+#        shell: bash
+#        if: steps.cache-sycl.outputs.cache-hit != 'true'
+#        run: |
+#          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+#
+#      - name: Install Level Zero SDK
+#        shell: pwsh
+#        run: |
+#          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
+#          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
+#          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
+#
+#      - name: Setup Node.js
+#        uses: actions/setup-node@v6
+#        with:
+#          node-version: "24"
+#          cache: "npm"
+#          cache-dependency-path: "tools/ui/package-lock.json"
+#
+#      - name: ccache
+#        uses: ggml-org/ccache-action@v1.2.21
+#        with:
+#          key: windows-latest-sycl
+#          variant: ccache
+#          evict-old-files: 1d
+#
+#      - name: Build
+#        id: cmake_build
+#        shell: cmd
+#        run: |
+#          call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+#          cmake -G "Ninja" -B build ^
+#            -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
+#            -DCMAKE_BUILD_TYPE=Release ^
+#            -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
+#            -DGGML_CPU=OFF -DGGML_SYCL=ON ^
+#            -DLLAMA_BUILD_BORINGSSL=ON
+#          cmake --build build --target ggml-sycl -j
+#
+#      - name: Build the release package
+#        id: pack_artifacts
+#        run: |
+#          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
+#
+#          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
+#
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
+#          ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
+#          if [ -n "$ZE_LOADER_DLL" ]; then
+#            echo "Using Level Zero loader: $ZE_LOADER_DLL"
+#            cp "$ZE_LOADER_DLL" ./build/bin
+#          else
+#            echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
+#          fi
+#
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
+#
+#          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
+#
+#          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
+#
+#          echo "cp oneAPI running time dll files to ./build/bin done"
+#          7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
+#
+#      - name: Upload the release package
+#        uses: actions/upload-artifact@v6
+#        with:
+#          path: llama-bin-win-sycl-x64.zip
+#          name: llama-bin-win-sycl-x64.zip
+
+# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
+#       in order to enable it again, we have to provision dedicated runners  to run it
+#  ubuntu-24-sycl:
+#
+#    strategy:
+#      matrix:
+#        build: [fp32]
+#        include:
+#          - build: fp32
+#            fp16: OFF
+#
+#    runs-on: ubuntu-24.04
+#
+#    env:
+#      ONEAPI_ROOT: /opt/intel/oneapi/
+#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+#      LEVEL_ZERO_VERSION: "1.28.2"
+#      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#        with:
+#          fetch-depth: 0
+#
+#      - name: Use oneAPI Installation Cache
+#        uses: actions/cache@v5
+#        id: cache-sycl
+#        with:
+#          path: ${{ env.ONEAPI_ROOT }}
+#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+#
+#      - name: Download & Install oneAPI
+#        shell: bash
+#        if: steps.cache-sycl.outputs.cache-hit != 'true'
+#        run: |
+#          cd /tmp
+#          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
+#          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
+#
+#      - name: Install Level Zero SDK
+#        shell: bash
+#        run: |
+#          cd /tmp
+#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
+#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
+#          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
+#
+#      - name: Setup Node.js
+#        uses: actions/setup-node@v6
+#        with:
+#          node-version: "24"
+#          cache: "npm"
+#          cache-dependency-path: "tools/ui/package-lock.json"
+#
+#      - name: ccache
+#        uses: ggml-org/ccache-action@v1.2.21
+#        with:
+#          key: ubuntu-24-sycl-${{ matrix.build }}
+#          evict-old-files: 1d
+#          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+#
+#      - name: Build
+#        id: cmake_build
+#        run: |
+#          source /opt/intel/oneapi/setvars.sh
+#          cmake -B build \
+#            -G "Ninja" \
+#            -DCMAKE_BUILD_TYPE=Release \
+#            -DGGML_SYCL=ON \
+#            -DCMAKE_C_COMPILER=icx \
+#            -DCMAKE_CXX_COMPILER=icpx \
+#            -DLLAMA_OPENSSL=OFF \
+#            -DGGML_NATIVE=OFF \
+#            -DGGML_SYCL_F16=${{ matrix.fp16 }}
+#          time cmake --build build --config Release -j $(nproc)
+#
+#      - name: Determine tag name
+#        id: tag
+#        uses: ./.github/actions/get-tag-name
+#
+#      - name: Pack artifacts
+#        id: pack_artifacts
+#        run: |
+#          cp LICENSE ./build/bin/
+#          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
+#
+#      - name: Upload artifacts
+#        uses: actions/upload-artifact@v6
+#        with:
+#          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
+#          name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
 
   ubuntu-22-rocm:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}
 
     runs-on: ubuntu-22.04
 
@@ -974,6 +1015,8 @@ jobs:
           name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
 
   windows-hip:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}
 
     runs-on: windows-2022
 
@@ -1010,7 +1053,7 @@ jobs:
         uses: actions/cache@v5
         with:
           path: C:\Program Files\AMD\ROCm
-          key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
+          key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
 
       - name: ccache
         uses: ggml-org/ccache-action@v1.2.21
@@ -1088,6 +1131,8 @@ jobs:
           name: llama-bin-win-hip-${{ matrix.name }}-x64.zip
 
   ios-xcode-build:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}
     runs-on: macos-15
 
     steps:
@@ -1108,6 +1153,7 @@ jobs:
             -DGGML_METAL_USE_BF16=ON \
             -DGGML_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_OPENSSL=OFF \
+            -DLLAMA_BUILD_APP=OFF \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TOOLS=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
@@ -1142,96 +1188,102 @@ jobs:
           path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
           name: llama-${{ steps.tag.outputs.name }}-xcframework.zip
 
-
-  openEuler-cann:
-    strategy:
-      matrix:
-        include:
-          # 910b with aclgraph (both architectures)
-          - arch: x86
-            chip_type: '910b'
-            build: 'Release'
-            use_acl_graph: 'on'
-          - arch: aarch64
-            chip_type: '910b'
-            build: 'Release'
-            use_acl_graph: 'on'
-          # 310p without aclgraph (both architectures)
-          - arch: x86
-            chip_type: '310p'
-            build: 'Release'
-            use_acl_graph: 'off'
-          - arch: aarch64
-            chip_type: '310p'
-            build: 'Release'
-            use_acl_graph: 'off'
-    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Free up disk space
-        uses: ggml-org/free-disk-space@v1.3.1
-        with:
-          tool-cache: true
-
-      - name: Set container image
-        id: cann-image
-        run: |
-          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
-          echo "image=${image}" >> "${GITHUB_OUTPUT}"
-
-      - name: Pull container image
-        run: docker pull "${{ steps.cann-image.outputs.image }}"
-
-      - name: Build
-        env:
-          BUILD_TYPE: ${{ matrix.build }}
-          SOC_TYPE: ascend${{ matrix.chip_type }}
-          USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
-        run: |
-          HOST_UID=$(id -u)
-          HOST_GID=$(id -g)
-
-          docker run --rm \
-            -v "${PWD}:/workspace" \
-            -w /workspace \
-            -e SOC_TYPE=${SOC_TYPE} \
-            -e BUILD_TYPE=${BUILD_TYPE} \
-            -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
-            "${{ steps.cann-image.outputs.image }}" \
-            bash -lc '
-              set -e
-              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
-              yum clean all && rm -rf /var/cache/yum
-              git config --global --add safe.directory "/workspace"
-              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
-              cmake -S . -B build \
-                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-                  -DGGML_CANN=on \
-                  -DSOC_TYPE=${SOC_TYPE} \
-                  -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
-              cmake --build build -j $(nproc)
-
-              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
-            '
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
-          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
+# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
+#       in order to enable it again, we have to provision dedicated runners  to run it
+#  openEuler-cann:
+#    strategy:
+#      matrix:
+#        include:
+#          # 910b with aclgraph (both architectures)
+#          - arch: x86
+#            chip_type: '910b'
+#            build: 'Release'
+#            use_acl_graph: 'on'
+#          - arch: aarch64
+#            chip_type: '910b'
+#            build: 'Release'
+#            use_acl_graph: 'on'
+#          # 310p without aclgraph (both architectures)
+#          - arch: x86
+#            chip_type: '310p'
+#            build: 'Release'
+#            use_acl_graph: 'off'
+#          - arch: aarch64
+#            chip_type: '310p'
+#            build: 'Release'
+#            use_acl_graph: 'off'
+#    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+#    steps:
+#      - name: Checkout
+#        uses: actions/checkout@v6
+#        with:
+#          fetch-depth: 0
+#
+#      - name: Free up disk space
+#        uses: ggml-org/free-disk-space@v1.3.1
+#        with:
+#          tool-cache: true
+#
+#      - name: Set container image
+#        id: cann-image
+#        run: |
+#          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
+#          echo "image=${image}" >> "${GITHUB_OUTPUT}"
+#
+#      - name: Pull container image
+#        run: docker pull "${{ steps.cann-image.outputs.image }}"
+#
+#      - name: Build
+#        env:
+#          BUILD_TYPE: ${{ matrix.build }}
+#          SOC_TYPE: ascend${{ matrix.chip_type }}
+#          USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
+#        run: |
+#          HOST_UID=$(id -u)
+#          HOST_GID=$(id -g)
+#
+#          docker run --rm \
+#            -v "${PWD}:/workspace" \
+#            -w /workspace \
+#            -e SOC_TYPE=${SOC_TYPE} \
+#            -e BUILD_TYPE=${BUILD_TYPE} \
+#            -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
+#            "${{ steps.cann-image.outputs.image }}" \
+#            bash -lc '
+#              set -e
+#              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
+#              yum clean all && rm -rf /var/cache/yum
+#              git config --global --add safe.directory "/workspace"
+#              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+#              cmake -S . -B build \
+#                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+#                  -DGGML_CANN=on \
+#                  -DSOC_TYPE=${SOC_TYPE} \
+#                  -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
+#              cmake --build build -j $(nproc)
+#
+#              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
+#            '
+#
+#      - name: Determine tag name
+#        id: tag
+#        uses: ./.github/actions/get-tag-name
+#
+#      - name: Pack artifacts
+#        run: |
+#          cp LICENSE ./build/bin/
+#          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
+#
+#      - name: Upload artifacts
+#        uses: actions/upload-artifact@v6
+#        with:
+#          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
+#          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
+
+  ui-build:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    uses: ./.github/workflows/ui-build.yml
 
   release:
     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -1247,17 +1299,18 @@ jobs:
       - windows
       - windows-cpu
       - windows-cuda
-      - windows-sycl
+      #- windows-sycl
       - windows-hip
       - ubuntu-22-rocm
       - ubuntu-cpu
       - ubuntu-vulkan
       - ubuntu-24-openvino
-      - ubuntu-24-sycl
+      #- ubuntu-24-sycl
       - android-arm64
-      - macOS-cpu
+      - macos-cpu
       - ios-xcode-build
-      - openEuler-cann
+      #- openEuler-cann
+      - ui-build
 
     outputs:
       tag_name: ${{ steps.tag.outputs.name }}
@@ -1317,6 +1370,18 @@ jobs:
           mv -v artifact/*.zip release
           mv -v artifact/*.tar.gz release
 
+      - name: Download UI build
+        id: download_ui
+        uses: actions/download-artifact@v7
+        with:
+          name: ui-build
+          path: ./ui-dist
+
+      - name: Package UI
+        id: package_ui
+        run: |
+          tar -czvf release/llama-${{ steps.tag.outputs.name }}-ui.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./ui-dist .
+
       - name: Create release
         id: create_release
         uses: ggml-org/action-create-release@v1
@@ -1333,7 +1398,7 @@ jobs:
 
             **macOS/iOS:**
             - [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
-            - [macOS Apple Silicon (arm64, KleidiAI enabled)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64-kleidiai.tar.gz)
+            - macOS Apple Silicon (arm64, KleidiAI enabled) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23780)
             - [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
             - [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.zip)
 
@@ -1345,8 +1410,7 @@ jobs:
             - [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
             - [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
             - [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
-            - [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
-            - [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz)
+            - Ubuntu x64 (SYCL FP32) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
 
             **Android:**
             - [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
@@ -1355,16 +1419,20 @@ jobs:
             - [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
             - [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
             - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
-            - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.1-x64.zip) - [CUDA 13.1 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.1-x64.zip)
+            - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.3-x64.zip) - [CUDA 13.3 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.3-x64.zip)
             - [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
-            - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
+            - Windows x64 (SYCL) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
             - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)
 
             **openEuler:**
-            - [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz)
-            - [openEuler x86 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86-aclgraph.tar.gz)
-            - [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
-            - [openEuler aarch64 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64-aclgraph.tar.gz)
+            - [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
+            - openEuler x86 (310p)
+            - openEuler x86 (910b, ACL Graph)
+            - openEuler aarch64 (310p)
+            - openEuler aarch64 (910b, ACL Graph)
+
+            **UI:**
+            - [UI](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-ui.tar.gz)
 
       - name: Upload release
         id: upload_release
diff --git a/.github/workflows/server-sanitize.yml b/.github/workflows/server-sanitize.yml
index 53c9968ee96..c0817cbba87 100644
--- a/.github/workflows/server-sanitize.yml
+++ b/.github/workflows/server-sanitize.yml
@@ -26,10 +26,10 @@ on:
     ]
 
 env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_VERBOSITY: 10
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
@@ -37,7 +37,7 @@ concurrency:
 
 jobs:
   server:
-    runs-on: ubuntu-latest
+    runs-on: [self-hosted, CPU, Linux, llama-server]
 
     strategy:
       matrix:
@@ -46,19 +46,19 @@ jobs:
       fail-fast: false
 
     steps:
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get -y install \
-            build-essential \
-            xxd \
-            git \
-            cmake \
-            curl \
-            wget \
-            language-pack-en \
-            libssl-dev
+      #- name: Dependencies
+      #  id: depends
+      #  run: |
+      #    sudo apt-get update
+      #    sudo apt-get -y install \
+      #      build-essential \
+      #      xxd \
+      #      git \
+      #      cmake \
+      #      curl \
+      #      wget \
+      #      language-pack-en \
+      #      libssl-dev
 
       - name: Clone
         id: checkout
diff --git a/.github/workflows/server-self-hosted.yml b/.github/workflows/server-self-hosted.yml
index 857c72a4619..b9baede58b3 100644
--- a/.github/workflows/server-self-hosted.yml
+++ b/.github/workflows/server-self-hosted.yml
@@ -29,10 +29,10 @@ on:
     ]
 
 env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_VERBOSITY: 10
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
@@ -91,45 +91,44 @@ jobs:
           export ${{ matrix.extra_args }}
           pytest -v -x -m "not slow"
 
-  # TODO: provision CUDA runner
-  #  server-cuda:
-  #    runs-on: [self-hosted, llama-server, Linux, NVIDIA]
-  #
-  #    name: server-cuda (${{ matrix.wf_name }})
-  #    strategy:
-  #      matrix:
-  #        build_type: [Release]
-  #        wf_name: ["GPUx1"]
-  #        include:
-  #          - build_type: Release
-  #            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
-  #            wf_name:    "GPUx1, backend-sampling"
-  #      fail-fast: false
-  #
-  #    steps:
-  #      - name: Clone
-  #        id: checkout
-  #        uses: actions/checkout@v6
-  #        with:
-  #          fetch-depth: 0
-  #          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-  #
-  #      - name: Build
-  #        id: cmake_build
-  #        run: |
-  #          cmake -B build -DGGML_SCHED_NO_REALLOC=ON
-  #          cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
-  #
-  #      - name: Tests
-  #        id: server_integration_tests
-  #        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
-  #        run: |
-  #          cd tools/server/tests
-  #          python3 -m venv venv
-  #          source venv/bin/activate
-  #          pip install -r requirements.txt
-  #          export ${{ matrix.extra_args }}
-  #          pytest -v -x -m "not slow"
+  server-cuda:
+    runs-on: [self-hosted, llama-server, Linux, NVIDIA]
+
+    name: server-cuda (${{ matrix.wf_name }})
+    strategy:
+      matrix:
+        build_type: [Release]
+        wf_name: ["GPUx1"]
+        include:
+          - build_type: Release
+            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
+            wf_name:    "GPUx1, backend-sampling"
+      fail-fast: false
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build -DGGML_CUDA=ON -DGGML_SCHED_NO_REALLOC=ON
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+      - name: Tests
+        id: server_integration_tests
+        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
+        run: |
+          cd tools/server/tests
+          python3 -m venv venv
+          source venv/bin/activate
+          pip install -r requirements.txt
+          export ${{ matrix.extra_args }}
+          pytest -v -x -m "not slow"
 
   server-kleidiai:
     runs-on: ah-ubuntu_22_04-c8g_8x
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 7b9c5a3a3d8..b30e3337044 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -44,20 +44,20 @@ on:
     ]
 
 env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_VERBOSITY: 10
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 
 jobs:
-  server:
+  ubuntu:
     runs-on: ubuntu-latest
 
-    name: server (${{ matrix.wf_name }})
+    name: ubuntu (${{ matrix.wf_name }})
     strategy:
       matrix:
         build_type: [Release]
@@ -93,18 +93,17 @@ jobs:
           fetch-depth: 0
           ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
 
-      - name: Setup Node.js
-        uses: actions/setup-node@v6
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
         with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
+          key: server-ubuntu-default
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
 
       - name: Build
         id: cmake_build
         run: |
           cmake -B build \
-            -DLLAMA_BUILD_BORINGSSL=ON \
             -DGGML_SCHED_NO_REALLOC=ON
           cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
 
@@ -131,8 +130,8 @@ jobs:
           export ${{ matrix.extra_args }}
           SLOW_TESTS=1 pytest -v -x
 
-  server-windows:
-    runs-on: windows-2022
+  windows:
+    runs-on: windows-2025
 
     steps:
       - name: Clone
@@ -142,16 +141,24 @@ jobs:
           fetch-depth: 0
           ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
 
-      - name: Setup Node.js
-        uses: actions/setup-node@v6
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
         with:
-          node-version: "24"
+          key: server-windows-default
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
 
       - name: Build
         id: cmake_build
+        shell: cmd
         run: |
-          cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_SCHED_NO_REALLOC=ON
-          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
+          cmake -B build -G "Ninja Multi-Config" ^
+            -DCMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake ^
+            -DCMAKE_BUILD_TYPE=Release ^
+            -DLLAMA_BUILD_BORINGSSL=ON ^
+            -DGGML_SCHED_NO_REALLOC=ON
+          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
+          cmake --build build --config Release -j %NINJA_JOBS% --target llama-server
 
       - name: Python setup
         id: setup_python
diff --git a/.github/workflows/ui-build.yml b/.github/workflows/ui-build.yml
index 511c96fb6fc..2653afd06c7 100644
--- a/.github/workflows/ui-build.yml
+++ b/.github/workflows/ui-build.yml
@@ -5,8 +5,7 @@ on:
 
 jobs:
   build:
-    name: Build static output
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, fast]
     env:
       BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
 
@@ -31,7 +30,7 @@ jobs:
 
       - name: Generate checksums
         run: |
-          cd build/tools/ui/dist
+          cd tools/ui/dist
           for f in *; do
             sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt
           done
@@ -40,5 +39,5 @@ jobs:
         uses: actions/upload-artifact@v6
         with:
           name: ui-build
-          path: build/tools/ui/dist/
+          path: tools/ui/dist/
           retention-days: 1
diff --git a/.github/workflows/ui-publish.yml b/.github/workflows/ui-publish.yml
index 0df358557ef..8a0d991930c 100644
--- a/.github/workflows/ui-publish.yml
+++ b/.github/workflows/ui-publish.yml
@@ -38,7 +38,7 @@ jobs:
         uses: actions/download-artifact@v7
         with:
           name: ui-build
-          path: build/tools/ui/dist/
+          path: tools/ui/dist/
 
       - name: Install Hugging Face Hub CLI
         run: pip install -U huggingface_hub
@@ -49,12 +49,12 @@ jobs:
       - name: Sync built files to Hugging Face bucket (version tag)
         run: |
           # Upload the built files to the Hugging Face bucket under the release version
-          hf buckets sync build/tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet
+          hf buckets sync tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet
 
       - name: Sync built files to Hugging Face bucket (latest)
         run: |
           # Also upload to the 'latest' directory for fallback downloads
-          hf buckets sync build/tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet
+          hf buckets sync tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet
 
       - name: Verify upload
         run: |
diff --git a/.github/workflows/ui-self-hosted.yml b/.github/workflows/ui-self-hosted.yml
new file mode 100644
index 00000000000..8a97a8284e5
--- /dev/null
+++ b/.github/workflows/ui-self-hosted.yml
@@ -0,0 +1,118 @@
+name: UI (self-hosted)
+
+# these are the same as ui.yml, but with self-hosted runners
+# the runners come with pre-installed Playwright browsers version: 1.56.1
+# the jobs are much lighter because they don't need to install node and playwright browsers
+
+on:
+  workflow_dispatch:
+    inputs:
+      sha:
+        description: 'Commit SHA1 to build'
+        required: false
+        type: string
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/ui-self-hosted.yml',
+      '.github/workflows/ui-build.yml',
+      'tools/ui/**.*',
+      'tools/server/tests/**.*'
+    ]
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/ui-self-hosted.yml',
+      '.github/workflows/ui-build.yml',
+      'tools/ui/**.*',
+      'tools/server/tests/**.*'
+    ]
+
+env:
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_VERBOSITY: 10
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  ui-build:
+    name: Build static output
+    uses: ./.github/workflows/ui-build.yml
+
+  ui-checks:
+    name: Checks
+    needs: ui-build
+    runs-on: [self-hosted, PLAYWRIGHT]
+    continue-on-error: true
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Install dependencies
+        id: setup
+        run: npm ci
+        working-directory: tools/ui
+
+      - name: Run type checking
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        run: npm run check
+        working-directory: tools/ui
+
+      - name: Run linting
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        run: npm run lint
+        working-directory: tools/ui
+
+      - name: Run Client tests
+        if: ${{ always() }}
+        run: npm run test:client
+        working-directory: tools/ui
+
+      - name: Run Unit tests
+        if: ${{ always() }}
+        run: npm run test:unit
+        working-directory: tools/ui
+
+  e2e-tests:
+    name: E2E Tests
+    needs: ui-build
+    runs-on: [self-hosted, PLAYWRIGHT]
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Install dependencies
+        id: setup
+        run: npm ci
+        working-directory: tools/ui
+
+      - name: Build application
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        run: npm run build
+        working-directory: tools/ui
+
+      - name: Build Storybook
+        if: ${{ always() }}
+        run: npm run build-storybook
+        working-directory: tools/ui
+
+      - name: Run UI tests
+        if: ${{ always() }}
+        run: npm run test:ui -- --testTimeout=60000
+        working-directory: tools/ui
+
+      - name: Run E2E tests
+        if: ${{ always() }}
+        run: npm run test:e2e
+        working-directory: tools/ui
diff --git a/.github/workflows/ui-ci.yml b/.github/workflows/ui.yml
similarity index 93%
rename from .github/workflows/ui-ci.yml
rename to .github/workflows/ui.yml
index 761a9319414..b3712e45059 100644
--- a/.github/workflows/ui-ci.yml
+++ b/.github/workflows/ui.yml
@@ -1,4 +1,4 @@
-name: CI (UI)
+name: UI
 
 on:
   workflow_dispatch:
@@ -11,23 +11,25 @@ on:
     branches:
       - master
     paths: [
-      '.github/workflows/ui-ci.yml',
+      '.github/workflows/ui.yml',
+      '.github/workflows/ui-build.yml',
       'tools/ui/**.*',
       'tools/server/tests/**.*'
     ]
   pull_request:
     types: [opened, synchronize, reopened]
     paths: [
-      '.github/workflows/ui-ci.yml',
+      '.github/workflows/ui.yml',
+      '.github/workflows/ui-build.yml',
       'tools/ui/**.*',
       'tools/server/tests/**.*'
     ]
 
 env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_VERBOSITY: 10
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
@@ -39,7 +41,7 @@ jobs:
     uses: ./.github/workflows/ui-build.yml
 
   ui-checks:
-    name: UI Checks
+    name: Checks
     needs: ui-build
     runs-on: ubuntu-latest
     continue-on-error: true
diff --git a/.github/workflows/update-ops-docs.yml b/.github/workflows/update-ops-docs.yml
index 2ab06eb9811..6e8bc1aa07c 100644
--- a/.github/workflows/update-ops-docs.yml
+++ b/.github/workflows/update-ops-docs.yml
@@ -3,18 +3,20 @@ name: Update Operations Documentation
 on:
     push:
         paths:
+            - '.github/workflows/update-ops-docs.yml'
             - 'docs/ops.md'
             - 'docs/ops/**'
             - 'scripts/create_ops_docs.py'
     pull_request:
         paths:
+            - '.github/workflows/update-ops-docs.yml'
             - 'docs/ops.md'
             - 'docs/ops/**'
             - 'scripts/create_ops_docs.py'
 
 jobs:
     update-ops-docs:
-        runs-on: ubuntu-slim
+        runs-on: [self-hosted, fast, ARM64]
 
         steps:
         - name: Checkout repository
diff --git a/.pi/gg/SYSTEM.md b/.pi/gg/SYSTEM.md
index b7597a4c3ae..06d97ae78ee 100644
--- a/.pi/gg/SYSTEM.md
+++ b/.pi/gg/SYSTEM.md
@@ -1,7 +1,7 @@
 You are a coding agent. Here are some very important rules that you must follow:
 
 General:
-- By very precise and concise when writing code, comments, explanations, etc.
+- Be very precise and concise when writing code, comments, explanations, etc.
 - PR and commit titles format: `<module> : <title>`. Lookup recents for examples
 - Don't try to build or run the code unless you are explicitly asked to do so
 - Use the `gh` CLI tool when querying PRs, issues, or other GitHub resources
@@ -16,7 +16,8 @@ Pull requests (PRs):
 - New branch names are prefixed with "gg/"
 - Before opening a pull request, ask the user to confirm the description
 - When creating a pull request, look for the repository's PR template and follow it
-- For the AI usage disclosure section, write "YES. llama.cpp + pi"
+- For the AI usage disclosure section, write "YES. llama.cpp + pi + [MODEL]"
+- Ask the user to tell you what model was used and write it in place of [MODEL]
 - Always create the pull requests in draft mode
 
 Commits:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d6d6bb0e704..edd0ea1ded0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -104,24 +104,16 @@ option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
 option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})
 
 # extra artifacts
-option(LLAMA_BUILD_TESTS            "llama: build tests"                                                                            ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_TOOLS            "llama: build tools"                                                                            ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_EXAMPLES         "llama: build examples"                                                                         ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_SERVER           "llama: build server example"                                                                   ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_UI                "llama: build the embedded Web UI for server"                                                   ON)
-option(LLAMA_USE_PREBUILT_UI         "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)"             ON)
-
-# Backward compat: when old var is set but new one isn't, forward the value
-if(DEFINED LLAMA_BUILD_WEBUI)
-    set(LLAMA_BUILD_UI ${LLAMA_BUILD_WEBUI})
-    message(DEPRECATION "LLAMA_BUILD_WEBUI is deprecated, use LLAMA_BUILD_UI instead")
-endif()
-if(DEFINED LLAMA_USE_PREBUILT_WEBUI)
-    set(LLAMA_USE_PREBUILT_UI ${LLAMA_USE_PREBUILT_WEBUI})
-    message(DEPRECATION "LLAMA_USE_PREBUILT_WEBUI is deprecated, use LLAMA_USE_PREBUILT_UI instead")
-endif()
-option(LLAMA_TOOLS_INSTALL          "llama: install tools"                                                                          ${LLAMA_TOOLS_INSTALL_DEFAULT})
-option(LLAMA_TESTS_INSTALL          "llama: install tests"                                                                          ON)
+option(LLAMA_BUILD_TESTS     "llama: build tests"                                                                ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_TOOLS     "llama: build tools"                                                                ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES  "llama: build examples"                                                             ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_SERVER    "llama: build server example"                                                       ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_APP       "llama: build the unified binary"                                                   ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_UI        "llama: build the embedded Web UI for server"                                       ON)
+option(LLAMA_USE_PREBUILT_UI "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" ON)
+
+option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
+option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
 
 # 3rd party libs
 option(LLAMA_OPENSSL    "llama: use openssl to support HTTPS" ON)
@@ -226,6 +218,10 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
     add_subdirectory(tools)
 endif()
 
+if (LLAMA_BUILD_APP)
+    add_subdirectory(app)
+endif()
+
 # Automatically add all files from the 'licenses' directory
 file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")
 
diff --git a/CODEOWNERS b/CODEOWNERS
index f92fe98ce8c..4b9d9017715 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -49,7 +49,6 @@
 /examples/parallel/                     @ggerganov
 /examples/passkey/                      @ggerganov
 /examples/retrieval/                    @ggerganov
-/examples/save-load-state/              @ggerganov
 /examples/speculative-simple/           @ggerganov
 /examples/speculative/                  @ggerganov
 /ggml/cmake/                            @ggerganov
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 99504f14f31..6881a4d3ab3 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -63,6 +63,7 @@ After submitting your PR:
 - Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
 - Let other maintainers merge their own PRs
 - When merging a PR, make sure you have a good understanding of the changes
+- If a PR does not warrant a new release, add `[no release]` in the squashed commit to spare CI resources
 - Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)
 
 Maintainers reserve the right to decline review or close pull requests for any reason, without any questions, particularly under any of the following conditions:
diff --git a/README.md b/README.md
index 71327e51453..dbe2c363a56 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,7 @@ LLM inference in C/C++
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
 - Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
 - Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
+- WebGPU support is now available in the browser, see a blog/demo introducing it [here](https://reeselevine.github.io/llamas-on-the-web/).
 
 ----
 
@@ -290,7 +291,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
 | [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
-| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
+| [WebGPU](docs/build.md#webgpu) | All |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
 | [Hexagon [In Progress]](docs/backend/snapdragon/README.md) | Snapdragon |
 | [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR |
diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt
new file mode 100644
index 00000000000..6c53ce0e4e2
--- /dev/null
+++ b/app/CMakeLists.txt
@@ -0,0 +1,20 @@
+set(TARGET llama-app)
+
+add_executable(${TARGET} llama.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)
+
+target_link_libraries(${TARGET} PRIVATE
+    llama-server-impl
+    llama-cli-impl
+    llama-completion-impl
+    llama-bench-impl
+    llama-batched-bench-impl
+    llama-fit-params-impl
+    llama-quantize-impl
+    llama-perplexity-impl
+)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} RUNTIME)
+endif()
diff --git a/app/llama.cpp b/app/llama.cpp
new file mode 100644
index 00000000000..b0b86fd47d9
--- /dev/null
+++ b/app/llama.cpp
@@ -0,0 +1,95 @@
+#include "build-info.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <vector>
+
+// visible
+int llama_server(int argc, char ** argv);
+int llama_cli(int argc, char ** argv);
+
+// hidden
+int llama_completion(int argc, char ** argv);
+int llama_bench(int argc, char ** argv);
+int llama_batched_bench(int argc, char ** argv);
+int llama_fit_params(int argc, char ** argv);
+int llama_quantize(int argc, char ** argv);
+int llama_perplexity(int argc, char ** argv);
+
+static int help(int argc, char ** argv);
+static int version(int argc, char ** argv);
+
+struct command {
+    const char * name;
+    const char * desc;
+    std::vector<std::string> aliases;
+    bool hidden;
+    int (*func)(int, char **);
+};
+
+static const command cmds[] = {
+    {"serve",         "HTTP API server",                                    {"server"},   false, llama_server       },
+    {"cli",           "Command-line interactive interface",                 {"client"},   false, llama_cli          },
+    {"completion",    "Text completion",                                    {"complete"}, true,  llama_completion   },
+    {"bench",         "Benchmark prompt processing and text generation",    {},           true,  llama_bench        },
+    {"batched-bench", "Benchmark batched decoding performance",             {},           true,  llama_batched_bench},
+    {"fit-params",    "Compute parameters to fit a model in device memory", {},           true,  llama_fit_params   },
+    {"quantize",      "Quantize a model",                                   {},           true,  llama_quantize     },
+    {"perplexity",    "Compute model perplexity and KL divergence",         {},           true,  llama_perplexity   },
+    {"version",       "Show version",                                       {},           true,  version            },
+    {"help",          "Show available commands",                            {},           true,  help               },
+};
+
+static int version(int argc, char ** argv) {
+    printf("%s\n", llama_build_info());
+    return 0;
+}
+
+static int help(int argc, char ** argv) {
+    const bool show_all = argc >= 2 && std::string(argv[1]) == "all";
+
+    printf("Usage: llama <command> [options]\n\nAvailable commands:\n");
+
+    for (const auto & cmd : cmds) {
+        if (show_all || !cmd.hidden) {
+            printf("  %-15s %s\n", cmd.name, cmd.desc);
+        }
+    }
+    printf("\nRun 'llama <command> --help' for command-specific usage.\n");
+
+    return 0;
+}
+
+static bool matches(const std::string & arg, const command & cmd) {
+    if (arg == cmd.name) {
+        return true;
+    }
+    for (const auto & alias : cmd.aliases) {
+        if (arg == alias) {
+            return true;
+        }
+    }
+    return false;
+}
+
+int main(int argc, char ** argv) {
+    const std::string arg = argc >= 2 ? argv[1] : "help";
+
+    for (const auto & cmd : cmds) {
+        if (matches(arg, cmd)) {
+
+            // router spawns children through this same binary, it needs the
+            // subcommand to relaunch as 'llama serve' and not bare options
+#ifdef _WIN32
+            _putenv_s("LLAMA_APP_CMD", cmd.name);
+#else
+            setenv("LLAMA_APP_CMD", cmd.name, 1);
+#endif
+            return cmd.func(argc - 1, argv + 1);
+        }
+    }
+
+    fprintf(stderr, "error: unknown command '%s'\n", arg.c_str());
+    return 1;
+}
diff --git a/build-xcframework.sh b/build-xcframework.sh
index c25a1ef28c1..d287d72fbd8 100755
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -7,6 +7,7 @@ VISIONOS_MIN_OS_VERSION=1.0
 TVOS_MIN_OS_VERSION=16.4
 
 BUILD_SHARED_LIBS=OFF
+LLAMA_BUILD_APP=OFF
 LLAMA_BUILD_EXAMPLES=OFF
 LLAMA_BUILD_TOOLS=OFF
 LLAMA_BUILD_TESTS=OFF
@@ -31,6 +32,7 @@ COMMON_CMAKE_ARGS=(
     -DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO
     -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
     -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
+    -DLLAMA_BUILD_APP=${LLAMA_BUILD_APP}
     -DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
     -DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
     -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
diff --git a/ci/run.sh b/ci/run.sh
index a8cbd3371d3..e4a34ff0acd 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -66,6 +66,8 @@ fi
 
 if [ ! -z ${GG_BUILD_METAL} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
+else
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF"
 fi
 
 if [ ! -z ${GG_BUILD_CUDA} ]; then
@@ -114,10 +116,7 @@ fi
 if [ ! -z ${GG_BUILD_VULKAN} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
 
-    # if on Mac, disable METAL
     if [[ "$OSTYPE" == "darwin"* ]]; then
-        CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
-
         MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION="/usr/local/lib/cmake/vulkan"
         MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION="${MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION}/SPIRV-Headers/SPIRV-HeadersConfig.cmake"
         if [[ -f "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" || -h "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" ]]; then
@@ -133,7 +132,7 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
 fi
 
 if [ ! -z ${GG_BUILD_WEBGPU} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1 -DGGML_METAL=OFF -DGGML_BLAS=OFF"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1"
 
     if [ ! -z "${GG_BUILD_WEBGPU_DAWN_PREFIX}" ]; then
         if [ -z "${CMAKE_PREFIX_PATH}" ]; then
@@ -167,6 +166,8 @@ fi
 
 if [ ! -z ${GG_BUILD_BLAS} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=${GG_BUILD_BLAS_VENDOR:-OpenBLAS}"
+else
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=OFF"
 fi
 
 if [ ! -z ${GG_BUILD_OPENVINO} ]; then
@@ -238,7 +239,7 @@ function gg_run_ctest_debug {
     (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
     (time cmake --build . --config Debug -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
 
-    (time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    (time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops|test-llama-archs" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
 
     set +e
 }
@@ -461,10 +462,10 @@ function gg_run_qwen3_0_6b {
 
     (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
 
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on  --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off                ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on                 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on  --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off                ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on                 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
 
     function check_ppl {
         qnt="$1"
@@ -700,8 +701,8 @@ function gg_sum_test_backend_ops_cpu {
 
 ## main
 
-export LLAMA_LOG_PREFIX=1
-export LLAMA_LOG_TIMESTAMPS=1
+export LLAMA_ARG_LOG_PREFIX=1
+export LLAMA_ARG_LOG_TIMESTAMPS=1
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
     # Create symlink: ./llama.cpp/models-mnt -> $MNT/models
diff --git a/common/arg.cpp b/common/arg.cpp
index 13dfd413562..bdc2e9eb4fc 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1334,12 +1334,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"-cpent", "--checkpoint-every-n-tokens"}, "N",
-        string_format("create a checkpoint every n tokens during prefill (processing), -1 to disable (default: %d)", params.checkpoint_every_nt),
+        {"-cms", "--checkpoint-min-step"}, "N",
+        string_format("minimum spacing between context checkpoints in tokens (default: %d, 0 = no minimum)", params.checkpoint_min_step),
         [](common_params & params, int value) {
-            params.checkpoint_every_nt = value;
+            if (value < 0) {
+                throw std::invalid_argument("checkpoint-min-step must be non-negative");
+            }
+            params.checkpoint_min_step = value;
         }
-    ).set_env("LLAMA_ARG_CHECKPOINT_EVERY_NT").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+    ).set_env("LLAMA_ARG_CHECKPOINT_MIN_SPACING_NT").set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-cram", "--cache-ram"}, "N",
         string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
@@ -3023,7 +3026,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.default_template_kwargs[item.key()] = item.value().dump();
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CHAT_TEMPLATE_KWARGS"));
     add_opt(common_arg(
         {"-to", "--timeout"}, "N",
         string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
@@ -3324,7 +3327,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params &, const std::string & value) {
             common_log_set_file(common_log_main(), value.c_str());
         }
-    ).set_env("LLAMA_LOG_FILE"));
+    ).set_env("LLAMA_ARG_LOG_FILE"));
     add_opt(common_arg(
         {"--log-colors"}, "[on|off|auto]",
         "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
@@ -3341,7 +3344,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                     string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
             }
         }
-    ).set_env("LLAMA_LOG_COLORS"));
+    ).set_env("LLAMA_ARG_LOG_COLORS"));
     add_opt(common_arg(
         {"-v", "--verbose", "--log-verbose"},
         "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
@@ -3356,7 +3359,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.offline = true;
         }
-    ).set_env("LLAMA_OFFLINE"));
+    ).set_env("LLAMA_ARG_OFFLINE"));
     add_opt(common_arg(
         {"-lv", "--verbosity", "--log-verbosity"}, "N",
         string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
@@ -3364,13 +3367,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             " - 1: error\n"
             " - 2: warning\n"
             " - 3: info\n"
-            " - 4: debug\n"
+            " - 4: trace (more info)\n"
+            " - 5: debug\n"
             "(default: %d)\n", params.verbosity),
         [](common_params & params, int value) {
             params.verbosity = value;
             common_log_set_verbosity_thold(value);
         }
-    ).set_env("LLAMA_LOG_VERBOSITY"));
+    ).set_env("LLAMA_ARG_LOG_VERBOSITY"));
     add_opt(common_arg(
         {"--log-prefix"},
         {"--no-log-prefix"},
@@ -3590,6 +3594,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.draft.p_min = std::stof(value);
         }
     ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_P_MIN"));
+    add_opt(common_arg(
+        {"--spec-draft-backend-sampling"},
+        {"--no-spec-draft-backend-sampling"},
+        string_format("offload draft sampling to the backend (default: %s)",
+                      params.speculative.draft.backend_sampling ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.speculative.draft.backend_sampling = value;
+        }
+    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_BACKEND_SAMPLING"));
     add_opt(common_arg(
         {"--spec-draft-device", "-devd", "--device-draft"}, "<dev1,dev2,..>",
         "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
diff --git a/common/chat-auto-parser-helpers.cpp b/common/chat-auto-parser-helpers.cpp
index 2499464cd82..81b17e5e1d2 100644
--- a/common/chat-auto-parser-helpers.cpp
+++ b/common/chat-auto-parser-helpers.cpp
@@ -310,6 +310,8 @@ std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segm
 
 namespace autoparser {
 
+static const std::string ERR_TMPL = "#**ERROR**#";
+
 std::string apply_template(const common_chat_template & tmpl, const template_params & params) {
     generation_params tmpl_params;
     tmpl_params.messages              = params.messages;
@@ -326,7 +328,7 @@ std::string apply_template(const common_chat_template & tmpl, const template_par
         return common_chat_template_direct_apply(tmpl, tmpl_params);
     } catch (const std::exception & e) {
         LOG_DBG("Template application failed: %s\n", e.what());
-        return "";
+        return ERR_TMPL;
     }
 }
 
@@ -347,7 +349,7 @@ std::optional<compare_variants_result> compare_variants(
     std::string output_B = apply_template(tmpl, params_B);
 
     // Check for template application failures
-    if (output_A.empty() || output_B.empty()) {
+    if (output_A == ERR_TMPL || output_B == ERR_TMPL) {
         return std::nullopt;
     }
 
diff --git a/common/chat-auto-parser.h b/common/chat-auto-parser.h
index c680e686867..7858f6572f2 100644
--- a/common/chat-auto-parser.h
+++ b/common/chat-auto-parser.h
@@ -377,6 +377,8 @@ struct analyze_tools : analyze_base {
 
 struct autoparser {
     jinja::caps          jinja_caps;
+    std::string          user_start;
+    std::string          assistant_start;
     analyze_reasoning    reasoning;
     analyze_content      content;
     analyze_tools        tools;
@@ -387,6 +389,10 @@ struct autoparser {
 
     autoparser() = default;
 
+    // Find the starting marker for the user message and assistant message
+    std::string detect_user_start_marker(const common_chat_template & tmpl);
+    std::string detect_assistant_start_marker(const common_chat_template & tmpl);
+
     // Run full differential analysis on a template
     void analyze_template(const common_chat_template & tmpl);
 
diff --git a/common/chat-diff-analyzer.cpp b/common/chat-diff-analyzer.cpp
index 9c7c9678acd..0875c5347f4 100644
--- a/common/chat-diff-analyzer.cpp
+++ b/common/chat-diff-analyzer.cpp
@@ -8,6 +8,9 @@
 #include "peg-parser.h"
 
 #include <algorithm>
+#include <cctype>
+#include <ostream>
+#include <sstream>
 
 #define ANSI_RESET  "\033[0m"
 #define ANSI_PURPLE "\033[1m\x1b[38;5;126m"
@@ -23,6 +26,7 @@ static const std::string FUN_SECOND = "SSS_SECOND_FUN_S";
 static const std::string ARG_FIRST = "AA_ARG_FST_AA";
 static const std::string ARG_SECOND = "BB_ARG_SND_BB";
 static const std::string USER_MSG = "U_USER_MSG Hello END_U";
+static const std::string USER_MSG_TWO = "V_USER_MSG Hello END_V";
 static const std::string ASSISTANT_MSG = "A_ASST_MSG I can help END_A";
 static const std::string THINKING_CONTENT = "REASON_PART I am thinking END_R";
 static const std::string CALL_ID_001 = "call00001";
@@ -71,6 +75,7 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
               analysis.content.end   = "<|END_OF_TURN_TOKEN|>";
               analysis.preserved_tokens.push_back("<|CHATBOT_TOKEN|>");
               analysis.preserved_tokens.push_back("<|END_OF_TURN_TOKEN|>");
+              analysis.user_start = "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>";
               LOG_DBG(ANSI_ORANGE "[Patch: Cohere Command R+]\n" ANSI_RESET);
           }
       },
@@ -108,7 +113,59 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
               analysis.tools.function.close        = "```";
               LOG_DBG(ANSI_ORANGE "[Patch: DeepSeek-R1-Distill-Qwen]\n" ANSI_RESET);
           }
-      }
+      },
+      // Nemotron Nano v2
+      [](const common_chat_template & tmpl, autoparser & analysis) -> void {
+          if (tmpl.src.find("<SPECIAL_10>") != std::string::npos && tmpl.src.find("<SPECIAL_11>") != std::string::npos &&
+              tmpl.src.find("<SPECIAL_12>") != std::string::npos && tmpl.src.find("<TOOL_RESPONSE>") != std::string::npos) {
+
+              analysis.tools.format.mode           = tool_format::JSON_NATIVE;
+              analysis.tools.format.section_start  = "";
+              analysis.tools.format.section_end    = "";
+              analysis.tools.format.per_call_start = "<TOOLCALL>";
+              analysis.tools.format.per_call_end   = "</TOOLCALL>";
+              analysis.content.mode                = content_mode::PLAIN;
+              analysis.content.start               = "";
+              analysis.content.end                 = "";
+              analysis.reasoning.mode              = reasoning_mode::TAG_BASED;
+              analysis.reasoning.start             = "<think>\n\n";
+              analysis.reasoning.end               = "</think>";
+              analysis.assistant_start             = "<SPECIAL_11>Assistant";
+              analysis.user_start                  = "<SPECIAL_11>User";
+              analysis.preserved_tokens.clear();
+              analysis.preserved_tokens.push_back("<SPECIAL_12>");
+              analysis.preserved_tokens.push_back("<SPECIAL_11>");
+              analysis.preserved_tokens.push_back("</think>");
+              analysis.preserved_tokens.push_back("<TOOLCALL>");
+              analysis.preserved_tokens.push_back("</TOOLCALL>");
+              LOG_DBG(ANSI_ORANGE "[Patch: Nemotron Nano v2]\n" ANSI_RESET);
+          }
+      },
+      // Fireworks
+      [](const common_chat_template & tmpl, autoparser & analysis) -> void {
+          if (tmpl.src.find("{%- set system_prompt = '<|start_header_id|>' + 'system' + '<|end_header_id|>\\n\\n'"
+            " + message['content'] | trim + '\\n' + system_prompt_suffix + '<|eot_id|>' -%}") != std::string::npos) {
+              analysis.assistant_start             = "<|start_header_id|>assistant<|end_header_id|>";
+              analysis.user_start                  = "<|start_header_id|>user<|end_header_id|>";
+              LOG_DBG(ANSI_ORANGE "[Patch: Fireworks v2]\n" ANSI_RESET);
+          }
+      },
+      // Solar Open
+      [](const common_chat_template & tmpl, autoparser & analysis) -> void {
+          if (tmpl.src.find("<|begin|>assistant<|think|><|end|>") != std::string::npos) {
+              analysis.assistant_start             = "<|begin|>assistant";
+              LOG_DBG(ANSI_ORANGE "[Patch: Solar Open]\n" ANSI_RESET);
+          }
+      },
+      // Apriel 1.6
+      [](const common_chat_template & tmpl, autoparser & analysis) -> void {
+          if (tmpl.src.find("if not loop.last and '[BEGIN FINAL RESPONSE]' in asst_text") != std::string::npos) {
+              analysis.user_start                  = "<|begin_user|>";
+              analysis.assistant_start             = "<|begin_assistant|>";
+              LOG_DBG(ANSI_ORANGE "[Patch: Apriel 1.6]\n" ANSI_RESET);
+          }
+      },
+
     });
 
 // Common JSON structures
@@ -166,6 +223,8 @@ void autoparser::analyze_template(const common_chat_template & tmpl) {
     reasoning = analyze_reasoning(tmpl, jinja_caps.supports_tool_calls);
     content = analyze_content(tmpl, reasoning);
     tools = analyze_tools(jinja_caps.supports_tool_calls ? analyze_tools(tmpl, jinja_caps, reasoning) : analyze_tools());
+    assistant_start = detect_assistant_start_marker(tmpl);
+    user_start = detect_user_start_marker(tmpl);
     collect_preserved_tokens();
 
     for (auto & workaround : workarounds) {
@@ -173,6 +232,8 @@ void autoparser::analyze_template(const common_chat_template & tmpl) {
     }
 
     LOG_DBG("\n--- Reasoning & Content Structure ---\n");
+    LOG_DBG("user_msg_start: %s\n", user_start.c_str());
+    LOG_DBG("assistant_msg_start: %s\n", assistant_start.c_str());
     LOG_DBG("reasoning_mode: %s\n", mode_to_str(reasoning.mode).c_str());
     LOG_DBG("reasoning_start: '%s'\n", reasoning.start.c_str());
     LOG_DBG("reasoning_end: '%s'\n", reasoning.end.c_str());
@@ -245,6 +306,120 @@ void autoparser::collect_preserved_tokens() {
     add_token(tools.call_id.suffix);
 }
 
+std::string autoparser::detect_assistant_start_marker(const common_chat_template & tmpl) {
+    json user_msg = json{
+        { "role",    "user"   },
+        { "content", USER_MSG }
+    };
+
+    json assistant_no_reasoning = json{
+        { "role",    "assistant"   },
+        { "content", ASSISTANT_MSG }
+    };
+
+    template_params params;
+    params.messages              = json::array({ user_msg });
+    params.add_generation_prompt = false;
+    params.enable_thinking       = true;
+
+    auto comparison = compare_variants(
+        tmpl, params, [&](template_params & p) {
+            p.messages = json::array({ user_msg, assistant_no_reasoning });
+        }
+    );
+
+    if (!comparison) {
+        LOG_DBG(ANSI_ORANGE "%s: Template application failed, skipping assistant start detection\n" ANSI_RESET, __func__);
+        return "";
+    }
+
+    auto usermsg = comparison->diff.right;
+    if (usermsg.find(ASSISTANT_MSG) == std::string::npos) {
+        LOG_DBG(ANSI_ORANGE "%s: Did not find assistant message in assistant message block, skipping detection\n" ANSI_RESET, __func__);
+    }
+
+    auto ast_prefix = usermsg.substr(0, usermsg.find(ASSISTANT_MSG));
+    if (!reasoning.start.empty() && ast_prefix.find(trim_whitespace(reasoning.start)) != std::string::npos) {
+        ast_prefix = ast_prefix.substr(0, ast_prefix.find(trim_whitespace(reasoning.start)));
+    }
+    if (!reasoning.end.empty() && ast_prefix.find(trim_whitespace(reasoning.end)) != std::string::npos) {
+        ast_prefix = ast_prefix.substr(0, ast_prefix.find(trim_whitespace(reasoning.end)));
+    }
+    return trim_whitespace(ast_prefix);
+}
+
+std::string autoparser::detect_user_start_marker(const common_chat_template & tmpl) {
+    json user_msg = json{
+        { "role",    "user"   },
+        { "content", USER_MSG }
+    };
+
+    json assistant = json{
+        { "role",    "assistant"   },
+        { "content", ASSISTANT_MSG }
+    };
+
+    json user_msg_two = json{
+        { "role",    "user"       },
+        { "content", USER_MSG_TWO }
+    };
+
+    template_params params;
+    params.messages              = json::array({});
+    params.add_generation_prompt = false;
+    params.enable_thinking       = true;
+
+    auto comparison = compare_variants(
+        tmpl, params, [&](template_params & p) {
+            p.messages = json::array({ user_msg });
+        }
+    );
+
+    if (!comparison) {
+        LOG_DBG(ANSI_ORANGE "%s: Template application failed, unsupported empty messages? trying complex variant\n" ANSI_RESET, __func__);
+        params.messages = json::array({ user_msg_two, assistant });
+        comparison = compare_variants(
+            tmpl, params, [&](template_params & p) {
+                p.messages = json::array({ user_msg_two, assistant, user_msg });
+            }
+        );
+        if (!comparison) {
+            LOG_DBG(ANSI_ORANGE "%s: Template application failed for reserve variant, aborting\n" ANSI_RESET, __func__);
+            return "";
+        }
+    }
+
+    auto usermsg = comparison->diff.right;
+    if (usermsg.find(USER_MSG) == std::string::npos) {
+        LOG_DBG(ANSI_ORANGE "%s: Did not find user message in user message block, aborting detection\n" ANSI_RESET, __func__);
+    }
+
+    if (usermsg.find(ASSISTANT_MSG) != std::string::npos) {
+        usermsg = usermsg.substr(usermsg.find(ASSISTANT_MSG) + ASSISTANT_MSG.size());
+    }
+
+    auto candidate = usermsg.substr(0, usermsg.find(USER_MSG));
+    auto candidate_split = segmentize_markers(candidate);
+    std::stringstream result;
+    bool encountered_marker = false;
+    for (const auto & mrk : candidate_split) {
+        std::string lower_mrk = std::string(mrk.value);
+        std::transform(lower_mrk.begin(), lower_mrk.end(), lower_mrk.begin(),
+            [](unsigned char c) { return std::tolower(c); });
+        // heuristic to weed out potential end markers, but only at the start
+        if (mrk.type == segment_type::MARKER && !encountered_marker &&
+            (lower_mrk.find("end") != std::string::npos || lower_mrk.find("close") != std::string::npos)) {
+            continue;
+        }
+        if (mrk.type == segment_type::TEXT && !encountered_marker && trim_whitespace(mrk.value).empty()) {
+            continue;
+        }
+        encountered_marker |= mrk.type == segment_type::MARKER;
+        result << mrk.value;
+    }
+    return trim_whitespace(result.str());
+}
+
 analyze_reasoning::analyze_reasoning(const common_chat_template & tmpl, bool supports_tools)
     : analyze_base(tmpl) {
     LOG_DBG(ANSI_PURPLE "=== Starting differential analysis ===\n" ANSI_RESET);
diff --git a/common/chat.cpp b/common/chat.cpp
index 56873e3a1e9..ef151691c38 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -90,6 +90,45 @@ std::string common_chat_msg::render_content(const std::string & delimiter) const
     return text;
 }
 
+std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims) {
+    if (delims.empty() || prompt.empty()) {
+        return {};
+    }
+
+    auto parser = build_peg_parser([&](common_peg_parser_builder & p) {
+        std::vector<std::string>       all_delims;
+        std::vector<common_peg_parser> tagged_messages;
+
+        all_delims.reserve(delims.size());
+        tagged_messages.reserve(delims.size());
+        for (const auto & d : delims) {
+            all_delims.push_back(d.delimiter);
+        }
+
+        auto any_delim = p.until_one_of(all_delims);
+        for (const auto & d : delims) {
+            tagged_messages.push_back(p.tag(d.role, p.literal(d.delimiter) + any_delim));
+        }
+
+        return any_delim + p.zero_or_more(p.choice(tagged_messages)) + p.end();
+    });
+
+    common_peg_parse_context ctx(prompt);
+    const auto result = parser.parse(ctx);
+    if (!result.success()) {
+        return {};
+    }
+
+    std::vector<common_chat_msg_span> spans;
+    ctx.ast.visit(result, [&](const common_peg_ast_node & node) {
+        if (!node.tag.empty()) {
+            spans.push_back({ node.tag, node.start, node.end - node.start });
+        }
+    });
+
+    return spans;
+}
+
 json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
     if (!content.empty() && !content_parts.empty()) {
         throw std::runtime_error("Cannot specify both content and content_parts");
@@ -1042,6 +1081,14 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
 
     data.prompt            = prompt;
     data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, /* messages_override= */ adjusted_messages);
+    data.message_spans = common_chat_split_by_role(prompt, {
+        { "assistant", "<|start|>assistant" },
+        { "user",      "<|start|>user"      },
+        { "system",    "<|start|>developer" },
+        { "system",    "<|start|>system"    },
+        { "tool",      "<|start|>functions" },
+    });
+
     data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
     data.supports_thinking = true;
 
@@ -1181,6 +1228,11 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
         data.prompt += data.generation_prompt;
     }
 
+    data.message_spans = common_chat_split_by_role(data.prompt, {
+        { "user",      "<|turn>user\n"  },
+        { "assistant", "<|turn>model\n" },
+    });
+
     data.format            = COMMON_CHAT_FORMAT_PEG_GEMMA4;
     data.supports_thinking  = true;
     data.thinking_start_tag = "<|channel>thought";
@@ -2393,6 +2445,19 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
         struct autoparser::autoparser autoparser;
         autoparser.analyze_template(tmpl);
         auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
+
+        std::vector<common_chat_msg_delimiter> delimiters;
+        if (!autoparser.assistant_start.empty()) {
+            delimiters.push_back({ "assistant", autoparser.assistant_start });
+        }
+        if (!autoparser.user_start.empty()) {
+            delimiters.push_back({ "user", autoparser.user_start });
+        }
+
+        if (!delimiters.empty()) {
+            auto_params.message_spans = common_chat_split_by_role(auto_params.prompt, delimiters);
+        }
+
         auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
         if (auto_params.supports_thinking) {
             auto_params.thinking_start_tag = trim_whitespace(autoparser.reasoning.start);
diff --git a/common/chat.h b/common/chat.h
index 8ace3e6ba69..5659cd42a07 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -143,6 +143,17 @@ struct common_chat_msg_diff {
     }
 };
 
+struct common_chat_msg_span {
+    std::string role;
+    std::size_t pos = 0;
+    std::size_t len = 0;
+};
+
+struct common_chat_msg_delimiter {
+    std::string role;
+    std::string delimiter;
+};
+
 struct common_chat_tool {
     std::string name;
     std::string description;
@@ -208,6 +219,7 @@ struct common_chat_params {
     std::vector<std::string>            preserved_tokens;
     std::vector<std::string>            additional_stops;
     std::string                         parser;
+    std::vector<common_chat_msg_span>   message_spans;
 };
 
 // per-message parsing syntax
@@ -219,6 +231,7 @@ struct common_chat_parser_params {
     bool                    reasoning_in_content = false;
     std::string             generation_prompt;
     bool                    parse_tool_calls     = true;
+    bool                    is_continuation      = false;
     bool                    echo                 = false;  // Include assistant prefilled msg in output
     bool                    debug                = false;  // Enable debug output for PEG parser
     common_peg_arena        parser               = {};
@@ -303,6 +316,7 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
         const std::string &                   src,
         autoparser::generation_params & params);
 
+
 // specialized per-task preset
 struct common_chat_prompt_preset {
     std::string system;
@@ -310,3 +324,6 @@ struct common_chat_prompt_preset {
 };
 
 common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);
+
+std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims);
+
diff --git a/common/common.cpp b/common/common.cpp
index b6fdec3ce05..97daf281783 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -445,6 +445,27 @@ std::string string_strip(const std::string & str) {
     return str.substr(start, end - start);
 }
 
+std::string string_lcs(std::string_view a, std::string_view b) {
+    if (a.empty() || b.empty()) return {};
+
+    std::vector<std::vector<size_t>> dp(a.size() + 1, std::vector<size_t>(b.size() + 1, 0));
+    size_t best_len = 0;
+    size_t best_end_a = 0;
+
+    for (size_t i = 1; i <= a.size(); ++i) {
+        for (size_t j = 1; j <= b.size(); ++j) {
+            if (a[i - 1] == b[j - 1]) {
+                dp[i][j] = dp[i - 1][j - 1] + 1;
+                if (dp[i][j] > best_len) {
+                    best_len = dp[i][j];
+                    best_end_a = i;
+                }
+            }
+        }
+    }
+    return std::string(a.substr(best_end_a - best_len, best_len));
+}
+
 std::string string_get_sortable_timestamp() {
     using clock = std::chrono::system_clock;
 
@@ -1173,7 +1194,7 @@ common_init_result::common_init_result(common_params & params, bool model_only)
             params.tensor_buft_overrides.data(),
             params.fit_params_target.data(),
             params.fit_params_min_ctx,
-            params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
+            params.verbosity >= LOG_LEVEL_DEBUG ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
     }
 
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
@@ -1366,7 +1387,7 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
     }
 
     if (params.warmup) {
-        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
+        LOG_INF("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
         llama_set_warmup(lctx, true);
 
diff --git a/common/common.h b/common/common.h
index 53c689bc11d..8a0e5eed5ee 100644
--- a/common/common.h
+++ b/common/common.h
@@ -305,6 +305,8 @@ struct common_params_speculative_draft {
     float p_split = 0.1f; // speculative decoding split probability
     float p_min   = 0.0f; // minimum speculative decoding probability (greedy)
 
+    bool backend_sampling = true; // offload draft sampling to the backend (default: on)
+
     common_params_model mparams;
 
     llama_context * ctx_tgt = nullptr;
@@ -592,7 +594,7 @@ struct common_params {
     bool    cache_prompt        = true;  // whether to enable prompt caching
     bool    cache_idle_slots    = true;  // save and clear idle slots upon starting a new task
     int32_t n_ctx_checkpoints   = 32;    // max number of context checkpoints per slot
-    int32_t checkpoint_every_nt = 8192;  // make a checkpoint every n tokens during prefill
+    int32_t checkpoint_min_step = 256;   // minimum spacing between context checkpoints
     int32_t cache_ram_mib       = 8192;  // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
 
     std::string hostname      = "127.0.0.1";
@@ -615,11 +617,7 @@ struct common_params {
     std::map<std::string, std::string> default_template_kwargs;
 
     // UI configs
-#ifdef LLAMA_UI_DEFAULT_ENABLED
-    bool ui = LLAMA_UI_DEFAULT_ENABLED != 0;
-#else
-    bool ui = true; // default to enabled when not set
-#endif
+    bool ui = true;
 
     // Deprecated: use ui, ui_mcp_proxy, ui_config_json instead
     bool webui = ui;
@@ -733,6 +731,7 @@ std::string string_format(const char * fmt, ...);
 
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
+std::string string_lcs(std::string_view a, std::string_view b);
 
 std::string string_join(const std::vector<std::string> & values, const std::string & separator);
 std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
diff --git a/common/fit.cpp b/common/fit.cpp
index c10cb7f08b4..668d892e908 100644
--- a/common/fit.cpp
+++ b/common/fit.cpp
@@ -26,7 +26,7 @@ class common_params_fit_exception : public std::runtime_error {
     using std::runtime_error::runtime_error;
 };
 
-static std::vector<llama_device_memory_data> common_get_device_memory_data(
+std::vector<llama_device_memory_data> common_get_device_memory_data(
         const char * path_model,
         const llama_model_params * mparams,
         const llama_context_params * cparams,
diff --git a/common/fit.h b/common/fit.h
index e066092ec6c..643d3420095 100644
--- a/common/fit.h
+++ b/common/fit.h
@@ -1,6 +1,11 @@
 #pragma once
 
 #include "ggml.h"
+#include "ggml-backend.h"
+#include "llama.h"
+#include "../src/llama-ext.h"
+
+#include <vector>
 
 enum common_params_fit_status {
     COMMON_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
@@ -30,3 +35,14 @@ void common_fit_print(
                 struct llama_context_params * cparams);
 
 void common_memory_breakdown_print(const struct llama_context * ctx);
+
+// Load a model + context with no_alloc and return the per-device memory breakdown.
+std::vector<llama_device_memory_data> common_get_device_memory_data(
+                                  const char   * path_model,
+        const struct llama_model_params         * mparams,
+        const struct llama_context_params       * cparams,
+        std::vector<ggml_backend_dev_t>         & devs,
+                                      uint32_t  & hp_ngl,
+                                      uint32_t  & hp_n_ctx_train,
+                                      uint32_t  & hp_n_expert,
+                           enum ggml_log_level    log_level);
diff --git a/common/speculative.cpp b/common/speculative.cpp
index 4d1b61a13ad..d5603f2b704 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -33,16 +33,15 @@ const std::map<std::string, common_speculative_type> common_speculative_type_fro
 };
 
 static std::string common_speculative_get_devices_str(const std::vector<ggml_backend_dev_t> & devices) {
-    if (devices.empty()) {
-        return "default";
-    }
-
     std::string result;
     for (size_t i = 0; i < devices.size(); i++) {
-        if (i > 0) result += ", ";
+        if (devices[i] == nullptr) {
+            continue;
+        }
+        if (!result.empty()) result += ", ";
         result += ggml_backend_dev_name(devices[i]);
     }
-    return result;
+    return result.empty() ? "default" : result;
 }
 
 struct common_speculative_config {
@@ -414,8 +413,13 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
 
     std::vector<common_sampler_ptr> smpls;
 
+    // backend sampler chain per seq, attached to ctx_dft
+    std::vector<llama_sampler *> backend_chains;
+
     int32_t n_embd = 0;
 
+    bool kv_shared_with_target = false;
+
     // Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1.
     // The last h-row of one process() call needs the first token of the NEXT
     // call to pair with, so it's stashed here until that next call fires.
@@ -442,10 +446,12 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
         auto * ctx_dft = this->params.ctx_dft;
         GGML_ASSERT(ctx_tgt && ctx_dft && "MTP requires ctx_tgt and ctx_dft to be set");
 
-        n_embd = llama_model_n_embd(llama_get_model(ctx_dft));
+        n_embd = llama_model_n_embd_out(llama_get_model(ctx_dft));
+        GGML_ASSERT(n_embd == llama_model_n_embd(llama_get_model(ctx_tgt)) &&
+                "MTP input row width must match the target h_pre_norm width");
 
         LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__);
-        LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd);
+        LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling);
         LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__,
                 this->params.n_gpu_layers,
                 ggml_type_name(this->params.cache_type_k),
@@ -469,8 +475,27 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
             s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams));
         }
 
+        // offload draft sampling to the backend
+        backend_chains.assign(n_seq, nullptr);
+        if (this->params.backend_sampling) {
+            for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+                llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
+                llama_sampler_chain_add(chain, llama_sampler_init_top_k(10));
+
+                if (!llama_set_sampler(ctx_dft, seq_id, chain)) {
+                    LOG_WRN("%s: backend offload failed for seq_id=%d; using CPU sampler\n", __func__, (int) seq_id);
+                    llama_sampler_free(chain);
+                    chain = nullptr;
+                }
+                backend_chains[seq_id] = chain;
+            }
+        }
+
         llama_set_embeddings_pre_norm(ctx_tgt, true, /*masked*/ false);
         llama_set_embeddings_pre_norm(ctx_dft, true, /*masked*/ true);
+        llama_set_mtp_source(ctx_dft, ctx_tgt);
+
+        kv_shared_with_target = llama_model_n_layer_kv(llama_get_model(ctx_dft)) == 0;
 
         pending_h.assign(n_seq, std::vector<float>(n_embd, 0.0f));
 
@@ -484,6 +509,18 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
     }
 
     ~common_speculative_impl_draft_mtp() override {
+        auto * ctx_dft = this->params.ctx_dft;
+        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) backend_chains.size(); ++seq_id) {
+            if (backend_chains[seq_id] == nullptr) {
+                continue;
+            }
+            if (ctx_dft) {
+                llama_set_sampler(ctx_dft, seq_id, nullptr);
+            }
+            llama_sampler_free(backend_chains[seq_id]);
+        }
+        backend_chains.clear();
+
         if (batch.token != nullptr) {
             free(batch.token);
             batch.token = nullptr;
@@ -496,9 +533,10 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
         if (N <= 0) {
             return;
         }
+
         auto * ctx_dft = this->params.ctx_dft;
         const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id);
-        if (pos_max < N - 1) {
+        if (pos_max < N - 1 && !kv_shared_with_target) {
             LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d - "
                     "process() hook may not have run on every prefill ubatch "
                     "(need_embd / logits=1 on every prompt position?). "
@@ -541,48 +579,42 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
 
         const size_t row_bytes = (size_t) n_embd * sizeof(float);
 
-        common_batch_clear(batch);
+        // if kv is shared with target (e.g Gemma4), then we can skip this catch-up decode
+        if (!kv_shared_with_target) {
+            common_batch_clear(batch);
 
-        for (int k = 0; k < n_tokens; ++k) {
-            common_batch_add(batch, batch_in.token[k], batch_in.pos[k], { batch_in.seq_id[k][0] }, 0);
-        }
+            for (int k = 0; k < n_tokens; ++k) {
+                common_batch_add(batch, batch_in.token[k], batch_in.pos[k], { batch_in.seq_id[k][0] }, 0);
+            }
 
-        // shift the tgt embeddings to the right by one position
-        // assumes that the tokens in the batch are sequential for each sequence
-        // i.e. we cannot have seq_id like this: [0, 0, 0, 1, 1, 0, 1, 1]
-        //                                                       ^--- this is a problem
-        // TODO:this is generally true, but would be nice to assert it
-        {
-            const float * h_tgt = llama_get_embeddings_pre_norm(ctx_tgt);
-            std::memcpy(batch.embd + (size_t) 1 * n_embd, h_tgt, row_bytes * (n_tokens-1));
+            // shift the tgt embeddings to the right by one position
+            // assumes that the tokens in the batch are sequential for each sequence
+            // i.e. we cannot have seq_id like this: [0, 0, 0, 1, 1, 0, 1, 1]
+            //                                                       ^--- this is a problem
+            // TODO:this is generally true, but would be nice to assert it
+            {
+                const float * h_tgt = llama_get_embeddings_pre_norm(ctx_tgt);
+                std::memcpy(batch.embd + (size_t) 1 * n_embd, h_tgt, row_bytes * (n_tokens-1));
+            }
 
-            //{
-            //    // string with seq_ids in the batch
-            //    std::stringstream ss;
-            //    for (int i = 0; i < n_tokens; ++i) {
-            //        ss << batch_in.seq_id[i][0] << ",";
-            //    }
-            //    LOG_WRN("%s: batch_in.seq_id = %s\n", __func__, ss.str().c_str());
-            //}
-        }
+            // fill the pending embeddings from a previous run
+            auto set_h = [&](int idx, const float * h_row) {
+                std::memcpy(batch.embd + (size_t) idx * n_embd, h_row, row_bytes);
+            };
 
-        // fill the pending embeddings from a previous run
-        auto set_h = [&](int idx, const float * h_row) {
-            std::memcpy(batch.embd + (size_t) idx * n_embd, h_row, row_bytes);
-        };
+            for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+                if (i_batch_beg[seq_id] < 0) {
+                    continue;
+                }
 
-        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
-            if (i_batch_beg[seq_id] < 0) {
-                continue;
+                set_h(i_batch_beg[seq_id], pending_h[seq_id].data());
             }
 
-            set_h(i_batch_beg[seq_id], pending_h[seq_id].data());
-        }
-
-        const int32_t rc = llama_decode(ctx_dft, batch);
-        if (rc != 0) {
-            LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]);
-            return false;
+            const int32_t rc = llama_decode(ctx_dft, batch);
+            if (rc != 0) {
+                LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]);
+                return false;
+            }
         }
 
         for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
diff --git a/conversion/__init__.py b/conversion/__init__.py
index 2c38123dff8..05bdb021d01 100644
--- a/conversion/__init__.py
+++ b/conversion/__init__.py
@@ -73,7 +73,9 @@
     "Gemma3TextModel": "gemma",
     "Gemma3nForCausalLM": "gemma",
     "Gemma3nForConditionalGeneration": "gemma",
+    "Gemma4AssistantForCausalLM": "gemma",
     "Gemma4ForConditionalGeneration": "gemma",
+    "Gemma4ForCausalLM": "gemma",
     "GemmaForCausalLM": "gemma",
     "Glm4ForCausalLM": "glm",
     "Glm4MoeForCausalLM": "glm",
@@ -215,6 +217,7 @@
     "T5EncoderModel": "t5",
     "T5ForConditionalGeneration": "t5",
     "T5WithLMHeadModel": "t5",
+    "TalkieForCausalLM": "talkie",
     "UMT5ForConditionalGeneration": "t5",
     "UMT5Model": "t5",
     "UltravoxModel": "ultravox",
diff --git a/conversion/base.py b/conversion/base.py
index 30c2124c2b9..e1c15a95809 100644
--- a/conversion/base.py
+++ b/conversion/base.py
@@ -467,7 +467,14 @@ def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: T
             elif quant_method == "compressed-tensors":
                 quant_format = quant_config["format"]
                 groups = quant_config["config_groups"]
-                if len(groups) > 1:
+                nvfp4_compressed_tensors = (
+                    quant_format == "nvfp4-pack-quantized"
+                    or quant_format == "mixed-precision"
+                    and bool(groups)
+                    and all(g.get("format") == "nvfp4-pack-quantized" for g in groups.values() if isinstance(g, dict))
+                )
+
+                if len(groups) > 1 and not nvfp4_compressed_tensors:
                     raise NotImplementedError("Can't handle multiple config groups for compressed-tensors yet")
                 weight_config = tuple(groups.values())[0]["weights"]
 
@@ -505,6 +512,9 @@ def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: T
                             tensors_to_remove += [base_name + n for n in ("_packed", "_shape", "_scale")]
                             if (base_name + "_zero_point") in self.model_tensors:
                                 tensors_to_remove.append(base_name + "_zero_point")
+                elif nvfp4_compressed_tensors:
+                    # Don't error from compressed-tensors, we'll handle them in _generate_nvfp4_tensors
+                    pass
                 else:
                     raise NotImplementedError(f"Quant format {quant_format!r} for method {quant_method!r} is not yet supported")
             elif quant_method == "modelopt":
@@ -746,10 +756,13 @@ def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_input_s
         del experts, merged
 
     def prepare_tensors(self):
-        # detect NVFP4 quantization (ModelOpt format)
-        quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
-        quant_method = (self.hparams.get("quantization_config") or {}).get("quant_method")
-        quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {}
+        # detect NVFP4 quantization (ModelOpt and Compressed-tensors formats)
+        quantization_config = self.hparams.get("quantization_config") or {}
+        quant_algo = quantization_config.get("quant_algo")
+        quant_method = quantization_config.get("quant_method")
+        quant_format = quantization_config.get("format")
+        quant_groups = quantization_config.get("config_groups") or {}
+        quant_layers = quantization_config.get("quantized_layers") or {}
         quant_config_file = self.dir_model / "hf_quant_config.json"
 
         if (not quant_algo or not quant_layers) and quant_config_file.is_file():
@@ -760,13 +773,25 @@ def prepare_tensors(self):
                 producer_name = (producer.get("name") or "").lower()
                 if quant_method is None:
                     self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name
+                    quant_method = producer_name
                 quant_algo = quant_config.get("quant_algo", quant_algo)
+                quant_method = quant_config.get("quant_method", quant_method)
+                quant_format = quant_config.get("format", quant_format)
+                quant_groups = quant_config.get("config_groups", quant_groups) or {}
                 quant_layers = quant_config.get("quantized_layers", quant_layers) or {}
 
         # Some models use per-tensor quant_algo (e.g. "MIXED_PRECISION" with
         # per-layer NVFP4/FP8) instead of a single global "NVFP4" value.
+        nvfp4_compressed_tensors = quant_method == "compressed-tensors" and (
+            quant_format == "nvfp4-pack-quantized"
+            or quant_format == "mixed-precision"
+            and bool(quant_groups)
+            and all(g.get("format") == "nvfp4-pack-quantized" for g in quant_groups.values() if isinstance(g, dict))
+        )
         if quant_algo != "NVFP4":
-            if any(v.get("quant_algo") == "NVFP4" for v in quant_layers.values() if isinstance(v, dict)):
+            if nvfp4_compressed_tensors:
+                quant_algo = "NVFP4"
+            elif any(v.get("quant_algo") == "NVFP4" for v in quant_layers.values() if isinstance(v, dict)):
                 quant_algo = "NVFP4"
 
         self._is_nvfp4 = quant_algo == "NVFP4"
@@ -776,6 +801,28 @@ def prepare_tensors(self):
         # This must run before dequant_model so NVFP4 tensors are removed
         # from model_tensors, leaving only non-NVFP4 (e.g. FP8) for dequant.
         if self._is_nvfp4:
+            if nvfp4_compressed_tensors:
+                # Convert compressed-tensors 'global' scales into the reciprocal
+                def inverse_scale(gen):
+                    def load():
+                        scale = LazyTorchTensor.to_eager(gen()).float()
+                        return 1.0 / scale
+                    return load
+
+                # Change the compressed-tensors names to the ModelOpt names for handling consistently later
+                for name in list(self.model_tensors.keys()):
+                    if name.endswith(".weight_packed"):
+                        weight_name = name.removesuffix("_packed")
+                        if weight_name not in self.model_tensors:
+                            self.model_tensors[weight_name] = self.model_tensors.pop(name)
+                    elif name.endswith(".weight_global_scale"):
+                        scale2_name = name.replace(".weight_global_scale", ".weight_scale_2")
+                        if scale2_name not in self.model_tensors:
+                            self.model_tensors[scale2_name] = inverse_scale(self.model_tensors.pop(name))
+                    elif name.endswith(".input_global_scale"):
+                        input_scale_name = name.replace(".input_global_scale", ".input_scale")
+                        if input_scale_name not in self.model_tensors:
+                            self.model_tensors[input_scale_name] = inverse_scale(self.model_tensors.pop(name))
             self._generate_nvfp4_tensors()
 
         self.dequant_model()
@@ -1575,6 +1622,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "62f6fb0a6fd5098caeabb19b07a5c1099cafc8b9c40eab6ea89ece4ec02fbc57":
             # ref: https://huggingface.co/sarvamai/sarvam-30b
             res = "sarvam-moe"
+        if chkhsh == "f728162c1315c26e40249849799b4ba3fe584c32084b4795b03eb295e63cb5af":
+            # ref: https://huggingface.co/lewtun/talkie-1930-13b-it-hf
+            res = "talkie"
+        if chkhsh == "36f3066e97b7f3994b379aaacde306c1444c6ae84e81a5ae3cd2b7ed3b8c42d4":
+            # ref: https://huggingface.co/openbmb/MiniCPM5-1B
+            res = "minicpm5"
 
         if res is None:
             logger.warning("\n")
@@ -1610,6 +1663,47 @@ def _set_vocab_gpt2(self) -> None:
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
         special_vocab.add_to_gguf(self.gguf_writer)
 
+    def _set_vocab_hybriddna(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))  # ty: ignore[unresolved-attribute]
+        assert max(tokenizer.vocab.values()) < vocab_size  # ty: ignore[unresolved-attribute]
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}  # ty: ignore[unresolved-attribute]
+        # k-mers can share text with a base-vocab BPE token (e.g. CCCCCC) and get
+        # dropped by get_vocab(); a reserved marker suffix (U+E000) keeps each
+        # k-mer's own id (llama.cpp strips it on detokenization)
+        for kmer in tokenizer.kmers:  # ty: ignore[unresolved-attribute]
+            reverse_vocab[tokenizer.dna_token_to_id[kmer]] = kmer + "\ue000"  # ty: ignore[unresolved-attribute]
+        added_vocab = tokenizer.get_added_vocab()  # ty: ignore[unresolved-attribute]
+        added_tokens_decoder = tokenizer.added_tokens_decoder  # ty: ignore[unresolved-attribute]
+
+        tokens: list[str] = []
+        toktypes: list[int] = []
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+        self.gguf_writer.add_tokenizer_model("hybriddna")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
     def _set_vocab_qwen(self):
         from .qwen import QwenModel
 
diff --git a/conversion/gemma.py b/conversion/gemma.py
index a6e14fbcb98..be29a5b8209 100644
--- a/conversion/gemma.py
+++ b/conversion/gemma.py
@@ -614,7 +614,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield from super().modify_tensors(data_torch, name, bid)
 
 
-@ModelBase.register("Gemma4ForConditionalGeneration")
+@ModelBase.register("Gemma4ForConditionalGeneration", "Gemma4ForCausalLM")
 class Gemma4Model(Gemma3Model):
     model_arch = gguf.MODEL_ARCH.GEMMA4
 
@@ -765,6 +765,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("Gemma4AssistantForCausalLM")
+class Gemma4AssistantModel(Gemma4Model):
+    model_arch = gguf.MODEL_ARCH.GEMMA4_ASSISTANT
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_embedding_length_out(self.hparams["backbone_hidden_size"])
+        self.gguf_writer.add_nextn_predict_layers(self.block_count)
+
+
 @ModelBase.register("Gemma4ForConditionalGeneration")
 class Gemma4VisionAudioModel(MmprojModel):
     has_audio_encoder = True
diff --git a/conversion/hunyuan.py b/conversion/hunyuan.py
index be54f5810b0..537f023aa01 100644
--- a/conversion/hunyuan.py
+++ b/conversion/hunyuan.py
@@ -189,7 +189,8 @@ def set_vocab(self):
             self.gguf_writer.add_token_list(tokens)
             self.gguf_writer.add_token_types(toktypes)
 
-            # HunyuanOCR has pad_token_id=-1 in config.json; exclude pad from SpecialVocab
+            # Some HunYuanVL variants (e.g. OCR-style configs) have pad_token_id=-1;
+            # guard SpecialVocab so it doesn't try to emit an invalid pad id.
             token_types = None
             if (self.hparams.get("pad_token_id") or 0) < 0:
                 token_types = ('bos', 'eos', 'unk', 'sep', 'cls', 'mask')
@@ -250,7 +251,8 @@ def set_vocab(self):
             self._fix_special_tokens()
 
     def set_gguf_parameters(self):
-        # HunyuanOCR has num_experts=1 which is not MoE, prevent parent from writing it
+        # Some HunYuanVL variants set num_experts=1 (not real MoE);
+        # prevent the parent class from emitting expert_count metadata in that case.
         saved_num_experts = self.hparams.pop("num_experts", None)
         super().set_gguf_parameters()
         if saved_num_experts is not None and saved_num_experts > 1:
@@ -288,51 +290,21 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
 @ModelBase.register("HunYuanVLForConditionalGeneration")
 class HunyuanVLVisionModel(MmprojModel):
-    # Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name
-    # "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout.
-    # Each variant maps to a different projector type in clip.cpp so image
-    # preprocessing follows the correct code path.
-
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         assert self.hparams_vision is not None
-        # HunyuanOCR / HunyuanVL uses max_image_size instead of image_size
+        # HunyuanVL uses max_image_size instead of image_size
         if "image_size" not in self.hparams_vision:
             self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)
 
-    @staticmethod
-    def is_ocr_variant(hparams: dict) -> bool:
-        """Return True for HunyuanOCR, False for HunyuanVL.
-
-        The projector's output dim must equal the text model's hidden_size by
-        construction (that's what "projector" means). HunyuanOCR pairs a 1B text
-        backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the
-        ViT -> LLM projection dim is a hard architectural signature, not a
-        magic number.
-        """
-        vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0))
-        return vision_out == 1024
-
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         assert self.hparams_vision is not None
         vcfg = self.hparams_vision
-
-        if self.is_ocr_variant(self.global_config):
-            # --- HunyuanOCR ---
-            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
-            self.gguf_writer.add_vision_use_gelu(True)
-            self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5))
-            self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2))
-            self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
-            self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
-            return
-
-        # --- HunyuanVL ---
         self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL)
-        self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu")
-        self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"]))
-        self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"]))
+        self.gguf_writer.add_vision_use_gelu(True)
+        self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5))
+        self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2))
         self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"]))
         self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"]))
 
@@ -353,7 +325,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
         # force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
-        # Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2.
+        # HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2.
         if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
             return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
         return super().tensor_force_quant(name, new_name, bid, n_dims)
@@ -361,40 +333,18 @@ def tensor_force_quant(self, name, new_name, bid, n_dims):
 
 @ModelBase.register("HunYuanVLForConditionalGeneration")
 class HunyuanVLTextModel(HunYuanModel):
-    # The "HunYuanVLForConditionalGeneration" HF architecture covers both HunyuanOCR
-    # and HunyuanVL. HunyuanOCR reuses the HunYuan-Dense text backbone (standard RoPE),
-    # while HunyuanVL introduces a new LLM arch with XD-RoPE. Detect the variant from
-    # the config and pick the matching GGUF architecture.
     model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
 
-    @staticmethod
-    def _is_ocr_config(hparams: dict) -> bool:
-        # OCR pairs a 1B text backbone (hidden=1024) with a ViT projector that
-        # outputs 1024-d; HunyuanVL uses 3072-d. Keep in sync with
-        # HunyuanVLVisionModel.is_ocr_variant.
-        return int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) == 1024
-
     def __init__(self, dir_model: Path, *args, **kwargs):
-        raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False)
-        if self._is_ocr_config(raw_hparams):
-            self.model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
-        else:
-            self.model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
         super().__init__(dir_model, *args, **kwargs)
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
 
-        # Only emit XD-RoPE metadata for the HunyuanVL backbone; HunyuanOCR uses
-        # the HunYuan-Dense arch which already handles standard rope in super().
-        if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL:
-            return
-
+        # XD-RoPE metadata for the HunyuanVL;
         if self.rope_parameters.get("rope_type") != "xdrope":
             return
 
-        # defaults for HunyuanVL. The C++ side later computes:
-        #   freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2))
         self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"]))
         self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"]))
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
diff --git a/conversion/llama.py b/conversion/llama.py
index 41fde5143f8..fd6167bfd91 100644
--- a/conversion/llama.py
+++ b/conversion/llama.py
@@ -51,6 +51,15 @@ def set_vocab(self):
         if path_tekken_json.is_file() and not path_tokenizer_json.is_file():
             self._set_vocab_mistral()
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if (add_prefix_space := tokenizer_config_json.get("add_prefix_space")) is not None:
+                    self.gguf_writer.add_add_space_prefix(add_prefix_space)
+                if tokenizer_config_json.get("tokenizer_class") == "HybridDNATokenizer":
+                    return self._set_vocab_hybriddna()
+
         try:
             self._set_vocab_sentencepiece()
         except FileNotFoundError:
@@ -72,13 +81,6 @@ def set_vocab(self):
             special_vocab._set_special_token("eot",    32010)
             special_vocab.add_to_gguf(self.gguf_writer)
 
-        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
-        if tokenizer_config_file.is_file():
-            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
-                tokenizer_config_json = json.load(f)
-                if "add_prefix_space" in tokenizer_config_json:
-                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
-
         # Apply to granite small models only
         if self.hparams.get("vocab_size", 32000) == 49152:
             self.gguf_writer.add_add_bos_token(False)
diff --git a/conversion/qwen.py b/conversion/qwen.py
index 45d1f98c266..7eb135c832d 100644
--- a/conversion/qwen.py
+++ b/conversion/qwen.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-from pathlib import Path
 from typing import Any, Callable, Iterable, TYPE_CHECKING
 
 import torch
@@ -549,6 +548,7 @@ class _Qwen35MtpMixin:
     tensor_map: gguf.TensorNameMap
     no_mtp: bool
     mtp_only: bool
+    _original_block_count: int | None = None
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -557,22 +557,44 @@ def __init__(self, *args, **kwargs):
             self.block_count += self.hparams.get("mtp_num_hidden_layers", 0)
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
 
+    def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
+        hparams = {**self.hparams, **self.hparams.get("text_config", {})}
+        key = next((k for k in ["n_layers", "num_hidden_layers", "n_layer", "num_layers"] if k in hparams), None)
+        type(self)._original_block_count = hparams.get(key)
+        return super().index_tensors(remote_hf_model_id=remote_hf_model_id)  # ty: ignore[unresolved-attribute]
+
     @classmethod
     def filter_tensors(cls, item):
-        name, _ = item
+        assert cls._original_block_count is not None
+        # TODO: change TextModel to super()
+        if (titem := TextModel.filter_tensors(item)) is None:
+            return None
+        name, gen = titem
+        if name.startswith("model.mtp."):
+            name = name.replace("model.", "", 1)
         if name.startswith("mtp."):
             if cls.no_mtp:
                 return None
-            return item
-        if cls.mtp_only:
-            canonical = name.replace("language_model.", "")
-            keep = canonical in (
+            remapper = {
+                "fc":                    "eh_proj",
+                "pre_fc_norm_embedding": "enorm",
+                "pre_fc_norm_hidden":    "hnorm",
+                "norm":                  "shared_head.norm",
+            }
+            parts = name.split(".", 3)
+            if len(parts) == 4 and parts[1] == "layers" and parts[2].isdecimal():
+                mtp_idx = int(parts[2])
+                name = f"model.layers.{cls._original_block_count + mtp_idx}.{parts[3]}"
+            elif len(parts) == 3 and parts[1] in remapper:
+                name = f"model.layers.{cls._original_block_count}.{remapper[parts[1]]}.{parts[2]}"
+        elif cls.mtp_only:
+            keep = name in (
                 "model.embed_tokens.weight", "model.norm.weight", "lm_head.weight",
                 "embed_tokens.weight", "norm.weight",
             )
             if not keep:
                 return None
-        return super().filter_tensors(item)  # ty: ignore[unresolved-attribute]
+        return name, gen
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()  # ty: ignore[unresolved-attribute]
@@ -594,29 +616,6 @@ def prepare_metadata(self, vocab_only: bool):
             self.metadata.version, size_label=None, output_type=output_type, model_type=None)    # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
         self.fname_out = self.fname_out.parent / f"mtp-{fname_default}.gguf"
 
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if name.startswith("mtp."):
-            n_layer = self.hparams["num_hidden_layers"]
-            if name.find("layers.") != -1:
-                assert bid is not None
-                name = name.replace(f"mtp.layers.{bid}", f"model.layers.{bid + n_layer}")
-                bid = bid + n_layer
-            else:
-                remapper = {
-                    "mtp.fc":                    "model.layers.{bid}.eh_proj",
-                    "mtp.pre_fc_norm_embedding": "model.layers.{bid}.enorm",
-                    "mtp.pre_fc_norm_hidden":    "model.layers.{bid}.hnorm",
-                    "mtp.norm":                  "model.layers.{bid}.shared_head.norm",
-                }
-                stem   = Path(name).stem
-                suffix = Path(name).suffix
-                tmpl   = remapper[stem] + suffix
-                for b in range(n_layer, self.block_count):
-                    yield from super().modify_tensors(data_torch, tmpl.format(bid=b), b)  # ty: ignore[unresolved-attribute]
-                return
-
-        yield from super().modify_tensors(data_torch, name, bid)  # ty: ignore[unresolved-attribute]
-
 
 @ModelBase.register("Qwen3_5ForConditionalGeneration", "Qwen3_5ForCausalLM")
 class Qwen3_5TextModel(_Qwen35MtpMixin, _Qwen35MRopeMixin, _LinearAttentionVReorderBase):
diff --git a/conversion/talkie.py b/conversion/talkie.py
new file mode 100644
index 00000000000..a970b32d3bf
--- /dev/null
+++ b/conversion/talkie.py
@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+from typing import Iterable, TYPE_CHECKING
+
+import torch
+
+if TYPE_CHECKING:
+    from torch import Tensor
+
+from .base import LazyTorchTensor, ModelBase, TextModel, gguf
+
+
+@ModelBase.register("TalkieForCausalLM")
+class TalkieModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.TALKIE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        # Talkie used F.rms_norm without an explicit eps
+        self.gguf_writer.add_layer_norm_rms_eps(torch.finfo(torch.float32).eps)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        prefix = f"model.blocks.{bid}." if bid is not None else ""
+        suffix = name.removeprefix(prefix)
+
+        if suffix == "attn_gain.a_g":
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid, ".scale"), data_torch
+            return
+        elif suffix == "mlp_gain.a_g":
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid, ".scale"), data_torch
+            return
+        elif suffix == "lm_head_gain.w_g":
+            self.gguf_writer.add_logit_scale(LazyTorchTensor.to_eager(data_torch).item())
+            return
+        elif suffix in ("attn.attn_query.weight", "attn.attn_key.weight"):
+            # absorb inverse rope
+            head_dim = self.hparams["head_dim"]
+            shape = data_torch.shape
+            data_torch = torch.reshape(data_torch, (-1, head_dim, shape[-1]))
+            signs = torch.ones((1, head_dim, 1), dtype=data_torch.dtype)
+            signs[:, head_dim // 2 :, :] = -1
+            if self.lazy:
+                signs = LazyTorchTensor.from_eager(signs)
+            # (n_head, head_dim, n_in) -> (n_out, n_in)
+            data_torch = torch.reshape(data_torch * signs, shape)
+        elif suffix == "attn.head_gain.head_g":
+            # allow head gain to broadcast
+            data_torch = data_torch.unsqueeze(-1)
+
+        if not name.endswith(".weight"):
+            name += ".weight"
+
+        yield from super().modify_tensors(data_torch, name, bid)
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index ff840050861..1d18a1bf91f 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -115,15 +115,15 @@ def parse_args() -> argparse.Namespace:
     )
     parser.add_argument(
         "--mmproj", action="store_true",
-        help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.",
+        help="Export multimodal projector (mmproj) for vision models. This will only work on some vision models. An 'mmproj-' prefix will be added to the output file name.",
     )
     parser.add_argument(
         "--mtp", action="store_true",
-        help="(Experimental) Export only the multi-token prediction (MTP) head as a separate GGUF, suitable for use as a speculative draft. Output file name will get a '-MTP' suffix.",
+        help="Export only the multi-token prediction (MTP) head as a separate GGUF, suitable for use as a speculative draft. An 'mtp-' prefix will be added to the output file name.",
     )
     parser.add_argument(
         "--no-mtp", action="store_true",
-        help="(Experimental) Exclude the multi-token prediction (MTP) head from the converted GGUF. Pair with --mtp on a second run to publish trunk and MTP as two files. Note: the split form duplicates embeddings, so the bundled default is more space-efficient overall.",
+        help="Exclude the multi-token prediction (MTP) head from the converted GGUF. Pair with --mtp on a second run to publish trunk and MTP as two files. Note: the split form duplicates embeddings, but even though the bundled default is more space-efficient overall, this allows differing quantization which may be more performant.",
     )
     parser.add_argument(
         "--mistral-format", action="store_true",
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 8b2a9454f98..66aa1cb2fc0 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -156,6 +156,8 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "kanana2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
     {"name": "f2llmv2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
     {"name": "sarvam-moe",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
+    {"name": "talkie",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/lewtun/talkie-1930-13b-it-hf", },
+    {"name": "minicpm5",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM5-1B"},
 ]
 
 # some models are known to be broken upstream, so we will skip them as exceptions
diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py
index 81658ba03d8..9a6437beab1 100755
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -208,6 +208,16 @@ def split(self, split_size: int | Sequence[int], dim: int = 0) -> tuple[LoraTorc
     def to(self, *args, **kwargs):
         return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))
 
+    def __mul__(self, other) -> LoraTorchTensor:
+        # Only output-side multiplication for now
+        # W = B @ A, so M_out * W == (M_out * B) @ A
+        if not isinstance(other, (int, float)) and other.shape and other.shape[-1] != 1:
+            raise NotImplementedError
+        return LoraTorchTensor(self._lora_A, self._lora_B * other)
+
+    def __rmul__(self, other) -> LoraTorchTensor:
+        return self * other
+
     @classmethod
     def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
         del types  # unused
diff --git a/docs/autoparser.md b/docs/autoparser.md
index adc4d43ed67..33ede1a2282 100644
--- a/docs/autoparser.md
+++ b/docs/autoparser.md
@@ -459,7 +459,7 @@ Each returned parser is wrapped by `wrap_for_generation_prompt()`, which prepend
 
 - Usage: `./bin/llama-template-analysis path/to/template.jinja`
 
-**Debug Logging**: Enable with `LLAMA_LOG_VERBOSITY=2`
+**Debug Logging**: Enable with `LLAMA_ARG_LOG_VERBOSITY=2`
 
 - Shows detailed analysis steps, pattern extraction results, and generated parser structure
 
@@ -489,6 +489,7 @@ The following templates have active tests in `tests/test-chat.cpp`:
 | Qwen-QwQ-32B | Reasoning | Forced-open thinking |
 | NousResearch Hermes 2 Pro | JSON_NATIVE | `<tool_call>` wrapper |
 | IBM Granite 3.3 | JSON_NATIVE | `<think></think>` + `<response></response>` |
+| IBM Granite 4.0 | JSON_NATIVE | `<tool_call>` wrapper (same template used by 4.1) |
 | ByteDance Seed-OSS | TAG_WITH_TAGGED | Custom `<seed:think>` and `<seed:tool_call>` tags |
 | Qwen3-Coder | TAG_WITH_TAGGED | XML-style tool format |
 | DeepSeek V3.1 | JSON_NATIVE | Forced thinking mode |
diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 0c4660b541c..8059d6e420e 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -743,6 +743,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. |
 | GGML_SYCL_ENABLE_LEVEL_ZERO | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO=ON at build time. |
 | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
+| GGML_SYCL_ENABLE_VMM | 0 or 1 (default) | Enable the virtual-memory device pool. |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
 | UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Allow SYCL/Unified Runtime Level Zero device allocations larger than 4 GiB. llama.cpp's direct Level Zero allocation path requests the relaxed maximum-size limit itself when GGML_SYCL_ENABLE_LEVEL_ZERO=1. |
 
@@ -753,6 +754,7 @@ Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spo
 | Name            | Function                                                                         |
 |-----------------|----------------------------------------------------------------------------------|
 | DEBUG_SYCL_POOL | Enable device memory pool logging on teardown. Useful for profiling allocations. |
+| DEBUG_SYCL_MALLOC | Enable verbose per-call logging of device pool alloc/free operations. |
 
 ## Design Rule
 
diff --git a/docs/backend/snapdragon/CMakeUserPresets.json b/docs/backend/snapdragon/CMakeUserPresets.json
index c07bf5ca0c6..d37100764f1 100644
--- a/docs/backend/snapdragon/CMakeUserPresets.json
+++ b/docs/backend/snapdragon/CMakeUserPresets.json
@@ -10,8 +10,8 @@
             "ANDROID_ABI":      "arm64-v8a",
             "ANDROID_PLATFORM": "android-31",
             "CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake",
-            "CMAKE_C_FLAGS":   "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
-            "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
+            "CMAKE_C_FLAGS":   "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
+            "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
             "CMAKE_C_FLAGS_RELEASE":          "-O3 -DNDEBUG",
             "CMAKE_CXX_FLAGS_RELEASE":        "-O3 -DNDEBUG",
             "CMAKE_C_FLAGS_RELWITHDEBINFO":   "-O3 -DNDEBUG -g",
@@ -33,8 +33,8 @@
         "name": "arm64-windows-snapdragon",
         "inherits": [ "base", "arm64-windows-llvm" ],
         "cacheVariables": {
-            "CMAKE_C_FLAGS":   "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
-            "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
+            "CMAKE_C_FLAGS":   "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
+            "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
             "CMAKE_C_FLAGS_RELEASE":          "-O3 -DNDEBUG",
             "CMAKE_CXX_FLAGS_RELEASE":        "-O3 -DNDEBUG",
             "CMAKE_C_FLAGS_RELWITHDEBINFO":   "-O3 -DNDEBUG -g",
@@ -59,8 +59,8 @@
         "toolset":      { "value": "host=x86_64", "strategy": "external" },
         "cacheVariables": {
             "CMAKE_TOOLCHAIN_FILE": "cmake/arm64-linux-clang.cmake",
-            "CMAKE_C_FLAGS":   "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE",
-            "CMAKE_CXX_FLAGS": "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE",
+            "CMAKE_C_FLAGS":   "-march=armv8.2a+fp16+dotprod -fvectorize -fno-finite-math-only -flto -D_GNU_SOURCE",
+            "CMAKE_CXX_FLAGS": "-march=armv8.2a+fp16+dotprod -fvectorize -fno-finite-math-only -flto -D_GNU_SOURCE",
             "CMAKE_C_FLAGS_RELEASE":          "-O3 -DNDEBUG",
             "CMAKE_CXX_FLAGS_RELEASE":        "-O3 -DNDEBUG",
             "CMAKE_C_FLAGS_RELWITHDEBINFO":   "-O3 -DNDEBUG -g",
diff --git a/docs/backend/snapdragon/README.md b/docs/backend/snapdragon/README.md
index 2414eeaf6a4..e9f0e215858 100644
--- a/docs/backend/snapdragon/README.md
+++ b/docs/backend/snapdragon/README.md
@@ -10,7 +10,7 @@ This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc.
 This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop.
 
 ```
-~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.3
+~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.7
 [d]/> cd /workspace
 ```
 
@@ -24,7 +24,7 @@ Native Windows 11 arm64 builds has the following tools dependencies:
   - UCRT and Driver Kit
 - LLVM core libraries and Clang compiler (winget)
 - CMake, Git, Python (winget)
-- Hexagon SDK Community Edition 6.4 or later (see windows.md)
+- Hexagon SDK Community Edition 6.6 or later (see windows.md)
 - OpenCL SDK 2.3 or later (see windows.md)
 
 Note: The rest of the **Windows** build process assumes that you're running natively in Powershell.
@@ -45,7 +45,7 @@ Preset CMake variables:
   GGML_HEXAGON="ON"
   GGML_OPENCL="ON"
   GGML_OPENMP="OFF"
-  HEXAGON_SDK_ROOT="/opt/hexagon/6.4.0.2"
+  HEXAGON_SDK_ROOT="/opt/hexagon/6.6.0.0"
 ...
 -- Including OpenCL backend
 -- Including Hexagon backend
diff --git a/docs/backend/snapdragon/windows.md b/docs/backend/snapdragon/windows.md
index 6307e1b69f1..aa731413c90 100644
--- a/docs/backend/snapdragon/windows.md
+++ b/docs/backend/snapdragon/windows.md
@@ -28,15 +28,15 @@ c:\Qualcomm\OpenCL_SDK\2.3.2
 
 Either use the trimmed down version (optimized for CI) from
 
-    https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz
+    https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.6.0.0/hexagon-sdk-v6.6.0.0-arm64-wos.tar.xz
 
 Or download the complete official version from
 
-    https://softwarecenter.qualcomm.com/catalog/item/Hexagon_SDK?version=6.4.0.2
+    https://softwarecenter.qualcomm.com/catalog/item/Hexagon_SDK?version=6.6.0.0
 
 Unzip/untar the archive into
 ```
-c:\Qualcomm\Hexagon_SDK\6.4.0.2
+c:\Qualcomm\Hexagon_SDK\6.6.0.0
 ```
 
 ## Install the latest Adreno GPU driver
@@ -123,10 +123,10 @@ The overall Hexagon backend build procedure for Windows on Snapdragon is the sam
 However, additional settings are required for generating and signing HTP Ops libraries.
 ```
 > $env:OPENCL_SDK_ROOT="C:\Qualcomm\OpenCL_SDK\2.3.2"
-> $env:HEXAGON_SDK_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2"
-> $env:HEXAGON_TOOLS_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2\tools\HEXAGON_Tools\19.0.04"
+> $env:HEXAGON_SDK_ROOT="C:\Qualcomm\Hexagon_SDK\6.6.0.0"
+> $env:HEXAGON_TOOLS_ROOT="C:\Qualcomm\Hexagon_SDK\6.6.0.0\tools\HEXAGON_Tools\19.0.07"
 > $env:HEXAGON_HTP_CERT="c:\Users\MyUsers\Certs\ggml-htp-v1.pfx"
-> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0\arm64"
+> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0"
 
 > cmake --preset arm64-windows-snapdragon-release -B build-wos
 ...
diff --git a/docs/build-riscv64-spacemit.md b/docs/build-riscv64-spacemit.md
index 08301d51cae..7a9a1f3ad4e 100644
--- a/docs/build-riscv64-spacemit.md
+++ b/docs/build-riscv64-spacemit.md
@@ -5,7 +5,7 @@
 
 1. Prepare Toolchain For RISCV
 ~~~
-wget https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v1.1.2.tar.xz
+wget https://github.com/spacemit-com/toolchain/releases/download/v1.2.4/spacemit-toolchain-linux-glibc-x86_64-v1.2.4.tar.xz
 ~~~
 
 2. Build
diff --git a/docs/build.md b/docs/build.md
index a18479b3346..7beafbf5f46 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -735,7 +735,7 @@ ninja
 
 To read documentation for how to build on Android, [click here](./android.md)
 
-## WebGPU [In Progress]
+## WebGPU
 
 The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `18eb229`.
 
diff --git a/docs/function-calling.md b/docs/function-calling.md
index 9ede914c04d..850b59ce7aa 100644
--- a/docs/function-calling.md
+++ b/docs/function-calling.md
@@ -291,6 +291,7 @@ Here are some models known to work (w/ chat template override when needed):
 llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M
 llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q6_K_L
 llama-server --jinja -fa -hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M
+llama-server --jinja -fa -hf ibm-granite/granite-4.1-3b-GGUF:Q4_K_M
 
 # Native support for DeepSeek R1 works best w/ our template override (official template is buggy, although we do work around it)
 
diff --git a/docs/multimodal/granitevision.md b/docs/multimodal/granitevision.md
index 3118fe0cdc1..7a63484ffe1 100644
--- a/docs/multimodal/granitevision.md
+++ b/docs/multimodal/granitevision.md
@@ -176,7 +176,7 @@ Note that currently you cannot quantize the visual encoder because granite visio
 
 
 ### 5. Running the Model in Llama cpp
-Build llama cpp normally; you should have a target binary named `llama-mtmd-cli`, which you can pass two binaries to. As an example, we pass the the llama.cpp banner.
+Build llama cpp normally; you should have a target binary named `llama-mtmd-cli`, which you can pass two binaries to. As an example, we pass the llama.cpp banner.
 
 ```bash
 $ ./build/bin/llama-mtmd-cli -m $LLM_GGUF_PATH \
diff --git a/docs/speculative.md b/docs/speculative.md
index 45e42d42a43..041ff58038d 100644
--- a/docs/speculative.md
+++ b/docs/speculative.md
@@ -247,7 +247,7 @@ Specifies a comma-separated list of speculative decoding types to use.
 |------|-------------|
 | `none` | No speculative decoding (default) |
 | `draft-simple` | Use a simple draft model for speculation |
-| `draft-mtp` | Use Masked Token Prediction (MTP) heads from the main model |
+| `draft-mtp` | Use Multi Token Prediction (MTP) heads from the main model |
 | `ngram-cache` | Use n-gram cache lookup |
 | `ngram-simple` | Use simple n-gram pattern matching |
 | `ngram-map-k` | Use n-gram pattern matching with n-gram-keys |
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index a29dc707c3d..39f802d250e 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -27,7 +27,6 @@ else()
     add_subdirectory(parallel)
     add_subdirectory(passkey)
     add_subdirectory(retrieval)
-    add_subdirectory(save-load-state)
     add_subdirectory(simple)
     add_subdirectory(simple-chat)
     add_subdirectory(speculative)
diff --git a/examples/convert_legacy_llama.py b/examples/convert_legacy_llama.py
index c4ec5c524e9..5c9305b1237 100755
--- a/examples/convert_legacy_llama.py
+++ b/examples/convert_legacy_llama.py
@@ -1308,7 +1308,8 @@ def do_dump_model(model_plus: ModelPlus) -> None:
 
 def main(args_in: list[str] | None = None) -> None:
     output_choices = ["f32", "f16"]
-    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
+    dummy_val = np.uint32(1)
+    if dummy_val == dummy_val.view(dummy_val.dtype.newbyteorder("<")):
         # We currently only support Q8_0 output on little endian systems.
         output_choices.append("q8_0")
     parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
diff --git a/examples/llama.android/lib/build.gradle.kts b/examples/llama.android/lib/build.gradle.kts
index 9b290d6d4a7..ae95f41a838 100644
--- a/examples/llama.android/lib/build.gradle.kts
+++ b/examples/llama.android/lib/build.gradle.kts
@@ -25,6 +25,7 @@ android {
                 arguments += "-DCMAKE_VERBOSE_MAKEFILE=ON"
 
                 arguments += "-DBUILD_SHARED_LIBS=ON"
+                arguments += "-DLLAMA_BUILD_APP=OFF"
                 arguments += "-DLLAMA_BUILD_COMMON=ON"
                 arguments += "-DLLAMA_OPENSSL=OFF"
 
diff --git a/examples/model-conversion/README.md b/examples/model-conversion/README.md
index c43e642fee7..344f0ac94a9 100644
--- a/examples/model-conversion/README.md
+++ b/examples/model-conversion/README.md
@@ -335,7 +335,7 @@ $ make perplexity-run-full QUANTIZED_MODEL=~/path/to/quantized/model-Qxx.gguf LO
 
 ## HuggingFace utilities
 The following targets are useful for creating collections and model repositories
-on Hugging Face in the the ggml-org. These can be used when preparing a release
+on Hugging Face in the ggml-org. These can be used when preparing a release
 to script the process for new model releases.
 
 For the following targets a `HF_TOKEN` environment variable is required.
diff --git a/examples/model-conversion/scripts/embedding/run-original-model.py b/examples/model-conversion/scripts/embedding/run-original-model.py
index 614c1a86b9a..001d5889655 100755
--- a/examples/model-conversion/scripts/embedding/run-original-model.py
+++ b/examples/model-conversion/scripts/embedding/run-original-model.py
@@ -64,7 +64,7 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device
         print("Using SentenceTransformer to apply all numbered layers")
         model = SentenceTransformer(model_path)
         tokenizer = model.tokenizer
-        config = model[0].auto_model.config
+        config = model[0].auto_model.config  # ty: ignore[unresolved-attribute]
     else:
         tokenizer = AutoTokenizer.from_pretrained(model_path)
         config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt
deleted file mode 100644
index 78024672e77..00000000000
--- a/examples/save-load-state/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET llama-save-load-state)
-add_executable(${TARGET} save-load-state.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 4aac5094d1c..f542f18b6d4 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
 
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
-set(GGML_VERSION_MINOR 12)
+set(GGML_VERSION_MINOR 13)
 set(GGML_VERSION_PATCH 0)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
 
diff --git a/ggml/cmake/ggml-config.cmake.in b/ggml/cmake/ggml-config.cmake.in
index 91c9d5cd343..23a3066f56d 100644
--- a/ggml/cmake/ggml-config.cmake.in
+++ b/ggml/cmake/ggml-config.cmake.in
@@ -6,6 +6,7 @@
 include(CMakeFindDependencyMacro)
 find_dependency(Threads)
 if (NOT GGML_SHARED_LIB)
+    set(GGML_BASE_INTERFACE_LINK_LIBRARIES "")
     set(GGML_CPU_INTERFACE_LINK_LIBRARIES "")
     set(GGML_CPU_INTERFACE_LINK_OPTIONS   "")
 
@@ -20,7 +21,15 @@ if (NOT GGML_SHARED_LIB)
 
     if (GGML_OPENMP_ENABLED)
         find_dependency(OpenMP)
-        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+        set(GGML_OPENMP_INTERFACE_LINK_LIBRARIES "")
+        if (TARGET OpenMP::OpenMP_C)
+            list(APPEND GGML_OPENMP_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C)
+        endif()
+        if (TARGET OpenMP::OpenMP_CXX)
+            list(APPEND GGML_OPENMP_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_CXX)
+        endif()
+        list(APPEND GGML_BASE_INTERFACE_LINK_LIBRARIES ${GGML_OPENMP_INTERFACE_LINK_LIBRARIES})
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${GGML_OPENMP_INTERFACE_LINK_LIBRARIES})
     endif()
 
     if (GGML_CPU_HBM)
@@ -122,7 +131,8 @@ if(NOT TARGET ggml::ggml)
     add_library(ggml::ggml-base UNKNOWN IMPORTED)
     set_target_properties(ggml::ggml-base
         PROPERTIES
-            IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
+            IMPORTED_LOCATION "${GGML_BASE_LIBRARY}"
+            INTERFACE_LINK_LIBRARIES "${GGML_BASE_INTERFACE_LINK_LIBRARIES}")
 
     set(_ggml_all_targets "")
     if (NOT GGML_BACKEND_DL)
diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
index 78aa059dde3..a7926a21a9a 100644
--- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
@@ -76,6 +76,7 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
 // ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
+// ggml_backend_alloc_ctx_tensors_from_buft returns NULL on failure or if all tensors in ctx are already allocated or zero-sized
 GGML_API size_t                       ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 41566d41aef..f6725265504 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -1189,8 +1189,8 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
-    // a - x
-    // b - dy
+    // a - dy
+    // b - x
     GGML_API struct ggml_tensor * ggml_silu_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h
index 02d5f221c03..67851ba6f16 100644
--- a/ggml/include/gguf.h
+++ b/ggml/include/gguf.h
@@ -76,10 +76,16 @@ extern "C" {
         struct ggml_context ** ctx;
     };
 
+    // callback to simulate or wrap a FILE pointer - read up to `len` bytes at `offset` into `output` and return the number of bytes read
+    typedef size_t (*gguf_reader_callback_t)(void * userdata, void * output, uint64_t offset, size_t len);
+
     GGML_API struct gguf_context * gguf_init_empty(void);
     GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params);
     GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
-    //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+    GGML_API struct gguf_context * gguf_init_from_buffer(const void * data, size_t size, struct gguf_init_params params);
+
+    // max_chunk_read is the maximum number of bytes that the GGUF code will read at once from the callback, a value of 0 means no limit
+    GGML_API struct gguf_context * gguf_init_from_callback(gguf_reader_callback_t callback, void * userdata, size_t max_chunk_read, uint64_t max_expected_size, struct gguf_init_params params);
 
     GGML_API void gguf_free(struct gguf_context * ctx);
 
@@ -87,7 +93,7 @@ extern "C" {
 
     GGML_API uint32_t gguf_get_version    (const struct gguf_context * ctx);
     GGML_API size_t   gguf_get_alignment  (const struct gguf_context * ctx);
-    GGML_API size_t   gguf_get_data_offset(const struct gguf_context * ctx);
+    GGML_API size_t   gguf_get_data_offset(const struct gguf_context * ctx);  // padded to gguf_get_alignment if and only if the gguf_context contains at least one tensor
 
     GGML_API int64_t      gguf_get_n_kv(const struct gguf_context * ctx);
     GGML_API int64_t      gguf_find_key(const struct gguf_context * ctx, const char * key); // returns -1 if key is not found
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 3e48860bfc8..c26c3f1470d 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -222,6 +222,23 @@ if (GGML_SCHED_NO_REALLOC)
     target_compile_definitions(ggml-base PUBLIC GGML_SCHED_NO_REALLOC)
 endif()
 
+if (GGML_OPENMP)
+    find_package(OpenMP)
+    if (OpenMP_FOUND)
+        set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "")
+    else()
+        set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
+        message(WARNING "OpenMP not found")
+    endif()
+else()
+    set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
+endif()
+
+if (GGML_OPENMP_ENABLED)
+    target_compile_definitions(ggml-base PRIVATE GGML_USE_OPENMP)
+    target_link_libraries(ggml-base PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+endif()
+
 add_library(ggml
             ggml-backend-dl.cpp
             ggml-backend-reg.cpp)
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index a4b01ccf8a1..3bda9abbe03 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -150,7 +150,7 @@ static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t o
 
 static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
     // shift all elements after idx by 1 to the left, overwriting the element at idx
-    for (int i = idx; i < chunk->n_free_blocks; i++) {
+    for (int i = idx; i < chunk->n_free_blocks - 1; i++) {
         chunk->free_blocks[i] = chunk->free_blocks[i+1];
     }
     chunk->n_free_blocks--;
diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp
index df0f405ed9f..d0d64523b4a 100644
--- a/ggml/src/ggml-backend-meta.cpp
+++ b/ggml/src/ggml-backend-meta.cpp
@@ -13,6 +13,7 @@
 #include <cstring>
 #include <map>
 #include <memory>
+#include <set>
 #include <string>
 #include <tuple>
 #include <utility>
@@ -392,64 +393,100 @@ static ggml_backend_buffer_type_t ggml_backend_meta_device_get_host_buffer_type(
 // meta backend buffer
 //
 
+// Container to hold the tensor slices per simple ggml backend buffer.
+struct ggml_backend_meta_simple_tensor_container {
+    std::vector<ggml_context_ptr> ctxs;
+    std::map<const ggml_tensor *, std::vector<ggml_tensor *>> simple_tensors;
+
+    ggml_backend_meta_simple_tensor_container(const ggml_init_params & params, const int n_simple) {
+        ctxs.reserve(n_simple);
+        for (int i = 0; i < n_simple; i++) {
+            ctxs.emplace_back(ggml_init(params));
+        }
+    }
+    ggml_backend_meta_simple_tensor_container() {}
+};
+
 struct ggml_backend_meta_buffer_context {
+    // FIXME
+    // Most tensors can simply be stored statically in their own buffer.
+    // Externally created views however also need a mapping to simple tensors but they use the buffer of the view source.
+    // If external views are simply using that buffer they will slowly deplete its memory.
+    // Current solution: rotating set of 2 "compute" containers to hold external views, works correctly for llama.cpp.
+    // Long-term: tie the lifetime of external views to the meta backend executing the graph instead,
+    //     currently not possible due to graph-external operations in the backend scheduler.
+    ggml_backend_meta_simple_tensor_container stc_static;
+    ggml_backend_meta_simple_tensor_container stc_compute[2];
+    int stc_compute_index      = 0;
+    int stc_compute_index_next = 0;
+    std::vector<ggml_backend_buffer_ptr> bufs;
+
+    // FIXME
+    // The size of the split state cache is unbounded and can theoretically grow infinitely large.
+    // However, it is also expensive to build and clearing it on every rebuild in ggml_backend_meta_graph_compute is too expensive.
     static constexpr size_t nbtc = GGML_TENSOR_SIZE - sizeof(ggml_tensor::padding);
-
     std::map<std::pair<const ggml_tensor *, bool>, std::pair<ggml_backend_meta_split_state, char[nbtc]>> split_state_cache;
-    std::map<          const ggml_tensor *,        std::vector<ggml_tensor *>>                           simple_tensors;
-
-    struct buffer_config {
-        ggml_context          * ctx;
-        ggml_backend_buffer_t   buf;
-
-        buffer_config(ggml_context * ctx, ggml_backend_buffer_t buf) : ctx(ctx), buf(buf) {}
-    };
-    std::vector<buffer_config> buf_configs;
 
     int debug;
 
-    ggml_backend_meta_buffer_context() {
+    ggml_backend_meta_buffer_context(
+            ggml_backend_meta_simple_tensor_container & stc_static,
+            ggml_backend_meta_simple_tensor_container & stc_compute_0,
+            ggml_backend_meta_simple_tensor_container & stc_compute_1,
+            const std::vector<ggml_backend_buffer_t> & bufs)
+            : stc_static(std::move(stc_static)), stc_compute{std::move(stc_compute_0), std::move(stc_compute_1)} {
+        this->bufs.reserve(bufs.size());
+        for (ggml_backend_buffer_t buf : bufs) {
+            this->bufs.emplace_back(buf);
+        }
         const char * GGML_META_DEBUG = getenv("GGML_META_DEBUG");
         debug = GGML_META_DEBUG ? atoi(GGML_META_DEBUG) : 0;
     }
+
+    ggml_backend_meta_simple_tensor_container & get_simple_tensor_container(const ggml_tensor * tensor) {
+        if (stc_static.simple_tensors.find(tensor) != stc_static.simple_tensors.end()) {
+            return stc_static;
+        }
+        return stc_compute[stc_compute_index];
+    }
 };
 
 static void ggml_backend_meta_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
     ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
-    for (auto & [ctx, buf] : buf_ctx->buf_configs) {
-        ggml_backend_buffer_free(buf);
-        ggml_free(ctx);
-    }
     delete buf_ctx;
 }
 
 static size_t ggml_backend_meta_buffer_n_bufs(ggml_backend_buffer_t meta_buf) {
     GGML_ASSERT(ggml_backend_buffer_is_meta(meta_buf));
     ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) meta_buf->context;
-    return buf_ctx->buf_configs.size();
+    return buf_ctx->bufs.size();
 }
 
 static ggml_backend_buffer_t ggml_backend_meta_buffer_simple_buffer(ggml_backend_buffer_t meta_buf, size_t index) {
     GGML_ASSERT(ggml_backend_buffer_is_meta(meta_buf));
     ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) meta_buf->context;
-    GGML_ASSERT(index < buf_ctx->buf_configs.size());
-    return buf_ctx->buf_configs[index].buf;
+    GGML_ASSERT(index < buf_ctx->bufs.size());
+    return buf_ctx->bufs[index].get();
 }
 
 static struct ggml_tensor * ggml_backend_meta_buffer_simple_tensor(const struct ggml_tensor * tensor, size_t index) {
     GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer));
     ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
-    GGML_ASSERT(index < buf_ctx->buf_configs.size());
+    GGML_ASSERT(index < buf_ctx->bufs.size());
 
-    auto it = buf_ctx->simple_tensors.find(tensor);
-    if (it == buf_ctx->simple_tensors.end()) {
+    ggml_backend_meta_simple_tensor_container & stc = buf_ctx->get_simple_tensor_container(tensor);
+    auto it = stc.simple_tensors.find(tensor);
+    if (it == stc.simple_tensors.end()) {
         return nullptr;
     }
     return it->second[index];
 }
 
-static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) {
+static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync);
+
+static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
+        ggml_backend_meta_simple_tensor_container & stc, const struct ggml_tensor * tensor, bool assume_sync) {
     const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer);
     ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
 
@@ -785,7 +822,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
                 src_ss[i] = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
                 continue;
             }
-            src_ss[i] = ggml_backend_meta_get_split_state(tensor->src[i], /*assume_sync =*/ true);
+            src_ss[i] = ggml_backend_meta_get_split_state(stc, tensor->src[i], /*assume_sync =*/ true);
             GGML_ASSERT(src_ss[i].axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN);
         }
 
@@ -1079,17 +1116,23 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
     return ret;
 }
 
+static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) {
+    GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer));
+    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
+    return ggml_backend_meta_get_split_state(buf_ctx->get_simple_tensor_container(tensor), tensor, assume_sync);
+}
+
 static void * ggml_backend_meta_buffer_get_base(ggml_backend_buffer_t buffer) {
     GGML_UNUSED(buffer);
     return (void *) 0x1000000000000000; // FIXME
 }
 
-static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-    GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
-    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
-    const size_t n_simple_bufs = ggml_backend_meta_buffer_n_bufs(buffer);
+static enum ggml_status ggml_backend_meta_buffer_init_tensor_impl(ggml_backend_meta_simple_tensor_container & stc, ggml_tensor * tensor) {
+    GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer));
+    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
+    const size_t n_simple_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer);
 
-    const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ true);
+    const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(stc, tensor, /*assume_sync =*/ true);
     GGML_ASSERT(ggml_nelements(tensor) == 0 || split_state.axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN);
     GGML_ASSERT(split_state.n_segments <= 16);
 
@@ -1104,8 +1147,8 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
     std::vector<ggml_tensor *> simple_tensors;
     simple_tensors.reserve(n_simple_bufs);
     for (size_t j = 0; j < n_simple_bufs; j++) {
-        ggml_context          * simple_ctx = buf_ctx->buf_configs[j].ctx;
-        ggml_backend_buffer_t   simple_buf = buf_ctx->buf_configs[j].buf;
+        ggml_context          * simple_ctx = stc.ctxs[j].get();
+        ggml_backend_buffer_t   simple_buf = buf_ctx->bufs[j].get();
 
         if (split_dim >= 0 && split_dim < GGML_MAX_DIMS) {
             // TODO: the following assert fails for llama-parallel even though the results are correct:
@@ -1158,7 +1201,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
             t_ij->data = (char *) t_ij->view_src->data + t_ij->view_offs;
         } else if (simple_buf != nullptr) {
             t_ij->data = (char *) ggml_backend_buffer_get_base(simple_buf)
-                + size_t(tensor->data) - size_t(ggml_backend_buffer_get_base(buffer));
+                + size_t(tensor->data) - size_t(ggml_backend_buffer_get_base(tensor->buffer));
         }
         t_ij->extra = tensor->extra;
         for (int i = 0; i < GGML_MAX_SRC; i++) {
@@ -1194,11 +1237,18 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
         }
     }
 
-    buf_ctx->simple_tensors[tensor] = simple_tensors;
+    stc.simple_tensors[tensor] = simple_tensors;
 
     return GGML_STATUS_SUCCESS;
 }
 
+static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
+    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
+    buf_ctx->stc_compute_index = buf_ctx->stc_compute_index_next;
+    return ggml_backend_meta_buffer_init_tensor_impl(buf_ctx->get_simple_tensor_container(tensor), tensor);
+}
+
 static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(buffer);
     GGML_ASSERT(ggml_is_contiguous(tensor));
@@ -1275,6 +1325,9 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
             for (size_t j = 0; j < n_bufs; j++) {
                 ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
                 const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
+                if (chunk_size_j == 0) {
+                    continue;
+                }
                 const size_t simple_offset = i_start * chunk_size_j;
                 ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_j, simple_offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full);
                 offset_j += chunk_size_j;
@@ -1382,6 +1435,9 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
             for (size_t j = 0; j < n_bufs; j++){
                 const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
                 const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
+                if (chunk_size_j == 0) {
+                    continue;
+                }
                 const size_t simple_offset = i_start * chunk_size_j;
                 ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_j, simple_offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full);
                 offset_j += chunk_size_j;
@@ -1407,8 +1463,9 @@ static void ggml_backend_meta_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
 }
 
 static void ggml_backend_meta_buffer_reset(ggml_backend_buffer_t buffer) {
-    const size_t n_buffers = ggml_backend_meta_buffer_n_bufs(buffer);
-    for (size_t i = 0; i < n_buffers; i++) {
+    GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
+    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
+    for (size_t i = 0; i < buf_ctx->bufs.size(); i++) {
         ggml_backend_buffer_reset(ggml_backend_meta_buffer_simple_buffer(buffer, i));
     }
 }
@@ -1434,20 +1491,24 @@ bool ggml_backend_buffer_is_meta(ggml_backend_buffer_t buf) {
 static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft);
 
-    ggml_init_params params = {
-        /*.mem_size   =*/ 1024*1024*1024, // FIXME
+    const ggml_init_params params = {
+        /*.mem_size   =*/ 1024*1024*ggml_tensor_overhead(), // FIXME
         /*.mem_buffer =*/ nullptr,
         /*.no_alloc   =*/ true,
     };
+    ggml_backend_meta_simple_tensor_container stc_static;
+    ggml_backend_meta_simple_tensor_container stc_compute_0(params, n_simple_bufts);
+    ggml_backend_meta_simple_tensor_container stc_compute_1(params, n_simple_bufts);
 
-    ggml_backend_meta_buffer_context * buf_ctx = new ggml_backend_meta_buffer_context();
     size_t max_size = 0;
-    buf_ctx->buf_configs.reserve(n_simple_bufts);
+    std::vector<ggml_backend_buffer_t> bufs;
+    bufs.reserve(n_simple_bufts);
     for (size_t i = 0; i < n_simple_bufts; i++) {
-        ggml_backend_buffer_t simple_buf = ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size);
-        max_size = std::max(max_size, ggml_backend_buffer_get_size(simple_buf));
-        buf_ctx->buf_configs.emplace_back(ggml_init(params), simple_buf);
+        bufs.push_back(ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size));
+        GGML_ASSERT(bufs.back() != nullptr);
+        max_size = std::max(max_size, ggml_backend_buffer_get_size(bufs.back()));
     }
+    ggml_backend_meta_buffer_context * buf_ctx = new ggml_backend_meta_buffer_context(stc_static, stc_compute_0, stc_compute_1, bufs);
 
     return ggml_backend_buffer_init(buft, ggml_backend_meta_buffer_iface, buf_ctx, max_size);
 }
@@ -1455,28 +1516,53 @@ static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_bac
 struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
     const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft);
 
-    ggml_init_params params = {
-        /*.mem_size   =*/ 1024*1024*1024, // FIXME
+    constexpr size_t compute_headroom = 16; // Maximum number of views per statically allocated tensor that can be created between evals.
+    const ggml_init_params params_static = {
+        /*.mem_size   =*/ ggml_get_mem_size(ctx),
+        /*.mem_buffer =*/ nullptr,
+        /*.no_alloc   =*/ true,
+    };
+    const ggml_init_params params_compute = {
+        /*.mem_size   =*/ compute_headroom*ggml_get_mem_size(ctx),
         /*.mem_buffer =*/ nullptr,
         /*.no_alloc   =*/ true,
     };
+    ggml_backend_meta_simple_tensor_container stc_static   (params_static,  n_simple_bufts);
+    ggml_backend_meta_simple_tensor_container stc_compute_0(params_compute, n_simple_bufts);
+    ggml_backend_meta_simple_tensor_container stc_compute_1(params_compute, n_simple_bufts);
 
-    ggml_backend_meta_buffer_context * meta_buf_ctx = new ggml_backend_meta_buffer_context();
-    meta_buf_ctx->buf_configs.reserve(n_simple_bufts);
-    for (size_t i = 0; i < n_simple_bufts; i++) {
-        meta_buf_ctx->buf_configs.emplace_back(ggml_init(params), nullptr);
-    }
+    std::vector<ggml_backend_buffer_t> bufs(n_simple_bufts, nullptr);
+    ggml_backend_meta_buffer_context * meta_buf_ctx = new ggml_backend_meta_buffer_context(stc_static, stc_compute_0, stc_compute_1, bufs);
 
     ggml_backend_buffer_t meta_buf = ggml_backend_buffer_init(buft, ggml_backend_meta_buffer_iface, meta_buf_ctx, 0);
     for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
         t->buffer = meta_buf;
-        ggml_backend_meta_buffer_init_tensor(meta_buf, t);
+        ggml_backend_meta_buffer_init_tensor_impl(meta_buf_ctx->stc_static, t);
         t->data = (void *) 0x2000000000000000; // FIXME
     }
     for (size_t i = 0; i < n_simple_bufts; i++) {
-        meta_buf_ctx->buf_configs[i].buf = ggml_backend_alloc_ctx_tensors_from_buft(
-            meta_buf_ctx->buf_configs[i].ctx, ggml_backend_meta_buft_simple_buft(buft, i));
-        meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->buf_configs[i].buf));
+        ggml_context * ctx = meta_buf_ctx->stc_static.ctxs[i].get();
+        ggml_backend_buffer_type_t simple_buft = ggml_backend_meta_buft_simple_buft(buft, i);
+
+        // If a ggml_context only has zero-sized tensors, ggml_backend_alloc_ctx_tensors_from_buft returns NULL.
+        // For those edge cases, allocate a dummy buffer instead.
+        bool any_nonzero_slice = false;
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+            if (ggml_nelements(t) != 0) {
+                any_nonzero_slice = true;
+                break;
+            }
+        }
+        if (any_nonzero_slice) {
+            meta_buf_ctx->bufs[i].reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx, simple_buft));
+        } else {
+            meta_buf_ctx->bufs[i].reset(ggml_backend_buft_alloc_buffer(simple_buft, 0));
+            for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+                t->buffer = meta_buf_ctx->bufs[i].get();
+            }
+        }
+        GGML_ASSERT(meta_buf_ctx->bufs[i]);
+        meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->bufs[i].get()));
     }
     return meta_buf;
 }
@@ -1605,6 +1691,9 @@ static void ggml_backend_meta_set_tensor_async(ggml_backend_t backend, ggml_tens
                 ggml_backend_t simple_backend = ggml_backend_meta_simple_backend(backend, j);
                 ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
                 const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
+                if (chunk_size_j == 0) {
+                    continue;
+                }
                 ggml_backend_tensor_set_2d_async(simple_backend, simple_tensor, (const char *) data + offset_j, offset, chunk_size_j,
                     i_stop - i_start, chunk_size_j, chunk_size_full);
                 offset_j += chunk_size_j;
@@ -1646,6 +1735,9 @@ static void ggml_backend_meta_get_tensor_async(ggml_backend_t backend, const ggm
                 ggml_backend_t simple_backend = ggml_backend_meta_simple_backend(backend, j);
                 const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
                 const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
+                if (chunk_size_j == 0) {
+                    continue;
+                }
                 ggml_backend_tensor_get_2d_async(simple_backend, simple_tensor, (char *) data + offset_j, offset, chunk_size_j,
                     i_stop - i_start, chunk_size_j, chunk_size_full);
                 offset_j += chunk_size_j;
@@ -1692,6 +1784,26 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
     }
 
     if (needs_rebuild) {
+        std::set<ggml_backend_buffer_t> used_buffers;
+        for (int i = 0; i < cgraph->n_leafs; i++) {
+            if (ggml_backend_buffer_is_meta(cgraph->leafs[i]->buffer)) {
+                used_buffers.emplace(cgraph->leafs[i]->buffer);
+            }
+        }
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            if (ggml_backend_buffer_is_meta(cgraph->nodes[i]->buffer)) {
+                used_buffers.emplace(cgraph->nodes[i]->buffer);
+            }
+        }
+        for (ggml_backend_buffer_t buf : used_buffers) {
+            ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buf->context;
+            buf_ctx->stc_compute_index_next = buf_ctx->stc_compute_index ^ 1;
+            ggml_backend_meta_simple_tensor_container & stc = buf_ctx->stc_compute[buf_ctx->stc_compute_index_next];
+            for (ggml_context_ptr & ctx : stc.ctxs) {
+                ggml_reset(ctx.get());
+            }
+            stc.simple_tensors.clear();
+        }
         size_t n_subgraphs  = 0;
         size_t max_tmp_size = 0;
 
@@ -1877,7 +1989,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
             const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
             const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
             const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
-            ggml_init_params params = {
+            const ggml_init_params params = {
                 /*.mem_size   =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux),
                 /*.mem_buffer =*/ nullptr,
                 /*.no_alloc   =*/ true,
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 4e36909f45e..87615921c09 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -306,7 +306,7 @@ void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_
     GGML_ASSERT(tensor);
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
 
-    if (n_copies <= 1 || backend->iface.set_tensor_2d_async == NULL) {
+    if (n_copies <= 1 || backend->iface.get_tensor_2d_async == NULL) {
         for (size_t i = 0; i < n_copies; i++) {
             ggml_backend_tensor_get_async(backend, tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size);
         }
@@ -317,7 +317,7 @@ void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_
     }
 
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
     backend->iface.get_tensor_2d_async(backend, tensor, data, offset, size, n_copies, stride_tensor, stride_data);
 }
 
@@ -379,7 +379,7 @@ void ggml_backend_tensor_get_2d(const struct ggml_tensor * tensor, void * data,
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
 
-    if (n_copies <= 1 || buf->iface.set_tensor_2d == NULL) {
+    if (n_copies <= 1 || buf->iface.get_tensor_2d == NULL) {
         for (size_t i = 0; i < n_copies; i++) {
             ggml_backend_tensor_get(tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size);
         }
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index f3eccff7d72..8c735a045b3 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -72,17 +72,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         endif()
     endif()
 
-    if (GGML_OPENMP)
-        find_package(OpenMP)
-        if (OpenMP_FOUND)
-            set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "")
-            target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
-
-            target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
-        else()
-            set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
-            message(WARNING "OpenMP not found")
-        endif()
+    if (GGML_OPENMP_ENABLED)
+        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
+        target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
     endif()
 
     if (GGML_LLAMAFILE)
diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
index b54d4a6b107..d3953eee962 100644
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -15,6 +15,7 @@ if (CUDAToolkit_FOUND)
         # 80     == Ampere, asynchronous data loading, faster tensor core instructions
         # 86     == RTX 3000, needs CUDA v11.1
         # 89     == RTX 4000, needs CUDA v11.8
+        # 90     == Hopper H100/200, needs CUDA v11.8
         # 120    == Blackwell, needs CUDA v12.8, FP4 tensor cores
         #
         # XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run
@@ -33,7 +34,7 @@ if (CUDAToolkit_FOUND)
             list(APPEND CMAKE_CUDA_ARCHITECTURES 75-virtual 80-virtual 86-real)
 
             if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
-                list(APPEND CMAKE_CUDA_ARCHITECTURES 89-real)
+                list(APPEND CMAKE_CUDA_ARCHITECTURES 89-real 90-virtual)
             endif()
 
             if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu
index adb4d5f0cb9..c25f42b32bb 100644
--- a/ggml/src/ggml-cuda/binbcast.cu
+++ b/ggml/src/ggml-cuda/binbcast.cu
@@ -2,6 +2,9 @@
 #include <cstdint>
 #include <utility>
 
+template<typename T, size_t>
+using type_for_index = T;
+
 static __device__ __forceinline__ float op_repeat(const float a, const float b) {
     return b;
     GGML_UNUSED(a);
@@ -52,6 +55,7 @@ static __global__ void k_bin_bcast(const src0_t *         src0,
                                    const int              s12,
                                    const int              s13,
                                    src1_ptrs... src1s) {
+    ggml_cuda_pdl_lc();
     const uint32_t i0s = blockDim.x * blockIdx.x + threadIdx.x;
     const uint32_t i1  = (blockDim.y * blockIdx.y + threadIdx.y);
     const uint32_t i2  = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3);
@@ -72,6 +76,7 @@ static __global__ void k_bin_bcast(const src0_t *         src0,
     const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
     dst_t * dst_row = dst + i_dst;
 
+    ggml_cuda_pdl_sync();
     for (int i0 = i0s; i0 < ne0; i0 += blockDim.x * gridDim.x) {
         const uint32_t i10 = fastmodulo(i0, ne10);
 
@@ -141,6 +146,7 @@ static __global__ void k_bin_bcast_unravel(const src0_t *         src0,
 
     const int i10 = fastmodulo(i0, ne10);
 
+    ggml_cuda_pdl_sync();
     float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
     if constexpr (sizeof...(src1_ptrs) > 0) {
         result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
@@ -282,35 +288,24 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
             const uint3 ne1_fastdiv = init_fastdiv_values((uint32_t) ne1);
             const uint3 ne2_fastdiv = init_fastdiv_values((uint32_t) ne2);
 
-            if constexpr (sizeof...(I) > 0) {
-                k_bin_bcast_unravel<bin_op, src0_t, src1_t, dst_t><<<block_num, block_size, 0, stream>>>(
+            {
+                const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)block_num, block_size, 0, stream);
+                ggml_cuda_kernel_launch(k_bin_bcast_unravel<bin_op, src0_t, src1_t, dst_t, type_for_index<const src1_t *, I>...>, launch_params,
                     src0_dd, src1_dd, dst_dd, ne0_fastdiv, ne1_fastdiv, ne2_fastdiv, ne3, prod_012, prod_01, ne10, ne11,
                     ne12, ne13,
                   /*s0,*/ s1,  s2,  s3,
                     s00, s01, s02, s03,
                     s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
-            } else {
-                k_bin_bcast_unravel<bin_op, src0_t, src1_t, dst_t>
-                    <<<block_num, block_size, 0, stream>>>(src0_dd, src1_dd, dst_dd, ne0_fastdiv, ne1_fastdiv,
-                                                           ne2_fastdiv, ne3, prod_012, prod_01, ne10, ne11, ne12, ne13,
-                                                         /*s0,*/ s1,  s2,  s3,
-                                                           s00, s01, s02, s03,
-                                                           s10, s11, s12, s13);
             }
         } else {
             const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3);
-            if constexpr (sizeof...(I) > 0) {
-                k_bin_bcast<bin_op, src0_t, src1_t, dst_t><<<block_nums, block_dims, 0, stream>>>(
+            {
+                const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
+                ggml_cuda_kernel_launch(k_bin_bcast<bin_op, src0_t, src1_t, dst_t, type_for_index<const src1_t *, I>...>, launch_params,
                     src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13,
                   /*s0,*/ s1, s2,  s3,
-                    s00 ,s01, s02, s03,
-                    s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
-            } else {
-                k_bin_bcast<bin_op, src0_t, src1_t, dst_t><<<block_nums, block_dims, 0, stream>>>(
-                    src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13,
-                  /*s0,*/ s1,  s2,  s3,
                     s00, s01, s02, s03,
-                    s10, s11, s12, s13);
+                    s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
             }
         }
     }
@@ -333,6 +328,7 @@ static __global__ void k_repeat_back(
     }
 
     T sum = 0;
+    ggml_cuda_pdl_sync();
     for (int64_t i3 = tid3; i3 < ne03; i3 += ne3) {
         for (int64_t i2 = tid2; i2 < ne02; i2 += ne2) {
             for (int64_t i1 = tid1; i1 < ne01; i1 += ne1) {
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 10817505d9f..50d7763dcdd 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -5,6 +5,7 @@
 #include "ggml-cuda.h"
 
 #include <cstdint>
+#include <cstdlib>
 #include <memory>
 
 #if defined(GGML_USE_HIP)
@@ -27,6 +28,7 @@
 #include <cstdio>
 #include <string>
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
 #if defined(GGML_USE_HIP)
@@ -50,6 +52,7 @@
 #define GGML_CUDA_CC_TURING          750
 #define GGML_CUDA_CC_AMPERE          800
 #define GGML_CUDA_CC_ADA_LOVELACE    890
+#define GGML_CUDA_CC_HOPPER          900
 // While BW spans CC 1000, 1100 & 1200, we are integrating Tensor Core instructions available to 1200 family, see
 // https://docs.nvidia.com/cutlass/media/docs/cpp/blackwell_functionality.html#blackwell-sm120-gemms
 #define GGML_CUDA_CC_BLACKWELL       1200
@@ -107,6 +110,27 @@
 #    define GGML_CUDA_USE_CUB
 #endif  // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
 
+// PDL host-side support (cudaLaunchKernelEx) requires CUDART >= 11.8.
+// However, this has been bugged in CTK < 12.3 for MSVC builds, see
+// https://github.com/ggml-org/llama.cpp/pull/22522#discussion_r3302393293
+// __CUDA_ARCH__  is undefined in host passes; GPU arch check happens in device-side code.
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && \
+    (CUDART_VERSION >= 12030 || (!(defined(_MSC_VER) && !defined(__clang__)) && CUDART_VERSION >= 11080))
+#    define GGML_CUDA_USE_PDL
+#endif  // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && (CUDART_VERSION >= 12030 || (!(defined(_MSC_VER) && !defined(__clang__)) && CUDART_VERSION >= 11080))
+
+static __device__ __forceinline__ void ggml_cuda_pdl_sync() {
+#if defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
+    cudaGridDependencySynchronize();
+#endif // defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
+}
+
+static __device__ __forceinline__ void ggml_cuda_pdl_lc() {
+#if defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
+    cudaTriggerProgrammaticLaunchCompletion();
+#endif // defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
+}
+
 #ifdef __CUDA_ARCH_LIST__
 constexpr bool ggml_cuda_has_arch_impl(int) {
     return false;
@@ -165,6 +189,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
 
 #define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
 
+
 #if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
     static const char * cublas_get_error_str(const cublasStatus_t err) {
         return cublasGetStatusString(err);
@@ -1487,3 +1512,68 @@ struct ggml_cuda_mm_fusion_args_device {
     const void * gate_bias = nullptr;
     ggml_glu_op glu_op;
 };
+
+struct ggml_cuda_kernel_launch_params {
+    dim3 block_nums;
+    dim3 block_dims;
+    size_t shmem;
+    cudaStream_t stream;
+
+    // size_t shmem
+    ggml_cuda_kernel_launch_params(const dim3& block_nums_, const dim3& block_dims_, const size_t shmem_, const cudaStream_t stream_)
+        : block_nums(block_nums_), block_dims(block_dims_), shmem(shmem_), stream(stream_) {}
+
+    // Some call sites pass ints instead of the required size_t. This 2nd constructor casts int->size_t to avoid these -Wnarrowing warnings.
+    ggml_cuda_kernel_launch_params(const dim3& block_nums_, const dim3& block_dims_, const int shmem_, const cudaStream_t stream_)
+        : block_nums(block_nums_), block_dims(block_dims_), shmem((size_t)shmem_), stream(stream_) {}
+};
+
+#if defined(GGML_CUDA_USE_PDL)
+struct ggml_cuda_pdl_config {
+    cudaLaunchAttribute attr;
+    cudaLaunchConfig_t  cfg;
+
+    ggml_cuda_pdl_config(const ggml_cuda_kernel_launch_params & params) {
+        attr.id = cudaLaunchAttributeProgrammaticStreamSerialization;
+        attr.val.programmaticStreamSerializationAllowed = 1;
+
+        cfg = {};
+        cfg.gridDim          = params.block_nums;
+        cfg.blockDim         = params.block_dims;
+        cfg.dynamicSmemBytes = params.shmem;
+        cfg.stream           = params.stream;
+        cfg.attrs            = &attr;
+        cfg.numAttrs         = 1;
+    }
+
+    // Delete due to &attr
+    ggml_cuda_pdl_config(const ggml_cuda_pdl_config&) = delete;
+    ggml_cuda_pdl_config& operator=(const ggml_cuda_pdl_config&) = delete;
+    ggml_cuda_pdl_config& operator=(ggml_cuda_pdl_config&&) = delete;
+
+};
+#endif //defined(GGML_CUDA_USE_PDL)
+
+
+template<typename Kernel, typename... Args>
+static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_kernel_launch_params & launch_params, Args&&... args) {
+#if defined(GGML_CUDA_USE_PDL)
+
+    static const bool env_pdl_enabled = []() {
+        const char * env = getenv("GGML_CUDA_PDL");
+        return env == nullptr || std::atoi(env) != 0;
+    }();
+
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    if (env_pdl_enabled && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_HOPPER) {
+        auto pdl_cfg = ggml_cuda_pdl_config(launch_params);
+
+        CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, kernel, std::forward<Args>(args)... ));
+        return;
+    }
+#endif //defined(GGML_CUDA_USE_PDL)
+
+    kernel<<<launch_params.block_nums, launch_params.block_dims, launch_params.shmem, launch_params.stream>>>(std::forward<Args>(args)... );
+    CUDA_CHECK(cudaGetLastError());
+}
+
diff --git a/ggml/src/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu
index 102f944f924..adba4d522a4 100644
--- a/ggml/src/ggml-cuda/concat.cu
+++ b/ggml/src/ggml-cuda/concat.cu
@@ -15,6 +15,7 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_f32_cont
 
     const int64_t n = ne0 * ne1 * ne2;
 
+    ggml_cuda_pdl_sync();
     for (int64_t i = (int64_t) blockIdx.x * blockDim.x + threadIdx.x; i < n; i += (int64_t) blockDim.x * gridDim.x) {
         if constexpr (dim == 0) {
             const int64_t row = i / ne0;
@@ -64,8 +65,8 @@ static void concat_f32_cuda(const float * x,
     const int     num_blocks = (n + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
 
     if (dim == 0) {
-        concat_f32_cont<0>
-            <<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
+        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream);
+        ggml_cuda_kernel_launch(concat_f32_cont<0>, launch_params,x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
         return;
     }
     if (dim == 1) {
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index d208acf2d5f..121472ec228 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -16,6 +16,7 @@ static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne
                                   const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
                                   const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
                                   const int64_t nb12, const int64_t nb13) {
+    ggml_cuda_pdl_lc();
     const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= ne) {
@@ -36,6 +37,7 @@ static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne
     const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
     const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
 
+    ggml_cuda_pdl_sync();
     cpy_1(cx + x_offset, cdst + dst_offset);
 }
 
@@ -59,6 +61,7 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const
     __shared__ float tile[2][CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1];
     int cur_tile_buf = 0;
 
+    ggml_cuda_pdl_sync();
 #pragma unroll
     for (int i = 0; i < CUDA_CPY_BLOCK_NM; ++i) {
 
@@ -142,6 +145,7 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int64_t ne,
     const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
     const int64_t dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
 
+    ggml_cuda_pdl_sync();
     cpy_blck(cx + x_offset, cdst + dst_offset);
 }
 
@@ -168,6 +172,7 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst, const int64_t ne,
     const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
     const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
 
+    ggml_cuda_pdl_sync();
     cpy_blck(cx + x_offset, cdst + dst_offset);
 }
 
@@ -182,6 +187,7 @@ static __global__ void cpy_scalar_contiguous(const char * cx, char * cdst, const
     const src_t * x = (const src_t *) cx;
     dst_t *     dst = (dst_t *) cdst;
 
+    ggml_cuda_pdl_sync();
     dst[i] = ggml_cuda_cast<dst_t>(x[i]);
 }
 
@@ -192,8 +198,8 @@ cudaStream_t stream) {
 
     const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
     GGML_ASSERT(num_blocks < UINT_MAX);
-    cpy_scalar_contiguous<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne);
+    const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream);
+    ggml_cuda_kernel_launch(cpy_scalar_contiguous<src_t, dst_t>, launch_params, cx, cdst, ne);
 }
 
 template<typename src_t, typename dst_t, bool transposed = false>
@@ -223,13 +229,15 @@ static void ggml_cpy_scalar_cuda(
         GGML_ASSERT(grid_z < USHRT_MAX);
         dim3 dimGrid(grid_x, grid_y, grid_z);
         dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1);
-        cpy_scalar_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>>
-            (cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(dimGrid, dimBlock, 0, stream);
+        ggml_cuda_kernel_launch(cpy_scalar_transpose<dst_t>, launch_params,
+            cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
     } else {
         const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
         GGML_ASSERT(num_blocks < UINT_MAX);
-        cpy_scalar<cpy_1_scalar<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-            (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream);
+        ggml_cuda_kernel_launch(cpy_scalar<cpy_1_scalar<src_t, dst_t>>, launch_params,
+            cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
     }
 }
 
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index beeb5238946..debcb6e5447 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -636,6 +636,7 @@ static __global__ void flash_attn_mask_to_KV_max(
     if (tid < WARP_SIZE) {
         buf_iw[tid] = 1;
     }
+    ggml_cuda_pdl_sync();
     __syncthreads();
 
     int KV_max_sj = (ne30 - 1) * FATTN_KQ_STRIDE;
@@ -687,6 +688,7 @@ static __global__ void flash_attn_stream_k_fixup_uniform(
         const uint3 fd_iter_j_z,
         const uint3 fd_iter_j) {
     constexpr int ncols = ncols1*ncols2;
+    ggml_cuda_pdl_lc();
 
     const int tile_idx = blockIdx.x; // One block per output tile.
     const int j        = blockIdx.y;
@@ -718,6 +720,7 @@ static __global__ void flash_attn_stream_k_fixup_uniform(
 
     dst += sequence*ne02*ne01*D + jt*ne02*(ncols1*D) + zt_Q*D + (j*ne02 + c)*D + tid;
 
+    ggml_cuda_pdl_sync();
     // Load the partial result that needs a fixup
     float dst_val = *dst;
     float max_val;
@@ -809,6 +812,7 @@ static __global__ void flash_attn_stream_k_fixup_general(
     float dst_val = 0.0f;
     float max_val = 0.0f;
     float rowsum  = 0.0f;
+    ggml_cuda_pdl_sync();
     {
         dst_val = *dst;
 
@@ -867,6 +871,7 @@ static __global__ void flash_attn_combine_results(
         const float2 * __restrict__ VKQ_meta,
         float * __restrict__ dst,
         const int parallel_blocks) {
+    ggml_cuda_pdl_lc();
     // Dimension 0: threadIdx.x
     // Dimension 1: blockIdx.x
     // Dimension 2: blockIdx.y
@@ -890,6 +895,7 @@ static __global__ void flash_attn_combine_results(
     __builtin_assume(tid < D);
 
     extern __shared__ float2 meta[];
+    ggml_cuda_pdl_sync();
     for (int i = tid; i < 2*parallel_blocks; i += D) {
         ((float *) meta)[i] = ((const float *)VKQ_meta) [i];
     }
@@ -1146,7 +1152,9 @@ void launch_fattn(
     const uint3 ne01 = init_fastdiv_values(Q->ne[1]);
 
     GGML_ASSERT(block_dim.x % warp_size == 0);
-    fattn_kernel<<<blocks_num, block_dim, nbytes_shared, main_stream>>>(
+
+    const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks_num, block_dim, nbytes_shared, main_stream);
+    ggml_cuda_kernel_launch(fattn_kernel, launch_params,
         (const char *) Q->data,
         K_data,
         V_data,
@@ -1176,9 +1184,9 @@ void launch_fattn(
             const dim3 block_dim_combine(DV, 1, 1);
             const dim3 blocks_num_combine = {(unsigned)ntiles_dst, ncols1, ncols2};
 
-            flash_attn_stream_k_fixup_uniform<DV, ncols1, ncols2>
-                <<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
-                ((float *) KQV->data, dst_tmp_meta.ptr,
+            const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks_num_combine, block_dim_combine, 0, main_stream);
+            ggml_cuda_kernel_launch(flash_attn_stream_k_fixup_uniform<DV, ncols1, ncols2>, launch_params,
+                (float *) KQV->data, dst_tmp_meta.ptr,
                  Q->ne[1], Q->ne[2], K->ne[2], nblocks_sk,
                  gqa_ratio, bpt, fd0, fd1, fd2);
         } else if (ntiles_dst % blocks_num.x != 0) {
@@ -1193,9 +1201,9 @@ void launch_fattn(
             const dim3 block_dim_combine(DV, 1, 1);
             const dim3 blocks_num_combine = {blocks_num.x, ncols1, ncols2};
 
-            flash_attn_stream_k_fixup_general<DV, ncols1, ncols2>
-                <<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
-                ((float *) KQV->data, dst_tmp_meta.ptr,
+            const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks_num_combine, block_dim_combine, 0, main_stream);
+            ggml_cuda_kernel_launch(flash_attn_stream_k_fixup_general<DV, ncols1, ncols2>, launch_params,
+                (float *) KQV->data, dst_tmp_meta.ptr,
                  Q->ne[1], Q->ne[2], gqa_ratio, total_work,
                  fd_k_j_z_ne12, fd_k_j_z, fd_k_j, fd_k);
         }
@@ -1204,9 +1212,9 @@ void launch_fattn(
         const dim3 blocks_num_combine(Q->ne[1], Q->ne[2], Q->ne[3]);
         const size_t nbytes_shared_combine = parallel_blocks*sizeof(float2);
 
-        flash_attn_combine_results<DV>
-            <<<blocks_num_combine, block_dim_combine, nbytes_shared_combine, main_stream>>>
-            (dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data, parallel_blocks);
+        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks_num_combine, block_dim_combine, nbytes_shared_combine, main_stream);
+        ggml_cuda_kernel_launch(flash_attn_combine_results<DV>, launch_params,
+            dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data, parallel_blocks);
     }
     CUDA_CHECK(cudaGetLastError());
 }
diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
index a25e912c4d2..4871b90df86 100644
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -1724,6 +1724,7 @@ static __global__ void flash_attn_ext_f16(
                             const int32_t nb21, const int32_t nb22, const int64_t nb23,
                             const int32_t ne31, const int32_t ne32, const int32_t ne33,
                             const int32_t nb31, const int32_t nb32, const int64_t nb33) {
+    ggml_cuda_pdl_sync(); // TODO optimize placement
 #if defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE))
 
     // Skip unused kernel variants for faster compilation:
diff --git a/ggml/src/ggml-cuda/fattn-tile.cuh b/ggml/src/ggml-cuda/fattn-tile.cuh
index 7b0a5e5cf49..fac76f13593 100644
--- a/ggml/src/ggml-cuda/fattn-tile.cuh
+++ b/ggml/src/ggml-cuda/fattn-tile.cuh
@@ -894,6 +894,8 @@ static __global__ void flash_attn_tile(
     }
     float KQ_sum[cpw] = {0.0f};
 
+    ggml_cuda_pdl_sync();
+
     // Load Q data, convert to FP16 if fast:
 #pragma unroll
     for (int jc0 = 0; jc0 < cpw; ++jc0) {
diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh
index f0bd42a5761..b0a6cf67f1a 100644
--- a/ggml/src/ggml-cuda/fattn-vec.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec.cuh
@@ -40,6 +40,7 @@ static __global__ void flash_attn_ext_vec(
                             const int32_t nb21, const int32_t nb22, const int64_t nb23,
                             const int32_t ne31, const int32_t ne32, const int32_t ne33,
                             const int32_t nb31, const int32_t nb32, const int64_t nb33) {
+    ggml_cuda_pdl_lc();
 #ifdef FLASH_ATTN_AVAILABLE
 
     // Skip unused kernel variants for faster compilation:
@@ -136,6 +137,8 @@ static __global__ void flash_attn_ext_vec(
 #endif // V_DOT2_F32_F16_AVAILABLE
     int    Q_i32[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)];
     float2  Q_ds[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)];
+
+    ggml_cuda_pdl_sync();
     if constexpr (Q_q8_1) {
 #pragma unroll
         for (int j0 = 0; j0 < ncols; j0 += nwarps) {
diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
index f19defbff93..4b6f6501094 100644
--- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
@@ -86,6 +86,7 @@ static __global__ void flash_attn_ext_f16(
     constexpr int kqs_padded = FATTN_KQ_STRIDE + 8;
     constexpr int kqar = sizeof(KQ_acc_t)/sizeof(half);
 
+    ggml_cuda_pdl_sync();
     const int sequence = blockIdx.z / ne02;
     const int head = blockIdx.z - sequence*ne02;
     const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
diff --git a/ggml/src/ggml-cuda/fwht.cu b/ggml/src/ggml-cuda/fwht.cu
new file mode 100644
index 00000000000..184dc254c72
--- /dev/null
+++ b/ggml/src/ggml-cuda/fwht.cu
@@ -0,0 +1,101 @@
+#include "common.cuh"
+#include "fwht.cuh"
+
+template <int N>
+__launch_bounds__(4*ggml_cuda_get_physical_warp_size(), 1)
+__global__ void fwht_cuda(const float * src, float * dst, const int64_t n_rows, const float scale) {
+    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
+
+    const int64_t r = (int64_t) blockIdx.x * blockDim.y + threadIdx.y;
+
+    if (r >= n_rows) {
+        return;
+    }
+
+    src += r * N;
+    dst += r * N;
+
+    static constexpr int el_w = N / warp_size;
+    float     reg[el_w];
+    const int lane = threadIdx.x;
+
+    ggml_cuda_pdl_sync();
+#pragma unroll
+    for (int i = 0; i < el_w; ++i) {
+        reg[i] = src[i * warp_size + lane] * scale;
+    }
+
+#pragma unroll
+    for (int h = 1; h < warp_size; h *= 2) {
+#pragma unroll
+        for (int j = 0; j < el_w; j++) {
+            const float val  = reg[j];
+            const float val2 = __shfl_xor_sync(0xFFFFFFFF, val, h, warp_size);
+
+            reg[j] = (lane & h) == 0 ? val + val2 : val2 - val;
+        }
+    }
+
+#pragma unroll
+    for (int h = warp_size; h < N; h *= 2) {
+        const int step = h / warp_size;
+#pragma unroll
+        for (int j = 0; j < el_w; j += 2 * step) {
+#pragma unroll
+            for (int k = 0; k < step; k++) {
+                const float x = reg[j + k];
+                const float y = reg[j + k + step];
+
+                reg[j + k]        = x + y;
+                reg[j + k + step] = x - y;
+            }
+        }
+    }
+
+#pragma unroll
+    for (int i = 0; i < el_w; ++i) {
+        dst[i * warp_size + lane] = reg[i];
+    }
+}
+
+bool ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_are_same_shape(src, dst));
+    if (!ggml_is_contiguous(src) || !ggml_is_contiguous(dst)) {
+        return false;
+    }
+    const int     n    = src->ne[0];
+    const int64_t rows = ggml_nrows(src);
+
+    const float * src_d = (const float *) src->data;
+    float *       dst_d = (float *) dst->data;
+
+    const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;
+    const int rows_per_block = 4;
+
+    const int64_t num_blocks = (rows + rows_per_block - 1) / rows_per_block;
+
+    cudaStream_t                         stream = ctx.stream();
+    dim3                                 grid_dims(num_blocks, 1, 1);
+    dim3                                 block_dims(warp_size, rows_per_block, 1);
+    const ggml_cuda_kernel_launch_params launch_params =
+        ggml_cuda_kernel_launch_params(grid_dims, block_dims, 0, stream);
+
+    const float scale = 1 / sqrtf(n);
+
+    switch (n) {
+        case 64:
+            ggml_cuda_kernel_launch(fwht_cuda<64>, launch_params, src_d, dst_d, rows, scale);
+            return true;
+        case 128:
+            ggml_cuda_kernel_launch(fwht_cuda<128>, launch_params, src_d, dst_d, rows, scale);
+            return true;
+        case 256:
+            ggml_cuda_kernel_launch(fwht_cuda<256>, launch_params, src_d, dst_d, rows, scale);
+            return true;
+        case 512:
+            ggml_cuda_kernel_launch(fwht_cuda<512>, launch_params, src_d, dst_d, rows, scale);
+            return true;
+        default:
+            return false;
+    }
+}
diff --git a/ggml/src/ggml-cuda/fwht.cuh b/ggml/src/ggml-cuda/fwht.cuh
new file mode 100644
index 00000000000..cf3df94cafa
--- /dev/null
+++ b/ggml/src/ggml-cuda/fwht.cuh
@@ -0,0 +1,4 @@
+#include "common.cuh"
+
+// Returns whether the Fast Walsh-Hadamard transform could be used.
+bool ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/gated_delta_net.cu b/ggml/src/ggml-cuda/gated_delta_net.cu
index b4c9845e7a7..018d5d37d47 100644
--- a/ggml/src/ggml-cuda/gated_delta_net.cu
+++ b/ggml/src/ggml-cuda/gated_delta_net.cu
@@ -1,4 +1,5 @@
 #include "gated_delta_net.cuh"
+#include "ggml-cuda/common.cuh"
 
 template <int S_v, bool KDA, bool keep_rs_t>
 __global__ void __launch_bounds__((ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v) * 4, 2)
@@ -53,6 +54,7 @@ gated_delta_net_cuda(const float * q,
     float         s_shard[rows_per_lane];
     // state is stored transposed: M[col][i] = S[i][col], row col is contiguous
 
+    ggml_cuda_pdl_sync();
 #pragma unroll
     for (int r = 0; r < rows_per_lane; r++) {
         const int i = r * warp_size + lane;
@@ -189,28 +191,29 @@ static void launch_gated_delta_net(
 
     int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
 
+    const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(grid_dims, block_dims, 0, stream);
     switch (S_v) {
         case 16:
-            gated_delta_net_cuda<16, KDA, keep_rs_t><<<grid_dims, block_dims, 0, stream>>>(
+            ggml_cuda_kernel_launch(gated_delta_net_cuda<16, KDA, keep_rs_t>, launch_params,
                 q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
                 n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
                 sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
             break;
         case 32:
-            gated_delta_net_cuda<32, KDA, keep_rs_t><<<grid_dims, block_dims, 0, stream>>>(
+            ggml_cuda_kernel_launch(gated_delta_net_cuda<32, KDA, keep_rs_t>, launch_params,
                 q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
                 n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
                 sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
             break;
         case 64: {
-            gated_delta_net_cuda<64, KDA, keep_rs_t><<<grid_dims, block_dims, 0, stream>>>(
+            ggml_cuda_kernel_launch(gated_delta_net_cuda<64, KDA, keep_rs_t>, launch_params,
                 q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
                 n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
                 sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
             break;
         }
         case 128: {
-            gated_delta_net_cuda<128, KDA, keep_rs_t><<<grid_dims, block_dims, 0, stream>>>(
+            ggml_cuda_kernel_launch(gated_delta_net_cuda<128, KDA, keep_rs_t>, launch_params,
                 q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
                 n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
                 sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu
index 36b840e8148..457b695eb2a 100644
--- a/ggml/src/ggml-cuda/getrows.cu
+++ b/ggml/src/ggml-cuda/getrows.cu
@@ -11,6 +11,7 @@ static __global__ void k_get_rows(
         /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
         const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
 
+    ggml_cuda_pdl_sync();
     for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) {
         for (int64_t i00 = 2*(blockIdx.y*blockDim.x + threadIdx.x); i00 < ne00; i00 += gridDim.y*blockDim.x) {
             // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
@@ -48,6 +49,8 @@ static __global__ void k_get_rows_float(
         /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
         const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
 
+    ggml_cuda_pdl_lc();
+    ggml_cuda_pdl_sync();
     for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) {
         for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) {
             // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
@@ -83,6 +86,7 @@ static __global__ void k_get_rows_back_float(
 
     float sum = 0.0f;
 
+    ggml_cuda_pdl_sync();
     for (int64_t i = 0; i < nrows_grad; ++i) {
         if (rows[i] != dst_row) {
             continue;
@@ -156,7 +160,8 @@ static void get_rows_cuda_float(
     GGML_ASSERT(ne11 <= std::numeric_limits<uint32_t>::max() / ne12);
     const uint3 ne12_fdv = init_fastdiv_values(ne12);
 
-    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
+    const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{block_nums, block_dims, 0, stream};
+    ggml_cuda_kernel_launch(k_get_rows_float<src0_t, dst_t>, launch_params,
         src0_d, src1_d, dst_d,
         ne00, /*ne01, ne02, ne03,*/
         /*ne10,*/ ne11, ne12_fdv, /*ne13,*/
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index e25be3592fd..23d1c069248 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -24,6 +24,7 @@
 #include "ggml-cuda/diagmask.cuh"
 #include "ggml-cuda/diag.cuh"
 #include "ggml-cuda/fattn.cuh"
+#include "ggml-cuda/fwht.cuh"
 #include "ggml-cuda/getrows.cuh"
 #include "ggml-cuda/im2col.cuh"
 #include "ggml-cuda/mmf.cuh"
@@ -2594,6 +2595,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
     bool use_batched_cublas_f32  = src0->type == GGML_TYPE_F32;
 
+    const int32_t hint = ggml_get_op_params_i32(dst, 1);
+    if (hint == GGML_HINT_SRC0_IS_HADAMARD && !split && ggml_cuda_op_fwht(ctx, src1, dst)) {
+        return;
+    }
+
     if (!split && use_mul_mat_vec_f) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
diff --git a/ggml/src/ggml-cuda/mean.cu b/ggml/src/ggml-cuda/mean.cu
index 49af5389957..a8f6046e46d 100644
--- a/ggml/src/ggml-cuda/mean.cu
+++ b/ggml/src/ggml-cuda/mean.cu
@@ -67,9 +67,11 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     // See discussion in: https://github.com/ggml-org/llama.cpp/pull/15132
     if ((nrows / nsm) < 2) {
         const dim3 block_dims(512, 1, 1);
-        reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
+        ggml_cuda_kernel_launch(reduce_rows_f32</*norm=*/true>, launch_params, src0_d, dst_d, ncols);
     } else {
         const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
-        reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
+        ggml_cuda_kernel_launch(reduce_rows_f32</*norm=*/true>, launch_params, src0_d, dst_d, ncols);
     }
 }
diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu
index d9147202429..09d95f309b4 100644
--- a/ggml/src/ggml-cuda/mmvf.cu
+++ b/ggml/src/ggml-cuda/mmvf.cu
@@ -21,6 +21,7 @@ static __global__ void mul_mat_vec_f(
     int channel_y;
     int sample_dst;
 
+    ggml_cuda_pdl_sync();
     if constexpr (is_multi_token_id) {
         // Multi-token MUL_MAT_ID path, adding these in the normal path causes a perf regression for n_tokens=1 case
         token_idx  = blockIdx.z;
@@ -298,6 +299,7 @@ static __global__ void mul_mat_vec_f(
         static_assert(std::is_same_v<T, void>, "unsupported type");
     }
 
+    ggml_cuda_pdl_lc();
 #pragma unroll
     for (int j = 0; j < ncols_dst; ++j) {
         sumf[j] = warp_reduce_sum<warp_size>(sumf[j]);
@@ -382,11 +384,13 @@ static void mul_mat_vec_f_switch_fusion(
         const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
         const dim3 & block_dims, const dim3 & block_nums, const int nbytes_shared, const int ids_stride, const cudaStream_t stream) {
 
+    const ggml_cuda_kernel_launch_params launch_params = {block_nums, block_dims, nbytes_shared, stream};
+
     const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
     if constexpr (ncols_dst == 1) {
         if (has_fusion) {
-            mul_mat_vec_f<T, type_acc, ncols_dst, block_size, true, is_multi_token_id><<<block_nums, block_dims, nbytes_shared, stream>>>
-                (x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst,
+            ggml_cuda_kernel_launch(mul_mat_vec_f<T, type_acc, ncols_dst, block_size, true, is_multi_token_id>, launch_params,
+                x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst,
                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
             return;
@@ -395,8 +399,8 @@ static void mul_mat_vec_f_switch_fusion(
 
     GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1");
 
-    mul_mat_vec_f<T, type_acc, ncols_dst, block_size, false, is_multi_token_id><<<block_nums, block_dims, nbytes_shared, stream>>>
-        (x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst,
+    ggml_cuda_kernel_launch(mul_mat_vec_f<T, type_acc, ncols_dst, block_size, false, is_multi_token_id>, launch_params,
+        x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst,
         channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
         sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
 
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index da48f313a38..13b8b855282 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -359,7 +359,9 @@ static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_d
                 case GGML_TYPE_Q5_1:
                 case GGML_TYPE_Q8_0:
                 case GGML_TYPE_Q4_K:
+                    return 8;
                 case GGML_TYPE_Q6_K:
+                    return 2;
                 case GGML_TYPE_IQ4_NL:
                     return 8;
                 default:
@@ -422,6 +424,7 @@ static __global__ void mul_mat_vec_q(
     uint32_t channel_y;
     uint32_t sample_dst;
 
+    ggml_cuda_pdl_sync();
     channel_x  = ncols_dst == 1 && ids ? ids[channel_dst]                     : fastdiv(channel_dst, channel_ratio);
     channel_y  = ncols_dst == 1 && ids ? fastmodulo(channel_dst, nchannels_y) : channel_dst;
     sample_dst = blockIdx.z;
@@ -681,8 +684,9 @@ static void mul_mat_vec_q_switch_fusion(
     const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
     if constexpr (c_ncols_dst == 1) {
         if (has_fusion) {
-            mul_mat_vec_q<type, c_ncols_dst, true, small_k><<<block_nums, block_dims, nbytes_shared, stream>>>
-                (vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
+            const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, nbytes_shared, stream);
+            ggml_cuda_kernel_launch(mul_mat_vec_q<type, c_ncols_dst, true, small_k>, launch_params,
+                 vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
                  channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
                  sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
             return;
@@ -691,8 +695,9 @@ static void mul_mat_vec_q_switch_fusion(
 
     GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1");
 
-    mul_mat_vec_q<type, c_ncols_dst, false, small_k><<<block_nums, block_dims, nbytes_shared, stream>>>
-        (vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
+    const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, nbytes_shared, stream);
+    ggml_cuda_kernel_launch(mul_mat_vec_q<type, c_ncols_dst, false, small_k>, launch_params,
+        vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
         channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
         sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
 }
diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu
index ef98f675aa7..09d9f3a7d62 100644
--- a/ggml/src/ggml-cuda/norm.cu
+++ b/ggml/src/ggml-cuda/norm.cu
@@ -18,6 +18,7 @@ static __global__ void norm_f32(
 
     float2 mean_var = make_float2(0.0f, 0.0f);
 
+    ggml_cuda_pdl_sync();
     for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[col];
         mean_var.x += xi;
@@ -46,6 +47,7 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
 
     float tmp = 0.0f; // partial sum for thread in warp
 
+    ggml_cuda_pdl_sync();
     for (int j = start; j < end; j += block_size) {
         tmp += x[j];
     }
@@ -95,6 +97,7 @@ static __global__ void rms_norm_f32(const float * x,
                                     const uint3   add_nrows_packed     = make_uint3(0, 0, 0),
                                     const uint3   add_nchannels_packed = make_uint3(0, 0, 0),
                                     const uint3   add_nsamples_packed  = make_uint3(0, 0, 0)) {
+    ggml_cuda_pdl_lc();
     const int nrows     = gridDim.x;
     const int nchannels = gridDim.y;
 
@@ -124,6 +127,7 @@ static __global__ void rms_norm_f32(const float * x,
 
     float tmp = 0.0f; // partial sum for thread in warp
 
+    ggml_cuda_pdl_sync();
     for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[col];
         tmp += xi * xi;
@@ -163,6 +167,7 @@ static __global__ void rms_norm_back_f32(
     float sum_xx = 0.0f; // sum for squares of x, equivalent to forward pass
     float sum_xg = 0.0f; // sum for x * gradient, needed because RMS norm mixes inputs
 
+    ggml_cuda_pdl_sync();
     for (int col = tid; col < ncols; col += block_size) {
         const float xfi = xf[col];
         sum_xx += xfi * xfi;
@@ -253,6 +258,7 @@ static __global__ void l2_norm_f32(
 
     float tmp = 0.0f; // partial sum for thread in warp
 
+    ggml_cuda_pdl_sync();
     for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[col];
         tmp += xi * xi;
@@ -261,6 +267,7 @@ static __global__ void l2_norm_f32(
     // sum up partial sums
     extern __shared__ float s_sum[];
     tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
+    ggml_cuda_pdl_lc();
 
     // from https://pytorch.org/docs/stable/generated/torch.nn.functional.normalize.html
     const float scale = rsqrtf(fmaxf(tmp, eps * eps));
@@ -300,10 +307,19 @@ static void rms_norm_f32_cuda(
     const dim3 blocks_num(nrows, nchannels, nsamples);
     if (ncols < 1024) {
         const dim3 block_dims(256, 1, 1);
-        rms_norm_f32<256, false><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
+        const ggml_cuda_kernel_launch_params launch_params = {blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream};
+        ggml_cuda_kernel_launch(rms_norm_f32<256, false>, launch_params,
+            x, dst, ncols, stride_row, stride_channel, stride_sample, eps,
+        // underlying cudaLaunchKernelEx does not support default params
+        nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0),
+        nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0));
     } else {
         const dim3 block_dims(1024, 1, 1);
-        rms_norm_f32<1024, false><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
+        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream};
+        ggml_cuda_kernel_launch(rms_norm_f32<1024, false>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps,
+        // underlying cudaLaunchKernelEx does not support default params
+        nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0),
+        nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0));
     }
 }
 
@@ -346,14 +362,20 @@ static void rms_norm_mul_f32_cuda(const float *  x,
         const uint3 mul_nsamples_packed  = init_fastdiv_values(mul_nsamples);
         if (ncols < 1024) {
             const dim3 block_dims(256, 1, 1);
-            rms_norm_f32<256, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
+            const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream};
+            ggml_cuda_kernel_launch(rms_norm_f32<256, true>, launch_params,
                 x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
-                mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed);
+                mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed,
+                // underlying cudaLaunchKernelEx does not support default params
+            nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0));
         } else {
             const dim3 block_dims(1024, 1, 1);
-            rms_norm_f32<1024, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
+            const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream};
+            ggml_cuda_kernel_launch(rms_norm_f32<1024, true>, launch_params,
                 x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
-                mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed);
+                mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed,
+                // underlying cudaLaunchKernelEx does not support default params
+            nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0));
         }
     } else {
         const uint3 mul_ncols_packed     = init_fastdiv_values(mul_ncols);
@@ -367,14 +389,16 @@ static void rms_norm_mul_f32_cuda(const float *  x,
         const uint3 add_nsamples_packed  = init_fastdiv_values(add_nsamples);
         if (ncols < 1024) {
             const dim3 block_dims(256, 1, 1);
-            rms_norm_f32<256, true, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
+            const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims,block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream};
+            ggml_cuda_kernel_launch(rms_norm_f32<256, true, true>, launch_params,
                 x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
                 mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add,
                 add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed,
                 add_nchannels_packed, add_nsamples_packed);
         } else {
             const dim3 block_dims(1024, 1, 1);
-            rms_norm_f32<1024, true, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
+            const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream};
+            ggml_cuda_kernel_launch(rms_norm_f32<1024, true, true>, launch_params,
                 x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
                 mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add,
                 add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed,
@@ -399,10 +423,12 @@ static void l2_norm_f32_cuda(
     const dim3 blocks_num(nrows, nchannels, nsamples);
     if (ncols < 1024) {
         const dim3 block_dims(WARP_SIZE, 1, 1);
-        l2_norm_f32<WARP_SIZE><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
+        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, 0, stream};
+        ggml_cuda_kernel_launch(l2_norm_f32<WARP_SIZE>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
     } else {
         const dim3 block_dims(1024, 1, 1);
-        l2_norm_f32<1024><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
+        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream};
+        ggml_cuda_kernel_launch(l2_norm_f32<1024>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
     }
 }
 
diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu
index 52f664719ae..49516965cad 100644
--- a/ggml/src/ggml-cuda/quantize.cu
+++ b/ggml/src/ggml-cuda/quantize.cu
@@ -6,6 +6,7 @@ static __global__ void quantize_q8_1(
         const float * __restrict__ x, void * __restrict__ vy,
         const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
         const int64_t ne0, const uint32_t ne1, const uint3 ne2) {
+    ggml_cuda_pdl_lc();
     const int64_t i0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i0 >= ne0) {
@@ -28,6 +29,7 @@ static __global__ void quantize_q8_1(
     const int64_t ib  = i_cont / QK8_1; // block index
     const int64_t iqs = i_cont % QK8_1; // quant index
 
+    ggml_cuda_pdl_sync();
     const float xi = i0 < ne00 ? x[i03*s03 + i02*s02 + i01*s01 + i00] : 0.0f;
     float amax = fabsf(xi);
     float sum = xi;
@@ -196,6 +198,7 @@ static __global__ void quantize_mmq_mxfp4(const float * __restrict__ x,
     const int64_t i2 = blockIdx.z % ne2;
     const int64_t i3 = blockIdx.z / ne2;
 
+    ggml_cuda_pdl_sync();
     const int64_t i01 = ids ? ids[i1] : i1;
     const int64_t i02 = i2;
     const int64_t i03 = i3;
@@ -288,6 +291,7 @@ static __global__ void quantize_mmq_q8_1(
     const int64_t i3 = blockIdx.z / ne2;
 
     const int64_t i00 = i0;
+    ggml_cuda_pdl_sync();
     const int64_t i01 = ids ? ids[i1] : i1;
     const int64_t i02 = i2;
     const int64_t i03 = i3;
@@ -378,7 +382,8 @@ void quantize_row_q8_1_cuda(
     const int64_t block_num_x = (ne0 + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
     const dim3 num_blocks(block_num_x, ne1, ne2*ne3);
     const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
-    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, ne00, s01, s02, s03, ne0, ne1, ne2_fastdiv);
+    const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(num_blocks, block_size, 0, stream);
+    ggml_cuda_kernel_launch(quantize_q8_1, launch_params, x, vy, ne00, s01, s02, s03, ne0, ne1, ne2_fastdiv);
     GGML_UNUSED(type_src0);
 }
 
diff --git a/ggml/src/ggml-cuda/reduce_rows.cuh b/ggml/src/ggml-cuda/reduce_rows.cuh
index de240fd4413..5895d3bf8e5 100644
--- a/ggml/src/ggml-cuda/reduce_rows.cuh
+++ b/ggml/src/ggml-cuda/reduce_rows.cuh
@@ -10,6 +10,8 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __r
     const int num_unroll = 8;
     float     temp[num_unroll];
     float     sum_temp[num_unroll] = { 0.0f };
+
+    ggml_cuda_pdl_sync();
     for (int i = col; i < ncols;) {
         for (int j = 0; j < num_unroll; ++j) {
             if (i < ncols) {
diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu
index 45a49a5dc2a..e20a5cb6bed 100644
--- a/ggml/src/ggml-cuda/rope.cu
+++ b/ggml/src/ggml-cuda/rope.cu
@@ -134,6 +134,7 @@ static __global__ void rope_neox(const T *            x,
                                  const float *        freq_factors,
                                  const int64_t *      row_indices,
                                  const int            set_rows_stride) {
+    ggml_cuda_pdl_lc();
     const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
     if (i0 >= ne00) {
@@ -148,6 +149,7 @@ static __global__ void rope_neox(const T *            x,
 
     int       idst = i0 / 2 + i1 * s1  + i2 * s2  + i3 * s3;
     const int ix   = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03;
+    ggml_cuda_pdl_sync();
 
     // Fusion optimization: ROPE + VIEW + SET_ROWS.
     // The rope output is viewed as a 1D tensor and offset based on a row index in row_indices.
@@ -216,6 +218,7 @@ static __global__ void rope_multi(const T *            x,
     int       idst = i0 / 2 + i1 * s1  + i2 * s2  + i3 * s3;
     const int ix   = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03;
 
+    ggml_cuda_pdl_sync();
     if (i0 >= n_dims) {
         dst[idst + i0/2 + 0] = x[ix + i0/2 + 0];
         dst[idst + i0/2 + 1] = x[ix + i0/2 + 1];
@@ -300,6 +303,7 @@ static __global__ void rope_vision(const T *            x,
     int       idst = i0 / 2 + i1 * s1  + i2 * s2  + i3 * s3;
     const int ix   = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03;
 
+    ggml_cuda_pdl_sync();
     const int sect_dims = sections.v[0] + sections.v[1];
     const int sec_w     = sections.v[1] + sections.v[0];
     const int sector    = (i0 / 2) % sect_dims;
@@ -399,13 +403,14 @@ static void rope_neox_cuda(const T *            x,
     const dim3 block_nums(nr, n_blocks_x, 1);
 
     const float theta_scale = powf(freq_base, -2.0f / n_dims);
+    const ggml_cuda_kernel_launch_params launch_params = {block_nums, block_dims, 0, stream};
 
     if (freq_factors == nullptr) {
-        rope_neox<forward, false><<<block_nums, block_dims, 0, stream>>>(
+        ggml_cuda_kernel_launch(rope_neox<forward, false, T, D>, launch_params,
             x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
             attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride);
     } else {
-        rope_neox<forward, true><<<block_nums, block_dims, 0, stream>>>(
+        ggml_cuda_kernel_launch(rope_neox<forward, true, T, D>, launch_params,
             x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
             attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride);
     }
@@ -443,11 +448,13 @@ static void rope_multi_cuda(const T *            x,
     const float theta_scale = powf(freq_base, -2.0f / n_dims);
 
     if (freq_factors == nullptr) {
-        rope_multi<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
+        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
+        ggml_cuda_kernel_launch(rope_multi<forward, false, T>, launch_params,
             x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
             attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope);
     } else {
-        rope_multi<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
+        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
+        ggml_cuda_kernel_launch(rope_multi<forward, true, T>, launch_params,
             x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
             attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope);
     }
diff --git a/ggml/src/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu
index 0ddeff6a175..7b2e59a4383 100644
--- a/ggml/src/ggml-cuda/scale.cu
+++ b/ggml/src/ggml-cuda/scale.cu
@@ -3,9 +3,11 @@
 #define MAX_GRIDDIM_X 0x7FFFFFFF
 
 static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int64_t nelements) {
+    ggml_cuda_pdl_lc();
     int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
     int64_t stride = (int64_t)blockDim.x * (int64_t)gridDim.x;
 
+    ggml_cuda_pdl_sync();
     for (int64_t i = tid; i < nelements; i += stride) {
         dst[i] = scale * x[i] + bias;
     }
@@ -13,7 +15,8 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
 
 static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int64_t nelements, cudaStream_t stream) {
     const int64_t num_blocks = (nelements + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
-    scale_f32<<<MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, nelements);
+    const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream);
+    ggml_cuda_kernel_launch(scale_f32, launch_params, x, dst, scale, bias, nelements);
 }
 
 void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu
index 631de7e8fa5..e14f96b824c 100644
--- a/ggml/src/ggml-cuda/set-rows.cu
+++ b/ggml/src/ggml-cuda/set-rows.cu
@@ -53,6 +53,7 @@ static __global__ void k_set_rows_quant(const float * __restrict__ src0,
     const int64_t i11 = fastmodulo((uint32_t) i02, ne11_fd);
     const int64_t i10 = i01;
 
+    ggml_cuda_pdl_sync();
     const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
 
     const float * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
@@ -157,7 +158,9 @@ static __global__ void k_set_rows(const src_t * __restrict__ src0,
     const int64_t i11 = fastmodulo((uint32_t) i02, ne11_fd);
     const int64_t i10 = i01;
 
+    ggml_cuda_pdl_sync();
     const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
+    ggml_cuda_pdl_lc();
 
     const src_t * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
     dst_t * dst_row_ptr    = dst + dst_row*s1 + i02*s2 + i03*s3;
@@ -203,9 +206,11 @@ static void set_rows_cuda(
         const uint3 ne11_fd = init_fastdiv_values((uint32_t) ne11);
         const uint3 ne12_fd = init_fastdiv_values((uint32_t) ne12);
 
-        k_set_rows<<<grid_size, block_size, 0, stream>>>(src0_d, src1_d, dst_d, ne_total, ne10, ne11, ne12, ne13, s01,
-                                                         s02, s03, s10, s11, s12, s1, s2, s3, ne00_fd, ne01_fd, ne02_fd,
-                                                         ne11_fd, ne12_fd);
+        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(grid_size, block_size, 0, stream);
+        ggml_cuda_kernel_launch(k_set_rows<src_t, idx_t, dst_t>, launch_params,
+            src0_d, src1_d, dst_d, ne_total, ne10, ne11, ne12, ne13, s01,
+            s02, s03, s10, s11, s12, s1, s2, s3, ne00_fd, ne01_fd, ne02_fd,
+            ne11_fd, ne12_fd);
     }
 }
 
diff --git a/ggml/src/ggml-cuda/softcap.cu b/ggml/src/ggml-cuda/softcap.cu
index 40dfe45d65c..9f0fa1051cf 100644
--- a/ggml/src/ggml-cuda/softcap.cu
+++ b/ggml/src/ggml-cuda/softcap.cu
@@ -1,18 +1,21 @@
 #include "softcap.cuh"
 
 static __global__ void softcap_f32(const float * x, float * dst, const float scale, const float softcap, const int k) {
+    ggml_cuda_pdl_lc();
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
         return;
     }
 
+    ggml_cuda_pdl_sync();
     dst[i] = tanhf(scale * x[i]) * softcap;
 }
 
 static void softcap_f32_cuda(const float * x, float * dst, const float scale, const float softcap, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SOFTCAP_BLOCK_SIZE - 1) / CUDA_SOFTCAP_BLOCK_SIZE;
-    softcap_f32<<<num_blocks, CUDA_SOFTCAP_BLOCK_SIZE, 0, stream>>>(x, dst, scale, softcap, k);
+    const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(num_blocks, CUDA_SOFTCAP_BLOCK_SIZE, 0, stream);
+    ggml_cuda_kernel_launch(softcap_f32, launch_params, x, dst, scale, softcap, k);
 }
 
 // fused GGML_OP_SCALE + GGML_UNARY_OP_TANH + GGML_OP_SCALE
diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu
index 4c4daf85dc6..48787b4b890 100644
--- a/ggml/src/ggml-cuda/ssm-conv.cu
+++ b/ggml/src/ggml-cuda/ssm-conv.cu
@@ -1,3 +1,4 @@
+#include "common.cuh"
 #include "ssm-conv.cuh"
 #include "unary.cuh"
 
@@ -7,6 +8,7 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float
                                     const int src0_nb0, const int src0_nb1, const int src0_nb2, const int src1_nb1,
                                     float * __restrict__ dst, const int dst_nb0, const int dst_nb1, const int dst_nb2,
                                     const int64_t n_t) {
+    ggml_cuda_pdl_lc();
     GGML_UNUSED(src0_nb0);
     const int tid  = threadIdx.x;
     const int bidx = blockIdx.x;
@@ -23,6 +25,7 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float
     float x[d_conv] = { 0.0f };
     float w[d_conv] = { 0.0f };
 
+    ggml_cuda_pdl_sync();
 #pragma unroll
     for (size_t j = 0; j < d_conv; j++) {
         w[j] = w_block[tid * stride_w + j];
@@ -128,8 +131,9 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const floa
         constexpr int kNC = decltype(NC)::value;
         if (n_t <= 32) {
             const dim3 blocks(n_s, (nr + threads - 1) / threads, 1);
-            ssm_conv_f32<apply_silu, threads, kNC><<<blocks, threads, 0, stream>>>(src0, src1, bias, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
-                                                                       dst, dst_nb0, dst_nb1, dst_nb2, n_t);
+            const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks, threads, 0, stream);
+            ggml_cuda_kernel_launch(ssm_conv_f32<apply_silu, threads, kNC>, launch_params, src0, src1, bias, src0_nb0, src0_nb1,
+                                                                        src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
         } else {
             const int64_t split_n_t = 32;
             dim3          blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
diff --git a/ggml/src/ggml-cuda/ssm-scan.cu b/ggml/src/ggml-cuda/ssm-scan.cu
index c1d4e2bc8df..412980376ac 100644
--- a/ggml/src/ggml-cuda/ssm-scan.cu
+++ b/ggml/src/ggml-cuda/ssm-scan.cu
@@ -26,6 +26,7 @@ __global__ void __launch_bounds__(splitD, 1)
                  const int64_t s_off, const int64_t d_inner, const int64_t L_param)
 {
     const size_t L = L_template == 0 ? L_param : L_template;
+    ggml_cuda_pdl_sync();
     const float *s0_block = (const float *)((const char *)src0 + src6[blockIdx.x] * src0_nb3 + blockIdx.y * splitD * src0_nb2);
     const float *x_block = (const float *)((const char *)src1 + (blockIdx.x * src1_nb3) + blockIdx.y * splitD * sizeof(float));
     const float *dt_block = (const float *)((const char *)src2 + (blockIdx.x * src2_nb2) + blockIdx.y * splitD * sizeof(float));
@@ -135,6 +136,7 @@ __global__ void __launch_bounds__(d_state, 1)
 
     const int group_off = (head_idx / (n_head / n_group)) * d_state * sizeof(float);
 
+    ggml_cuda_pdl_sync();
     // TODO: refactor strides to be in elements/floats instead of bytes to be cleaner and consistent with the rest of the codebase
     const float * s0_warp = (const float *) ((const char *) src0 + src6[seq_idx] * src0_nb3 + head_idx * src0_nb2 + head_off * d_state);
     const float * x_warp  = (const float *) ((const char *) src1 + (seq_idx * src1_nb3) + (warp_idx * sizeof(float)));
@@ -206,7 +208,8 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
             constexpr int num_warps = threads/WARP_SIZE;
 
             const dim3 blocks((n_head * head_dim + (num_warps - 1)) / num_warps, n_seq, 1);
-            ssm_scan_f32_group<128/WARP_SIZE, 128><<<blocks, threads, 0, stream>>>(
+            const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks, threads, 0, stream);
+            ggml_cuda_kernel_launch(ssm_scan_f32_group<128/WARP_SIZE, 128>, launch_params,
                     src0, src1, src2, src3, src4, src5, src6, dst,
                     src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
                     src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok);
@@ -215,7 +218,8 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
             constexpr int num_warps = threads/WARP_SIZE;
 
             const dim3 blocks((n_head * head_dim + (num_warps - 1)) / num_warps, n_seq, 1);
-            ssm_scan_f32_group<256/WARP_SIZE, 256><<<blocks, threads, 0, stream>>>(
+            const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks, threads, 0, stream);
+            ggml_cuda_kernel_launch(ssm_scan_f32_group<256/WARP_SIZE, 256>, launch_params,
                     src0, src1, src2, src3, src4, src5, src6, dst,
                     src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
                     src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok);
@@ -231,58 +235,59 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
         const dim3 blocks(n_seq, (n_head + threads - 1) / threads, 1);
         const int  smem_size = (threads * (d_state + 1) * 2) * sizeof(float);
         if (d_state == 16) {
+            const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks, threads, smem_size, stream);
             switch (n_tok)
             {
             case 1:
-                ssm_scan_f32<threads, 16, 1><<<blocks, threads, smem_size, stream>>>(
+                ggml_cuda_kernel_launch(ssm_scan_f32<threads, 16, 1>, launch_params,
                     src0, src1, src2, src3, src4, src5, src6, dst,
                 src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
                 src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
                 break;
             case 2:
-                ssm_scan_f32<threads, 16, 2><<<blocks, threads, smem_size, stream>>>(
+                ggml_cuda_kernel_launch(ssm_scan_f32<threads, 16, 2>, launch_params,
                     src0, src1, src2, src3, src4, src5, src6, dst,
                 src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
                 src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
                 break;
             case 3:
-                ssm_scan_f32<threads, 16, 3><<<blocks, threads, smem_size, stream>>>(
+                ggml_cuda_kernel_launch(ssm_scan_f32<threads, 16, 3>, launch_params,
                     src0, src1, src2, src3, src4, src5, src6, dst,
                 src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
                 src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
                 break;
             case 4:
-                ssm_scan_f32<threads, 16, 4><<<blocks, threads, smem_size, stream>>>(
+                ggml_cuda_kernel_launch(ssm_scan_f32<threads, 16, 4>, launch_params,
                     src0, src1, src2, src3, src4, src5, src6, dst,
                 src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
                 src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
                 break;
             case 5:
-                ssm_scan_f32<threads, 16, 5><<<blocks, threads, smem_size, stream>>>(
+                ggml_cuda_kernel_launch(ssm_scan_f32<threads, 16, 5>, launch_params,
                     src0, src1, src2, src3, src4, src5, src6, dst,
                 src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
                 src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
                 break;
             case 6:
-                ssm_scan_f32<threads, 16, 6><<<blocks, threads, smem_size, stream>>>(
+                ggml_cuda_kernel_launch(ssm_scan_f32<threads, 16, 6>, launch_params,
                     src0, src1, src2, src3, src4, src5, src6, dst,
                 src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
                 src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
                 break;
             case 7:
-                ssm_scan_f32<threads, 16, 7><<<blocks, threads, smem_size, stream>>>(
+                ggml_cuda_kernel_launch(ssm_scan_f32<threads, 16, 7>, launch_params,
                     src0, src1, src2, src3, src4, src5, src6, dst,
                 src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
                 src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
                 break;
             case 8:
-                ssm_scan_f32<threads, 16, 8><<<blocks, threads, smem_size, stream>>>(
+                ggml_cuda_kernel_launch(ssm_scan_f32<threads, 16, 8>, launch_params,
                     src0, src1, src2, src3, src4, src5, src6, dst,
                 src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
                 src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
                 break;
             default:
-                ssm_scan_f32<threads, 16, 0><<<blocks, threads, smem_size, stream>>>(
+                ggml_cuda_kernel_launch(ssm_scan_f32<threads, 16, 0>, launch_params,
                     src0, src1, src2, src3, src4, src5, src6, dst,
                 src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
                 src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
diff --git a/ggml/src/ggml-cuda/sumrows.cu b/ggml/src/ggml-cuda/sumrows.cu
index 4025771aadb..0003658ca95 100644
--- a/ggml/src/ggml-cuda/sumrows.cu
+++ b/ggml/src/ggml-cuda/sumrows.cu
@@ -7,10 +7,12 @@ void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int
     const dim3 block_nums(nrows, 1, 1);
     if ((nrows / nsm) < 2) {
         const dim3 block_dims(512, 1, 1);
-        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
+        ggml_cuda_kernel_launch(reduce_rows_f32</*norm=*/false>, launch_params, x, dst, ncols);
     } else {
         const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
-        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
+        ggml_cuda_kernel_launch(reduce_rows_f32</*norm=*/false>, launch_params, x, dst, ncols);
     }
 }
 
@@ -34,10 +36,12 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     if ((nrows / nsm) < 2) {
         // Increase num threads to 512 for small nrows to better hide the latency
         const dim3 block_dims(512, 1, 1);
-        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
+        ggml_cuda_kernel_launch(reduce_rows_f32</*norm=*/false>, launch_params, src0_d, dst_d, ncols);
     } else {
         // Enough active SMs to hide latency, use smaller blocks to allow better scheduling
         const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
-        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
+        ggml_cuda_kernel_launch(reduce_rows_f32</*norm=*/false>, launch_params, src0_d, dst_d, ncols);
     }
 }
diff --git a/ggml/src/ggml-cuda/topk-moe.cu b/ggml/src/ggml-cuda/topk-moe.cu
index 3020e5c7433..da20c9aab7c 100644
--- a/ggml/src/ggml-cuda/topk-moe.cu
+++ b/ggml/src/ggml-cuda/topk-moe.cu
@@ -105,6 +105,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
         wt[i] = -INFINITY;
     }
 
+    ggml_cuda_pdl_sync();
 #pragma unroll
     for (int i = 0; i < n_experts; i += WARP_SIZE) {
         const int expert  = i + threadIdx.x;
@@ -161,6 +162,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
         output_weights[i] = 0.f;
     }
 
+    ggml_cuda_pdl_lc();
     for (int k = 0; k < n_expert_used; k++) {
         float max_val    = wt[0];
         int   max_expert = threadIdx.x;
@@ -271,51 +273,52 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
     dim3         grid_dims((n_rows + rows_per_block - 1) / rows_per_block, 1, 1);
     dim3         block_dims(WARP_SIZE, rows_per_block, 1);
     cudaStream_t stream = ctx.stream();
+    const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(grid_dims, block_dims, 0, stream);
 
     switch (n_expert) {
         case 1:
-            topk_moe_cuda<1, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
-                                                                   clamp_val, scale_val, config);
+            ggml_cuda_kernel_launch(topk_moe_cuda<1, has_bias>, launch_params,
+                logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
             break;
         case 2:
-            topk_moe_cuda<2, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
-                                                                   clamp_val, scale_val, config);
+            ggml_cuda_kernel_launch(topk_moe_cuda<2, has_bias>, launch_params,
+                logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
             break;
         case 4:
-            topk_moe_cuda<4, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
-                                                                   clamp_val, scale_val, config);
+            ggml_cuda_kernel_launch(topk_moe_cuda<4, has_bias>, launch_params,
+                logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
             break;
         case 8:
-            topk_moe_cuda<8, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
-                                                                   clamp_val, scale_val, config);
+            ggml_cuda_kernel_launch(topk_moe_cuda<8, has_bias>, launch_params,
+                logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
             break;
         case 16:
-            topk_moe_cuda<16, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
-                                                                    clamp_val, scale_val, config);
+            ggml_cuda_kernel_launch(topk_moe_cuda<16, has_bias>, launch_params,
+                logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
             break;
         case 32:
-            topk_moe_cuda<32, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
-                                                                    clamp_val, scale_val, config);
+            ggml_cuda_kernel_launch(topk_moe_cuda<32, has_bias>, launch_params,
+                logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
             break;
         case 64:
-            topk_moe_cuda<64, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
-                                                                    clamp_val, scale_val, config);
+            ggml_cuda_kernel_launch(topk_moe_cuda<64, has_bias>, launch_params,
+                logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
             break;
         case 128:
-            topk_moe_cuda<128, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
-                                                                     clamp_val, scale_val, config);
+            ggml_cuda_kernel_launch(topk_moe_cuda<128, has_bias>, launch_params,
+                logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
             break;
         case 256:
-            topk_moe_cuda<256, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
-                                                                     clamp_val, scale_val, config);
+            ggml_cuda_kernel_launch(topk_moe_cuda<256, has_bias>, launch_params,
+                logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
             break;
         case 512:
-            topk_moe_cuda<512, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
-                                                                     clamp_val, scale_val, config);
+            ggml_cuda_kernel_launch(topk_moe_cuda<512, has_bias>, launch_params,
+                logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
             break;
         case 576:
-            topk_moe_cuda<576, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
-                                                                     clamp_val, scale_val, config);
+            ggml_cuda_kernel_launch(topk_moe_cuda<576, has_bias>, launch_params,
+                logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
             break;
         default:
             GGML_ASSERT(false && "fatal error");
diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu
index 2aeba26f414..4cb805fa601 100644
--- a/ggml/src/ggml-cuda/unary.cu
+++ b/ggml/src/ggml-cuda/unary.cu
@@ -116,19 +116,22 @@ static __device__ __forceinline__ float op_trunc(float x) {
 
 template <float (*op)(float), typename T>
 static __global__ void unary_op_kernel(const T * x, T * dst, const int k) {
+    ggml_cuda_pdl_lc();
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
         return;
     }
 
+    ggml_cuda_pdl_sync();
     dst[i] = (T)op((float)x[i]);
 }
 
 template <float (*op)(float), typename T>
 static void unary_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
-    unary_op_kernel<op><<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+    const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream);
+    ggml_cuda_kernel_launch(unary_op_kernel<op, T>, launch_params, x, dst, k);
 }
 
 template <float (*op)(float)>
@@ -258,6 +261,7 @@ void ggml_cuda_op_softplus(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
 template <float (*op)(float), typename T>
 static __global__ void unary_gated_op_kernel(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1) {
+    ggml_cuda_pdl_lc();
     const int64_t i = int64_t(blockDim.x)*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
@@ -268,13 +272,15 @@ static __global__ void unary_gated_op_kernel(const T * x, const T * g, T * dst,
     const int64_t j0 = (i / n) * o0 + (i % n);
     const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
 
+    ggml_cuda_pdl_sync();
     dst[i] = (T)(op((float)x[j0]) * (float)g[j1]);
 }
 
 template <float (*op)(float), typename T>
 static void unary_gated_cuda(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1, cudaStream_t stream) {
     const int64_t num_blocks = (k + CUDA_GLU_BLOCK_SIZE - 1) / CUDA_GLU_BLOCK_SIZE;
-    unary_gated_op_kernel<op><<<num_blocks, CUDA_GLU_BLOCK_SIZE, 0, stream>>>(x, g, dst, k, n, o0, o1);
+    const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_GLU_BLOCK_SIZE, 0, stream);
+    ggml_cuda_kernel_launch(unary_gated_op_kernel<op, T>, launch_params, x, g, dst, k, n, o0, o1);
 }
 
 template <float (*op)(float)>
diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 2f75e97ac66..5e8a4a740c1 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -68,6 +68,7 @@ static u32vec opt_pmu_evt { 0x3, 0x111, 0x100, 0x105, 0x240, 0x256, 0x7D, 0x8C }
 static int opt_opstage  = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE;
 static int opt_opbatch  = 1024; // max number of ops in a batch
 static int opt_opqueue  = 16;   // max number of pending batches
+static int opt_oppoll   = 0;    // polling for batch completions
 
 static std::regex* opt_opfilter = NULL; // regex of ops to not claim
 
@@ -550,7 +551,7 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
 
     size_t row_size    = ggml_row_size(t->type, t->ne[0]);
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));  // extra elements for the pad
-    size_t row_size_rp = row_size * 2;  // extra space for tmp pad (if any)
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size/2 quants + scales)
 
     // Ensure we don't try to read more data than is available in the source buffer 'data'
     // or write more than the tensor can hold.
@@ -611,7 +612,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
 
     size_t row_size    = ggml_row_size(t->type, t->ne[0]);
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));  // extra elements for the pad
-    size_t row_size_rp = row_size * 2;  // extra space for tmp pad (if any)
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size/2 quants + scales)
 
     // Ensure we don't try to copy more data than the tensor actually contains.
     const size_t total_tensor_size = (size_t)nrows * row_size;
@@ -660,6 +661,239 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
     ggml_aligned_free(buf_rp, row_size_rp);
 }
 
+static void unpack_q4_1_quants(uint8_t * qs, const block_q4_1 * x, unsigned int bi) {
+    static const int qk = QK4_1;
+
+    for (unsigned int i = 0; i < qk / 2; ++i) {
+        const int x0             = (x->qs[i] & 0x0F);
+        const int x1             = (x->qs[i] >> 4);
+        qs[bi * qk + i + 0]      = x0;
+        qs[bi * qk + i + qk / 2] = x1;
+    }
+}
+
+static void pack_q4_1_quants(block_q4_1 * x, const uint8_t * qs, unsigned int bi) {
+    static const int qk = QK4_1;
+
+    for (unsigned int i = 0; i < qk / 2; ++i) {
+        const uint8_t x0 = qs[bi * qk + i + 0];
+        const uint8_t x1 = qs[bi * qk + i + qk / 2];
+        x->qs[i]         = x0 | (x1 << 4);
+    }
+}
+
+static void repack_row_q4_1x4x2(uint8_t * y, const block_q4_1 * x, int64_t k) {
+    static const int qk = QK_Q4_0x4x2;
+    const int        nb = (k + qk - 1) / qk;  // number of blocks (padded)
+    const int        nloe = k % qk;           // leftovers
+
+    const int dblk_size = 8 * 4;              // 8x (d, m) __fp16 = 32 bytes
+    const int qblk_size = qk / 2;             // int4 = 128 bytes
+    const int qrow_size = k / 2;              // int4 (not padded to blocks)
+
+    uint8_t * y_q = y + 0;                    // quants first
+    uint8_t * y_d = y + qrow_size;            // then scales/offsets
+
+    // Repack the quants
+    for (int i = 0; i < nb; i++) {
+        uint8_t qs[QK_Q4_0x4x2];  // unpacked quants
+        unpack_q4_1_quants(qs, &x[i * 8 + 0], 0);
+        unpack_q4_1_quants(qs, &x[i * 8 + 1], 1);
+        unpack_q4_1_quants(qs, &x[i * 8 + 2], 2);
+        unpack_q4_1_quants(qs, &x[i * 8 + 3], 3);
+        unpack_q4_1_quants(qs, &x[i * 8 + 4], 4);
+        unpack_q4_1_quants(qs, &x[i * 8 + 5], 5);
+        unpack_q4_1_quants(qs, &x[i * 8 + 6], 6);
+        unpack_q4_1_quants(qs, &x[i * 8 + 7], 7);
+
+        bool partial = (nloe && i == nb-1);
+
+        uint8_t * q = y_q + (i * qblk_size);
+        for (int j = 0; j < qk / 2; j++) {
+            q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
+        }
+    }
+
+    // Repack the scales and offsets
+    for (int i = 0; i < nb; i++) {
+        ggml_half * d_m = (ggml_half *) (y_d + i * dblk_size);
+        for (int j = 0; j < 8; j++) {
+            d_m[j * 2 + 0] = x[i * 8 + j].d;
+            d_m[j * 2 + 1] = x[i * 8 + j].m;
+        }
+    }
+}
+
+static void unpack_row_q4_1x4x2(block_q4_1 * x, const uint8_t * y, int64_t k) {
+    static const int qk = QK_Q4_0x4x2;
+    const int        nb = (k + qk - 1) / qk;  // number of blocks (padded)
+    const int        nloe = k % qk;           // leftovers
+
+    const int dblk_size = 8 * 4;              // 8x (d, m) __fp16 = 32 bytes
+    const int qblk_size = qk / 2;             // int4 = 128 bytes
+    const int qrow_size = k / 2;              // int4 (not padded to blocks)
+
+    const uint8_t * y_q = y + 0;              // quants first
+    const uint8_t * y_d = y + qrow_size;      // then scales/offsets
+
+    // Unpack the quants
+    for (int i = 0; i < nb; i++) {
+        uint8_t qs[QK_Q4_0x4x2];
+        bool partial = (nloe && i == nb-1);
+
+        const uint8_t * q = y_q + (i * qblk_size);
+        for (int j = 0; j < qk / 2; j++) {
+            if (partial) {
+                qs[j*2+0] = q[j] & 0x0F;
+                qs[j*2+1] = q[j] >> 4;
+            } else {
+                qs[j+000] = q[j] & 0x0F;
+                qs[j+128] = q[j] >> 4;
+            }
+        }
+
+        pack_q4_1_quants(&x[i * 8 + 0], qs, 0);
+        pack_q4_1_quants(&x[i * 8 + 1], qs, 1);
+        pack_q4_1_quants(&x[i * 8 + 2], qs, 2);
+        pack_q4_1_quants(&x[i * 8 + 3], qs, 3);
+        pack_q4_1_quants(&x[i * 8 + 4], qs, 4);
+        pack_q4_1_quants(&x[i * 8 + 5], qs, 5);
+        pack_q4_1_quants(&x[i * 8 + 6], qs, 6);
+        pack_q4_1_quants(&x[i * 8 + 7], qs, 7);
+    }
+
+    // Unpack the scales and offsets
+    for (int i = 0; i < nb; i++) {
+        const ggml_half * d_m = (const ggml_half *) (y_d + i * dblk_size);
+        for (int j = 0; j < 8; j++) {
+            x[i * 8 + j].d = d_m[j * 2 + 0];
+            x[i * 8 + j].m = d_m[j * 2 + 1];
+        }
+    }
+}
+
+static void init_row_q4_1x4x2(block_q4_1 * x, int64_t k) {
+    static const int qk = QK_Q4_0x4x2;
+    const int        nb = (k + qk - 1) / qk;  // number of blocks (padded)
+
+    uint8_t qs[QK_Q4_0x4x2];  // unpacked quants
+    memset(qs, 0, sizeof(qs));
+
+    for (int i = 0; i < nb; i++) {
+        pack_q4_1_quants(&x[i * 8 + 0], qs, 0);
+        pack_q4_1_quants(&x[i * 8 + 1], qs, 1);
+        pack_q4_1_quants(&x[i * 8 + 2], qs, 2);
+        pack_q4_1_quants(&x[i * 8 + 3], qs, 3);
+        pack_q4_1_quants(&x[i * 8 + 4], qs, 4);
+        pack_q4_1_quants(&x[i * 8 + 5], qs, 5);
+        pack_q4_1_quants(&x[i * 8 + 6], qs, 6);
+        pack_q4_1_quants(&x[i * 8 + 7], qs, 7);
+    }
+
+    for (int i = 0; i < nb; i++) {
+        for (int j = 0; j < 8; j++) {
+            x[i * 8 + j].d = 0;
+            x[i * 8 + j].m = 0;
+        }
+    }
+}
+
+static void repack_q4_1_q4x4x2(ggml_tensor * t, const void * data, size_t size) {
+    int64_t nrows = ggml_nrows(t);
+
+    size_t row_size    = ggml_row_size(t->type, t->ne[0]);
+    size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size/2 quants + scales)
+
+    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
+
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t  n_rem_bytes = n_bytes_to_copy % row_size;
+
+    void * buf_pd = ggml_aligned_malloc(row_size_pd);
+    GGML_ASSERT(buf_pd != NULL);
+
+    void * buf_rp = ggml_aligned_malloc(row_size_rp);
+    GGML_ASSERT(buf_rp != NULL);
+
+    HEX_VERBOSE("ggml-hex: repack-q4_1-q4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
+                t->ne[0], nrows, row_size);
+
+    init_row_q4_1x4x2((block_q4_1 *) buf_pd, t->ne[0]);
+
+    for (int64_t i = 0; i < n_full_rows; i++) {
+        const uint8_t * src = (const uint8_t *) data + (i * row_size);
+        uint8_t *       dst = (uint8_t *) t->data + (i * row_size);
+
+        memcpy(buf_pd, src, row_size);
+        repack_row_q4_1x4x2((uint8_t *) buf_rp, (const block_q4_1 *) buf_pd, t->ne[0]);
+        memcpy(dst, buf_rp, row_size);
+    }
+
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) data + (i * row_size);
+        uint8_t *       dst = (uint8_t *) t->data + (i * row_size);
+
+        init_row_q4_1x4x2((block_q4_1 *) buf_pd, t->ne[0]);
+        memcpy(buf_pd, src, n_rem_bytes);
+        repack_row_q4_1x4x2((uint8_t *) buf_rp, (const block_q4_1 *) buf_pd, t->ne[0]);
+        memcpy(dst, buf_rp, n_rem_bytes);
+    }
+
+    ggml_aligned_free(buf_pd, row_size_pd);
+    ggml_aligned_free(buf_rp, row_size_rp);
+}
+
+static void repack_q4x4x2_q4_1(void * data, const ggml_tensor * t, size_t size) {
+    int64_t nrows = ggml_nrows(t);
+
+    size_t row_size    = ggml_row_size(t->type, t->ne[0]);
+    size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size/2 quants + scales)
+
+    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
+
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t  n_rem_bytes = n_bytes_to_copy % row_size;
+
+    void * buf_pd = ggml_aligned_malloc(row_size_pd);
+    GGML_ASSERT(buf_pd != NULL);
+
+    void * buf_rp = ggml_aligned_malloc(row_size_rp);
+    GGML_ASSERT(buf_rp != NULL);
+
+    HEX_VERBOSE("ggml-hex: repack-q4x4x2-q4_1 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
+                t->ne[0], nrows, row_size);
+
+    memset(buf_rp, 0, row_size_rp);  // clear-out padded buffer to make sure the tail is all zeros
+
+    for (int64_t i = 0; i < n_full_rows; i++) {
+        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
+        uint8_t *       dst = (uint8_t *) data + (i * row_size);
+
+        memcpy(buf_rp, src, row_size);
+        unpack_row_q4_1x4x2((block_q4_1 *) buf_pd, (const uint8_t *) buf_rp, t->ne[0]);
+        memcpy(dst, buf_pd, row_size);
+    }
+
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
+        uint8_t *       dst = (uint8_t *) data + (i * row_size);
+
+        // We still need to read and unpack the entire source row because quantization is block-based.
+        memcpy(buf_rp, src, row_size);
+        unpack_row_q4_1x4x2((block_q4_1 *) buf_pd, (const uint8_t *) buf_rp, t->ne[0]);
+        memcpy(dst, buf_pd, n_rem_bytes);
+    }
+
+    ggml_aligned_free(buf_pd, row_size_pd);
+    ggml_aligned_free(buf_rp, row_size_rp);
+}
+
 // ======== Q8x4x2 ====================
 static void dump_block_q8_0(const block_q8_0 * b, int i) {
     HEX_VERBOSE("ggml-hex: repack q8_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, b->qs[0], b->qs[1], b->qs[2],
@@ -876,7 +1110,7 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
 
     size_t row_size    = ggml_row_size(t->type, t->ne[0]);
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2));  // extra elements for the pad
-    size_t row_size_rp = row_size * 2;  // extra space for tmp pad (if any)
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size quants + scales)
 
     // Ensure we don't try to read more data than is available in the source buffer 'data'
     // or write more than the tensor can hold.
@@ -937,7 +1171,7 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
 
     size_t row_size    = ggml_row_size(t->type, t->ne[0]);
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2));  // extra elements for the pad
-    size_t row_size_rp = row_size * 2;  // extra space for tmp pad (if any)
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size quants + scales)
 
     // Ensure we don't try to copy more data than the tensor actually contains.
     const size_t total_tensor_size = (size_t)nrows * row_size;
@@ -1238,7 +1472,7 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
 
     size_t row_size    = ggml_row_size(t->type, t->ne[0]);
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2));  // extra elements for the pad
-    size_t row_size_rp = row_size * 2;  // extra space for tmp pad (if any)
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size/2 quants + scales)
 
     // Ensure we don't try to read more data than is available in the source buffer 'data'
     // or write more than the tensor can hold.
@@ -1299,7 +1533,7 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
 
     size_t row_size    = ggml_row_size(t->type, t->ne[0]);
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2));  // extra elements for the pad
-    size_t row_size_rp = row_size * 2;  // extra space for tmp pad (if any)
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size/2 quants + scales)
 
     // Ensure we don't try to copy more data than the tensor actually contains.
     const size_t total_tensor_size = (size_t)nrows * row_size;
@@ -1365,6 +1599,12 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
             repack_q4_0_q4x4x2(tensor, data, size);
             break;
 
+        case GGML_TYPE_Q4_1:
+            GGML_ASSERT(offset == 0);
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
+            repack_q4_1_q4x4x2(tensor, data, size);
+            break;
+
         case GGML_TYPE_Q8_0:
             GGML_ASSERT(offset == 0);
             GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
@@ -1407,6 +1647,12 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
             repack_q4x4x2_q4_0(data, tensor, size);
             break;
 
+        case GGML_TYPE_Q4_1:
+            GGML_ASSERT(offset == 0);
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
+            repack_q4x4x2_q4_1(data, tensor, size);
+            break;
+
         case GGML_TYPE_Q8_0:
             GGML_ASSERT(offset == 0);
             GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
@@ -1886,7 +2132,8 @@ void ggml_hexagon_session::flush_pending(bool all) {
         uint32_t               n_dbufs;
 
         // Read response packet from queue
-        int err = dspqueue_read(this->queue, &flags, 1, &n_dbufs, &dbuf, sizeof(rsp), &rsp_size, (uint8_t *) &rsp, DSPQUEUE_TIMEOUT);
+        const uint32_t timeo = opt_oppoll ? 0 : DSPQUEUE_TIMEOUT;
+        int err = dspqueue_read(this->queue, &flags, 1, &n_dbufs, &dbuf, sizeof(rsp), &rsp_size, (uint8_t *) &rsp, timeo);
         if (err == AEE_EEXPIRED) {
             continue;
         }
@@ -2327,6 +2574,7 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_MXFP4:
@@ -2377,6 +2625,7 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_MXFP4:
@@ -2661,7 +2910,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
 
     int mode = op_params[2];
 
-    if ((mode & GGML_ROPE_TYPE_MROPE) || (mode & GGML_ROPE_TYPE_VISION)) {
+    if (mode == GGML_ROPE_TYPE_VISION) {
         return false;
     }
     if (mode & 1) {
@@ -2735,9 +2984,10 @@ static bool ggml_hexagon_supported_ssm_conv(const struct ggml_hexagon_session *
     if (dst->ne[0] != d_inner || dst->ne[1] != n_t || dst->ne[2] != n_s) {
         return false;
     }
-
-    // TODO: add support for non-contiguous tensors
-    if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
+    if (src0->nb[0] != sizeof(float) || src1->nb[0] != sizeof(float) || dst->nb[0] != sizeof(float)) {
+        return false;
+    }
+    if (src0->nb[1] != src0->ne[0] * sizeof(float) || src1->nb[1] != src1->ne[0] * sizeof(float)) {
         return false;
     }
 
@@ -2870,8 +3120,10 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
         case GGML_OP_SET_ROWS:        return HTP_OP_SET_ROWS;
         case GGML_OP_SUM_ROWS:        return HTP_OP_SUM_ROWS;
         case GGML_OP_ARGSORT:         return HTP_OP_ARGSORT;
+        case GGML_OP_NORM:            return HTP_OP_NORM;
         case GGML_OP_L2_NORM:         return HTP_OP_L2_NORM;
         case GGML_OP_RMS_NORM:        return HTP_OP_RMS_NORM;
+        case GGML_OP_CONCAT:          return HTP_OP_CONCAT;
         case GGML_OP_SCALE:           return HTP_OP_SCALE;
         case GGML_OP_SQR:             return HTP_OP_SQR;
         case GGML_OP_SQRT:            return HTP_OP_SQRT;
@@ -3284,6 +3536,25 @@ static bool ggml_hexagon_supported_repeat(const struct ggml_hexagon_session * se
     return true;
 }
 
+static bool ggml_hexagon_supported_concat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
+    int dim = ((const int32_t *) op->op_params)[0];
+    if (dim < 0 || dim >= GGML_MAX_DIMS) {
+        return false;
+    }
+
+    for (int i = 0; i < GGML_MAX_SRC; ++i) {
+        const struct ggml_tensor * src = op->src[i];
+        if (!src) {
+            continue;
+        }
+        if (src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_I32 && src->type != GGML_TYPE_F16) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
 static bool ggml_hexagon_supported_fill(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
     const struct ggml_tensor * dst = op;
 
@@ -3338,10 +3609,8 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
             supp = ggml_hexagon_supported_add_id(sess, op);
             break;
 
+        case GGML_OP_NORM:
         case GGML_OP_L2_NORM:
-            supp = ggml_hexagon_supported_unary(sess, op);
-            break;
-
         case GGML_OP_RMS_NORM:
         case GGML_OP_SCALE:
             supp = ggml_hexagon_supported_unary(sess, op);
@@ -3434,6 +3703,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
             supp = ggml_hexagon_supported_cumsum(sess, op);
             break;
 
+        case GGML_OP_CONCAT:
+            supp = ggml_hexagon_supported_concat(sess, op);
+            break;
+
         case GGML_OP_FILL:
             supp = ggml_hexagon_supported_fill(sess, op);
             break;
@@ -3598,6 +3871,8 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
     // Basic sanity checks to make sure definitions match
     static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0,
                   "please update hexagon_type to match ggml_type");
+    static_assert((unsigned int) HTP_TYPE_Q4_1 == (unsigned int) GGML_TYPE_Q4_1,
+                  "please update hexagon_type to match ggml_type");
     static_assert((unsigned int) HTP_TYPE_Q8_0 == (unsigned int) GGML_TYPE_Q8_0,
                   "please update hexagon_type to match ggml_type");
     static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4,
@@ -3610,6 +3885,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
     const char * str_opstage  = getenv("GGML_HEXAGON_OPSTAGE");
     const char * str_opbatch  = getenv("GGML_HEXAGON_OPBATCH");
     const char * str_opqueue  = getenv("GGML_HEXAGON_OPQUEUE");
+    const char * str_oppoll   = getenv("GGML_HEXAGON_OPPOLL");
     const char * str_opfilter = getenv("GGML_HEXAGON_OPFILTER");
     const char * str_profile  = getenv("GGML_HEXAGON_PROFILE");
     const char * str_etm      = getenv("GGML_HEXAGON_ETM");
@@ -3647,6 +3923,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
     opt_opstage   = str_opstage  ? strtoul(str_opstage, NULL, 0)          : opt_opstage;
     opt_opbatch   = str_opbatch  ? strtoul(str_opbatch, NULL, 0)          : opt_opbatch;
     opt_opqueue   = str_opqueue  ? strtoul(str_opqueue, NULL, 0)          : opt_opqueue;
+    opt_oppoll    = str_oppoll   ? strtoul(str_oppoll,  NULL, 0)          : opt_oppoll;
     opt_profile   = str_profile  ? atoi(str_profile)                      : 0;
     opt_etm       = str_etm      ? atoi(str_etm)                          : 0;
     opt_nhvx      = str_nhvx     ? strtoul(str_nhvx, NULL, 0)             : opt_nhvx;
diff --git a/ggml/src/ggml-hexagon/htp/CMakeLists.txt b/ggml/src/ggml-hexagon/htp/CMakeLists.txt
index 36f923243cd..d7927261a85 100644
--- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt
+++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt
@@ -35,6 +35,7 @@ add_library(${HTP_LIB} SHARED
     ssm-conv.c
     cumsum-ops.c
     fill-ops.c
+    concat-ops.c
     diag-ops.c
     solve-tri-ops.c
     gated-delta-net-ops.c
@@ -58,14 +59,14 @@ list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)
 if (_hmx_idx GREATER_EQUAL 0)
     target_sources(${HTP_LIB} PRIVATE
         hmx-queue.c
-        hmx-matmul-ops.c
         hmx-flash-attn-ops.c
+        hmx-matmul-ops.c
     )
 
     # -mhmx enables HMX instruction set (needed by files that include hmx-utils.h)
     set_source_files_properties(
-        hmx-matmul-ops.c
         hmx-flash-attn-ops.c
+        hmx-matmul-ops.c
         PROPERTIES COMPILE_OPTIONS "-mhmx"
     )
 
diff --git a/ggml/src/ggml-hexagon/htp/concat-ops.c b/ggml/src/ggml-hexagon/htp/concat-ops.c
new file mode 100644
index 00000000000..61580f2c08f
--- /dev/null
+++ b/ggml/src/ggml-hexagon/htp/concat-ops.c
@@ -0,0 +1,275 @@
+#include "htp-ctx.h"
+#include "htp-ops.h"
+#include "hexagon_types.h"
+#include "hexagon_protos.h"
+#include "hvx_hexagon_protos.h"
+#include "hex-dma.h"
+#include "vtcm-utils.h"
+#include "hvx-utils.h"
+#include "hex-fastdiv.h"
+#include <string.h>
+
+struct htp_concat_context {
+    struct htp_ops_context * octx;
+    uint32_t dim;
+    uint32_t nrows_per_thread;
+    struct fastdiv_values div_ne0;
+    struct fastdiv_values div_ne1;
+    struct fastdiv_values div_ne2;
+};
+
+static void concat_2d_f32_transposed(unsigned int nth, unsigned int ith, void * data) {
+    struct htp_concat_context * cctx = (struct htp_concat_context *) data;
+    struct htp_ops_context * octx = cctx->octx;
+
+    const struct htp_tensor * src0 = octx->src[0];
+    const struct htp_tensor * src1 = octx->src[1];
+    const struct htp_tensor * dst  = octx->dst;
+
+    const uint32_t src0_ne0 = src0->ne[0];
+    const uint32_t src1_ne0 = src1->ne[0];
+    const uint32_t ne1      = dst->ne[1];
+
+    const uint32_t start_i = ith * cctx->nrows_per_thread;
+    const uint32_t end_i   = (start_i + cctx->nrows_per_thread < ne1) ? (start_i + cctx->nrows_per_thread) : ne1;
+    if (start_i >= end_i) return;
+
+    dma_queue * q = octx->ctx->dma[ith];
+
+    uint8_t * spad0_base = octx->src0_spad.data + ith * octx->src0_spad.size_per_thread;
+    uint8_t * spad1_base = octx->src1_spad.data + ith * octx->src1_spad.size_per_thread;
+
+    const uint32_t block_i = 32;
+    const uint32_t spad1_stride = block_i * sizeof(float);
+
+    int32_t offsets[32] __attribute__((aligned(128)));
+    for(int k=0; k<32; k++) {
+        offsets[k] = k * spad1_stride;
+    }
+    HVX_Vector vv = *(HVX_Vector*)offsets;
+    const uint32_t src1_ne0_padded = hex_round_up(src1_ne0, 32);
+    const uint32_t spad0_row_bytes = hex_round_up((src0_ne0 + src1_ne0_padded) * sizeof(float), VLEN);
+    uint32_t mu = src1_ne0_padded * spad1_stride;
+
+    for (uint32_t i = start_i; i < end_i; i += block_i) {
+        uint32_t current_block_i = (end_i - i < block_i) ? (end_i - i) : block_i;
+
+        uint32_t src1_width_bytes = current_block_i * sizeof(float);
+        uint8_t * src1_ptr = (uint8_t *)src1->data + i * src1->nb[1];
+        dma_queue_push(q, dma_make_ptr(spad1_base, src1_ptr), spad1_stride, src1->nb[0], src1_width_bytes, src1_ne0);
+
+        uint32_t src0_row_bytes = src0_ne0 * sizeof(float);
+        uint8_t * src0_ptr = (uint8_t *)src0->data + i * src0->nb[1];
+        dma_queue_push(q, dma_make_ptr(spad0_base, src0_ptr), spad0_row_bytes, src0->nb[1], src0_row_bytes, current_block_i);
+
+        dma_queue_pop(q); // src1
+
+        HVX_Vector * vtcm_tmp = (HVX_Vector *)(spad1_base + src1_ne0_padded * spad1_stride);
+
+        for (uint32_t j = 0; j < src1_ne0_padded; j += 32) {
+            #pragma unroll(4)
+            for (uint32_t ii = 0; ii < current_block_i; ii++) {
+                size_t rt = (size_t)(spad1_base + j * spad1_stride + ii * sizeof(float));
+                Q6_vgather_ARMVw(&vtcm_tmp[ii], rt, mu, vv);
+                uint8_t * dst_ptr = spad0_base + ii * spad0_row_bytes + (src0_ne0 + j) * sizeof(float);
+                hvx_vmemu(dst_ptr) = vtcm_tmp[ii];
+            }
+        }
+
+        dma_queue_pop(q); // src0
+
+        uint8_t * dst_ptr = (uint8_t *)dst->data + i * dst->nb[1];
+        dma_queue_push(q, dma_make_ptr(dst_ptr, spad0_base), dst->nb[1], spad0_row_bytes, (src0_ne0 + src1_ne0) * sizeof(float), current_block_i);
+
+        dma_queue_pop(q);
+    }
+}
+
+static void concat_2d_f16_transposed(unsigned int nth, unsigned int ith, void * data) {
+    struct htp_concat_context * cctx = (struct htp_concat_context *) data;
+    struct htp_ops_context * octx = cctx->octx;
+
+    const struct htp_tensor * src0 = octx->src[0];
+    const struct htp_tensor * src1 = octx->src[1];
+    const struct htp_tensor * dst  = octx->dst;
+
+    const uint32_t src0_ne0 = src0->ne[0];
+    const uint32_t src1_ne0 = src1->ne[0];
+    const uint32_t ne1      = dst->ne[1];
+
+    const uint32_t start_i = ith * cctx->nrows_per_thread;
+    const uint32_t end_i   = (start_i + cctx->nrows_per_thread < ne1) ? (start_i + cctx->nrows_per_thread) : ne1;
+    if (start_i >= end_i) return;
+
+    dma_queue * q = octx->ctx->dma[ith];
+
+    uint8_t * spad0_base = octx->src0_spad.data + ith * octx->src0_spad.size_per_thread;
+    uint8_t * spad1_base = octx->src1_spad.data + ith * octx->src1_spad.size_per_thread;
+
+    const uint32_t block_i = 64;
+    const uint32_t spad1_stride = block_i * sizeof(__fp16);
+
+    int16_t offsets[64] __attribute__((aligned(128)));
+    for(int k=0; k<64; k++) {
+        offsets[k] = k * spad1_stride;
+    }
+    HVX_Vector vv = *(HVX_Vector*)offsets;
+    const uint32_t src1_ne0_padded = hex_round_up(src1_ne0, 64);
+    const uint32_t spad0_row_bytes = hex_round_up((src0_ne0 + src1_ne0_padded) * sizeof(__fp16), VLEN);
+    uint32_t mu = src1_ne0_padded * spad1_stride;
+
+    for (uint32_t i = start_i; i < end_i; i += block_i) {
+        uint32_t current_block_i = (end_i - i < block_i) ? (end_i - i) : block_i;
+
+        uint32_t src1_width_bytes = current_block_i * sizeof(__fp16);
+        uint8_t * src1_ptr = (uint8_t *)src1->data + i * src1->nb[1];
+        dma_queue_push(q, dma_make_ptr(spad1_base, src1_ptr), spad1_stride, src1->nb[0], src1_width_bytes, src1_ne0);
+
+        uint32_t src0_row_bytes = src0_ne0 * sizeof(__fp16);
+        uint8_t * src0_ptr = (uint8_t *)src0->data + i * src0->nb[1];
+        dma_queue_push(q, dma_make_ptr(spad0_base, src0_ptr), spad0_row_bytes, src0->nb[1], src0_row_bytes, current_block_i);
+
+        dma_queue_pop(q); // src1
+
+        HVX_Vector * vtcm_tmp = (HVX_Vector *)(spad1_base + src1_ne0_padded * spad1_stride);
+
+        for (uint32_t j = 0; j < src1_ne0_padded; j += 64) {
+            #pragma unroll(4)
+            for (uint32_t ii = 0; ii < current_block_i; ii++) {
+                size_t rt = (size_t)(spad1_base + j * spad1_stride + ii * sizeof(__fp16));
+                Q6_vgather_ARMVh(&vtcm_tmp[ii], rt, mu, vv);
+                uint8_t * dst_ptr = spad0_base + ii * spad0_row_bytes + (src0_ne0 + j) * sizeof(__fp16);
+                hvx_vmemu(dst_ptr) = vtcm_tmp[ii];
+            }
+        }
+
+        dma_queue_pop(q); // src0
+
+        uint8_t * dst_ptr = (uint8_t *)dst->data + i * dst->nb[1];
+        dma_queue_push(q, dma_make_ptr(dst_ptr, spad0_base), dst->nb[1], spad0_row_bytes, (src0_ne0 + src1_ne0) * sizeof(__fp16), current_block_i);
+
+        dma_queue_pop(q);
+    }
+}
+
+static void concat_generic(unsigned int nth, unsigned int ith, void * data) {
+    struct htp_concat_context * cctx = (struct htp_concat_context *) data;
+    struct htp_ops_context * octx = cctx->octx;
+
+    const struct htp_tensor * src0 = octx->src[0];
+    const struct htp_tensor * src1 = octx->src[1];
+    const struct htp_tensor * dst  = octx->dst;
+
+    const int dim = cctx->dim;
+    const uint32_t type_size = (dst->type == HTP_TYPE_F32 || dst->type == HTP_TYPE_I32) ? 4 : 2;
+
+    const uint32_t ne[4] = {dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]};
+    const uint32_t total_elements = ne[0] * ne[1] * ne[2] * ne[3];
+    const uint32_t chunk_size = (total_elements + nth - 1) / nth;
+
+    const uint32_t start_idx = MIN(ith * chunk_size, total_elements);
+    const uint32_t end_idx   = MIN(start_idx + chunk_size, total_elements);
+
+    // Naive scalar element-wise copy
+    for (uint32_t idx = start_idx; idx < end_idx; idx++) {
+        uint32_t idx_div_ne0 = fastdiv(idx, &cctx->div_ne0);
+        uint32_t i0 = idx - idx_div_ne0 * ne[0];
+
+        uint32_t idx_div_ne01 = fastdiv(idx_div_ne0, &cctx->div_ne1);
+        uint32_t i1 = idx_div_ne0 - idx_div_ne01 * ne[1];
+
+        uint32_t idx_div_ne012 = fastdiv(idx_div_ne01, &cctx->div_ne2);
+        uint32_t i2 = idx_div_ne01 - idx_div_ne012 * ne[2];
+        uint32_t i3 = idx_div_ne012;
+
+        uint8_t * dst_ptr = (uint8_t *)dst->data + i3 * dst->nb[3] + i2 * dst->nb[2] + i1 * dst->nb[1] + i0 * dst->nb[0];
+
+        uint32_t idx_dim = 0;
+        if (dim == 0) idx_dim = i0;
+        else if (dim == 1) idx_dim = i1;
+        else if (dim == 2) idx_dim = i2;
+        else if (dim == 3) idx_dim = i3;
+
+        const struct htp_tensor * src = (idx_dim < src0->ne[dim]) ? src0 : src1;
+
+        uint32_t s0 = i0;
+        uint32_t s1 = i1;
+        uint32_t s2 = i2;
+        uint32_t s3 = i3;
+
+        if (dim == 0 && src == src1) s0 -= src0->ne[0];
+        if (dim == 1 && src == src1) s1 -= src0->ne[1];
+        if (dim == 2 && src == src1) s2 -= src0->ne[2];
+        if (dim == 3 && src == src1) s3 -= src0->ne[3];
+
+        uint8_t * src_ptr = (uint8_t *)src->data + s3 * src->nb[3] + s2 * src->nb[2] + s1 * src->nb[1] + s0 * src->nb[0];
+
+        if (type_size == 4) {
+            *(float*)dst_ptr = *(float*)src_ptr;
+        } else {
+            *(__fp16*)dst_ptr = *(__fp16*)src_ptr;
+        }
+    }
+}
+
+int op_concat(struct htp_ops_context * octx) {
+    const struct htp_tensor * src0 = octx->src[0];
+    const struct htp_tensor * src1 = octx->src[1];
+    const struct htp_tensor * dst  = octx->dst;
+
+    int dim = octx->op_params[0];
+
+    bool is_2d = dst->ne[2] == 1 && dst->ne[3] == 1;
+
+    const uint32_t type_size = (dst->type == HTP_TYPE_F32 || dst->type == HTP_TYPE_I32) ? 4 : 2;
+    bool is_src1_transposed  = (src1->nb[0] > src1->nb[1]);
+    bool is_src0_transposed  = (src0->nb[0] > src0->nb[1]);
+
+    uint32_t n_threads = octx->n_threads;
+    struct htp_concat_context cctx;
+    cctx.octx = octx;
+    cctx.dim = dim;
+    cctx.div_ne0 = init_fastdiv_values(dst->ne[0]);
+    cctx.div_ne1 = init_fastdiv_values(dst->ne[1]);
+    cctx.div_ne2 = init_fastdiv_values(dst->ne[2]);
+
+    void (*worker_func)(unsigned int, unsigned int, void *) = concat_generic;
+
+    if (dim == 0 && is_2d && is_src1_transposed && !is_src0_transposed) {
+        n_threads = MIN(dst->ne[1], n_threads);
+        if (n_threads < 1) {
+            n_threads = 1;
+        }
+        uint32_t block_i = (type_size == 4) ? 32 : 64;
+
+        cctx.nrows_per_thread = hmx_ceil_div(dst->ne[1], n_threads);
+
+        // Allocate VTCM
+        uint32_t spad1_stride = block_i * type_size;
+
+        uint32_t src1_ne0_padded = hex_round_up(src1->ne[0], block_i);
+        uint32_t spad0_row_bytes = hex_round_up((src0->ne[0] + src1_ne0_padded) * type_size, VLEN);
+
+        octx->src0_spad.size_per_thread = block_i * spad0_row_bytes;
+        octx->src1_spad.size_per_thread = src1_ne0_padded * spad1_stride + block_i * VLEN;
+
+        octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
+        octx->src1_spad.size = n_threads * octx->src1_spad.size_per_thread;
+
+        if (octx->src0_spad.size + octx->src1_spad.size > octx->ctx->vtcm_size) {
+            return HTP_STATUS_VTCM_TOO_SMALL;
+        }
+
+        octx->src0_spad.data = octx->ctx->vtcm_base;
+        octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
+
+        if (type_size == 4) {
+            worker_func = concat_2d_f32_transposed;
+        } else {
+            worker_func = concat_2d_f16_transposed;
+        }
+    }
+
+    worker_pool_run_func(octx->ctx->worker_pool, worker_func, &cctx, n_threads);
+    return HTP_STATUS_OK;
+}
diff --git a/ggml/src/ggml-hexagon/htp/cpy-ops.c b/ggml/src/ggml-hexagon/htp/cpy-ops.c
index 5c040a32224..ae507effa51 100644
--- a/ggml/src/ggml-hexagon/htp/cpy-ops.c
+++ b/ggml/src/ggml-hexagon/htp/cpy-ops.c
@@ -28,158 +28,170 @@ struct htp_copy_context {
     uint32_t          dst_blocks_per_row;
 
     uint32_t          src0_nrows_per_thread;
-
-    void (*copy)(struct htp_copy_context * ct, struct htp_ops_context * octx, int nth, int ith);
 };
 
 #define cpy_preamble                              \
     const struct htp_tensor *src0 = octx->src[0]; \
     const struct htp_tensor *dst  = octx->dst;    \
                                                   \
-    const uint32_t ne00 = src0->ne[0];     \
-    const uint32_t ne01 = src0->ne[1];     \
-    const uint32_t ne02 = src0->ne[2];     \
-    const uint32_t ne03 = src0->ne[3];     \
-                                           \
-    const uint32_t nb00 = src0->nb[0];     \
-    const uint32_t nb01 = src0->nb[1];     \
-    const uint32_t nb02 = src0->nb[2];     \
-    const uint32_t nb03 = src0->nb[3];     \
-                                           \
-    const uint32_t  ne0 = dst->ne[0];      \
-    const uint32_t  ne1 = dst->ne[1];      \
-    const uint32_t  ne2 = dst->ne[2];      \
-    const uint32_t  ne3 = dst->ne[3];      \
-                                           \
-    const uint32_t  nb0 = dst->nb[0];      \
-    const uint32_t  nb1 = dst->nb[1];      \
-    const uint32_t  nb2 = dst->nb[2];      \
-    const uint32_t  nb3 = dst->nb[3];      \
-                                           \
+    const uint32_t ne00 = src0->ne[0];            \
+    const uint32_t ne01 = src0->ne[1];            \
+    const uint32_t ne02 = src0->ne[2];            \
+    const uint32_t ne03 = src0->ne[3];            \
+                                                  \
+    const uint32_t nb00 = src0->nb[0];            \
+    const uint32_t nb01 = src0->nb[1];            \
+    const uint32_t nb02 = src0->nb[2];            \
+    const uint32_t nb03 = src0->nb[3];            \
+                                                  \
+    const uint32_t  ne0 = dst->ne[0];             \
+    const uint32_t  ne1 = dst->ne[1];             \
+    const uint32_t  ne2 = dst->ne[2];             \
+    const uint32_t  ne3 = dst->ne[3];             \
+                                                  \
+    const uint32_t  nb0 = dst->nb[0];             \
+    const uint32_t  nb1 = dst->nb[1];             \
+    const uint32_t  nb2 = dst->nb[2];             \
+    const uint32_t  nb3 = dst->nb[3];             \
+                                                  \
     const uint32_t   nr = ne01;
 
-static void cpy_thread_sametype_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) {
-    cpy_preamble;
-
-    // parallelize by src0 rows
-    const uint32_t dr  = ct->src0_nrows_per_thread;
-    const uint32_t ir0 = dr * ith;
-    const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;
-
-    // copy by rows
-    for (uint32_t i03 = 0; i03 < ne03; i03++) {
-        for (uint32_t i02 = 0; i02 < ne02; i02++) {
-            #pragma unroll(2)
-            for (uint32_t i01 = ir0; i01 < ir1; i01++) {
-                uint8_t* dst_ptr  = (uint8_t*) dst->data  + i01*nb1  + i02*nb2  + i03*nb3;
-                uint8_t* src0_ptr = (uint8_t*) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                hex_l2fetch(src0_ptr, ne00 * ct->src0_type_size, nb01, 2);
-                hvx_copy_uu(dst_ptr, src0_ptr, ne00, ct->src0_type_size);
-            }
-        }
-    }
+#define DEFINE_CPY_SAMESHAPE(NAME, ELEM_TYPE, ELEM_SIZE)                                                       \
+static void cpy_thread_##NAME##_sameshape(unsigned int nth, unsigned int ith, void * data) {                   \
+    struct htp_copy_context * ct = (struct htp_copy_context *) data;                                           \
+    struct htp_ops_context * octx = ct->octx;                                                                  \
+    cpy_preamble;                                                                                              \
+    const uint32_t dr  = ct->src0_nrows_per_thread;                                                            \
+    const uint32_t ir0 = dr * ith;                                                                             \
+    const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;                                                    \
+    if (ir0 >= nr) return;                                                                                     \
+    for (uint32_t i03 = 0; i03 < ne03; i03++) {                                                                \
+        for (uint32_t i02 = 0; i02 < ne02; i02++) {                                                            \
+            _Pragma("unroll(4)")                                                                               \
+            for (uint32_t i01 = ir0; i01 < ir1; i01++) {                                                       \
+                uint8_t* dst_ptr  = (uint8_t*) dst->data  + i01*nb1  + i02*nb2  + i03*nb3;                     \
+                uint8_t* src0_ptr = (uint8_t*) src0->data + i01*nb01 + i02*nb02 + i03*nb03;                    \
+                hex_l2fetch(src0_ptr, ne00 * ELEM_SIZE, nb01, 2);                                              \
+                hvx_copy_uu(dst_ptr, src0_ptr, ne00, ELEM_SIZE);                                               \
+            }                                                                                                  \
+        }                                                                                                      \
+    }                                                                                                          \
 }
 
-static void cpy_thread_sametype_reshape(struct htp_copy_context * ct, struct htp_ops_context * octx, int nth, int ith) {
-    cpy_preamble;
-
-    // parallelize by src0 rows
-    const uint32_t dr  = ct->src0_nrows_per_thread;
-    const uint32_t ir0 = dr * ith;
-    const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;
-
-    // Fast path: when both src0 and dst are contiguous in memory
-    // Replace the element-by-element loop with a single bulk HVX copy per (i03, i02) slice.
-    const bool src0_contig = (nb00 == ct->src0_type_size) &&
-                             (nb01 == ne00 * nb00) &&
-                             (nb02 == ne01 * nb01) &&
-                             (nb03 == ne02 * nb02);
-    const bool dst_contig  = (nb0  == ct->dst_type_size)  &&
-                             (nb1  == ne0  * nb0)  &&
-                             (nb2  == ne1  * nb1)  &&
-                             (nb3  == ne2  * nb2);
-
-    if (src0_contig && dst_contig) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                uint8_t * src_ptr = (uint8_t *) src0->data + i03*nb03 + i02*nb02 + ir0*nb01;
-                uint32_t  flat    = ((i03*ne02 + i02)*ne01 + ir0) * ne00;
-                uint8_t * dst_ptr = (uint8_t *) dst->data  + flat * ct->src0_type_size;
-                hvx_copy_uu(dst_ptr, src_ptr, (ir1 - ir0) * ne00, ct->src0_type_size);
-            }
-        }
-        return;
-    }
-
-    // dst counters
-    int64_t k10 = 0;
-    int64_t i11 = 0;
-    int64_t i12 = 0;
-    int64_t i13 = 0;
-
-    // number of blocks in a row
-    const int64_t nk00 = ct->src0_blocks_per_row;
-    const int64_t nk0  = ct->dst_blocks_per_row;
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            k10 += nk00 * ir0;
-            while (k10 >= nk0) {
-                k10 -= nk0;
-                if (++i11 == ne1) {
-                    i11 = 0;
-                    if (++i12 == ne2) {
-                        i12 = 0;
-                        if (++i13 == ne3) {
-                            i13 = 0;
-                        }
-                    }
-                }
-            }
-            for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                for (int64_t k00 = 0; k00 < nk00; k00++) {
-                    const char * src0_ptr = ((char *) src0->data + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                          char * dst_ptr  = ((char *)  dst->data + k10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-                    memcpy(dst_ptr, src0_ptr, ct->dst_type_size);
-
-                    if (++k10 == nk0) {
-                        k10 = 0;
-                        if (++i11 == ne1) {
-                            i11 = 0;
-                            if (++i12 == ne2) {
-                                i12 = 0;
-                                if (++i13 == ne3) {
-                                    i13 = 0;
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-            k10 += nk00 * (ne01 - ir1);
-            while (k10 >= nk0) {
-                k10 -= nk0;
-                if (++i11 == ne1) {
-                    i11 = 0;
-                    if (++i12 == ne2) {
-                        i12 = 0;
-                        if (++i13 == ne3) {
-                            i13 = 0;
-                        }
-                    }
-                }
-            }
-        }
-    }
+DEFINE_CPY_SAMESHAPE(f32, float, 4)
+DEFINE_CPY_SAMESHAPE(f16, __fp16, 2)
+
+#define DEFINE_CPY_RESHAPE(NAME, ELEM_TYPE, ELEM_SIZE)                                                         \
+static void cpy_thread_##NAME##_reshape(unsigned int nth, unsigned int ith, void * data) {                     \
+    struct htp_copy_context * ct = (struct htp_copy_context *) data;                                           \
+    struct htp_ops_context * octx = ct->octx;                                                                  \
+    cpy_preamble;                                                                                              \
+    const uint32_t dr  = ct->src0_nrows_per_thread;                                                            \
+    const uint32_t ir0 = dr * ith;                                                                             \
+    const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;                                                    \
+    if (ir0 >= nr) return;                                                                                     \
+    const bool src0_contig = (nb00 == ELEM_SIZE)   &&                                                          \
+                             (nb01 == ne00 * nb00) &&                                                          \
+                             (nb02 == ne01 * nb01) &&                                                          \
+                             (nb03 == ne02 * nb02);                                                            \
+    const bool dst_contig  = (nb0  == ELEM_SIZE)   &&                                                          \
+                             (nb1  == ne0  * nb0)  &&                                                          \
+                             (nb2  == ne1  * nb1)  &&                                                          \
+                             (nb3  == ne2  * nb2);                                                             \
+    if (src0_contig && dst_contig) {                                                                           \
+        for (int64_t i03 = 0; i03 < ne03; i03++) {                                                             \
+            for (int64_t i02 = 0; i02 < ne02; i02++) {                                                         \
+                uint8_t * src_ptr = (uint8_t *) src0->data + i03*nb03 + i02*nb02 + ir0*nb01;                   \
+                uint32_t  flat    = ((i03*ne02 + i02)*ne01 + ir0) * ne00;                                      \
+                uint8_t * dst_ptr = (uint8_t *) dst->data  + flat * ELEM_SIZE;                                 \
+                hvx_copy_uu(dst_ptr, src_ptr, (ir1 - ir0) * ne00, ELEM_SIZE);                                  \
+            }                                                                                                  \
+        }                                                                                                      \
+        return;                                                                                                \
+    }                                                                                                          \
+    const bool reshape_flat_fast = (ne03 == 1 && ne2 == 1 && ne3 == 1) &&                                      \
+                                   (ne0 == ne00 * ne01) && (ne1 == ne02) &&                                    \
+                                   (nb00 == ELEM_SIZE) && (nb0 == ELEM_SIZE);                                  \
+    if (reshape_flat_fast) {                                                                                   \
+        for (uint32_t i02 = 0; i02 < ne02; i02++) {                                                            \
+            for (uint32_t i01 = ir0; i01 < ir1; i01++) {                                                       \
+                uint8_t * src0_ptr = (uint8_t *) src0->data + i01 * nb01 + i02 * nb02;                         \
+                uint8_t * dst_ptr  = (uint8_t *) dst->data  + i01 * ne00 * ELEM_SIZE + i02 * nb1;              \
+                hvx_copy_uu(dst_ptr, src0_ptr, ne00, ELEM_SIZE);                                               \
+            }                                                                                                  \
+        }                                                                                                      \
+        return;                                                                                                \
+    }                                                                                                          \
+    int64_t k10 = 0;                                                                                           \
+    int64_t i11 = 0;                                                                                           \
+    int64_t i12 = 0;                                                                                           \
+    int64_t i13 = 0;                                                                                           \
+    const int64_t nk00 = ct->src0_blocks_per_row;                                                              \
+    const int64_t nk0  = ct->dst_blocks_per_row;                                                               \
+    for (int64_t i03 = 0; i03 < ne03; i03++) {                                                                 \
+        for (int64_t i02 = 0; i02 < ne02; i02++) {                                                             \
+            k10 += nk00 * ir0;                                                                                 \
+            while (k10 >= nk0) {                                                                               \
+                k10 -= nk0;                                                                                    \
+                if (++i11 == ne1) {                                                                            \
+                    i11 = 0;                                                                                   \
+                    if (++i12 == ne2) {                                                                        \
+                        i12 = 0;                                                                               \
+                        if (++i13 == ne3) {                                                                    \
+                            i13 = 0;                                                                           \
+                        }                                                                                      \
+                    }                                                                                          \
+                }                                                                                              \
+            }                                                                                                  \
+            for (int64_t i01 = ir0; i01 < ir1; i01++) {                                                        \
+                for (int64_t k00 = 0; k00 < nk00; k00++) {                                                     \
+                    const char * src0_ptr = ((char *) src0->data + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); \
+                          char * dst_ptr  = ((char *)  dst->data + k10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);  \
+                    memcpy(dst_ptr, src0_ptr, ELEM_SIZE);                                                      \
+                    if (++k10 == nk0) {                                                                        \
+                        k10 = 0;                                                                               \
+                        if (++i11 == ne1) {                                                                    \
+                            i11 = 0;                                                                           \
+                            if (++i12 == ne2) {                                                                \
+                                i12 = 0;                                                                       \
+                                if (++i13 == ne3) {                                                            \
+                                    i13 = 0;                                                                   \
+                                }                                                                              \
+                            }                                                                                  \
+                        }                                                                                      \
+                    }                                                                                          \
+                }                                                                                              \
+            }                                                                                                  \
+            k10 += nk00 * (ne01 - ir1);                                                                        \
+            while (k10 >= nk0) {                                                                               \
+                k10 -= nk0;                                                                                    \
+                if (++i11 == ne1) {                                                                            \
+                    i11 = 0;                                                                                   \
+                    if (++i12 == ne2) {                                                                        \
+                        i12 = 0;                                                                               \
+                        if (++i13 == ne3) {                                                                    \
+                            i13 = 0;                                                                           \
+                        }                                                                                      \
+                    }                                                                                          \
+                }                                                                                              \
+            }                                                                                                  \
+        }                                                                                                      \
+    }                                                                                                          \
 }
 
-static void cpy_thread_f16_f32_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) {
+DEFINE_CPY_RESHAPE(f32, float, 4)
+DEFINE_CPY_RESHAPE(f16, __fp16, 2)
+
+static void cpy_thread_f16_f32_sameshape(unsigned int nth, unsigned int ith, void * data) {
+    struct htp_copy_context * ct = (struct htp_copy_context *) data;
+    struct htp_ops_context * octx = ct->octx;
     cpy_preamble;
 
     // parallelize by src0 rows
     const uint32_t dr  = ct->src0_nrows_per_thread;
     const uint32_t ir0 = dr * ith;
     const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;
+    if (ir0 >= nr) return;
 
     // copy by rows
     for (uint32_t i03 = 0; i03 < ne03; i03++) {
@@ -195,13 +207,16 @@ static void cpy_thread_f16_f32_sameshape(struct htp_copy_context * ct, struct ht
     }
 }
 
-static void cpy_thread_f32_f16_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) {
+static void cpy_thread_f32_f16_sameshape(unsigned int nth, unsigned int ith, void * data) {
+    struct htp_copy_context * ct = (struct htp_copy_context *) data;
+    struct htp_ops_context * octx = ct->octx;
     cpy_preamble;
 
     // parallelize by src0 rows
     const uint32_t dr  = ct->src0_nrows_per_thread;
     const uint32_t ir0 = dr * ith;
     const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;
+    if (ir0 >= nr) return;
 
     // copy by rows
     for (uint32_t i03 = 0; i03 < ne03; i03++) {
@@ -217,11 +232,6 @@ static void cpy_thread_f32_f16_sameshape(struct htp_copy_context * ct, struct ht
     }
 }
 
-static void cpy_work_func(unsigned int n, unsigned int i, void *data) {
-    struct htp_copy_context *ct = (struct htp_copy_context *) data;
-    ct->copy(ct, ct->octx, n, i);
-}
-
 int op_cpy(struct htp_ops_context * octx) {
     cpy_preamble;
 
@@ -254,22 +264,32 @@ int op_cpy(struct htp_ops_context * octx) {
 
     ct.src0_nrows_per_thread = (nr + n_threads - 1) / n_threads;
 
+    worker_callback_t copy_fun;
+
     if (sametype && sameshape) {
-        ct.copy = cpy_thread_sametype_sameshape;
+        if (src0->type == HTP_TYPE_F32) {
+            copy_fun = cpy_thread_f32_sameshape;
+        } else {
+            copy_fun = cpy_thread_f16_sameshape;
+        }
     } else if (sameshape) {
         /**/ if (dst->type == HTP_TYPE_F16 && src0->type == HTP_TYPE_F32)
-            ct.copy = cpy_thread_f16_f32_sameshape;
+            copy_fun = cpy_thread_f16_f32_sameshape;
         else if (dst->type == HTP_TYPE_F32 && src0->type == HTP_TYPE_F16)
-            ct.copy = cpy_thread_f32_f16_sameshape;
+            copy_fun = cpy_thread_f32_f16_sameshape;
         else
             return HTP_STATUS_NO_SUPPORT;
     } else if (sametype) {
-        ct.copy = cpy_thread_sametype_reshape;
+        if (src0->type == HTP_TYPE_F32) {
+            copy_fun = cpy_thread_f32_reshape;
+        } else {
+            copy_fun = cpy_thread_f16_reshape;
+        }
     } else {
         return HTP_STATUS_NO_SUPPORT;
     }
 
-    worker_pool_run_func(octx->ctx->worker_pool, cpy_work_func, &ct, n_threads);
+    worker_pool_run_func(octx->ctx->worker_pool, copy_fun, &ct, n_threads);
 
     return HTP_STATUS_OK;
 }
diff --git a/ggml/src/ggml-hexagon/htp/get-rows-ops.c b/ggml/src/ggml-hexagon/htp/get-rows-ops.c
index 5a1dc933860..bf7063e9880 100644
--- a/ggml/src/ggml-hexagon/htp/get-rows-ops.c
+++ b/ggml/src/ggml-hexagon/htp/get-rows-ops.c
@@ -17,9 +17,13 @@
 
 struct get_rows_context {
     struct htp_ops_context * octx;
-    uint32_t src1_nrows_per_thread;
+    uint32_t tasks_per_thread;
+    uint32_t total_tasks;
+    uint32_t chunks_per_row;
+    uint32_t chunk_size;
     struct fastdiv_values get_rows_div_ne10;
     struct fastdiv_values get_rows_div_ne10_ne11;
+    struct fastdiv_values get_rows_div_chunks_per_row;
 };
 
 #define get_rows_preamble \
@@ -52,20 +56,23 @@ struct get_rows_context {
                                                \
     const uint32_t nr = ne10 * ne11 * ne12;
 
-static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *data) {
+static void get_rows_thread_f32_f32_dma(unsigned int nth, unsigned int ith, void *data) {
     struct get_rows_context * grctx = (struct get_rows_context *)data;
     struct htp_ops_context * octx = grctx->octx;
     get_rows_preamble;
 
     uint64_t qt = HAP_perf_get_qtimer_count();
 
-    // parallelize by src1 elements (which correspond to dst rows)
-    const uint32_t dr  = grctx->src1_nrows_per_thread;
+    const uint32_t dr  = grctx->tasks_per_thread;
     const uint32_t ir0 = dr * ith;
-    const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
+    if (ir0 >= grctx->total_tasks) {
+        return;
+    }
+    const uint32_t ir1 = MIN(ir0 + dr, grctx->total_tasks);
 
     const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32);
 
+    dma_queue * dma_queue = octx->ctx->dma[ith];
     for (uint32_t i = ir0; i < ir1; ++i) {
         const uint32_t i12 = fastdiv(i, &grctx->get_rows_div_ne10_ne11);
         const uint32_t rem = i - i12 * ne11 * ne10;
@@ -73,28 +80,76 @@ static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da
         const uint32_t i10 = rem - i11 * ne10;
 
         const uintptr_t src1_addr = octx->src[1]->data + i10*nb10 + i11*nb11 + i12*nb12;
-
         uint32_t i01 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr;
 
         if (i01 >= ne01) {
-            // invalid index, skip for now to avoid crash
             continue;
         }
 
         const uintptr_t src0_ptr = octx->src[0]->data + i01*nb01 + i11*nb02 + i12*nb03;
         const uintptr_t dst_ptr  = octx->dst->data    + i10*nb1  + i11*nb2  + i12*nb3;
-        hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00);
+
+        while (!dma_queue_push(dma_queue, dma_make_ptr((void *)dst_ptr, (const void *)src0_ptr), nb1, nb01, ne00 * sizeof(float), 1)) {
+            dma_queue_pop(dma_queue);
+        }
     }
+    dma_queue_flush(dma_queue);
 
     qt = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - qt);
-    FARF(HIGH, "get-rows-f32-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
+    FARF(HIGH, "get-rows-f32-f32-dma %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
          ne00, ne01, ne02, ne03, ir0, ir1, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, (unsigned) qt);
 }
 
-int op_get_rows(struct htp_ops_context * octx) {
+static void get_rows_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void *data) {
+    struct get_rows_context * grctx = (struct get_rows_context *)data;
+    struct htp_ops_context * octx = grctx->octx;
     get_rows_preamble;
 
-    const uint32_t n_threads = MIN(nr, octx->n_threads);
+    uint64_t qt = HAP_perf_get_qtimer_count();
+
+    const uint32_t dr  = grctx->tasks_per_thread;
+    const uint32_t ir0 = dr * ith;
+    if (ir0 >= grctx->total_tasks) {
+        return;
+    }
+    const uint32_t ir1 = MIN(ir0 + dr, grctx->total_tasks);
+
+    const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32);
+
+    const uint32_t chunks_per_row = grctx->chunks_per_row;
+    const uint32_t chunk_size     = grctx->chunk_size;
+    for (uint32_t i = ir0; i < ir1; ++i) {
+        const uint32_t row_idx   = fastdiv(i, &grctx->get_rows_div_chunks_per_row);
+        const uint32_t chunk_idx = i - row_idx * chunks_per_row;
+
+        const uint32_t i12 = fastdiv(row_idx, &grctx->get_rows_div_ne10_ne11);
+        const uint32_t rem = row_idx - i12 * ne11 * ne10;
+        const uint32_t i11 = fastdiv(rem, &grctx->get_rows_div_ne10);
+        const uint32_t i10 = rem - i11 * ne10;
+
+        const uintptr_t src1_addr = octx->src[1]->data + i10*nb10 + i11*nb11 + i12*nb12;
+        uint32_t i01 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr;
+
+        if (i01 >= ne01) {
+            continue;
+        }
+
+        const uint32_t offset = chunk_idx * chunk_size;
+        if (offset < ne00) {
+            const uint32_t copy_size = MIN(chunk_size, ne00 - offset);
+            const uintptr_t src0_ptr = octx->src[0]->data + i01*nb01 + i11*nb02 + i12*nb03 + offset * sizeof(float);
+            const uintptr_t dst_ptr  = octx->dst->data    + i10*nb1  + i11*nb2  + i12*nb3  + offset * sizeof(float);
+            hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, copy_size);
+        }
+    }
+
+    qt = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - qt);
+    FARF(HIGH, "get-rows-f32-f32-hvx %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
+         ne00, ne01, ne02, ne03, ir0, ir1, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, (unsigned) qt);
+}
+
+int op_get_rows(struct htp_ops_context * octx) {
+    get_rows_preamble;
 
     if (octx->src[0]->type != HTP_TYPE_F32) {
         return HTP_STATUS_NO_SUPPORT;
@@ -112,13 +167,52 @@ int op_get_rows(struct htp_ops_context * octx) {
         return HTP_STATUS_OK;
     }
 
+    const uint32_t nb00 = octx->src[0]->nb[0];
+    const uint32_t nb0  = octx->dst->nb[0];
+
+    const bool can_use_dma = (nb00 == sizeof(float)) && (nb0 == sizeof(float));
+    const bool use_dma = can_use_dma && (ne00 >= 2048);
+
     struct get_rows_context grctx;
     grctx.octx = octx;
     grctx.get_rows_div_ne10      = init_fastdiv_values(octx->src[1]->ne[0]);
     grctx.get_rows_div_ne10_ne11 = init_fastdiv_values(octx->src[1]->ne[0] * octx->src[1]->ne[1]);
 
-    grctx.src1_nrows_per_thread = (nr + n_threads - 1) / n_threads;
+    if (use_dma) {
+        grctx.chunks_per_row = 1;
+        grctx.chunk_size = ne00;
+        grctx.total_tasks = nr;
+        grctx.get_rows_div_chunks_per_row = init_fastdiv_values(1);
+
+        const uint32_t n_threads = MIN(nr, octx->n_threads);
+        grctx.tasks_per_thread = (nr + n_threads - 1) / n_threads;
+
+        worker_pool_run_func(octx->ctx->worker_pool, get_rows_thread_f32_f32_dma, &grctx, n_threads);
+    } else {
+        uint32_t chunks_per_row = 1;
+        uint32_t chunk_size = ne00;
+        uint32_t total_tasks = nr;
+
+        if (nr < octx->n_threads) {
+            const uint32_t min_chunk_size = 1024;
+            uint32_t max_chunks = ne00 / min_chunk_size;
+            if (max_chunks == 0) {
+                max_chunks = 1;
+            }
+            chunks_per_row = MIN((octx->n_threads + nr - 1) / nr, max_chunks);
+            chunk_size = (ne00 + chunks_per_row - 1) / chunks_per_row;
+            total_tasks = nr * chunks_per_row;
+        }
+
+        grctx.chunks_per_row = chunks_per_row;
+        grctx.chunk_size = chunk_size;
+        grctx.total_tasks = total_tasks;
+        grctx.get_rows_div_chunks_per_row = init_fastdiv_values(chunks_per_row);
 
-    worker_pool_run_func(octx->ctx->worker_pool, get_rows_thread_f32_f32, &grctx, n_threads);
+        const uint32_t n_threads = MIN(total_tasks, octx->n_threads);
+        grctx.tasks_per_thread = (total_tasks + n_threads - 1) / n_threads;
+
+        worker_pool_run_func(octx->ctx->worker_pool, get_rows_thread_f32_f32_hvx, &grctx, n_threads);
+    }
     return HTP_STATUS_OK;
 }
diff --git a/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c b/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c
index 4a4ff0b331d..a496f6289ae 100644
--- a/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c
@@ -50,8 +50,8 @@ static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV,
     const size_t g_br         = hex_align_up(gqa_factor * Br, HMX_FP16_TILE_N_ROWS);
     const size_t q_tile_size  = hex_align_up(g_br * DK * sizeof(__fp16), 4096);    // Q:  [g_br, DK]
     const size_t o_tile_size  = hex_align_up(g_br * DV * sizeof(__fp16), 4096);    // O:  [g_br, DV] x2 ping-pong
-    const size_t k_dma_size   = hex_align_up(Bc * DK * sizeof(__fp16), 4096);      // K DMA: [Bc, DK] x2 double-buf
-    const size_t v_dma_size   = hex_align_up(Bc * DV * sizeof(__fp16), 4096);      // V DMA: [Bc, DV] x2 double-buf
+    const size_t k_dma_size   = hex_align_up(Bc * hex_round_up(DK * sizeof(__fp16), 128), 4096);      // K DMA: [Bc, DK] x2 double-buf
+    const size_t v_dma_size   = hex_align_up(Bc * hex_round_up(DV * sizeof(__fp16), 128), 4096);      // V DMA: [Bc, DV] x2 double-buf
     const size_t k_tile_size  = hex_align_up(Bc * DK * sizeof(__fp16), 4096);      // K tiles: [Bc, DK] interleaved
     const size_t v_tile_size  = hex_align_up(Bc * DV * sizeof(__fp16), 4096);      // V tiles: [Bc, DV] interleaved
     const size_t s_tile_size  = hex_align_up(g_br * Bc * sizeof(__fp16), 4096);    // S/P:[g_br, Bc]
@@ -852,9 +852,10 @@ static void fa_softmax_thread(unsigned int n, unsigned int i, void * data) {
             v_s_rowmax1 = hvx_vec_reduce_max_f16(v_s_rowmax1);
 
             // Splat m_prev[r], m_prev[r+1] from the per-row accumulator.
-            // vror brings the target lane to lane 0, then extract + re-splat.
-            HVX_Vector v_m_prev0 = hvx_vec_splat_f16(hvx_vec_get_f16(Q6_V_vror_VR(m_prev_v, r_vec_off * 2)));
-            HVX_Vector v_m_prev1 = hvx_vec_splat_f16(hvx_vec_get_f16(Q6_V_vror_VR(m_prev_v, (r_vec_off + 1) * 2)));
+            // vror brings the target lane to lane 0, then vdelta replicates it
+            // across all lanes — stays in the vector domain (no store/reload).
+            HVX_Vector v_m_prev0 = hvx_vec_repl_f16(Q6_V_vror_VR(m_prev_v, r_vec_off * 2));
+            HVX_Vector v_m_prev1 = hvx_vec_repl_f16(Q6_V_vror_VR(m_prev_v, (r_vec_off + 1) * 2));
 
             // HVX max — both operands are splats, so result is splat of m_new.
             HVX_Vector v_dup_m0 = Q6_Vhf_vmax_VhfVhf(v_m_prev0, v_s_rowmax0);
@@ -1277,7 +1278,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
     struct hmx_fa_context factx;
     memset(&factx, 0, sizeof(factx));
     factx.octx           = octx;
-    factx.n_threads      = octx->ctx->n_threads;
+    factx.n_threads      = n_threads;
     factx.DK             = DK;
     factx.DV             = DV;
     factx.n_kv           = nek1;
@@ -1327,10 +1328,15 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
     factx.m1          = powf(2.0f, -(max_bias / 2.0f) / factx.n_head_log2);
 
     // ======== VTCM allocation (GQA-aware) ========
+    const size_t size_k_row        = DK * sizeof(__fp16);
+    const size_t size_v_row        = DV * sizeof(__fp16);
+    const size_t size_k_row_padded = hex_round_up(size_k_row, 128);
+    const size_t size_v_row_padded = hex_round_up(size_v_row, 128);
+
     const size_t q_tile_bytes  = hex_align_up(g_br * DK * sizeof(__fp16), 4096);
     const size_t o_tile_bytes  = hex_align_up(g_br * DV * sizeof(__fp16), 4096);
-    const size_t k_dma_bytes   = hex_align_up(Bc * DK * sizeof(__fp16), 4096);
-    const size_t v_dma_bytes   = hex_align_up(Bc * DV * sizeof(__fp16), 4096);
+    const size_t k_dma_bytes   = hex_align_up(Bc * size_k_row_padded, 4096);
+    const size_t v_dma_bytes   = hex_align_up(Bc * size_v_row_padded, 4096);
     const size_t k_tile_bytes  = hex_align_up(Bc * DK * sizeof(__fp16), 4096);
     const size_t v_tile_bytes  = hex_align_up(Bc * DV * sizeof(__fp16), 4096);
     const size_t s_tile_bytes  = hex_align_up(g_br * Bc * sizeof(__fp16), 4096);
@@ -1400,11 +1406,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
     // ======== DMA setup ========
     dma_queue * const dma = ctx->dma[0];
 
-    // Padded row sizes for DMA
-    const size_t size_k_row        = nek0 * sizeof(__fp16);
-    const size_t size_v_row        = nev0 * sizeof(__fp16);
-    const size_t size_k_row_padded = hex_round_up(nek0 * sizeof(__fp16), 128);
-    const size_t size_v_row_padded = hex_round_up(nev0 * sizeof(__fp16), 128);
+    // Padded row sizes for DMA (defined in outer scope)
 
     const size_t n_row_tiles_g_br = g_br / HMX_FP16_TILE_N_ROWS;
     const size_t n_tiles_per_bc   = Bc / HMX_FP16_TILE_N_COLS;
diff --git a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
index e05ccfd5fc7..ab5fd73380b 100644
--- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
@@ -34,6 +34,10 @@ static const __fp16 q4_0_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
     -8, 0, -7, 0, -6, 0, -5, 0, -4, 0, -3, 0, -2, 0, -1, 0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0,
 };
 
+static const __fp16 q4_1_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
+    0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0,
+};
+
 // MXFP4 dequantization LUT: maps 4-bit index to fp16 mantissa value
 // kvalues: 0, 0.5, 1, 1.5, 2, 3, 4, 6, 0, -0.5, -1, -1.5, -2, -3, -4, -6
 static const __fp16 mxfp4_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
@@ -62,6 +66,8 @@ static inline size_t get_x4x2_row_stride(int weight_type, int k) {
         case HTP_TYPE_Q4_0:
         case HTP_TYPE_IQ4_NL:
             return (size_t) nb * (QK_Q4_0x4x2 / 2 + HMX_X4X2_DBLK_SIZE);         // 144 * nb
+        case HTP_TYPE_Q4_1:
+            return (size_t) nb * (QK_Q4_0x4x2 / 2 + 32);                         // 160 * nb
         case HTP_TYPE_Q8_0:
             return (size_t) nb * (QK_Q8_0x4x2 + HMX_X4X2_DBLK_SIZE);             // 272 * nb
         case HTP_TYPE_MXFP4:
@@ -201,11 +207,10 @@ static inline HVX_Vector dequantize_x4x2_q4_0_group_hvx(const uint8_t *packed_32
 
 // Batch-dequantize 4 contiguous x4x2 Q4_0 groups (4x32 = 128 packed bytes) using
 // full HVX vector width.  One vmemu + one vlut16 replaces 4 separate calls.
-// Output: out[0..3] each hold 32 FP16 values in the first 64 bytes.
-static inline void dequantize_x4x2_q4_0_x4groups_hvx(
+// Output: vector_x2 each hold 32 FP16 values in the first 64 bytes.
+static inline HVX_Vector_x2 dequantize_x4x2_q4_0_x4groups_hvx(
             const uint8_t *packed_128, bool upper_nibbles,
-            const __fp16 *scales_4, const HVX_Vector vlut_cvt,
-            HVX_Vector out[4]) {
+            const __fp16 *scales_4, const HVX_Vector vlut_cvt) {
     // Load all 128 packed bytes (4 contiguous 32-byte groups)
     HVX_Vector vq = hvx_vmemu(packed_128);
     const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
@@ -221,8 +226,7 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx(
     HVX_Vector v_hi = Q6_V_hi_W(vp);  // [group2: 32 fp16 | group3: 32 fp16]
 
     // Build per-group scale vectors: first 64 bytes use scale_a, last 64 use scale_b
-    volatile HVX_Vector vscale = hvx_vmemu(scales_4);
-
+    HVX_Vector vscale = hvx_vmemu(scales_4);
     HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vscale);
     HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vscale, 4));
 
@@ -230,8 +234,57 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx(
     v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));
 
     // Extract individual groups: scatter uses q_mask64 so only first 64 bytes matter
-    out[0] = v_lo; // group0 already in [0:63]
-    out[1] = v_hi; // group2 already in [0:63]
+    HVX_Vector_x2 r = { v_lo,/* group1 already in [0:63] */
+                        v_hi /* group2 already in [0:63] */ };
+    return r;
+}
+
+static inline HVX_Vector dequantize_x4x2_q4_1_group_hvx(const uint8_t *packed_32, bool upper_nibbles, const __fp16 *scale_offset, const HVX_Vector vlut_cvt) {
+    HVX_Vector vq = hvx_vmemu(packed_32);
+    const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
+    HVX_Vector v_dm = hvx_vmemu(scale_offset);
+    HVX_Vector v_scales = hvx_vec_repl_f16(v_dm);
+    HVX_Vector v_offsets = hvx_vec_repl_f16(Q6_V_vror_VR(v_dm, 2));
+
+    HVX_Vector v_quants =  Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
+    v_quants = Q6_V_vand_VV(v_quants, mask_h4);
+    v_quants = Q6_Vb_vshuff_Vb(v_quants);
+    HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
+    HVX_Vector v_hf = Q6_V_lo_W(vp);
+
+    return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales), v_offsets));
+}
+
+static inline HVX_Vector_x2 dequantize_x4x2_q4_1_x4groups_hvx(
+            const uint8_t *packed_128, bool upper_nibbles,
+            const __fp16 *scales_offsets_4, const HVX_Vector vlut_cvt) {
+    HVX_Vector vq = hvx_vmemu(packed_128);
+    const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
+    HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
+    v_quants = Q6_V_vand_VV(v_quants, mask_h4);
+
+    v_quants = Q6_Vb_vshuff_Vb(v_quants);
+
+    HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
+    HVX_Vector v_lo = Q6_V_lo_W(vp);
+    HVX_Vector v_hi = Q6_V_hi_W(vp);
+
+    HVX_Vector vscale_offset = hvx_vmemu(scales_offsets_4);
+    HVX_VectorPair dm_deal = Q6_W_vdeal_VVR(vscale_offset, vscale_offset, -2);
+    HVX_Vector vd = Q6_V_lo_W(dm_deal);
+    HVX_Vector vm = Q6_V_hi_W(dm_deal);
+
+    HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vd);
+    HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vd, 4));
+
+    HVX_Vector v_os01 = hvx_vec_repl_2x_f16(vm);
+    HVX_Vector v_os23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vm, 4));
+
+    v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01), v_os01));
+    v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23), v_os23));
+
+    HVX_Vector_x2 r = { v_lo, v_hi };
+    return r;
 }
 
 // Dequantize one x4x2 Q8_0 group (32 int8 quants) -> 32 FP16 in first 64 bytes.
@@ -292,12 +345,11 @@ static inline HVX_Vector dequantize_x4x2_mxfp4_group_hvx(const uint8_t *  packed
 }
 
 // Batch-dequantize 4 contiguous x4x2 MXFP4 groups (4x32 = 128 packed bytes).
-static inline void dequantize_x4x2_mxfp4_x4groups_hvx(const uint8_t *  packed_128,
+static inline HVX_Vector_x4 dequantize_x4x2_mxfp4_x4groups_hvx(const uint8_t *  packed_128,
                                                       bool             upper_nibbles,
                                                       int              sub_blk_base,
                                                       const HVX_Vector vlut_cvt,
-                                                      mxfp4_scales_t   scales,
-                                                      HVX_Vector       out[4]) {
+                                                      mxfp4_scales_t   scales) {
     HVX_Vector       vq       = hvx_vmemu(packed_128);
     const HVX_Vector mask_h4  = Q6_Vb_vsplat_R(0x0F);
     HVX_Vector       v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq;
@@ -318,10 +370,8 @@ static inline void dequantize_x4x2_mxfp4_x4groups_hvx(const uint8_t *  packed_12
     v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01));
     v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));
 
-    out[0] = v_lo;
-    out[1] = Q6_V_vror_VR(v_lo, 64);
-    out[2] = v_hi;
-    out[3] = Q6_V_vror_VR(v_hi, 64);
+    HVX_Vector_x4 r = { v_lo, Q6_V_vror_VR(v_lo, 64), v_hi, Q6_V_vror_VR(v_hi, 64) };
+    return r;
 }
 
 // Dequantize a tile range from x4x2 weight data (already in VTCM) to tile-major FP16.
@@ -335,11 +385,13 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
         int start_tile, int end_tile) {
 
     const int n_k_tiles = (unsigned)k_block / HMX_FP16_TILE_N_COLS;
-    const bool is_q4 = (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL);
+    const bool is_q4 = (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_Q4_1 || weight_type == HTP_TYPE_IQ4_NL);
+    const bool is_q4_1 = (weight_type == HTP_TYPE_Q4_1);
     const int qrow_size = is_q4 ? ((unsigned)k_block / 2) : k_block;
 
     const HVX_Vector vlut_cvt = (weight_type == HTP_TYPE_IQ4_NL) ? hvx_vmem(iq4_nl_to_fp16_lut) :
                                 (weight_type == HTP_TYPE_MXFP4)  ? hvx_vmem(mxfp4_to_fp16_lut) :
+                                (weight_type == HTP_TYPE_Q4_1)   ? hvx_vmem(q4_1_to_fp16_lut) :
                                                                    hvx_vmem(q4_0_to_fp16_lut);
 
     // vscatter setup: write dequantized K-values directly to transposed [K][N] tile positions.
@@ -360,8 +412,10 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
             unsigned sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32;  // 0 or 4
             bool upper            = (sub_blk_base >= 4);
             unsigned packed_off   = blk_idx * (QK_Q4_0x4x2 / 2);     // 128 contiguous packed bytes
-            unsigned scale_off    = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE
-                                  + sub_blk_base * (int)sizeof(__fp16);   // 4 consecutive scales
+            unsigned dblk_size    = is_q4_1 ? 32 : HMX_X4X2_DBLK_SIZE;
+            unsigned scale_step   = is_q4_1 ? 4 : (int)sizeof(__fp16);
+            unsigned scale_off    = qrow_size + blk_idx * dblk_size
+                                  + sub_blk_base * scale_step;
 
             __fp16 *tile_bases[4];
             for (unsigned g = 0; g < 4; g++) { tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS; }
@@ -371,20 +425,38 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
             unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride;
             unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;
 
-            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
-                HVX_Vector v0[2];
-                const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
-                dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0);
-                Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[0]);
-                Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[1]);
-                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
+            if (is_q4_1) {
+                for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
+                    const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
+                    const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
 
+                    HVX_Vector_x2 dv0 = dequantize_x4x2_q4_1_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
+                    HVX_Vector_x2 dv1 = dequantize_x4x2_q4_1_x4groups_hvx(r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt);
 
-                r0 = vtcm_src + row_offset; row_offset += row_stride;
-                dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0);
-                Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[0]);
-                Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[1]);
-                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
+                    Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[0]);
+                    Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[1]);
+                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
+
+                    Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[0]);
+                    Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[1]);
+                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
+                }
+            } else {
+                for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
+                    const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
+                    const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
+
+                    HVX_Vector_x2 dv0 = dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
+                    HVX_Vector_x2 dv1 = dequantize_x4x2_q4_0_x4groups_hvx(r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt);
+
+                    Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[0]);
+                    Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[1]);
+                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
+
+                    Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[0]);
+                    Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[1]);
+                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
+                }
             }
 
             for (int g = 0; g < 4; g++) { (void) *(volatile HVX_Vector *)(tile_bases[g]); }
@@ -415,21 +487,21 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
                 // Batch-convert all 8 E8M0 scales once per row (stays in HVX register)
                 mxfp4_scales_t r0_e8 = mxfp4_convert_scales(r0 + e8m0_blk_off);
 
-                HVX_Vector v0[4], v1[4];
-                dequantize_x4x2_mxfp4_x4groups_hvx(r0 + packed_off, upper, sub_blk_base, vlut_cvt, r0_e8, v0);
+                HVX_Vector_x4 dv0, dv1;
+                dv0 = dequantize_x4x2_mxfp4_x4groups_hvx(r0 + packed_off, upper, sub_blk_base, vlut_cvt, r0_e8);
                 if (row1 < n_cols) {
                     mxfp4_scales_t r1_e8 = mxfp4_convert_scales(r1 + e8m0_blk_off);
-                    dequantize_x4x2_mxfp4_x4groups_hvx(r1 + packed_off, upper, sub_blk_base, vlut_cvt, r1_e8, v1);
+                    dv1 = dequantize_x4x2_mxfp4_x4groups_hvx(r1 + packed_off, upper, sub_blk_base, vlut_cvt, r1_e8);
                 } else {
-                    v1[0] = v1[1] = v1[2] = v1[3] = Q6_V_vzero();
+                    dv1.v[0] = dv1.v[1] = dv1.v[2] = dv1.v[3] = Q6_V_vzero();
                 }
 
                 for (int g = 0; g < 4; g++) {
-                    Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v0[g]);
+                    Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[g]);
                 }
                 v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
                 for (int g = 0; g < 4; g++) {
-                    Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v1[g]);
+                    Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[g]);
                 }
                 v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
             }
@@ -450,26 +522,43 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
             unsigned sub_blk   = ((kt * 32) % QK_Q4_0x4x2) / 32;
             bool upper         = (sub_blk >= 4);
             unsigned byte_off  = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32;
-            unsigned scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16);
+            unsigned dblk_size = is_q4_1 ? 32 : HMX_X4X2_DBLK_SIZE;
+            unsigned scale_step = is_q4_1 ? 4 : (int)sizeof(__fp16);
+            unsigned scale_off = qrow_size + blk_idx * dblk_size + sub_blk * scale_step;
 
             HVX_Vector v_off = v_scat_base;  // reset to column 0
             unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride;
             unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;
-            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
-                const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
-                const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
-
-                HVX_Vector v0 = dequantize_x4x2_q4_0_group_hvx(
-                    r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
-                HVX_Vector v1 = (row1 < n_cols)
-                    ? dequantize_x4x2_q4_0_group_hvx(
-                        r1 + byte_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt)
-                    : Q6_V_vzero();
-
-                Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0);
-                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
-                Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1);
-                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
+            if (is_q4_1) {
+                for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
+                    const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
+                    const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
+
+                    HVX_Vector v0 = dequantize_x4x2_q4_1_group_hvx(r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
+                    HVX_Vector v1 = (row1 < n_cols)
+                        ? dequantize_x4x2_q4_1_group_hvx(r1 + byte_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt)
+                        : Q6_V_vzero();
+
+                    Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0);
+                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
+                    Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1);
+                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
+                }
+            } else {
+                for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
+                    const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
+                    const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
+
+                    HVX_Vector v0 = dequantize_x4x2_q4_0_group_hvx(r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
+                    HVX_Vector v1 = (row1 < n_cols)
+                        ? dequantize_x4x2_q4_0_group_hvx(r1 + byte_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt)
+                        : Q6_V_vzero();
+
+                    Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0);
+                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
+                    Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1);
+                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
+                }
             }
             (void) *(volatile HVX_Vector *)(tile_base);
         } else if (weight_type == HTP_TYPE_MXFP4) {
@@ -597,6 +686,8 @@ static void dequantize_x4x2_weight_chunk_to_fp16_tiles(
 
 // --- End x4x2 dequantizers ---
 
+#pragma clang diagnostic ignored "-Wbackend-plugin" // spurios warning for hmx intrinsics
+
 // requires external HMX lock
 static void core_dot_chunk_fp16(__fp16 *restrict output, const __fp16 *restrict activation, const __fp16 *restrict weight, const __fp16 *restrict scales,
                                 int n_row_tiles, int n_col_tiles, int n_dot_tiles) {
@@ -612,11 +703,13 @@ static void core_dot_chunk_fp16(__fp16 *restrict output, const __fp16 *restrict
             const __fp16 *row_tiles = activation + r * n_dot_tiles * HMX_FP16_TILE_N_ELMS;
             const __fp16 *col_tiles = weight + c * n_dot_tiles * HMX_FP16_TILE_N_ELMS;
 
-            for (int k = 0; k < n_dot_tiles; ++k) {
-                Q6_activation_hf_mxmem_RR((unsigned int)row_tiles, 2047);
-                Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, 2047);
-                row_tiles += HMX_FP16_TILE_N_ELMS;
-                col_tiles += HMX_FP16_TILE_N_ELMS;
+            for (int k = 0, k_block; k < n_dot_tiles; k += k_block) {
+                k_block = hex_smin(n_dot_tiles - k, 32);
+                const uint32_t range = 2048u * (uint32_t)k_block - 1;
+                Q6_activation_hf_mxmem_RR_deep((unsigned int)row_tiles, range);
+                Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, range);
+                row_tiles += k_block * HMX_FP16_TILE_N_ELMS;
+                col_tiles += k_block * HMX_FP16_TILE_N_ELMS;
             }
 
             __fp16 *out_tile = output + (r * n_col_tiles + c) * HMX_FP16_TILE_N_ELMS;
@@ -832,10 +925,6 @@ static void transfer_activation_chunk_threaded(struct htp_context *ctx, __fp16 *
     worker_pool_run_func(ctx->worker_pool, transfer_activation_chunk_worker_fn, &state, ctx->n_threads);
 }
 
-//
-
-#define FALLBACK_TO_STANDARD 1
-
 // C += AB
 static void core_mma_chunk_fp16(__fp16 *restrict c, const __fp16 *restrict a, const __fp16 *restrict b,
                                 const __fp16 *restrict col_scales, const __fp16 *restrict eye_tile,
@@ -861,314 +950,80 @@ static void core_mma_chunk_fp16(__fp16 *restrict c, const __fp16 *restrict a, co
                 Q6_weight_hf_mxmem_RR((unsigned int)eye_tile, 2047);
             }
 
-            for (int k = 0; k < n_dot_tiles; ++k) {
-                Q6_activation_hf_mxmem_RR((unsigned int)row_tiles, 2047);
-                Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, 2047);
-                row_tiles += HMX_FP16_TILE_N_ELMS;
-                col_tiles += HMX_FP16_TILE_N_ELMS;
+            for (int k = 0, k_block; k < n_dot_tiles; k += k_block) {
+                k_block = hex_smin(n_dot_tiles - k, 32);
+                const uint32_t range = 2048u * (uint32_t)k_block - 1;
+                Q6_activation_hf_mxmem_RR_deep((unsigned int)row_tiles, range);
+                Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, range);
+                row_tiles += k_block * HMX_FP16_TILE_N_ELMS;
+                col_tiles += k_block * HMX_FP16_TILE_N_ELMS;
             }
-            Q6_mxmem_AR_after_hf(accum_tile, 0);
-        }
-    }
-}
-
-static __attribute__((noinline)) int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx,
-                                       float *restrict out, const float *restrict x, const uint8_t *restrict w,
-                                       int m, int k, int n, int weight_type) {
-    // assume k % 32 == 0 && n % 32 == 0
-    const size_t row_stride = get_x4x2_row_stride(weight_type, k);
-    if (row_stride == 0) {
-        return -1;
-    }
-
-    const size_t vtcm_budget = ctx->vtcm_size;
-
-    const size_t K_BLOCK_SIZE = 1024;
-
-    // Fallback: if k doesn't need K-blocking, out-stationary has no advantage
-    const size_t k_iters_check = (k + K_BLOCK_SIZE - 1) / K_BLOCK_SIZE;
-    if (k_iters_check <= 1) {
-        FARF(HIGH, "%s: K_BLK=%zu >= k=%d, fallback to standard path", __func__, K_BLOCK_SIZE, k);
-        return FALLBACK_TO_STANDARD;
-    }
-
-    // Dynamic M,N search via hmx_compute_chunks
-    const size_t sub_row_stride_alloc = get_x4x2_row_stride(weight_type, K_BLOCK_SIZE);
-    const size_t per_m  = K_BLOCK_SIZE * sizeof(float)   // scratch1: M×K×4 (act DMA staging F32)
-                        + K_BLOCK_SIZE * sizeof(__fp16); // activation: M×K×2 (F16 tiles)
-    const size_t per_n  = sub_row_stride_alloc           // scratch0: N×sub_row(K) (packed quant)
-                        + K_BLOCK_SIZE * sizeof(__fp16); // weight: N×K×2 (F16 tiles)
-    const size_t per_mn = sizeof(__fp16);                // output: M×N×2 (out-stationary)
-
-    // Alignment margin: hex_align_up can add up to 2047 bytes per buffer;
-    // scratch1 (mc×6144) is naturally 2048-aligned, remaining 4 buffers need margin
-    const size_t align_margin = 4 * HMX_FP16_TILE_SIZE;
-    const size_t overhead     = HMX_FP16_TILE_SIZE + 256 + align_margin;  // eye_tile + scales + alignment
-
-    size_t       M_BLOCK_SIZE, N_BLOCK_SIZE, vtcm_used;
-    // Cost-based search: minimize ceil(m/mc)*m_block_cost + ceil(n/nc)*n_block_cost.
-    // From profiling: wt_dequant per element ≈ 1.5× activation load per element.
-    // m_block_cost = n*3: each extra M-block re-dequants all N×K weight (expensive).
-    // n_block_cost = m*2: each extra N-block re-loads all M×K activation (cheaper).
-    const size_t m_block_cost = (size_t) n * 3;
-    const size_t n_block_cost = (size_t) m * 2;
-    if (hmx_compute_chunks(vtcm_budget, overhead, per_n, per_m, per_mn,
-                           hex_align_up(m, HMX_FP16_TILE_N_ROWS), n,
-                           m_block_cost, n_block_cost, &M_BLOCK_SIZE,
-                           &N_BLOCK_SIZE, &vtcm_used) != 0) {
-        FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget);
-        return -1;
-    }
 
-    // Compute precise buffer sizes from searched M,N and fixed K
-    const size_t weight_size  = hex_align_up(N_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
-    const size_t act_size     = hex_align_up(M_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
-    const size_t out_size     = hex_align_up(M_BLOCK_SIZE * N_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
-    const size_t scratch0_sz  = hex_align_up(N_BLOCK_SIZE * sub_row_stride_alloc, HMX_FP16_TILE_SIZE);
-    const size_t scratch1_sz  = hex_align_up(M_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(float), HMX_FP16_TILE_SIZE);
-
-    const size_t total_vtcm = weight_size + act_size + out_size + scratch0_sz + scratch1_sz + HMX_FP16_TILE_SIZE + 256;
-    if (total_vtcm > vtcm_budget) {
-        FARF(HIGH, "%s: VTCM overflow after search: need %zu have %zu (M=%zu N=%zu K=%zu)", __func__, total_vtcm,
-                    vtcm_budget, M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE);
-        return -1;
-    }
-
-    uint8_t *vtcm_ptr        = (uint8_t *) ctx->vtcm_base;
-    __fp16  *vtcm_weight     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, weight_size);
-    __fp16  *vtcm_activation = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, act_size);
-    __fp16  *vtcm_output     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, out_size);
-    uint8_t *vtcm_scratch0   = vtcm_seq_alloc(&vtcm_ptr, scratch0_sz);
-    uint8_t *vtcm_scratch1   = vtcm_seq_alloc(&vtcm_ptr, scratch1_sz);
-    __fp16  *vtcm_eye_tile   = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, HMX_FP16_TILE_SIZE);
-    __fp16  *vtcm_scales     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256);
-    assert((size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base) <= vtcm_budget);
-
-    FARF(HIGH, "hmx-mm: m=%d k=%d n=%d wtype=%d block M=%zu N=%zu K=%zu vtcm=%zu/%zu", m, k, n, weight_type,
-         M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE, (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget);
-
-    // initialize eye tile (32x32 identity matrix)
-    {
-        HVX_Vector v;
-        v = Q6_V_vzero();
-        v = Q6_Vw_vinsert_VwR(v, 0x3c000000);
-        v = Q6_V_vror_VR(v, VLEN - 4);
-        v = Q6_Vw_vinsert_VwR(v, 0x00003c00);
-        for (int i = 0; i < 16; ++i) {
-            ((HVX_Vector *) vtcm_eye_tile)[i] = v;
-            v = Q6_V_vror_VR(v, VLEN - 8);
-        }
-    }
-    hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00));  // scale: 1.0, bias: 0.0 in FP16
-
-    TIMER_DEFINE(fetch);
-    TIMER_DEFINE(act_load);
-    TIMER_DEFINE(wt_dequant);
-    TIMER_DEFINE(core);
-
-    HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
-
-    for (size_t mr = 0; mr < m; mr += M_BLOCK_SIZE) {
-        size_t m_blk_sz = hex_smin(m - mr, M_BLOCK_SIZE);
-        for (size_t nc = 0; nc < n; nc += N_BLOCK_SIZE) {
-            size_t n_blk_sz = hex_smin(n - nc, N_BLOCK_SIZE);
-
-            const int n_row_tiles = hmx_ceil_div(m_blk_sz, HMX_FP16_TILE_N_ROWS);
-            const int n_col_tiles = hmx_ceil_div(n_blk_sz, HMX_FP16_TILE_N_COLS);
-
-            for (size_t kk = 0; kk < k; kk += K_BLOCK_SIZE) {
-                const size_t k_blk_sz = hex_smin(k - kk, K_BLOCK_SIZE);
-
-                TIMER_START(fetch);
-                // fetch activation block into VTCM
-                {
-                    const float *activation_block = x + mr * k + kk;
-
-                    dma_queue_push(ctx->dma[0],
-                                     dma_make_ptr(vtcm_scratch1, activation_block),
-                                     k_blk_sz * sizeof(float),
-                                     k * sizeof(float),
-                                     k_blk_sz * sizeof(float),
-                                     m_blk_sz);
-                }
-
-                // fetch weight block into VTCM (x4x2 sub-block: quants + scales)
-                const size_t sub_row_stride = get_x4x2_row_stride(weight_type, k_blk_sz);
-                {
-                    const int blk_start       = kk / QK_Q4_0x4x2;
-                    const int nb_sub          = (k_blk_sz + QK_Q4_0x4x2 - 1) / QK_Q4_0x4x2;
-                    const int  full_qrow      = (weight_type == HTP_TYPE_Q8_0) ? k : (k / 2);
-                    const int  scale_blk_size = (weight_type == HTP_TYPE_MXFP4) ? HMX_X4X2_MXFP4_EBLK_SIZE : HMX_X4X2_DBLK_SIZE;
-                    uint8_t       *dst        = vtcm_scratch0;
-                    const uint8_t *src        = w + nc * row_stride;
-                    const size_t  n_rows      = n_blk_sz;
-                    const size_t  src_stride  = row_stride;
-                    const size_t  dst_stride  = sub_row_stride;
-                    const size_t  quant_off   = (weight_type == HTP_TYPE_Q8_0) ? (blk_start * QK_Q8_0x4x2) : (blk_start * (QK_Q4_0x4x2 / 2));
-                    const size_t  quant_width = (weight_type == HTP_TYPE_Q8_0) ? (nb_sub    * QK_Q8_0x4x2) : (nb_sub    * (QK_Q4_0x4x2 / 2));
-                    const size_t  scale_off   = full_qrow + blk_start * scale_blk_size;
-                    const size_t  scale_width = nb_sub * scale_blk_size;
-
-                    // 2D DMA: quants sub-range
-                    dma_queue_push(ctx->dma[0], dma_make_ptr(dst, src + quant_off), dst_stride, src_stride, quant_width, n_rows);
-                    // 2D DMA: scales sub-range
-                    dma_queue_push(ctx->dma[0], dma_make_ptr(dst + quant_width, src + scale_off), dst_stride, src_stride, scale_width, n_rows);
-                }
-                TIMER_STOP(fetch);
-
-                TIMER_START(act_load);
-                // load activation block
-                {
-                    dma_queue_pop(ctx->dma[0]); // wait for act DNA
-                    transfer_activation_chunk_threaded(ctx, vtcm_activation, (float *) vtcm_scratch1, m_blk_sz, k_blk_sz, k_blk_sz);
-                }
-                TIMER_STOP(act_load);
-
-                TIMER_START(wt_dequant);
-                // dequantize weight block
-                {
-                    dma_queue_pop(ctx->dma[0]);
-                    dma_queue_pop(ctx->dma[0]);
-                    // vtcm_scratch0 is used to store the qweight chunk
-                    // worker_pool_run_func already returned, so fetch is done
-                    dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight, vtcm_scratch0,
-                                                                n_blk_sz, k_blk_sz, sub_row_stride, weight_type);
-                }
-                TIMER_STOP(wt_dequant);
-
-                // core mma
-                TIMER_START(core);
-                {
-                    core_mma_chunk_fp16(vtcm_output, vtcm_activation, vtcm_weight, vtcm_scales, vtcm_eye_tile, n_row_tiles,
-                                        n_col_tiles, k_blk_sz / HMX_FP16_TILE_N_COLS, kk == 0);
-                }
-                TIMER_STOP(core);
-            }
-
-            // store output block
-            {
-                float *output_block = out + (mr * n + nc);
-                transfer_output_chunk_threaded(ctx, output_block, vtcm_output, m_blk_sz, n_blk_sz, n);
-            }
+            Q6_mxmem_AR_after_hf(accum_tile, 0);
         }
     }
-
-    HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
-
-#if defined(ENABLE_PROFILE_TIMERS)
-    FARF(HIGH, "fetch: %lld us, act_load: %lld us, wt_dequant: %lld us, core: %lld us",
-         TIMER_US(fetch), TIMER_US(act_load), TIMER_US(wt_dequant), TIMER_US(core));
-#endif
-    return 0;
 }
 
-int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict dst, const float *restrict activation,
+int hmx_matmul_q_f32(struct htp_context *ctx, float *restrict dst, const float *restrict activation,
                                      const uint8_t *restrict permuted_weight, int m, int k, int n,
                                      int weight_type) {
-    if (!dst || !activation || !permuted_weight || !m || !n || !k) { return -1; }
     if (k % 32 != 0 || n % 32 != 0) { return -1; }
 
     if (!hex_is_aligned(dst, VLEN) || !hex_is_aligned(activation, VLEN) || !hex_is_aligned(permuted_weight, VLEN)) {
         return -1;
     }
 
-    // for large m, k (e.g. prefill FFN Down), use out-stationary version
-    if (m >= 128 && k > n && n > 1024) {
-        int rc = mat_mul_qk_0_d16a32_out_stationary(ctx, dst, activation, permuted_weight, m, k, n, weight_type);
-        if (rc != FALLBACK_TO_STANDARD) {
-            return rc;  // 0 success, -1 error
-        }
-        FARF(HIGH, "hmx_matmul_qk: out-stationary fallback to standard m=%d k=%d n=%d", m, k, n);
-        // fall through to standard path
-    }
-
     size_t row_stride = get_x4x2_row_stride(weight_type, k);
     if (row_stride == 0) {
         return -1;
     }
 
-    FARF(HIGH, "hmx_matmul_qk: STANDARD path m=%d k=%d n=%d type=%d", m, k, n, weight_type);
-
     // --- Dynamic VTCM layout ---
-    const size_t vtcm_budget   = ctx->vtcm_size;
-    const size_t vec_dot_size  = k * sizeof(__fp16);
+    const size_t vec_dot_size = k * sizeof(__fp16);
+    const size_t vtcm_budget  = ctx->vtcm_size;
+    size_t vtcm_used = 0;
 
     // Pipeline = 4-stage DMA→dequant→HMX→store with HMX worker overlap.
-    // Only pays off when the chunker yields >=2 n-chunks, so the main loop can
-    // overlap HMX (C) with HVX (B/D); with a single n-chunk the extra VTCM for
-    // double-buffered output and the worker-dispatch overhead are pure loss.
-    // Try pipeline costs first; fall back to sequential if the layout collapses
-    // to one n-chunk. m >= 128 floor keeps HMX utilization reasonable.
-    const size_t pipe_per_n  = row_stride + 2 * vec_dot_size;  // Q + S0 + S1 (dequant bufs)
-    const size_t pipe_per_mn = 2 * sizeof(__fp16);             // O x 2 (output double buffer)
-    const size_t seq_per_n   = vec_dot_size + 2 * row_stride;  // W + S0 + S1 (x4x2 DMA bufs)
-    const size_t seq_per_mn  = sizeof(__fp16);                 // O x 1
+    const size_t size_per_n  = row_stride + 2 * vec_dot_size;  // Q + S0 + S1 (dequant bufs)
+    const size_t size_per_mn = 2 * sizeof(__fp16);             // O x 2 (output double buffer)
 
-    size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0;
-    bool   use_pipeline = false;
-
-    if (m >= 128) {
-        size_t mc = 0, nc = 0, used = 0;
-        if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, pipe_per_n, /*per_m=*/vec_dot_size, pipe_per_mn,
-                               hex_align_up(m, HMX_FP16_TILE_N_ROWS), n,
-                               /*m_block_cost=*/(size_t) n * 3,
-                               /*n_block_cost=*/(size_t) m * 2, &mc, &nc, &used) == 0 &&
-            hmx_ceil_div((size_t) n, nc) >= 2) {
-            m_chunk_n_rows = mc;
-            n_chunk_n_cols = nc;
-            vtcm_used      = used;
-            use_pipeline   = true;
-        }
+    size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0;
+    if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, size_per_n, /*per_m=*/vec_dot_size, size_per_mn,
+                           hex_align_up(m, HMX_FP16_TILE_N_ROWS), n,
+                           /*m_block_cost=*/(size_t) n * 3,
+                           /*n_block_cost=*/(size_t) m * 2, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used)) {
+        FARF(HIGH, "hmx-mm-q: VTCM too small : m %d k %d n %d budget %zu", m, k, n, vtcm_budget);
+        return -1;
     }
 
-    if (!use_pipeline) {
-        if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, seq_per_n, /*per_m=*/vec_dot_size, seq_per_mn,
-                               hex_align_up(m, HMX_FP16_TILE_N_ROWS), n,
-                               /*m_block_cost=*/(size_t) n * 3,
-                               /*n_block_cost=*/(size_t) m * 2, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
-            FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget);
-            return -1;
-        }
-    }
-
-    // Compute precise buffer sizes per execution path
-    const size_t weight_area_size = hex_align_up(
-        n_chunk_n_cols * (use_pipeline ? row_stride : vec_dot_size), HMX_FP16_TILE_SIZE);
-    const size_t activation_area_size = hex_align_up(m_chunk_n_rows * vec_dot_size, HMX_FP16_TILE_SIZE);
-    const size_t output_area_size = hex_align_up(
-        m_chunk_n_rows * n_chunk_n_cols * sizeof(__fp16), HMX_FP16_TILE_SIZE);
+    const size_t weight_area_size = hex_align_up(n_chunk_n_cols * row_stride,   HMX_FP16_TILE_SIZE);
+    const size_t act_area_size    = hex_align_up(m_chunk_n_rows * vec_dot_size, HMX_FP16_TILE_SIZE);
+    const size_t output_area_size = hex_align_up(m_chunk_n_rows * n_chunk_n_cols * sizeof(__fp16), HMX_FP16_TILE_SIZE);
 
     size_t scratch0_size, scratch1_size, scratch2_size;
-    if (use_pipeline) {
-        scratch0_size = hex_align_up(n_chunk_n_cols * vec_dot_size, HMX_FP16_TILE_SIZE);  // dequant buf 0
-        scratch1_size = scratch0_size;                                                    // dequant buf 1
-        scratch2_size = output_area_size;                                                 // output buf 1
-    } else {
-        scratch0_size = hex_align_up(n_chunk_n_cols * row_stride, HMX_FP16_TILE_SIZE);    // x4x2 DMA buf 0
-        scratch1_size = scratch0_size;                                                    // x4x2 DMA buf 1
-        scratch2_size = 0;                                                                // unused
-    }
+    scratch0_size = hex_align_up(n_chunk_n_cols * vec_dot_size, HMX_FP16_TILE_SIZE);  // dequant buf 0
+    scratch1_size = scratch0_size;                                                    // dequant buf 1
+    scratch2_size = output_area_size;                                                 // output  buf 1
 
     uint8_t *vtcm_ptr        = (uint8_t *) ctx->vtcm_base;
     __fp16  *vtcm_weight     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, weight_area_size);
-    __fp16  *vtcm_activation = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, activation_area_size);
+    __fp16  *vtcm_activation = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, act_area_size);
     __fp16  *vtcm_output     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, output_area_size);
     void    *vtcm_scratch0   = vtcm_seq_alloc(&vtcm_ptr, scratch0_size);
     void    *vtcm_scratch1   = vtcm_seq_alloc(&vtcm_ptr, scratch1_size);
     void    *vtcm_scratch2   = scratch2_size ? vtcm_seq_alloc(&vtcm_ptr, scratch2_size) : NULL;
     __fp16  *vtcm_scales     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256);
-    if ((size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base) > vtcm_budget) {
-        FARF(ERROR, "%s: vtcm overflow: used=%zu limit=%zu", __func__,
-             (size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget);
+
+    vtcm_used = vtcm_ptr - (uint8_t *) ctx->vtcm_base;
+    if (vtcm_used > vtcm_budget) {
+        FARF(ERROR, "hmx-mm-q: VTCM overflow: used %zu budget %zu", vtcm_used, vtcm_budget);
         return -1;
     }
 
     hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00));  // scale: 1.0, bias: 0.0 in FP16
 
-    FARF(HIGH, "%s: m=%d k=%d n=%d wtype=%d pipe=%d mc=%zu nc=%zu vtcm=%zu/%zu",
-         __func__, m, k, n, weight_type, use_pipeline,
-         m_chunk_n_rows, n_chunk_n_cols,
-         (size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget);
+    FARF(HIGH, "hmx-mm-q: standard : m %d k %d n %d wtype %d mc %zu nc %zu vtcm %zu/%zu",
+         m, k, n, weight_type, m_chunk_n_rows, n_chunk_n_cols, vtcm_used, vtcm_budget);
 
     TIMER_DEFINE(activation_load);
     TIMER_DEFINE(weight_load);
@@ -1178,184 +1033,115 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
     TIMER_DEFINE(total);
     TIMER_START(total);
 
-    FARF(HIGH, "hmx_matmul_qk: %s mc=%zu nc=%zu vtcm=%zu/%zu",
-         use_pipeline ? "PIPELINE" : "SEQUENTIAL", m_chunk_n_rows, n_chunk_n_cols,
-         (size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget);
+    // 4-stage pipeline: DMA load (A), dequantize (B), HMX matmul (C), store (D)
+    // HMX compute (C) runs on dedicated worker thread, overlapping with HVX stages (B, D).
 
-    if (!use_pipeline) {
-        HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
-        for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
-            // transfer activation matrix chunk into VTCM
-            const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
-            const size_t n_row_tiles = hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS);
+    // A --> B: vtcm_qweight, 1 buffer
+    // B --> C: vtcm_weight0/vtcm_weight1, 2 buffers
+    // C --> D: vtcm_output0/vtcm_output1, 2 buffers
 
-            TIMER_START(activation_load);
-            {
-                const float *activation_chunk = activation + mr * k;
-                transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, k);
-            }
-            TIMER_STOP(activation_load);
-
-            void *buf_curr = vtcm_scratch0;
-            void *buf_next = vtcm_scratch1;
-
-            {
-                const size_t n_cols_first = hex_smin(n, n_chunk_n_cols);
-                dma_queue_push(ctx->dma[0], dma_make_ptr(buf_curr, permuted_weight), row_stride, row_stride, row_stride, n_cols_first);
-            }
-
-            for (size_t nc = 0; nc < n; nc += n_chunk_n_cols) {
-                const size_t n_cols = hex_smin(n - nc, n_chunk_n_cols);
-                const size_t n_col_tiles = hmx_ceil_div(n_cols, HMX_FP16_TILE_N_COLS);
-
-                TIMER_START(weight_load);
-                {
-                    dma_queue_pop(ctx->dma[0]);  // wait until current weight chunk become ready
-
-                    const size_t nc_next = nc + n_chunk_n_cols;
-                    if (nc_next < n) {
-                        const size_t n_cols_next = hex_smin(n - nc_next, n_chunk_n_cols);
-
-                        const uint8_t *next_weight_chunk = permuted_weight + nc_next * row_stride;
-
-                        dma_queue_push(ctx->dma[0], dma_make_ptr(buf_next, next_weight_chunk), row_stride, row_stride, row_stride, n_cols_next);
-                    }
+    // Async timeline (C overlaps B+D):
+    //   main+HVX:   [A0][Act][B0][A1][sub C0][B1‖C0][A2][wait,sub C1][D0+B2‖C1][wait,sub C2][D1‖C2][wait][D2]
+    //   HMX queue:                   [████ C0 ████████][████ C1 ████████████][████ C2 ████████]
 
-                    // Dequant + vscatter writes directly to [K, N] transposed tiles.
-                    // HMX computes C = A x B, where A=[M,K] activation, B=[K,N] weight.
-                    dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight, buf_curr, n_cols, k, row_stride, weight_type);
+    int n_chunk_cnt = hmx_ceil_div(n, n_chunk_n_cols);
+    hmx_matmul_job_t job_slots[2];  // persistent double-buffered job descriptors
 
-                    hex_swap_ptr(&buf_curr, &buf_next);
-                }
-                TIMER_STOP(weight_load);
+    for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
+        const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
 
-                TIMER_START(hmx_core);
-                {
-                    core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_weight, vtcm_scales, n_row_tiles, n_col_tiles, k / 32);
-                }
-                TIMER_STOP(hmx_core);
+        void *vtcm_qweight        = vtcm_weight;
+        void *vtcm_weight_bufs[2] = { vtcm_scratch0, vtcm_scratch1 };
+        void *vtcm_output_bufs[2] = { vtcm_output,   vtcm_scratch2 };
 
-                TIMER_START(output_store);
-                {
-                    float *output = dst + (mr * n + nc);
-                    transfer_output_chunk_threaded(ctx, output, vtcm_output, n_rows, n_cols, n);
-                }
-                TIMER_STOP(output_store);
-            }
+        // prologue: A0
+        const size_t n_cols_A0 = hex_smin(n - 0 * n_chunk_n_cols, n_chunk_n_cols);
+        {
+            const uint8_t *qweight_chunk_A0 = permuted_weight;
+            dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A0), row_stride, row_stride, row_stride, n_cols_A0);
         }
-        HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
-    } else {
-        // 4-stage pipeline: DMA load (A), dequantize (B), HMX matmul (C), store (D)
-        // HMX compute (C) runs on dedicated worker thread, overlapping with HVX stages (B, D).
-
-        // A --> B: vtcm_qweight, 1 buffer
-        // B --> C: vtcm_weight0/vtcm_weight1, 2 buffers
-        // C --> D: vtcm_output0/vtcm_output1, 2 buffers
-
-        // Async timeline (C overlaps B+D):
-        //   main+HVX:   [A0][Act][B0][A1][sub C0][B1‖C0][A2][wait,sub C1][D0+B2‖C1][wait,sub C2][D1‖C2][wait][D2]
-        //   HMX queue:                   [████ C0 ████████][████ C1 ████████████][████ C2 ████████]
 
-        int n_chunk_cnt = hmx_ceil_div(n, n_chunk_n_cols);
-        hmx_matmul_job_t job_slots[2];  // persistent double-buffered job descriptors
-
-        for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
-            const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
-
-            void *vtcm_qweight        = vtcm_weight;
-            void *vtcm_weight_bufs[2] = { vtcm_scratch0, vtcm_scratch1 };
-            void *vtcm_output_bufs[2] = { vtcm_output, vtcm_scratch2 };
+        {
+            const float *activation_chunk = activation + mr * k;
+            transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, k);
+        }
 
-            // prologue: A0
-            const size_t n_cols_A0 = hex_smin(n - 0 * n_chunk_n_cols, n_chunk_n_cols);
-            {
-                // Use 2D DMA (n_cols rows x row_stride) to avoid 16-bit roiwidth overflow.
-                const uint8_t *qweight_chunk_A0 = permuted_weight;
-                dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A0), row_stride, row_stride, row_stride, n_cols_A0);
+        // prologue: B0, A1, submit C0 (async), B1 (overlaps C0)
+        {
+            // B0: wait for DMA, dequant weight chunk 0
+            dma_queue_pop(ctx->dma[0]);
+            dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type);
+
+            // A1: issue DMA for weight chunk 1
+            const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols);
+            if (1 < n_chunk_cnt) {
+                const uint8_t *qweight_chunk_A1 = permuted_weight + n_chunk_n_cols * row_stride;
+                dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A1), row_stride, row_stride, row_stride, n_cols_A1);
             }
 
-            {
-                const float *activation_chunk = activation + mr * k;
-                transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, k);
-            }
+            // submit C0 (non-blocking — HMX worker executes in parallel)
+            hmx_matmul_job_init(&job_slots[0], (__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation,
+                                (__fp16 *) vtcm_weight_bufs[0], vtcm_scales,
+                                hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
+                                hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
+            hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[0]));
 
-            // prologue: B0, A1, submit C0 (async), B1 (overlaps C0)
-            {
-                // B0: wait for DMA, dequant weight chunk 0
+            // B1: DMA pop + dequant (runs in parallel with C0 on HMX worker)
+            if (1 < n_chunk_cnt) {
                 dma_queue_pop(ctx->dma[0]);
-                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type);
-
-                // A1: issue DMA for weight chunk 1
-                const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols);
-                if (1 < n_chunk_cnt) {
-                    const uint8_t *qweight_chunk_A1 = permuted_weight + n_chunk_n_cols * row_stride;
-                    dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A1), row_stride, row_stride, row_stride, n_cols_A1);
-                }
-
-                // submit C0 (non-blocking — HMX worker executes in parallel)
-                hmx_matmul_job_init(&job_slots[0], (__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation,
-                                    (__fp16 *) vtcm_weight_bufs[0], vtcm_scales,
-                                    hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
-                                    hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
-                hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[0]));
-
-                // B1: DMA pop + dequant (runs in parallel with C0 on HMX worker)
-                if (1 < n_chunk_cnt) {
-                    dma_queue_pop(ctx->dma[0]);
-                    dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type);
-                }
+                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type);
             }
+        }
 
-            // main loop: wait C_i → submit C_{i+1} → D_i + B_{i+2} (parallel with C_{i+1})
-            for (int i = 0; i < n_chunk_cnt; ++i) {
-                const size_t nc    = i * n_chunk_n_cols;
-                const size_t nc_p1 = nc + 1 * n_chunk_n_cols;
-                const size_t nc_p2 = nc + 2 * n_chunk_n_cols;
+        // main loop: wait C_i → submit C_{i+1} → D_i + B_{i+2} (parallel with C_{i+1})
+        for (int i = 0; i < n_chunk_cnt; ++i) {
+            const size_t nc    = i * n_chunk_n_cols;
+            const size_t nc_p1 = nc + 1 * n_chunk_n_cols;
+            const size_t nc_p2 = nc + 2 * n_chunk_n_cols;
 
-                const size_t n_cols    = hex_smin(n - nc, n_chunk_n_cols);
-                const size_t n_cols_p1 = hex_smin(n - nc_p1, n_chunk_n_cols);
-                const size_t n_cols_p2 = hex_smin(n - nc_p2, n_chunk_n_cols);
+            const size_t n_cols    = hex_smin(n - nc, n_chunk_n_cols);
+            const size_t n_cols_p1 = hex_smin(n - nc_p1, n_chunk_n_cols);
+            const size_t n_cols_p2 = hex_smin(n - nc_p2, n_chunk_n_cols);
 
-                // issue A_{i+2}: DMA push (non-blocking)
-                if (i + 2 < n_chunk_cnt) {
-                    const uint8_t *qweight_chunk_p2 = permuted_weight + nc_p2 * row_stride;
-                    dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_p2), row_stride, row_stride, row_stride, n_cols_p2);
-                }
+            // issue A_{i+2}: DMA push (non-blocking)
+            if (i + 2 < n_chunk_cnt) {
+                const uint8_t *qweight_chunk_p2 = permuted_weight + nc_p2 * row_stride;
+                dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_p2), row_stride, row_stride, row_stride, n_cols_p2);
+            }
 
-                // wait C_i: block until prologue/previous C completes
-                hmx_queue_pop(ctx->hmx_queue);
-
-                // submit C_{i+1} (non-blocking, overlaps with D_i + B_{i+2} below)
-                // job_slots[(i+1)%2] is safe: C_i just completed, freeing slot i%2's
-                // counterpart — and (i+1)%2 was last used by C_{i-1} which completed
-                // before C_i was submitted.
-                if (i + 1 < n_chunk_cnt) {
-                    hmx_matmul_job_init(&job_slots[(i + 1) % 2], (__fp16 *) vtcm_output_bufs[(i + 1) % 2],
-                                        (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2],
-                                        vtcm_scales, hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
-                                        hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
-                    hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[(i + 1) % 2]));
-                }
+            // wait C_i: block until prologue/previous C completes
+            hmx_queue_pop(ctx->hmx_queue);
+
+            // submit C_{i+1} (non-blocking, overlaps with D_i + B_{i+2} below)
+            // job_slots[(i+1)%2] is safe: C_i just completed, freeing slot i%2's
+            // counterpart — and (i+1)%2 was last used by C_{i-1} which completed
+            // before C_i was submitted.
+            if (i + 1 < n_chunk_cnt) {
+                hmx_matmul_job_init(&job_slots[(i + 1) % 2], (__fp16 *) vtcm_output_bufs[(i + 1) % 2],
+                                    (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2],
+                                    vtcm_scales, hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
+                                    hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
+                hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[(i + 1) % 2]));
+            }
 
-                // D_i: store output (multi-thread HVX, parallel with C_{i+1})
-                float *output_chunk = dst + (mr * n + nc);
-                transfer_output_chunk_threaded(ctx, output_chunk, vtcm_output_bufs[i % 2], n_rows, n_cols, n);
+            // D_i: store output (multi-thread HVX, parallel with C_{i+1})
+            float *output_chunk = dst + (mr * n + nc);
+            transfer_output_chunk_threaded(ctx, output_chunk, vtcm_output_bufs[i % 2], n_rows, n_cols, n);
 
-                // B_{i+2}: DMA pop + dequant (multi-thread HVX, parallel with C_{i+1})
-                if (i + 2 < n_chunk_cnt) {
-                    dma_queue_pop(ctx->dma[0]);
-                    dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type);
-                }
+            // B_{i+2}: DMA pop + dequant (multi-thread HVX, parallel with C_{i+1})
+            if (i + 2 < n_chunk_cnt) {
+                dma_queue_pop(ctx->dma[0]);
+                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type);
             }
         }
-
-        hmx_queue_suspend(ctx->hmx_queue);
     }
 
+    hmx_queue_suspend(ctx->hmx_queue);
+
     TIMER_STOP(total);
 
 #if defined(ENABLE_PROFILE_TIMERS)
-    FARF(HIGH, "%s: %lld us, m=%d k=%d n=%d pipeline=%d", __func__, TIMER_US(total), m, k, n, use_pipeline);
+    FARF(HIGH, "hex-mm-q: %lld us : m %d k %d n %d", TIMER_US(total), m, k, n);
     if (!use_pipeline) {
         FARF(HIGH, "  activation_load: %lld us, weight_load: %lld us, hmx_core: %lld us, output_store: %lld us",
              TIMER_US(activation_load), TIMER_US(weight_load), TIMER_US(hmx_core), TIMER_US(output_store));
@@ -1370,15 +1156,15 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
 
 //
 
-static inline int hmx_matmul_batch_r2(const hmx_matmul_w16a32_batched_params_t *params) {
+static inline int hmx_matmul_batch_r2(const hmx_matmul_f16_f32_batched_params_t *params) {
     return params->ne02 > 0 ? params->ne12 / params->ne02 : 1;
 }
 
-static inline int hmx_matmul_batch_r3(const hmx_matmul_w16a32_batched_params_t *params) {
+static inline int hmx_matmul_batch_r3(const hmx_matmul_f16_f32_batched_params_t *params) {
     return params->ne03 > 0 ? params->ne13 / params->ne03 : 1;
 }
 
-static inline const __fp16 *hmx_matmul_weight_batch_ptr(const hmx_matmul_w16a32_batched_params_t *params,
+static inline const __fp16 *hmx_matmul_weight_batch_ptr(const hmx_matmul_f16_f32_batched_params_t *params,
                                                         int dst_b2, int dst_b3) {
     const int r2 = hmx_matmul_batch_r2(params);
     const int r3 = hmx_matmul_batch_r3(params);
@@ -1387,37 +1173,36 @@ static inline const __fp16 *hmx_matmul_weight_batch_ptr(const hmx_matmul_w16a32_
                              (size_t) (dst_b3 / r3) * params->src0_nb3);
 }
 
-static inline const float *hmx_matmul_activation_batch_ptr(const hmx_matmul_w16a32_batched_params_t *params,
+static inline const float *hmx_matmul_activation_batch_ptr(const hmx_matmul_f16_f32_batched_params_t *params,
                                                            int dst_b2, int dst_b3) {
     return (const float *) ((const uint8_t *) params->activation +
                             (size_t) dst_b2 * params->src1_nb2 +
                             (size_t) dst_b3 * params->src1_nb3);
 }
 
-static inline float *hmx_matmul_dst_batch_ptr(const hmx_matmul_w16a32_batched_params_t *params,
+static inline float *hmx_matmul_dst_batch_ptr(const hmx_matmul_f16_f32_batched_params_t *params,
                                               int dst_b2, int dst_b3) {
     return (float *) ((uint8_t *) params->dst +
                       (size_t) dst_b2 * params->dst_nb2 +
                       (size_t) dst_b3 * params->dst_nb3);
 }
 
-static int hmx_mat_mul_permuted_w16a32_batched_legacy(struct htp_context *ctx,
-                                                      const hmx_matmul_w16a32_batched_params_t *params) {
+static int hmx_matmul_f16_f32_batched_legacy(struct htp_context *ctx,
+                                                      const hmx_matmul_f16_f32_batched_params_t *params) {
     int ret = 0;
     for (int b3 = 0; b3 < params->ne13 && ret == 0; ++b3) {
         for (int b2 = 0; b2 < params->ne12 && ret == 0; ++b2) {
-            ret = hmx_mat_mul_permuted_w16a32(ctx,
-                                              hmx_matmul_dst_batch_ptr(params, b2, b3),
-                                              hmx_matmul_activation_batch_ptr(params, b2, b3),
-                                              hmx_matmul_weight_batch_ptr(params, b2, b3),
-                                              params->m, params->k, params->n,
-                                              params->act_stride, params->weight_stride);
+            ret = hmx_matmul_f16_f32(ctx, hmx_matmul_dst_batch_ptr(params, b2, b3),
+                                           hmx_matmul_activation_batch_ptr(params, b2, b3),
+                                           hmx_matmul_weight_batch_ptr(params, b2, b3),
+                                           params->m, params->k, params->n,
+                                           params->act_stride, params->weight_stride);
         }
     }
     return ret;
 }
 
-int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmul_w16a32_batched_params_t *params) {
+int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32_batched_params_t *params) {
     if (!ctx || !params || !params->dst || !params->activation || !params->permuted_weight) { return -1; }
     if (!params->m || !params->k || !params->n) { return -1; }
     if (params->act_stride < params->k || params->weight_stride < params->k || params->dst_stride < params->n) { return -1; }
@@ -1435,7 +1220,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
 
     if (group_size <= 1) {
         FARF(HIGH, "%s: no dim2 GQA reuse (group=%d), using legacy batched loop", __func__, group_size);
-        return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params);
+        return hmx_matmul_f16_f32_batched_legacy(ctx, params);
     }
 
     // Grouped path: reuse interleaved weight across all q_heads sharing a
@@ -1464,7 +1249,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
                            /*m_block_cost=*/(size_t) params->n,
                            /*n_block_cost=*/(size_t) params->m, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
         FARF(HIGH, "%s: grouped path does not fit VTCM, falling back to legacy batched loop", __func__);
-        return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params);
+        return hmx_matmul_f16_f32_batched_legacy(ctx, params);
     }
 
     const size_t act_head_stride      = m_chunk_n_rows * (size_t) params->k;  // fp16 elements between heads
@@ -1486,7 +1271,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
 
     if ((size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base) > vtcm_budget) {
         FARF(HIGH, "%s: grouped layout overflowed VTCM, falling back to legacy batched loop", __func__);
-        return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params);
+        return hmx_matmul_f16_f32_batched_legacy(ctx, params);
     }
 
     hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00));  // scale: 1.0, bias: 0.0 in FP16
@@ -1614,7 +1399,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
 
 //
 
-int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, const float *restrict activation,
+int hmx_matmul_f16_f32(struct htp_context *ctx, float *restrict dst, const float *restrict activation,
                                 const __fp16 *restrict permuted_weight, int m, int k, int n,
                                 int act_stride, int weight_stride) {
     if (!dst || !activation || !permuted_weight || !m || !n || !k) { return -1; }
diff --git a/ggml/src/ggml-hexagon/htp/hmx-ops.h b/ggml/src/ggml-hexagon/htp/hmx-ops.h
index 1c78ffadd1c..f114edb822f 100644
--- a/ggml/src/ggml-hexagon/htp/hmx-ops.h
+++ b/ggml/src/ggml-hexagon/htp/hmx-ops.h
@@ -33,14 +33,14 @@ typedef struct {
     size_t        src1_nb3;
     size_t        dst_nb2;
     size_t        dst_nb3;
-} hmx_matmul_w16a32_batched_params_t;
+} hmx_matmul_f16_f32_batched_params_t;
 
 // HMX matrix multiplication — tile-permuted FP16 weights, FP32 activation/output
 // act_stride: activation row stride in elements (= k for contiguous, or
 //             nb[1]/sizeof(float) for permuted tensors like attention Q).
 // weight_stride: weight row stride in elements (= k for compact weights, or
 //                nb[1]/sizeof(__fp16) for permuted KV-cache views used by QK).
-int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx,
+int hmx_matmul_f16_f32(struct htp_context *ctx,
                                 float *restrict dst,
                                 const float *activation,
                                 const __fp16 *permuted_weight,
@@ -48,13 +48,12 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx,
                                 int act_stride,
                                 int weight_stride);
 
-// Batched F16 wrapper over hmx_mat_mul_permuted_w16a32.
+// Batched F16 wrapper over hmx_mat_mul_f16_f32.
 // Batch semantics match ggml_mul_mat(): src0 broadcasts to src1 in dims 2/3.
-int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx,
-                                        const hmx_matmul_w16a32_batched_params_t *params);
+int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32_batched_params_t *params);
 
-// HMX matrix multiplication — tile-permuted quantised weights (Q4_0/Q8_0/IQ4_NL)
-int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx,
+// HMX matrix multiplication — quantised weights (Q4_0/Q8_0/IQ4_NL/MXFP4)
+int hmx_matmul_q_f32(struct htp_context *ctx,
                                       float *restrict dst,
                                       const float *activation,
                                       const uint8_t *permuted_weight,
diff --git a/ggml/src/ggml-hexagon/htp/htp-ctx.h b/ggml/src/ggml-hexagon/htp/htp-ctx.h
index 6fe3e6c7d85..51f9243ce0a 100644
--- a/ggml/src/ggml-hexagon/htp/htp-ctx.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h
@@ -104,6 +104,7 @@ int op_argsort(struct htp_ops_context * octx);
 int op_ssm_conv(struct htp_ops_context * octx);
 int op_cumsum(struct htp_ops_context * octx);
 int op_fill(struct htp_ops_context * octx);
+int op_concat(struct htp_ops_context * octx);
 int op_diag(struct htp_ops_context * octx);
 int op_solve_tri(struct htp_ops_context * octx);
 int op_gated_delta_net(struct htp_ops_context * octx);
diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h
index 676e948a439..aadc77235ba 100644
--- a/ggml/src/ggml-hexagon/htp/htp-ops.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ops.h
@@ -20,6 +20,7 @@ enum htp_data_type {
     HTP_TYPE_F32    = 0,
     HTP_TYPE_F16    = 1,
     HTP_TYPE_Q4_0   = 2,
+    HTP_TYPE_Q4_1   = 3,
     HTP_TYPE_Q8_0   = 8,
     HTP_TYPE_IQ4_NL = 20,
     HTP_TYPE_I32    = 26,
@@ -28,6 +29,7 @@ enum htp_data_type {
 
     // types used internally for repack, dyn.quant, etc
     HTP_TYPE_Q4_0x4x2 = 200,
+    HTP_TYPE_Q4_1x4x2,
     HTP_TYPE_Q8_0x4x2,
     HTP_TYPE_MXFP4x4x2,
 
@@ -88,6 +90,8 @@ enum htp_op_code {
     HTP_OP_GATED_DELTA_NET,
     HTP_OP_TRI,
     HTP_OP_PAD,
+    HTP_OP_NORM,
+    HTP_OP_CONCAT,
 
     HTP_OP_INVALID
 };
diff --git a/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h b/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h
new file mode 100644
index 00000000000..c5b9a5d47c1
--- /dev/null
+++ b/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h
@@ -0,0 +1,90 @@
+#ifndef HVX_SIN_COS_H
+#define HVX_SIN_COS_H
+
+#include "hvx-base.h"
+#include "hvx-floor.h"
+
+static inline HVX_Vector hvx_vec_cos_f32(HVX_Vector x) {
+    HVX_Vector const_inv_pi = hvx_vec_splat_f32(0.3183098861837907f);
+    HVX_Vector const_half   = hvx_vec_splat_f32(0.5f);
+    HVX_Vector const_pi     = hvx_vec_splat_f32(3.141592653589793f);
+    HVX_Vector const_one    = hvx_vec_splat_f32(1.0f);
+    HVX_Vector const_neg_one = hvx_vec_splat_f32(-1.0f);
+
+    // n = floor(x * (1/pi) + 0.5)
+    HVX_Vector n_float = hvx_vec_floor_f32(hvx_vec_add_f32_f32(hvx_vec_mul_f32_f32(x, const_inv_pi), const_half));
+
+    // y = x - n * pi
+    HVX_Vector y = hvx_vec_sub_f32_f32(x, hvx_vec_mul_f32_f32(n_float, const_pi));
+
+    // Sign determination: if n is odd, sign is -1.0f, else 1.0f
+    // half_n = n * 0.5f
+    HVX_Vector half_n = hvx_vec_mul_f32_f32(n_float, const_half);
+    // floor_half_n = floor(half_n)
+    HVX_Vector floor_half_n = hvx_vec_floor_f32(half_n);
+    // is_odd = half_n > floor_half_n
+    HVX_VectorPred is_odd = Q6_Q_vcmp_gt_VsfVsf(half_n, floor_half_n);
+    // sign = vmux(is_odd, -1.0f, 1.0f)
+    HVX_Vector sign = Q6_V_vmux_QVV(is_odd, const_neg_one, const_one);
+
+    // z = y^2
+    HVX_Vector z = hvx_vec_mul_f32_f32(y, y);
+
+    // Chebyshev approximation for cos(y)
+    HVX_Vector c4 = hvx_vec_splat_f32(2.3557242013849433e-05f);
+    HVX_Vector c3 = hvx_vec_splat_f32(-0.0013871428263450528f);
+    HVX_Vector c2 = hvx_vec_splat_f32(0.041665895266688284f);
+    HVX_Vector c1 = hvx_vec_splat_f32(-0.4999999360426369f);
+    HVX_Vector c0 = hvx_vec_splat_f32(0.9999999999071725f);
+
+    HVX_Vector cos_y = hvx_vec_add_f32_f32(c3, hvx_vec_mul_f32_f32(z, c4));
+    cos_y = hvx_vec_add_f32_f32(c2, hvx_vec_mul_f32_f32(z, cos_y));
+    cos_y = hvx_vec_add_f32_f32(c1, hvx_vec_mul_f32_f32(z, cos_y));
+    cos_y = hvx_vec_add_f32_f32(c0, hvx_vec_mul_f32_f32(z, cos_y));
+
+    return hvx_vec_mul_f32_f32(cos_y, sign);
+}
+
+static inline HVX_Vector hvx_vec_sin_f32(HVX_Vector x) {
+    HVX_Vector const_inv_pi = hvx_vec_splat_f32(0.3183098861837907f);
+    HVX_Vector const_half   = hvx_vec_splat_f32(0.5f);
+    HVX_Vector const_pi     = hvx_vec_splat_f32(3.141592653589793f);
+    HVX_Vector const_one    = hvx_vec_splat_f32(1.0f);
+    HVX_Vector const_neg_one = hvx_vec_splat_f32(-1.0f);
+
+    // n = floor(x * (1/pi) + 0.5)
+    HVX_Vector n_float = hvx_vec_floor_f32(hvx_vec_add_f32_f32(hvx_vec_mul_f32_f32(x, const_inv_pi), const_half));
+
+    // y = x - n * pi
+    HVX_Vector y = hvx_vec_sub_f32_f32(x, hvx_vec_mul_f32_f32(n_float, const_pi));
+
+    // Sign determination: if n is odd, sign is -1.0f, else 1.0f
+    // half_n = n * 0.5f
+    HVX_Vector half_n = hvx_vec_mul_f32_f32(n_float, const_half);
+    // floor_half_n = floor(half_n)
+    HVX_Vector floor_half_n = hvx_vec_floor_f32(half_n);
+    // is_odd = half_n > floor_half_n
+    HVX_VectorPred is_odd = Q6_Q_vcmp_gt_VsfVsf(half_n, floor_half_n);
+    // sign = vmux(is_odd, -1.0f, 1.0f)
+    HVX_Vector sign = Q6_V_vmux_QVV(is_odd, const_neg_one, const_one);
+
+    // z = y^2
+    HVX_Vector z = hvx_vec_mul_f32_f32(y, y);
+
+    // Chebyshev approximation for sin(y)
+    HVX_Vector s4 = hvx_vec_splat_f32(2.642186986152672e-06f);
+    HVX_Vector s3 = hvx_vec_splat_f32(-0.00019825318964070864f);
+    HVX_Vector s2 = hvx_vec_splat_f32(0.00833326283319605f);
+    HVX_Vector s1 = hvx_vec_splat_f32(-0.16666666082087775f);
+    HVX_Vector s0 = hvx_vec_splat_f32(0.999999999915155f);
+
+    HVX_Vector sin_y = hvx_vec_add_f32_f32(s3, hvx_vec_mul_f32_f32(z, s4));
+    sin_y = hvx_vec_add_f32_f32(s2, hvx_vec_mul_f32_f32(z, sin_y));
+    sin_y = hvx_vec_add_f32_f32(s1, hvx_vec_mul_f32_f32(z, sin_y));
+    sin_y = hvx_vec_add_f32_f32(s0, hvx_vec_mul_f32_f32(z, sin_y));
+    sin_y = hvx_vec_mul_f32_f32(y, sin_y);
+
+    return hvx_vec_mul_f32_f32(sin_y, sign);
+}
+
+#endif /* HVX_SIN_COS_H */
diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index e0452811ec3..0a760cd344c 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -14,6 +14,8 @@
 #include "hvx-sqrt.h"
 #include "hvx-arith.h"
 #include "hvx-div.h"
+#include "hvx-floor.h"
+#include "hvx-sin-cos.h"
 #include "hvx-base.h"
 
 #endif /* HVX_UTILS_H */
diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c
index 12003c1fd8a..7dd90ac7d7f 100644
--- a/ggml/src/ggml-hexagon/htp/main.c
+++ b/ggml/src/ggml-hexagon/htp/main.c
@@ -87,35 +87,37 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
         }
     }
 
+#if __HVX_ARCH__ >= 75
     {
-        // Power on HMX
+        // Power on HMX and set HMX clock
         HAP_power_request_t request;
         memset(&request, 0, sizeof(HAP_power_request_t));
-        request.type         = HAP_power_set_HMX;
-        request.hmx.power_up = TRUE;
-        FARF(ALWAYS, "Powering HMX on\n");
-        err = HAP_power_set((void *) &ctx, &request);
+        request.type = HAP_power_set_HMX_v2;
+        request.hmx_v2.set_power     = TRUE;
+        request.hmx_v2.power_up      = TRUE;
+        request.hmx_v2.set_clock     = TRUE;
+        request.hmx_v2.target_corner = HAP_DCVS_EXP_VCORNER_MAX;
+        request.hmx_v2.min_corner    = HAP_DCVS_EXP_VCORNER_MAX;
+        request.hmx_v2.max_corner    = HAP_DCVS_EXP_VCORNER_MAX;
+        request.hmx_v2.perf_mode     = HAP_CLK_PERF_HIGH;
+        FARF(ALWAYS, "Setting HMX clock\n");
+        err = HAP_power_set((void *) ctx, &request);
         if (err != AEE_SUCCESS) {
-            FARF(ERROR, "Error powering on HMX.");
+            FARF(ERROR, "Error setting HMX clock.");
             return err;
         }
     }
-
-#if __HVX_ARCH__ >= 75
+#else
     {
-        // Set HMX clock
+        // Power on HMX
         HAP_power_request_t request;
         memset(&request, 0, sizeof(HAP_power_request_t));
-        request.type = HAP_power_set_HMX_v2;
-        request.hmx_v2.set_clock = TRUE;
-        request.hmx_v2.target_corner = HAP_DCVS_EXP_VCORNER_MAX;
-        request.hmx_v2.min_corner = HAP_DCVS_EXP_VCORNER_MAX;
-        request.hmx_v2.max_corner = HAP_DCVS_EXP_VCORNER_MAX;
-        request.hmx_v2.perf_mode = HAP_CLK_PERF_HIGH;
-        FARF(ALWAYS, "Setting HMX clock\n");
-        err = HAP_power_set((void *) &ctx, &request);
+        request.type         = HAP_power_set_HMX;
+        request.hmx.power_up = TRUE;
+        FARF(ALWAYS, "Powering HMX on\n");
+        err = HAP_power_set((void *) ctx, &request);
         if (err != AEE_SUCCESS) {
-            FARF(ERROR, "Error setting HMX clock.");
+            FARF(ERROR, "Error powering on HMX.");
             return err;
         }
     }
@@ -418,8 +420,7 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
 
     ctx->n_threads = n_hvx;
     for (int i = 0; i < ctx->n_threads; i++) {
-        // see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541
-        ctx->dma[i] = dma_queue_create(128);
+        ctx->dma[i] = dma_queue_create(256); // queue depth
     }
 
     // init worker pool
@@ -534,6 +535,7 @@ static int execute_op(struct htp_ops_context * octx) {
         case HTP_OP_ADD_ID:
             return op_binary(octx);
 
+        case HTP_OP_NORM:
         case HTP_OP_RMS_NORM:
         case HTP_OP_SCALE:
         case HTP_OP_SQR:
@@ -598,6 +600,9 @@ static int execute_op(struct htp_ops_context * octx) {
         case HTP_OP_PAD:
             return op_pad(octx);
 
+        case HTP_OP_CONCAT:
+            return op_concat(octx);
+
         case HTP_OP_GATED_DELTA_NET:
             return op_gated_delta_net(octx);
 
@@ -848,6 +853,11 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
         for (uint32_t i=0; i < n_ops; i++) {
             struct profile_data prof;
 
+            if (i == (n_ops-1)) {
+                // wake up the host before starting the last op
+                dspqueue_write_early_wakeup_noblock(queue, 0, 0);
+            }
+
             profile_start(ctx->profiler, &prof);
 
             proc_op_req(octx, tens, i, &ops[i]);
@@ -864,8 +874,6 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
             }
         }
 
-        // dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0);
-
         struct htp_opbatch_rsp rsp;
         rsp.id        = req.id;
         rsp.status    = HTP_STATUS_OK;
diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index 2461ae617fa..7036c491bc4 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -40,6 +40,11 @@ struct htp_matmul_context {
          const void * restrict vx0, const void * restrict vx1,
          const void * restrict vy0, const void * restrict vy1);
 
+    void (*vec_dot_4x1)(const int n, float * restrict s0,
+         const void * restrict vx0, const void * restrict vx1,
+         const void * restrict vx2, const void * restrict vx3,
+         const void * restrict vy0);
+
     // Precomputed values
     uint32_t src0_nrows_per_thread;
     uint32_t src1_nrows_per_thread;
@@ -155,6 +160,13 @@ static inline size_t q8x4x2_row_size(uint32_t ne) {
     return hex_round_up(ne + nb * 8 * sizeof(__fp16), 128);
 }
 
+static inline size_t q8_1x4x2_row_size(uint32_t ne) {
+    // ensures perfect alignment of quants and full row
+    const uint32_t qk = QK_Q8_0x4x2;
+    const uint32_t nb = (ne + qk - 1) / qk;
+    return hex_round_up(ne + nb * 8 * 2 * sizeof(__fp16), 128);
+}
+
 static inline HVX_Vector_x8 hvx_vec_load_q4x4x8_full(const uint8_t * restrict ptr) {
     const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
 
@@ -223,6 +235,62 @@ static HVX_Vector_x8 hvx_vec_load_q4x4x8_partial(const uint8_t * restrict ptr, u
     return r;
 }
 
+static inline HVX_Vector_x8 hvx_vec_load_q4_1x4x8_full(const uint8_t * restrict ptr) {
+    const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
+
+    HVX_Vector v0_1 = vptr[0];  // first 256 elements (128 bytes)
+    HVX_Vector v2_3 = vptr[1];  // ...
+    HVX_Vector v4_5 = vptr[2];  // ...
+    HVX_Vector v6_7 = vptr[3];  // ...
+
+    const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
+
+    HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4);  // & 0x0F : first  128 elements
+    HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4);    // >> 4   : second 128 elements
+    HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4);  // & 0x0F ...
+    HVX_Vector v3 = Q6_Vub_vlsr_VubR(v2_3, 4);    // >> 4
+    HVX_Vector v4 = Q6_V_vand_VV(v4_5, mask_h4);  // & 0x0F
+    HVX_Vector v5 = Q6_Vub_vlsr_VubR(v4_5, 4);    // >> 4
+    HVX_Vector v6 = Q6_V_vand_VV(v6_7, mask_h4);  // & 0x0F
+    HVX_Vector v7 = Q6_Vub_vlsr_VubR(v6_7, 4);    // >> 4
+
+    HVX_Vector_x8 r = { v0, v1, v2, v3, v4, v5, v6, v7 };
+    return r;
+}
+
+static HVX_Vector_x8 hvx_vec_load_q4_1x4x8_partial(const uint8_t * restrict ptr, uint32_t n) {
+    const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
+
+    const uint32_t qk   = QK_Q4_0x4x2; // 256
+    const uint32_t nb   = n / qk;
+    const uint32_t nloe = n % qk;
+
+    const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
+
+    HVX_Vector_x8 r;
+    uint32_t i = 0;
+
+    #pragma unroll(2)
+    for (i=0; i < nb; i++) {
+        HVX_Vector v = vptr[i];                    // 256 elements (128 bytes)
+        HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4);  // & 0x0F : first  128 elements
+        HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4);    // >> 4   : second 128 elements
+        r.v[i*2+0] = v0;
+        r.v[i*2+1] = v1;
+    }
+
+    if (nloe) {
+        HVX_Vector v = vptr[i];                    // 256 elements (128 bytes)
+        HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4);  // & 0x0F : even 128 elements
+        HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4);    // >> 4   : odd  128 elements
+        HVX_VectorPair v0_1_p = Q6_W_vshuff_VVR(v1, v0, -1); // zip even:odd:...
+        r.v[i*2+0] = Q6_V_lo_W(v0_1_p);
+        r.v[i*2+1] = Q6_V_hi_W(v0_1_p);
+    }
+
+    return r;
+}
+
 static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8_full(const uint8_t * restrict ptr) {
     const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
 
@@ -401,82 +469,96 @@ static inline HVX_Vector hvx_vec_rmpy_x8_partial(HVX_Vector_x8 x, HVX_Vector_x8
     return hvx_vec_rmpy_x8_partial(x, y, 512);
 }
 
-static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) {
+static void vec_dot_q4_1x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) {
     assert(n % 32 == 0);  // min sub-block size
     assert((unsigned long) vx0 % 128 == 0);
     assert((unsigned long) vy0 % 128 == 0);
 
     const uint32_t qk = QK_Q4_0x4x2 * 4;
 
-    const uint32_t x_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t x_dblk_size = 8 * 4 * 2 * 2;                               // 32x (d, m) __fp16 = 128 bytes
     const uint32_t x_qblk_size = qk / 2;                                      // int4
     const uint32_t x_qrow_size = n / 2;                                       // int4 (not padded)
 
-    const uint32_t y_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t y_dblk_size = 8 * 4 * 4;                                   // 32x (d, s) __fp16 = 128 bytes
     const uint32_t y_qblk_size = qk;                                          // int8
     const uint32_t y_qrow_size = n;                                           // int8 (not padded)
 
     const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0);            // quants first
-    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size);  // then scales
+    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size);  // then scales/offsets
 
     const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0);               // quants first
-    const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size);     // then scales
+    const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size);     // then scales/sums
 
     // Row sum (sf)
     HVX_Vector r0_sum = Q6_V_vzero();
 
-    // Multiply and accumulate into int32.
-    // Compute combined scale (fp32).
-    // Apply scale to acc and accumulate into the row sum (qf32).
-
     const uint32_t nb   = n / qk;  // num full blocks
     const uint32_t nloe = n % qk;  // num leftover elemements
 
     uint32_t i = 0;
     for (; i < nb; i++) {
         HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q    + i * y_qblk_size);
-        HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_full(r0_x_q + i * x_qblk_size);
 
         HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
 
-        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d    + i * y_dblk_size));
-        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
+        HVX_Vector ds = *(const HVX_UVector *) (y_d    + i * y_dblk_size);
+        HVX_VectorPair ds_deal = Q6_W_vdeal_VVR(ds, ds, -2);
+        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds_deal));
+        HVX_Vector vy_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds_deal));
+
+        HVX_Vector dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
+        HVX_VectorPair dm_deal = Q6_W_vdeal_VVR(dm, dm, -2);
+        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(dm_deal));
+        HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(dm_deal));
 
         HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
+        HVX_Vector r0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy_s)));
 
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
+        HVX_Vector r0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_ms);
 
-        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa_total, r0_sum));
     }
 
     // Process leftovers
     if (nloe) {
         HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q    + i * y_qblk_size, nloe);
-        HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
 
         HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
 
-        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d    + i * y_dblk_size));
-        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
+        HVX_Vector ds = *(const HVX_UVector *) (y_d    + i * y_dblk_size);
+        HVX_VectorPair ds_deal = Q6_W_vdeal_VVR(ds, ds, -2);
+        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds_deal));
+        HVX_Vector vy_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds_deal));
+
+        HVX_Vector dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
+        HVX_VectorPair dm_deal = Q6_W_vdeal_VVR(dm, dm, -2);
+        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(dm_deal));
+        HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(dm_deal));
 
         HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
+        HVX_Vector r0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy_s)));
 
         // Zero out unused elements
         HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
         r0_dd                = Q6_V_vand_QV(bmask, r0_dd);
+        r0_ms                = Q6_V_vand_QV(bmask, r0_ms);
         r0_ia                = Q6_V_vand_QV(bmask, r0_ia);
 
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
+        HVX_Vector r0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_ms);
 
-        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa_total, r0_sum));
     }
 
     r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
-
     hvx_vec_store_u(s0, 4, r0_sum);
 }
 
-static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
+static void vec_dot_q4_1x4x2_q8x4x2_2x1(const int n, float * restrict s0,
                                       const void * restrict vx0, const void * restrict vx1,
                                       const void * restrict vy0) {
     assert(n % 32 == 0);  // min sub-block size
@@ -486,11 +568,11 @@ static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
 
     const uint32_t qk = QK_Q4_0x4x2 * 4;
 
-    const uint32_t x_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t x_dblk_size = 8 * 4 * 2 * 2;                               // 32x (d, m) __fp16 = 128 bytes
     const uint32_t x_qblk_size = qk / 2;                                      // int4
     const uint32_t x_qrow_size = n / 2;                                       // int4 (not padded)
 
-    const uint32_t y_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t y_dblk_size = 8 * 4 * 4;                                   // 32x (d, s) __fp16 = 128 bytes
     const uint32_t y_qblk_size = qk;                                          // int8
     const uint32_t y_qrow_size = n;                                           // int8 (not padded)
 
@@ -500,77 +582,306 @@ static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
     const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size;  // then scales
 
     const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0);               // quants first
-    const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size);     // then scales
+    const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size);     // then scales/sums
 
     // Row sum (sf)
     HVX_Vector r0_sum = Q6_V_vzero();
     HVX_Vector r1_sum = Q6_V_vzero();
 
-    // Multiply and accumulate into int32.
-    // Compute combined scale (fp32).
-    // Apply scale to acc and accumulate into the row sum (qf32).
-
     const uint32_t nb   = n / qk;  // num full blocks
     const uint32_t nloe = n % qk;  // num leftover elemements
 
     uint32_t i = 0;
     for (; i < nb; i++) {
         HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q    + i * y_qblk_size);
-        HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size);
-        HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_full(r0_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r1_q = hvx_vec_load_q4_1x4x8_full(r1_x_q + i * x_qblk_size);
 
         HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
         HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
 
-        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d    + i * y_dblk_size));
-        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
-        HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
+        HVX_Vector ds = *(const HVX_UVector *) (y_d    + i * y_dblk_size);
+        HVX_VectorPair ds_deal = Q6_W_vdeal_VVR(ds, ds, -2);
+        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds_deal));
+        HVX_Vector vy_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds_deal));
+
+        HVX_Vector r0_dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
+        HVX_VectorPair r0_dm_deal = Q6_W_vdeal_VVR(r0_dm, r0_dm, -2);
+        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r0_dm_deal));
+        HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r0_dm_deal));
+
+        HVX_Vector r1_dm = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size);
+        HVX_VectorPair r1_dm_deal = Q6_W_vdeal_VVR(r1_dm, r1_dm, -2);
+        HVX_Vector r1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r1_dm_deal));
+        HVX_Vector r1_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r1_dm_deal));
 
         HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
+        HVX_Vector r0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy_s)));
+
         HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
+        HVX_Vector r1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy_s)));
 
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
+        HVX_Vector r0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_ms);
+
         HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
+        HVX_Vector r1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_ms);
 
-        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
-        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa_total, r0_sum));
+        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa_total, r1_sum));
     }
 
     // Process leftovers
     if (nloe) {
         HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q    + i * y_qblk_size, nloe);
-        HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
-        HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r1_q = hvx_vec_load_q4_1x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
 
         HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
         HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe));
 
-        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d    + i * y_dblk_size));
-        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
-        HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
+        HVX_Vector ds = *(const HVX_UVector *) (y_d    + i * y_dblk_size);
+        HVX_VectorPair ds_deal = Q6_W_vdeal_VVR(ds, ds, -2);
+        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds_deal));
+        HVX_Vector vy_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds_deal));
+
+        HVX_Vector r0_dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
+        HVX_VectorPair r0_dm_deal = Q6_W_vdeal_VVR(r0_dm, r0_dm, -2);
+        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r0_dm_deal));
+        HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r0_dm_deal));
+
+        HVX_Vector r1_dm = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size);
+        HVX_VectorPair r1_dm_deal = Q6_W_vdeal_VVR(r1_dm, r1_dm, -2);
+        HVX_Vector r1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r1_dm_deal));
+        HVX_Vector r1_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r1_dm_deal));
 
         HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
+        HVX_Vector r0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy_s)));
+
         HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
+        HVX_Vector r1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy_s)));
 
         // Zero out unused elements
         HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
         r0_dd                = Q6_V_vand_QV(bmask, r0_dd);
+        r0_ms                = Q6_V_vand_QV(bmask, r0_ms);
         r1_dd                = Q6_V_vand_QV(bmask, r1_dd);
+        r1_ms                = Q6_V_vand_QV(bmask, r1_ms);
         r0_ia                = Q6_V_vand_QV(bmask, r0_ia);
         r1_ia                = Q6_V_vand_QV(bmask, r1_ia);
 
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
+        HVX_Vector r0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_ms);
+
         HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
+        HVX_Vector r1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_ms);
 
-        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
-        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa_total, r0_sum));
+        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa_total, r1_sum));
     }
 
     HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
     hvx_vec_store_u(s0, 8, rsum);
 }
 
-static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * restrict s1,
+static void vec_dot_q4_1x4x2_q8x4x2_4x1(const int n, float * restrict s0,
+                                      const void * restrict vx0, const void * restrict vx1,
+                                      const void * restrict vx2, const void * restrict vx3,
+                                      const void * restrict vy0) {
+    assert(n % 32 == 0);  // min sub-block size
+    assert((unsigned long) vx0 % 128 == 0);
+    assert((unsigned long) vx1 % 128 == 0);
+    assert((unsigned long) vx2 % 128 == 0);
+    assert((unsigned long) vx3 % 128 == 0);
+    assert((unsigned long) vy0 % 128 == 0);
+
+    const uint32_t qk = QK_Q4_0x4x2 * 4;
+
+    const uint32_t x_dblk_size = 8 * 4 * 2 * 2;                               // 32x (d, m) __fp16 = 128 bytes
+    const uint32_t x_qblk_size = qk / 2;                                      // int4
+    const uint32_t x_qrow_size = n / 2;                                       // int4 (not padded)
+
+    const uint32_t y_dblk_size = 8 * 4 * 4;                                   // 32x (d, s) __fp16 = 128 bytes
+    const uint32_t y_qblk_size = qk;                                          // int8
+    const uint32_t y_qrow_size = n;                                           // int8 (not padded)
+
+    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0;            // quants first
+    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size;  // then scales
+    const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0;            // quants first
+    const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size;  // then scales
+    const uint8_t * restrict r2_x_q = ((const uint8_t *) vx2) + 0;            // quants first
+    const uint8_t * restrict r2_x_d = ((const uint8_t *) vx2) + x_qrow_size;  // then scales
+    const uint8_t * restrict r3_x_q = ((const uint8_t *) vx3) + 0;            // quants first
+    const uint8_t * restrict r3_x_d = ((const uint8_t *) vx3) + x_qrow_size;  // then scales
+
+    const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0);               // quants first
+    const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size);     // then scales/sums
+
+    // Row sum (sf)
+    HVX_Vector r0_sum = Q6_V_vzero();
+    HVX_Vector r1_sum = Q6_V_vzero();
+    HVX_Vector r2_sum = Q6_V_vzero();
+    HVX_Vector r3_sum = Q6_V_vzero();
+
+    const uint32_t nb   = n / qk;  // num full blocks
+    const uint32_t nloe = n % qk;  // num leftover elements
+
+    uint32_t i = 0;
+    for (; i < nb; i++) {
+        HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q    + i * y_qblk_size);
+        HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_full(r0_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r1_q = hvx_vec_load_q4_1x4x8_full(r1_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r2_q = hvx_vec_load_q4_1x4x8_full(r2_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r3_q = hvx_vec_load_q4_1x4x8_full(r3_x_q + i * x_qblk_size);
+
+        HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
+        HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
+        HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r2_q, vy_q));
+        HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r3_q, vy_q));
+
+        HVX_Vector ds = *(const HVX_UVector *) (y_d    + i * y_dblk_size);
+        HVX_VectorPair ds_deal = Q6_W_vdeal_VVR(ds, ds, -2);
+        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds_deal));
+        HVX_Vector vy_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds_deal));
+
+        HVX_Vector r0_dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
+        HVX_VectorPair r0_dm_deal = Q6_W_vdeal_VVR(r0_dm, r0_dm, -2);
+        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r0_dm_deal));
+        HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r0_dm_deal));
+
+        HVX_Vector r1_dm = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size);
+        HVX_VectorPair r1_dm_deal = Q6_W_vdeal_VVR(r1_dm, r1_dm, -2);
+        HVX_Vector r1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r1_dm_deal));
+        HVX_Vector r1_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r1_dm_deal));
+
+        HVX_Vector r2_dm = *(const HVX_UVector *) (r2_x_d + i * x_dblk_size);
+        HVX_VectorPair r2_dm_deal = Q6_W_vdeal_VVR(r2_dm, r2_dm, -2);
+        HVX_Vector r2_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r2_dm_deal));
+        HVX_Vector r2_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r2_dm_deal));
+
+        HVX_Vector r3_dm = *(const HVX_UVector *) (r3_x_d + i * x_dblk_size);
+        HVX_VectorPair r3_dm_deal = Q6_W_vdeal_VVR(r3_dm, r3_dm, -2);
+        HVX_Vector r3_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r3_dm_deal));
+        HVX_Vector r3_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r3_dm_deal));
+
+        HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
+        HVX_Vector r0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy_s)));
+
+        HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
+        HVX_Vector r1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy_s)));
+
+        HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d)));
+        HVX_Vector r2_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_m, vy_s)));
+
+        HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d)));
+        HVX_Vector r3_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_m, vy_s)));
+
+        HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
+        HVX_Vector r0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_ms);
+
+        HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
+        HVX_Vector r1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_ms);
+
+        HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd);
+        HVX_Vector r2_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_ms);
+
+        HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd);
+        HVX_Vector r3_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_ms);
+
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa_total, r0_sum));
+        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa_total, r1_sum));
+        r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa_total, r2_sum));
+        r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa_total, r3_sum));
+    }
+
+    if (nloe) {
+        HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q    + i * y_qblk_size, nloe);
+        HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r1_q = hvx_vec_load_q4_1x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r2_q = hvx_vec_load_q4_1x4x8_partial(r2_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r3_q = hvx_vec_load_q4_1x4x8_partial(r3_x_q + i * x_qblk_size, nloe);
+
+        HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
+        HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe));
+        HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r2_q, vy_q, nloe));
+        HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r3_q, vy_q, nloe));
+
+        HVX_Vector ds = *(const HVX_UVector *) (y_d    + i * y_dblk_size);
+        HVX_VectorPair ds_deal = Q6_W_vdeal_VVR(ds, ds, -2);
+        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds_deal));
+        HVX_Vector vy_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds_deal));
+
+        HVX_Vector r0_dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
+        HVX_VectorPair r0_dm_deal = Q6_W_vdeal_VVR(r0_dm, r0_dm, -2);
+        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r0_dm_deal));
+        HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r0_dm_deal));
+
+        HVX_Vector r1_dm = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size);
+        HVX_VectorPair r1_dm_deal = Q6_W_vdeal_VVR(r1_dm, r1_dm, -2);
+        HVX_Vector r1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r1_dm_deal));
+        HVX_Vector r1_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r1_dm_deal));
+
+        HVX_Vector r2_dm = *(const HVX_UVector *) (r2_x_d + i * x_dblk_size);
+        HVX_VectorPair r2_dm_deal = Q6_W_vdeal_VVR(r2_dm, r2_dm, -2);
+        HVX_Vector r2_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r2_dm_deal));
+        HVX_Vector r2_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r2_dm_deal));
+
+        HVX_Vector r3_dm = *(const HVX_UVector *) (r3_x_d + i * x_dblk_size);
+        HVX_VectorPair r3_dm_deal = Q6_W_vdeal_VVR(r3_dm, r3_dm, -2);
+        HVX_Vector r3_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r3_dm_deal));
+        HVX_Vector r3_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r3_dm_deal));
+
+        HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
+        HVX_Vector r0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy_s)));
+
+        HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
+        HVX_Vector r1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy_s)));
+
+        HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d)));
+        HVX_Vector r2_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_m, vy_s)));
+
+        HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d)));
+        HVX_Vector r3_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_m, vy_s)));
+
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
+        r0_dd                = Q6_V_vand_QV(bmask, r0_dd);
+        r0_ms                = Q6_V_vand_QV(bmask, r0_ms);
+        r1_dd                = Q6_V_vand_QV(bmask, r1_dd);
+        r1_ms                = Q6_V_vand_QV(bmask, r1_ms);
+        r2_dd                = Q6_V_vand_QV(bmask, r2_dd);
+        r2_ms                = Q6_V_vand_QV(bmask, r2_ms);
+        r3_dd                = Q6_V_vand_QV(bmask, r3_dd);
+        r3_ms                = Q6_V_vand_QV(bmask, r3_ms);
+        r0_ia                = Q6_V_vand_QV(bmask, r0_ia);
+        r1_ia                = Q6_V_vand_QV(bmask, r1_ia);
+        r2_ia                = Q6_V_vand_QV(bmask, r2_ia);
+        r3_ia                = Q6_V_vand_QV(bmask, r3_ia);
+
+        HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
+        HVX_Vector r0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_ms);
+
+        HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
+        HVX_Vector r1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_ms);
+
+        HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd);
+        HVX_Vector r2_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_ms);
+
+        HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd);
+        HVX_Vector r3_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_ms);
+
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa_total, r0_sum));
+        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa_total, r1_sum));
+        r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa_total, r2_sum));
+        r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa_total, r3_sum));
+    }
+
+    HVX_Vector_x4 rsum_in = { .v = { r0_sum, r1_sum, r2_sum, r3_sum } };
+    HVX_Vector rsum = hvx_vec_reduce_sum_f32x4(rsum_in);
+    hvx_vec_store_u(s0, 16, rsum);
+}
+
+
+static void vec_dot_q4_1x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * restrict s1,
                                         const void * restrict vx0, const void * restrict vx1,
                                         const void * restrict vy0, const void * restrict vy1) {
     assert(n % 32 == 0);
@@ -581,11 +892,11 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
 
     const uint32_t qk = QK_Q4_0x4x2 * 4;
 
-    const uint32_t x_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t x_dblk_size = 8 * 4 * 2 * 2;                               // 32x (d, m) __fp16 = 128 bytes
     const uint32_t x_qblk_size = qk / 2;                                      // int4
     const uint32_t x_qrow_size = n / 2;                                       // int4 (not padded)
 
-    const uint32_t y_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t y_dblk_size = 8 * 4 * 4;                                   // 32x (d, s) __fp16 = 128 bytes
     const uint32_t y_qblk_size = qk;                                          // int8
     const uint32_t y_qrow_size = n;                                           // int8 (not padded)
 
@@ -595,9 +906,9 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
     const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size;  // then scales
 
     const uint8_t * restrict y0_q = ((const uint8_t *) vy0) + 0;              // quants first
-    const uint8_t * restrict y0_d = ((const uint8_t *) vy0) + y_qrow_size;    // then scales
+    const uint8_t * restrict y0_d = ((const uint8_t *) vy0) + y_qrow_size;    // then scales/sums
     const uint8_t * restrict y1_q = ((const uint8_t *) vy1) + 0;              // quants first
-    const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size;    // then scales
+    const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size;    // then scales/sums
 
     // Row sums (sf) - 4 accumulators for 2×2 tile
     HVX_Vector r0_c0_sum = Q6_V_vzero();
@@ -610,13 +921,13 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
 
     uint32_t i = 0;
     for (; i < nb; i++) {
-        // Load src1 columns (reused across both src0 rows)
+        // Load src1 columns
         HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size);
         HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size);
 
-        // Load src0 rows (reused across both src1 columns)
-        HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size);
-        HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size);
+        // Load src0 rows
+        HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_full(r0_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r1_q = hvx_vec_load_q4_1x4x8_full(r1_x_q + i * x_qblk_size);
 
         // Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
         HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
@@ -625,16 +936,38 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
         HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q));
 
         // Load scales
-        HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d   + i * y_dblk_size));
-        HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d   + i * y_dblk_size));
-        HVX_Vector r0_d  = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
-        HVX_Vector r1_d  = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
+        HVX_Vector ds0 = *(const HVX_UVector *) (y0_d   + i * y_dblk_size);
+        HVX_VectorPair ds0_deal = Q6_W_vdeal_VVR(ds0, ds0, -2);
+        HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds0_deal));
+        HVX_Vector vy0_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds0_deal));
+
+        HVX_Vector ds1 = *(const HVX_UVector *) (y1_d   + i * y_dblk_size);
+        HVX_VectorPair ds1_deal = Q6_W_vdeal_VVR(ds1, ds1, -2);
+        HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds1_deal));
+        HVX_Vector vy1_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds1_deal));
+
+        HVX_Vector r0_dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
+        HVX_VectorPair r0_dm_deal = Q6_W_vdeal_VVR(r0_dm, r0_dm, -2);
+        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r0_dm_deal));
+        HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r0_dm_deal));
+
+        HVX_Vector r1_dm = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size);
+        HVX_VectorPair r1_dm_deal = Q6_W_vdeal_VVR(r1_dm, r1_dm, -2);
+        HVX_Vector r1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r1_dm_deal));
+        HVX_Vector r1_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r1_dm_deal));
 
         // Compute combined scales
         HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d)));
+        HVX_Vector r0_c0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy0_s)));
+
         HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d)));
+        HVX_Vector r0_c1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy1_s)));
+
         HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
+        HVX_Vector r1_c0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy0_s)));
+
         HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
+        HVX_Vector r1_c1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy1_s)));
 
         // Apply scales and accumulate
         HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
@@ -642,40 +975,72 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
         HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
         HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
 
-        r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
-        r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
-        r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
-        r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
+        HVX_Vector r0_c0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_ms);
+        HVX_Vector r0_c1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_ms);
+        HVX_Vector r1_c0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_ms);
+        HVX_Vector r1_c1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_ms);
+
+        r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa_total, r0_c0_sum));
+        r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa_total, r0_c1_sum));
+        r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa_total, r1_c0_sum));
+        r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa_total, r1_c1_sum));
     }
 
     // Process leftovers
     if (nloe) {
         HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q   + i * y_qblk_size, nloe);
         HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q   + i * y_qblk_size, nloe);
-        HVX_Vector_x8 r0_q  = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
-        HVX_Vector_x8 r1_q  = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r0_q  = hvx_vec_load_q4_1x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r1_q  = hvx_vec_load_q4_1x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
 
         HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe));
         HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe));
         HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe));
         HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe));
 
-        HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d   + i * y_dblk_size));
-        HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d   + i * y_dblk_size));
-        HVX_Vector r0_d  = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
-        HVX_Vector r1_d  = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
+        HVX_Vector ds0 = *(const HVX_UVector *) (y0_d   + i * y_dblk_size);
+        HVX_VectorPair ds0_deal = Q6_W_vdeal_VVR(ds0, ds0, -2);
+        HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds0_deal));
+        HVX_Vector vy0_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds0_deal));
+
+        HVX_Vector ds1 = *(const HVX_UVector *) (y1_d   + i * y_dblk_size);
+        HVX_VectorPair ds1_deal = Q6_W_vdeal_VVR(ds1, ds1, -2);
+        HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds1_deal));
+        HVX_Vector vy1_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds1_deal));
+
+        HVX_Vector r0_dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
+        HVX_VectorPair r0_dm_deal = Q6_W_vdeal_VVR(r0_dm, r0_dm, -2);
+        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r0_dm_deal));
+        HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r0_dm_deal));
+
+        HVX_Vector r1_dm = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size);
+        HVX_VectorPair r1_dm_deal = Q6_W_vdeal_VVR(r1_dm, r1_dm, -2);
+        HVX_Vector r1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r1_dm_deal));
+        HVX_Vector r1_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r1_dm_deal));
 
         HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d)));
+        HVX_Vector r0_c0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy0_s)));
+
         HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d)));
+        HVX_Vector r0_c1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy1_s)));
+
         HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
+        HVX_Vector r1_c0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy0_s)));
+
         HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
+        HVX_Vector r1_c1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy1_s)));
 
-        // Zero out unused scales
+        // Zero out unused elements
         HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
         r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd);
+        r0_c0_ms = Q6_V_vand_QV(bmask, r0_c0_ms);
         r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd);
+        r0_c1_ms = Q6_V_vand_QV(bmask, r0_c1_ms);
         r1_c0_dd = Q6_V_vand_QV(bmask, r1_c0_dd);
+        r1_c0_ms = Q6_V_vand_QV(bmask, r1_c0_ms);
         r1_c1_dd = Q6_V_vand_QV(bmask, r1_c1_dd);
+        r1_c1_ms = Q6_V_vand_QV(bmask, r1_c1_ms);
+
         r0_c0_ia = Q6_V_vand_QV(bmask, r0_c0_ia);
         r0_c1_ia = Q6_V_vand_QV(bmask, r0_c1_ia);
         r1_c0_ia = Q6_V_vand_QV(bmask, r1_c0_ia);
@@ -686,10 +1051,15 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
         HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
         HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
 
-        r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
-        r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
-        r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
-        r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
+        HVX_Vector r0_c0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_ms);
+        HVX_Vector r0_c1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_ms);
+        HVX_Vector r1_c0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_ms);
+        HVX_Vector r1_c1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_ms);
+
+        r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa_total, r0_c0_sum));
+        r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa_total, r0_c1_sum));
+        r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa_total, r1_c0_sum));
+        r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa_total, r1_c1_sum));
     }
 
     // Reduce and store results
@@ -700,26 +1070,26 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
     hvx_vec_store_u(s1, 8, r0_r1_c1_sum);  // row0,col1 row1,col1
 }
 
-static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) {
+static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) {
     assert(n % 32 == 0);  // min sub-block size
     assert((unsigned long) vx0 % 128 == 0);
     assert((unsigned long) vy0 % 128 == 0);
 
     const uint32_t qk = QK_Q4_0x4x2 * 4;
 
-    const uint32_t x_dblk_size = 8 * 4 * 2;                                  // 32x __fp16
-    const uint32_t x_qblk_size = qk;                                         // int8
-    const uint32_t x_qrow_size = n;                                          // int8 (not padded)
+    const uint32_t x_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t x_qblk_size = qk / 2;                                      // int4
+    const uint32_t x_qrow_size = n / 2;                                       // int4 (not padded)
 
-    const uint32_t y_dblk_size = 8 * 4 * 2;                                  // 32x __fp16
-    const uint32_t y_qblk_size = qk;                                         // int8
-    const uint32_t y_qrow_size = n;                                          // int8 (not padded)
+    const uint32_t y_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t y_qblk_size = qk;                                          // int8
+    const uint32_t y_qrow_size = n;                                           // int8 (not padded)
 
-    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0);           // quants first
-    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales
+    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0);            // quants first
+    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size);  // then scales
 
-    const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0);              // quants first
-    const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size);    // then scales
+    const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0);               // quants first
+    const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size);     // then scales
 
     // Row sum (sf)
     HVX_Vector r0_sum = Q6_V_vzero();
@@ -729,12 +1099,12 @@ static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo
     // Apply scale to acc and accumulate into the row sum (qf32).
 
     const uint32_t nb   = n / qk;  // num full blocks
-    int32_t        nloe = n % qk;  // num leftover elemements (must be signed)
+    const uint32_t nloe = n % qk;  // num leftover elemements
 
     uint32_t i = 0;
     for (; i < nb; i++) {
         HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q    + i * y_qblk_size);
-        HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size);
 
         HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
 
@@ -751,7 +1121,433 @@ static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo
     // Process leftovers
     if (nloe) {
         HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q    + i * y_qblk_size, nloe);
-        HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
+
+        HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
+
+        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d    + i * y_dblk_size));
+        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
+
+        HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
+
+        // Zero out unused elements
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
+        r0_dd                = Q6_V_vand_QV(bmask, r0_dd);
+        r0_ia                = Q6_V_vand_QV(bmask, r0_ia);
+
+        HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
+
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+    }
+
+    r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
+
+    hvx_vec_store_u(s0, 4, r0_sum);
+}
+
+static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
+                                      const void * restrict vx0, const void * restrict vx1,
+                                      const void * restrict vy0) {
+    assert(n % 32 == 0);  // min sub-block size
+    assert((unsigned long) vx0 % 128 == 0);
+    assert((unsigned long) vx1 % 128 == 0);
+    assert((unsigned long) vy0 % 128 == 0);
+
+    const uint32_t qk = QK_Q4_0x4x2 * 4;
+
+    const uint32_t x_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t x_qblk_size = qk / 2;                                      // int4
+    const uint32_t x_qrow_size = n / 2;                                       // int4 (not padded)
+
+    const uint32_t y_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t y_qblk_size = qk;                                          // int8
+    const uint32_t y_qrow_size = n;                                           // int8 (not padded)
+
+    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0;            // quants first
+    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size;  // then scales
+    const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0;            // quants first
+    const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size;  // then scales
+
+    const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0);               // quants first
+    const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size);     // then scales
+
+    // Row sum (sf)
+    HVX_Vector r0_sum = Q6_V_vzero();
+    HVX_Vector r1_sum = Q6_V_vzero();
+
+    // Multiply and accumulate into int32.
+    // Compute combined scale (fp32).
+    // Apply scale to acc and accumulate into the row sum (qf32).
+
+    const uint32_t nb   = n / qk;  // num full blocks
+    const uint32_t nloe = n % qk;  // num leftover elemements
+
+    uint32_t i = 0;
+    for (; i < nb; i++) {
+        HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q    + i * y_qblk_size);
+        HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size);
+
+        HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
+        HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
+
+        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d    + i * y_dblk_size));
+        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
+        HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
+
+        HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
+        HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
+
+        HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
+        HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
+
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
+    }
+
+    // Process leftovers
+    if (nloe) {
+        HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q    + i * y_qblk_size, nloe);
+        HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
+
+        HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
+        HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe));
+
+        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d    + i * y_dblk_size));
+        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
+        HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
+
+        HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
+        HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
+
+        // Zero out unused elements
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
+        r0_dd                = Q6_V_vand_QV(bmask, r0_dd);
+        r1_dd                = Q6_V_vand_QV(bmask, r1_dd);
+        r0_ia                = Q6_V_vand_QV(bmask, r0_ia);
+        r1_ia                = Q6_V_vand_QV(bmask, r1_ia);
+
+        HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
+        HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
+
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
+    }
+
+    HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
+    hvx_vec_store_u(s0, 8, rsum);
+}
+
+static void vec_dot_q4x4x2_q8x4x2_4x1(const int n, float * restrict s0,
+                                      const void * restrict vx0, const void * restrict vx1,
+                                      const void * restrict vx2, const void * restrict vx3,
+                                      const void * restrict vy0) {
+    assert(n % 32 == 0);  // min sub-block size
+    assert((unsigned long) vx0 % 128 == 0);
+    assert((unsigned long) vx1 % 128 == 0);
+    assert((unsigned long) vx2 % 128 == 0);
+    assert((unsigned long) vx3 % 128 == 0);
+    assert((unsigned long) vy0 % 128 == 0);
+
+    const uint32_t qk = QK_Q4_0x4x2 * 4;
+
+    const uint32_t x_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t x_qblk_size = qk / 2;                                      // int4
+    const uint32_t x_qrow_size = n / 2;                                       // int4 (not padded)
+
+    const uint32_t y_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t y_qblk_size = qk;                                          // int8
+    const uint32_t y_qrow_size = n;                                           // int8 (not padded)
+
+    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0;
+    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size;
+    const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0;
+    const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size;
+    const uint8_t * restrict r2_x_q = ((const uint8_t *) vx2) + 0;
+    const uint8_t * restrict r2_x_d = ((const uint8_t *) vx2) + x_qrow_size;
+    const uint8_t * restrict r3_x_q = ((const uint8_t *) vx3) + 0;
+    const uint8_t * restrict r3_x_d = ((const uint8_t *) vx3) + x_qrow_size;
+
+    const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0);
+    const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size);
+
+    // Row sum (sf)
+    HVX_Vector r0_sum = Q6_V_vzero();
+    HVX_Vector r1_sum = Q6_V_vzero();
+    HVX_Vector r2_sum = Q6_V_vzero();
+    HVX_Vector r3_sum = Q6_V_vzero();
+
+    const uint32_t nb   = n / qk;  // num full blocks
+    const uint32_t nloe = n % qk;  // num leftover elements
+
+    uint32_t i = 0;
+    for (; i < nb; i++) {
+        HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
+        HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r2_q = hvx_vec_load_q4x4x8_full(r2_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r3_q = hvx_vec_load_q4x4x8_full(r3_x_q + i * x_qblk_size);
+
+        HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
+        HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
+        HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r2_q, vy_q));
+        HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r3_q, vy_q));
+
+        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d    + i * y_dblk_size));
+        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
+        HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
+        HVX_Vector r2_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r2_x_d + i * x_dblk_size));
+        HVX_Vector r3_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r3_x_d + i * x_dblk_size));
+
+        HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
+        HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
+        HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d)));
+        HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d)));
+
+        HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
+        HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
+        HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd);
+        HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd);
+
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
+        r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum));
+        r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum));
+    }
+
+    if (nloe) {
+        HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q    + i * y_qblk_size, nloe);
+        HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r2_q = hvx_vec_load_q4x4x8_partial(r2_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r3_q = hvx_vec_load_q4x4x8_partial(r3_x_q + i * x_qblk_size, nloe);
+
+        HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
+        HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe));
+        HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r2_q, vy_q, nloe));
+        HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r3_q, vy_q, nloe));
+
+        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d    + i * y_dblk_size));
+        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
+        HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
+        HVX_Vector r2_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r2_x_d + i * x_dblk_size));
+        HVX_Vector r3_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r3_x_d + i * x_dblk_size));
+
+        HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
+        HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
+        HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d)));
+        HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d)));
+
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
+        r0_dd                = Q6_V_vand_QV(bmask, r0_dd);
+        r1_dd                = Q6_V_vand_QV(bmask, r1_dd);
+        r2_dd                = Q6_V_vand_QV(bmask, r2_dd);
+        r3_dd                = Q6_V_vand_QV(bmask, r3_dd);
+        r0_ia                = Q6_V_vand_QV(bmask, r0_ia);
+        r1_ia                = Q6_V_vand_QV(bmask, r1_ia);
+        r2_ia                = Q6_V_vand_QV(bmask, r2_ia);
+        r3_ia                = Q6_V_vand_QV(bmask, r3_ia);
+
+        HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
+        HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
+        HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd);
+        HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd);
+
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
+        r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum));
+        r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum));
+    }
+
+    HVX_Vector_x4 rsum_in = { .v = { r0_sum, r1_sum, r2_sum, r3_sum } };
+    HVX_Vector rsum = hvx_vec_reduce_sum_f32x4(rsum_in);
+    hvx_vec_store_u(s0, 16, rsum);
+}
+
+
+static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * restrict s1,
+                                        const void * restrict vx0, const void * restrict vx1,
+                                        const void * restrict vy0, const void * restrict vy1) {
+    assert(n % 32 == 0);
+    assert((unsigned long) vx0 % 128 == 0);
+    assert((unsigned long) vx1 % 128 == 0);
+    assert((unsigned long) vy0 % 128 == 0);
+    assert((unsigned long) vy1 % 128 == 0);
+
+    const uint32_t qk = QK_Q4_0x4x2 * 4;
+
+    const uint32_t x_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t x_qblk_size = qk / 2;                                      // int4
+    const uint32_t x_qrow_size = n / 2;                                       // int4 (not padded)
+
+    const uint32_t y_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t y_qblk_size = qk;                                          // int8
+    const uint32_t y_qrow_size = n;                                           // int8 (not padded)
+
+    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0;            // quants first
+    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size;  // then scales
+    const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0;            // quants first
+    const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size;  // then scales
+
+    const uint8_t * restrict y0_q = ((const uint8_t *) vy0) + 0;              // quants first
+    const uint8_t * restrict y0_d = ((const uint8_t *) vy0) + y_qrow_size;    // then scales
+    const uint8_t * restrict y1_q = ((const uint8_t *) vy1) + 0;              // quants first
+    const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size;    // then scales
+
+    // Row sums (sf) - 4 accumulators for 2×2 tile
+    HVX_Vector r0_c0_sum = Q6_V_vzero();
+    HVX_Vector r0_c1_sum = Q6_V_vzero();
+    HVX_Vector r1_c0_sum = Q6_V_vzero();
+    HVX_Vector r1_c1_sum = Q6_V_vzero();
+
+    const uint32_t nb   = n / qk;  // num full blocks
+    const uint32_t nloe = n % qk;  // num leftover elements
+
+    uint32_t i = 0;
+    for (; i < nb; i++) {
+        // Load src1 columns (reused across both src0 rows)
+        HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size);
+        HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size);
+
+        // Load src0 rows (reused across both src1 columns)
+        HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size);
+
+        // Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
+        HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
+        HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy1_q));
+        HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy0_q));
+        HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q));
+
+        // Load scales
+        HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d   + i * y_dblk_size));
+        HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d   + i * y_dblk_size));
+        HVX_Vector r0_d  = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
+        HVX_Vector r1_d  = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
+
+        // Compute combined scales
+        HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d)));
+        HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d)));
+        HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
+        HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
+
+        // Apply scales and accumulate
+        HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
+        HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
+        HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
+        HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
+
+        r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
+        r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
+        r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
+        r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
+    }
+
+    // Process leftovers
+    if (nloe) {
+        HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q   + i * y_qblk_size, nloe);
+        HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q   + i * y_qblk_size, nloe);
+        HVX_Vector_x8 r0_q  = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r1_q  = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
+
+        HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe));
+        HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe));
+        HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe));
+        HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe));
+
+        HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d   + i * y_dblk_size));
+        HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d   + i * y_dblk_size));
+        HVX_Vector r0_d  = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
+        HVX_Vector r1_d  = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
+
+        HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d)));
+        HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d)));
+        HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
+        HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
+
+        // Zero out unused scales
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
+        r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd);
+        r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd);
+        r1_c0_dd = Q6_V_vand_QV(bmask, r1_c0_dd);
+        r1_c1_dd = Q6_V_vand_QV(bmask, r1_c1_dd);
+        r0_c0_ia = Q6_V_vand_QV(bmask, r0_c0_ia);
+        r0_c1_ia = Q6_V_vand_QV(bmask, r0_c1_ia);
+        r1_c0_ia = Q6_V_vand_QV(bmask, r1_c0_ia);
+        r1_c1_ia = Q6_V_vand_QV(bmask, r1_c1_ia);
+
+        HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
+        HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
+        HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
+        HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
+
+        r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
+        r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
+        r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
+        r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
+    }
+
+    // Reduce and store results
+    HVX_Vector r0_r1_c0_sum = hvx_vec_reduce_sum_f32x2(r0_c0_sum, r1_c0_sum);
+    HVX_Vector r0_r1_c1_sum = hvx_vec_reduce_sum_f32x2(r0_c1_sum, r1_c1_sum);
+
+    hvx_vec_store_u(s0, 8, r0_r1_c0_sum);  // row0,col0 row1,col0
+    hvx_vec_store_u(s1, 8, r0_r1_c1_sum);  // row0,col1 row1,col1
+}
+
+static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) {
+    assert(n % 32 == 0);  // min sub-block size
+    assert((unsigned long) vx0 % 128 == 0);
+    assert((unsigned long) vy0 % 128 == 0);
+
+    const uint32_t qk = QK_Q4_0x4x2 * 4;
+
+    const uint32_t x_dblk_size = 8 * 4 * 2;                                  // 32x __fp16
+    const uint32_t x_qblk_size = qk;                                         // int8
+    const uint32_t x_qrow_size = n;                                          // int8 (not padded)
+
+    const uint32_t y_dblk_size = 8 * 4 * 2;                                  // 32x __fp16
+    const uint32_t y_qblk_size = qk;                                         // int8
+    const uint32_t y_qrow_size = n;                                          // int8 (not padded)
+
+    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0);           // quants first
+    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales
+
+    const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0);              // quants first
+    const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size);    // then scales
+
+    // Row sum (sf)
+    HVX_Vector r0_sum = Q6_V_vzero();
+
+    // Multiply and accumulate into int32.
+    // Compute combined scale (fp32).
+    // Apply scale to acc and accumulate into the row sum (qf32).
+
+    const uint32_t nb   = n / qk;  // num full blocks
+    int32_t        nloe = n % qk;  // num leftover elemements (must be signed)
+
+    uint32_t i = 0;
+    for (; i < nb; i++) {
+        HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q    + i * y_qblk_size);
+        HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size);
+
+        HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
+
+        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d    + i * y_dblk_size));
+        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
+
+        HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
+
+        HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
+
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+    }
+
+    // Process leftovers
+    if (nloe) {
+        HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q    + i * y_qblk_size, nloe);
+        HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
 
         HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
 
@@ -804,10 +1600,109 @@ static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0,
     // Row sum (qf32)
     HVX_Vector r0_sum = Q6_V_vzero();
     HVX_Vector r1_sum = Q6_V_vzero();
-
-    // Multiply and accumulate into int32.
-    // Compute combined scale (fp32).
-    // Apply scale to acc and accumulate into the row sum (qf32).
+
+    // Multiply and accumulate into int32.
+    // Compute combined scale (fp32).
+    // Apply scale to acc and accumulate into the row sum (qf32).
+
+    const uint32_t nb   = n / qk;  // num full blocks
+    int32_t        nloe = n % qk;  // num leftover elemements (must be signed)
+
+    uint32_t i = 0;
+    for (; i < nb; i++) {
+        HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q    + i * y_qblk_size);
+        HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_full(r1_x_q + i * x_qblk_size);
+
+        HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
+        HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
+
+        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d    + i * y_dblk_size));
+        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
+        HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
+
+        HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
+        HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
+
+        HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
+        HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
+
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
+    }
+
+    // Process leftovers
+    if (nloe) {
+        HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q    + i * y_qblk_size, nloe);
+        HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
+
+        HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
+        HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe));
+
+        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
+        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
+        HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
+
+        HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
+        HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
+
+        // Zero out unused elements
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
+        r0_dd                = Q6_V_vand_QV(bmask, r0_dd);
+        r1_dd                = Q6_V_vand_QV(bmask, r1_dd);
+        r0_ia                = Q6_V_vand_QV(bmask, r0_ia);
+        r1_ia                = Q6_V_vand_QV(bmask, r1_ia);
+
+        HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
+        HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
+
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
+    }
+
+    HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
+    hvx_vec_store_u(s0, 8, rsum);
+}
+
+static void vec_dot_q8x4x2_q8x4x2_4x1(const int n, float * restrict s0,
+                                      const void * restrict vx0, const void * restrict vx1,
+                                      const void * restrict vx2, const void * restrict vx3,
+                                      const void * restrict vy0) {
+    assert(n % 32 == 0);  // min sub-block size
+    assert((unsigned long) vx0 % 128 == 0);
+    assert((unsigned long) vx1 % 128 == 0);
+    assert((unsigned long) vx2 % 128 == 0);
+    assert((unsigned long) vx3 % 128 == 0);
+    assert((unsigned long) vy0 % 128 == 0);
+
+    const uint32_t qk = QK_Q4_0x4x2 * 4;
+
+    const uint32_t x_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t x_qblk_size = qk;                                          // int8
+    const uint32_t x_qrow_size = n;                                           // int8 (not padded)
+
+    const uint32_t y_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t y_qblk_size = qk;                                          // int8
+    const uint32_t y_qrow_size = n;                                           // int8 (not padded)
+
+    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0;            // quants first
+    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size;  // then scales
+    const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0;            // quants first
+    const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size;  // then scales
+    const uint8_t * restrict r2_x_q = ((const uint8_t *) vx2) + 0;            // quants first
+    const uint8_t * restrict r2_x_d = ((const uint8_t *) vx2) + x_qrow_size;  // then scales
+    const uint8_t * restrict r3_x_q = ((const uint8_t *) vx3) + 0;            // quants first
+    const uint8_t * restrict r3_x_d = ((const uint8_t *) vx3) + x_qrow_size;  // then scales
+
+    const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0);               // quants first
+    const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size);     // then scales
+
+    // Row sum (qf32)
+    HVX_Vector r0_sum = Q6_V_vzero();
+    HVX_Vector r1_sum = Q6_V_vzero();
+    HVX_Vector r2_sum = Q6_V_vzero();
+    HVX_Vector r3_sum = Q6_V_vzero();
 
     const uint32_t nb   = n / qk;  // num full blocks
     int32_t        nloe = n % qk;  // num leftover elemements (must be signed)
@@ -817,58 +1712,86 @@ static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0,
         HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q    + i * y_qblk_size);
         HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size);
         HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_full(r1_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r2_q = hvx_vec_load_q8x4x8_full(r2_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r3_q = hvx_vec_load_q8x4x8_full(r3_x_q + i * x_qblk_size);
 
         HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
         HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
+        HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r2_q, vy_q));
+        HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r3_q, vy_q));
 
         HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d    + i * y_dblk_size));
         HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
         HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
+        HVX_Vector r2_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r2_x_d + i * x_dblk_size));
+        HVX_Vector r3_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r3_x_d + i * x_dblk_size));
 
         HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
         HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
+        HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d)));
+        HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d)));
 
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
         HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
+        HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd);
+        HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd);
 
         r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
         r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
+        r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum));
+        r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum));
     }
 
-    // Process leftovers
     if (nloe) {
         HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q    + i * y_qblk_size, nloe);
         HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
         HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r2_q = hvx_vec_load_q8x4x8_partial(r2_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r3_q = hvx_vec_load_q8x4x8_partial(r3_x_q + i * x_qblk_size, nloe);
 
         HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
         HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe));
+        HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r2_q, vy_q, nloe));
+        HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r3_q, vy_q, nloe));
 
-        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
+        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d    + i * y_dblk_size));
         HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
         HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
+        HVX_Vector r2_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r2_x_d + i * x_dblk_size));
+        HVX_Vector r3_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r3_x_d + i * x_dblk_size));
 
         HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
         HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
+        HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d)));
+        HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d)));
 
-        // Zero out unused elements
         HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
         r0_dd                = Q6_V_vand_QV(bmask, r0_dd);
         r1_dd                = Q6_V_vand_QV(bmask, r1_dd);
+        r2_dd                = Q6_V_vand_QV(bmask, r2_dd);
+        r3_dd                = Q6_V_vand_QV(bmask, r3_dd);
         r0_ia                = Q6_V_vand_QV(bmask, r0_ia);
         r1_ia                = Q6_V_vand_QV(bmask, r1_ia);
+        r2_ia                = Q6_V_vand_QV(bmask, r2_ia);
+        r3_ia                = Q6_V_vand_QV(bmask, r3_ia);
 
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
         HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
+        HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd);
+        HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd);
 
         r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
         r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
+        r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum));
+        r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum));
     }
 
-    HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
-    hvx_vec_store_u(s0, 8, rsum);
+    HVX_Vector_x4 rsum_in = { .v = { r0_sum, r1_sum, r2_sum, r3_sum } };
+    HVX_Vector rsum = hvx_vec_reduce_sum_f32x4(rsum_in);
+    hvx_vec_store_u(s0, 16, rsum);
 }
 
+
 static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * restrict s1,
                                         const void * restrict vx0, const void * restrict vx1,
                                         const void * restrict vy0, const void * restrict vy1) {
@@ -1163,6 +2086,135 @@ static void vec_dot_iq4nlx4x2_q8x4x2_2x1(const int n,
     hvx_vec_store_u(s0, 8, rsum);
 }
 
+static void vec_dot_iq4nlx4x2_q8x4x2_4x1(const int n,
+                                         float * restrict s0,
+                                         const void * restrict vx0,
+                                         const void * restrict vx1,
+                                         const void * restrict vx2,
+                                         const void * restrict vx3,
+                                         const void * restrict vy0) {
+    assert(n % 32 == 0);
+    assert((unsigned long) vx0 % 128 == 0);
+    assert((unsigned long) vx1 % 128 == 0);
+    assert((unsigned long) vx2 % 128 == 0);
+    assert((unsigned long) vx3 % 128 == 0);
+    assert((unsigned long) vy0 % 128 == 0);
+
+    const uint32_t qk = QK_Q4_0x4x2 * 4;
+
+    const uint32_t x_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t x_qblk_size = qk / 2;                                      // int4
+    const uint32_t x_qrow_size = n / 2;                                       // int4 (not padded)
+
+    const uint32_t y_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t y_qblk_size = qk;                                          // int8
+    const uint32_t y_qrow_size = n;                                           // int8 (not padded)
+
+    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0;            // quants first
+    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size;  // then scales
+    const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0;            // quants first
+    const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size;  // then scales
+    const uint8_t * restrict r2_x_q = ((const uint8_t *) vx2) + 0;            // quants first
+    const uint8_t * restrict r2_x_d = ((const uint8_t *) vx2) + x_qrow_size;  // then scales
+    const uint8_t * restrict r3_x_q = ((const uint8_t *) vx3) + 0;            // quants first
+    const uint8_t * restrict r3_x_d = ((const uint8_t *) vx3) + x_qrow_size;  // then scales
+
+    const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0);               // quants first
+    const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size);     // then scales
+
+    HVX_Vector r0_sum = Q6_V_vzero();
+    HVX_Vector r1_sum = Q6_V_vzero();
+    HVX_Vector r2_sum = Q6_V_vzero();
+    HVX_Vector r3_sum = Q6_V_vzero();
+
+    const uint32_t nb   = n / qk;
+    const uint32_t nloe = n % qk;
+
+    uint32_t i = 0;
+    for (; i < nb; i++) {
+        HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
+        HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_full(r0_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_full(r1_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r2_q = hvx_vec_load_iq4nlx4x8_full(r2_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r3_q = hvx_vec_load_iq4nlx4x8_full(r3_x_q + i * x_qblk_size);
+
+        HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
+        HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
+        HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r2_q, vy_q));
+        HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r3_q, vy_q));
+
+        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
+        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
+        HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
+        HVX_Vector r2_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r2_x_d + i * x_dblk_size));
+        HVX_Vector r3_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r3_x_d + i * x_dblk_size));
+
+        HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
+        HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
+        HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d)));
+        HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d)));
+
+        HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
+        HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
+        HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd);
+        HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd);
+
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
+        r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum));
+        r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum));
+    }
+
+    if (nloe) {
+        HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
+        HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_partial(r0_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_partial(r1_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r2_q = hvx_vec_load_iq4nlx4x8_partial(r2_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r3_q = hvx_vec_load_iq4nlx4x8_partial(r3_x_q + i * x_qblk_size, nloe);
+
+        HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
+        HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe));
+        HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r2_q, vy_q, nloe));
+        HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r3_q, vy_q, nloe));
+
+        HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
+        HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
+        HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
+        HVX_Vector r2_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r2_x_d + i * x_dblk_size));
+        HVX_Vector r3_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r3_x_d + i * x_dblk_size));
+
+        HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
+        HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
+        HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d)));
+        HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d)));
+
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
+        r0_dd                = Q6_V_vand_QV(bmask, r0_dd);
+        r1_dd                = Q6_V_vand_QV(bmask, r1_dd);
+        r2_dd                = Q6_V_vand_QV(bmask, r2_dd);
+        r3_dd                = Q6_V_vand_QV(bmask, r3_dd);
+        r0_ia                = Q6_V_vand_QV(bmask, r0_ia);
+        r1_ia                = Q6_V_vand_QV(bmask, r1_ia);
+        r2_ia                = Q6_V_vand_QV(bmask, r2_ia);
+        r3_ia                = Q6_V_vand_QV(bmask, r3_ia);
+
+        HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
+        HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
+        HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd);
+        HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd);
+
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
+        r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum));
+        r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum));
+    }
+
+    HVX_Vector_x4 rsum_in = { .v = { r0_sum, r1_sum, r2_sum, r3_sum } };
+    HVX_Vector rsum = hvx_vec_reduce_sum_f32x4(rsum_in);
+    hvx_vec_store_u(s0, 16, rsum);
+}
+
+
 static void vec_dot_iq4nlx4x2_q8x4x2_2x2(const int n,
                                          float * restrict s0,
                                          float * restrict s1,
@@ -1286,33 +2338,144 @@ static void vec_dot_iq4nlx4x2_q8x4x2_2x2(const int n,
     hvx_vec_store_u(&s1[0], 8, r0_r1_c1_sum);
 }
 
-static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) {
+static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) {
+    assert(n % 32 == 0);  // min sub-block size
+    assert((unsigned long) vx0 % 128 == 0);
+    assert((unsigned long) vy0 % 128 == 0);
+
+    const uint32_t qk = QK_MXFP4x4x2 * 4;
+
+    const uint32_t x_dblk_size = 8 * 4 * 1;                                  // 32x e8m0
+    const uint32_t x_qblk_size = qk / 2;                                     // fp4
+    const uint32_t x_qrow_size = n / 2;                                      // fp4 (not padded)
+
+    const uint32_t y_dblk_size = 8 * 4 * 2;                                  // 32x __fp16
+    const uint32_t y_qblk_size = qk;                                         // int8
+    const uint32_t y_qrow_size = n;                                          // int8 (not padded)
+
+    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0);           // quants first
+    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales
+
+    const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0);              // quants first
+    const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size);    // then scales
+
+    // Row sum (sf)
+    HVX_Vector r0_sum = Q6_V_vzero();
+
+    // Multiply and accumulate into int32.
+    // Compute combined scale (fp32).
+    // Apply scale to acc and accumulate into the row sum (qf32).
+
+    const uint32_t nb   = n / qk;  // num full blocks
+    int32_t        nloe = n % qk;  // num leftover elemements (must be signed)
+
+    uint32_t i = 0;
+    for (; i < nb; i++) {
+        HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(   y_q    + i * y_qblk_size);
+        HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size);
+
+        HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
+
+        HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size);
+        HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
+
+        // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving
+        HVX_Vector half = Q6_Vh_vsplat_R(0x3800);  // 0.5 in fp16
+        vy_d            = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half));
+        vy_d            = Q6_Vsf_equals_Vqf32(vy_d);
+
+        // Convert rX_d scales from e8m0 to fp32
+        // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ...
+        // Left shift with zero fill to create FP32
+        // FIXME: might need to handle zero as a special case (see ggml-cpu code)
+        HVX_Vector expand    = *(const HVX_Vector *) expand_x32_e8m0;
+        HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff);
+        r0_d                 = Q6_V_vdelta_VV(r0_d, expand);
+        r0_d                 = Q6_V_vand_VV(r0_d, e8m0_mask);
+        r0_d                 = Q6_Vw_vasl_VwR(r0_d, 23);
+
+        HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d));
+
+        HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
+
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+    }
+
+    // Process leftovers
+    if (nloe) {
+        HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(   y_q    + i * y_qblk_size, nloe);
+        HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
+
+        HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
+
+        HVX_Vector vy_d = *(const HVX_UVector *) (y_d    + i * y_dblk_size);
+        HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
+
+        // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving
+        HVX_Vector half = Q6_Vh_vsplat_R(0x3800);  // 0.5 in fp16
+        vy_d            = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half));
+        vy_d            = Q6_Vsf_equals_Vqf32(vy_d);
+
+        // Convert rX_d scales from e8m0 to fp32
+        // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ...
+        // Left shift with zero fill to create FP32
+        // FIXME: might need to handle zero as a special case (see ggml-cpu code)
+        HVX_Vector expand    = *(const HVX_Vector *) expand_x32_e8m0;
+        HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff);
+        r0_d                 = Q6_V_vdelta_VV(r0_d, expand);
+        r0_d                 = Q6_V_vand_VV(r0_d, e8m0_mask);
+        r0_d                 = Q6_Vw_vasl_VwR(r0_d, 23);
+
+        HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d));
+
+        // Zero-out unused scales
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
+        r0_dd                = Q6_V_vand_QV(bmask, r0_dd);
+        r0_ia                = Q6_V_vand_QV(bmask, r0_ia);
+
+        HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
+
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+    }
+
+    r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
+
+    hvx_vec_store_u(s0, 4, r0_sum);
+}
+
+static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
+                                      const void * restrict vx0, const void * restrict vx1,
+                                      const void * restrict vy0) {
     assert(n % 32 == 0);  // min sub-block size
     assert((unsigned long) vx0 % 128 == 0);
+    assert((unsigned long) vx1 % 128 == 0);
     assert((unsigned long) vy0 % 128 == 0);
 
     const uint32_t qk = QK_MXFP4x4x2 * 4;
 
-    const uint32_t x_dblk_size = 8 * 4 * 1;                                  // 32x e8m0
-    const uint32_t x_qblk_size = qk / 2;                                     // fp4
-    const uint32_t x_qrow_size = n / 2;                                      // fp4 (not padded)
+    const uint32_t x_dblk_size = 8 * 4 * 1;                                   // 32x e8m0
+    const uint32_t x_qblk_size = qk / 2;                                      // fp4
+    const uint32_t x_qrow_size = n / 2;                                       // fp4 (not padded)
 
-    const uint32_t y_dblk_size = 8 * 4 * 2;                                  // 32x __fp16
-    const uint32_t y_qblk_size = qk;                                         // int8
-    const uint32_t y_qrow_size = n;                                          // int8 (not padded)
+    const uint32_t y_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t y_qblk_size = qk;                                          // int8
+    const uint32_t y_qrow_size = n;                                           // int8 (not padded)
 
-    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0);           // quants first
-    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales
+    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0;            // quants first
+    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size;  // then scales
+    const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0;            // quants first
+    const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size;  // then scales
 
-    const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0);              // quants first
-    const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size);    // then scales
+    const uint8_t * restrict y_q = ((const uint8_t *) vy0) + 0;               // quants first
+    const uint8_t * restrict y_d = ((const uint8_t *) vy0) + y_qrow_size;     // then scales
 
     // Row sum (sf)
     HVX_Vector r0_sum = Q6_V_vzero();
+    HVX_Vector r1_sum = Q6_V_vzero();
 
     // Multiply and accumulate into int32.
     // Compute combined scale (fp32).
-    // Apply scale to acc and accumulate into the row sum (qf32).
+    // Apply scale to acc and accumulate into the row sum (f32).
 
     const uint32_t nb   = n / qk;  // num full blocks
     int32_t        nloe = n % qk;  // num leftover elemements (must be signed)
@@ -1321,11 +2484,14 @@ static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const
     for (; i < nb; i++) {
         HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(   y_q    + i * y_qblk_size);
         HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_full(r1_x_q + i * x_qblk_size);
 
         HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
+        HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
 
         HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size);
         HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
+        HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size);
 
         // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving
         HVX_Vector half = Q6_Vh_vsplat_R(0x3800);  // 0.5 in fp16
@@ -1341,23 +2507,32 @@ static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const
         r0_d                 = Q6_V_vdelta_VV(r0_d, expand);
         r0_d                 = Q6_V_vand_VV(r0_d, e8m0_mask);
         r0_d                 = Q6_Vw_vasl_VwR(r0_d, 23);
+        r1_d                 = Q6_V_vdelta_VV(r1_d, expand);
+        r1_d                 = Q6_V_vand_VV(r1_d, e8m0_mask);
+        r1_d                 = Q6_Vw_vasl_VwR(r1_d, 23);
 
         HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d));
+        HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy_d));
 
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
+        HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
 
         r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
     }
 
     // Process leftovers
     if (nloe) {
         HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(   y_q    + i * y_qblk_size, nloe);
         HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
 
-        HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
+        HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
+        HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
 
         HVX_Vector vy_d = *(const HVX_UVector *) (y_d    + i * y_dblk_size);
         HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
+        HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size);
 
         // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving
         HVX_Vector half = Q6_Vh_vsplat_R(0x3800);  // 0.5 in fp16
@@ -1373,30 +2548,40 @@ static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const
         r0_d                 = Q6_V_vdelta_VV(r0_d, expand);
         r0_d                 = Q6_V_vand_VV(r0_d, e8m0_mask);
         r0_d                 = Q6_Vw_vasl_VwR(r0_d, 23);
+        r1_d                 = Q6_V_vdelta_VV(r1_d, expand);
+        r1_d                 = Q6_V_vand_VV(r1_d, e8m0_mask);
+        r1_d                 = Q6_Vw_vasl_VwR(r1_d, 23);
 
         HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d));
+        HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy_d));
 
-        // Zero-out unused scales
+        // Zero-out unused values
         HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
         r0_dd                = Q6_V_vand_QV(bmask, r0_dd);
+        r1_dd                = Q6_V_vand_QV(bmask, r1_dd);
         r0_ia                = Q6_V_vand_QV(bmask, r0_ia);
+        r1_ia                = Q6_V_vand_QV(bmask, r1_ia);
 
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
+        HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
 
         r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
     }
 
-    r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
-
-    hvx_vec_store_u(s0, 4, r0_sum);
+    HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
+    hvx_vec_store_u(s0, 8, rsum);
 }
 
-static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
+static void vec_dot_mxfp4x4x2_q8x4x2_4x1(const int n, float * restrict s0,
                                       const void * restrict vx0, const void * restrict vx1,
+                                      const void * restrict vx2, const void * restrict vx3,
                                       const void * restrict vy0) {
     assert(n % 32 == 0);  // min sub-block size
     assert((unsigned long) vx0 % 128 == 0);
     assert((unsigned long) vx1 % 128 == 0);
+    assert((unsigned long) vx2 % 128 == 0);
+    assert((unsigned long) vx3 % 128 == 0);
     assert((unsigned long) vy0 % 128 == 0);
 
     const uint32_t qk = QK_MXFP4x4x2 * 4;
@@ -1413,17 +2598,19 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
     const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size;  // then scales
     const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0;            // quants first
     const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size;  // then scales
+    const uint8_t * restrict r2_x_q = ((const uint8_t *) vx2) + 0;            // quants first
+    const uint8_t * restrict r2_x_d = ((const uint8_t *) vx2) + x_qrow_size;  // then scales
+    const uint8_t * restrict r3_x_q = ((const uint8_t *) vx3) + 0;            // quants first
+    const uint8_t * restrict r3_x_d = ((const uint8_t *) vx3) + x_qrow_size;  // then scales
 
     const uint8_t * restrict y_q = ((const uint8_t *) vy0) + 0;               // quants first
-    const uint8_t * restrict y_d = ((const uint8_t *) vy0) + y_qrow_size;     // then scales
+    const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size);     // then scales
 
     // Row sum (sf)
     HVX_Vector r0_sum = Q6_V_vzero();
     HVX_Vector r1_sum = Q6_V_vzero();
-
-    // Multiply and accumulate into int32.
-    // Compute combined scale (fp32).
-    // Apply scale to acc and accumulate into the row sum (f32).
+    HVX_Vector r2_sum = Q6_V_vzero();
+    HVX_Vector r3_sum = Q6_V_vzero();
 
     const uint32_t nb   = n / qk;  // num full blocks
     int32_t        nloe = n % qk;  // num leftover elemements (must be signed)
@@ -1433,13 +2620,19 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
         HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(   y_q    + i * y_qblk_size);
         HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size);
         HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_full(r1_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r2_q = hvx_vec_load_mxfp4x4x8_full(r2_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r3_q = hvx_vec_load_mxfp4x4x8_full(r3_x_q + i * x_qblk_size);
 
         HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
         HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
+        HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r2_q, vy_q));
+        HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r3_q, vy_q));
 
-        HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size);
+        HVX_Vector vy_d = *(const HVX_UVector *) (y_d    + i * y_dblk_size);
         HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
         HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size);
+        HVX_Vector r2_d = *(const HVX_UVector *) (r2_x_d + i * x_dblk_size);
+        HVX_Vector r3_d = *(const HVX_UVector *) (r3_x_d + i * x_dblk_size);
 
         // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving
         HVX_Vector half = Q6_Vh_vsplat_R(0x3800);  // 0.5 in fp16
@@ -1447,9 +2640,6 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
         vy_d            = Q6_Vsf_equals_Vqf32(vy_d);
 
         // Convert rX_d scales from e8m0 to fp32
-        // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ...
-        // Left shift with zero fill to create FP32
-        // FIXME: might need to handle zero as a special case (see ggml-cpu code)
         HVX_Vector expand    = *(const HVX_Vector *) expand_x32_e8m0;
         HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff);
         r0_d                 = Q6_V_vdelta_VV(r0_d, expand);
@@ -1458,29 +2648,46 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
         r1_d                 = Q6_V_vdelta_VV(r1_d, expand);
         r1_d                 = Q6_V_vand_VV(r1_d, e8m0_mask);
         r1_d                 = Q6_Vw_vasl_VwR(r1_d, 23);
+        r2_d                 = Q6_V_vdelta_VV(r2_d, expand);
+        r2_d                 = Q6_V_vand_VV(r2_d, e8m0_mask);
+        r2_d                 = Q6_Vw_vasl_VwR(r2_d, 23);
+        r3_d                 = Q6_V_vdelta_VV(r3_d, expand);
+        r3_d                 = Q6_V_vand_VV(r3_d, e8m0_mask);
+        r3_d                 = Q6_Vw_vasl_VwR(r3_d, 23);
 
         HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d));
         HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy_d));
+        HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r2_d, vy_d));
+        HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r3_d, vy_d));
 
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
         HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
+        HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd);
+        HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd);
 
         r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
         r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
+        r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum));
+        r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum));
     }
 
-    // Process leftovers
     if (nloe) {
         HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(   y_q    + i * y_qblk_size, nloe);
         HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
         HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r2_q = hvx_vec_load_mxfp4x4x8_partial(r2_x_q + i * x_qblk_size, nloe);
+        HVX_Vector_x8 r3_q = hvx_vec_load_mxfp4x4x8_partial(r3_x_q + i * x_qblk_size, nloe);
 
         HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
         HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
+        HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r2_q, vy_q));
+        HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r3_q, vy_q));
 
         HVX_Vector vy_d = *(const HVX_UVector *) (y_d    + i * y_dblk_size);
         HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
         HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size);
+        HVX_Vector r2_d = *(const HVX_UVector *) (r2_x_d + i * x_dblk_size);
+        HVX_Vector r3_d = *(const HVX_UVector *) (r3_x_d + i * x_dblk_size);
 
         // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving
         HVX_Vector half = Q6_Vh_vsplat_R(0x3800);  // 0.5 in fp16
@@ -1488,9 +2695,6 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
         vy_d            = Q6_Vsf_equals_Vqf32(vy_d);
 
         // Convert rX_d scales from e8m0 to fp32
-        // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ...
-        // Left shift with zero fill to create FP32
-        // FIXME: might need to handle zero as a special case (see ggml-cpu code)
         HVX_Vector expand    = *(const HVX_Vector *) expand_x32_e8m0;
         HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff);
         r0_d                 = Q6_V_vdelta_VV(r0_d, expand);
@@ -1499,28 +2703,46 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
         r1_d                 = Q6_V_vdelta_VV(r1_d, expand);
         r1_d                 = Q6_V_vand_VV(r1_d, e8m0_mask);
         r1_d                 = Q6_Vw_vasl_VwR(r1_d, 23);
+        r2_d                 = Q6_V_vdelta_VV(r2_d, expand);
+        r2_d                 = Q6_V_vand_VV(r2_d, e8m0_mask);
+        r2_d                 = Q6_Vw_vasl_VwR(r2_d, 23);
+        r3_d                 = Q6_V_vdelta_VV(r3_d, expand);
+        r3_d                 = Q6_V_vand_VV(r3_d, e8m0_mask);
+        r3_d                 = Q6_Vw_vasl_VwR(r3_d, 23);
 
         HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d));
         HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy_d));
+        HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r2_d, vy_d));
+        HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r3_d, vy_d));
 
         // Zero-out unused values
         HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
         r0_dd                = Q6_V_vand_QV(bmask, r0_dd);
         r1_dd                = Q6_V_vand_QV(bmask, r1_dd);
+        r2_dd                = Q6_V_vand_QV(bmask, r2_dd);
+        r3_dd                = Q6_V_vand_QV(bmask, r3_dd);
         r0_ia                = Q6_V_vand_QV(bmask, r0_ia);
         r1_ia                = Q6_V_vand_QV(bmask, r1_ia);
+        r2_ia                = Q6_V_vand_QV(bmask, r2_ia);
+        r3_ia                = Q6_V_vand_QV(bmask, r3_ia);
 
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
         HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
+        HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd);
+        HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd);
 
         r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
         r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
+        r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum));
+        r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum));
     }
 
-    HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
-    hvx_vec_store_u(s0, 8, rsum);
+    HVX_Vector_x4 rsum_in = { .v = { r0_sum, r1_sum, r2_sum, r3_sum } };
+    HVX_Vector rsum = hvx_vec_reduce_sum_f32x4(rsum_in);
+    hvx_vec_store_u(s0, 16, rsum);
 }
 
+
 static void vec_dot_mxfp4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * restrict s1,
                                         const void * restrict vx0, const void * restrict vx1,
                                         const void * restrict vy0, const void * restrict vy1) {
@@ -2138,7 +3360,6 @@ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) {
 
     const uint32_t src0_start_row  = src0_nrows_per_thread * ith;
     const uint32_t src0_end_row    = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
-    const uint32_t src0_end_row_x2 = src0_start_row + ((src0_end_row - src0_start_row) & ~1U);
 
     // no work for this thread
     if (src0_start_row >= src0_end_row) {
@@ -2168,39 +3389,89 @@ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) {
     const uint8_t * restrict src1_col = (const uint8_t *) src1_data;
     float * restrict dst_col          = (float *) dst->data;
 
-    // Prefill spad with 2x src0 rows
-    #pragma unroll(2)
-    for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
-        const uint32_t is0 = (ir0 - src0_start_row);
-        if (is0 >= MM_SPAD_SRC0_NROWS) {
-            break;
+    if (mmctx->vec_dot_4x1 != NULL) {
+        const uint32_t src0_end_row_x4 = src0_start_row + ((src0_end_row - src0_start_row) & ~3U);
+
+        // Prefill spad with 4x src0 rows
+        #pragma unroll(4)
+        for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x4; ir0 += 4) {
+            const uint32_t is0 = (ir0 - src0_start_row);
+            if (is0 >= MM_SPAD_SRC0_NROWS) {
+                break;
+            }
+            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size),
+                           src0_stride, src0_row_size, 4);
         }
-        dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size),
-                       src0_stride, src0_row_size, 2);
-    }
 
-    // Process src0 rows
-    for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
-        const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
-        mmctx->vec_dot_2x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, src1_col);
+        // Process src0 rows
+        for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x4; ir0 += 4) {
+            const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
+            mmctx->vec_dot_4x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, ss0 + 2 * src0_stride, ss0 + 3 * src0_stride, src1_col);
 
-        // Prefetch next (n + spad_nrows) row
-        const uint32_t pr0 = (ir0 + MM_SPAD_SRC0_NROWS);
-        const uint32_t is0 = (pr0 - src0_start_row) % MM_SPAD_SRC0_NROWS;
-        if (pr0 < src0_end_row_x2) {
-            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + pr0 * src0_row_size),
+            // Prefetch next (n + spad_nrows) row
+            const uint32_t pr0 = (ir0 + MM_SPAD_SRC0_NROWS);
+            const uint32_t is0 = (pr0 - src0_start_row) % MM_SPAD_SRC0_NROWS;
+            if (pr0 < src0_end_row_x4) {
+                dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + pr0 * src0_row_size),
+                               src0_stride, src0_row_size, 4);
+            }
+        }
+
+        // Process leftovers
+        uint32_t ir0 = src0_end_row_x4;
+        if (ir0 + 2 <= src0_end_row) {
+            const uint32_t is0 = (ir0 - src0_start_row) % MM_SPAD_SRC0_NROWS;
+            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size),
                            src0_stride, src0_row_size, 2);
+            const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
+            mmctx->vec_dot_2x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, src1_col);
+            ir0 += 2;
         }
-    }
+        if (ir0 < src0_end_row) {
+            const uint32_t is0 = (ir0 - src0_start_row) % MM_SPAD_SRC0_NROWS;
+            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size),
+                           src0_stride, src0_row_size, 1);
+            const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
+            mmctx->vec_dot_1x1(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col);
+            ir0 += 1;
+        }
+    } else {
+        const uint32_t src0_end_row_x2 = src0_start_row + ((src0_end_row - src0_start_row) & ~1U);
 
-    // Process the last row (if any)
-    if (src0_end_row != src0_end_row_x2) {
-        const uint32_t ir0 = src0_end_row_x2;
-        const uint32_t is0 = (ir0 - src0_start_row);
-        dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size),
-                       src0_stride, src0_row_size, 1);
-        const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
-        mmctx->vec_dot_1x1(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col);
+        // Prefill spad with 2x src0 rows
+        #pragma unroll(2)
+        for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
+            const uint32_t is0 = (ir0 - src0_start_row);
+            if (is0 >= MM_SPAD_SRC0_NROWS) {
+                break;
+            }
+            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size),
+                           src0_stride, src0_row_size, 2);
+        }
+
+        // Process src0 rows
+        for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
+            const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
+            mmctx->vec_dot_2x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, src1_col);
+
+            // Prefetch next (n + spad_nrows) row
+            const uint32_t pr0 = (ir0 + MM_SPAD_SRC0_NROWS);
+            const uint32_t is0 = (pr0 - src0_start_row) % MM_SPAD_SRC0_NROWS;
+            if (pr0 < src0_end_row_x2) {
+                dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + pr0 * src0_row_size),
+                               src0_stride, src0_row_size, 2);
+            }
+        }
+
+        // Process the last row (if any)
+        if (src0_end_row != src0_end_row_x2) {
+            const uint32_t ir0 = src0_end_row_x2;
+            const uint32_t is0 = (ir0 - src0_start_row);
+            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size),
+                           src0_stride, src0_row_size, 1);
+            const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
+            mmctx->vec_dot_1x1(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col);
+        }
     }
 
     hvx_copy_f32_ua((uint8_t *) &dst_col[src0_start_row], (uint8_t *) tmp, src0_end_row - src0_start_row);
@@ -2432,6 +3703,94 @@ static void matvec_id(unsigned int nth, unsigned int ith, void * data) {
 
 // *** dynamic quant
 
+static inline void quantize_block_f32_q8_1x1(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) {
+    assert((unsigned long) x % 128 == 0);
+    assert((unsigned long) y_q % 128 == 0);
+
+    HVX_Vector * vx = (HVX_Vector *) x;
+    HVX_Vector zero = Q6_V_vzero();
+
+    // Use reduce max fp32 to find max(abs(e)) first
+    HVX_Vector vmax0_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[0]));
+    HVX_Vector vmax1_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[1]));
+    HVX_Vector vmax2_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[2]));
+    HVX_Vector vmax3_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[3]));
+
+    // Load and convert into QF32
+    HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero);  // 32 elements
+    HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero);  // 32 elements
+    HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero);  // 32 elements
+    HVX_Vector vx3_qf = Q6_Vqf32_vsub_VsfVsf(vx[3], zero);  // 32 elements
+
+    // Convert to QF32
+    HVX_Vector vmax0_qf = Q6_Vqf32_vsub_VsfVsf(vmax0_sf, zero);
+    HVX_Vector vmax1_qf = Q6_Vqf32_vsub_VsfVsf(vmax1_sf, zero);
+    HVX_Vector vmax2_qf = Q6_Vqf32_vsub_VsfVsf(vmax2_sf, zero);
+    HVX_Vector vmax3_qf = Q6_Vqf32_vsub_VsfVsf(vmax3_sf, zero);
+
+    // Combine and convert to fp16
+    HVX_Vector vmax01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vmax1_qf, vmax0_qf)));
+    HVX_Vector vmax23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vmax3_qf, vmax2_qf)));
+
+    // Convert into fp16
+    HVX_Vector vx01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx1_qf, vx0_qf)));
+    HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf)));
+
+    HVX_Vector vd01_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax01_hf, Q6_Vh_vsplat_R(0x2008));  // 1.0 / 127.0
+    HVX_Vector vd23_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax23_hf, Q6_Vh_vsplat_R(0x2008));  // 1.0 / 127.0
+    HVX_Vector vd01_hf   = Q6_Vhf_equals_Vqf16(vd01_qf16);
+    HVX_Vector vd23_hf   = Q6_Vhf_equals_Vqf16(vd23_qf16);
+
+    // Divide input by the scale
+    HVX_Vector vd01_inv_hf = hvx_vec_inverse_f16(vd01_hf);
+    HVX_Vector vd23_inv_hf = hvx_vec_inverse_f16(vd23_hf);
+    vx01_hf              = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd01_inv_hf));
+    vx23_hf              = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd23_inv_hf));
+
+    // Convert to int8
+    HVX_Vector vx01_i16 = hvx_vec_i16_from_hf_rnd_sat(vx01_hf);
+    HVX_Vector vx23_i16 = hvx_vec_i16_from_hf_rnd_sat(vx23_hf);
+    HVX_Vector vx_i8    = Q6_Vb_vpack_VhVh_sat(vx23_i16, vx01_i16);
+
+    *(HVX_Vector *) y_q = vx_i8;
+
+    // --- Sum calculation ---
+    const HVX_Vector ones = Q6_Vb_vsplat_R(1);
+    HVX_Vector v_sums = Q6_Vw_vrmpy_VbVb(vx_i8, ones); // sum every 4 consecutive elements
+    // Sum 8 elements:
+    v_sums = Q6_Vw_vadd_VwVw(v_sums, Q6_V_vror_VR(v_sums, 4));
+    v_sums = Q6_Vw_vadd_VwVw(v_sums, Q6_V_vror_VR(v_sums, 8));
+    v_sums = Q6_Vw_vadd_VwVw(v_sums, Q6_V_vror_VR(v_sums, 16));
+
+    // Copy to stack to extract sums and vmaxes
+    float vmax0[32] __attribute__((aligned(128)));
+    float vmax1[32] __attribute__((aligned(128)));
+    float vmax2[32] __attribute__((aligned(128)));
+    float vmax3[32] __attribute__((aligned(128)));
+    int32_t sums[32] __attribute__((aligned(128)));
+
+    hvx_vec_store_u(vmax0, 128, vmax0_sf);
+    hvx_vec_store_u(vmax1, 128, vmax1_sf);
+    hvx_vec_store_u(vmax2, 128, vmax2_sf);
+    hvx_vec_store_u(vmax3, 128, vmax3_sf);
+    hvx_vec_store_u(sums, 128, v_sums);
+
+    float d0 = vmax0[0] / 127.0f;
+    float d1 = vmax1[0] / 127.0f;
+    float d2 = vmax2[0] / 127.0f;
+    float d3 = vmax3[0] / 127.0f;
+
+    __fp16 * y_d_half = (__fp16 *) y_d;
+    y_d_half[0] = d0;
+    y_d_half[1] = (float) sums[0] * d0;
+    y_d_half[2] = d1;
+    y_d_half[3] = (float) sums[8] * d1;
+    y_d_half[4] = d2;
+    y_d_half[5] = (float) sums[16] * d2;
+    y_d_half[6] = d3;
+    y_d_half[7] = (float) sums[24] * d3;
+}
+
 static inline void quantize_block_f32_q8x1(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) {
     assert((unsigned long) x % 128 == 0);
     assert((unsigned long) y_q % 128 == 0);
@@ -2656,6 +4015,77 @@ static void quantize_f32_q8x4x2(unsigned int nth, unsigned int ith, void * data)
          ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
 }
 
+static void quantize_row_f32_q8_1x4x2(float * restrict x, uint8_t * restrict y, uint32_t k) {
+    assert(k % 32 == 0);
+    const uint32_t qk = QK_Q8_0x4x2;
+    const uint32_t nb = (k + qk - 1) / qk;
+
+    const uint32_t qrow_size = k;              // int8
+
+    const uint32_t dblk_size = 8 * 4;          // 8x (d, s) __fp16 = 32 bytes
+    const uint32_t qblk_size = QK_Q8_0x4x2;    // int8
+
+    uint8_t * restrict y_q = (y + 0);          // quants first
+    uint8_t * restrict y_d = (y + qrow_size);  // then scales/sums
+
+    // Temp scales override input since we're working off of the aligned temp buffer in VTCM
+    uint8_t * restrict t_d = (uint8_t *) x;
+
+    for (uint32_t i = 0; i < nb; i++) {
+        quantize_block_f32_q8_1x1(x + (i*2 + 0) * qk/2, y_q + (i*2 + 0) * qblk_size/2, t_d + (i*2 + 0) * dblk_size/2);
+        quantize_block_f32_q8_1x1(x + (i*2 + 1) * qk/2, y_q + (i*2 + 1) * qblk_size/2, t_d + (i*2 + 1) * dblk_size/2);
+    }
+
+    // now copy the scales/sums into final location
+    hvx_copy_f16_ua(y_d, t_d, nb * 16);
+}
+
+static void quantize_f32_q8_1x4x2(unsigned int nth, unsigned int ith, void * data) {
+    struct htp_matmul_context * mmctx = data;
+    struct htp_ops_context * octx = mmctx->octx;
+
+    const struct htp_tensor * src = octx->src[1];
+    uint8_t * restrict dst = octx->src1_spad.data;
+    struct htp_spad * spad = &octx->src0_spad;
+    uint32_t nrows_per_thread = mmctx->src1_nrows_per_thread;
+
+    uint64_t t1 = HAP_perf_get_qtimer_count();
+
+    const uint32_t ne0 = src->ne[0];
+    const uint32_t ne1 = src->ne[1];
+    const uint32_t ne2 = src->ne[2];
+    const uint32_t ne3 = src->ne[3];
+
+    const uint32_t nrows = ne1 * ne2 * ne3;                             // total n_rows
+
+    const uint32_t ir_first = nrows_per_thread * ith;                   // first row
+    const uint32_t ir_last  = MIN(ir_first + nrows_per_thread, nrows);  // last row
+
+    const size_t src_row_size = src->nb[1];
+    const size_t dst_row_size = q8_1x4x2_row_size(ne0);
+
+    uint8_t * restrict src_data = (uint8_t *) src->data + (src_row_size * ir_first);
+    uint8_t * restrict dst_data = (uint8_t *) dst + (dst_row_size * ir_first);
+    uint8_t * restrict tmp_data = (uint8_t *) spad->data + (spad->size_per_thread * ith);
+
+    const size_t src_row_size_padded = hex_round_up(src_row_size, QK_Q8_0x4x2 * sizeof(float));
+    memset(tmp_data, 0, src_row_size_padded);  // zero-out temp row data for padding
+
+    for (uint32_t i = ir_first; i < ir_last; ++i) {
+        hex_l2fetch(src_data, src_row_size, src_row_size, 2);
+        hvx_copy_f32_aa(tmp_data, src_data, ne0);
+
+        quantize_row_f32_q8_1x4x2((float *) tmp_data, dst_data, ne0);
+        dst_data += dst_row_size;
+        src_data += src_row_size;
+    }
+
+    uint64_t t2 = HAP_perf_get_qtimer_count();
+
+    FARF(HIGH, "quantize-f32-q8_1x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first,
+         ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+}
+
 static void quantize_f32_f16(unsigned int nth, unsigned int ith, void * data) {
     struct htp_matmul_context * mmctx = data;
     struct htp_ops_context * octx = mmctx->octx;
@@ -2751,24 +4181,35 @@ static int htp_mminit_vec_dot(struct htp_matmul_context * mmctx, enum htp_data_t
             mmctx->vec_dot_1x1 = vec_dot_q4x4x2_q8x4x2_1x1;
             mmctx->vec_dot_2x1 = vec_dot_q4x4x2_q8x4x2_2x1;
             mmctx->vec_dot_2x2 = vec_dot_q4x4x2_q8x4x2_2x2;
+            mmctx->vec_dot_4x1 = vec_dot_q4x4x2_q8x4x2_4x1;
+            return 0;
+        case HTP_TYPE_Q4_1:
+            mmctx->type        = "q4_1x4x2-f32";
+            mmctx->vec_dot_1x1 = vec_dot_q4_1x4x2_q8x4x2_1x1;
+            mmctx->vec_dot_2x1 = vec_dot_q4_1x4x2_q8x4x2_2x1;
+            mmctx->vec_dot_2x2 = vec_dot_q4_1x4x2_q8x4x2_2x2;
+            mmctx->vec_dot_4x1 = vec_dot_q4_1x4x2_q8x4x2_4x1;
             return 0;
         case HTP_TYPE_Q8_0:
             mmctx->type        = "q8x4x2-f32";
             mmctx->vec_dot_1x1 = vec_dot_q8x4x2_q8x4x2_1x1;
             mmctx->vec_dot_2x1 = vec_dot_q8x4x2_q8x4x2_2x1;
             mmctx->vec_dot_2x2 = vec_dot_q8x4x2_q8x4x2_2x2;
+            mmctx->vec_dot_4x1 = vec_dot_q8x4x2_q8x4x2_4x1;
             return 0;
         case HTP_TYPE_IQ4_NL:
             mmctx->type        = "iq4nlx4x2-f32";
             mmctx->vec_dot_1x1 = vec_dot_iq4nlx4x2_q8x4x2_1x1;
             mmctx->vec_dot_2x1 = vec_dot_iq4nlx4x2_q8x4x2_2x1;
             mmctx->vec_dot_2x2 = vec_dot_iq4nlx4x2_q8x4x2_2x2;
+            mmctx->vec_dot_4x1 = vec_dot_iq4nlx4x2_q8x4x2_4x1;
             return 0;
         case HTP_TYPE_MXFP4:
             mmctx->type        = "mxfp4x4x2-f32";
             mmctx->vec_dot_1x1 = vec_dot_mxfp4x4x2_q8x4x2_1x1;
             mmctx->vec_dot_2x1 = vec_dot_mxfp4x4x2_q8x4x2_2x1;
             mmctx->vec_dot_2x2 = vec_dot_mxfp4x4x2_q8x4x2_2x2;
+            mmctx->vec_dot_4x1 = vec_dot_mxfp4x4x2_q8x4x2_4x1;
             return 0;
         default:
             return -1;
@@ -2894,8 +4335,13 @@ static int op_matmul_hvx(struct htp_ops_context * octx) {
             return HTP_STATUS_NO_SUPPORT;
         }
 
-        quant_job_func = quantize_f32_q8x4x2;
-        src1_row_size  = q8x4x2_row_size(ne10);
+        if (src0->type == HTP_TYPE_Q4_1) {
+            quant_job_func = quantize_f32_q8_1x4x2;
+            src1_row_size  = q8_1x4x2_row_size(ne10);
+        } else {
+            quant_job_func = quantize_f32_q8x4x2;
+            src1_row_size  = q8x4x2_row_size(ne10);
+        }
         htp_mminit_spad(octx, dst_row_size, src0_row_size_padded, src1_row_size, src1_nrows, 0);
     }
 
@@ -2962,7 +4408,7 @@ int op_matmul(struct htp_ops_context * octx) {
     // HMX supports F16, Q4_0, Q8_0, IQ4_NL, MXFP4 weights.
     // Other types fall back to HVX.
     uint32_t wtype = src0->type;
-    if (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_Q4_0 && wtype != HTP_TYPE_Q8_0 && wtype != HTP_TYPE_IQ4_NL && wtype != HTP_TYPE_MXFP4) {
+    if (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_Q4_0 && wtype != HTP_TYPE_Q4_1 && wtype != HTP_TYPE_Q8_0 && wtype != HTP_TYPE_IQ4_NL && wtype != HTP_TYPE_MXFP4) {
         return op_matmul_hvx(octx);
     }
 
@@ -2995,7 +4441,6 @@ int op_matmul(struct htp_ops_context * octx) {
     //  is handled by HMX itself; when M < 32  fall back to HVX.
     const int m_total = (int) src1->ne[1];
     const int m_hmx   = m_total & ~31;   // 0 when M < 32
-
     if (m_hmx == 0) {
         return op_matmul_hvx(octx);
     }
@@ -3020,7 +4465,7 @@ int op_matmul(struct htp_ops_context * octx) {
 
     if (src0->type == HTP_TYPE_F16) {
         if (is_batched) {
-            hmx_matmul_w16a32_batched_params_t batch_params = {
+            hmx_matmul_f16_f32_batched_params_t batch_params = {
                 .dst             = (float *) dst->data,
                 .activation      = (float *) src1->data,
                 .permuted_weight = (const __fp16 *) src0->data,
@@ -3041,15 +4486,14 @@ int op_matmul(struct htp_ops_context * octx) {
                 .dst_nb2         = dst->nb[2],
                 .dst_nb3         = dst->nb[3],
             };
-            ret = hmx_mat_mul_permuted_w16a32_batched(octx->ctx, &batch_params);
+            ret = hmx_matmul_f16_f32_batched(octx->ctx, &batch_params);
         } else {
-            ret = hmx_mat_mul_permuted_w16a32(octx->ctx,
+            ret = hmx_matmul_f16_f32(octx->ctx,
                     (float*) dst->data, (float*) src1->data, (const __fp16 *) src0->data,
                     m_total, k, n, act_stride, wgt_stride);
         }
     } else {
-        ret = hmx_mat_mul_permuted_qk_0_d16a32(octx->ctx,
-                    (float*) dst->data, (float*) src1->data, (const uint8_t *) src0->data,
+        ret = hmx_matmul_q_f32(octx->ctx, (float*) dst->data, (float*) src1->data, (const uint8_t *) src0->data,
                     m_total, k, n, (int) src0->type);
     }
 
@@ -3100,8 +4544,13 @@ int op_matmul_id(struct htp_ops_context * octx) {
         return HTP_STATUS_NO_SUPPORT;
     }
 
-    quant_job_func = quantize_f32_q8x4x2;
-    src1_row_size  = q8x4x2_row_size(ne10);
+    if (src0->type == HTP_TYPE_Q4_1) {
+        quant_job_func = quantize_f32_q8_1x4x2;
+        src1_row_size  = q8_1x4x2_row_size(ne10);
+    } else {
+        quant_job_func = quantize_f32_q8x4x2;
+        src1_row_size  = q8x4x2_row_size(ne10);
+    }
 
     const size_t src2_spad_size_per_thread = hex_round_up(matrix_row_counts_size + matrix_row_map_size, 256);
     htp_mminit_spad(octx, dst_row_size, src0_row_size_padded, src1_row_size, src1_nrows, src2_spad_size_per_thread);
diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c
index 1d8b0796bc9..c839044b84f 100644
--- a/ggml/src/ggml-hexagon/htp/rope-ops.c
+++ b/ggml/src/ggml-hexagon/htp/rope-ops.c
@@ -7,6 +7,7 @@
 
 #include <math.h>
 #include <string.h>
+#include <stdlib.h>
 
 #include "hex-dma.h"
 #include "hvx-utils.h"
@@ -18,9 +19,11 @@
 #include "htp-ops.h"
 #include "htp-ops.h"
 
-// Redefined the types GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX as we can't include ggml.h
+// Redefined the rope type constants as we can't include ggml.h
 #define HTP_ROPE_TYPE_NORMAL 0
 #define HTP_ROPE_TYPE_NEOX   2
+#define HTP_ROPE_TYPE_MROPE  8
+#define HTP_ROPE_TYPE_IMROPE 40
 
 #define HTP_ROPE_SPAD_NROWS  16
 #define HTP_ROPE_SPAD_BLOCK  (HTP_ROPE_SPAD_NROWS/2)
@@ -73,6 +76,9 @@ struct htp_rope_context {
     size_t theta_cache_offset;
     uint32_t src0_nrows;
 
+    struct fastdiv_values div_ne2_ne1;
+    struct fastdiv_values div_ne1;
+
     uint64_t t_start;
 };
 
@@ -82,7 +88,30 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
     return (1 - MIN(1, MAX(0, y)));
 }
 
-static void rope_cache_init(const float    theta_base,
+// Compute one (cos, sin) pair into cache[i0], cache[i0+1] applying YaRN scaling.
+static inline void rope_yarn_one(float theta, float freq_scale, float * corr_dims,
+                                 uint32_t i0, float ext_factor, float mscale,
+                                 float * cache) {
+    float theta_extrap = theta;
+
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta_final  = theta_interp;
+    float mscale_final = mscale;
+
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
+        theta_final    = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale_final  *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+    }
+
+    cache[i0 + 0] = cosf(theta_final) * mscale_final;
+    cache[i0 + 1] = sinf(theta_final) * mscale_final;
+}
+
+static __attribute__((noinline)) void rope_cache_init(const float    theta_base,
                             const float    freq_scale,
                             const float *  freq_factors,
                             float *        corr_dims,
@@ -92,30 +121,137 @@ static void rope_cache_init(const float    theta_base,
                             float *        cache,
                             const float    theta_scale) {
     // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
-    float theta = theta_base;
+#if __HVX_ARCH__ >= 79
+    const bool is_v79_or_newer = true;
+#else
+    const bool is_v79_or_newer = false;
+#endif
+
+    if (is_v79_or_newer && ext_factor == 0.0f) {
+        // Fast path: fully vectorized
+        // We process 32 pairs (64 elements) per iteration.
+        const uint32_t n_blocks = ne0 / 64;
+
+        // Initialize theta scale powers: [1.0f, theta_scale, theta_scale^2, ..., theta_scale^31]
+        float __attribute__((aligned(128))) theta_powers[32];
+        theta_powers[0] = 1.0f;
+        for (int j = 1; j < 32; j++) {
+            theta_powers[j] = theta_powers[j - 1] * theta_scale;
+        }
+        HVX_Vector v_theta_powers = hvx_vmem(theta_powers);
 
-    for (uint32_t i0 = 0; i0 < ne0; i0 += 2) {
-        const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f;
+        HVX_Vector v_freq_scale = hvx_vec_splat_f32(freq_scale);
+        HVX_Vector v_mscale = hvx_vec_splat_f32(mscale);
+
+        // Base theta starts at theta_base
+        float theta_block = theta_base;
+        // The scale factor for the next block is theta_scale^32
+        float theta_scale_32 = 1.0f;
+        for (int j = 0; j < 32; j++) {
+            theta_scale_32 *= theta_scale;
+        }
+
+        for (uint32_t b = 0; b < n_blocks; b++) {
+            uint32_t i0 = b * 64;
+            HVX_Vector v_theta_base = hvx_vec_splat_f32(theta_block);
+            HVX_Vector v_theta = hvx_vec_mul_f32_f32(v_theta_base, v_theta_powers);
+
+            if (freq_factors) {
+                // Load 32 elements of freq_factors
+                HVX_Vector v_ff = hvx_vmemu(freq_factors + i0 / 2);
+                HVX_Vector v_inv_ff = hvx_vec_inverse_f32(v_ff);
+                v_theta = hvx_vec_mul_f32_f32(v_theta, v_inv_ff);
+            }
 
-        float theta_extrap = theta / ff;
+            HVX_Vector v_theta_final = hvx_vec_mul_f32_f32(v_theta, v_freq_scale);
 
-        // Get n-d rotational scaling corrected for extrapolation
-        float theta_interp = freq_scale * theta_extrap;
-        float theta_final  = theta_interp;
-        float mscale_final = mscale;
+            HVX_Vector vcos = hvx_vec_cos_f32(v_theta_final);
+            HVX_Vector vsin = hvx_vec_sin_f32(v_theta_final);
 
-        if (ext_factor != 0.0f) {
-            float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
-            theta_final    = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+            vcos = hvx_vec_mul_f32_f32(vcos, v_mscale);
+            vsin = hvx_vec_mul_f32_f32(vsin, v_mscale);
 
-            // Get n-d magnitude scaling corrected for interpolation
-            mscale_final *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+            HVX_VectorPair vstore = Q6_W_vshuff_VVR(vsin, vcos, -4);
+
+            if (((uintptr_t)cache) % 128 == 0) {
+                hvx_vmem(cache + i0 + 0)  = Q6_V_lo_W(vstore);
+                hvx_vmem(cache + i0 + 32) = Q6_V_hi_W(vstore);
+            } else {
+                hvx_vec_store_u(cache + i0 + 0,  32 * sizeof(float), Q6_V_lo_W(vstore));
+                hvx_vec_store_u(cache + i0 + 32, 32 * sizeof(float), Q6_V_hi_W(vstore));
+            }
+
+            theta_block *= theta_scale_32;
+        }
+
+        // Leftovers
+        float theta = theta_block;
+        for (uint32_t i0 = n_blocks * 64; i0 < ne0; i0 += 2) {
+            const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f;
+            rope_yarn_one(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, cache);
+            theta *= theta_scale;
+        }
+    } else {
+        // Fallback to original scalar loop
+        float theta = theta_base;
+        for (uint32_t i0 = 0; i0 < ne0; i0 += 2) {
+            const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f;
+            rope_yarn_one(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, cache);
+            theta *= theta_scale;
         }
+    }
+}
 
-        cache[i0 + 0] = cosf(theta_final) * mscale_final;
-        cache[i0 + 1] = sinf(theta_final) * mscale_final;
+// pos_t/h/w/e: the four position ids for this sequence step (t=time, h=height, w=width, e=extra).
+// sections[4]: number of head dims assigned to each position component.
+static __attribute__((noinline)) void mrope_cache_init(const float    pos_t,
+                             const float    pos_h,
+                             const float    pos_w,
+                             const float    pos_e,
+                             const int32_t  sections[4],
+                             const bool     is_imrope,
+                             const float    freq_scale,
+                             const float *  freq_factors,
+                             float *        corr_dims,
+                             const uint32_t ne0,
+                             const float    ext_factor,
+                             const float    mscale,
+                             float *        cache,
+                             const float    theta_scale) {
+    const int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
+    const int sec_w     = sections[0] + sections[1];
+    const int sec_e     = sec_w + sections[2];
+
+    float theta_t = pos_t;
+    float theta_h = pos_h;
+    float theta_w = pos_w;
+    float theta_e = pos_e;
 
-        theta *= theta_scale;
+    for (uint32_t i0 = 0; i0 < ne0; i0 += 2) {
+        const float ff     = freq_factors ? freq_factors[i0 / 2] : 1.0f;
+        const int   sector = (i0 / 2) % sect_dims;
+
+        float theta;
+        if (is_imrope) {
+            // Interleaved: sector mod 3 selects component
+            if      (sector % 3 == 0 && sector < 3 * sections[0]) { theta = theta_t; }
+            else if (sector % 3 == 1 && sector < 3 * sections[1]) { theta = theta_h; }
+            else if (sector % 3 == 2 && sector < 3 * sections[2]) { theta = theta_w; }
+            else                                                   { theta = theta_e; }
+        } else {
+            // Contiguous sections
+            if      (sector < sections[0]) { theta = theta_t; }
+            else if (sector < sec_w)       { theta = theta_h; }
+            else if (sector < sec_e)       { theta = theta_w; }
+            else                           { theta = theta_e; }
+        }
+
+        rope_yarn_one(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, cache);
+
+        theta_t *= theta_scale;
+        theta_h *= theta_scale;
+        theta_w *= theta_scale;
+        theta_e *= theta_scale;
     }
 }
 
@@ -134,24 +270,18 @@ static void rope_corr_dims(int     n_dims,
 }
 
 static inline void hvx_rope_neox_f32_aa(float * restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) {
-    const HVX_Vector * restrict vsrc   = (const HVX_Vector *) src0;
-    const HVX_Vector * restrict vtheta = (const HVX_Vector *) theta_cache;
-    HVX_Vector       * restrict vdst   = (HVX_Vector *) dst;
+    const uint32_t he = ne / 2;
+    const uint32_t nvec = he / 32;
+    const uint32_t nloe = he % 32;
 
-    uint32_t nvec = (ne / (VLEN_FP32 * 2) * 2); // 2 vecs per loop, step of 2
+    for (uint32_t i = 0; i < nvec; i++) {
+        HVX_Vector v0 = ((const HVX_Vector *) src0)[i];
+        HVX_Vector v1 = hvx_vmemu(src0 + he + i * 32);
 
-    uint32_t he = ne / 2;         // half_dims offset in elements
-    uint32_t hv = he / VLEN_FP32; // half_dims offset in vectors
+        HVX_Vector v2 = ((const HVX_Vector *) theta_cache)[i * 2 + 0];
+        HVX_Vector v3 = ((const HVX_Vector *) theta_cache)[i * 2 + 1];
 
-    #pragma unroll(2)
-    for (uint32_t i = 0; i < nvec; i += 2) {
-        HVX_Vector v0 = vsrc[i/2+0];
-        HVX_Vector v1 = vsrc[i/2+hv];
-
-        HVX_Vector v2 = vtheta[i+0];
-        HVX_Vector v3 = vtheta[i+1];
-
-        HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4);  // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta
+        HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4);
 
         HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_lo_W(vcos_sin));
         HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_hi_W(vcos_sin));
@@ -161,37 +291,45 @@ static inline void hvx_rope_neox_f32_aa(float * restrict dst, const float * rest
         HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
         HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
 
-        vdst[i/2+0]  = Q6_Vsf_equals_Vqf32(v4);
-        vdst[i/2+hv] = Q6_Vsf_equals_Vqf32(v5);
+        ((HVX_Vector *) dst)[i] = Q6_Vsf_equals_Vqf32(v4);
+        hvx_vmemu(dst + he + i * 32) = Q6_Vsf_equals_Vqf32(v5);
     }
 
-    for (uint32_t i = nvec * VLEN_FP32; i < ne; i += 2) {
-        const float cos_theta = theta_cache[i+0];
-        const float sin_theta = theta_cache[i+1];
-        float x0 = src0[i/2];
-        float x1 = src0[i/2 + he];
-        dst[i/2]      = x0 * cos_theta - x1 * sin_theta;
-        dst[i/2 + he] = x0 * sin_theta + x1 * cos_theta;
+    if (nloe > 0) {
+        HVX_Vector v0 = hvx_vmemu(src0 + nvec * 32);
+        HVX_Vector v1 = hvx_vmemu(src0 + he + nvec * 32);
+
+        HVX_Vector v2 = ((const HVX_Vector *) theta_cache)[nvec * 2 + 0];
+        HVX_Vector v3 = ((const HVX_Vector *) theta_cache)[nvec * 2 + 1];
+
+        HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4);
+
+        HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_lo_W(vcos_sin));
+        HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_hi_W(vcos_sin));
+        HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(v1, Q6_V_lo_W(vcos_sin));
+        HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(v1, Q6_V_hi_W(vcos_sin));
+
+        HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
+        HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
+
+        hvx_vec_store_u(dst + nvec * 32, nloe * sizeof(float), Q6_Vsf_equals_Vqf32(v4));
+        hvx_vec_store_u(dst + he + nvec * 32, nloe * sizeof(float), Q6_Vsf_equals_Vqf32(v5));
     }
 }
 
 static inline void hvx_rope_f32_aa(float * restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) {
-    const HVX_Vector * restrict vsrc   = (const HVX_Vector *) src0;
-    const HVX_Vector * restrict vtheta = (const HVX_Vector *) theta_cache;
-    HVX_Vector       * restrict vdst   = (HVX_Vector *) dst;
+    const uint32_t nvec = ne / 64;
+    const uint32_t nloe = ne % 64;
 
-    uint32_t nvec = (ne / (VLEN_FP32 * 2)) * 2; // 2 vecs per loop, step of two
+    for (uint32_t i = 0; i < nvec; i++) {
+        HVX_Vector v0 = ((const HVX_Vector *) src0)[i * 2 + 0];
+        HVX_Vector v1 = ((const HVX_Vector *) src0)[i * 2 + 1];
 
-    #pragma unroll(2)
-    for (uint32_t i = 0; i < nvec; i+=2) {
-        HVX_Vector v0 = vsrc[i+0];
-        HVX_Vector v1 = vsrc[i+1];
+        HVX_Vector v2 = ((const HVX_Vector *) theta_cache)[i * 2 + 0];
+        HVX_Vector v3 = ((const HVX_Vector *) theta_cache)[i * 2 + 1];
 
-        HVX_Vector v2 = vtheta[i+0];
-        HVX_Vector v3 = vtheta[i+1];
-
-        HVX_VectorPair vx0_x1   = Q6_W_vdeal_VVR(v1, v0, -4);  // vx0_x1[0] = x0, vx0_x1[1] = x1
-        HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4);  // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta
+        HVX_VectorPair vx0_x1   = Q6_W_vdeal_VVR(v1, v0, -4);
+        HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4);
 
         HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_lo_W(vcos_sin));
         HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_hi_W(vcos_sin));
@@ -203,17 +341,52 @@ static inline void hvx_rope_f32_aa(float * restrict dst, const float * restrict
 
         HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4);
 
-        vdst[i+0] = Q6_V_lo_W(vstore);
-        vdst[i+1] = Q6_V_hi_W(vstore);
+        ((HVX_Vector *) dst)[i * 2 + 0] = Q6_V_lo_W(vstore);
+        ((HVX_Vector *) dst)[i * 2 + 1] = Q6_V_hi_W(vstore);
     }
 
-    for (uint32_t i = nvec * VLEN_FP32; i < ne; i += 2) {
-        const float cos_theta = theta_cache[i+0];
-        const float sin_theta = theta_cache[i+1];
-        float x0 = src0[i+0];
-        float x1 = src0[i+1];
-        dst[i+0] = x0 * cos_theta - x1 * sin_theta;
-        dst[i+1] = x0 * sin_theta + x1 * cos_theta;
+    if (nloe > 0) {
+        if (nloe <= 32) {
+            HVX_Vector v0 = hvx_vmemu(src0 + nvec * 64);
+            HVX_Vector v2 = hvx_vmemu(theta_cache + nvec * 64);
+
+            HVX_VectorPair vx0_x1   = Q6_W_vdeal_VVR(Q6_V_vzero(), v0, -4);
+            HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(Q6_V_vzero(), v2, -4);
+
+            HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_lo_W(vcos_sin));
+            HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_hi_W(vcos_sin));
+            HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_lo_W(vcos_sin));
+            HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_hi_W(vcos_sin));
+
+            HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
+            HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
+
+            HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4);
+
+            hvx_vec_store_u(dst + nvec * 64, nloe * sizeof(float), Q6_V_lo_W(vstore));
+        } else {
+            HVX_Vector v0 = hvx_vmemu(src0 + nvec * 64);
+            HVX_Vector v1 = hvx_vmemu(src0 + nvec * 64 + 32);
+
+            HVX_Vector v2 = hvx_vmemu(theta_cache + nvec * 64);
+            HVX_Vector v3 = hvx_vmemu(theta_cache + nvec * 64 + 32);
+
+            HVX_VectorPair vx0_x1   = Q6_W_vdeal_VVR(v1, v0, -4);
+            HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4);
+
+            HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_lo_W(vcos_sin));
+            HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_hi_W(vcos_sin));
+            HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_lo_W(vcos_sin));
+            HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_hi_W(vcos_sin));
+
+            HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
+            HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
+
+            HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4);
+
+            ((HVX_Vector *) dst)[nvec * 2 + 0] = Q6_V_lo_W(vstore);
+            hvx_vec_store_u(dst + nvec * 64 + 32, (nloe - 32) * sizeof(float), Q6_V_hi_W(vstore));
+        }
     }
 }
 
@@ -274,7 +447,8 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) {
     uint64_t tt = HAP_perf_get_qtimer_count();
 
     const int32_t mode    = rctx->mode;
-    const bool    is_neox = mode & HTP_ROPE_TYPE_NEOX;
+    // MROPE and IMROPE use NEOX-style pairing for the rotation
+    const bool    is_neox = (mode & HTP_ROPE_TYPE_NEOX) || (mode & HTP_ROPE_TYPE_MROPE);
 
     // VTCM setup
     uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
@@ -286,13 +460,19 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) {
     const int32_t * pos = (const int32_t *) src1->data;
     const float * freq_factors = src2 ? (const float *) src2->data : NULL;
 
-    uint32_t ir = 0;
+    const uint32_t i3_start = fastdiv(src0_start_row, &rctx->div_ne2_ne1);
+    const uint32_t rem      = fastmodulo(src0_start_row, ne2 * ne1, &rctx->div_ne2_ne1);
+    const uint32_t i2_start = fastdiv(rem, &rctx->div_ne1);
+    const uint32_t i1_start = fastmodulo(rem, ne1, &rctx->div_ne1);
+
+    uint32_t ir = src0_start_row;
     uint32_t prev_i2 = (uint32_t) -1;
 
-    for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch
-        for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len
-            for (uint32_t i1 = 0; i1 < ne1; ) { // attn-heads
-                if (ir < src0_start_row) { ir++; i1++; continue; }
+    for (uint32_t i3 = i3_start; i3 < ne3; i3++) { // batch
+        const uint32_t i2_init = (i3 == i3_start) ? i2_start : 0;
+        for (uint32_t i2 = i2_init; i2 < ne2; i2++) { // seq-len
+            const uint32_t i1_init = (i3 == i3_start && i2 == i2_start) ? i1_start : 0;
+            for (uint32_t i1 = i1_init; i1 < ne1; ) { // attn-heads
                 if (ir >= src0_end_row) goto done;
 
                 // Rows in this block
@@ -326,11 +506,25 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) {
                 if (i2 != prev_i2) {
                     prev_i2 = i2;
 
-                    const int32_t p = pos[i2];
-                    rope_cache_init(p, rctx->freq_scale, freq_factors, rctx->corr_dims, ne0, rctx->ext_factor, rctx->attn_factor, theta_cache, rctx->theta_scale);
-
-                    // FARF(HIGH, "rope-theta %u: ir %u i1 %u i2 %u i3 %u cache %p : usec %u", ith, ir, i1, i2, i3, theta_cache,
-                    //         (unsigned) HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - rctx->t_start));
+                    const bool is_mrope = (rctx->mode & HTP_ROPE_TYPE_MROPE) != 0;
+                    if (is_mrope) {
+                        // src1 holds four position arrays stacked along ne0:
+                        // pos[i2], pos[i2+ne2], pos[i2+ne2*2], pos[i2+ne2*3]
+                        const bool is_imrope = (rctx->mode == HTP_ROPE_TYPE_IMROPE);
+                        mrope_cache_init(
+                            (float) pos[i2],
+                            (float) pos[i2 + ne2],
+                            (float) pos[i2 + ne2 * 2],
+                            (float) pos[i2 + ne2 * 3],
+                            rctx->sections, is_imrope,
+                            rctx->freq_scale, freq_factors, rctx->corr_dims,
+                            ne0, rctx->ext_factor, rctx->attn_factor,
+                            theta_cache, rctx->theta_scale);
+                    } else {
+                       rope_cache_init(pos[i2], rctx->freq_scale, freq_factors, rctx->corr_dims,
+                                        ne0, rctx->ext_factor, rctx->attn_factor,
+                                        theta_cache, rctx->theta_scale);
+                    }
                 }
 
                 // Skip output DMA transactions from prev block (if any)
@@ -410,7 +604,7 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) {
     // Aligned row sizes for VTCM
     const size_t src0_row_size_aligned    = hex_round_up(src0_row_size, VLEN);
     const size_t dst_row_size_aligned     = hex_round_up(dst_row_size, VLEN);
-    const size_t theta_cache_size_aligned = hex_round_up(src0->ne[0] * sizeof(float), 128);
+    const size_t theta_cache_size_aligned = hex_round_up(src0->ne[0] * sizeof(float), 256);
 
     // Calculate spad sizes per thread
     size_t src0_spad_per_thread = theta_cache_size_aligned + HTP_ROPE_SPAD_NROWS * src0_row_size_aligned;
@@ -467,6 +661,11 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) {
     rctx.src0_nrows = src0_nrows;
     rctx.src0_nrows_per_thread = (src0_nrows + n_threads - 1) / n_threads;
 
+    if (src0_nrows > 0) {
+        rctx.div_ne2_ne1 = init_fastdiv_values(dst->ne[2] * dst->ne[1]);
+        rctx.div_ne1     = init_fastdiv_values(dst->ne[1]);
+    }
+
     FARF(HIGH, "rope-f32 n-rows %u n-dims %d ne0 %u ext-factor %.6f theta-scale %.6f attn-factor %.6f\n", rctx.src0_nrows, rctx.n_dims, ne0,
          rctx.ext_factor, rctx.theta_scale, rctx.attn_factor);
 
diff --git a/ggml/src/ggml-hexagon/htp/set-rows-ops.c b/ggml/src/ggml-hexagon/htp/set-rows-ops.c
index 0def7b408bf..58c54967db0 100644
--- a/ggml/src/ggml-hexagon/htp/set-rows-ops.c
+++ b/ggml/src/ggml-hexagon/htp/set-rows-ops.c
@@ -65,6 +65,9 @@ static void set_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da
     // parallelize by rows of src0
     const uint32_t dr  = srctx->src0_nrows_per_thread;
     const uint32_t ir0 = dr * ith;
+    if (ir0 >= nr) {
+        return;
+    }
     const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
 
     const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32);
@@ -109,6 +112,9 @@ static void set_rows_thread_f16_f32(unsigned int nth, unsigned int ith, void *da
     // parallelize by rows of src0
     const uint32_t dr  = srctx->src0_nrows_per_thread;
     const uint32_t ir0 = dr * ith;
+    if (ir0 >= nr) {
+        return;
+    }
     const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
 
     const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32);
diff --git a/ggml/src/ggml-hexagon/htp/ssm-conv.c b/ggml/src/ggml-hexagon/htp/ssm-conv.c
index a28fd03e978..d574da2e2bc 100644
--- a/ggml/src/ggml-hexagon/htp/ssm-conv.c
+++ b/ggml/src/ggml-hexagon/htp/ssm-conv.c
@@ -20,55 +20,56 @@
 #include "htp-ops.h"
 #include "hvx-utils.h"
 
-#define htp_ssm_conv_tensors_preamble                          \
-    const struct htp_tensor * restrict src0    = octx->src[0]; \
-    const struct htp_tensor * restrict src1    = octx->src[1]; \
-    const struct htp_tensor * restrict dst     = octx->dst;    \
-    struct htp_spad * restrict src0_spad = &octx->src0_spad; \
-    struct htp_spad * restrict src1_spad = &octx->src1_spad; \
-    struct htp_spad * restrict dst_spad  = &octx->dst_spad;  \
-                                                             \
-    const uint32_t ne00 = src0->ne[0];                       \
-    const uint32_t ne01 = src0->ne[1];                       \
-    const uint32_t ne02 = src0->ne[2];                       \
-    const uint32_t ne03 = src0->ne[3];                       \
-                                                             \
-    const uint32_t ne10 = src1->ne[0];                       \
-    const uint32_t ne11 = src1->ne[1];                       \
-    const uint32_t ne12 = src1->ne[2];                       \
-    const uint32_t ne13 = src1->ne[3];                       \
-                                                             \
-    const uint32_t ne0 = dst->ne[0];                         \
-    const uint32_t ne1 = dst->ne[1];                         \
-    const uint32_t ne2 = dst->ne[2];                         \
-    const uint32_t ne3 = dst->ne[3];                         \
-                                                             \
-    const uint32_t nb00 = src0->nb[0];                       \
-    const uint32_t nb01 = src0->nb[1];                       \
-    const uint32_t nb02 = src0->nb[2];                       \
-    const uint32_t nb03 = src0->nb[3];                       \
-                                                             \
-    const uint32_t nb10 = src1->nb[0];                       \
-    const uint32_t nb11 = src1->nb[1];                       \
-    const uint32_t nb12 = src1->nb[2];                       \
-    const uint32_t nb13 = src1->nb[3];                       \
-                                                             \
-    const uint32_t nb0 = dst->nb[0];                         \
-    const uint32_t nb1 = dst->nb[1];                         \
-    const uint32_t nb2 = dst->nb[2];                         \
+#define htp_ssm_conv_tensors_preamble                           \
+    const struct htp_tensor * restrict src0 = octx->src[0];     \
+    const struct htp_tensor * restrict src1 = octx->src[1];     \
+    const struct htp_tensor * restrict dst  = octx->dst;        \
+    struct htp_spad * restrict src0_spad    = &octx->src0_spad; \
+    struct htp_spad * restrict src1_spad    = &octx->src1_spad; \
+    struct htp_spad * restrict dst_spad     = &octx->dst_spad;  \
+                                                                \
+    const uint32_t ne00 = src0->ne[0];                          \
+    const uint32_t ne01 = src0->ne[1];                          \
+    const uint32_t ne02 = src0->ne[2];                          \
+    const uint32_t ne03 = src0->ne[3];                          \
+                                                                \
+    const uint32_t ne10 = src1->ne[0];                          \
+    const uint32_t ne11 = src1->ne[1];                          \
+    const uint32_t ne12 = src1->ne[2];                          \
+    const uint32_t ne13 = src1->ne[3];                          \
+                                                                \
+    const uint32_t ne0 = dst->ne[0];                            \
+    const uint32_t ne1 = dst->ne[1];                            \
+    const uint32_t ne2 = dst->ne[2];                            \
+    const uint32_t ne3 = dst->ne[3];                            \
+                                                                \
+    const uint32_t nb00 = src0->nb[0];                          \
+    const uint32_t nb01 = src0->nb[1];                          \
+    const uint32_t nb02 = src0->nb[2];                          \
+    const uint32_t nb03 = src0->nb[3];                          \
+                                                                \
+    const uint32_t nb10 = src1->nb[0];                          \
+    const uint32_t nb11 = src1->nb[1];                          \
+    const uint32_t nb12 = src1->nb[2];                          \
+    const uint32_t nb13 = src1->nb[3];                          \
+                                                                \
+    const uint32_t nb0 = dst->nb[0];                            \
+    const uint32_t nb1 = dst->nb[1];                            \
+    const uint32_t nb2 = dst->nb[2];                            \
     const uint32_t nb3 = dst->nb[3];
 
 struct htp_ssm_conv_context {
     struct htp_ops_context * octx;
     uint32_t nrows_per_thread;
+    uint32_t d_inner_tile;
     uint64_t t_start;
 };
 
-#define htp_ssm_conv_preamble                            \
+#define htp_ssm_conv_preamble                                                   \
     struct htp_ssm_conv_context * scctx = (struct htp_ssm_conv_context *) data; \
-    struct htp_ops_context * octx = scctx->octx;         \
-    htp_ssm_conv_tensors_preamble;                       \
-    dma_queue * dma_queue         = octx->ctx->dma[ith];
+    struct htp_ops_context *      octx  = scctx->octx;                          \
+    htp_ssm_conv_tensors_preamble;                                              \
+    dma_queue * dma_queue = octx->ctx->dma[ith];
 
 // Scalar FP32 SSM_CONV implementation
 static void ssm_conv_thread_f32_f32(unsigned int nth, unsigned int ith, void *data) {
@@ -128,118 +129,211 @@ static void ssm_conv_thread_f32_f32(unsigned int nth, unsigned int ith, void *da
          dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
 }
 
-// HVX FP32 SSM_CONV implementation - vectorizes across d_inner dimension
-static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void *data) {
-    htp_ssm_conv_preamble;
-
-    uint64_t t1, t2;
-    t1 = HAP_perf_get_qtimer_count();
 
-    const int nc  = src1->ne[0]; // d_conv
-    const int ncs = src0->ne[0]; // d_conv - 1 + n_t
+// In-register 32x32 fp32 transpose using std 5-stage HVX vshuff butterfly.
+static inline void hvx_transpose_32x32_f32(HVX_Vector m[32]) {
+    HVX_Vector tmp[32];
 
-    const uint32_t d_conv  = src1->ne[0];
-    const uint32_t d_inner = src0->ne[1];
-    const uint32_t n_t     = dst->ne[1];
-    const uint32_t n_s     = dst->ne[2];
+    // Stage 0 (R = -4): pair (2i, 2i+1) for i = 0..15. m -> tmp.
+    for (int i = 0; i < 16; ++i) {
+        HVX_VectorPair p = Q6_W_vshuff_VVR(m[2*i + 1], m[2*i], -4);
+        tmp[2*i + 0] = Q6_V_lo_W(p);
+        tmp[2*i + 1] = Q6_V_hi_W(p);
+    }
 
-    const float * src0_data = (const float *) src0->data;
-    const float * src1_data = (const float *) src1->data;
-    float *       dst_data  = (float *) dst->data;
+    // Stage 1 (R = -8): per block of 4, pair (b+0, b+2) and (b+1, b+3). tmp -> m.
+    for (int b = 0; b < 32; b += 4) {
+        HVX_VectorPair p0 = Q6_W_vshuff_VVR(tmp[b + 2], tmp[b + 0], -8);
+        HVX_VectorPair p1 = Q6_W_vshuff_VVR(tmp[b + 3], tmp[b + 1], -8);
+        m[b + 0] = Q6_V_lo_W(p0); m[b + 1] = Q6_V_hi_W(p0);
+        m[b + 2] = Q6_V_lo_W(p1); m[b + 3] = Q6_V_hi_W(p1);
+    }
 
-    // Calculate row range for this thread
-    const int dr = scctx->nrows_per_thread;
-    const uint32_t ir0 = dr * ith;
-    const uint32_t ir1 = MIN(ir0 + dr, d_inner);
-    const uint32_t ir  = ir1 - ir0;
+    // Stage 2 (R = -16): per block of 8, pair (b+i, b+i+4) for i = 0..3. m -> tmp.
+    for (int b = 0; b < 32; b += 8) {
+        for (int i = 0; i < 4; ++i) {
+            HVX_VectorPair p = Q6_W_vshuff_VVR(m[b + i + 4], m[b + i], -16);
+            tmp[b + 2*i + 0] = Q6_V_lo_W(p);
+            tmp[b + 2*i + 1] = Q6_V_hi_W(p);
+        }
+    }
 
-    if (ir0 >= ir1) {
-        return;  // No work for this thread
+    // Stage 3 (R = -32): per block of 16, pair (b+i, b+i+8) for i = 0..7. tmp -> m.
+    for (int b = 0; b < 32; b += 16) {
+        for (int i = 0; i < 8; ++i) {
+            HVX_VectorPair p = Q6_W_vshuff_VVR(tmp[b + i + 8], tmp[b + i], -32);
+            m[b + 2*i + 0] = Q6_V_lo_W(p);
+            m[b + 2*i + 1] = Q6_V_hi_W(p);
+        }
     }
 
-    // src0 and src1 gather offsets
-    uint32_t __attribute__((aligned(VLEN))) src0_offsets[VLEN_FP32] = { 0 };
-    uint32_t __attribute__((aligned(VLEN))) src1_offsets[VLEN_FP32] = { 0 };
+    // Stage 4 (R = -64): pair (i, i+16) for i = 0..15. m -> tmp -> m.
+    for (int i = 0; i < 16; ++i) {
+        HVX_VectorPair p = Q6_W_vshuff_VVR(m[i + 16], m[i], -64);
+        tmp[2 * i + 0]   = Q6_V_lo_W(p);
+        tmp[2 * i + 1]   = Q6_V_hi_W(p);
+    }
 
-    for (uint32_t i = 0; i < VLEN_FP32; ++i) {
-        src0_offsets[i] = i * (ncs)    * sizeof(float);
-        src1_offsets[i] = i * (d_conv) * sizeof(float);
+    for (int i = 0; i < 32; ++i) {
+        m[i] = tmp[i];
     }
+}
 
-    const uint32_t src0_gather_len = VLEN * ncs;
-    const uint32_t src1_gather_len = VLEN * d_conv;
+// HVX FP32 SSM_CONV implementation - channel-vectorized HVX kernel with src0/src1
+// transposed into VTCM.
+//
+// VTCM layouts (per thread):
+//   src1_T : {d_inner_per_thread, d_conv}   — staged once per launch (small).
+//   src0_T : {d_inner_tile,     ncs}        — staged per d_inner-tile.
+//
+// d_inner_tile is chosen so that per-thread VTCM stays under the budget.
+// Each thread iterates ceil(d_inner_per_thread d_inner_tile) tiles serially.
+#define HTP_SSM_CONV_VTCM_BUDGET (1u << 20) // 1 MiB per thread
+
+// Scalar transpose: src1 {d_conv, d_inner} (DDR) -> {d_inner_per_thread, d_conv} (VTCM)
+static inline void transpose_src1(const float * src1_data,
+                                  uint32_t      src1_stride_inner,
+                                  uint32_t      i1_off,
+                                  uint32_t      d_inner_per_thread,
+                                  uint32_t      d_conv,
+                                  float *       src1_T) {
+    for (uint32_t i = 0; i < d_inner_per_thread; ++i) {
+        const float * src_row = src1_data + (i1_off + i) * src1_stride_inner;
+        for (uint32_t j = 0; j < d_conv; ++j) {
+            src1_T[j * d_inner_per_thread + i] = src_row[j];
+        }
+    }
+}
 
-    // gather scratchpads
-    HVX_Vector * src0_vec = (HVX_Vector *) (octx->ctx->vtcm_base + ith * VLEN*2 + 0);
-    HVX_Vector * src1_vec = (HVX_Vector *) (octx->ctx->vtcm_base + ith * VLEN*2 + VLEN);
+// HVX 32x32 src0 transpose: src0 {ncs, d_inner} (DDR) -> src0_T {d_inner_tile, ncs} (VTCM)
+static inline void transpose_src0_block(const float * src0_block,
+                                        uint32_t      ncs,
+                                        uint32_t      cb_n,
+                                        uint32_t      d_inner_tile,
+                                        float *       src0_T_block_dst,
+                                        uint32_t      cb /* dst column offset */) {
+    const uint32_t T_TILE = VLEN_FP32;
+
+    HVX_Vector __attribute__((aligned(VLEN))) sub[32];
+
+    for (uint32_t t0 = 0; t0 < ncs; t0 += T_TILE) {
+        const uint32_t t_n = MIN(T_TILE, ncs - t0);
+
+        // Load 32 rows (channels) of T_TILE samples; pad missing channels with zeros.
+        for (uint32_t r = 0; r < cb_n; ++r) {
+            const float * src_row = src0_block + r * ncs + t0;
+            if (t_n == T_TILE) {
+                sub[r] = *(const HVX_UVector *) src_row;
+            } else {
+                HVX_Vector v = hvx_vec_splat_f32(0.0f);
+                hvx_vec_store_u(&v, t_n * sizeof(float), hvx_vec_splat_f32(0.0f));
+
+                float __attribute__((aligned(VLEN))) tmp[VLEN_FP32] = { 0 };
+                for (uint32_t k = 0; k < t_n; ++k) tmp[k] = src_row[k];
+                v = *(const HVX_Vector *) tmp;
+                sub[r] = v;
+            }
+        }
+        for (uint32_t r = cb_n; r < T_TILE; ++r) {
+            sub[r] = hvx_vec_splat_f32(0.0f);
+        }
 
-    float * data_src0 = (float *) ((char *) src0->data + ir0 * src0->nb[1]);
-    float * data_src1 = (float *) ((char *) src1->data + ir0 * src1->nb[1]);
+        hvx_transpose_32x32_f32(sub);
 
-    uint8_t * spad_src0 = octx->src0_spad.data + ith * octx->src0_spad.size_per_thread;
-    uint8_t * spad_src1 = octx->src1_spad.data + ith * octx->src1_spad.size_per_thread;
+        // Store transposed sub-tile to src0_T at offsets (t0 + j) * d_inner_tile + cb.
+        // Only write the valid t_n rows of the transposed result.
+        for (uint32_t r = 0; r < t_n; ++r) {
+            float * dst = src0_T_block_dst + (t0 + r) * d_inner_tile + cb;
+            if (cb_n == T_TILE) {
+                *(HVX_UVector *) dst = sub[r];
+            } else {
+                hvx_vec_store_u(dst, cb_n * sizeof(float), sub[r]);
+            }
+        }
+    }
+}
 
-    // copy src1 workload to VTCM
-    dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src1, data_src1), nb11, nb11, ir);
+static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void *data) {
+    htp_ssm_conv_preamble;
 
-    // FARF(HIGH, "ssm-conv-src1-fetch %d: ir0 %u size %u\n", ith, ir0, nb11 * ir);
+    uint64_t t1, t2;
+    t1 = HAP_perf_get_qtimer_count();
 
-    for (uint32_t i3 = 0; i3 < n_s; ++i3) {
-        float * src0_data_ptr = (float *) ((char *) data_src0 + i3 * (src0->nb[2]));
+    const uint32_t d_conv  = src1->ne[0];
+    const uint32_t d_inner = src0->ne[1];
+    const uint32_t n_t     = dst->ne[1];
+    const uint32_t n_s     = dst->ne[2];
+    const uint32_t ncs     = src0->ne[0];
 
-        // copy src0 workload to VTCM
-        dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0, src0_data_ptr), nb01, nb01, ir);
+    const uint32_t src0_stride_inner = src0->nb[1] / sizeof(float);
+    const uint32_t src0_stride_seq   = src0->nb[2] / sizeof(float);
+    const uint32_t src1_stride_inner = src1->nb[1] / sizeof(float);
+    const uint32_t dst_stride_token  = dst->nb[1]  / sizeof(float);
+    const uint32_t dst_stride_seq    = dst->nb[2]  / sizeof(float);
 
-        // FARF(HIGH, "ssm-conv-src0-fetch %d: ir0 %u i3 %u size %u\n", ith, ir0, i3, nb01 * ir);
+    const uint32_t dr  = scctx->nrows_per_thread;
+    const uint32_t ir0 = dr * ith;
+    const uint32_t ir1 = MIN(ir0 + dr, d_inner);
 
-        dma_queue_flush(dma_queue);
+    if (ir0 >= ir1) {
+        return;
+    }
 
-        for (uint32_t i2 = 0; i2 < n_t; ++i2) {
-            float * dst_ptr = (float *) ((char *) dst->data + ir0 * (dst->nb[0]) + i2 * (dst->nb[1]) + i3 * (dst->nb[2]));
+    const uint32_t d_inner_per_thread = ir1 - ir0;
+    const uint32_t d_inner_tile       = scctx->d_inner_tile;
 
-            const uint32_t nvec = ir / VLEN_FP32;
-            const uint32_t nloe = ir % VLEN_FP32;
-            uint32_t i1 = 0;
+    const float * src0_data = (const float *) src0->data;
+    const float * src1_data = (const float *) src1->data;
+    float       * dst_data  = (float       *) dst->data;
 
-            for (uint32_t vi1 = 0; vi1 < nvec; vi1++) {
-                HVX_Vector acc_vec = Q6_V_vsplat_R(0);
+    // Per-thread VTCM regions.
+    float * src0_T = (float *)(octx->src0_spad.data + ith * octx->src0_spad.size_per_thread);
+    float * src1_T = (float *)(octx->src1_spad.data + ith * octx->src1_spad.size_per_thread);
 
-                for (uint32_t i0 = 0; i0 < d_conv; ++i0) {
-                    uint32_t src0_base = (uint32_t) spad_src0 + (i0 + i1 * ncs) * sizeof(float) + i2 * (src0->nb[0]);
-                    uint32_t src1_base = (uint32_t) spad_src1 + (i0 + i1 * nc)  * sizeof(float);
-                    Q6_vgather_ARMVw(src0_vec, src0_base, src0_gather_len, (*(const HVX_Vector *) src0_offsets));
-                    Q6_vgather_ARMVw(src1_vec, src1_base, src1_gather_len, (*(const HVX_Vector *) src1_offsets));
+    // Stage src1 weights once into VTCM in {d_inner_per_thread, d_conv} layout.
+    transpose_src1(src1_data, src1_stride_inner, ir0, d_inner_per_thread, d_conv, src1_T);
 
-                    HVX_Vector prod = Q6_Vqf32_vmpy_VsfVsf(*(const HVX_Vector *) src0_vec, *(const HVX_Vector *) src1_vec);
-                    acc_vec = Q6_Vqf32_vadd_Vqf32Vqf32(acc_vec, prod);
-                }
+    const uint32_t C_TILE = VLEN_FP32;
 
-                *(HVX_UVector *) (dst_ptr + i1) = Q6_Vsf_equals_Vqf32(acc_vec);
-                i1 += VLEN_FP32;
-            }
+    for (uint32_t i3 = 0; i3 < n_s; ++i3) {
+        for (uint32_t tile_off = 0; tile_off < d_inner_per_thread; tile_off += d_inner_tile) {
+            const uint32_t tile_n = MIN(d_inner_tile, d_inner_per_thread - tile_off);
 
-            if (nloe) {
-                HVX_Vector acc_vec = Q6_V_vsplat_R(0);
+            // Place src0 chunk into VTCM in {d_inner_tile, ncs} layout.
+            const float * src0_block = src0_data + i3 * src0_stride_seq + (ir0 + tile_off) * src0_stride_inner;
 
-                for (uint32_t i0 = 0; i0 < d_conv; ++i0) {
-                    uint32_t src0_base = (uint32_t) spad_src0 + (i0 + i1 * ncs) * sizeof(float) + i2 * (src0->nb[0]);
-                    uint32_t src1_base = (uint32_t) spad_src1 + (i0 + i1 * nc)  * sizeof(float);
-                    Q6_vgather_ARMVw(src0_vec, src0_base, src0_gather_len, (*(const HVX_Vector *) src0_offsets));
-                    Q6_vgather_ARMVw(src1_vec, src1_base, src1_gather_len, (*(const HVX_Vector *) src1_offsets));
+            for (uint32_t cb = 0; cb < tile_n; cb += C_TILE) {
+                const uint32_t cb_n = MIN(C_TILE, tile_n - cb);
+                transpose_src0_block(src0_block + cb * src0_stride_inner, ncs, cb_n, d_inner_tile, src0_T, cb);
+            }
 
-                    HVX_Vector prod = Q6_Vqf32_vmpy_VsfVsf(*(const HVX_Vector *) src0_vec, *(const HVX_Vector *) src1_vec);
-                    acc_vec = Q6_Vqf32_vadd_Vqf32Vqf32(acc_vec, prod);
+            for (uint32_t t = 0; t < n_t; ++t) {
+                for (uint32_t cb = 0; cb < tile_n; cb += C_TILE) {
+                    const uint32_t cb_n = MIN(C_TILE, tile_n - cb);
+
+                    HVX_Vector acc = hvx_vec_splat_f32(0.0f);
+                    for (uint32_t j = 0; j < d_conv; ++j) {
+                        HVX_Vector x = *(const HVX_Vector *) (src0_T + (t + j) * d_inner_tile + cb);
+                        HVX_Vector w = *(const HVX_Vector *) (src1_T + j * d_inner_per_thread + tile_off + cb);
+                        acc          = Q6_Vqf32_vadd_Vqf32Vqf32(acc, Q6_Vqf32_vmpy_VsfVsf(x, w));
+                    }
+                    HVX_Vector res = Q6_Vsf_equals_Vqf32(acc);
+
+                    float * dst_ptr = dst_data + i3 * dst_stride_seq + t * dst_stride_token + (ir0 + tile_off + cb);
+                    if (cb_n == C_TILE) {
+                        *(HVX_UVector *) dst_ptr = res;
+                    } else {
+                        hvx_vec_store_u(dst_ptr, cb_n * sizeof(float), res);
+                    }
                 }
-
-                hvx_vec_store_u(dst_ptr + i1, (ir - i1) * 4, Q6_Vsf_equals_Vqf32(acc_vec));
             }
         }
     }
 
     t2 = HAP_perf_get_qtimer_count();
 
-    FARF(HIGH, "ssm-conv-f32-hvx %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n",
-         ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0, ir1,
+    FARF(HIGH, "ssm-conv-f32-hvx %d/%d: %ux%ux%ux%u (%u:%u) tile=%u * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n",
+         ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0, ir1, d_inner_tile,
          src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1],
          dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
 }
@@ -264,46 +358,44 @@ int op_ssm_conv_f32(struct htp_ops_context * octx) {
 
     if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
         uint32_t use_hvx = 0;
-        if (d_inner >= VLEN_FP32 && d_inner % VLEN_FP32 == 0) {
-            int is_aligned = hex_is_aligned((void *) src0->data, VLEN) &&
-                             hex_is_aligned((void *) src1->data, VLEN) &&
-                             hex_is_aligned((void *) dst->data, VLEN);
-
-            if (is_aligned) {
-                use_hvx = 1;
-            }
+        if (d_inner >= VLEN_FP32 && n_t >= VLEN_FP32) {
+            use_hvx = 1;
         }
 
-        if (use_hvx) {
-            scctx.nrows_per_thread  = (d_inner + n_threads - 1) / n_threads; // d_inner chunks per thread
-            scctx.nrows_per_thread += (scctx.nrows_per_thread & 1); // round up to even
+        scctx.nrows_per_thread  = (d_inner + n_threads - 1) / n_threads;
+        scctx.nrows_per_thread += (scctx.nrows_per_thread & 1);
 
-            octx->src0_spad.size_per_thread = hex_round_up(scctx.nrows_per_thread * nb01, 256);
-            octx->src1_spad.size_per_thread = hex_round_up(scctx.nrows_per_thread * nb11, 256);
-            octx->dst_spad.size_per_thread  = hex_round_up(scctx.nrows_per_thread * sizeof(float), 256);
+        const uint32_t d_inner_per_thread = scctx.nrows_per_thread;
+        const uint32_t ncs                = src0->ne[0];
 
-            octx->src0_spad.size = octx->src0_spad.size_per_thread * n_threads;
-            octx->src1_spad.size = octx->src1_spad.size_per_thread * n_threads;
-            octx->dst_spad.size  = octx->dst_spad.size_per_thread  * n_threads;
+        const uint32_t src1_T_size = hex_round_up(d_conv * d_inner_per_thread * sizeof(float), 256);
+        const uint32_t src0_T_max = HTP_SSM_CONV_VTCM_BUDGET > src1_T_size ? HTP_SSM_CONV_VTCM_BUDGET - src1_T_size : 0;
 
-            // Compute gather scratchpad size for src0 and src1
-            const size_t gather_spad_size = n_threads * VLEN * 2;
+        uint32_t d_inner_tile = (src0_T_max / sizeof(float)) / ncs;
+        d_inner_tile -= (d_inner_tile % VLEN_FP32);
+        if (d_inner_tile == 0) {
+            FARF(HIGH, "ssm_conv-f32: inner tile rounds to 0 (ncs=%u), falling back to scalar\n", ncs);
+            use_hvx = 0;
+        } else {
+            scctx.d_inner_tile = d_inner_tile;
 
-            octx->src0_spad.data = octx->ctx->vtcm_base + gather_spad_size;     octx->src0_spad.src = NULL;
-            octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; octx->src1_spad.src = NULL;
-            octx->dst_spad.data  = octx->src1_spad.data + octx->src1_spad.size; octx->dst_spad.src  = NULL;
+            octx->src0_spad.size_per_thread = hex_round_up(d_inner_tile * ncs * sizeof(float), 256);
+            octx->src1_spad.size_per_thread = src1_T_size;
+            octx->dst_spad.size_per_thread  = 0;
 
-            FARF(HIGH, "ssm_conv-f32: gather-spad:%zu spad-per-thread:(%u:%u:%u) spad-sizes:(%u:%u:%u) spad-data:(%p:%p:%p)\n",
-                gather_spad_size, octx->src0_spad.size_per_thread, octx->src1_spad.size_per_thread,
-                octx->dst_spad.size_per_thread, octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size,
-                octx->src0_spad.data, octx->src1_spad.data, octx->dst_spad.data);
+            octx->src0_spad.size = octx->src0_spad.size_per_thread * n_threads;
+            octx->src1_spad.size = octx->src1_spad.size_per_thread * n_threads;
+            octx->dst_spad.size  = 0;
 
-            const size_t total_spad_size =
-                gather_spad_size + octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;
+            octx->src0_spad.data = octx->ctx->vtcm_base;
+            octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
+            octx->src0_spad.src  = NULL;
+            octx->src1_spad.src  = NULL;
 
-            if (total_spad_size > octx->ctx->vtcm_size) {
-                FARF(HIGH, "ssm_conv-f32: HVX scratchpad size %zu exceeds VTCM size %zu", total_spad_size,
-                     octx->ctx->vtcm_size);
+            const size_t total_spad = octx->src0_spad.size + octx->src1_spad.size;
+            if (total_spad > octx->ctx->vtcm_size) {
+                FARF(HIGH, "ssm_conv-f32: scratchpad %zu exceeds VTCM %zu, falling back to scalar\n",
+                     total_spad, octx->ctx->vtcm_size);
                 use_hvx = 0;
             }
         }
diff --git a/ggml/src/ggml-hexagon/htp/unary-ops.c b/ggml/src/ggml-hexagon/htp/unary-ops.c
index 1ce881353ec..7d0431d8ba8 100644
--- a/ggml/src/ggml-hexagon/htp/unary-ops.c
+++ b/ggml/src/ggml-hexagon/htp/unary-ops.c
@@ -158,6 +158,79 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src,
     }
 }
 
+static void hvx_fast_norm_f32(const uint8_t * restrict src,
+                                  uint8_t * restrict dst,
+                                  uint8_t * restrict pad,
+                                  const int num_elems,
+                                  float     epsilon) {
+    (void)pad;
+
+    const HVX_Vector * restrict v_src = (HVX_Vector *) src;
+    HVX_Vector * restrict v_dst       = (HVX_Vector *) dst;
+
+    const int nvec = num_elems / VLEN_FP32;    // number of full vectors
+    const int nloe = num_elems % VLEN_FP32;    // leftover elements
+
+    // Compute sum of squares and sum of values for full vectors
+    HVX_Vector sum_sq_v = Q6_V_vsplat_R(0x00000000);
+    HVX_Vector sum_x_v  = Q6_V_vsplat_R(0x00000000);
+    HVX_Vector epsilon_v = hvx_vec_splat_f32(epsilon);
+
+    #pragma unroll(4)
+    for (int i = 0; i < nvec; i++) {
+        HVX_Vector v1 = v_src[i];
+        HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
+        sum_sq_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_sq_v, v2);
+        sum_x_v  = Q6_Vqf32_vadd_Vqf32Vqf32(sum_x_v,  Q6_Vqf32_vadd_VsfVsf(v1, Q6_V_vzero()));
+    }
+
+    // Handle tail elements using vectorized ops with masking
+    if (nloe > 0) {
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
+        HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]);
+        HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
+        sum_sq_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_sq_v, v2);
+        sum_x_v  = Q6_Vqf32_vadd_Vqf32Vqf32(sum_x_v,  Q6_Vqf32_vadd_VsfVsf(v1, Q6_V_vzero()));
+    }
+
+    // Reduce HVX sums
+    sum_sq_v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_sq_v));
+    sum_x_v  = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_x_v));
+
+    HVX_Vector t_v            = hvx_vec_splat_f32((float) num_elems);
+    HVX_Vector denom_v        = hvx_vec_inverse_f32(t_v);
+    HVX_Vector mean_sq_v      = Q6_Vqf32_vmpy_VsfVsf(sum_sq_v, denom_v);
+    HVX_Vector mean_x_v       = Q6_Vqf32_vmpy_VsfVsf(sum_x_v,  denom_v);
+    HVX_Vector mean_x_sq_v    = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(mean_x_v), Q6_Vsf_equals_Vqf32(mean_x_v));
+    HVX_Vector var_v          = Q6_Vqf32_vsub_Vqf32Vqf32(mean_sq_v, mean_x_sq_v);
+    HVX_Vector var_epsilon_v  = Q6_Vqf32_vadd_Vqf32Vsf(var_v, epsilon_v);
+
+    // scale = rsqrt(variance + epsilon),  mean_x broadcast for subtraction
+    HVX_Vector scale_v  = hvx_vec_rsqrt_f32(Q6_Vsf_equals_Vqf32(var_epsilon_v));
+    HVX_Vector mean_x_b = hvx_vec_repl_f32(Q6_Vsf_equals_Vqf32(mean_x_v));
+
+    #pragma unroll(4)
+    for (int i = 0; i < nvec; i++) {
+        HVX_Vector v1 = v_src[i];
+        HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v1, mean_x_b);
+        HVX_Vector v3 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v2), scale_v);
+        v_dst[i] = Q6_Vsf_equals_Vqf32(v3);
+    }
+
+    // Handle tail elements using vectorized ops with masking
+    if (nloe > 0) {
+
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
+        HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]);
+        HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v1, mean_x_b);
+        HVX_Vector v3 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v2), scale_v);
+        HVX_Vector result = Q6_Vsf_equals_Vqf32(v3);
+
+        // Store with masking to avoid overwriting memory beyond the tensor
+        hvx_vec_store_a(&v_dst[nvec], nloe * 4, result);
+    }
+}
+
 static void scale_f32(const float * restrict src,
                       float * restrict dst,
                       uint8_t * restrict spad,
@@ -196,6 +269,24 @@ static void rms_norm_f32(const float * restrict src,
     }
 }
 
+static void norm_f32(const float * restrict src,
+                         float * restrict dst,
+                         uint8_t * restrict spad,
+                         const uint32_t num_rows,
+                         const uint32_t row_elems,
+                         const size_t   row_size,
+                         int32_t *      op_params) {
+    float epsilon = 0.f;
+    memcpy(&epsilon, op_params, sizeof(float));
+
+    for (uint32_t ir = 0; ir < num_rows; ir++) {
+        const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size);
+        uint8_t * restrict dst_local       = (uint8_t *)dst + (ir * row_size);
+
+        hvx_fast_norm_f32((const uint8_t *) src_local, (uint8_t *) dst_local, spad, row_elems, epsilon);
+    }
+}
+
 static void sqr_f32(const float * restrict src,
                     float * restrict dst,
                     uint8_t * restrict spad,
@@ -556,6 +647,9 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
 
         // Process block in VTCM
         switch (htp_op) {
+            case HTP_OP_NORM:
+                norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
+                break;
             case HTP_OP_RMS_NORM:
                 rms_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
                 break;
@@ -632,6 +726,9 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
     const char * op_type = NULL;
 
     switch (octx->op) {
+        case HTP_OP_NORM:
+            op_type = "norm-f32";
+            break;
         case HTP_OP_RMS_NORM:
             op_type = "rmsnorm-f32";
             break;
diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp
index e288a27f992..ba006d9b31a 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-device.cpp
@@ -1897,7 +1897,11 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad(ggml_metal_l
     char base[256];
     char name[256];
 
-    snprintf(base, 256, "kernel_pad_%s", ggml_type_name(op->src[0]->type));
+    // note: this is slower
+    //const bool is_c4 = op->src[0]->ne[0] % 4 == 0 && op->ne[0] % 4 == 0;
+    const bool is_c4 = false;
+
+    snprintf(base, 256, "kernel_pad_%s%s", ggml_type_name(op->src[0]->type), is_c4 ? "_4" : "");
     snprintf(name, 256, "%s", base);
 
     ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
@@ -1907,6 +1911,8 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad(ggml_metal_l
 
     res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
 
+    res.c4 = is_c4;
+
     return res;
 }
 
diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h
index 1f212a92f98..4a3ebb5569d 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.h
+++ b/ggml/src/ggml-metal/ggml-metal-device.h
@@ -215,6 +215,30 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets);
 // device
 //
 
+enum ggml_metal_device_id {
+    GGML_METAL_DEVICE_GENERIC = 0,
+
+    GGML_METAL_DEVICE_M1,
+    GGML_METAL_DEVICE_M1_PRO,
+    GGML_METAL_DEVICE_M1_MAX,
+    GGML_METAL_DEVICE_M1_ULTRA,
+    GGML_METAL_DEVICE_M2,
+    GGML_METAL_DEVICE_M2_PRO,
+    GGML_METAL_DEVICE_M2_MAX,
+    GGML_METAL_DEVICE_M2_ULTRA,
+    GGML_METAL_DEVICE_M3,
+    GGML_METAL_DEVICE_M3_PRO,
+    GGML_METAL_DEVICE_M3_MAX,
+    GGML_METAL_DEVICE_M3_ULTRA,
+    GGML_METAL_DEVICE_M4,
+    GGML_METAL_DEVICE_M4_PRO,
+    GGML_METAL_DEVICE_M4_MAX,
+    GGML_METAL_DEVICE_M5,
+    GGML_METAL_DEVICE_M5_PRO,
+    GGML_METAL_DEVICE_M5_MAX,
+    GGML_METAL_DEVICE_M5_ULTRA,
+};
+
 struct ggml_metal_device_props {
     int device;
     char name[128];
@@ -234,6 +258,8 @@ struct ggml_metal_device_props {
 
     bool supports_gpu_family_apple7;
 
+    enum ggml_metal_device_id device_id;
+
     int op_offload_min_batch_size;
 };
 
diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m
index 780dfe81bb3..885344ec670 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -628,6 +628,50 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets) {
     free(rsets);
 }
 
+static enum ggml_metal_device_id ggml_metal_device_id_parse(const char * name) {
+    if (!name) {
+        return GGML_METAL_DEVICE_GENERIC;
+    }
+
+    static const char prefix[] = "Apple ";
+    if (strncmp(name, prefix, sizeof(prefix) - 1) != 0) {
+        return GGML_METAL_DEVICE_GENERIC;
+    }
+    const char * suffix = name + sizeof(prefix) - 1;
+
+    static const struct {
+        const char * name;
+        enum ggml_metal_device_id id;
+    } table[] = {
+        {"M1",       GGML_METAL_DEVICE_M1},
+        {"M1 Pro",   GGML_METAL_DEVICE_M1_PRO},
+        {"M1 Max",   GGML_METAL_DEVICE_M1_MAX},
+        {"M1 Ultra", GGML_METAL_DEVICE_M1_ULTRA},
+        {"M2",       GGML_METAL_DEVICE_M2},
+        {"M2 Pro",   GGML_METAL_DEVICE_M2_PRO},
+        {"M2 Max",   GGML_METAL_DEVICE_M2_MAX},
+        {"M2 Ultra", GGML_METAL_DEVICE_M2_ULTRA},
+        {"M3",       GGML_METAL_DEVICE_M3},
+        {"M3 Pro",   GGML_METAL_DEVICE_M3_PRO},
+        {"M3 Max",   GGML_METAL_DEVICE_M3_MAX},
+        {"M3 Ultra", GGML_METAL_DEVICE_M3_ULTRA},
+        {"M4",       GGML_METAL_DEVICE_M4},
+        {"M4 Pro",   GGML_METAL_DEVICE_M4_PRO},
+        {"M4 Max",   GGML_METAL_DEVICE_M4_MAX},
+        {"M5",       GGML_METAL_DEVICE_M5},
+        {"M5 Pro",   GGML_METAL_DEVICE_M5_PRO},
+        {"M5 Max",   GGML_METAL_DEVICE_M5_MAX},
+        {"M5 Ultra", GGML_METAL_DEVICE_M5_ULTRA},
+    };
+
+    for (size_t i = 0; i < sizeof(table)/sizeof(table[0]); ++i) {
+        if (strcmp(suffix, table[i].name) == 0) {
+            return table[i].id;
+        }
+    }
+    return GGML_METAL_DEVICE_GENERIC;
+}
+
 ggml_metal_device_t ggml_metal_device_init(int device) {
     ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device));
 
@@ -795,6 +839,8 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
 
             dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
 
+            dev->props.device_id = ggml_metal_device_id_parse([[dev->mtl_device name] UTF8String]);
+
             dev->props.op_offload_min_batch_size  = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
             dev->props.max_buffer_size            = dev->mtl_device.maxBufferLength;
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index a114391c2e8..206af227a2c 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -564,9 +564,20 @@ int ggml_metal_op_concat(ggml_metal_op_t ctx, int idx) {
     ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
     ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         3);
 
-    const int nth = std::min(1024, ne0);
+    int nth = std::min(256, ne0);
 
-    ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
+    // when rows are small, we can batch them together in a single threadgroup
+    int nrptg = 1;
+    if (nth < 256) {
+        nrptg = std::min((256 + nth - 1) / nth, ne1);
+        if (nrptg * nth > 256) {
+            nrptg = 256 / nth;
+        }
+    }
+
+    const int nw0 = (ne1 + nrptg - 1) / nrptg;
+
+    ggml_metal_encoder_dispatch_threadgroups(enc, nw0, ne2, ne3, nth, nrptg, 1);
 
     return 1;
 }
@@ -816,9 +827,7 @@ int ggml_metal_op_unary(ggml_metal_op_t ctx, int idx) {
         ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
     } else {
         const int nth_max = MIN(256, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
-
         const int nth = MIN(args.ne00, nth_max);
-
         const int nk0 = (args.ne00 + nth - 1)/nth;
 
         ggml_metal_encoder_dispatch_threadgroups(enc, nk0*ne01, ne02, ne03, nth, 1, 1);
@@ -1788,7 +1797,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) {
         nk0 = ne10/ggml_blck_size(op->type);
     }
 
-    int nth = std::min<int>(nk0, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+    int nth = std::min<int>(nk0*ne11, 256);
 
     // when rows are small, we can batch them together in a single threadgroup
     int nrptg = 1;
@@ -1799,7 +1808,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) {
             nrptg = (nth + nk0 - 1)/nk0;
             nth   = nk0;
 
-            if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+            if (nrptg*nth > 256) {
                 nrptg--;
             }
         }
@@ -1863,7 +1872,7 @@ int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) {
         nk0 = ne00/ggml_blck_size(op->type);
     }
 
-    int nth = std::min<int>(nk0, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+    int nth = std::min<int>(nk0*ne01, 256);
 
     // when rows are small, we can batch them together in a single threadgroup
     int nrptg = 1;
@@ -1874,7 +1883,7 @@ int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) {
             nrptg = (nth + nk0 - 1)/nk0;
             nth   = nk0;
 
-            if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+            if (nrptg*nth > 256) {
                 nrptg--;
             }
         }
@@ -4039,14 +4048,21 @@ int ggml_metal_op_pad(ggml_metal_op_t ctx, int idx) {
 
     auto pipeline = ggml_metal_library_get_pipeline_pad(lib, op);
 
-    const int nth = std::min(1024, ne0);
+    if (pipeline.c4) {
+        args.ne00 = ne00/4;
+        args.ne0  = ne0/4;
+    }
+
+    const int nth_max = MIN(64, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+    const int nth = MIN(args.ne0, nth_max);
+    const int nk0 = (args.ne0 + 1024 - 1)/1024; // note: 1024 is hardcoded in the kernel!
 
     ggml_metal_encoder_set_pipeline(enc, pipeline);
     ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
     ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
     ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         2);
 
-    ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
+    ggml_metal_encoder_dispatch_threadgroups(enc, nk0*ne1, ne2, ne3, nth, 1, 1);
 
     return 1;
 }
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index f6ffb2b3a1c..e772664ba91 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2643,7 +2643,7 @@ kernel void kernel_gated_delta_net_impl(
         b_ptr += args.ne21;
         g_ptr += args.ne21*G;
 
-        if (K > 1u) {
+        if (K > 1) {
             const int target_slot = (int)t - shift;
             if (target_slot >= 0 && target_slot < (int)K) {
                 device float * dst_state = (device float *) (dst) + attn_size + (uint)target_slot * state_size_per_snap + state_out_base;
@@ -2655,7 +2655,7 @@ kernel void kernel_gated_delta_net_impl(
         }
     }
 
-    if (K == 1u) {
+    if (K == 1) {
         device float * dst_state = (device float *) (dst) + attn_size + state_out_base;
         FOR_UNROLL (short j = 0; j < NSG; j++) {
             const short is = tx*NSG + j;
@@ -5104,7 +5104,7 @@ kernel void kernel_upscale_bilinear_f32(
                 for (int64_t sx = x_min; sx < x_max; ++sx) {
                     const float wx = MAX(0.0f, 1.0f - fabs((float)sx - f00) * invscale0);
                     const float w  = wx * wy;
-                    const device const float * src_ptr = (device const float *)(src0 + sy*args.nb01 + sx*args.nb00);
+                    device const float * src_ptr = (device const float *)(src0 + sy*args.nb01 + sx*args.nb00);
                     sum  += (*src_ptr) * w;
                     wsum += w;
                 }
@@ -5286,7 +5286,7 @@ kernel void kernel_upscale_bicubic_f32(
                 const int64_t ix = MAX(0, MIN(args.ne00 - 1, i00 + dx));
                 const float wx = (dx == -1) ? w_x0 : (dx == 0) ? w_x1 : (dx == 1) ? w_x2 : w_x3;
 
-                const device const float * src_ptr = (device const float *)(src_slice + iy * args.nb01 + ix * args.nb00);
+                device const float * src_ptr = (device const float *)(src_slice + iy * args.nb01 + ix * args.nb00);
                 sum += (*src_ptr) * wx * wy;
             }
         }
@@ -5329,42 +5329,46 @@ kernel void kernel_roll_f32(
     }
 }
 
-kernel void kernel_pad_f32(
+template <typename T>
+kernel void kernel_pad_impl(
     constant ggml_metal_kargs_pad & args,
     device  const char * src0,
     device        char * dst,
     uint3 tgpig[[threadgroup_position_in_grid]],
     uint3 tpitg[[thread_position_in_threadgroup]],
     uint3   ntg[[threads_per_threadgroup]]) {
+    const int32_t i3 = tgpig.z;
+    const int32_t i2 = tgpig.y;
+    const int32_t k0 = tgpig.x/args.ne1;
+    const int32_t i1 = tgpig.x - k0*args.ne1;
 
-    const int64_t i3 = tgpig.z;
-    const int64_t i2 = tgpig.y;
-    const int64_t i1 = tgpig.x;
-
-    const int64_t i03 = i3;
-    const int64_t i02 = i2;
-    const int64_t i01 = i1;
+    const int32_t i03 = i3;
+    const int32_t i02 = i2;
+    const int32_t i01 = i1;
 
-    device const float * src0_ptr = (device const float *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01);
-    device       float * dst_ptr  = (device       float *) (dst  +  i3*args.nb3  +  i2*args.nb2  +  i1*args.nb1);
+    device const T * src0_ptr = (device const T *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01);
+    device       T * dst_ptr  = (device       T *) (dst  +  i3*args.nb3  +  i2*args.nb2  +  i1*args.nb1);
 
-    if (i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) {
-        for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
-            if (i0 < args.ne00) {
-                dst_ptr[i0] = src0_ptr[i0];
-            } else {
-                dst_ptr[i0] = 0.0f;
-            }
+    for (int32_t l0 = 0; l0 < 1024; l0 += ntg.x) {
+        const int32_t i0 = k0*1024 + tpitg.x + l0;
+        if (i0 >= args.ne0) {
+            break;
         }
 
-        return;
-    }
-
-    for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
-        dst_ptr[i0] = 0.0f;
+        if (i0 < args.ne00 && i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) {
+            dst_ptr[i0] = src0_ptr[i0];
+        } else {
+            dst_ptr[i0] = 0.0f;
+        }
     }
 }
 
+typedef decltype(kernel_pad_impl<float>) kernel_pad_t;
+
+template [[host_name("kernel_pad_f32")]]   kernel kernel_pad_t kernel_pad_impl<float>;
+template [[host_name("kernel_pad_f32_4")]] kernel kernel_pad_t kernel_pad_impl<float4>;
+
+// TODO: this is slow - optimize
 kernel void kernel_pad_reflect_1d_f32(
     constant   ggml_metal_kargs_pad_reflect_1d & args,
     device  const char * src0,
@@ -7328,23 +7332,27 @@ kernel void kernel_cpy_t_t(
         device  const char * src0,
         device        char * dst,
         uint3   tgpig[[threadgroup_position_in_grid]],
-        ushort  tiitg[[thread_index_in_threadgroup]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
         ushort3   ntg[[threads_per_threadgroup]]) {
-    const int i03 = tgpig[2];
-    const int i02 = tgpig[1];
-    const int i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tiitg/ntg[0];
-    const int iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0;
+    const int32_t i03 = tgpig[2];
+    const int32_t i02 = tgpig[1];
+    const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y;
+    const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0;
+
+    if (i01 >= args.ne01) {
+        return;
+    }
 
     const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
 
-    const int64_t i3 = n/(args.ne2*args.ne1*args.ne0);
-    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0);
-    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0;
-    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0);
+    const int32_t i3 = n/(args.ne2*args.ne1*args.ne0);
+    const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0);
+    const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0;
+    const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0);
 
     device T1 * dst_data = (device T1 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
 
-    for (int64_t i00 = iw0*ntg[0] + tiitg%ntg[0]; i00 < args.ne00; ) {
+    for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.ne00;) {
         device const T0 * src = (device T0 *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00);
         dst_data[i00] = (T1) src[0];
         break;
@@ -7376,23 +7384,27 @@ kernel void kernel_cpy_f32_q(
         device const char * src0,
         device char * dst,
         uint3   tgpig[[threadgroup_position_in_grid]],
-        ushort  tiitg[[thread_index_in_threadgroup]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
         ushort3   ntg[[threads_per_threadgroup]]) {
-    const int i03 = tgpig[2];
-    const int i02 = tgpig[1];
-    const int i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tiitg/ntg[0];
-    const int iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0;
+    const int32_t i03 = tgpig[2];
+    const int32_t i02 = tgpig[1];
+    const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y;
+    const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0;
+
+    if (i01 >= args.ne01) {
+        return;
+    }
 
     const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
 
-    const int64_t i3 = n / (args.ne2*args.ne1*args.ne0);
-    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0);
-    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0;
-    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK;
+    const int32_t i3 = n / (args.ne2*args.ne1*args.ne0);
+    const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0);
+    const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0;
+    const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK;
 
     device block_q * dst_data = (device block_q *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
 
-    for (int64_t i00 = iw0*ntg[0] + tiitg%ntg[0]; i00 < args.nk0; ) {
+    for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.nk0;) {
         device const float * src = (device const float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + (i00*QK)*args.nb00);
 
         quantize_func(src, dst_data[i00]);
@@ -7417,24 +7429,28 @@ kernel void kernel_cpy_q_f32(
         device  const char * src0,
         device        char * dst,
         uint3   tgpig[[threadgroup_position_in_grid]],
-        ushort  tiitg[[thread_index_in_threadgroup]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
         ushort3   ntg[[threads_per_threadgroup]]) {
-    const int i03 = tgpig[2];
-    const int i02 = tgpig[1];
-    const int i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tiitg/ntg[0];
-    const int iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0;
+    const int32_t i03 = tgpig[2];
+    const int32_t i02 = tgpig[1];
+    const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y;
+    const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0;
+
+    if (i01 >= args.ne01) {
+        return;
+    }
 
     const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
 
-    const int64_t i3 = n/(args.ne2*args.ne1*args.ne0);
-    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0);
-    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0;
-    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0);
+    const int32_t i3 = n/(args.ne2*args.ne1*args.ne0);
+    const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0);
+    const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0;
+    const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0);
 
     device const block_q * src_data = (device const block_q *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01);
     device       T4x4    * dst_data = (device       T4x4    *)(dst  +  i3*args.nb3  +  i2*args.nb2  +  i1*args.nb1 + i0*args.nb0);
 
-    for (int64_t i00 = iw0*ntg[0] + tiitg%ntg[0]; i00 < args.nk0; ) {
+    for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.nk0;) {
         T4x4 temp;
         dequantize_func(src_data + i00/nl, i00%nl, temp);
         dst_data[i00] = temp;
@@ -7470,7 +7486,11 @@ kernel void kernel_concat(
 
     const int i3 = tgpig.z;
     const int i2 = tgpig.y;
-    const int i1 = tgpig.x;
+    const int i1 = ntg.y == 1 ? tgpig.x : tgpig.x*ntg.y + tpitg.y;
+
+    if (i1 >= args.ne1) {
+        return;
+    }
 
     int o[4] = {0, 0, 0, 0};
     o[args.dim] = args.dim == 0 ? args.ne00 : (args.dim == 1 ? args.ne01 : (args.dim == 2 ? args.ne02 : args.ne03));
diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt
index c6aba608736..f75d089b574 100644
--- a/ggml/src/ggml-opencl/CMakeLists.txt
+++ b/ggml/src/ggml-opencl/CMakeLists.txt
@@ -110,6 +110,12 @@ set(GGML_OPENCL_KERNELS
     gemv_moe_q5_0_f32_ns
     gemm_moe_q5_1_f32_ns
     gemv_moe_q5_1_f32_ns
+    gemm_moe_q4_k_f32_ns
+    gemv_moe_q4_k_f32_ns
+    gemm_moe_q5_k_f32_ns
+    gemv_moe_q5_k_f32_ns
+    gemm_moe_q6_k_f32_ns
+    gemv_moe_q6_k_f32_ns
     gemm_moe_mxfp4_f32
     gemv_moe_mxfp4_f32
     gemm_moe_mxfp4_f32_ns
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 0e511592d53..42286435bc6 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -375,6 +375,11 @@ struct ggml_backend_opencl_device_context {
     ggml_backend_buffer_type buffer_type;
 
     cl_context context = nullptr;
+
+    GPU_FAMILY     gpu_family = GPU_FAMILY::UNKNOWN;
+    ADRENO_GPU_GEN adreno_gen = ADRENO_GPU_GEN::ADRENO_UNKNOWN;
+
+    size_t global_mem_size = 0;
 };
 
 // backend context
@@ -384,6 +389,18 @@ struct ggml_backend_opencl_context {
     cl_device_id device;
     std::string device_name;
 
+    ggml_cl_version platform_version;
+    ggml_cl_version opencl_c_version;
+
+    // argsort is loaded in supports_op because its availability depends on how
+    // many workgroups are allowed, which requires kernel compilation.
+    bool kernels_loaded_argsort = false;
+    // flash attn is loaded in supports_op because it contains multiple variants
+    // and takes time to compile, so we want to only compile it when needed.
+    bool kernels_loaded_flash_attn = false;
+    // rest of the kernels are currently always loaded in alloc_buffer.
+    bool kernels_loaded = false;
+
     std::string driver_version;
 
     GPU_FAMILY gpu_family;
@@ -558,6 +575,9 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_convert_block_q4_1_trans4_ns, kernel_restore_block_q4_1_trans4_ns;
     cl_kernel kernel_convert_block_q5_0_trans4_ns, kernel_restore_block_q5_0_trans4_ns;
     cl_kernel kernel_convert_block_q5_1_trans4_ns, kernel_restore_block_q5_1_trans4_ns;
+    cl_kernel kernel_convert_block_q4_k_trans4_ns, kernel_restore_block_q4_k_trans4_ns;
+    cl_kernel kernel_convert_block_q5_k_trans4_ns, kernel_restore_block_q5_k_trans4_ns;
+    cl_kernel kernel_convert_block_q6_k_trans4_ns, kernel_restore_block_q6_k_trans4_ns;
     cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans;
     cl_kernel kernel_convert_block_mxfp4_trans4_ns, kernel_restore_block_mxfp4_trans4_ns;
     cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0, kernel_restore_block_q8_0_trans;
@@ -619,6 +639,9 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_gemv_moe_q4_1_f32_ns, kernel_gemm_moe_q4_1_f32_ns;
     cl_kernel kernel_gemv_moe_q5_0_f32_ns, kernel_gemm_moe_q5_0_f32_ns;
     cl_kernel kernel_gemv_moe_q5_1_f32_ns, kernel_gemm_moe_q5_1_f32_ns;
+    cl_kernel kernel_gemv_moe_q4_k_f32_ns, kernel_gemm_moe_q4_k_f32_ns;
+    cl_kernel kernel_gemv_moe_q5_k_f32_ns, kernel_gemm_moe_q5_k_f32_ns;
+    cl_kernel kernel_gemv_moe_q6_k_f32_ns, kernel_gemm_moe_q6_k_f32_ns;
     cl_kernel kernel_gemv_moe_mxfp4_f32, kernel_gemm_moe_mxfp4_f32;
     cl_kernel kernel_gemv_moe_mxfp4_f32_ns, kernel_gemm_moe_mxfp4_f32_ns;
     cl_kernel kernel_moe_reorder_b;
@@ -638,11 +661,10 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mm_iq4_nl_f32_l4_lm;
 
     std::vector<ProfilingInfo> profiling_info;
+    std::vector<ProfilingInfo> profiling_results;
 
-    void write_profiling_info() {
-        FILE * fperf = fopen("cl_profiling.csv", "w");
-        if (!fperf) {
-            GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
+    void flush_profiling_batch() {
+        if (profiling_info.empty()) {
             return;
         }
 
@@ -666,6 +688,7 @@ struct ggml_backend_opencl_context {
             CL_CHECK(clGetEventProfilingInfo(
                 info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
             CL_CHECK(clReleaseEvent(info.evt));
+            info.evt = nullptr;
 
             char kernel_name[512];
             CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
@@ -683,10 +706,26 @@ struct ggml_backend_opencl_context {
             info.cmd_complete_duration_ns   = cmd_complete  - cmd_end;
             info.cmd_total_duration_ns      = cmd_complete  - cmd_queued;
         }
+        profiling_results.insert(profiling_results.end(),
+            std::make_move_iterator(profiling_info.begin()),
+            std::make_move_iterator(profiling_info.end()));
+        profiling_info.clear();
+    }
+
+    void write_profiling_info() {
+        if (profiling_results.empty()) {
+            return;
+        }
 
         // Dump a csv
+        FILE * fperf = fopen("cl_profiling.csv", "w");
+        if (!fperf) {
+            GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
+            return;
+        }
+
         fprintf(fperf, "op name, kernel name, exec duration (ms), global size, local size, output size\n");
-        for (const ProfilingInfo & info : profiling_info) {
+        for (const ProfilingInfo & info : profiling_results) {
             fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
                 info.op_name.c_str(), info.kernel_name.c_str(),
                 info.cmd_duration_ns/1.e6f,
@@ -697,14 +736,14 @@ struct ggml_backend_opencl_context {
         fclose(fperf);
 
         // Dump a simple chrome trace
-        FILE* ftrace = fopen("cl_trace.json", "w");
+        FILE * ftrace = fopen("cl_trace.json", "w");
         if (!ftrace) {
             GGML_LOG_ERROR("Failed to open cl_trace.json\n");
             return;
         }
 
         fprintf(ftrace, "[\n");
-        for (const ProfilingInfo & info : profiling_info) {
+        for (const ProfilingInfo & info : profiling_results) {
             fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n",
                 info.kernel_name.c_str(), info.cmd_queued/1000);
             fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n",
@@ -715,6 +754,7 @@ struct ggml_backend_opencl_context {
             fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Device\"},\n",
                 info.kernel_name.c_str(), info.cmd_end/1000);
         }
+        fprintf(ftrace, "]\n");
         fclose(ftrace);
     }
 
@@ -735,6 +775,9 @@ struct ggml_backend_opencl_context {
 
         profiling_info.emplace_back();
         populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor);
+        if (profiling_info.size() >= 2048) {
+            flush_profiling_batch();
+        }
 #else
         GGML_UNUSED(tensor);
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL));
@@ -775,11 +818,13 @@ struct ggml_backend_opencl_context {
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
 
     void free() {
+        clFinish(queue);
+
         ref_count--;
         if (ref_count == 0) {
 #ifdef GGML_OPENCL_PROFILING
             write_profiling_info();
-            profiling_info.clear();
+            profiling_results.clear();
 #endif
         }
     }
@@ -787,6 +832,9 @@ struct ggml_backend_opencl_context {
 
 // All registered devices with a default device in the front.
 static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
+// All device contexts associated with the devices above.
+// The devices live as long as the process, so do the contexts.
+static std::vector<std::unique_ptr<ggml_backend_opencl_device_context>> g_ggml_backend_opencl_dev_ctxs;
 
 inline std::string read_file(const std::string &path) {
   std::ifstream ifs(path);
@@ -830,12 +878,120 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
     return p;
 }
 
-static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_version opencl_c_version) {
+static void load_cl_kernels_argsort(ggml_backend_opencl_context *backend_ctx) {
+    // compiler options for general kernels
+    auto opencl_c_std =
+        std::string("CL") + std::to_string(backend_ctx->opencl_c_version.major) + "." + std::to_string(backend_ctx->opencl_c_version.minor);
+    std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
+                               " -cl-mad-enable -cl-unsafe-math-optimizations"
+                               " -cl-finite-math-only -cl-fast-relaxed-math";
+
+    // argsort
+    if (!backend_ctx->kernels_loaded_argsort) {
+        cl_int err;
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "argsort.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("argsort.cl");
+#endif
+        backend_ctx->program_argsort_f32_i32 =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
+        backend_ctx->kernels_loaded_argsort = true;
+    }
+}
+
+static void load_cl_kernels_flash_attn(ggml_backend_opencl_context *backend_ctx) {
+    // compiler options for general kernels
+    auto opencl_c_std =
+        std::string("CL") + std::to_string(backend_ctx->opencl_c_version.major) + "." + std::to_string(backend_ctx->opencl_c_version.minor);
+    std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
+                               " -cl-mad-enable -cl-unsafe-math-optimizations"
+                               " -cl-finite-math-only -cl-fast-relaxed-math";
+
+    // flash_attn
+    if (!backend_ctx->kernels_loaded_flash_attn) {
+        cl_int err;
+
+        #ifdef GGML_OPENCL_EMBED_KERNELS
+                const std::string kernel_src_f16 {
+                    #include "flash_attn_f16.cl.h"
+                };
+                const std::string kernel_src_f32 {
+                    #include "flash_attn_f32.cl.h"
+                };
+                const std::string kernel_src_f32_f16 {
+                    #include "flash_attn_f32_f16.cl.h"
+                };
+        #else
+                const std::string kernel_src_f16 = read_file("flash_attn_f16.cl");
+                const std::string kernel_src_f32 = read_file("flash_attn_f32.cl");
+                const std::string kernel_src_f32_f16 = read_file("flash_attn_f32_f16.cl");
+        #endif
+
+        if (!kernel_src_f16.empty() && !kernel_src_f32.empty() && !kernel_src_f32_f16.empty()) {
+            const struct { int dk; int dv; int bm; int bn; } fa_dims[] = {
+                { 40,  40, 32, 32}, { 64,  64, 64, 64}, { 80,  80, 64, 32}, { 96,  96, 64, 32},
+                {112, 112, 32, 32}, {128, 128, 32, 32}, {192, 128, 16, 16},
+                {192, 192, 16, 16}, {256, 256, 16, 16},
+            };
+
+            for (size_t i = 0; i < sizeof(fa_dims)/sizeof(fa_dims[0]); ++i) {
+                const int dk = fa_dims[i].dk;
+                const int dv = fa_dims[i].dv;
+                const int bm = fa_dims[i].bm;
+                const int bn = fa_dims[i].bn;
+                std::string OPTS = compile_opts +
+                    " -D DK=" + std::to_string(dk) +
+                    " -D DV=" + std::to_string(dv) +
+                    " -D BLOCK_M=" + std::to_string(bm) +
+                    " -D BLOCK_N=" + std::to_string(bn);
+
+                cl_program prog_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f16.c_str(), OPTS);
+                cl_kernel k_f16, k_f16_q1;
+                CL_CHECK((k_f16 = clCreateKernel(prog_f16, "flash_attn_f16", &err), err));
+                CL_CHECK((k_f16_q1 = clCreateKernel(prog_f16, "flash_attn_f16_q1", &err), err));
+                backend_ctx->kernels_flash_attn_f16[{dk, dv}] = k_f16;
+                backend_ctx->kernels_flash_attn_f16_q1[{dk, dv}] = k_f16_q1;
+                CL_CHECK(clReleaseProgram(prog_f16));
+
+                cl_program prog_f32 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32.c_str(), OPTS);
+                cl_kernel k_f32, k_f32_q1;
+                CL_CHECK((k_f32 = clCreateKernel(prog_f32, "flash_attn_f32", &err), err));
+                CL_CHECK((k_f32_q1 = clCreateKernel(prog_f32, "flash_attn_f32_q1", &err), err));
+                backend_ctx->kernels_flash_attn_f32[{dk, dv}] = k_f32;
+                backend_ctx->kernels_flash_attn_f32_q1[{dk, dv}] = k_f32_q1;
+                CL_CHECK(clReleaseProgram(prog_f32));
+
+                cl_program prog_f32_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32_f16.c_str(), OPTS);
+                cl_kernel k_f32_f16, k_f32_f16_q1;
+                CL_CHECK((k_f32_f16 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16", &err), err));
+                CL_CHECK((k_f32_f16_q1 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16_q1", &err), err));
+                backend_ctx->kernels_flash_attn_f32_f16[{dk, dv}] = k_f32_f16;
+                backend_ctx->kernels_flash_attn_f32_f16_q1[{dk, dv}] = k_f32_f16_q1;
+                CL_CHECK(clReleaseProgram(prog_f32_f16));
+
+                backend_ctx->kernels_flash_attn_bm[{dk, dv}] = bm;
+                backend_ctx->kernels_flash_attn_bn[{dk, dv}] = bn;
+            }
+            backend_ctx->kernels_loaded_flash_attn = true;
+        }
+    }
+}
+
+static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
+    if (backend_ctx->kernels_loaded) {
+        return;
+    }
+
     cl_int err;
 
     // compiler options for general kernels
     auto opencl_c_std =
-        std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
+        std::string("CL") + std::to_string(backend_ctx->opencl_c_version.major) + "." + std::to_string(backend_ctx->opencl_c_version.minor);
     std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
                                " -cl-mad-enable -cl-unsafe-math-optimizations"
                                " -cl-finite-math-only -cl-fast-relaxed-math";
@@ -981,6 +1137,12 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         CL_CHECK((backend_ctx->kernel_restore_block_q5_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_0_trans4_ns", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_q5_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_1_trans4_ns", &err), err));
         CL_CHECK((backend_ctx->kernel_restore_block_q5_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_1_trans4_ns", &err), err));
+        CL_CHECK((backend_ctx->kernel_convert_block_q4_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_k_trans4_ns", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q4_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_k_trans4_ns", &err), err));
+        CL_CHECK((backend_ctx->kernel_convert_block_q5_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_k_trans4_ns", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q5_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_k_trans4_ns", &err), err));
+        CL_CHECK((backend_ctx->kernel_convert_block_q6_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_k_trans4_ns", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q6_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_k_trans4_ns", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4_trans", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_mxfp4_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4_trans4_ns", &err), err));
@@ -1974,89 +2136,6 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
-    // flash_attn
-    {
-        #ifdef GGML_OPENCL_EMBED_KERNELS
-                const std::string kernel_src_f16 {
-                    #include "flash_attn_f16.cl.h"
-                };
-                const std::string kernel_src_f32 {
-                    #include "flash_attn_f32.cl.h"
-                };
-                const std::string kernel_src_f32_f16 {
-                    #include "flash_attn_f32_f16.cl.h"
-                };
-        #else
-                const std::string kernel_src_f16 = read_file("flash_attn_f16.cl");
-                const std::string kernel_src_f32 = read_file("flash_attn_f32.cl");
-                const std::string kernel_src_f32_f16 = read_file("flash_attn_f32_f16.cl");
-        #endif
-
-        if (!kernel_src_f16.empty() && !kernel_src_f32.empty() && !kernel_src_f32_f16.empty()) {
-            const struct { int dk; int dv; int bm; int bn; } fa_dims[] = {
-                { 40,  40, 32, 32}, { 64,  64, 64, 64}, { 80,  80, 64, 32}, { 96,  96, 64, 32},
-                {112, 112, 32, 32}, {128, 128, 32, 32}, {192, 128, 16, 16},
-                {192, 192, 16, 16}, {256, 256, 16, 16},
-            };
-
-            for (size_t i = 0; i < sizeof(fa_dims)/sizeof(fa_dims[0]); ++i) {
-                const int dk = fa_dims[i].dk;
-                const int dv = fa_dims[i].dv;
-                const int bm = fa_dims[i].bm;
-                const int bn = fa_dims[i].bn;
-                std::string OPTS = compile_opts +
-                    " -D DK=" + std::to_string(dk) +
-                    " -D DV=" + std::to_string(dv) +
-                    " -D BLOCK_M=" + std::to_string(bm) +
-                    " -D BLOCK_N=" + std::to_string(bn);
-
-                cl_program prog_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f16.c_str(), OPTS);
-                cl_kernel k_f16, k_f16_q1;
-                CL_CHECK((k_f16 = clCreateKernel(prog_f16, "flash_attn_f16", &err), err));
-                CL_CHECK((k_f16_q1 = clCreateKernel(prog_f16, "flash_attn_f16_q1", &err), err));
-                backend_ctx->kernels_flash_attn_f16[{dk, dv}] = k_f16;
-                backend_ctx->kernels_flash_attn_f16_q1[{dk, dv}] = k_f16_q1;
-                CL_CHECK(clReleaseProgram(prog_f16));
-
-                cl_program prog_f32 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32.c_str(), OPTS);
-                cl_kernel k_f32, k_f32_q1;
-                CL_CHECK((k_f32 = clCreateKernel(prog_f32, "flash_attn_f32", &err), err));
-                CL_CHECK((k_f32_q1 = clCreateKernel(prog_f32, "flash_attn_f32_q1", &err), err));
-                backend_ctx->kernels_flash_attn_f32[{dk, dv}] = k_f32;
-                backend_ctx->kernels_flash_attn_f32_q1[{dk, dv}] = k_f32_q1;
-                CL_CHECK(clReleaseProgram(prog_f32));
-
-                cl_program prog_f32_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32_f16.c_str(), OPTS);
-                cl_kernel k_f32_f16, k_f32_f16_q1;
-                CL_CHECK((k_f32_f16 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16", &err), err));
-                CL_CHECK((k_f32_f16_q1 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16_q1", &err), err));
-                backend_ctx->kernels_flash_attn_f32_f16[{dk, dv}] = k_f32_f16;
-                backend_ctx->kernels_flash_attn_f32_f16_q1[{dk, dv}] = k_f32_f16_q1;
-                CL_CHECK(clReleaseProgram(prog_f32_f16));
-
-                backend_ctx->kernels_flash_attn_bm[{dk, dv}] = bm;
-                backend_ctx->kernels_flash_attn_bn[{dk, dv}] = bn;
-            }
-            GGML_LOG_CONT(".");
-        }
-    }
-
-    // argsort
-    {
-#ifdef GGML_OPENCL_EMBED_KERNELS
-        const std::string kernel_src {
-            #include "argsort.cl.h"
-        };
-#else
-        const std::string kernel_src = read_file("argsort.cl");
-#endif
-        backend_ctx->program_argsort_f32_i32 =
-            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
-
-        CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
-        GGML_LOG_CONT(".");
-    }
-
     // div
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -3071,133 +3150,235 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
-    // gemv_moe_mxfp4_f32_ns
+    // gemv_moe_q4_k_f32_ns
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
         const std::string kernel_src {
-            #include "gemv_moe_mxfp4_f32_ns.cl.h"
+            #include "gemv_moe_q4_k_f32_ns.cl.h"
         };
 #else
-        const std::string kernel_src = read_file("gemv_moe_mxfp4_f32_ns.cl");
+        const std::string kernel_src = read_file("gemv_moe_q4_k_f32_ns.cl");
 #endif
         cl_program prog =
             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
 
-        CL_CHECK((backend_ctx->kernel_gemv_moe_mxfp4_f32_ns = clCreateKernel(prog, "kernel_gemv_moe_mxfp4_f32_ns", &err), err));
+        CL_CHECK((backend_ctx->kernel_gemv_moe_q4_k_f32_ns = clCreateKernel(prog, "kernel_gemv_moe_q4_k_f32_ns", &err), err));
         CL_CHECK(clReleaseProgram(prog));
         GGML_LOG_CONT(".");
     }
 
-    // gemm_moe_mxfp4_f32_ns
+    // gemm_moe_q4_k_f32_ns
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
         const std::string kernel_src {
-            #include "gemm_moe_mxfp4_f32_ns.cl.h"
+            #include "gemm_moe_q4_k_f32_ns.cl.h"
         };
 #else
-        const std::string kernel_src = read_file("gemm_moe_mxfp4_f32_ns.cl");
+        const std::string kernel_src = read_file("gemm_moe_q4_k_f32_ns.cl");
 #endif
         cl_program prog =
             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
 
-        CL_CHECK((backend_ctx->kernel_gemm_moe_mxfp4_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_mxfp4_f32_ns", &err), err));
+        CL_CHECK((backend_ctx->kernel_gemm_moe_q4_k_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_q4_k_f32_ns", &err), err));
         CL_CHECK(clReleaseProgram(prog));
         GGML_LOG_CONT(".");
     }
 
-    // moe_reorder_b
+    // gemv_moe_q5_k_f32_ns
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
         const std::string kernel_src {
-            #include "moe_reorder_b.cl.h"
+            #include "gemv_moe_q5_k_f32_ns.cl.h"
         };
 #else
-        const std::string kernel_src = read_file("moe_reorder_b.cl");
+        const std::string kernel_src = read_file("gemv_moe_q5_k_f32_ns.cl");
 #endif
         cl_program prog =
             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
 
-        CL_CHECK((backend_ctx->kernel_moe_reorder_b = clCreateKernel(prog, "kernel_moe_reorder_b", &err), err));
+        CL_CHECK((backend_ctx->kernel_gemv_moe_q5_k_f32_ns = clCreateKernel(prog, "kernel_gemv_moe_q5_k_f32_ns", &err), err));
         CL_CHECK(clReleaseProgram(prog));
         GGML_LOG_CONT(".");
     }
 
-    // moe_sort_by_expert
+    // gemm_moe_q5_k_f32_ns
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
         const std::string kernel_src {
-            #include "moe_sort_by_expert.cl.h"
+            #include "gemm_moe_q5_k_f32_ns.cl.h"
         };
 #else
-        const std::string kernel_src = read_file("moe_sort_by_expert.cl");
+        const std::string kernel_src = read_file("gemm_moe_q5_k_f32_ns.cl");
 #endif
         cl_program prog =
             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
 
-        CL_CHECK((backend_ctx->kernel_moe_histogram = clCreateKernel(prog, "kernel_moe_histogram", &err), err));
-        CL_CHECK((backend_ctx->kernel_moe_scan = clCreateKernel(prog, "kernel_moe_scan", &err), err));
-        CL_CHECK((backend_ctx->kernel_moe_fill = clCreateKernel(prog, "kernel_moe_fill", &err), err));
-        CL_CHECK((backend_ctx->kernel_moe_scatter = clCreateKernel(prog, "kernel_moe_scatter", &err), err));
+        CL_CHECK((backend_ctx->kernel_gemm_moe_q5_k_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_q5_k_f32_ns", &err), err));
         CL_CHECK(clReleaseProgram(prog));
         GGML_LOG_CONT(".");
     }
 
-    // gemv_noshuffle_q6_k_f32
+    // gemv_moe_q6_k_f32_ns
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
         const std::string kernel_src {
-            #include "gemv_noshuffle_q6_k_f32.cl.h"
+            #include "gemv_moe_q6_k_f32_ns.cl.h"
         };
 #else
-        const std::string kernel_src = read_file("gemv_noshuffle_q6_k_f32.cl");
+        const std::string kernel_src = read_file("gemv_moe_q6_k_f32_ns.cl");
 #endif
-
-        std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
-                                       " -cl-mad-enable ";
-        if (backend_ctx->has_vector_subgroup_broadcast) {
-            CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
-        }
-
         cl_program prog =
-            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_gemv_compile_opts);
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
 
-        CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q6_K_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q6_K_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_gemv_moe_q6_k_f32_ns = clCreateKernel(prog, "kernel_gemv_moe_q6_k_f32_ns", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
         GGML_LOG_CONT(".");
     }
 
-    // gemm_noshuffle_q6_k_f32
+    // gemm_moe_q6_k_f32_ns
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
         const std::string kernel_src {
-            #include "gemm_noshuffle_q6_k_f32.cl.h"
+            #include "gemm_moe_q6_k_f32_ns.cl.h"
         };
 #else
-        const std::string kernel_src = read_file("gemm_noshuffle_q6_k_f32.cl");
+        const std::string kernel_src = read_file("gemm_moe_q6_k_f32_ns.cl");
 #endif
         cl_program prog =
             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
 
-        CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q6_K_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q6_K_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_gemm_moe_q6_k_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_q6_k_f32_ns", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
         GGML_LOG_CONT(".");
     }
 
-    // gemv_noshuffle_q5_k_f32
+    // gemv_moe_mxfp4_f32_ns
     {
-        std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
-                                       " -cl-mad-enable ";
-        if (backend_ctx->has_vector_subgroup_broadcast) {
-            CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST ";
-        }
-
 #ifdef GGML_OPENCL_EMBED_KERNELS
         const std::string kernel_src {
-            #include "gemv_noshuffle_q5_k_f32.cl.h"
+            #include "gemv_moe_mxfp4_f32_ns.cl.h"
         };
 #else
-        const std::string kernel_src = read_file("gemv_noshuffle_q5_k_f32.cl");
+        const std::string kernel_src = read_file("gemv_moe_mxfp4_f32_ns.cl");
 #endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
 
-        cl_program prog = build_program_from_source(
+        CL_CHECK((backend_ctx->kernel_gemv_moe_mxfp4_f32_ns = clCreateKernel(prog, "kernel_gemv_moe_mxfp4_f32_ns", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
+    // gemm_moe_mxfp4_f32_ns
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "gemm_moe_mxfp4_f32_ns.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("gemm_moe_mxfp4_f32_ns.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_gemm_moe_mxfp4_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_mxfp4_f32_ns", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
+    // moe_reorder_b
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "moe_reorder_b.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("moe_reorder_b.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_moe_reorder_b = clCreateKernel(prog, "kernel_moe_reorder_b", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
+    // moe_sort_by_expert
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "moe_sort_by_expert.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("moe_sort_by_expert.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_moe_histogram = clCreateKernel(prog, "kernel_moe_histogram", &err), err));
+        CL_CHECK((backend_ctx->kernel_moe_scan = clCreateKernel(prog, "kernel_moe_scan", &err), err));
+        CL_CHECK((backend_ctx->kernel_moe_fill = clCreateKernel(prog, "kernel_moe_fill", &err), err));
+        CL_CHECK((backend_ctx->kernel_moe_scatter = clCreateKernel(prog, "kernel_moe_scatter", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
+    // gemv_noshuffle_q6_k_f32
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "gemv_noshuffle_q6_k_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("gemv_noshuffle_q6_k_f32.cl");
+#endif
+
+        std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                                       " -cl-mad-enable ";
+        if (backend_ctx->has_vector_subgroup_broadcast) {
+            CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+        }
+
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_gemv_compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q6_K_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q6_K_f32", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // gemm_noshuffle_q6_k_f32
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "gemm_noshuffle_q6_k_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("gemm_noshuffle_q6_k_f32.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q6_K_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q6_K_f32", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // gemv_noshuffle_q5_k_f32
+    {
+        std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                                       " -cl-mad-enable ";
+        if (backend_ctx->has_vector_subgroup_broadcast) {
+            CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST ";
+        }
+
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "gemv_noshuffle_q5_k_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("gemv_noshuffle_q5_k_f32.cl");
+#endif
+
+        cl_program prog = build_program_from_source(
             backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_gemv_compile_opts);
 
         CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q5_k_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q5_k_f32", &err), err));
@@ -3221,13 +3402,15 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
     }
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
     GGML_LOG_CONT("\n");
+    backend_ctx->kernels_loaded = true;
 }
 
 // XXX static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
 // XXX    static bool initialized = false;
 // XXX    static ggml_backend_opencl_context *backend_ctx = nullptr;
 
-static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev);
+static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev);
+static bool ggml_opencl_is_device_supported(ggml_backend_dev_t dev);
 
 namespace /* anonymous */ {
 extern struct ggml_backend_device_i ggml_backend_opencl_device_i;
@@ -3440,13 +3623,13 @@ static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_r
             /* .context = */ dev_ctx.get(),
         });
 
-        if (!ggml_cl2_init(&found_devices.back())) {
+        if (!ggml_opencl_is_device_supported(&found_devices.back())) {
             found_devices.pop_back();
-            GGML_LOG_INFO("ggml_opencl: drop unsupported device.\n");
+            GGML_LOG_WARN("ggml_opencl: drop unsupported device '%s'.\n", dev->name);
             continue;
         }
 
-        dev_ctx.release();
+        g_ggml_backend_opencl_dev_ctxs.push_back(std::move(dev_ctx));
     }
 
     if (found_devices.size()) {
@@ -3463,8 +3646,79 @@ static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_r
     return found_devices;
 }
 
+// check if device should be accepted
+static bool ggml_opencl_is_device_supported(ggml_backend_dev_t dev) {
+    GGML_ASSERT(dev);
+    GGML_ASSERT(dev->context);
+
+    ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
+    GGML_ASSERT(dev_ctx->platform);
+    GGML_ASSERT(dev_ctx->device);
+
+    if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
+        strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
+        strstr(dev_ctx->device_version.c_str(), "Adreno")) {
+        dev_ctx->gpu_family = GPU_FAMILY::ADRENO;
+
+        // Usually device version contains the detailed device name
+        dev_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str());
+        if (dev_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
+            dev_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str());
+        }
+    } else if (strstr(dev_ctx->device_name.c_str(), "Intel")) {
+        dev_ctx->gpu_family = GPU_FAMILY::INTEL;
+    } else {
+        GGML_LOG_WARN("ggml_opencl: unsupported GPU '%s'.\n", dev_ctx->device_name.c_str());
+        dev_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
+        return false;
+    }
+
+    ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform);
+
+    // Check device OpenCL version, OpenCL 2.0 or above is required
+    ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, dev_ctx->device);
+    if (opencl_c_version.major < 2) {
+        GGML_LOG_WARN("ggml_opencl: OpenCL 2.0 or above is required\n");
+        return false;
+    }
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+    if (dev_ctx->gpu_family != GPU_FAMILY::ADRENO) {
+        GGML_LOG_WARN("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
+            "run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
+        return false;
+    }
+#endif
+
+    size_t ext_str_size;
+    clGetDeviceInfo(dev_ctx->device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
+
+    char *ext_buffer = (char *)alloca(ext_str_size + 1);
+    clGetDeviceInfo(dev_ctx->device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
+    ext_buffer[ext_str_size] = '\0';
+
+    // Check if ext_buffer contains cl_khr_fp16
+    bool fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
+    if (!fp16_support) {
+        GGML_LOG_WARN("ggml_opencl: device does not support FP16\n");
+        return false;
+    }
+
+    // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
+    // optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
+    if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
+        strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
+        GGML_LOG_WARN("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
+            "(note that subgroups is an optional feature in OpenCL 3.0)\n");
+        return false;
+    }
+
+    clGetDeviceInfo(dev_ctx->device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &dev_ctx->global_mem_size, NULL);
+    return true;
+}
+
 // Initialize device if it is supported (returns nullptr if it is not).
-static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
+static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev) {
     GGML_ASSERT(dev);
     GGML_ASSERT(dev->context);
 
@@ -3486,33 +3740,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     // when the associated device is initialized
     backend_ctx->ref_count  = 0;
 
-    if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
-        strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
-        strstr(dev_ctx->device_version.c_str(), "Adreno")) {
-        backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
-        // Usually device version contains the detailed device name
-        backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str());
-        if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
-            backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str());
-        }
-
+    backend_ctx->gpu_family = dev_ctx->gpu_family;
+    backend_ctx->adreno_gen = dev_ctx->adreno_gen;
+    if (backend_ctx->gpu_family == GPU_FAMILY::ADRENO) {
         // Use wave size of 64 for all Adreno GPUs.
         backend_ctx->adreno_wave_size = 64;
-    } else if (strstr(dev_ctx->device_name.c_str(), "Intel")) {
-        backend_ctx->gpu_family = GPU_FAMILY::INTEL;
-    } else {
-        GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str());
-        backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
-        return nullptr;
-    }
-
-#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
-    if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
-        GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
-            "run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
-        return nullptr;
     }
-#endif
 
     // Populate backend device name
     backend_ctx->device_name = dev_ctx->device_name;
@@ -3521,13 +3754,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     cl_device_id device = backend_ctx->device;
 
     ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform);
-
-    // Check device OpenCL version, OpenCL 2.0 or above is required
     ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
-    if (opencl_c_version.major < 2) {
-        GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
-        return nullptr;
-    }
+
+    backend_ctx->platform_version = platform_version;
+    backend_ctx->opencl_c_version = opencl_c_version;
 
     // Check driver version
     size_t driver_version_str_size;
@@ -3550,34 +3780,21 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     char *ext_buffer = (char *)alloca(ext_str_size + 1);
     clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
     ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
+
     // Check if ext_buffer contains cl_khr_fp16
     backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
     GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false");
+
     // check Adreno large buffer support
     backend_ctx->adreno_has_large_buffer = strstr(ext_buffer, "cl_qcom_large_buffer") != NULL;
 
-    // fp16 is required
-    if (!backend_ctx->fp16_support) {
-        GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n");
-        return nullptr;
-    }
-
-    // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
-    // optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
-    if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
-        strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
-        GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
-            "(note that subgroups is an optional feature in OpenCL 3.0)\n");
-        return nullptr;
-    }
-
     cl_uint base_align_in_bits;
     CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &base_align_in_bits, NULL));
     GGML_ASSERT(base_align_in_bits % 8u == 0);
     backend_ctx->alignment = base_align_in_bits / 8u;
     GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);
 
-    clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &backend_ctx->global_mem_size, NULL);
+    backend_ctx->global_mem_size = dev_ctx->global_mem_size;
     GGML_LOG_INFO("ggml_opencl: global mem size: %zu MB\n", backend_ctx->global_mem_size/1024/1024);
 
     clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
@@ -3665,8 +3882,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
 #endif
     CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err));
 
-    // Load kernels
-    load_cl_kernels(backend_ctx.get(), opencl_c_version);
+    // delay kernel loading until the first buffer is created
+    // load_cl_kernels(backend_ctx.get());
 
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // Allocate intermediate buffers and images
@@ -3708,22 +3925,9 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     return dev_ctx->backend_ctx;
 }
 
-static void ggml_cl2_free(ggml_backend_t backend) {
+static void ggml_cl_free(ggml_backend_t backend) {
     ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context;
     ctx->free();
-
-    // The CL context is shared by all backends, release it if all backends have been released
-    bool should_release_opencl = true;
-    for (auto device : g_ggml_backend_opencl_devices) {
-        ggml_backend_opencl_device_context * ctx_dev = (ggml_backend_opencl_device_context *) device.context;
-        if (ctx_dev->backend_ctx->ref_count > 0) {
-            should_release_opencl = false;
-        }
-    }
-
-    if (should_release_opencl) {
-        CL_CHECK(clReleaseContext(ctx->context));
-    }
 }
 
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
@@ -4148,6 +4352,8 @@ struct ggml_tensor_extra_cl_iq4_nl {
 struct ggml_tensor_extra_cl_q4_K {
     // Quantized values
     cl_mem q = nullptr;
+    // Quantized values in image1d_buffer_t.
+    cl_mem q_img = nullptr;
     // Scales for each super block.
     cl_mem s  = nullptr;
     // Scales
@@ -4176,12 +4382,18 @@ struct ggml_tensor_extra_cl_q4_K {
             CL_CHECK(clReleaseMemObject(dm));
             dm = nullptr;
         }
+        if (q_img != nullptr) {
+            CL_CHECK(clReleaseMemObject(q_img));
+            q_img = nullptr;
+        }
     }
 };
 
 struct ggml_tensor_extra_cl_q5_K {
     // Lower 4 bits of quantized weights.
     cl_mem q  = nullptr;
+    // Quantized values in image1d_buffer_t.
+    cl_mem q_img = nullptr;
     // Upper 1 bit of quantized weights.
     cl_mem qh = nullptr;
     // Scales for each block.
@@ -4222,6 +4434,10 @@ struct ggml_tensor_extra_cl_q5_K {
             CL_CHECK(clReleaseMemObject(dm));
             dm = nullptr;
         }
+        if (q_img != nullptr) {
+            CL_CHECK(clReleaseMemObject(q_img));
+            q_img = nullptr;
+        }
 
         size_q  = 0;
         size_qh = 0;
@@ -4234,6 +4450,8 @@ struct ggml_tensor_extra_cl_q5_K {
 struct ggml_tensor_extra_cl_q6_K {
     // Lower 4 bits of quantized weights.
     cl_mem ql = nullptr;
+    // Lower 4 bits as image1d_buffer_t
+    cl_mem ql_img = nullptr;
     // Upper 2 bits of quantized weights.
     cl_mem qh = nullptr;
     // Scales for each block.
@@ -4267,6 +4485,10 @@ struct ggml_tensor_extra_cl_q6_K {
             CL_CHECK(clReleaseMemObject(d));
             d = nullptr;
         }
+        if (ql_img != nullptr) {
+            CL_CHECK(clReleaseMemObject(ql_img));
+            ql_img = nullptr;
+        }
 
         size_ql = 0;
         size_qh = 0;
@@ -4289,7 +4511,7 @@ static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
 }
 
 static void ggml_backend_opencl_free(ggml_backend_t backend) {
-    ggml_cl2_free(backend);
+    ggml_cl_free(backend);
 }
 
 static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -4328,14 +4550,17 @@ static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
 // enqueued to it won't start until commands in the other devices have
 // completed.
 static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) {
-    if (g_ggml_backend_opencl_devices.size() < 2)
-      return; // No other devices to synchronize with.
+    if (g_ggml_backend_opencl_devices.size() < 2) {
+        return; // No other devices to synchronize with.
+    }
 
     std::vector<cl_event> events;
     events.reserve(g_ggml_backend_opencl_devices.size());
 
     for (ggml_backend_device & backend_dev : g_ggml_backend_opencl_devices) {
-        auto * other_backend_ctx = ggml_cl2_init(&backend_dev);
+        ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) backend_dev.context;
+        auto * other_backend_ctx = dev_ctx->backend_ctx;
+
         if (backend_ctx != other_backend_ctx) {
             cl_event ev;
             CL_CHECK(clEnqueueMarkerWithWaitList(other_backend_ctx->queue, 0, nullptr, &ev));
@@ -4488,7 +4713,7 @@ inline bool use_adreno_kernels(const ggml_backend_opencl_context *backend_ctx, c
 inline bool use_adreno_moe_kernels(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
     GGML_UNUSED(backend_ctx);
     int ne01 = tensor->ne[1];
-    return (((strstr(tensor->name, "ffn") != NULL) && (strstr(tensor->name, "exps") != NULL)) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 64 == 0);
+    return (((strstr(tensor->name, "ffn") != NULL) && (strstr(tensor->name, "exps") != NULL)) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 32 == 0);
 }
 
 inline bool enable_adreno_trans_weight(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
@@ -4700,7 +4925,10 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             // the quantizations here currently do not - they are only supported by Adreno with certain shapes
             if (op->src[0]->type == GGML_TYPE_Q4_1 ||
                 op->src[0]->type == GGML_TYPE_Q5_0 ||
-                op->src[0]->type == GGML_TYPE_Q5_1) {
+                op->src[0]->type == GGML_TYPE_Q5_1 ||
+                op->src[0]->type == GGML_TYPE_Q4_K ||
+                op->src[0]->type == GGML_TYPE_Q5_K ||
+                op->src[0]->type == GGML_TYPE_Q6_K) {
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
                 if (op->src[1]->type == GGML_TYPE_F32) {
                     return use_adreno_moe_kernels(backend_ctx, op->src[0])
@@ -4745,6 +4973,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         case GGML_OP_IM2COL:
             return true;
         case GGML_OP_ARGSORT: {
+            load_cl_kernels_argsort(backend_ctx);
+
             cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32;
             int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
 
@@ -4762,6 +4992,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             return op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_FLASH_ATTN_EXT:
             {
+                load_cl_kernels_flash_attn(backend_ctx);
+
                 const ggml_tensor * q = op->src[0];
                 const ggml_tensor * k = op->src[1];
                 const ggml_tensor * v = op->src[2];
@@ -4829,7 +5061,7 @@ static ggml_backend_i ggml_backend_opencl_i = {
 
 ggml_backend_t ggml_backend_opencl_init(void) {
     ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_opencl_reg(), 0);
-    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
+    ggml_backend_opencl_context *backend_ctx = ggml_cl_init(dev);
 
     ggml_backend_t backend = new ggml_backend {
         /* .guid    = */ ggml_backend_opencl_guid(),
@@ -5208,15 +5440,13 @@ static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer)
 }
 
 static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
-    ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer->buft->device);
-    return (void *) (uintptr_t) backend_ctx->alignment;
+    ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer->buft->device->context;
+    return (void *) (uintptr_t) dev_ctx->backend_ctx->alignment;
 }
 
 static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
 
-    ggml_cl2_init(buffer->buft->device);
-
     if (tensor->view_src != nullptr) {
         GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
 
@@ -5256,7 +5486,8 @@ static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buff
 }
 
 static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
+    ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer->buft->device->context;
+    ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx;
 
     cl_context context = backend_ctx->context;
     cl_command_queue queue = backend_ctx->queue;
@@ -6047,32 +6278,75 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
             CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
         CL_CHECK(err);
 
-        #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
-        cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K;
-        if (use_adreno_kernels(backend_ctx, tensor)) {
-            kernel = backend_ctx->kernel_convert_block_q4_K_noshuffle;
-        }
-        #else
-        cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K;
-        #endif
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        if (use_adreno_moe_kernels(backend_ctx, tensor)) {
+            cl_kernel kernel = backend_ctx->kernel_convert_block_q4_k_trans4_ns;
 
-        cl_uchar mask_0F = 0x0F;
-        cl_uchar mask_F0 = 0xF0;
+            int ne00 = tensor->ne[0];
+            int ne01 = tensor->ne[1];
+            int ne02 = tensor->ne[2];
 
-        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
-        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
-        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s));
-        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
-        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->dm));
-        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_0F));
-        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_F0));
+            cl_uchar mask_0F = 0x0F;
+            cl_uchar mask_F0 = 0xF0;
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->dm));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->s));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01));
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_0F));
+            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_F0));
 
-        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
-        size_t local_work_size[] = {64, 1, 1};
+            size_t global_work_size[] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 256), static_cast<size_t>(ne02)};
+            size_t local_work_size[] = {64, 1, 1};
 
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
+            cl_event evt;
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+            CL_CHECK(clWaitForEvents(1, &evt));
+            CL_CHECK(clReleaseMemObject(data_device));
+
+            cl_image_format img_format_q = {CL_R, CL_UNSIGNED_INT32};
+            cl_image_desc img_desc_q = {
+                CL_MEM_OBJECT_IMAGE1D_BUFFER,
+                static_cast<size_t>(ggml_nelements(tensor) / 8),
+                0, 0, 0, 0, 0, 0, 0,
+                { extra->q }
+            };
+            extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err);
+            CL_CHECK(err);
+            tensor->extra = extra;
+
+            return;
+        }
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K;
+        if (use_adreno_kernels(backend_ctx, tensor)) {
+            kernel = backend_ctx->kernel_convert_block_q4_K_noshuffle;
+        }
+#else
+        cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K;
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
+        cl_uchar mask_0F = 0x0F;
+        cl_uchar mask_F0 = 0xF0;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->dm));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_0F));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_F0));
+
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
         CL_CHECK(clReleaseMemObject(data_device));
 
         tensor->extra  = extra;
@@ -6157,14 +6431,58 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         CL_CHECK((extra->qh = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
         CL_CHECK(err);
 
-        #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        if (use_adreno_moe_kernels(backend_ctx, tensor)) {
+            cl_kernel kernel = backend_ctx->kernel_convert_block_q5_k_trans4_ns;
+
+            int ne00 = tensor->ne[0];
+            int ne01 = tensor->ne[1];
+            int ne02 = tensor->ne[2];
+
+            cl_uchar mask_0F = 0x0F;
+            cl_uchar mask_F0 = 0xF0;
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->dm));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extra->s));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_0F));
+            CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_uchar), &mask_F0));
+
+            size_t global_work_size[] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 256), static_cast<size_t>(ne02)};
+            size_t local_work_size[] = {64, 1, 1};
+
+            cl_event evt;
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+            CL_CHECK(clWaitForEvents(1, &evt));
+            CL_CHECK(clReleaseMemObject(data_device));
+
+            cl_image_format img_format_q = {CL_R, CL_UNSIGNED_INT32};
+            cl_image_desc img_desc_q = {
+                CL_MEM_OBJECT_IMAGE1D_BUFFER,
+                static_cast<size_t>(ggml_nelements(tensor) / 8),
+                0, 0, 0, 0, 0, 0, 0,
+                { extra->q }
+            };
+            extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err);
+            CL_CHECK(err);
+            tensor->extra = extra;
+
+            return;
+        }
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
         cl_kernel kernel = backend_ctx->kernel_convert_block_q5_K;
         if (use_adreno_kernels(backend_ctx, tensor)) {
             kernel = backend_ctx->kernel_convert_block_q5_K_noshuffle;
         }
-        #else
+#else
         cl_kernel kernel = backend_ctx->kernel_convert_block_q5_K;
-        #endif
+#endif
 
         cl_uchar mask_0F = 0x0F;
         cl_uchar mask_F0 = 0xF0;
@@ -6232,6 +6550,79 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
 
         cl_buffer_region region;
 
+        cl_uchar mask_0F = 0x0F;
+        cl_uchar mask_F0 = 0xF0;
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        // Adreno MoE Q6_K kernel needs special transposed layout
+        if (use_adreno_moe_kernels(backend_ctx, tensor)) {
+            size_t moe_size_ql = (size_t)(ggml_nelements(tensor) / 8) * sizeof(uint32_t);  // 4 bits per element
+            size_t moe_size_qh = (size_t)(ggml_nelements(tensor) / 16) * sizeof(uint32_t); // 2 bits per element
+            size_t moe_size_s  = size_s;
+            size_t moe_size_d  = size_d;
+
+            // Subbuffer for ql
+            region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
+            region.size = moe_size_ql;
+            CL_CHECK((extra->ql = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+            auto previous_origin = region.origin;
+
+            // Subbuffer for qh
+            region.origin = align_to(previous_origin + moe_size_ql, backend_ctx->alignment);
+            region.size = moe_size_qh;
+            CL_CHECK((extra->qh = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+            previous_origin = region.origin;
+
+            // Subbuffer for scales
+            region.origin = align_to(previous_origin + moe_size_qh, backend_ctx->alignment);
+            region.size = moe_size_s;
+            CL_CHECK((extra->s = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+            previous_origin = region.origin;
+
+            // Subbuffer for d
+            region.origin = align_to(previous_origin + moe_size_s, backend_ctx->alignment);
+            region.size = moe_size_d;
+            CL_CHECK((extra->d = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+            cl_kernel kernel = backend_ctx->kernel_convert_block_q6_k_trans4_ns;
+
+            int ne00 = tensor->ne[0];
+            int ne01 = tensor->ne[1];
+            int ne02 = tensor->ne[2];
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->ql));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->s));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01));
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_0F));
+            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_F0));
+
+            size_t global_work_size[] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 256), static_cast<size_t>(ne02)};
+            size_t local_work_size[] = {64, 1, 1};
+
+            cl_event evt;
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+            CL_CHECK(clWaitForEvents(1, &evt));
+            CL_CHECK(clReleaseMemObject(data_device));
+
+            // Create image for ql
+            cl_image_format img_format_ql = {CL_R, CL_UNSIGNED_INT32};
+            cl_image_desc img_desc_ql = {
+                CL_MEM_OBJECT_IMAGE1D_BUFFER,
+                static_cast<size_t>(ggml_nelements(tensor) / 8),
+                0, 0, 0, 0, 0, 0, 0,
+                { extra->ql }
+            };
+            extra->ql_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_ql, &img_desc_ql, NULL, &err);
+            tensor->extra = extra;
+
+            return;
+        }
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
         // Subbuffer for ql
         region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
         region.size = size_ql;
@@ -6331,7 +6722,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
 static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     GGML_ASSERT(tensor->extra);
 
-    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
+    ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer->buft->device->context;
+    ggml_backend_opencl_context *backend_ctx = dev_ctx->backend_ctx;
 
     cl_context context = backend_ctx->context;
     cl_command_queue queue = backend_ctx->queue;
@@ -6825,6 +7217,40 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
         cl_uchar mask_F0 = 0xF0;
 
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        if (use_adreno_moe_kernels(backend_ctx, tensor)) {
+            cl_int err;
+            cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                ggml_nbytes(tensor), NULL, &err);
+            CL_CHECK(err);
+
+            cl_kernel kernel = backend_ctx->kernel_restore_block_q4_k_trans4_ns;
+
+            int ne00 = tensor->ne[0];
+            int ne01 = tensor->ne[1];
+            int ne02 = tensor->ne[2];
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->dm));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->s));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne01));
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_0F));
+            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_F0));
+
+            size_t global_work_size[] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 256), static_cast<size_t>(ne02)};
+            size_t local_work_size[] = {64, 1, 1};
+
+            cl_event evt;
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+                global_work_size, local_work_size, 0, NULL, &evt));
+            CL_CHECK(clWaitForEvents(1, &evt));
+            CL_CHECK(clEnqueueReadBuffer(
+                queue, data_device, CL_TRUE, offset,
+                size, data, 0, NULL, NULL));
+            CL_CHECK(clReleaseMemObject(data_device));
+            return;
+        }
         if (use_adreno_kernels(backend_ctx, tensor)) {
             int M = tensor->ne[1];
             int K = tensor->ne[0];
@@ -6901,6 +7327,40 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
         cl_uchar mask_F0 = 0xF0;
 
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        if (use_adreno_moe_kernels(backend_ctx, tensor)) {
+            cl_int err;
+            cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                ggml_nbytes(tensor), NULL, &err);
+            CL_CHECK(err);
+            cl_kernel kernel = backend_ctx->kernel_restore_block_q5_k_trans4_ns;
+
+            int ne00 = tensor->ne[0];
+            int ne01 = tensor->ne[1];
+            int ne02 = tensor->ne[2];
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->dm));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->s));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &data_device));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne01));
+            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_0F));
+            CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_uchar), &mask_F0));
+
+            size_t global_work_size[] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 256), static_cast<size_t>(ne02)};
+            size_t local_work_size[] = {64, 1, 1};
+
+            cl_event evt;
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+                global_work_size, local_work_size, 0, NULL, &evt));
+            CL_CHECK(clWaitForEvents(1, &evt));
+            CL_CHECK(clEnqueueReadBuffer(
+                queue, data_device, CL_TRUE, offset,
+                size, data, 0, NULL, NULL));
+            CL_CHECK(clReleaseMemObject(data_device));
+            return;
+        }
         if (use_adreno_kernels(backend_ctx, tensor)) {
             int M = tensor->ne[1];
             int K = tensor->ne[0];
@@ -6974,7 +7434,44 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
     if (tensor->type == GGML_TYPE_Q6_K) {
         ggml_tensor_extra_cl_q6_K * extra = (ggml_tensor_extra_cl_q6_K *)tensor->extra;
 
+        cl_uchar mask_0F = 0x0F;
+        cl_uchar mask_F0 = 0xF0;
+
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        if (use_adreno_moe_kernels(backend_ctx, tensor)) {
+            cl_int err;
+            cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                ggml_nbytes(tensor), NULL, &err);
+            CL_CHECK(err);
+
+            cl_kernel kernel = backend_ctx->kernel_restore_block_q6_k_trans4_ns;
+
+            int ne00 = tensor->ne[0];
+            int ne01 = tensor->ne[1];
+            int ne02 = tensor->ne[2];
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->ql));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->s));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne01));
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_0F));
+            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_F0));
+
+            size_t global_work_size[] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 256), static_cast<size_t>(ne02)};
+            size_t local_work_size[] = {64, 1, 1};
+
+            cl_event evt;
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+                global_work_size, local_work_size, 0, NULL, &evt));
+            CL_CHECK(clWaitForEvents(1, &evt));
+            CL_CHECK(clEnqueueReadBuffer(
+                queue, data_device, CL_TRUE, offset,
+                size, data, 0, NULL, NULL));
+            CL_CHECK(clReleaseMemObject(data_device));
+            return;
+        }
         if (use_adreno_kernels(backend_ctx, tensor)) {
             static ggml_cl_buffer buf_trans_ql;
             static ggml_cl_buffer buf_trans_qh;
@@ -7070,8 +7567,9 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
 }
 
 static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    ggml_backend_dev_t dev = buffer->buft->device;
-    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
+    ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer->buft->device->context;
+    ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx;
+
     cl_command_queue queue = backend_ctx->queue;
 
     ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
@@ -7111,7 +7609,8 @@ static const char * ggml_backend_opencl_buffer_type_get_name(ggml_backend_buffer
 }
 
 static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
-    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer_type->device);
+    ggml_backend_opencl_context *backend_ctx = ggml_cl_init(buffer_type->device);
+    load_cl_kernels(backend_ctx);
 
     // clCreateBuffer returns -61 for size 0
     size = std::max(size, (size_t)1);
@@ -7134,15 +7633,15 @@ static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_b
 }
 
 static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
-    ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
-    return backend_ctx->alignment;
+    ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer_type->device->context;
+    return dev_ctx->backend_ctx->alignment;
 }
 
 static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
     static size_t max_size = -1;
     if (max_size == (size_t)-1) {
-        ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
-        max_size = backend_ctx->max_alloc_size;
+        ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer_type->device->context;
+        max_size = dev_ctx->backend_ctx->max_alloc_size;
     }
     return max_size;
 }
@@ -7179,14 +7678,13 @@ static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_
 
 static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
     ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
-    ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *) dev_ctx->backend_ctx;
 
     static const size_t opencl_extra_margin = 1024ull*1024ull*1024ull;
 
     // OpenCL does not provide reliable currently-free device memory.
     // Use total/global memory as a best-effort upper bound.
     // Improved safety: Reduce by a 1GiB extra margin for common --fit
-    *total = backend_ctx->global_mem_size;
+    *total = dev_ctx->global_mem_size;
     *free  = *total > opencl_extra_margin ? *total - opencl_extra_margin : 0;
 }
 
@@ -7210,7 +7708,7 @@ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct
 }
 
 static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
-    ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev);
+    ggml_backend_opencl_context * backend_ctx = ggml_cl_init(dev);
     // Getting a new reference to the backend, increase ref_count
     backend_ctx->ref_count++;
 
@@ -7247,6 +7745,7 @@ static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_bac
 }
 
 static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    ggml_cl_init(dev);
     return ggml_opencl_supports_op(dev, op);
 }
 
@@ -7259,8 +7758,8 @@ static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggm
 
     // Check cl_context is the same. clEnqueue* commands may not use
     // buffers from another cl_context.
-    ggml_backend_opencl_context * backend_ctx0 = ggml_cl2_init(dev);
-    ggml_backend_opencl_context * backend_ctx1 = ggml_cl2_init(buft->device);
+    ggml_backend_opencl_context * backend_ctx0 = ggml_cl_init(dev);
+    ggml_backend_opencl_context * backend_ctx1 = ggml_cl_init(buft->device);
     return backend_ctx0->context == backend_ctx1->context;
 }
 
@@ -13733,6 +14232,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
     ggml_tensor_extra_cl_q4_1 * extra0_q4_1 = (ggml_tensor_extra_cl_q4_1 *)src0->extra;
     ggml_tensor_extra_cl_q5_0 * extra0_q5_0 = (ggml_tensor_extra_cl_q5_0 *)src0->extra;
     ggml_tensor_extra_cl_q5_1 * extra0_q5_1 = (ggml_tensor_extra_cl_q5_1 *)src0->extra;
+    ggml_tensor_extra_cl_q4_K * extra0_q4_K = (ggml_tensor_extra_cl_q4_K *)src0->extra;
+    ggml_tensor_extra_cl_q5_K * extra0_q5_K = (ggml_tensor_extra_cl_q5_K *)src0->extra;
+    ggml_tensor_extra_cl_q6_K * extra0_q6_K = (ggml_tensor_extra_cl_q6_K *)src0->extra;
     ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
     ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
 #endif
@@ -13741,6 +14243,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
     (void)extra0_q4_1;
     (void)extra0_q5_0;
     (void)extra0_q5_1;
+    (void)extra0_q4_K;
+    (void)extra0_q5_K;
+    (void)extra0_q6_K;
 
     const int ne00 = src0->ne[0];
     const int ne01 = src0->ne[1];
@@ -13812,7 +14317,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
                     CL_CHECK(status);
 
                     // set thread grid
-                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
                     global_size[1] = 4;
                     global_size[2] = static_cast<size_t>(ne20);
                     local_size[1] = 4;
@@ -14028,7 +14533,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
                     CL_CHECK(status);
 
                     // set thread grid
-                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
                     global_size[1] = 4;
                     global_size[2] = static_cast<size_t>(ne20);
                     local_size[1] = 4;
@@ -14204,7 +14709,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
                     CL_CHECK(status);
 
                     // set thread grid
-                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
                     global_size[1] = 4;
                     global_size[2] = static_cast<size_t>(ne20);
                     local_size[1] = 4;
@@ -14380,7 +14885,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
                     CL_CHECK(status);
 
                     // set thread grid
-                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
                     global_size[1] = 4;
                     global_size[2] = static_cast<size_t>(ne20);
                     local_size[1] = 4;
@@ -14612,6 +15117,532 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
 #endif // GGML_OPENCL_SOA_Q
             break;
         }
+        case GGML_TYPE_Q4_K: {
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+            if (use_adreno_moe_kernels(backend_ctx, src0)) {
+                cl_int status;
+
+                size_t local_size[3] = {64, 2, 1};
+                size_t global_size[3] = {64, 2, 1};
+
+                if (ne12 == 1) { // for gemv
+                    kernel = backend_ctx->kernel_gemv_moe_q4_k_f32_ns;
+
+                    cl_mem src1_sub_buffer, buf_src1_image, buf_src2;
+
+                    // create a sub_buffer for src2
+                    cl_buffer_region region;
+                    region.origin = offset2;
+                    region.size = ne20 * ne21 * sizeof(int);
+                    buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+                    CL_CHECK(status);
+
+                    // set thread grid
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
+                    global_size[1] = 4;
+                    global_size[2] = static_cast<size_t>(ne20);
+                    local_size[1] = 4;
+
+                    // create a sub_buffer for src1
+                    region.origin = offset1;
+                    region.size = ne10 * ne11 * ne12 * sizeof(float);
+                    src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+                    CL_CHECK(status);
+
+                    // create image for src1
+                    cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
+                    cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}};
+                    buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
+                    CL_CHECK(status);
+
+                    // Set kernel args
+                    int arg_idx = 0;
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q4_K->q));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q4_K->d));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q4_K->dm));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q4_K->s));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_src1_image));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_src2));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extrad->data_device));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong),  &offsetd));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne00));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne01));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne11));
+
+                    // launch kernel
+                    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst);
+
+                    // deallocate sub buffers and images
+                    CL_CHECK(clReleaseMemObject(src1_sub_buffer));
+                    CL_CHECK(clReleaseMemObject(buf_src1_image));
+                    CL_CHECK(clReleaseMemObject(buf_src2));
+
+                } else { // for gemm
+                    kernel = backend_ctx->kernel_gemm_moe_q4_k_f32_ns;
+
+                    // Reorder router if called from test-backend-ops or when new router is generated.
+                    // Otherwise reuse the reordered result from previous mul_mat_id call.
+                    if ((strstr(src0->name, "as") != NULL) || backend_ctx->toggle_reorder) {
+                        moe_router_reoerder(backend, src2, ne20);
+                        backend_ctx->toggle_reorder = false;
+                    }
+
+                    cl_mem sub_buf_src1_pre, buf_src1_reordered, image_src1_reordered, sub_buf_dst, buf_dst_image;
+                    cl_mem buf_src2, buf_src2_emap;
+
+                    cl_buffer_region region;
+                    region.origin = 0;
+                    region.size = sizeof(int) * max_post_router_tile * n_tile_size;
+                    buf_src2 = clCreateSubBuffer(backend_ctx->prealloc_post_router.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+                    CL_CHECK(status);
+
+                    region.origin = 0;
+                    region.size = sizeof(short) * max_post_router_tile;
+                    buf_src2_emap = clCreateSubBuffer(backend_ctx->prealloc_emap.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+                    CL_CHECK(status);
+
+                    // Reorder activations
+                    region.origin = offset1;
+                    region.size = ne10 * ne11 * ne12 * sizeof(float);
+                    sub_buf_src1_pre = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+                    CL_CHECK(status);
+
+                    // Create image for reordered src1
+                    region.origin = 0;
+                    region.size = ne00 * max_post_router_tile * n_tile_size * sizeof(float);
+                    backend_ctx->prealloc_act_trans.allocate(backend_ctx->context, region.size);
+                    buf_src1_reordered = clCreateSubBuffer(
+                        backend_ctx->prealloc_act_trans.buffer,
+                        0,
+                        CL_BUFFER_CREATE_TYPE_REGION,
+                        &region,
+                        &status);
+                    CL_CHECK(status);
+                    cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
+                    cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}};
+                    image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
+                    CL_CHECK(status);
+
+                    unsigned short map_ratio = ne20 / ne11;
+                    GGML_ASSERT(((map_ratio == 1) || (map_ratio == ne20)) && "Map ratio not supported\n");
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 0, sizeof(cl_mem),        &sub_buf_src1_pre));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 1, sizeof(cl_mem),        &buf_src2));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 2, sizeof(cl_mem),        &buf_src1_reordered));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 3, sizeof(cl_mem),        &(backend_ctx->prealloc_total_tiles.buffer)));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 4, sizeof(unsigned int),  &ne00));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 5, sizeof(unsigned short),  &map_ratio));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 6, sizeof(unsigned int),  &n_tile_size));
+
+                    size_t reorder_b_local_size[3] = {256, 1, 1};
+                    size_t reorder_b_global_size[3] = {static_cast<size_t>(((ne00 / 4) + 255) / 256 * 256), static_cast<size_t>(max_post_router_tile * n_tile_size), 1};
+
+                    // Dispatch reorder kernel
+                    backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst);
+
+                    // MoE kernel prepare
+                    region.origin = offsetd;
+                    region.size = ne0 * ne1 * ne2 * sizeof(float);
+                    sub_buf_dst = clCreateSubBuffer(
+                        extrad->data_device,
+                        0,
+                        CL_BUFFER_CREATE_TYPE_REGION,
+                        &region,
+                        &status);
+                    CL_CHECK(status);
+                    // Create image for dst
+                    cl_image_format image_format_buf_dst = {CL_R, CL_FLOAT};
+                    cl_image_desc image_desc_buf_dst = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne0 * ne1 * ne2), 0,0,0,0,0,0,0, {sub_buf_dst}};
+                    buf_dst_image = clCreateImage(backend_ctx->context, CL_MEM_WRITE_ONLY, &image_format_buf_dst, &image_desc_buf_dst, NULL, &status);
+                    CL_CHECK(status);
+
+                    // Set kernel args
+                    int arg_idx = 0;
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q4_K->q_img));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q4_K->d));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q4_K->dm));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q4_K->s));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &image_src1_reordered));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_src2));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_src2_emap));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_dst_image));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &(backend_ctx->prealloc_total_tiles.buffer)));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne00));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne01));
+
+                    // set thread grid
+                    global_size[1] = static_cast<size_t>((ne01 + 63) / 64);
+                    global_size[2] = static_cast<size_t>(max_post_router_tile);
+                    local_size[1] = 1;
+                    local_size[2] = 1;
+
+                    // Dispatch kernel
+                    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst);
+
+                    clReleaseMemObject(sub_buf_src1_pre);
+                    clReleaseMemObject(buf_src1_reordered);
+                    clReleaseMemObject(image_src1_reordered);
+                    clReleaseMemObject(buf_src2);
+                    clReleaseMemObject(buf_src2_emap);
+                    clReleaseMemObject(sub_buf_dst);
+                    clReleaseMemObject(buf_dst_image);
+                }
+                return;
+            }
+#endif //GGML_OPENCL_USE_ADRENO_KERNELS
+        }
+        case GGML_TYPE_Q5_K: {
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+            if (use_adreno_moe_kernels(backend_ctx, src0)) {
+                cl_int status;
+
+                size_t local_size[3] = {64, 2, 1};
+                size_t global_size[3] = {64, 2, 1};
+
+                if (ne12 == 1) { // for gemv
+                    kernel = backend_ctx->kernel_gemv_moe_q5_k_f32_ns;
+
+                    cl_mem src1_sub_buffer, buf_src1_image, buf_src2;
+
+                    // create a sub_buffer for src2
+                    cl_buffer_region region;
+                    region.origin = offset2;
+                    region.size = ne20 * ne21 * sizeof(int);
+                    buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+                    CL_CHECK(status);
+
+                    // set thread grid
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
+                    global_size[1] = 4;
+                    global_size[2] = static_cast<size_t>(ne20);
+                    local_size[1] = 4;
+
+                    // create a sub_buffer for src1
+                    region.origin = offset1;
+                    region.size = ne10 * ne11 * ne12 * sizeof(float);
+                    src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+                    CL_CHECK(status);
+
+                    // create image for src1
+                    cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
+                    cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}};
+                    buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
+                    CL_CHECK(status);
+
+                    // Set kernel args
+                    int arg_idx = 0;
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q5_K->q));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q5_K->qh));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q5_K->d));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q5_K->dm));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q5_K->s));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_src1_image));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_src2));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extrad->data_device));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong),  &offsetd));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne00));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne01));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne11));
+
+                    // launch kernel
+                    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst);
+
+                    // deallocate sub buffers and images
+                    CL_CHECK(clReleaseMemObject(src1_sub_buffer));
+                    CL_CHECK(clReleaseMemObject(buf_src1_image));
+                    CL_CHECK(clReleaseMemObject(buf_src2));
+
+                } else { // for gemm
+                    kernel = backend_ctx->kernel_gemm_moe_q5_k_f32_ns;
+
+                    // Reorder router if called from test-backend-ops or when new router is generated.
+                    // Otherwise reuse the reordered result from previous mul_mat_id call.
+                    if ((strstr(src0->name, "as") != NULL) || backend_ctx->toggle_reorder) {
+                        moe_router_reoerder(backend, src2, ne20);
+                        backend_ctx->toggle_reorder = false;
+                    }
+
+                    cl_mem sub_buf_src1_pre, buf_src1_reordered, image_src1_reordered, sub_buf_dst, buf_dst_image;
+                    cl_mem buf_src2, buf_src2_emap;
+
+                    cl_buffer_region region;
+                    region.origin = 0;
+                    region.size = sizeof(int) * max_post_router_tile * n_tile_size;
+                    buf_src2 = clCreateSubBuffer(backend_ctx->prealloc_post_router.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+                    CL_CHECK(status);
+
+                    region.origin = 0;
+                    region.size = sizeof(short) * max_post_router_tile;
+                    buf_src2_emap = clCreateSubBuffer(backend_ctx->prealloc_emap.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+                    CL_CHECK(status);
+
+                    // Reorder activations
+                    // create a sub_buffer for src1
+                    region.origin = offset1;
+                    region.size = ne10 * ne11 * ne12 * sizeof(float);
+                    sub_buf_src1_pre = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+                    CL_CHECK(status);
+
+                    // Create image for reordered src1
+                    // Use pre-allocated placeholder
+                    region.origin = 0;
+                    region.size = ne00 * max_post_router_tile * n_tile_size * sizeof(float);
+                    backend_ctx->prealloc_act_trans.allocate(backend_ctx->context, region.size);
+                    buf_src1_reordered = clCreateSubBuffer(
+                        backend_ctx->prealloc_act_trans.buffer,
+                        0,
+                        CL_BUFFER_CREATE_TYPE_REGION,
+                        &region,
+                        &status);
+                    CL_CHECK(status);
+                    cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
+                    cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}};
+                    image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
+                    CL_CHECK(status);
+
+                    unsigned short map_ratio = ne20 / ne11;
+                    GGML_ASSERT(((map_ratio == 1) || (map_ratio == ne20)) && "Map ratio not supported\n");
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 0, sizeof(cl_mem),        &sub_buf_src1_pre));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 1, sizeof(cl_mem),        &buf_src2));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 2, sizeof(cl_mem),        &buf_src1_reordered));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 3, sizeof(cl_mem),        &(backend_ctx->prealloc_total_tiles.buffer)));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 4, sizeof(unsigned int),  &ne00));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 5, sizeof(unsigned short),  &map_ratio));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 6, sizeof(unsigned int),  &n_tile_size));
+
+                    size_t reorder_b_local_size[3] = {256, 1, 1};
+                    size_t reorder_b_global_size[3] = {static_cast<size_t>(((ne00 / 4) + 255) / 256 * 256), static_cast<size_t>(max_post_router_tile * n_tile_size), 1};
+
+                    // Dispatch reorder kernel
+                    backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst);
+
+                    // MoE kernel prepare
+                    // Create sub buffer for dst
+                    region.origin = offsetd;
+                    region.size = ne0 * ne1 * ne2 * sizeof(float);
+                    sub_buf_dst = clCreateSubBuffer(
+                        extrad->data_device,
+                        0,
+                        CL_BUFFER_CREATE_TYPE_REGION,
+                        &region,
+                        &status);
+                    CL_CHECK(status);
+                    // Create image for dst
+                    cl_image_format image_format_buf_dst = {CL_R, CL_FLOAT};
+                    cl_image_desc image_desc_buf_dst = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne0 * ne1 * ne2), 0,0,0,0,0,0,0, {sub_buf_dst}};
+                    buf_dst_image = clCreateImage(backend_ctx->context, CL_MEM_WRITE_ONLY, &image_format_buf_dst, &image_desc_buf_dst, NULL, &status);
+                    CL_CHECK(status);
+
+                    // Set kernel args
+                    int arg_idx = 0;
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q5_K->q_img));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q5_K->qh));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q5_K->s));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q5_K->d));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q5_K->dm));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &image_src1_reordered));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_src2));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_src2_emap));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_dst_image));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &(backend_ctx->prealloc_total_tiles.buffer)));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne00));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne01));
+
+                    // set thread grid
+                    global_size[1] = static_cast<size_t>((ne01 + 63) / 64);
+                    global_size[2] = static_cast<size_t>(max_post_router_tile);
+                    local_size[1] = 1;
+                    local_size[2] = 1;
+
+                    // Dispatch kernel
+                    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst);
+
+                    clReleaseMemObject(sub_buf_src1_pre);
+                    clReleaseMemObject(buf_src1_reordered);
+                    clReleaseMemObject(image_src1_reordered);
+                    clReleaseMemObject(buf_src2);
+                    clReleaseMemObject(buf_src2_emap);
+                    clReleaseMemObject(sub_buf_dst);
+                    clReleaseMemObject(buf_dst_image);
+                }
+                return;
+            }
+#endif //GGML_OPENCL_USE_ADRENO_KERNELS
+        }
+        case GGML_TYPE_Q6_K: {
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+            if (use_adreno_moe_kernels(backend_ctx, src0)) {
+                cl_int status;
+
+                size_t local_size[3] = {64, 2, 1};
+                size_t global_size[3] = {64, 2, 1};
+
+                if (ne12 == 1) { // for gemv
+                    kernel = backend_ctx->kernel_gemv_moe_q6_k_f32_ns;
+
+                    cl_mem src1_sub_buffer, buf_src1_image, buf_src2;
+
+                    // create a sub_buffer for src2
+                    cl_buffer_region region;
+                    region.origin = offset2;
+                    region.size = ne20 * ne21 * sizeof(int);
+                    buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+                    CL_CHECK(status);
+
+                    // set thread grid
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
+                    global_size[1] = 4;
+                    global_size[2] = static_cast<size_t>(ne20);
+                    local_size[1] = 4;
+
+                    // create a sub_buffer for src1
+                    region.origin = offset1;
+                    region.size = ne10 * ne11 * ne12 * sizeof(float);
+                    src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+                    CL_CHECK(status);
+
+                    // create image for src1
+                    cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
+                    cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}};
+                    buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
+                    CL_CHECK(status);
+
+                    // Set kernel args
+                    int arg_idx = 0;
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q6_K->ql));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q6_K->qh));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q6_K->s));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q6_K->d));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_src1_image));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_src2));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extrad->data_device));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong),  &offsetd));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne00));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne01));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne11));
+
+                    // launch kernel
+                    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst);
+
+                    // deallocate sub buffers and images
+                    CL_CHECK(clReleaseMemObject(src1_sub_buffer));
+                    CL_CHECK(clReleaseMemObject(buf_src1_image));
+                    CL_CHECK(clReleaseMemObject(buf_src2));
+
+                } else { // for gemm
+                    kernel = backend_ctx->kernel_gemm_moe_q6_k_f32_ns;
+
+                    // Reorder router if called from test-backend-ops or when new router is generated.
+                    // Otherwise reuse the reordered result from previous mul_mat_id call.
+                    if ((strstr(src0->name, "as") != NULL) || backend_ctx->toggle_reorder) {
+                        moe_router_reoerder(backend, src2, ne20);
+                        backend_ctx->toggle_reorder = false;
+                    }
+
+                    cl_mem sub_buf_src1_pre, buf_src1_reordered, image_src1_reordered, sub_buf_dst, buf_dst_image;
+                    cl_mem buf_src2, buf_src2_emap;
+
+                    cl_buffer_region region;
+                    region.origin = 0;
+                    region.size = sizeof(int) * max_post_router_tile * n_tile_size;
+                    buf_src2 = clCreateSubBuffer(backend_ctx->prealloc_post_router.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+                    CL_CHECK(status);
+
+                    region.origin = 0;
+                    region.size = sizeof(short) * max_post_router_tile;
+                    buf_src2_emap = clCreateSubBuffer(backend_ctx->prealloc_emap.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+                    CL_CHECK(status);
+
+                    // Reorder activations
+                    // create a sub_buffer for src1
+                    region.origin = offset1;
+                    region.size = ne10 * ne11 * ne12 * sizeof(float);
+                    sub_buf_src1_pre = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+                    CL_CHECK(status);
+
+                    // Create image for reordered src1
+                    region.origin = 0;
+                    region.size = ne00 * max_post_router_tile * n_tile_size * sizeof(float);
+                    backend_ctx->prealloc_act_trans.allocate(backend_ctx->context, region.size);
+                    buf_src1_reordered = clCreateSubBuffer(
+                        backend_ctx->prealloc_act_trans.buffer,
+                        0,
+                        CL_BUFFER_CREATE_TYPE_REGION,
+                        &region,
+                        &status);
+                    CL_CHECK(status);
+                    cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
+                    cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}};
+                    image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
+                    CL_CHECK(status);
+
+                    unsigned short map_ratio = ne20 / ne11;
+                    GGML_ASSERT(((map_ratio == 1) || (map_ratio == ne20)) && "Map ratio not supported\n");
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 0, sizeof(cl_mem),        &sub_buf_src1_pre));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 1, sizeof(cl_mem),        &buf_src2));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 2, sizeof(cl_mem),        &buf_src1_reordered));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 3, sizeof(cl_mem),        &(backend_ctx->prealloc_total_tiles.buffer)));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 4, sizeof(unsigned int),  &ne00));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 5, sizeof(unsigned short),  &map_ratio));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 6, sizeof(unsigned int),  &n_tile_size));
+
+                    size_t reorder_b_local_size[3] = {256, 1, 1};
+                    size_t reorder_b_global_size[3] = {static_cast<size_t>(((ne00 / 4) + 255) / 256 * 256), static_cast<size_t>(max_post_router_tile * n_tile_size), 1};
+
+                    // Dispatch reorder kernel
+                    backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst);
+
+                    // MoE kernel prepare
+                    // Create sub buffer for dst
+                    region.origin = offsetd;
+                    region.size = ne0 * ne1 * ne2 * sizeof(float);
+                    sub_buf_dst = clCreateSubBuffer(
+                        extrad->data_device,
+                        0,
+                        CL_BUFFER_CREATE_TYPE_REGION,
+                        &region,
+                        &status);
+                    CL_CHECK(status);
+                    // Create image for dst
+                    cl_image_format image_format_buf_dst = {CL_R, CL_FLOAT};
+                    cl_image_desc image_desc_buf_dst = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne0 * ne1 * ne2), 0,0,0,0,0,0,0, {sub_buf_dst}};
+                    buf_dst_image = clCreateImage(backend_ctx->context, CL_MEM_WRITE_ONLY, &image_format_buf_dst, &image_desc_buf_dst, NULL, &status);
+                    CL_CHECK(status);
+
+                    // Set kernel args
+                    int arg_idx = 0;
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q6_K->ql_img));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q6_K->qh));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q6_K->s));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q6_K->d));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &image_src1_reordered));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_src2));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_src2_emap));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_dst_image));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &(backend_ctx->prealloc_total_tiles.buffer)));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne00));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne01));
+
+                    // set thread grid
+                    global_size[1] = static_cast<size_t>((ne01 + 63) / 64);
+                    global_size[2] = static_cast<size_t>(max_post_router_tile);
+                    local_size[1] = 1;
+                    local_size[2] = 1;
+
+                    // Dispatch kernel
+                    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst);
+
+                    clReleaseMemObject(sub_buf_src1_pre);
+                    clReleaseMemObject(buf_src1_reordered);
+                    clReleaseMemObject(image_src1_reordered);
+                    clReleaseMemObject(buf_src2);
+                    clReleaseMemObject(buf_src2_emap);
+                    clReleaseMemObject(sub_buf_dst);
+                    clReleaseMemObject(buf_dst_image);
+                }
+                return;
+            }
+#endif //GGML_OPENCL_USE_ADRENO_KERNELS
+        }
         case GGML_TYPE_MXFP4: {
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
             if (use_adreno_moe_kernels(backend_ctx, src0)) {
@@ -14633,7 +15664,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
                     CL_CHECK(status);
 
                     // set thread grid
-                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
                     global_size[1] = 4;
                     global_size[2] = static_cast<size_t>(ne20);
                     local_size[1] = 4;
diff --git a/ggml/src/ggml-opencl/kernels/cvt.cl b/ggml/src/ggml-opencl/kernels/cvt.cl
index 8f06d570587..c25eabdd72b 100644
--- a/ggml/src/ggml-opencl/kernels/cvt.cl
+++ b/ggml/src/ggml-opencl/kernels/cvt.cl
@@ -220,6 +220,10 @@ kernel void kernel_convert_block_q4_0_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK4_0;
     uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -263,6 +267,10 @@ kernel void kernel_restore_block_q4_0_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK4_0;
     uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint src_d_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -401,6 +409,10 @@ kernel void kernel_convert_block_q4_1_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK4_1;
     uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -446,6 +458,10 @@ kernel void kernel_restore_block_q4_1_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK4_1;
     uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint src_dm_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -491,6 +507,10 @@ kernel void kernel_convert_block_q5_0_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK5_0;
     uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -536,6 +556,10 @@ kernel void kernel_restore_block_q5_0_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK5_0;
     uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -583,6 +607,10 @@ kernel void kernel_convert_block_q5_1_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK5_1;
     uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -630,6 +658,10 @@ kernel void kernel_restore_block_q5_1_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK5_1;
     uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -664,6 +696,415 @@ kernel void kernel_restore_block_q5_1_trans4_ns(
     ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
 }
 
+kernel void kernel_convert_block_q4_k_trans4_ns(
+    __global struct block_q4_K * src0,
+    __global uint  * dst_q,
+    __global half  * dst_d,
+    __global half  * dst_dm,
+    __global uchar * dst_s,
+    uint ne00,
+    uint ne01,
+    uchar mask_0F,
+    uchar mask_F0
+) {
+    uint i00 = get_global_id(1);
+    uint i01 = get_global_id(0);
+    uint i02 = get_global_id(2);
+
+    if (i01 >= ne01) {
+        return;
+    }
+
+    uint ne00_blk = ne00 / QK_K;
+    uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
+    uint dst_blk_offset = i01 + i00 * ne01     + i02 * ne00_blk * ne01;
+
+    __global struct block_q4_K * b = src0 + src_blk_offset;
+
+    dst_d [dst_blk_offset] = b->d;
+    dst_dm[dst_blk_offset] = b->dm;
+
+    uint4 qv[8];
+    uchar * qv_bytes = (uchar *)qv;
+    for (int i = 0; i < QK_K / 64; ++i) {
+        for (int j = 0; j < 16; ++j) {
+            uchar x0 = b->q[i*32 + 2*j];
+            uchar x1 = b->q[i*32 + 2*j + 1];
+
+            qv_bytes[i*32 + j     ] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4);
+            qv_bytes[i*32 + j + 16] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0);
+        }
+    }
+
+    uint base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01;
+    #pragma unroll
+    for (int p = 0; p < 8; ++p) {
+        uint4 v = qv[p];
+        dst_q[base + (p * 4 + 0) * ne01] = v.x;
+        dst_q[base + (p * 4 + 1) * ne01] = v.y;
+        dst_q[base + (p * 4 + 2) * ne01] = v.z;
+        dst_q[base + (p * 4 + 3) * ne01] = v.w;
+    }
+
+    __global uchar * s_dst = dst_s + (i02 * ne01 + i01) * ne00_blk * K_SCALE_SIZE + i00 * K_SCALE_SIZE;
+    #pragma unroll
+    for (int i = 0; i < K_SCALE_SIZE; ++i) {
+        s_dst[i] = b->s[i];
+    }
+}
+
+kernel void kernel_restore_block_q4_k_trans4_ns(
+    __global uint  * src_q,
+    __global half  * src_d,
+    __global half  * src_dm,
+    __global uchar * src_s,
+    __global struct block_q4_K * dst0,
+    uint ne00,
+    uint ne01,
+    uchar mask_0F,
+    uchar mask_F0
+) {
+    uint i00 = get_global_id(1);  // block index along K
+    uint i01 = get_global_id(0);  // row index
+    uint i02 = get_global_id(2);  // batch index
+
+    if (i01 >= ne01) {
+        return;
+    }
+
+    uint ne00_blk = ne00 / QK_K;
+
+    uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
+    uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
+
+    __global struct block_q4_K * b = dst0 + dst_blk_offset;
+
+    b->d  = src_d[src_blk_offset];
+    b->dm = src_dm[src_blk_offset];
+
+    __global uchar * s_src = src_s + (i02 * ne01 + i01) * ne00_blk * K_SCALE_SIZE + i00 * K_SCALE_SIZE;
+    for (int i = 0; i < K_SCALE_SIZE; ++i) {
+        b->s[i] = s_src[i];
+    }
+
+    uint base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01;
+
+    uint4 qv[8];
+    for (int p = 0; p < 8; ++p) {
+        qv[p].x = src_q[base + (p * 4 + 0) * ne01];
+        qv[p].y = src_q[base + (p * 4 + 1) * ne01];
+        qv[p].z = src_q[base + (p * 4 + 2) * ne01];
+        qv[p].w = src_q[base + (p * 4 + 3) * ne01];
+    }
+
+    uchar * qv_bytes = (uchar *)qv;
+    for (int i = 0; i < QK_K / 64; ++i) {
+        for (int j = 0; j < 16; ++j) {
+            uchar lo = qv_bytes[i*32 + j];
+            uchar hi = qv_bytes[i*32 + j + 16];
+            b->q[i*32 + 2*j]     = convert_uchar((lo & mask_0F) | ((hi & mask_0F) << 4));
+            b->q[i*32 + 2*j + 1] = convert_uchar(((lo & mask_F0) >> 4) | (hi & mask_F0));
+        }
+    }
+}
+
+kernel void kernel_convert_block_q5_k_trans4_ns(
+    __global struct block_q5_K * src0,
+    __global uint  * dst_qs,
+    __global uint  * dst_qh,
+    __global half  * dst_d,
+    __global half  * dst_dm,
+    __global uchar * dst_s,
+    uint ne00,
+    uint ne01,
+    uchar mask_0F,
+    uchar mask_F0
+) {
+    uint i00 = get_global_id(1);
+    uint i01 = get_global_id(0);
+    uint i02 = get_global_id(2);
+
+    if (i01 >= ne01) {
+        return;
+    }
+
+    uint ne00_blk = ne00 / QK_K;
+    uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
+    uint dst_blk_offset = i01 + i00 * ne01     + i02 * ne00_blk * ne01;
+
+    __global struct block_q5_K * b = src0 + src_blk_offset;
+
+    dst_d [dst_blk_offset] = b->d;
+    dst_dm[dst_blk_offset] = b->dm;
+
+    for (int k = 0; k < 8; k++) {
+        uchar b0 = 0, b1 = 0, b2 = 0, b3 = 0;
+        for (int bit = 0; bit < 8; bit++) {
+            b0 |= (uchar)(((b->qh[bit]      >> k) & 1) << bit);
+            b1 |= (uchar)(((b->qh[8  + bit] >> k) & 1) << bit);
+            b2 |= (uchar)(((b->qh[16 + bit] >> k) & 1) << bit);
+            b3 |= (uchar)(((b->qh[24 + bit] >> k) & 1) << bit);
+        }
+        uint packed = (uint)b0 | ((uint)b1 << 8) | ((uint)b2 << 16) | ((uint)b3 << 24);
+        dst_qh[i01 + (i00 * 8 + k) * ne01 + i02 * ne00_blk * 8 * ne01] = packed;
+    }
+
+    uint4 qv[8];
+    uchar * qv_bytes = (uchar *)qv;
+    for (int i = 0; i < QK_K / 64; ++i) {
+        for (int j = 0; j < 16; ++j) {
+            uchar x0 = b->qs[i*32 + 2*j];
+            uchar x1 = b->qs[i*32 + 2*j + 1];
+
+            qv_bytes[i*32 + j     ] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4);
+            qv_bytes[i*32 + j + 16] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0);
+        }
+    }
+
+    uint base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01;
+    #pragma unroll
+    for (int p = 0; p < 8; ++p) {
+        uint4 v = qv[p];
+        dst_qs[base + (p * 4 + 0) * ne01] = v.x;
+        dst_qs[base + (p * 4 + 1) * ne01] = v.y;
+        dst_qs[base + (p * 4 + 2) * ne01] = v.z;
+        dst_qs[base + (p * 4 + 3) * ne01] = v.w;
+    }
+
+    __global uchar * s_dst = dst_s + (i02 * ne01 + i01) * ne00_blk * K_SCALE_SIZE + i00 * K_SCALE_SIZE;
+    #pragma unroll
+    for (int i = 0; i < K_SCALE_SIZE; ++i) {
+        s_dst[i] = b->s[i];
+    }
+}
+
+kernel void kernel_restore_block_q5_k_trans4_ns(
+    __global uint  * src_qs,
+    __global uint  * src_qh,
+    __global half  * src_d,
+    __global half  * src_dm,
+    __global uchar * src_s,
+    __global struct block_q5_K * dst0,
+    uint ne00,
+    uint ne01,
+    uchar mask_0F,
+    uchar mask_F0
+) {
+    uint i00 = get_global_id(1);  // block index along K
+    uint i01 = get_global_id(0);  // row index
+    uint i02 = get_global_id(2);  // batch index
+
+    if (i01 >= ne01) {
+        return;
+    }
+
+    uint ne00_blk = ne00 / QK_K;
+
+    uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
+    uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
+
+    __global struct block_q5_K * b = dst0 + dst_blk_offset;
+
+    b->d  = src_d[src_blk_offset];
+    b->dm = src_dm[src_blk_offset];
+
+    for (int j = 0; j < 32; j++) b->qh[j] = 0;
+    for (int k = 0; k < 8; k++) {
+        uint packed = src_qh[i01 + (i00 * 8 + k) * ne01 + i02 * ne00_blk * 8 * ne01];
+        uchar b0 = (uchar)(packed & 0xFF);
+        uchar b1 = (uchar)((packed >> 8) & 0xFF);
+        uchar b2 = (uchar)((packed >> 16) & 0xFF);
+        uchar b3 = (uchar)((packed >> 24) & 0xFF);
+        for (int bit = 0; bit < 8; bit++) {
+            b->qh[bit]      |= (uchar)(((b0 >> bit) & 1) << k);
+            b->qh[8  + bit] |= (uchar)(((b1 >> bit) & 1) << k);
+            b->qh[16 + bit] |= (uchar)(((b2 >> bit) & 1) << k);
+            b->qh[24 + bit] |= (uchar)(((b3 >> bit) & 1) << k);
+        }
+    }
+
+    __global uchar * s_src = src_s + (i02 * ne01 + i01) * ne00_blk * K_SCALE_SIZE + i00 * K_SCALE_SIZE;
+    for (int i = 0; i < K_SCALE_SIZE; ++i) {
+        b->s[i] = s_src[i];
+    }
+
+    uint base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01;
+
+    uint4 qv[8];
+    for (int p = 0; p < 8; ++p) {
+        qv[p].x = src_qs[base + (p * 4 + 0) * ne01];
+        qv[p].y = src_qs[base + (p * 4 + 1) * ne01];
+        qv[p].z = src_qs[base + (p * 4 + 2) * ne01];
+        qv[p].w = src_qs[base + (p * 4 + 3) * ne01];
+    }
+
+    uchar * qv_bytes = (uchar *)qv;
+    for (int i = 0; i < QK_K / 64; ++i) {
+        for (int j = 0; j < 16; ++j) {
+            uchar lo = qv_bytes[i*32 + j];
+            uchar hi = qv_bytes[i*32 + j + 16];
+            b->qs[i*32 + 2*j]     = convert_uchar((lo & mask_0F) | ((hi & mask_0F) << 4));
+            b->qs[i*32 + 2*j + 1] = convert_uchar(((lo & mask_F0) >> 4) | (hi & mask_F0));
+        }
+    }
+}
+
+kernel void kernel_convert_block_q6_k_trans4_ns(
+    __global struct block_q6_K * src0,
+    __global uint  * dst_ql,
+    __global uint  * dst_qh,
+    __global half  * dst_d,
+    __global char  * dst_s,
+    uint ne00,
+    uint ne01,
+    uchar mask_0F,
+    uchar mask_F0
+) {
+    uint i00 = get_global_id(1);
+    uint i01 = get_global_id(0);
+    uint i02 = get_global_id(2);
+
+    if (i01 >= ne01) {
+        return;
+    }
+
+    uint ne00_blk = ne00 / QK_K;
+
+    uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
+    uint dst_blk_offset = i01 + i00 * ne01     + i02 * ne00_blk * ne01;
+
+    __global struct block_q6_K * b = src0 + src_blk_offset;
+
+    dst_d[dst_blk_offset] = b->d;
+
+    uint4 qlv[8];
+    uchar * qlv_bytes = (uchar *)qlv;
+    for (int i = 0; i < 2; ++i) {
+        for (int j = 0; j < 16; ++j) {
+            uchar x0 = b->ql[i*64 + 2*j];
+            uchar x1 = b->ql[i*64 + 2*j + 1];
+            uchar x2 = b->ql[i*64 + 32 + 2*j];
+            uchar x3 = b->ql[i*64 + 32 + 2*j + 1];
+            qlv_bytes[i*64 + j     ] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4);
+            qlv_bytes[i*64 + j + 16] = convert_uchar(x2 & mask_0F) | convert_uchar((x3 & mask_0F) << 4);
+            qlv_bytes[i*64 + j + 32] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0);
+            qlv_bytes[i*64 + j + 48] = convert_uchar((x2 & mask_F0) >> 4) | convert_uchar(x3 & mask_F0);
+        }
+    }
+
+    uint ql_base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01;
+
+    #pragma unroll
+    for (int p = 0; p < 8; ++p) {
+        uint4 v = qlv[p];
+        dst_ql[ql_base + (p * 4 + 0) * ne01] = v.x;
+        dst_ql[ql_base + (p * 4 + 1) * ne01] = v.y;
+        dst_ql[ql_base + (p * 4 + 2) * ne01] = v.z;
+        dst_ql[ql_base + (p * 4 + 3) * ne01] = v.w;
+    }
+
+    uint qhv[16] = {0};
+
+    for (int n = 0; n < 2; ++n) {
+        for (int l = 0; l < 32; ++l) {
+            uchar h = b->qh[n*32 + l];
+            int u = l / 16;
+            int bit_pos = (l % 16) * 2;
+            qhv[(n*4 + 0)*2 + u] |= ((uint)((h >> 0) & 0x03)) << bit_pos;
+            qhv[(n*4 + 1)*2 + u] |= ((uint)((h >> 2) & 0x03)) << bit_pos;
+            qhv[(n*4 + 2)*2 + u] |= ((uint)((h >> 4) & 0x03)) << bit_pos;
+            qhv[(n*4 + 3)*2 + u] |= ((uint)((h >> 6) & 0x03)) << bit_pos;
+        }
+    }
+
+    uint qh_base = i02 * ne00_blk * ne01 * 16 + i00 * ne01 * 16 + i01;
+
+    for (int p = 0; p < 16; ++p) {
+        dst_qh[qh_base + p * ne01] = qhv[p];
+    }
+
+    __global char * s_dst = dst_s + (i02 * ne01 + i01) * ne00_blk * 16 + i00 * 16;
+    #pragma unroll
+    for (int i = 0; i < 16; ++i) {
+        s_dst[i] = b->scales[i];
+    }
+}
+
+kernel void kernel_restore_block_q6_k_trans4_ns(
+    __global uint  * src_ql,
+    __global uint  * src_qh,
+    __global half  * src_d,
+    __global char  * src_s,
+    __global struct block_q6_K * dst0,
+    uint ne00,
+    uint ne01,
+    uchar mask_0F,
+    uchar mask_F0
+) {
+    uint i00 = get_global_id(1);  // block index along K
+    uint i01 = get_global_id(0);  // row index
+    uint i02 = get_global_id(2);  // batch index
+
+    if (i01 >= ne01) {
+        return;
+    }
+
+    uint ne00_blk = ne00 / QK_K;
+
+    uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
+    uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
+
+    __global struct block_q6_K * b = dst0 + dst_blk_offset;
+
+    b->d = src_d[src_blk_offset];
+
+    uint ql_base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01;
+    uint4 qlv[8];
+    for (int p = 0; p < 8; ++p) {
+        qlv[p].x = src_ql[ql_base + (p * 4 + 0) * ne01];
+        qlv[p].y = src_ql[ql_base + (p * 4 + 1) * ne01];
+        qlv[p].z = src_ql[ql_base + (p * 4 + 2) * ne01];
+        qlv[p].w = src_ql[ql_base + (p * 4 + 3) * ne01];
+    }
+
+    uchar * qlv_bytes = (uchar *)qlv;
+    for (int i = 0; i < 2; ++i) {
+        for (int j = 0; j < 16; ++j) {
+            uchar lo_02 = qlv_bytes[i*64 + j];
+            uchar lo_13 = qlv_bytes[i*64 + j + 16];
+            uchar hi_02 = qlv_bytes[i*64 + j + 32];
+            uchar hi_13 = qlv_bytes[i*64 + j + 48];
+            b->ql[i*64 + 2*j]          = convert_uchar((lo_02 & mask_0F) | ((hi_02 & mask_0F) << 4));
+            b->ql[i*64 + 2*j + 1]      = convert_uchar(((lo_02 & mask_F0) >> 4) | (hi_02 & mask_F0));
+            b->ql[i*64 + 32 + 2*j]     = convert_uchar((lo_13 & mask_0F) | ((hi_13 & mask_0F) << 4));
+            b->ql[i*64 + 32 + 2*j + 1] = convert_uchar(((lo_13 & mask_F0) >> 4) | (hi_13 & mask_F0));
+        }
+    }
+
+    uint qh_base = i02 * ne00_blk * ne01 * 16 + i00 * ne01 * 16 + i01;
+    uint qhv[16];
+    for (int p = 0; p < 16; ++p) {
+        qhv[p] = src_qh[qh_base + p * ne01];
+    }
+
+    for (int n = 0; n < 2; ++n) {
+        for (int l = 0; l < 32; ++l) {
+            int u = l / 16;
+            int bit_pos = (l % 16) * 2;
+            uchar v0 = (uchar)((qhv[(n*4 + 0)*2 + u] >> bit_pos) & 0x03);
+            uchar v1 = (uchar)((qhv[(n*4 + 1)*2 + u] >> bit_pos) & 0x03);
+            uchar v2 = (uchar)((qhv[(n*4 + 2)*2 + u] >> bit_pos) & 0x03);
+            uchar v3 = (uchar)((qhv[(n*4 + 3)*2 + u] >> bit_pos) & 0x03);
+            b->qh[n*32 + l] = v0 | (v1 << 2) | (v2 << 4) | (v3 << 6);
+        }
+    }
+
+    __global char * s_src = src_s + (i02 * ne01 + i01) * ne00_blk * 16 + i00 * 16;
+    for (int i = 0; i < 16; ++i) {
+        b->scales[i] = s_src[i];
+    }
+}
+
 //------------------------------------------------------------------------------
 // block_mxfp4
 //------------------------------------------------------------------------------
@@ -762,6 +1203,10 @@ kernel void kernel_convert_block_mxfp4_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK_MXFP4;
     uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -805,6 +1250,10 @@ kernel void kernel_restore_block_mxfp4_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK_MXFP4;
     uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint src_d_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl
index e404f392bdd..02cdbdd9fb1 100644
--- a/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl
@@ -163,7 +163,7 @@ kernel void kernel_gemm_moe_mxfp4_f32_ns(
     uint block_id_n = get_global_id(2); // n_tile
 
     // Boundary check
-    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+    if (block_id_n >= total_tiles[0]) {
         return;
     }
 
@@ -248,6 +248,10 @@ kernel void kernel_gemm_moe_mxfp4_f32_ns(
         dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
     }
 
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
     // Load poster router and share in LM
     __local uint out_idx[TILESIZE_N];
 
diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl
index 02290c17eb1..d403ed0cab1 100644
--- a/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl
@@ -115,7 +115,7 @@ kernel void kernel_gemm_moe_q4_0_f32_ns(
     uint block_id_n = get_global_id(2); // n_tile
 
     // Boundary check
-    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+    if (block_id_n >= total_tiles[0]) {
         return;
     }
 
@@ -198,6 +198,10 @@ kernel void kernel_gemm_moe_q4_0_f32_ns(
         dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
     }
 
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
     // Load poster router and share in LM
     __local uint out_idx[TILESIZE_N];
 
diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl
index e2574ae0187..b2bddf3f73a 100644
--- a/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl
@@ -116,7 +116,7 @@ kernel void kernel_gemm_moe_q4_1_f32_ns(
     uint block_id_n = get_global_id(2); // n_tile
 
     // Boundary check
-    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+    if (block_id_n >= total_tiles[0]) {
         return;
     }
 
@@ -200,6 +200,10 @@ kernel void kernel_gemm_moe_q4_1_f32_ns(
         dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
     }
 
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
     // Load poster router and share in LM
     __local uint out_idx[TILESIZE_N];
 
diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl
new file mode 100644
index 00000000000..ab8228d18ca
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl
@@ -0,0 +1,283 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
+#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
+
+#define TILESIZE_K 16
+#define TILESIZE_M 64
+#define TILESIZE_N 32
+#define QK_K 256
+#define K_SCALE_SIZE 12
+
+inline void get_scale_min_k4(
+    int j,
+    global const uchar * q,
+    uchar * d,
+    uchar * m
+) {
+    if (j < 4) {
+        *d = q[j]   & 63;
+        *m = q[j+4] & 63;
+    } else {
+        *d = (q[j+4] & 0x0F) | ((q[j-4] & 0xC0) >> 2);
+        *m = ((q[j+4] >> 4) & 0x0F) | ((q[j]   & 0xC0) >> 2);
+    }
+}
+
+#define dequantize_q4_k(q4, a_f16, scale, minv) \
+    a_f16.s0 = (half)((float)(q4.s0 & 0x000F) * scale - minv); \
+    a_f16.s1 = (half)((float)((q4.s0 & 0x00F0) >> 4) * scale - minv); \
+    a_f16.s2 = (half)((float)((q4.s0 & 0x0F00) >> 8) * scale - minv); \
+    a_f16.s3 = (half)((float)((q4.s0 & 0xF000) >> 12) * scale - minv); \
+    a_f16.s4 = (half)((float)(q4.s1 & 0x000F) * scale - minv); \
+    a_f16.s5 = (half)((float)((q4.s1 & 0x00F0) >> 4) * scale - minv); \
+    a_f16.s6 = (half)((float)((q4.s1 & 0x0F00) >> 8) * scale - minv); \
+    a_f16.s7 = (half)((float)((q4.s1 & 0xF000) >> 12) * scale - minv); \
+    a_f16.s8 = (half)((float)(q4.s2 & 0x000F) * scale - minv); \
+    a_f16.s9 = (half)((float)((q4.s2 & 0x00F0) >> 4) * scale - minv); \
+    a_f16.sa = (half)((float)((q4.s2 & 0x0F00) >> 8) * scale - minv); \
+    a_f16.sb = (half)((float)((q4.s2 & 0xF000) >> 12) * scale - minv); \
+    a_f16.sc = (half)((float)(q4.s3 & 0x000F) * scale - minv); \
+    a_f16.sd = (half)((float)((q4.s3 & 0x00F0) >> 4) * scale - minv); \
+    a_f16.se = (half)((float)((q4.s3 & 0x0F00) >> 8) * scale - minv); \
+    a_f16.sf = (half)((float)((q4.s3 & 0xF000) >> 12) * scale - minv); \
+
+
+#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \
+    acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \
+    acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \
+    acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \
+    acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \
+    acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \
+    acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \
+    acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \
+    acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \
+    acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \
+    acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \
+    acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \
+    acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \
+    acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \
+    acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \
+    acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \
+    acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \
+    acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \
+    acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \
+    acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \
+    acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \
+    acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \
+    acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \
+    acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \
+    acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \
+    acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \
+    acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \
+    acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \
+    acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \
+    acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \
+    acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \
+    acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \
+    acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \
+    c_reg.lo += convert_float8(acc.lo); \
+    c_reg.hi += convert_float8(acc.hi); \
+    acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \
+    acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \
+    acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \
+    acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \
+    acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \
+    acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \
+    acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \
+    acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \
+    acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \
+    acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \
+    acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \
+    acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \
+    acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \
+    acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \
+    acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \
+    acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \
+    acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \
+    acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \
+    acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \
+    acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \
+    acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \
+    acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \
+    acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \
+    acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \
+    acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \
+    acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \
+    acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \
+    acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \
+    acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \
+    acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \
+    acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \
+    acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \
+    c_reg.lo += convert_float8(acc.lo); \
+    c_reg.hi += convert_float8(acc.hi); \
+
+
+__attribute__((qcom_wave_pair_mode(1)))
+kernel void kernel_gemm_moe_q4_k_f32_ns(
+        __read_only  image1d_buffer_t src0_q,
+        __global     half *           src0_d,
+        __global     half *           src0_dm,
+        __global     uchar *          src0_s,
+        __read_only  image1d_buffer_t src1,
+        __global     uint *           src2,
+        __global     ushort *         src2_emap,
+        __write_only image1d_buffer_t dst,
+        __global     int *            total_tiles,
+        uint ne00,
+        uint ne01
+) {
+    uint block_id_m = get_global_id(1); // m_tile
+    uint block_id_n = get_global_id(2); // n_tile
+
+    // Boundary check
+    if (block_id_n >= total_tiles[0]) {
+        return;
+    }
+
+    __private half16 reg_a;
+    __private float32 reg_c = (float32)(0);
+    __local half4 shared_b[128];
+
+    const ushort expert_id = src2_emap[block_id_n];
+
+    const uint row = block_id_m * TILESIZE_M;
+    const uint col = block_id_n * TILESIZE_N;
+
+    uint sub_block_id_m = get_local_id(0);
+    uint2 b_global_offset;
+    b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00;
+    b_global_offset.y = b_global_offset.x + (16 * ne00);
+    uint2 b_local_offset;
+    b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2);
+    b_local_offset.y = b_local_offset.x + 16;
+
+    uint num_superblocks = ne00 / QK_K;
+    uint scales_per_row = num_superblocks * K_SCALE_SIZE;
+    uint row_idx = row + get_global_id(0);
+
+    // Loop along K axis, 32 elements per iteration (one sub-block), divided into 2 halves of 16
+    for (uint step = 0; step < ne00; step += TILESIZE_K * 2) {
+        uint sub = step / 32;
+        uint sb = sub / 8;
+        uint j = sub % 8;
+
+        // Load d and dm for super-block
+        uint d_offset = row + sb * ne01 + expert_id * num_superblocks * ne01 + get_global_id(0);
+        half d_val = src0_d[d_offset];
+        half dm_val = src0_dm[d_offset];
+
+        // Load sub-block scale and min
+        global const uchar * sc = src0_s + (expert_id * ne01 + row_idx) * scales_per_row + sb * K_SCALE_SIZE;
+        uchar sv, mn;
+        get_scale_min_k4(j, sc, &sv, &mn);
+
+        float scale = (float)d_val * (float)sv;
+        float minv = (float)dm_val * (float)mn;
+
+        // First sub-block (16 elements)
+        uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
+        uint b_sub_offset = col * ne00 + step;
+
+        // Load 16 q (64-bits) in transposed layout
+        uint2 q4x16;
+        q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x;
+        q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x;
+
+        // Load 16x32 floats from matrix B
+        float8 bx8_f32;
+        bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
+        bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
+        half8 bx8_f16 = convert_half8(bx8_f32);
+        shared_b[b_local_offset.x] = bx8_f16.lo;
+        shared_b[b_local_offset.y] = bx8_f16.hi;
+
+        // Dequantization
+        dequantize_q4_k(as_ushort4(q4x16), reg_a, scale, minv);
+
+        sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+        half16 acc;
+        dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
+        dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
+
+        // Second half (next 16 elements, same sub-block scale)
+        uint half_step = step + TILESIZE_K;
+        q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
+        b_sub_offset = col * ne00 + half_step;
+
+        q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x;
+        q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x;
+
+        bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
+        bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
+        bx8_f16 = convert_half8(bx8_f32);
+        shared_b[b_local_offset.x] = bx8_f16.lo;
+        shared_b[b_local_offset.y] = bx8_f16.hi;
+
+        dequantize_q4_k(as_ushort4(q4x16), reg_a, scale, minv);
+
+        sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+        dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
+        dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
+    }
+
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
+    // Load post router and share in LM
+    __local uint out_idx[TILESIZE_N];
+
+    if (get_local_id(0) < TILESIZE_N) {
+        uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)];
+        if (idx == 0xFFFFFFFF) {
+            idx = src2[block_id_n * TILESIZE_N + 0];
+        }
+        out_idx[get_local_id(0)] = idx * ne01;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Scatter results back to original position in output grid
+    uint m_offset = row + get_local_id(0);
+
+    write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1));
+    write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2));
+    write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3));
+    write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4));
+    write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5));
+    write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6));
+    write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7));
+    write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8));
+    write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9));
+    write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa));
+    write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb));
+    write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc));
+    write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd));
+    write_imagef(dst, out_idx[14] + m_offset, (reg_c.se));
+    write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf));
+    write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg));
+    write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh));
+    write_imagef(dst, out_idx[18] + m_offset, (reg_c.si));
+    write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj));
+    write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk));
+    write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl));
+    write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm));
+    write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn));
+    write_imagef(dst, out_idx[24] + m_offset, (reg_c.so));
+    write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp));
+    write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq));
+    write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr));
+    write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss));
+    write_imagef(dst, out_idx[29] + m_offset, (reg_c.st));
+    write_imagef(dst, out_idx[30] + m_offset, (reg_c.su));
+    write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv));
+
+    // Store zero padding parts to the index of first output in tile
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0));
+}
diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl
index 3524cb1bdbd..d1a35d58bb2 100644
--- a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl
@@ -116,7 +116,7 @@ kernel void kernel_gemm_moe_q5_0_f32_ns(
     uint block_id_n = get_global_id(2); // n_tile
 
     // Boundary check
-    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+    if (block_id_n >= total_tiles[0]) {
         return;
     }
 
@@ -202,6 +202,10 @@ kernel void kernel_gemm_moe_q5_0_f32_ns(
         dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
     }
 
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
     // Load poster router and share in LM
     __local uint out_idx[TILESIZE_N];
 
diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl
index 5fc2a523234..90d345ecf51 100644
--- a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl
@@ -117,7 +117,7 @@ kernel void kernel_gemm_moe_q5_1_f32_ns(
     uint block_id_n = get_global_id(2); // n_tile
 
     // Boundary check
-    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+    if (block_id_n >= total_tiles[0]) {
         return;
     }
 
@@ -204,6 +204,10 @@ kernel void kernel_gemm_moe_q5_1_f32_ns(
         dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
     }
 
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
     // Load poster router and share in LM
     __local uint out_idx[TILESIZE_N];
 
diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl
new file mode 100644
index 00000000000..13c26f6f3b6
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl
@@ -0,0 +1,288 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
+#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
+
+#define TILESIZE_K 16
+#define TILESIZE_M 64
+#define TILESIZE_N 32
+#define QK_K 256
+#define K_SCALE_SIZE 12
+
+inline void get_scale_min_k4(
+    int j,
+    global const uchar * q,
+    uchar * d,
+    uchar * m
+) {
+    if (j < 4) {
+        *d = q[j]   & 63;
+        *m = q[j+4] & 63;
+    } else {
+        *d = (q[j+4] & 0x0F) | ((q[j-4] & 0xC0) >> 2);
+        *m = ((q[j+4] >> 4) & 0x0F) | ((q[j]   & 0xC0) >> 2);
+    }
+}
+
+#define dequantize_q5_k(qs5x16, qh5x16, a_f16, scale, m) \
+    a_f16.s0 = (half)((float)(( qs5x16.s0 & 0x000F)        | (( qh5x16.s0       & 0x01) << 4)) * scale + m); \
+    a_f16.s1 = (half)((float)((((qs5x16.s0 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 1) & 0x01) << 4)) * scale + m)); \
+    a_f16.s2 = (half)((float)((((qs5x16.s0 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 2) & 0x01) << 4)) * scale + m)); \
+    a_f16.s3 = (half)((float)((((qs5x16.s0 & 0xF000) >> 12) | (((qh5x16.s0 >> 3) & 0x01) << 4)) * scale + m)); \
+    a_f16.s4 = (half)((float)((( qs5x16.s1 & 0x000F)        | (((qh5x16.s0 >> 4) & 0x01) << 4)) * scale + m)); \
+    a_f16.s5 = (half)((float)((((qs5x16.s1 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 5) & 0x01) << 4)) * scale + m)); \
+    a_f16.s6 = (half)((float)(((qs5x16.s1 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 6) & 0x01) << 4)) * scale + m); \
+    a_f16.s7 = (half)((float)((((qs5x16.s1 & 0xF000) >> 12) | (((qh5x16.s0 >> 7) & 0x01) << 4)) * scale + m)); \
+    a_f16.s8 = (half)((float)((( qs5x16.s2 & 0x000F)        | (( qh5x16.s1       & 0x01) << 4)) * scale + m)); \
+    a_f16.s9 = (half)((float)((((qs5x16.s2 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 1) & 0x01) << 4)) * scale + m)); \
+    a_f16.sa = (half)((float)((((qs5x16.s2 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 2) & 0x01) << 4)) * scale + m)); \
+    a_f16.sb = (half)((float)((((qs5x16.s2 & 0xF000) >> 12) | (((qh5x16.s1 >> 3) & 0x01) << 4)) * scale + m)); \
+    a_f16.sc = (half)((float)((( qs5x16.s3 & 0x000F)        | (((qh5x16.s1 >> 4) & 0x01) << 4)) * scale + m)); \
+    a_f16.sd = (half)((float)((((qs5x16.s3 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 5) & 0x01) << 4)) * scale + m)); \
+    a_f16.se = (half)((float)((((qs5x16.s3 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 6) & 0x01) << 4)) * scale + m)); \
+    a_f16.sf = (half)((float)((((qs5x16.s3 & 0xF000) >> 12) | (((qh5x16.s1 >> 7) & 0x01) << 4)) * scale + m)); \
+
+
+#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \
+    acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \
+    acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \
+    acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \
+    acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \
+    acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \
+    acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \
+    acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \
+    acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \
+    acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \
+    acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \
+    acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \
+    acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \
+    acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \
+    acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \
+    acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \
+    acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \
+    acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \
+    acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \
+    acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \
+    acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \
+    acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \
+    acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \
+    acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \
+    acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \
+    acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \
+    acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \
+    acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \
+    acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \
+    acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \
+    acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \
+    acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \
+    acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \
+    c_reg.lo += convert_float8(acc.lo); \
+    c_reg.hi += convert_float8(acc.hi); \
+    acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \
+    acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \
+    acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \
+    acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \
+    acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \
+    acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \
+    acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \
+    acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \
+    acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \
+    acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \
+    acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \
+    acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \
+    acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \
+    acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \
+    acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \
+    acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \
+    acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \
+    acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \
+    acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \
+    acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \
+    acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \
+    acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \
+    acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \
+    acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \
+    acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \
+    acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \
+    acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \
+    acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \
+    acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \
+    acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \
+    acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \
+    acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \
+    c_reg.lo += convert_float8(acc.lo); \
+    c_reg.hi += convert_float8(acc.hi); \
+
+
+__attribute__((qcom_wave_pair_mode(1)))
+kernel void kernel_gemm_moe_q5_k_f32_ns(
+        __read_only  image1d_buffer_t src0_q,
+        __global     uint *           src0_qh,
+        __global     uchar *          src0_s,
+        __global     half *           src0_d,
+        __global     half *           src0_dm,
+        __read_only  image1d_buffer_t src1,
+        __global     uint *           src2,
+        __global     ushort *         src2_emap,
+        __write_only image1d_buffer_t dst,
+        __global     int *            total_tiles,
+        uint ne00,
+        uint ne01
+) {
+    uint block_id_m = get_global_id(1); // m_tile
+    uint block_id_n = get_global_id(2); // n_tile
+
+    // Boundary check
+    if (block_id_n >= total_tiles[0]) {
+        return;
+    }
+
+    __private half16 reg_a;
+    __private float32 reg_c = (float32)(0);
+    __local half4 shared_b[128];
+
+    const ushort expert_id = src2_emap[block_id_n];
+
+    const uint row = block_id_m * TILESIZE_M;
+    const uint col = block_id_n * TILESIZE_N;
+
+    uint sub_block_id_m = get_local_id(0);
+    uint2 b_global_offset;
+    b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00;
+    b_global_offset.y = b_global_offset.x + (16 * ne00);
+    uint2 b_local_offset;
+    b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2);
+    b_local_offset.y = b_local_offset.x + 16;
+
+    uint num_superblocks = ne00 / QK_K;
+    uint scales_per_row = num_superblocks * K_SCALE_SIZE;
+    uint row_idx = row + get_global_id(0);
+
+    // Loop along K axis, 32 elements per iteration (one sub-block), divided into 2 halves of 16
+    for (uint step = 0; step < ne00; step += TILESIZE_K * 2) {
+        uint sub = step / 32;
+        uint sb = sub / 8;
+        uint j = sub % 8;
+
+        // Load d and dm for super-block
+        uint d_offset = row + sb * ne01 + expert_id * num_superblocks * ne01 + get_global_id(0);
+        half d_val = src0_d[d_offset];
+        half dm_val = src0_dm[d_offset];
+
+        // Load sub-block scale and min
+        global const uchar * sc = src0_s + (expert_id * ne01 + row_idx) * scales_per_row + sb * K_SCALE_SIZE;
+        uchar sv, mn;
+        get_scale_min_k4(j, sc, &sv, &mn);
+
+        float scale = (float)d_val * (float)sv;
+        float minv = -(float)dm_val * (float)mn;
+
+        // qh is stored at sub-block granularity
+        uint qh_offset = row + sub * ne01 + expert_id * num_superblocks * 8 * ne01 + get_global_id(0);
+        uchar4 qhx32 = as_uchar4(src0_qh[qh_offset]);
+
+        // First sub-block (16 elements)
+        uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
+        uint b_sub_offset = col * ne00 + step;
+
+        // Load 16 q (64-bits) in transposed layout
+        uint2 q4x16;
+        q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x;
+        q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x;
+
+        // Load 16x32 floats from matrix B
+        float8 bx8_f32;
+        bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
+        bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
+        half8 bx8_f16 = convert_half8(bx8_f32);
+        shared_b[b_local_offset.x] = bx8_f16.lo;
+        shared_b[b_local_offset.y] = bx8_f16.hi;
+
+        // Dequantization
+        dequantize_q5_k(as_ushort4(q4x16), qhx32.lo, reg_a, scale, minv);
+
+        sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+        half16 acc;
+        dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
+        dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
+
+        // Second half
+        uint half_step = step + TILESIZE_K;
+        q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
+        b_sub_offset = col * ne00 + half_step;
+
+        q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x;
+        q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x;
+
+        bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
+        bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
+        bx8_f16 = convert_half8(bx8_f32);
+        shared_b[b_local_offset.x] = bx8_f16.lo;
+        shared_b[b_local_offset.y] = bx8_f16.hi;
+
+        dequantize_q5_k(as_ushort4(q4x16), qhx32.hi, reg_a, scale, minv);
+
+        sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+        dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
+        dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
+    }
+
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
+    // Load post router and share in LM
+    __local uint out_idx[TILESIZE_N];
+
+    if (get_local_id(0) < TILESIZE_N) {
+        uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)];
+        if (idx == 0xFFFFFFFF) {
+            idx = src2[block_id_n * TILESIZE_N + 0];
+        }
+        out_idx[get_local_id(0)] = idx * ne01;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Scatter results back to original position in output grid
+    uint m_offset = row + get_local_id(0);
+
+    write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1));
+    write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2));
+    write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3));
+    write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4));
+    write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5));
+    write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6));
+    write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7));
+    write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8));
+    write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9));
+    write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa));
+    write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb));
+    write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc));
+    write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd));
+    write_imagef(dst, out_idx[14] + m_offset, (reg_c.se));
+    write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf));
+    write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg));
+    write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh));
+    write_imagef(dst, out_idx[18] + m_offset, (reg_c.si));
+    write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj));
+    write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk));
+    write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl));
+    write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm));
+    write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn));
+    write_imagef(dst, out_idx[24] + m_offset, (reg_c.so));
+    write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp));
+    write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq));
+    write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr));
+    write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss));
+    write_imagef(dst, out_idx[29] + m_offset, (reg_c.st));
+    write_imagef(dst, out_idx[30] + m_offset, (reg_c.su));
+    write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv));
+
+    // Store zero padding parts to the index of first output in tile
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0));
+}
diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl
new file mode 100644
index 00000000000..85ccebec78c
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl
@@ -0,0 +1,267 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
+#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
+
+#define TILESIZE_K 16
+#define TILESIZE_M 64
+#define TILESIZE_N 32
+#define QK_K 256
+
+#define dequantize_q6_k(qs16, qh16, a_f16, scale) \
+    a_f16.s0 = (half)(((float)(( qs16.s0 & 0x000F)        | ((uint)(( qh16       ) & 0x3) << 4)) - 32.f) * scale); \
+    a_f16.s1 = (half)(((float)((( qs16.s0 >> 4) & 0x000F) | ((uint)(( qh16 >>  2) & 0x3) << 4)) - 32.f) * scale); \
+    a_f16.s2 = (half)(((float)((( qs16.s0 >> 8) & 0x000F) | ((uint)(( qh16 >>  4) & 0x3) << 4)) - 32.f) * scale); \
+    a_f16.s3 = (half)(((float)((( qs16.s0 >>12) & 0x000F) | ((uint)(( qh16 >>  6) & 0x3) << 4)) - 32.f) * scale); \
+    a_f16.s4 = (half)(((float)(( qs16.s1 & 0x000F)        | ((uint)(( qh16 >>  8) & 0x3) << 4)) - 32.f) * scale); \
+    a_f16.s5 = (half)(((float)((( qs16.s1 >> 4) & 0x000F) | ((uint)(( qh16 >> 10) & 0x3) << 4)) - 32.f) * scale); \
+    a_f16.s6 = (half)(((float)((( qs16.s1 >> 8) & 0x000F) | ((uint)(( qh16 >> 12) & 0x3) << 4)) - 32.f) * scale); \
+    a_f16.s7 = (half)(((float)((( qs16.s1 >>12) & 0x000F) | ((uint)(( qh16 >> 14) & 0x3) << 4)) - 32.f) * scale); \
+    a_f16.s8 = (half)(((float)(( qs16.s2 & 0x000F)        | ((uint)(( qh16 >> 16) & 0x3) << 4)) - 32.f) * scale); \
+    a_f16.s9 = (half)(((float)((( qs16.s2 >> 4) & 0x000F) | ((uint)(( qh16 >> 18) & 0x3) << 4)) - 32.f) * scale); \
+    a_f16.sa = (half)(((float)((( qs16.s2 >> 8) & 0x000F) | ((uint)(( qh16 >> 20) & 0x3) << 4)) - 32.f) * scale); \
+    a_f16.sb = (half)(((float)((( qs16.s2 >>12) & 0x000F) | ((uint)(( qh16 >> 22) & 0x3) << 4)) - 32.f) * scale); \
+    a_f16.sc = (half)(((float)(( qs16.s3 & 0x000F)        | ((uint)(( qh16 >> 24) & 0x3) << 4)) - 32.f) * scale); \
+    a_f16.sd = (half)(((float)((( qs16.s3 >> 4) & 0x000F) | ((uint)(( qh16 >> 26) & 0x3) << 4)) - 32.f) * scale); \
+    a_f16.se = (half)(((float)((( qs16.s3 >> 8) & 0x000F) | ((uint)(( qh16 >> 28) & 0x3) << 4)) - 32.f) * scale); \
+    a_f16.sf = (half)(((float)((( qs16.s3 >>12) & 0x000F) | ((uint)(( qh16 >> 30) & 0x3) << 4)) - 32.f) * scale); \
+
+
+#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \
+    acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \
+    acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \
+    acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \
+    acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \
+    acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \
+    acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \
+    acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \
+    acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \
+    acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \
+    acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \
+    acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \
+    acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \
+    acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \
+    acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \
+    acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \
+    acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \
+    acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \
+    acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \
+    acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \
+    acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \
+    acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \
+    acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \
+    acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \
+    acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \
+    acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \
+    acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \
+    acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \
+    acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \
+    acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \
+    acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \
+    acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \
+    acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \
+    c_reg.lo += convert_float8(acc.lo); \
+    c_reg.hi += convert_float8(acc.hi); \
+    acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \
+    acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \
+    acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \
+    acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \
+    acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \
+    acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \
+    acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \
+    acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \
+    acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \
+    acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \
+    acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \
+    acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \
+    acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \
+    acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \
+    acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \
+    acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \
+    acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \
+    acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \
+    acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \
+    acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \
+    acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \
+    acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \
+    acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \
+    acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \
+    acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \
+    acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \
+    acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \
+    acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \
+    acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \
+    acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \
+    acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \
+    acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \
+    c_reg.lo += convert_float8(acc.lo); \
+    c_reg.hi += convert_float8(acc.hi); \
+
+
+__attribute__((qcom_wave_pair_mode(1)))
+kernel void kernel_gemm_moe_q6_k_f32_ns(
+        __read_only  image1d_buffer_t src0_ql,
+        __global     uint *           src0_qh,
+        __global     char *           src0_s,
+        __global     half *           src0_d,
+        __read_only  image1d_buffer_t src1,
+        __global     uint *           src2,
+        __global     ushort *         src2_emap,
+        __write_only image1d_buffer_t dst,
+        __global     int *            total_tiles,
+        uint ne00,
+        uint ne01
+) {
+    uint block_id_m = get_global_id(1); // m_tile
+    uint block_id_n = get_global_id(2); // n_tile
+
+    // Boundary check
+    if (block_id_n >= total_tiles[0]) {
+        return;
+    }
+
+    __private half16 reg_a;
+    __private float32 reg_c = (float32)(0);
+    __local half4 shared_b[128];
+
+    const ushort expert_id = src2_emap[block_id_n];
+
+    const uint row = block_id_m * TILESIZE_M;
+    const uint col = block_id_n * TILESIZE_N;
+
+    uint sub_block_id_m = get_local_id(0);
+    uint2 b_global_offset;
+    b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00;
+    b_global_offset.y = b_global_offset.x + (16 * ne00);
+    uint2 b_local_offset;
+    b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2);
+    b_local_offset.y = b_local_offset.x + 16;
+
+    uint num_superblocks = ne00 / QK_K;
+    uint scales_per_row = num_superblocks * 16;
+    uint row_idx = row + get_global_id(0);
+
+    // Loop along K axis, 32 elements per iteration (one sub-block), divided into 2 halves of 16
+    for (uint step = 0; step < ne00; step += TILESIZE_K * 2) {
+        uint sub = step / 32;  // 32-element group index
+        uint sb = sub / 8;     // super-block index
+        uint j = sub % 8;      // group within super-block
+
+        // Load d for super-block
+        uint d_offset = row + sb * ne01 + expert_id * num_superblocks * ne01 + get_global_id(0);
+        half d_val = src0_d[d_offset];
+
+        // Load sub-block scales
+        global const char * sc = src0_s + (expert_id * ne01 + row_idx) * scales_per_row + sb * 16;
+        float scale0 = (float)d_val * (float)sc[j * 2];
+        float scale1 = (float)d_val * (float)sc[j * 2 + 1];
+
+        uint qh_base = row + (sub * 2) * ne01 + expert_id * (num_superblocks * 16) * ne01 + get_global_id(0);
+        uint qh_first16 = src0_qh[qh_base];
+        uint qh_second16 = src0_qh[qh_base + ne01];
+
+        // First half (16 elements)
+        uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
+        uint b_sub_offset = col * ne00 + step;
+
+        // Load 16 ql nibbles (2 uints) from image
+        uint2 q4x16;
+        q4x16.x = read_imageui(src0_ql, q_sub_offset + sub_block_id_m).x;
+        q4x16.y = read_imageui(src0_ql, q_sub_offset + sub_block_id_m + ne01).x;
+
+        // Load 16x32 floats from matrix B
+        float8 bx8_f32;
+        bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
+        bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
+        half8 bx8_f16 = convert_half8(bx8_f32);
+        shared_b[b_local_offset.x] = bx8_f16.lo;
+        shared_b[b_local_offset.y] = bx8_f16.hi;
+
+        // Dequantize first 16 elements (scale0)
+        dequantize_q6_k(as_ushort4(q4x16), qh_first16, reg_a, scale0);
+
+        sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+        half16 acc;
+        dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
+        dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
+
+        // Second half
+        uint half_step = step + TILESIZE_K;
+        q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
+        b_sub_offset = col * ne00 + half_step;
+
+        q4x16.x = read_imageui(src0_ql, q_sub_offset + sub_block_id_m).x;
+        q4x16.y = read_imageui(src0_ql, q_sub_offset + sub_block_id_m + ne01).x;
+
+        bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
+        bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
+        bx8_f16 = convert_half8(bx8_f32);
+        shared_b[b_local_offset.x] = bx8_f16.lo;
+        shared_b[b_local_offset.y] = bx8_f16.hi;
+
+        dequantize_q6_k(as_ushort4(q4x16), qh_second16, reg_a, scale1);
+
+        sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+        dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
+        dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
+    }
+
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
+    // Load post router and share in LM
+    __local uint out_idx[TILESIZE_N];
+
+    if (get_local_id(0) < TILESIZE_N) {
+        uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)];
+        if (idx == 0xFFFFFFFF) {
+            idx = src2[block_id_n * TILESIZE_N + 0];
+        }
+        out_idx[get_local_id(0)] = idx * ne01;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Scatter results back to original position in output grid
+    uint m_offset = row + get_local_id(0);
+
+    write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1));
+    write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2));
+    write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3));
+    write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4));
+    write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5));
+    write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6));
+    write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7));
+    write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8));
+    write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9));
+    write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa));
+    write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb));
+    write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc));
+    write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd));
+    write_imagef(dst, out_idx[14] + m_offset, (reg_c.se));
+    write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf));
+    write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg));
+    write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh));
+    write_imagef(dst, out_idx[18] + m_offset, (reg_c.si));
+    write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj));
+    write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk));
+    write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl));
+    write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm));
+    write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn));
+    write_imagef(dst, out_idx[24] + m_offset, (reg_c.so));
+    write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp));
+    write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq));
+    write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr));
+    write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss));
+    write_imagef(dst, out_idx[29] + m_offset, (reg_c.st));
+    write_imagef(dst, out_idx[30] + m_offset, (reg_c.su));
+    write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv));
+
+    // Store zero padding parts to the index of first output in tile
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0));
+}
diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl
index e4b44c1a56a..75129e20c65 100644
--- a/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl
@@ -82,6 +82,10 @@ __kernel void kernel_gemv_moe_mxfp4_f32_ns(
     uint sgid = get_local_id(1);
     uint slid = get_sub_group_local_id();
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint i11 = i20 % ne11;
 
     uint expert_id = src2[i20];
diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl
index 6f4d3f53216..2d28db63ec5 100644
--- a/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl
@@ -37,6 +37,10 @@ __kernel void kernel_gemv_moe_q4_0_f32_ns(
     uint sgid = get_local_id(1);
     uint slid = get_sub_group_local_id();
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint i11 = i20 % ne11;
 
     uint expert_id = src2[i20];
diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl
index 3739a215705..b98bdc0f12e 100644
--- a/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl
@@ -38,6 +38,10 @@ __kernel void kernel_gemv_moe_q4_1_f32_ns(
     uint sgid = get_local_id(1);
     uint slid = get_sub_group_local_id();
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint i11 = i20 % ne11;
 
     uint expert_id = src2[i20];
diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl
new file mode 100644
index 00000000000..12464e9826e
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl
@@ -0,0 +1,155 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+
+#define QK_K 256
+#define K_SCALE_SIZE 12
+#define N_SIMDGROUP 4
+#define SIMDGROUP_WIDTH 64
+
+inline void get_scale_min_k4(
+    int j,
+    global const uchar * q,
+    uchar * d,
+    uchar * m
+) {
+    if (j < 4) {
+        *d = q[j]   & 63;
+        *m = q[j+4] & 63;
+    } else {
+        *d = (q[j+4] & 0x0F) | ((q[j-4] & 0xC0) >> 2);
+        *m = ((q[j+4] >> 4) & 0x0F) | ((q[j]   & 0xC0) >> 2);
+    }
+}
+
+static inline float8 q4_k_to_fp32_packed8(ushort2 q4x8, float scale, float minv) {
+    float8 fp32x8;
+    fp32x8.s0 = (q4x8.s0 & 0x000F) * scale - minv;
+    fp32x8.s1 = ((q4x8.s0 & 0x00F0) >> 4) * scale - minv;
+    fp32x8.s2 = ((q4x8.s0 & 0x0F00) >> 8) * scale - minv;
+    fp32x8.s3 = ((q4x8.s0 & 0xF000) >> 12) * scale - minv;
+    fp32x8.s4 = (q4x8.s1 & 0x000F) * scale - minv;
+    fp32x8.s5 = ((q4x8.s1 & 0x00F0) >> 4) * scale - minv;
+    fp32x8.s6 = ((q4x8.s1 & 0x0F00) >> 8) * scale - minv;
+    fp32x8.s7 = ((q4x8.s1 & 0xF000) >> 12) * scale - minv;
+    return fp32x8;
+}
+
+__attribute__((qcom_reqd_sub_group_size("half")))
+__kernel void kernel_gemv_moe_q4_k_f32_ns(
+    __global uint *         src0_q,
+    __global half *         src0_d,
+    __global half *         src0_dm,
+    __global uchar *        src0_s,
+    __read_only image1d_buffer_t src1,
+    __global uint *         src2,
+    __global float *        dst,
+    ulong                   offsetd,
+    int                     ne00,
+    int                     ne01,
+    int                     ne11
+) {
+    uint i01  = get_global_id(0);
+    uint i20  = get_global_id(2);
+    uint sgid = get_local_id(1);
+    uint slid = get_sub_group_local_id();
+
+    if (i01 >= ne01) {
+        return;
+    }
+
+    uint i11 = i20 % ne11;
+
+    uint expert_id = src2[i20];
+
+    int num_superblocks = ne00 / QK_K;
+    int num_subblocks = ne00 / 32;
+    int scales_per_row = num_superblocks * K_SCALE_SIZE;
+
+    // Expert offsets in the transposed noshuffle layout
+    uint expert_q_offset = expert_id * (ne00 / 8) * ne01;
+    uint expert_d_offset = expert_id * num_superblocks * ne01;
+
+    __private float sum = 0.0f;
+
+    // Loop over sub-blocks of 32 elements, N_SIMDGROUP sub-blocks per iter
+    for (uint ib = sgid; ib < num_subblocks; ib += N_SIMDGROUP) {
+        uint sb = ib / 8;
+        uint j  = ib % 8;
+
+        // Load d and dmin for this super-block
+        half d_val   = src0_d[expert_d_offset + sb * ne01 + i01];
+        half dm_val  = src0_dm[expert_d_offset + sb * ne01 + i01];
+
+        // Load sub-block scale and min
+        global const uchar * sc = src0_s + (expert_id * ne01 + i01) * scales_per_row + sb * K_SCALE_SIZE;
+        uchar sv, mn;
+        get_scale_min_k4(j, sc, &sv, &mn);
+
+        float scale = (float)d_val * (float)sv;
+        float minv  = (float)dm_val * (float)mn;
+
+        // Load 4 uints of quants (32 nibbles = 32 elements)
+        uint q_base = expert_q_offset + ib * ne01 * 4 + i01;
+
+        uint4 regQ;
+        regQ.s0 = src0_q[q_base];
+        regQ.s1 = src0_q[q_base + ne01];
+        regQ.s2 = src0_q[q_base + ne01 * 2];
+        regQ.s3 = src0_q[q_base + ne01 * 3];
+
+        // Load activations: 32 floats = 8 float4s
+        uint y_offset = i11 * ne00 / 4 + ib * 8;
+
+        float8 fp32x8 = q4_k_to_fp32_packed8(as_ushort2(regQ.s0), scale, minv);
+
+        float4 shared_y4;
+        shared_y4 = read_imagef(src1, (y_offset + 0));
+        float4 acc = shared_y4 * fp32x8.lo;
+
+        shared_y4 = read_imagef(src1, (y_offset + 1));
+        acc += shared_y4 * fp32x8.hi;
+
+        fp32x8 = q4_k_to_fp32_packed8(as_ushort2(regQ.s1), scale, minv);
+
+        shared_y4 = read_imagef(src1, (y_offset + 2));
+        acc += shared_y4 * fp32x8.lo;
+
+        shared_y4 = read_imagef(src1, (y_offset + 3));
+        acc += shared_y4 * fp32x8.hi;
+
+        fp32x8 = q4_k_to_fp32_packed8(as_ushort2(regQ.s2), scale, minv);
+
+        shared_y4 = read_imagef(src1, (y_offset + 4));
+        acc += shared_y4 * fp32x8.lo;
+
+        shared_y4 = read_imagef(src1, (y_offset + 5));
+        acc += shared_y4 * fp32x8.hi;
+
+        fp32x8 = q4_k_to_fp32_packed8(as_ushort2(regQ.s3), scale, minv);
+
+        shared_y4 = read_imagef(src1, (y_offset + 6));
+        acc += shared_y4 * fp32x8.lo;
+
+        shared_y4 = read_imagef(src1, (y_offset + 7));
+        acc += shared_y4 * fp32x8.hi;
+
+        sum += ((acc.s0 + acc.s1) + (acc.s2 + acc.s3));
+    }
+
+    // reduction in local memory, assumes #subgroups=4
+    __local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)];
+    if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum;
+    if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum;
+    if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
+    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
+    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
+
+    // 1 output per thread in subgroup 0
+    if (sgid == 0) {
+        dst = dst + (offsetd >> 2);
+        dst[i01 + i20 * ne01] = sum;
+    }
+}
diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl
index 938054cf982..b43613638a8 100644
--- a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl
@@ -38,6 +38,10 @@ __kernel void kernel_gemv_moe_q5_0_f32_ns(
     uint sgid = get_local_id(1);
     uint slid = get_sub_group_local_id();
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint i11 = i20 % ne11;
 
     uint expert_id = src2[i20];
diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl
index f33a4ef2757..7a666006e68 100644
--- a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl
@@ -39,6 +39,10 @@ __kernel void kernel_gemv_moe_q5_1_f32_ns(
     uint sgid = get_local_id(1);
     uint slid = get_sub_group_local_id();
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint i11 = i20 % ne11;
 
     uint expert_id = src2[i20];
diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl
new file mode 100644
index 00000000000..7d868d7abd9
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl
@@ -0,0 +1,160 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+
+#define QK_K 256
+#define K_SCALE_SIZE 12
+#define N_SIMDGROUP 4
+#define SIMDGROUP_WIDTH 64
+
+inline void get_scale_min_k4(
+    int j,
+    global const uchar * q,
+    uchar * d,
+    uchar * m
+) {
+    if (j < 4) {
+        *d = q[j]   & 63;
+        *m = q[j+4] & 63;
+    } else {
+        *d = (q[j+4] & 0x0F) | ((q[j-4] & 0xC0) >> 2);
+        *m = ((q[j+4] >> 4) & 0x0F) | ((q[j]   & 0xC0) >> 2);
+    }
+}
+
+static inline float8 q5_k_to_fp32_packed8(ushort2 qs5x8, uchar qh5x8, half s, half m) {
+    float8 fp32x8;
+    fp32x8.s0 = (float)((( qs5x8.s0 & 0x000F)        | (( qh5x8       & 0x01) << 4)) * s + m);
+    fp32x8.s1 = (float)((((qs5x8.s0 & 0x00F0) >> 4 ) | (((qh5x8 >> 1) & 0x01) << 4)) * s + m);
+    fp32x8.s2 = (float)((((qs5x8.s0 & 0x0F00) >> 8 ) | (((qh5x8 >> 2) & 0x01) << 4)) * s + m);
+    fp32x8.s3 = (float)((((qs5x8.s0 & 0xF000) >> 12) | (((qh5x8 >> 3) & 0x01) << 4)) * s + m);
+    fp32x8.s4 = (float)((( qs5x8.s1 & 0x000F)        | (((qh5x8 >> 4) & 0x01) << 4)) * s + m);
+    fp32x8.s5 = (float)((((qs5x8.s1 & 0x00F0) >> 4 ) | (((qh5x8 >> 5) & 0x01) << 4)) * s + m);
+    fp32x8.s6 = (float)((((qs5x8.s1 & 0x0F00) >> 8 ) | (((qh5x8 >> 6) & 0x01) << 4)) * s + m);
+    fp32x8.s7 = (float)((((qs5x8.s1 & 0xF000) >> 12) | (((qh5x8 >> 7) & 0x01) << 4)) * s + m);
+    return fp32x8;
+}
+
+__attribute__((qcom_reqd_sub_group_size("half")))
+__kernel void kernel_gemv_moe_q5_k_f32_ns(
+    __global uint *         src0_q,
+    __global uint *         src0_qh,
+    __global half *         src0_d,
+    __global half *         src0_dm,
+    __global uchar *        src0_s,
+    __read_only image1d_buffer_t src1,
+    __global uint *         src2,
+    __global float *        dst,
+    ulong                   offsetd,
+    int                     ne00,
+    int                     ne01,
+    int                     ne11
+) {
+    uint i01  = get_global_id(0);
+    uint i20  = get_global_id(2);
+    uint sgid = get_local_id(1);
+    uint slid = get_sub_group_local_id();
+
+    if (i01 >= ne01) {
+        return;
+    }
+
+    uint i11 = i20 % ne11;
+
+    uint expert_id = src2[i20];
+
+    int num_superblocks = ne00 / QK_K;
+    int num_subblocks = ne00 / 32;
+    int scales_per_row = num_superblocks * K_SCALE_SIZE;
+
+    // Expert offsets in the transposed noshuffle layout
+    uint expert_q_offset = expert_id * (ne00 / 8) * ne01;
+    uint expert_d_offset = expert_id * num_superblocks * ne01;
+
+    __private float sum = 0.0f;
+
+    // Loop over sub-blocks of 32 elements, N_SIMDGROUP sub-blocks per iter
+    for (uint ib = sgid; ib < num_subblocks; ib += N_SIMDGROUP) {
+        uint sb = ib / 8;
+        uint j  = ib % 8;
+
+        // Load d and dmin for this super-block
+        half d_val   = src0_d[expert_d_offset + sb * ne01 + i01];
+        half dm_val  = src0_dm[expert_d_offset + sb * ne01 + i01];
+
+        // sub_block index = sb * 8 + j
+        uint expert_qh_offset = expert_id * num_superblocks * 8 * ne01;
+        uchar4 regQh = as_uchar4(src0_qh[expert_qh_offset + (sb * 8 + j) * ne01 + i01]);
+
+        // Load sub-block scale and min
+        global const uchar * sc = src0_s + (expert_id * ne01 + i01) * scales_per_row + sb * K_SCALE_SIZE;
+        uchar sv, mn;
+        get_scale_min_k4(j, sc, &sv, &mn);
+
+        float scale = (float)d_val * (float)sv;
+        float minv  = -(float)dm_val * (float)mn;
+
+        // Load 4 uints of quants (32 nibbles = 32 elements)
+        uint q_base = expert_q_offset + ib * ne01 * 4 + i01;
+
+        uint4 regQ;
+        regQ.s0 = src0_q[q_base];
+        regQ.s1 = src0_q[q_base + ne01];
+        regQ.s2 = src0_q[q_base + ne01 * 2];
+        regQ.s3 = src0_q[q_base + ne01 * 3];
+
+        // Load activations: 32 floats = 8 float4s
+        uint y_offset = i11 * ne00 / 4 + ib * 8;
+
+        float8 fp32x8 = q5_k_to_fp32_packed8(as_ushort2(regQ.s0), regQh.s0, scale, minv);
+
+        float4 shared_y4;
+        shared_y4 = read_imagef(src1, (y_offset + 0));
+        float4 acc = shared_y4 * fp32x8.lo;
+
+        shared_y4 = read_imagef(src1, (y_offset + 1));
+        acc += shared_y4 * fp32x8.hi;
+
+        fp32x8 = q5_k_to_fp32_packed8(as_ushort2(regQ.s1), regQh.s1, scale, minv);
+
+        shared_y4 = read_imagef(src1, (y_offset + 2));
+        acc += shared_y4 * fp32x8.lo;
+
+        shared_y4 = read_imagef(src1, (y_offset + 3));
+        acc += shared_y4 * fp32x8.hi;
+
+        fp32x8 = q5_k_to_fp32_packed8(as_ushort2(regQ.s2), regQh.s2, scale, minv);
+
+        shared_y4 = read_imagef(src1, (y_offset + 4));
+        acc += shared_y4 * fp32x8.lo;
+
+        shared_y4 = read_imagef(src1, (y_offset + 5));
+        acc += shared_y4 * fp32x8.hi;
+
+        fp32x8 = q5_k_to_fp32_packed8(as_ushort2(regQ.s3), regQh.s3, scale, minv);
+
+        shared_y4 = read_imagef(src1, (y_offset + 6));
+        acc += shared_y4 * fp32x8.lo;
+
+        shared_y4 = read_imagef(src1, (y_offset + 7));
+        acc += shared_y4 * fp32x8.hi;
+
+        sum += ((acc.s0 + acc.s1) + (acc.s2 + acc.s3));
+    }
+
+    // reduction in local memory, assumes #subgroups=4
+    __local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)];
+    if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum;
+    if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum;
+    if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
+    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
+    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
+
+    // 1 output per thread in subgroup 0
+    if (sgid == 0) {
+        dst = dst + (offsetd >> 2);
+        dst[i01 + i20 * ne01] = sum;
+    }
+}
diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl
new file mode 100644
index 00000000000..c166bad5ba5
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl
@@ -0,0 +1,141 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+
+#define QK_K 256
+#define N_SIMDGROUP 4
+#define SIMDGROUP_WIDTH 64
+
+static inline float8 q6_k_to_fp32_packed8(ushort2 ql8, ushort qh8, float d_scale) {
+    float8 fp32x8;
+    fp32x8.s0 = ((float)(( ql8.s0 & 0x000F)        | ((uint)((qh8      ) & 0x3) << 4)) - 32.f) * d_scale;
+    fp32x8.s1 = ((float)((( ql8.s0 >> 4) & 0x000F) | ((uint)((qh8 >> 2) & 0x3) << 4)) - 32.f) * d_scale;
+    fp32x8.s2 = ((float)((( ql8.s0 >> 8) & 0x000F) | ((uint)((qh8 >> 4) & 0x3) << 4)) - 32.f) * d_scale;
+    fp32x8.s3 = ((float)((( ql8.s0 >> 12)& 0x000F) | ((uint)((qh8 >> 6) & 0x3) << 4)) - 32.f) * d_scale;
+    fp32x8.s4 = ((float)(( ql8.s1 & 0x000F)        | ((uint)((qh8 >> 8) & 0x3) << 4)) - 32.f) * d_scale;
+    fp32x8.s5 = ((float)((( ql8.s1 >> 4) & 0x000F) | ((uint)((qh8 >>10) & 0x3) << 4)) - 32.f) * d_scale;
+    fp32x8.s6 = ((float)((( ql8.s1 >> 8) & 0x000F) | ((uint)((qh8 >>12) & 0x3) << 4)) - 32.f) * d_scale;
+    fp32x8.s7 = ((float)((( ql8.s1 >> 12)& 0x000F) | ((uint)((qh8 >>14) & 0x3) << 4)) - 32.f) * d_scale;
+    return fp32x8;
+}
+
+__attribute__((qcom_reqd_sub_group_size("half")))
+__kernel void kernel_gemv_moe_q6_k_f32_ns(
+    __global uint *         src0_ql,
+    __global uint *         src0_qh,
+    __global char *         src0_s,
+    __global half *         src0_d,
+    __read_only image1d_buffer_t src1,
+    __global uint *         src2,
+    __global float *        dst,
+    ulong                   offsetd,
+    int                     ne00,
+    int                     ne01,
+    int                     ne11
+) {
+    uint i01  = get_global_id(0);
+    uint i20  = get_global_id(2);
+    uint sgid = get_local_id(1);
+    uint slid = get_sub_group_local_id();
+
+    if (i01 >= ne01) {
+        return;
+    }
+
+    uint i11 = i20 % ne11;
+
+    uint expert_id = src2[i20];
+
+    int num_superblocks = ne00 / QK_K;
+    int num_subblocks = ne00 / 32;  // 8 sub-blocks of 32 per super-block
+    int scales_per_row = num_superblocks * 16;
+
+    // Expert offsets in the transposed noshuffle layout
+    uint expert_ql_offset = expert_id * (ne00 / 8) * ne01;   // 32 uints per super-block
+    uint expert_qh_offset = expert_id * (ne00 / 16) * ne01;  // 16 uints per super-block
+    uint expert_d_offset  = expert_id * num_superblocks * ne01;
+
+    __private float sum = 0.0f;
+
+    // Loop over sub-blocks of 32 elements, N_SIMDGROUP sub-blocks per iter
+    for (uint ib = sgid; ib < num_subblocks; ib += N_SIMDGROUP) {
+        uint sb = ib / 8;   // super-block index
+        uint j  = ib % 8;   // 32-element group within super-block
+
+        // Load d for this super-block
+        half d_val = src0_d[expert_d_offset + sb * ne01 + i01];
+
+        // Load 2 sub-block scales
+        global const char * sc = src0_s + (expert_id * ne01 + i01) * scales_per_row + sb * 16;
+        float scale0 = (float)d_val * (float)sc[j * 2];
+        float scale1 = (float)d_val * (float)sc[j * 2 + 1];
+
+        // Load 4 uints of ql
+        uint ql_base = expert_ql_offset + (ib * 4) * ne01 + i01;
+        uint4 regQL;
+        regQL.s0 = src0_ql[ql_base];
+        regQL.s1 = src0_ql[ql_base + ne01];
+        regQL.s2 = src0_ql[ql_base + ne01 * 2];
+        regQL.s3 = src0_ql[ql_base + ne01 * 3];
+
+        // Load 2 uints of qh
+        uint qh_base = expert_qh_offset + (ib * 2) * ne01 + i01;
+        uint2 regQH;
+        regQH.s0 = src0_qh[qh_base];
+        regQH.s1 = src0_qh[qh_base + ne01];
+
+        // Load activations: 32 floats = 8 float4s
+        uint y_offset = i11 * ne00 / 4 + ib * 8;
+
+        float8 fp32x8 = q6_k_to_fp32_packed8(as_ushort2(regQL.s0), (ushort)(regQH.s0 & 0xFFFF), scale0);
+
+        float4 shared_y4;
+        shared_y4 = read_imagef(src1, (y_offset + 0));
+        float4 acc = shared_y4 * fp32x8.lo;
+
+        shared_y4 = read_imagef(src1, (y_offset + 1));
+        acc += shared_y4 * fp32x8.hi;
+
+        fp32x8 = q6_k_to_fp32_packed8(as_ushort2(regQL.s1), (ushort)(regQH.s0 >> 16), scale0);
+
+        shared_y4 = read_imagef(src1, (y_offset + 2));
+        acc += shared_y4 * fp32x8.lo;
+
+        shared_y4 = read_imagef(src1, (y_offset + 3));
+        acc += shared_y4 * fp32x8.hi;
+
+        fp32x8 = q6_k_to_fp32_packed8(as_ushort2(regQL.s2), (ushort)(regQH.s1 & 0xFFFF), scale1);
+
+        shared_y4 = read_imagef(src1, (y_offset + 4));
+        acc += shared_y4 * fp32x8.lo;
+
+        shared_y4 = read_imagef(src1, (y_offset + 5));
+        acc += shared_y4 * fp32x8.hi;
+
+        fp32x8 = q6_k_to_fp32_packed8(as_ushort2(regQL.s3), (ushort)(regQH.s1 >> 16), scale1);
+
+        shared_y4 = read_imagef(src1, (y_offset + 6));
+        acc += shared_y4 * fp32x8.lo;
+
+        shared_y4 = read_imagef(src1, (y_offset + 7));
+        acc += shared_y4 * fp32x8.hi;
+
+        sum += ((acc.s0 + acc.s1) + (acc.s2 + acc.s3));
+    }
+
+    // reduction in local memory, assumes #subgroups=4
+    __local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)];
+    if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum;
+    if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum;
+    if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
+    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
+    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
+
+    // 1 output per thread in subgroup 0
+    if (sgid == 0) {
+        dst = dst + (offsetd >> 2);
+        dst[i01 + i20 * ne01] = sum;
+    }
+}
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 15443aa554a..15d231f70c0 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -13,6 +13,10 @@
 #include <stdlib.h> // for qsort
 #include <stdio.h>  // for GGML_ASSERT
 
+#ifdef GGML_USE_OPENMP
+#include <omp.h>
+#endif
+
 #define GROUP_MAX_EPS 1e-15f
 #define GROUP_MAX_EPS_IQ3_XXS 1e-8f
 #define GROUP_MAX_EPS_IQ2_S 1e-8f
@@ -3064,70 +3068,121 @@ void iq2xs_init_impl(enum ggml_type type) {
         }
         kmap_q2xs[index] = i;
     }
-    int8_t pos[8];
-    int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
+    // The neighbour search runs in three passes:
+    //   1. Parallel: for each i, qsort and count its neighbours into n_per_i,
+    //      and reduce the totals (num_neighbors, num_not_in_map).
+    //   2. Serial: prefix-sum n_per_i into offsets[], so each i has a
+    //      pre-assigned slice of kneighbors_q2xs to write into.
+    //   3. Parallel: redo the qsort and write each i's neighbour list at
+    //      offsets[i].
+    int * n_per_i = (int *)malloc(kmap_size*sizeof(int));
+    GGML_ASSERT(n_per_i);
     int num_neighbors = 0, num_not_in_map = 0;
-    for (int i = 0; i < kmap_size; ++i) {
-        if (kmap_q2xs[i] >= 0) continue;
-        ++num_not_in_map;
-        for (int k = 0; k < 8; ++k) {
-            int l = (i >> 2*k) & 0x3;
-            pos[k] = 2*l + 1;
-        }
-        for (int j = 0; j < grid_size; ++j) {
-            const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
-            int d2 = 0;
-            for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
-            dist2[2*j+0] = d2;
-            dist2[2*j+1] = j;
-        }
-        qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
-        int n = 0; int d2 = dist2[0];
-        int nhave = 1;
-        for (int j = 0; j < grid_size; ++j) {
-            if (dist2[2*j] > d2) {
-                if (nhave == nwant) break;
-                d2 = dist2[2*j];
-                ++nhave;
-            }
-            ++n;
-        }
-        num_neighbors += n;
+#ifdef GGML_USE_OPENMP
+    #pragma omp parallel reduction(+:num_neighbors,num_not_in_map)
+#endif
+    {
+        int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
+        GGML_ASSERT(dist2);
+        int8_t pos[8];
+        int i;
+#ifdef GGML_USE_OPENMP
+        #pragma omp for schedule(dynamic, 64)
+#endif
+        for (i = 0; i < kmap_size; ++i) {
+            if (kmap_q2xs[i] >= 0) {
+                n_per_i[i] = 0;
+                continue;
+            }
+            ++num_not_in_map;
+            for (int k = 0; k < 8; ++k) {
+                int l = (i >> 2*k) & 0x3;
+                pos[k] = 2*l + 1;
+            }
+            for (int j = 0; j < grid_size; ++j) {
+                const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
+                int d2 = 0;
+                for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
+                dist2[2*j+0] = d2;
+                dist2[2*j+1] = j;
+            }
+            qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
+            int n = 0; int d2 = dist2[0];
+            int nhave = 1;
+            for (int j = 0; j < grid_size; ++j) {
+                if (dist2[2*j] > d2) {
+                    if (nhave == nwant) break;
+                    d2 = dist2[2*j];
+                    ++nhave;
+                }
+                ++n;
+            }
+            n_per_i[i] = n;
+            num_neighbors += n;
+        }
+        free(dist2);
     }
     //printf("%s: %d neighbours in total\n", __func__, num_neighbors);
     kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
     iq2_data[gindex].neighbours = kneighbors_q2xs;
+
+    int * offsets = (int *)malloc(kmap_size*sizeof(int));
+    GGML_ASSERT(offsets);
     int counter = 0;
     for (int i = 0; i < kmap_size; ++i) {
-        if (kmap_q2xs[i] >= 0) continue;
-        for (int k = 0; k < 8; ++k) {
-            int l = (i >> 2*k) & 0x3;
-            pos[k] = 2*l + 1;
-        }
-        for (int j = 0; j < grid_size; ++j) {
-            const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
-            int d2 = 0;
-            for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
-            dist2[2*j+0] = d2;
-            dist2[2*j+1] = j;
-        }
-        qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
-        kmap_q2xs[i] = -(counter + 1);
-        int d2 = dist2[0];
-        uint16_t * start = &kneighbors_q2xs[counter++];
-        int n = 0, nhave = 1;
-        for (int j = 0; j < grid_size; ++j) {
-            if (dist2[2*j] > d2) {
-                if (nhave == nwant) break;
-                d2 = dist2[2*j];
-                ++nhave;
-            }
-            kneighbors_q2xs[counter++] = dist2[2*j+1];
-            ++n;
-        }
-        *start = n;
-    }
-    free(dist2);
+        if (kmap_q2xs[i] >= 0) {
+            offsets[i] = -1;
+            continue;
+        }
+        offsets[i] = counter;
+        counter += 1 + n_per_i[i];
+    }
+
+#ifdef GGML_USE_OPENMP
+    #pragma omp parallel
+#endif
+    {
+        int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
+        GGML_ASSERT(dist2);
+        int8_t pos[8];
+        int i;
+#ifdef GGML_USE_OPENMP
+        #pragma omp for schedule(dynamic, 64)
+#endif
+        for (i = 0; i < kmap_size; ++i) {
+            if (kmap_q2xs[i] >= 0) continue;
+            for (int k = 0; k < 8; ++k) {
+                int l = (i >> 2*k) & 0x3;
+                pos[k] = 2*l + 1;
+            }
+            for (int j = 0; j < grid_size; ++j) {
+                const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
+                int d2 = 0;
+                for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
+                dist2[2*j+0] = d2;
+                dist2[2*j+1] = j;
+            }
+            qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
+            int local_counter = offsets[i];
+            kmap_q2xs[i] = -(local_counter + 1);
+            int d2 = dist2[0];
+            uint16_t * start = &kneighbors_q2xs[local_counter++];
+            int n = 0, nhave = 1;
+            for (int j = 0; j < grid_size; ++j) {
+                if (dist2[2*j] > d2) {
+                    if (nhave == nwant) break;
+                    d2 = dist2[2*j];
+                    ++nhave;
+                }
+                kneighbors_q2xs[local_counter++] = dist2[2*j+1];
+                ++n;
+            }
+            *start = n;
+        }
+        free(dist2);
+    }
+    free(offsets);
+    free(n_per_i);
 }
 
 void iq2xs_free_impl(enum ggml_type type) {
@@ -3663,70 +3718,115 @@ void iq3xs_init_impl(int grid_size) {
         }
         kmap_q3xs[index] = i;
     }
-    int8_t pos[4];
-    int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
+    // See explanation of parallelism in iq2xs_init_impl
+    int * n_per_i = (int *)malloc(kmap_size*sizeof(int));
+    GGML_ASSERT(n_per_i);
     int num_neighbors = 0, num_not_in_map = 0;
-    for (int i = 0; i < kmap_size; ++i) {
-        if (kmap_q3xs[i] >= 0) continue;
-        ++num_not_in_map;
-        for (int k = 0; k < 4; ++k) {
-            int l = (i >> 3*k) & 0x7;
-            pos[k] = 2*l + 1;
-        }
-        for (int j = 0; j < grid_size; ++j) {
-            const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
-            int d2 = 0;
-            for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
-            dist2[2*j+0] = d2;
-            dist2[2*j+1] = j;
-        }
-        qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
-        int n = 0; int d2 = dist2[0];
-        int nhave = 1;
-        for (int j = 0; j < grid_size; ++j) {
-            if (dist2[2*j] > d2) {
-                if (nhave == nwant) break;
-                d2 = dist2[2*j];
-                ++nhave;
-            }
-            ++n;
-        }
-        num_neighbors += n;
+#ifdef GGML_USE_OPENMP
+    #pragma omp parallel reduction(+:num_neighbors,num_not_in_map)
+#endif
+    {
+        int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
+        GGML_ASSERT(dist2);
+        int8_t pos[4];
+        int i;
+#ifdef GGML_USE_OPENMP
+        #pragma omp for schedule(dynamic, 64)
+#endif
+        for (i = 0; i < kmap_size; ++i) {
+            if (kmap_q3xs[i] >= 0) {
+                n_per_i[i] = 0;
+                continue;
+            }
+            ++num_not_in_map;
+            for (int k = 0; k < 4; ++k) {
+                int l = (i >> 3*k) & 0x7;
+                pos[k] = 2*l + 1;
+            }
+            for (int j = 0; j < grid_size; ++j) {
+                const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
+                int d2 = 0;
+                for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
+                dist2[2*j+0] = d2;
+                dist2[2*j+1] = j;
+            }
+            qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
+            int n = 0; int d2 = dist2[0];
+            int nhave = 1;
+            for (int j = 0; j < grid_size; ++j) {
+                if (dist2[2*j] > d2) {
+                    if (nhave == nwant) break;
+                    d2 = dist2[2*j];
+                    ++nhave;
+                }
+                ++n;
+            }
+            n_per_i[i] = n;
+            num_neighbors += n;
+        }
+        free(dist2);
     }
     //printf("%s: %d neighbours in total\n", __func__, num_neighbors);
     kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
     iq3_data[gindex].neighbours = kneighbors_q3xs;
+
+    int * offsets = (int *)malloc(kmap_size*sizeof(int));
+    GGML_ASSERT(offsets);
     int counter = 0;
     for (int i = 0; i < kmap_size; ++i) {
-        if (kmap_q3xs[i] >= 0) continue;
-        for (int k = 0; k < 4; ++k) {
-            int l = (i >> 3*k) & 0x7;
-            pos[k] = 2*l + 1;
-        }
-        for (int j = 0; j < grid_size; ++j) {
-            const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
-            int d2 = 0;
-            for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
-            dist2[2*j+0] = d2;
-            dist2[2*j+1] = j;
-        }
-        qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
-        kmap_q3xs[i] = -(counter + 1);
-        int d2 = dist2[0];
-        uint16_t * start = &kneighbors_q3xs[counter++];
-        int n = 0, nhave = 1;
-        for (int j = 0; j < grid_size; ++j) {
-            if (dist2[2*j] > d2) {
-                if (nhave == nwant) break;
-                d2 = dist2[2*j];
-                ++nhave;
-            }
-            kneighbors_q3xs[counter++] = dist2[2*j+1];
-            ++n;
-        }
-        *start = n;
-    }
-    free(dist2);
+        if (kmap_q3xs[i] >= 0) {
+            offsets[i] = -1;
+            continue;
+        }
+        offsets[i] = counter;
+        counter += 1 + n_per_i[i];
+    }
+
+#ifdef GGML_USE_OPENMP
+    #pragma omp parallel
+#endif
+    {
+        int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
+        GGML_ASSERT(dist2);
+        int8_t pos[4];
+        int i;
+#ifdef GGML_USE_OPENMP
+        #pragma omp for schedule(dynamic, 64)
+#endif
+        for (i = 0; i < kmap_size; ++i) {
+            if (kmap_q3xs[i] >= 0) continue;
+            for (int k = 0; k < 4; ++k) {
+                int l = (i >> 3*k) & 0x7;
+                pos[k] = 2*l + 1;
+            }
+            for (int j = 0; j < grid_size; ++j) {
+                const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
+                int d2 = 0;
+                for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
+                dist2[2*j+0] = d2;
+                dist2[2*j+1] = j;
+            }
+            qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
+            int local_counter = offsets[i];
+            kmap_q3xs[i] = -(local_counter + 1);
+            int d2 = dist2[0];
+            uint16_t * start = &kneighbors_q3xs[local_counter++];
+            int n = 0, nhave = 1;
+            for (int j = 0; j < grid_size; ++j) {
+                if (dist2[2*j] > d2) {
+                    if (nhave == nwant) break;
+                    d2 = dist2[2*j];
+                    ++nhave;
+                }
+                kneighbors_q3xs[local_counter++] = dist2[2*j+1];
+                ++n;
+            }
+            *start = n;
+        }
+        free(dist2);
+    }
+    free(offsets);
+    free(n_per_i);
 }
 
 void iq3xs_free_impl(int grid_size) {
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 96bc1c98bd9..31e26ff48e4 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -224,6 +224,7 @@ struct sycl_device_info {
     int max_wg_per_cu; // max work groups per compute unit - refer to
                        // cudaOccupancyMaxActiveBlocksPerMultiprocessor
     bool    vmm;                // virtual memory support
+    size_t  vmm_granularity;    // granularity of virtual memory
     size_t  total_vram;
     sycl_hw_info hw_info;
     optimize_feature opt_feature;
@@ -238,10 +239,14 @@ struct ggml_sycl_device_info {
     std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
 
     int max_work_group_sizes[GGML_SYCL_MAX_DEVICES] = {0};
+
+    bool ext_oneapi_level_zero = true; // sycl::backend::ext_oneapi_level_zero used by all enumerated GPU devices
 };
 
 const ggml_sycl_device_info & ggml_sycl_info();
 
+static constexpr size_t SYCL_BUFFER_ALIGNMENT = 128;
+
 struct ggml_sycl_pool {
     virtual ~ggml_sycl_pool() = default;
 
diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp
index 5577bf73b28..4ae431a962e 100644
--- a/ggml/src/ggml-sycl/dmmv.cpp
+++ b/ggml/src/ggml-sycl/dmmv.cpp
@@ -3,6 +3,13 @@
 #include "dequantize.hpp"
 #include "presets.hpp"
 
+#if defined(__INTEL_LLVM_COMPILER)
+    #if __has_include(<sycl/ext/oneapi/bfloat16.hpp>)
+        #include <sycl/ext/oneapi/bfloat16.hpp>
+        #define GGML_SYCL_DMMV_HAS_BF16
+    #endif
+#endif
+
 static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
     const sycl::half *x = (const sycl::half *)vx;
 
@@ -11,6 +18,16 @@ static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat
     v.y() = x[ib + iqs + 1];
 }
 
+#ifdef GGML_SYCL_DMMV_HAS_BF16
+static void convert_bf16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+    const sycl::ext::oneapi::bfloat16 *x = (const sycl::ext::oneapi::bfloat16 *)vx;
+
+    // automatic bfloat16 -> float type cast if dfloat == float
+    v.x() = x[ib + iqs + 0];
+    v.y() = x[ib + iqs + 1];
+}
+#endif
+
 static void convert_f32(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
     const float * x = (const float *) vx;
 
@@ -217,6 +234,28 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
     }
 }
 
+#ifdef GGML_SYCL_DMMV_HAS_BF16
+static void convert_mul_mat_vec_bf16_sycl(const void *vx, const dfloat *y,
+                                          float *dst, const int ncols,
+                                          const int nrows,
+                                          dpct::queue_ptr stream) {
+    // The qk=1 kernel iterates with stride 2*GGML_SYCL_DMMV_X, so ncols must be a
+    // multiple of that — not just GGML_SYCL_DMMV_X — to avoid out-of-bounds reads.
+    GGML_ASSERT(ncols % (2*GGML_SYCL_DMMV_X) == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    {
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                dequantize_mul_mat_vec<1, 1, convert_bf16>(vx, y, dst, ncols,
+                                                           nrows, item_ct1);
+            });
+    }
+}
+#endif
+
 /*
 DPCT1110:4: The total declared local variable size in device function
 dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
@@ -1497,7 +1536,8 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
     bool src1_convert_f16 =
         src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
         src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
-        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
+        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16 ||
+        src0->type == GGML_TYPE_BF16;
 
     if (src1_convert_f16) {
         scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
@@ -1565,6 +1605,11 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
         case GGML_TYPE_F16:
             convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
             break;
+#ifdef GGML_SYCL_DMMV_HAS_BF16
+        case GGML_TYPE_BF16:
+            convert_mul_mat_vec_bf16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+#endif
         default:
             printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
             GGML_ABORT("fatal error");
diff --git a/ggml/src/ggml-sycl/gated_delta_net.cpp b/ggml/src/ggml-sycl/gated_delta_net.cpp
index ebc587524bf..9c2449aba0c 100644
--- a/ggml/src/ggml-sycl/gated_delta_net.cpp
+++ b/ggml/src/ggml-sycl/gated_delta_net.cpp
@@ -6,7 +6,7 @@
 #include <cmath>
 
 
-template <int S_v, bool KDA>
+template <int S_v, bool KDA, bool keep_rs_t>
 void gated_delta_net_sycl(const float *     q,
                           const float *     k,
                           const float *     v,
@@ -28,7 +28,8 @@ void gated_delta_net_sycl(const float *     q,
                           int64_t           sb3,
                           const sycl::uint3 neqk1_magic,
                           const sycl::uint3 rq3_magic,
-                          float             scale) {
+                          float             scale,
+                          int               K) {
     auto           item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
     const uint32_t h_idx    = item_ct1.get_group(2);
     const uint32_t sequence = item_ct1.get_group(1);
@@ -43,9 +44,13 @@ void gated_delta_net_sycl(const float *     q,
     float *       attn_data        = dst;
     float *       state            = dst + attn_score_elems;
 
-    const int64_t state_offset = (sequence * H + h_idx) * S_v * S_v;
-    state += state_offset;
-    curr_state += state_offset;
+    // input state layout (D, K, n_seqs) — seq stride is K * D = K * H * S_v * S_v.
+    // output state layout (per-slot D * n_seqs) — same per-(seq,head) offset as before.
+    const int64_t state_in_offset      = sequence * K * H * S_v * S_v + h_idx * S_v * S_v;
+    const int64_t state_out_offset     = (sequence * H + h_idx) * S_v * S_v;
+    const int64_t state_size_per_token = S_v * S_v * H * n_seqs; // per-slot stride in output
+    state += state_out_offset;
+    curr_state += state_in_offset + col * S_v;
     attn_data += (sequence * n_tokens * H + h_idx) * S_v;
 
     constexpr int warp_size = ggml_sycl_get_physical_warp_size() < S_v ? ggml_sycl_get_physical_warp_size() : S_v;
@@ -55,9 +60,13 @@ void gated_delta_net_sycl(const float *     q,
 #pragma unroll
     for (int r = 0; r < rows_per_lane; r++) {
         const int i = r * warp_size + lane;
-        s_shard[r]  = curr_state[col * S_v + i];
+        s_shard[r]  = curr_state[i];
     }
 
+    // slot mapping: target_slot = t - shift. When n_tokens < K only the last n_tokens slots
+    // are written; earlier slots are left untouched (caller-owned).
+    const int shift = (int) n_tokens - K;
+
     for (int t = 0; t < n_tokens; t++) {
         const float * q_t = q + iq3 * sq3 + t * sq2 + iq1 * sq1;
         const float * k_t = k + iq3 * sq3 + t * sq2 + iq1 * sq1;
@@ -131,17 +140,32 @@ void gated_delta_net_sycl(const float *     q,
         }
 
         attn_data += S_v * H;
-    }
+
 
     // Write state back to global memory
+        if constexpr (keep_rs_t) {
+            const int target_slot = t - shift;
+            if (target_slot >= 0 && target_slot < K) {
+                float * curr_state = (dst + attn_score_elems) + target_slot * state_size_per_token + state_out_offset;
 #pragma unroll
-    for (int r = 0; r < rows_per_lane; r++) {
-        const int i          = r * warp_size + lane;
-        state[col * S_v + i] = s_shard[r];
+                for (int r = 0; r < rows_per_lane; r++) {
+                    const int i = r * warp_size + lane;
+                    curr_state[col * S_v + i] = s_shard[r];
+                }
+            }
+        }
+    }
+
+    if constexpr (!keep_rs_t) {
+#pragma unroll
+        for (int r = 0; r < rows_per_lane; r++) {
+            const int i          = r * warp_size + lane;
+            state[col * S_v + i] = s_shard[r];
+        }
     }
 }
 
-template <bool KDA>
+template <bool KDA, bool keep_rs_t>
 static void launch_gated_delta_net(const float *   q_d,
                                    const float *   k_d,
                                    const float *   v_d,
@@ -165,6 +189,7 @@ static void launch_gated_delta_net(const float *   q_d,
                                    int64_t         neqk1,
                                    int64_t         rq3,
                                    float           scale,
+                                   int             K,
                                    dpct::queue_ptr stream) {
     //TODO: Add chunked kernel for even faster pre-fill
     const int warp_size = ggml_sycl_info().devices[ggml_sycl_get_device()].warp_size;
@@ -182,9 +207,9 @@ static void launch_gated_delta_net(const float *   q_d,
                 constexpr int sv = 16;
                 stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims),
                                      [=](sycl::nd_item<3> /*item_ct1*/) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                         gated_delta_net_sycl<sv, KDA>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens,
+                                         gated_delta_net_sycl<sv, KDA, keep_rs_t>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens,
                                                                        n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2,
-                                                                       sb3, neqk1_magic, rq3_magic, scale);
+                                                                       sb3, neqk1_magic, rq3_magic, scale, K);
                                      });
             }
             break;
@@ -193,9 +218,9 @@ static void launch_gated_delta_net(const float *   q_d,
                 constexpr int sv = 32;
                 stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims),
                                      [=](sycl::nd_item<3> /*item_ct1*/) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                         gated_delta_net_sycl<sv, KDA>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens,
+                                         gated_delta_net_sycl<sv, KDA, keep_rs_t>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens,
                                                                        n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2,
-                                                                       sb3, neqk1_magic, rq3_magic, scale);
+                                                                       sb3, neqk1_magic, rq3_magic, scale, K);
                                      });
             }
             break;
@@ -204,9 +229,9 @@ static void launch_gated_delta_net(const float *   q_d,
                 constexpr int sv = 64;
                 stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims),
                                         [=](sycl::nd_item<3> /*item_ct1*/) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                            gated_delta_net_sycl<sv, KDA>(
+                                            gated_delta_net_sycl<sv, KDA, keep_rs_t>(
                                                 q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2,
-                                                sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
+                                                sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
                                         });
             }
             break;
@@ -216,9 +241,9 @@ static void launch_gated_delta_net(const float *   q_d,
                 constexpr int sv = 128;
                 stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims),
                                         [=](sycl::nd_item<3> /*item_ct1*/) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                            gated_delta_net_sycl<sv, KDA>(
+                                            gated_delta_net_sycl<sv, KDA, keep_rs_t>(
                                                 q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2,
-                                                sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
+                                                sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
                                         });
             }
             break;
@@ -290,14 +315,30 @@ void ggml_sycl_op_gated_delta_net(ggml_backend_sycl_context & ctx, ggml_tensor *
 
     dpct::queue_ptr stream = ctx.stream();
 
+    // state is 3D (S_v*S_v*H, K, n_seqs); K is the snapshot slot count.
+    const int K = (int) src_state->ne[1];
+    const bool keep_rs = K > 1;
+
     if (kda) {
-        launch_gated_delta_net<true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
-            S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
-            sb1, sb2, sb3, neqk1, rq3, scale, stream);
+        if (keep_rs) {
+            launch_gated_delta_net<true, true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
+                S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
+                sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
+        } else {
+            launch_gated_delta_net<true, false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
+                S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
+                sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
+        }
     } else {
-        launch_gated_delta_net<false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
-            S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
-            sb1, sb2, sb3, neqk1, rq3, scale, stream);
+        if (keep_rs) {
+            launch_gated_delta_net<false, true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
+                S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
+                sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
+        } else {
+            launch_gated_delta_net<false, false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
+                S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
+                sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
+        }
     }
 }
 
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 2ea47f7153a..729a88b4db8 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -19,6 +19,7 @@
 #include <cstdlib>
 #include <float.h>
 #include <limits>
+#include <optional>
 #include <stdint.h>
 #include <stdio.h>
 #include <vector>
@@ -37,6 +38,11 @@
 #if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
 #    include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
 #endif
+#if SYCL_EXT_ONEAPI_VIRTUAL_MEM
+#    include <sycl/ext/oneapi/virtual_mem/physical_mem.hpp>
+#    include <sycl/ext/oneapi/virtual_mem/virtual_mem.hpp>
+#    define GGML_SYCL_USE_VMM
+#endif
 #include <sycl/half_type.hpp>
 
 #include "ggml.h"
@@ -70,6 +76,7 @@ int g_ggml_sycl_debug = 0;
 int g_ggml_sycl_disable_optimize = 0;
 int g_ggml_sycl_disable_graph = 0;
 int g_ggml_sycl_disable_dnn = 0;
+int g_ggml_sycl_enable_vmm = 1;
 int g_ggml_sycl_prioritize_dmmv = 0;
 int g_ggml_sycl_use_async_mem_op = 0;
 int g_ggml_sycl_use_async_mem_op_requested = 1;
@@ -96,13 +103,30 @@ static ggml_sycl_device_info ggml_sycl_init() {
 //     GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
 // #endif
     for (int i = 0; i < info.device_count; ++i) {
-        info.devices[i].vmm = 0;
         dpct::device_info prop;
-        sycl::device device = dpct::dev_mgr::instance().get_device(i);
+        auto & device = dpct::dev_mgr::instance().get_device(i);
 
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
             prop, device)));
 
+#if !defined(GGML_SYCL_USE_VMM)
+        info.devices[i].vmm = 0;
+#else
+        info.devices[i].vmm = device.has(sycl::aspect::ext_oneapi_virtual_mem);
+        if (info.devices[i].vmm) {
+            // NB: SYCL's get_mem_granularity always returns the _minimum_ granularity,
+            // but the L0 API requires a larger page size for allocs above 2 MiB and
+            // rejects non-multiples with UR_RESULT_ERROR_INVALID_VALUE [sic].
+            // Here we clamp it to 2 MiB for simplicity, but other devices may require
+            // calling zeVirtualMemQueryPageSize or yet unexposed public API.
+            const size_t physical_page = 2ull << 20; // 2 MiB
+            info.devices[i].vmm_granularity = std::max<size_t>(
+                sycl::ext::oneapi::experimental::get_mem_granularity(
+                    device, sycl::context(device)),
+                physical_page);
+        }
+#endif
+
         info.default_tensor_split[i] = total_vram;
         total_vram += prop.get_global_mem_size();
 
@@ -117,6 +141,12 @@ static ggml_sycl_device_info ggml_sycl_init() {
         info.devices[i].max_wg_per_cu = info.max_work_group_sizes[i] / prop.get_max_compute_units();
         info.devices[i].hw_info = get_device_hw_info(&device);
 
+        // Only check GPU devices; CPU devices use OpenCL and would otherwise
+        // disable Level Zero for the GPUs on systems without ONEAPI_DEVICE_SELECTOR set.
+        if (device.is_gpu() && device.default_queue().get_backend() != sycl::backend::ext_oneapi_level_zero) {
+            GGML_LOG_WARN("SYCL GPU device %d does not use Level Zero backend, disabling Level Zero memory API\n", i);
+            info.ext_oneapi_level_zero = false;
+        }
     }
 
     for (int id = 0; id < info.device_count; ++id) {
@@ -228,28 +258,13 @@ static void ggml_check_sycl() try {
         g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
         g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
         g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
+        g_ggml_sycl_enable_vmm = get_sycl_env("GGML_SYCL_ENABLE_VMM", 1);
         g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
 #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
-        g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1);
+        g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", ggml_sycl_info().ext_oneapi_level_zero);
 #else
         g_ggml_sycl_enable_level_zero = 0;
 #endif
-        if (g_ggml_sycl_enable_level_zero) {
-            // Verify all GPU devices use the Level Zero backend before enabling L0 APIs.
-            // Only check GPU devices; CPU devices use OpenCL and would otherwise
-            // disable Level Zero for the GPUs on systems without ONEAPI_DEVICE_SELECTOR set.
-            for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); i++) {
-                auto & q = dpct::dev_mgr::instance().get_device(i).default_queue();
-                if (!q.get_device().is_gpu()) {
-                    continue;
-                }
-                if (q.get_backend() != sycl::backend::ext_oneapi_level_zero) {
-                    GGML_LOG_WARN("SYCL GPU device %d does not use Level Zero backend, disabling Level Zero memory API\n", i);
-                    g_ggml_sycl_enable_level_zero = 0;
-                    break;
-                }
-            }
-        }
 
 #ifdef SYCL_FLASH_ATTN
         g_ggml_sycl_enable_flash_attention = get_sycl_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1);
@@ -285,6 +300,11 @@ static void ggml_check_sycl() try {
 #else
         GGML_LOG_INFO("  GGML_SYCL_SUPPORT_LEVEL_ZERO: no\n");
 #endif
+#if defined(GGML_SYCL_USE_VMM)
+        GGML_LOG_INFO("  GGML_SYCL_USE_VMM: yes\n");
+#else
+        GGML_LOG_INFO("  GGML_SYCL_USE_VMM: no\n");
+#endif
 
         GGML_LOG_INFO("Running with Environment Variables:\n");
         GGML_LOG_INFO("  GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
@@ -303,6 +323,11 @@ static void ggml_check_sycl() try {
         GGML_LOG_INFO("  GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn);
 #else
         GGML_LOG_INFO("  GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n");
+#endif
+#if defined(GGML_SYCL_USE_VMM)
+        GGML_LOG_INFO("  GGML_SYCL_ENABLE_VMM: %d\n", g_ggml_sycl_enable_vmm);
+#else
+        GGML_LOG_INFO("  GGML_SYCL_ENABLE_VMM: virtual memory extension is not available\n");
 #endif
         GGML_LOG_INFO("  GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
         g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1);
@@ -764,7 +789,7 @@ catch (sycl::exception const &exc) {
 }
 
 static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return 128;
+    return SYCL_BUFFER_ALIGNMENT;
     GGML_UNUSED(buft);
 }
 
@@ -1187,7 +1212,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(gg
 }
 
 static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return 128;
+    return SYCL_BUFFER_ALIGNMENT;
     GGML_UNUSED(buft);
 }
 
@@ -1472,6 +1497,121 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
     }
 };
 
+// pool with virtual memory management
+#if defined(GGML_SYCL_USE_VMM)
+struct ggml_sycl_pool_vmm : public ggml_sycl_pool {
+    static const size_t SYCL_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
+
+    int           device;
+    sycl::context ctx;
+    sycl::device  dev;
+
+    uintptr_t pool_addr = 0;
+    size_t    pool_used = 0;
+    size_t    pool_size = 0;
+    size_t    granularity;
+
+    // physical_mem owns the commits (unlike cuMemMap)
+    struct mapping {
+        sycl::ext::oneapi::experimental::physical_mem phys;
+        void * map_ptr;
+    };
+    std::vector<mapping> mappings;
+
+    explicit ggml_sycl_pool_vmm(queue_ptr qptr_, int device_) :
+        device(device_),
+        ctx(qptr_->get_context()),
+        dev(qptr_->get_device()),
+        granularity(ggml_sycl_info().devices[device_].vmm_granularity) {
+    }
+
+    ~ggml_sycl_pool_vmm() {
+        if (pool_addr == 0) {
+            return;
+        }
+
+        // Per spec, unmap must (a) match the exact (ptr, size) of an earlier
+        // physical_mem::map() call and (b) precede destruction of the
+        // physical_mem objects (their dtors won't unmap).
+        for (auto & m : mappings) {
+            SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::unmap(
+                m.map_ptr, m.phys.size(), ctx)));
+        }
+        SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::free_virtual_mem(
+            pool_addr, SYCL_POOL_VMM_MAX_SIZE, ctx)));
+    }
+
+    void * alloc(size_t size, size_t * actual_size) override {
+        // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
+        size = GGML_PAD(size, SYCL_BUFFER_ALIGNMENT);
+
+        size_t avail = pool_size - pool_used;
+
+        if (size > avail) {
+            // round up to the next multiple of the granularity
+            size_t reserve_size = GGML_PAD(size - avail, granularity);
+
+            GGML_ASSERT(pool_size + reserve_size <= SYCL_POOL_VMM_MAX_SIZE);
+
+            // allocate more physical memory
+            std::optional<sycl::ext::oneapi::experimental::physical_mem> phys;
+            SYCL_CHECK(CHECK_TRY_ERROR(phys.emplace(dev, ctx, reserve_size)));
+
+            // reserve virtual address space (if not already reserved)
+            if (pool_addr == 0) {
+                SYCL_CHECK(CHECK_TRY_ERROR(
+                    pool_addr = sycl::ext::oneapi::experimental::reserve_virtual_mem(
+                        SYCL_POOL_VMM_MAX_SIZE, ctx)));
+            }
+
+            // map at the end of the pool
+            void * map_ptr = nullptr;
+            SYCL_CHECK(CHECK_TRY_ERROR(
+                map_ptr = phys->map(pool_addr + pool_size, reserve_size,
+                                    sycl::ext::oneapi::experimental::address_access_mode::read_write)));
+
+            // stash these so we could unmap this exact range in dtor
+            mappings.push_back({
+                std::move(*phys),
+                map_ptr,
+            });
+
+            // add to the pool
+            pool_size += reserve_size;
+
+#ifdef DEBUG_SYCL_MALLOC
+            GGML_LOG_INFO("sycl pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
+                          device, (unsigned long long) (pool_size/1024/1024),
+                          (unsigned long long) (reserve_size/1024/1024));
+#endif
+        }
+
+        GGML_ASSERT(pool_addr != 0);
+
+        void * ptr = reinterpret_cast<void *>(pool_addr + pool_used);
+        *actual_size = size;
+        pool_used += size;
+
+#ifdef DEBUG_SYCL_MALLOC
+        GGML_LOG_INFO("sycl pool[%d]: allocated %llu bytes at %p\n", device, (unsigned long long) size, ptr);
+#endif
+
+        return ptr;
+    }
+
+    void free(void * ptr, size_t size) override {
+#ifdef DEBUG_SYCL_MALLOC
+        GGML_LOG_INFO("sycl pool[%d]: freed %llu bytes at %p\n", device, (unsigned long long) size, ptr);
+#endif
+
+        pool_used -= size;
+
+        // all deallocations must be in reverse order of the allocations
+        GGML_ASSERT(ptr == reinterpret_cast<void *>(pool_addr + pool_used));
+    }
+};
+#endif // defined(GGML_SYCL_USE_VMM)
+
 struct ggml_sycl_pool_host : public ggml_sycl_pool {
     queue_ptr qptr;
     int       device;
@@ -1552,20 +1692,19 @@ std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_host(que
 }
 
 std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) {
-    // TBD: NO VMM support
-    // if (ggml_sycl_info().devices[device].vmm) {
-    //     return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(device));
-    // }
-   return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
+#if defined(GGML_SYCL_USE_VMM)
+    if (g_ggml_sycl_enable_vmm && ggml_sycl_info().devices[device].vmm) {
+        return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(qptr, device));
+    }
+#endif // defined(GGML_SYCL_USE_VMM)
+    return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
 }
 
+
 std::unique_ptr<ggml_sycl_fattn_kv_buffers> ggml_backend_sycl_context::new_fattn_kv_buffers(queue_ptr qptr, int device) {
     return std::unique_ptr<ggml_sycl_fattn_kv_buffers>(new ggml_sycl_fattn_kv_buffers(qptr, device));
 }
 
-// TBD pool with virtual memory management
-// struct ggml_sycl_pool_vmm : public ggml_sycl_pool
-
 /// kernels
 typedef void (*ggml_sycl_op_mul_mat_t)(
     ggml_backend_sycl_context & ctx,
@@ -3455,6 +3594,7 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_F16:
+        case GGML_TYPE_BF16:
             return true;
         default:
             return false;
@@ -3818,8 +3958,13 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor *
 
 
 static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    // The F16/BF16 qk=1 kernel iterates with stride 2*DMMV_X, requiring ne[0] to be
+    // a multiple of 2*DMMV_X. Quantized types use block-structured kernels that only
+    // need ne[0] % DMMV_X == 0.
+    const int64_t dmmv_x_required = (src0->type == GGML_TYPE_BF16 || src0->type == GGML_TYPE_F16) ?
+                                    2*GGML_SYCL_DMMV_X : GGML_SYCL_DMMV_X;
     return ggml_sycl_supports_dmmv(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
-           src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
+           src0->ne[0] % dmmv_x_required == 0 && src1->ne[1] == 1;
 }
 
 static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3923,35 +4068,17 @@ struct mmid_row_mapping {
 
 __dpct_inline__ static void k_copy_src1_to_contiguous(
     const char *__restrict__ src1_original, char *__restrict__ src1_contiguous,
-    int *__restrict__ cur_src1_row, mmid_row_mapping *__restrict__ row_mapping,
-    const char *__restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
+    const mmid_row_mapping *__restrict__ row_mapping,
     int64_t ne11, int64_t ne10, size_t nb11, size_t nb12,
-    const sycl::nd_item<3> &item_ct1, int &src1_row) {
-    int32_t iid1 = item_ct1.get_group(2);
-    int32_t id = item_ct1.get_group(1);
-
-    const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
+    const sycl::nd_item<3> &item_ct1) {
+    const int32_t src1_row = item_ct1.get_group(2);
 
-    if (row_id_i != i02) {
-        return;
-    }
+    const int32_t iid1 = row_mapping[src1_row].i2;
+    const int32_t id   = row_mapping[src1_row].i1;
 
     const int64_t i11 = id % ne11;
     const int64_t i12 = iid1;
 
-    if (item_ct1.get_local_id(2) == 0) {
-        src1_row =
-            dpct::atomic_fetch_add<sycl::access::address_space::generic_space>(
-                cur_src1_row, 1);
-        row_mapping[src1_row] = {id, iid1};
-    }
-    /*
-    DPCT1065:194: Consider replacing sycl::nd_item::barrier() with
-    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
-    performance if there is no access to global memory.
-    */
-    item_ct1.barrier();
-
     const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
     float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
 
@@ -4026,6 +4153,47 @@ static bool ggml_sycl_mul_mat_id_mmvq_fused(
         src1_row_stride, stream);
 }
 
+// counting sort of the routed rows by expert id (row_id_i, as chosen by the router):
+// builds a projection of a memory layout where each expert's slice is contiguous
+static void mmid_counting_sort_rows(
+        const ggml_tensor * ids, const char * ids_host,
+        int64_t n_ids, int64_t n_as, int64_t n_routed_rows,
+        std::vector<int64_t> & expert_counts,
+        std::vector<int64_t> & expert_row_offsets,
+        std::vector<mmid_row_mapping> & routed_row_src) {
+
+    // frequencies: how many routed rows each expert "owns"
+    expert_counts.assign(n_as, 0);
+    for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+        for (int64_t id = 0; id < n_ids; id++) {
+            const int32_t row_id_i = *(const int32_t *) (ids_host + iid1*ids->nb[1] + id*ids->nb[0]);
+            GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
+            expert_counts[row_id_i]++;
+        }
+    }
+
+    // where each expert's slice starts (row indices) and the previous ends
+    expert_row_offsets.assign(n_as + 1, 0);
+    for (int64_t i02 = 0; i02 < n_as; i02++) {
+        expert_row_offsets[i02 + 1] = expert_row_offsets[i02] + expert_counts[i02];
+    }
+
+    std::vector<int64_t> expert_row_next = expert_row_offsets;
+    routed_row_src.resize(n_routed_rows);
+    for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+        for (int64_t id = 0; id < n_ids; id++) {
+            const int32_t row_id_i = *(const int32_t *) (ids_host + iid1*ids->nb[1] + id*ids->nb[0]);
+            GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
+
+            // find and validate the next free row for a given expert (row_id_i)
+            const int64_t routed_row = expert_row_next[row_id_i]++;
+            GGML_ASSERT(routed_row >= expert_row_offsets[row_id_i]);
+            GGML_ASSERT(routed_row < expert_row_offsets[row_id_i + 1]);
+            routed_row_src[routed_row] = {(int32_t) id, (int32_t) iid1};
+        }
+    }
+}
+
 static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
                                  ggml_tensor *dst) try {
     scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
@@ -4104,99 +4272,91 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
         src1_row.data = src1_contiguous.get();
         dst_row.data  =  dst_contiguous.get();
 
-        for (int64_t i02 = 0; i02 < n_as; i02++) {
-            int64_t num_src1_rows = 0;
-            for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
-                for (int64_t id = 0; id < n_ids; id++) {
-                    const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+        // how many "owned" routed rows to pass to each expert
+        std::vector<int64_t> expert_row_counts;
+        // where each expert's slice starts and the previous ends (row indices, right-exclusive)
+        std::vector<int64_t> expert_row_offsets;
+        // the sources (slot/token pairs) of contiguous rows to guide k_copy_src1_to_contiguous
+        std::vector<mmid_row_mapping> routed_row_src;
 
-                    GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
+        mmid_counting_sort_rows(ids, ids_host.data(), n_ids, n_as, n_routed_rows,
+                                expert_row_counts, expert_row_offsets, routed_row_src);
 
-                    if (row_id_i != i02) {
-                        continue;
-                    }
+        ggml_sycl_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), n_routed_rows);
+        SYCL_CHECK(CHECK_TRY_ERROR(
+                stream->memcpy(dev_row_mapping.get(), routed_row_src.data(), n_routed_rows*sizeof(mmid_row_mapping))));
 
-                    num_src1_rows++;
-                }
-            }
+        const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
+        assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
+
+        {
+            sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
+            sycl::range<3> grid_dims(1, 1, n_routed_rows);
+            stream->submit([&](sycl::handler &cgh) {
+                char *__restrict src1_contiguous_get =
+                    src1_contiguous.get();
+                mmid_row_mapping *__restrict dev_row_mapping_get =
+                    dev_row_mapping.get();
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(grid_dims * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        k_copy_src1_to_contiguous(
+                            src1_original, src1_contiguous_get,
+                            dev_row_mapping_get,
+                            ne11, ne10, nb11, nb12,
+                            item_ct1);
+                    });
+            });
+        }
+
+        for (int64_t i02 = 0; i02 < n_as; i02++) {
+            const int64_t num_src1_rows = expert_row_counts[i02];
 
             if (num_src1_rows == 0) {
                 continue;
             }
 
-
-            ggml_sycl_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
-            ggml_sycl_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
-            SYCL_CHECK(CHECK_TRY_ERROR(
-                stream->memset(dev_cur_src1_row.get(), 0, sizeof(int))));
-
-            const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
-            assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
-
-            {
-                sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
-                sycl::range<3> grid_dims(1, n_ids, ids->ne[1]);
-                stream->submit([&](sycl::handler &cgh) {
-                    sycl::local_accessor<int, 0> src1_row_acc(cgh);
-
-                    char *__restrict src1_contiguous_get =
-                        src1_contiguous.get();
-                    int *__restrict dev_cur_src1_row_get =
-                        dev_cur_src1_row.get();
-                    mmid_row_mapping *__restrict dev_row_mapping_get =
-                        dev_row_mapping.get();
-                    size_t ids_nb_ct6 = ids->nb[1];
-                    size_t ids_nb_ct7 = ids->nb[0];
-
-                    cgh.parallel_for(
-                        sycl::nd_range<3>(grid_dims * block_dims, block_dims),
-                        [=](sycl::nd_item<3> item_ct1) {
-                            k_copy_src1_to_contiguous(
-                                src1_original, src1_contiguous_get,
-                                dev_cur_src1_row_get,
-                                dev_row_mapping_get, ids_dev, i02,
-                                ids_nb_ct6, ids_nb_ct7, ne11, ne10, nb11, nb12,
-                                item_ct1, src1_row_acc);
-                        });
-                });
-            }
+            const int64_t expert_row_offset = expert_row_offsets[i02];
 
             src0_row.data = src0_original + i02*nb02;
 
             GGML_ASSERT(nb11 == sizeof(float)*ne10);
             GGML_ASSERT(nb1 == sizeof(float)*ne0);
+            src1_row.data = src1_contiguous.get() + expert_row_offset*nb11;
             src1_row.ne[1] = num_src1_rows;
 
             src1_row.nb[1] = nb11;
             src1_row.nb[2] = num_src1_rows*nb11;
             src1_row.nb[3] = num_src1_rows*nb11;
 
+            dst_row.data = dst_contiguous.get() + expert_row_offset*nb1;
             dst_row.ne[1] = num_src1_rows;
             dst_row.nb[1] = nb1;
             dst_row.nb[2] = num_src1_rows*nb1;
             dst_row.nb[3] = num_src1_rows*nb1;
 
             ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+        }
 
-            {
-                sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, max_work_group_size));
-                sycl::range<3> grid_dims(1, 1, num_src1_rows);
-                stream->submit([&](sycl::handler &cgh) {
-                    const char *__restrict dst_contiguous_get =
-                        dst_contiguous.get();
-                    const mmid_row_mapping *__restrict dev_row_mapping_get =
-                        dev_row_mapping.get();
-
-                    cgh.parallel_for(
-                        sycl::nd_range<3>(grid_dims * block_dims, block_dims),
-                        [=](sycl::nd_item<3> item_ct1) {
-                            k_copy_dst_from_contiguous(dst_original,
-                                                       dst_contiguous_get,
-                                                       dev_row_mapping_get,
-                                                       ne0, nb1, nb2, item_ct1);
-                        });
-                });
-            }
+        {
+            sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, max_work_group_size));
+            sycl::range<3> grid_dims(1, 1, n_routed_rows);
+            stream->submit([&](sycl::handler &cgh) {
+                const char *__restrict dst_contiguous_get =
+                    dst_contiguous.get();
+                const mmid_row_mapping *__restrict dev_row_mapping_get =
+                    dev_row_mapping.get();
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(grid_dims * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        k_copy_dst_from_contiguous(dst_original,
+                                                   dst_contiguous_get,
+                                                   dev_row_mapping_get,
+                                                   ne0, nb1, nb2, item_ct1);
+                    });
+            });
         }
     }
 }
diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt
index 6dbcea065b3..2d9e85794ad 100644
--- a/ggml/src/ggml-vulkan/CMakeLists.txt
+++ b/ggml/src/ggml-vulkan/CMakeLists.txt
@@ -8,7 +8,10 @@ endif()
 
 find_package(Vulkan COMPONENTS glslc REQUIRED)
 
-find_package(SPIRV-Headers REQUIRED)
+if (DEFINED ENV{VULKAN_SDK})
+    list(APPEND CMAKE_PREFIX_PATH "$ENV{VULKAN_SDK}")
+endif()
+find_package(SPIRV-Headers CONFIG REQUIRED)
 
 if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
     # Parallel build object files
@@ -76,6 +79,12 @@ if (Vulkan_FOUND)
         "GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT"
     )
 
+    test_shader_extension_support(
+        "GL_NV_cooperative_matrix_decode_vector"
+        "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp"
+        "GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT"
+    )
+
     test_shader_extension_support(
         "GL_EXT_integer_dot_product"
         "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/feature-tests/integer_dot.comp"
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index d3fb19048d9..fb07282ef76 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -21,6 +21,19 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
 
 #include <vulkan/vulkan.hpp>
 
+// Fallback definitions for VK_NV_cooperative_matrix_decode_vector in case the
+// installed Vulkan headers predate the extension.
+#ifndef VK_NV_cooperative_matrix_decode_vector
+#define VK_NV_cooperative_matrix_decode_vector 1
+#define VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME "VK_NV_cooperative_matrix_decode_vector"
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_DECODE_VECTOR_FEATURES_NV ((VkStructureType)1000689000)
+typedef struct VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV {
+    VkStructureType    sType;
+    void*              pNext;
+    VkBool32           cooperativeMatrixDecodeVector;
+} VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV;
+#endif
+
 // SPIR-V Headers: different SDK installations expose different include paths.
 // LunarG Vulkan SDK on Windows typically provides <spirv-headers/spirv.hpp>.
 // Linux packages, MSYS2 and MinGW often use the Khronos layout <spirv/unified1/spirv.hpp>.
@@ -398,6 +411,7 @@ enum vk_conv_shapes {
     CONV_SHAPE_128x128,
     CONV_SHAPE_64x32,
     CONV_SHAPE_32x256,
+    CONV_SHAPE_64x128,
     CONV_SHAPE_COUNT,
 };
 
@@ -412,6 +426,7 @@ vk_conv_block_size vk_conv_block_sizes[CONV_SHAPE_COUNT] = {
     { 128, 128, 16 }, // CONV_SHAPE_128x128
     {  64,  32, 32 }, // CONV_SHAPE_64x32
     {  32, 256, 16 }, // CONV_SHAPE_32x256
+    {  64, 128, 16 }, // CONV_SHAPE_64x128
 };
 
 enum dmmv_wg_sizes {
@@ -447,14 +462,16 @@ struct vk_fa_pipeline_state {
 };
 
 struct vk_conv2d_pipeline_state {
-    vk_conv2d_pipeline_state(uint32_t s0, uint32_t s1, uint32_t p0, uint32_t p1, uint32_t d0, uint32_t d1, uint32_t KW, uint32_t KH)
-        : s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), KW(KW), KH(KH) {}
+    vk_conv2d_pipeline_state(uint32_t s0, uint32_t s1, uint32_t p0, uint32_t p1, uint32_t d0, uint32_t d1, uint32_t KW, uint32_t KH, uint32_t aligned)
+        : s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), KW(KW), KH(KH), aligned(aligned) {}
 
     uint32_t s0, s1, p0, p1, d0, d1, KW, KH;
+    // when set, shader can skip K/CRS/NPQ bounds checks and address clamps
+    uint32_t aligned;
 
     bool operator<(const vk_conv2d_pipeline_state &b) const {
-        return std::tie(s0, s1, p0, p1, d0, d1, KW, KH) <
-               std::tie(b.s0, b.s1, b.p0, b.p1, b.d0, b.d1, b.KW, b.KH);
+        return std::tie(s0, s1, p0, p1, d0, d1, KW, KH, aligned) <
+               std::tie(b.s0, b.s1, b.p0, b.p1, b.d0, b.d1, b.KW, b.KH, b.aligned);
     }
 };
 
@@ -499,6 +516,12 @@ static constexpr std::initializer_list<ggml_op> topk_moe_late_softmax      { GGM
                                                                              GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
                                                                              GGML_OP_SOFT_MAX, GGML_OP_RESHAPE };
 
+// Snake activation: y = x + sin(a*x)^2 * inv_b. Used by the optimize_graph reorder
+// pass so it keeps the chain contiguous and by the dispatcher to detect the fusion.
+static constexpr std::initializer_list<ggml_op> snake_pattern              { GGML_OP_MUL,      GGML_OP_SIN,
+                                                                             GGML_OP_SQR,      GGML_OP_MUL,
+                                                                             GGML_OP_ADD };
+
 //node #978 (  SOFT_MAX):     ffn_moe_probs-15 (   0K) [Vulka         ] use=2:    ffn_moe_logits-15 (   0K) [Vulka         ]
 //node #979 (   RESHAPE): ffn_moe_probs-15 (re (   0K) [Vulka         ] use=1:     ffn_moe_probs-15 (   0K) [Vulka         ]
 //node #980 (   ARGSORT):   ffn_moe_argsort-15 (   0K) [Vulka         ] use=1:     ffn_moe_probs-15 (   0K) [Vulka         ]
@@ -668,6 +691,7 @@ struct vk_device_struct {
     uint32_t coopmat_int_k;
 
     bool coopmat2;
+    bool coopmat2_decode_vector;
 
     bool pipeline_executable_properties_support {};
 
@@ -758,7 +782,8 @@ struct vk_device_struct {
     vk_pipeline pipeline_clamp_f32;
     vk_pipeline pipeline_pad_f32;
     vk_pipeline pipeline_roll_f32;
-    vk_pipeline pipeline_repeat_f32, pipeline_repeat_back_f32;
+    vk_pipeline pipeline_repeat_i32, pipeline_repeat_back_f32;
+    vk_pipeline pipeline_repeat_i16;
     vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16, pipeline_cpy_bf16_f32, pipeline_cpy_f32_i32, pipeline_cpy_i32_f32;
     vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16, pipeline_contig_cpy_bf16_f32, pipeline_contig_cpy_f32_i32, pipeline_contig_cpy_i32_f32;
     vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT];
@@ -846,6 +871,9 @@ struct vk_device_struct {
     vk_pipeline pipeline_im2col_3d_f32, pipeline_im2col_3d_f32_f16;
     vk_pipeline pipeline_timestep_embedding_f32;
     vk_pipeline pipeline_conv_transpose_1d_f32;
+    vk_pipeline pipeline_snake_f32;
+    vk_pipeline pipeline_snake_f16;
+    vk_pipeline pipeline_snake_bf16;
     vk_pipeline pipeline_pool2d_f32;
     vk_pipeline pipeline_rwkv_wkv6_f32;
     vk_pipeline pipeline_rwkv_wkv7_f32;
@@ -1475,6 +1503,11 @@ struct vk_op_conv_transpose_1d_push_constants {
     int32_t s0;
 };
 
+struct vk_op_snake_push_constants {
+    uint32_t ne0;
+    uint32_t ne1;
+};
+
 struct vk_op_pool2d_push_constants {
     uint32_t IW; uint32_t IH;
     uint32_t OW; uint32_t OH;
@@ -2148,6 +2181,136 @@ static uint32_t compile_count = 0;
 static std::mutex compile_count_mutex;
 static std::condition_variable compile_count_cond;
 
+static constexpr uint32_t kSpvOpCooperativeMatrixLoadTensorNV = 5367;
+static constexpr uint32_t kSpvCapabilityCooperativeMatrixDecodeVectorNV = 5447;
+static constexpr uint32_t kSpvTensorAddressingDecodeVectorFuncBit = 0x4;
+
+// Remove SPV_NV_cooperative_matrix_decode_vector usage from a SPIR-V module so it
+// can be loaded on drivers that only support SPV_NV_cooperative_matrix2. Drops the
+// OpExtension declaration, the CooperativeMatrixDecodeVectorNV OpCapability, and the
+// DecodeVectorFunc operand from any OpCooperativeMatrixLoadTensorNV instruction.
+// Returns true when the input used the extension (and `out` was populated with a
+// stripped copy); returns false otherwise without touching `out`.
+static bool ggml_vk_strip_decode_vector(const uint32_t * code, size_t word_count, std::vector<uint32_t> & out) {
+    static const char kDecodeVectorExt[] = "SPV_NV_cooperative_matrix_decode_vector";
+
+    if (word_count < 5) {
+        return false;
+    }
+
+    bool uses_decode_vector = false;
+    for (size_t pos = 5; pos < word_count; ) {
+        uint32_t word = code[pos];
+        uint32_t wc   = word >> spv::WordCountShift;
+        uint32_t op   = word & spv::OpCodeMask;
+        GGML_ASSERT(wc > 0 && pos + wc <= word_count);
+        if (op == spv::OpExtension && wc >= 2) {
+            const char * s = reinterpret_cast<const char *>(&code[pos + 1]);
+            if (strcmp(s, kDecodeVectorExt) == 0) {
+                uses_decode_vector = true;
+                break;
+            }
+        }
+        pos += wc;
+    }
+
+    if (!uses_decode_vector) {
+        return false;
+    }
+
+    VK_LOG_DEBUG("ggml_vk_strip_decode_vector: stripping SPV_NV_cooperative_matrix_decode_vector");
+
+    // Bulk-copy unchanged runs and only break the run when an instruction needs to
+    // be dropped or patched. Use reserve + insert/push_back so the destination buffer
+    // is touched exactly once (no zero-initialization pass from resize()).
+    out.clear();
+    out.reserve(word_count);
+
+    size_t run_start = 0;
+    auto flush_run = [&](size_t up_to) {
+        if (up_to > run_start) {
+            out.insert(out.end(), code + run_start, code + up_to);
+        }
+    };
+
+    for (size_t pos = 5; pos < word_count; ) {
+        uint32_t word = code[pos];
+        uint32_t wc   = word >> spv::WordCountShift;
+        uint32_t op   = word & spv::OpCodeMask;
+        GGML_ASSERT(wc > 0 && pos + wc <= word_count);
+
+        if (op == spv::OpExtension && wc >= 2) {
+            const char * s = reinterpret_cast<const char *>(&code[pos + 1]);
+            if (strcmp(s, kDecodeVectorExt) == 0) {
+                flush_run(pos);
+                pos += wc;
+                run_start = pos;
+                continue;
+            }
+        }
+
+        if (op == spv::OpCapability && wc == 2 && code[pos + 1] == kSpvCapabilityCooperativeMatrixDecodeVectorNV) {
+            flush_run(pos);
+            pos += wc;
+            run_start = pos;
+            continue;
+        }
+
+        if (op == kSpvOpCooperativeMatrixLoadTensorNV) {
+            // [opcode/wc][ResultType][Result][Pointer][Object][TensorLayout][MemOperand mask][mem extras...][TA mask][ta extras...]
+            GGML_ASSERT(wc >= 8);
+
+            uint32_t mem_mask = code[pos + 6];
+            size_t   cur      = pos + 7;
+            // Each of these MemoryAccess bits (when set) carries one trailing operand.
+            cur += (mem_mask & 0x2)     ? 1 : 0; // Aligned
+            cur += (mem_mask & 0x8)     ? 1 : 0; // MakePointerAvailable
+            cur += (mem_mask & 0x10)    ? 1 : 0; // MakePointerVisible
+            cur += (mem_mask & 0x10000) ? 1 : 0; // AliasScopeINTELMask
+            cur += (mem_mask & 0x20000) ? 1 : 0; // NoAliasINTELMask
+            GGML_ASSERT(cur < pos + wc);
+
+            uint32_t ta_mask = code[cur];
+            if ((ta_mask & kSpvTensorAddressingDecodeVectorFuncBit) == 0) {
+                pos += wc;
+                continue; // leave instruction inside the current unchanged run
+            }
+
+            flush_run(pos);
+
+            // Append unchanged prefix of the instruction (header through the mem-extras).
+            size_t inst_start = out.size();
+            size_t pre_n      = cur - pos;
+            out.insert(out.end(), code + pos, code + pos + pre_n);
+
+            // Emit TA mask with the DecodeVectorFunc bit cleared.
+            out.push_back(ta_mask & ~kSpvTensorAddressingDecodeVectorFuncBit);
+
+            // TA extras: TensorView (0x1) and DecodeFunc (0x2) are kept verbatim;
+            // DecodeVectorFunc (0x4) is dropped along with its trailing id operand.
+            size_t keep_ta_extras = ((ta_mask & 0x1) ? 1 : 0) + ((ta_mask & 0x2) ? 1 : 0);
+            if (keep_ta_extras) {
+                out.insert(out.end(), code + cur + 1, code + cur + 1 + keep_ta_extras);
+            }
+
+            GGML_ASSERT(wc == pre_n + 1 + keep_ta_extras + 1);
+
+            // Patch the instruction header with the new (one-shorter) word count.
+            uint32_t new_wc = wc - 1;
+            out[inst_start] = (new_wc << spv::WordCountShift) | op;
+
+            pos += wc;
+            run_start = pos;
+            continue;
+        }
+
+        pos += wc;
+    }
+
+    flush_run(word_count);
+    return true;
+}
+
 static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, size_t spv_size, const void* spv_data, const std::string entrypoint,
                                          uint32_t parameter_count, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants,
                                          bool disable_robustness, bool require_full_subgroups, uint32_t required_subgroup_size) {
@@ -2219,6 +2382,18 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
         shader_module_create_info = vk::ShaderModuleCreateInfo({}, spirv.size() * sizeof(uint32_t), spirv.data());
     }
 
+#if defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT)
+    if (device->coopmat2 && !device->coopmat2_decode_vector) {
+        const uint32_t * src   = spirv.empty() ? reinterpret_cast<const uint32_t *>(spv_data) : spirv.data();
+        size_t           src_n = spirv.empty() ? spv_size / sizeof(uint32_t) : spirv.size();
+        std::vector<uint32_t> stripped;
+        if (ggml_vk_strip_decode_vector(src, src_n, stripped)) {
+            spirv = std::move(stripped);
+            shader_module_create_info = vk::ShaderModuleCreateInfo({}, spirv.size() * sizeof(uint32_t), spirv.data());
+        }
+    }
+#endif
+
     pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);
 
     vk::PushConstantRange pcr(
@@ -4690,9 +4865,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     ggml_vk_create_pipeline(device, device->pipeline_roll_f32, "roll_f32", roll_f32_len, roll_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_repeat_i32, "repeat_i32", repeat_i32_len, repeat_i32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_repeat_back_f32, "repeat_back_f32", repeat_back_f32_len, repeat_back_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
 
+    ggml_vk_create_pipeline(device, device->pipeline_repeat_i16, "repeat_i16", repeat_i16_len, repeat_i16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
 #define CREATE_UNARY(name)  \
     ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);  \
     ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
@@ -4845,6 +5022,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1);
 
+    ggml_vk_create_pipeline(device, device->pipeline_snake_f32,  "snake_f32",  snake_f32_len,  snake_f32_data,  "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_snake_f16,  "snake_f16",  snake_f16_len,  snake_f16_data,  "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_snake_bf16, "snake_bf16", snake_bf16_len, snake_bf16_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
+
     ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
@@ -4916,7 +5097,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     // conv2d, conv_transpose_2d
     for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) {
-        uint32_t conv2d_WG_SIZE  = 256;
+        // smaller WG for the small-tile fallback gives more concurrent WGs per SM
+        uint32_t conv2d_WG_SIZE  = (s == CONV_SHAPE_64x32) ? 128 : 256;
         uint32_t use_collectives = 0;  // Enables subgroup ops for preventing the re-calculation of indices.
         uint32_t conv2d_TS_K     = (s == CONV_SHAPE_64x32) ? 4 : 8;
         uint32_t conv2d_SHMEM_PAD = 4;
@@ -4955,18 +5137,77 @@ static void ggml_vk_load_shaders(vk_device& device) {
                 conv2d_BS.CRS);  // CRS block size should be capped at subgroup size for correctness when shuffle is used.
         }
 
-        uint32_t conv2d_shmem_req =
-            (conv2d_BS.K * (conv2d_BS.CRS + conv2d_SHMEM_PAD) + conv2d_BS.CRS * (conv2d_BS.NPQ + conv2d_SHMEM_PAD)) * sizeof(float);
-        if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
+        // cm1 is used only when cm2 is unavailable; capped at 64x128 (due to shared memory size).
+        // Requires 16x16x16 f16-acc since that's the fragment shape hard-coded in the shader.
+        // Subgroup size must be 32 or 64 (to keep WG_SIZE sane) and we need
+        // subgroup_size_control to force the driver to actually use it.
+        bool conv2d_use_cm1 = false;
+#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+        conv2d_use_cm1 = !device->coopmat2 &&
+                         device->coopmat_support && device->coopmat_support_16x16x16_f16acc &&
+                         device->subgroup_size_control &&
+                         (device->subgroup_size == 32 || device->subgroup_size == 64) &&
+                         s != CONV_SHAPE_128x128;
+#endif
+
+        const uint32_t conv2d_cm1_shmem_pad = 8;
+
+        auto shmem_req = [&](uint32_t pad, bool csh_store, bool fp16_shmem) {
+            const uint32_t elem_size = fp16_shmem ? (uint32_t)sizeof(uint16_t) : (uint32_t)sizeof(float);
+            const uint32_t csh_elems = csh_store ? conv2d_BS.K * conv2d_BS.NPQ : 0u;
+            return (conv2d_BS.K * (conv2d_BS.CRS + pad) + conv2d_BS.CRS * (conv2d_BS.NPQ + pad) + csh_elems) * elem_size;
+        };
+
+        // coopmat1 needs to store the output through shared memory, so check up front
+        // whether it'll fit and disable it before applying coopmat1 parameters.
+        if (conv2d_use_cm1 && device->properties.limits.maxComputeSharedMemorySize < shmem_req(conv2d_cm1_shmem_pad, true, true)) {
+            conv2d_use_cm1 = false;
+        }
+
+        uint32_t conv2d_WM = 16, conv2d_WN = 16;  // cm1 subgroup tile, ignored otherwise
+        if (conv2d_use_cm1) {
+            conv2d_SHMEM_PAD = conv2d_cm1_shmem_pad;
+            // 16x16x16 fragments; pick WM/WN to keep WG_SIZE at 256
+            // (i.e. 8 subgroups for sg=32, 4 subgroups for sg=64).
+            const bool sg64 = (device->subgroup_size == 64);
+            switch (s) {
+                case CONV_SHAPE_64x32:   conv2d_WM = sg64 ? 32 : 16; conv2d_WN = 16; break;
+                case CONV_SHAPE_64x128:  conv2d_WM = 32; conv2d_WN = sg64 ? 64 : 32; break;
+                case CONV_SHAPE_32x256:  conv2d_WM = sg64 ? 16 : 32; conv2d_WN = sg64 ? 128 : 32; break;
+                default: break;
+            }
+            const uint32_t warps_M = conv2d_BS.K / conv2d_WM;
+            const uint32_t warps_N = conv2d_BS.NPQ / conv2d_WN;
+            conv2d_WG_SIZE         = warps_M * warps_N * device->subgroup_size;
+        }
+
+        // stage cm2 accumulator through shmem for coalesced global stores;
+        // skipped on 128x128 where the extra Csh footprint hurts occupancy.
+        // cm1 always uses the staged path.
+        uint32_t conv2d_csh_store = (device->coopmat2 && s != CONV_SHAPE_128x128) ? 1u : 0u;
+        if (conv2d_use_cm1) {
+            conv2d_csh_store = 1;
+        }
+
+        // shmem is fp16 on cm2/cm1 (matches Csh), fp32 on scalar
+        const bool conv2d_use_fp16_shmem = device->coopmat2 || conv2d_use_cm1;
+
+        // shrink CRS if the non-cm1 config still doesn't fit
+        if (device->properties.limits.maxComputeSharedMemorySize < shmem_req(conv2d_SHMEM_PAD, conv2d_csh_store, conv2d_use_fp16_shmem)) {
+            GGML_ASSERT(!conv2d_use_cm1);
             conv2d_BS.CRS = 8;
             if (use_collectives) {
                 conv2d_BS.CRS = std::min(device->subgroup_size, conv2d_BS.CRS);
             }
+            conv2d_csh_store = 0;
         }
 
         std::array<uint32_t, 3> wg_denoms = { conv2d_BS.K, 1, 1 };
         std::vector<uint32_t> spec_constants = { conv2d_WG_SIZE, conv2d_BS.K, conv2d_BS.CRS, conv2d_BS.NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD };
 
+        // cm1 needs a fixed subgroup width to match the WG_SIZE we computed
+        const uint32_t conv2d_required_subgroup_size = conv2d_use_cm1 ? device->subgroup_size : 0;
+
 #define CREATE_CONV(name, type_suffix, spv_suffix) \
         for (auto &c : device->pipeline_##name##type_suffix[s]) { \
             const vk_conv2d_pipeline_state &state = c.first;  \
@@ -4979,10 +5220,14 @@ static void ggml_vk_load_shaders(vk_device& device) {
             spec_constants_cpy.push_back(state.d1); \
             spec_constants_cpy.push_back(state.KW); \
             spec_constants_cpy.push_back(state.KH); \
+            spec_constants_cpy.push_back(state.aligned); \
+            spec_constants_cpy.push_back(conv2d_csh_store); \
+            spec_constants_cpy.push_back(conv2d_WM); \
+            spec_constants_cpy.push_back(conv2d_WN); \
             ggml_vk_create_pipeline( \
                 device, c.second, #name #type_suffix, \
                 name##type_suffix##spv_suffix##_len, name##type_suffix##spv_suffix##_data, "main", 3, \
-                sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants_cpy, 1, true, use_collectives);    \
+                sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants_cpy, 1, true, use_collectives || conv2d_required_subgroup_size, conv2d_required_subgroup_size);    \
         }
 #define CREATE_CONVS(spv_suffix) \
         CREATE_CONV(conv2d, _f32, spv_suffix) \
@@ -4993,6 +5238,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
         if (device->coopmat2) {
             CREATE_CONVS(_cm2)
         } else
+#endif
+#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+        if (conv2d_use_cm1) {
+            CREATE_CONVS(_cm1)
+        } else
 #endif
         if (conv2d_UNROLL) {
             CREATE_CONVS(_unroll)
@@ -5065,6 +5315,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
         bool amd_shader_core_properties2 = false;
         bool pipeline_robustness = false;
         bool coopmat2_support = false;
+        bool coopmat2_decode_vector_support = false;
         bool pipeline_executable_properties_support = false;
         device->coopmat_support = false;
         device->integer_dot_product = false;
@@ -5099,6 +5350,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
                        !getenv("GGML_VK_DISABLE_COOPMAT2")) {
                 coopmat2_support = true;
 #endif
+            } else if (strcmp(VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME, properties.extensionName) == 0 &&
+                       !getenv("GGML_VK_DISABLE_COOPMAT2_DECODE_VECTOR")) {
+                coopmat2_decode_vector_support = true;
 #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
             } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 &&
                        !getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) {
@@ -5376,6 +5630,14 @@ static vk_device ggml_vk_get_device(size_t idx) {
         }
 #endif
 
+        VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV coopmat2_decode_vector_features {};
+        coopmat2_decode_vector_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_DECODE_VECTOR_FEATURES_NV;
+        if (coopmat2_decode_vector_support) {
+            last_struct->pNext = (VkBaseOutStructure *)&coopmat2_decode_vector_features;
+            last_struct = (VkBaseOutStructure *)&coopmat2_decode_vector_features;
+            device_extensions.push_back(VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME);
+        }
+
 #if defined(VK_KHR_shader_bfloat16)
         VkPhysicalDeviceShaderBfloat16FeaturesKHR bfloat16_features {};
         bfloat16_features.pNext = nullptr;
@@ -5535,6 +5797,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
                     found_fp32_128 && found_fp32_256 &&
                     coopmat2_props.cooperativeMatrixFlexibleDimensionsMaxDimension >= 512) {
                     device->coopmat2 = true;
+                    device->coopmat2_decode_vector = coopmat2_decode_vector_support && coopmat2_decode_vector_features.cooperativeMatrixDecodeVector;
                 }
             }
 #endif
@@ -5750,8 +6013,12 @@ static vk_device ggml_vk_get_device(size_t idx) {
 
         ggml_vk_load_shaders(device);
 
-        // Only use transfer queue on AMD non-GCN, when the graphics queue is not enabled
-        const bool prefers_transfer_queue = device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != AMD_GCN && !allow_graphics_queue;
+        // Prefer a dedicated transfer queue on AMD dGPUs (non-GCN) when graphics queue use is disabled.
+        const bool prefers_transfer_queue =
+            device->vendor_id == VK_VENDOR_ID_AMD &&
+            device->architecture != AMD_GCN &&
+            !device->uma &&
+            !allow_graphics_queue;
 
         if (!device->single_queue) {
             const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0;
@@ -5817,6 +6084,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     bool fp16_compute = false;
     bool coopmat_support = false;
     bool coopmat2_support = false;
+    bool coopmat2_decode_vector_support = false;
     bool integer_dot_product = false;
     bool bfloat16_support = false;
 
@@ -5835,6 +6103,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
                    !getenv("GGML_VK_DISABLE_COOPMAT2")) {
             coopmat2_support = true;
 #endif
+        } else if (strcmp(VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME, properties.extensionName) == 0 &&
+                   !getenv("GGML_VK_DISABLE_COOPMAT2_DECODE_VECTOR")) {
+            coopmat2_decode_vector_support = true;
 #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
         } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 &&
                     !getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) {
@@ -5919,6 +6190,13 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     }
 #endif
 
+    VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV coopmat2_decode_vector_features {};
+    coopmat2_decode_vector_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_DECODE_VECTOR_FEATURES_NV;
+    if (coopmat2_decode_vector_support) {
+        last_struct->pNext = (VkBaseOutStructure *)&coopmat2_decode_vector_features;
+        last_struct = (VkBaseOutStructure *)&coopmat2_decode_vector_features;
+    }
+
     vkGetPhysicalDeviceFeatures2(physical_device, &device_features2);
 
     fp16 = fp16 && vk12_features.shaderFloat16;
@@ -5943,7 +6221,14 @@ static void ggml_vk_print_gpu_info(size_t idx) {
 #endif
                    && ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props, device_architecture);
 
-    std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none";
+    coopmat2_decode_vector_support = coopmat2_decode_vector_support && coopmat2_decode_vector_features.cooperativeMatrixDecodeVector;
+#if !defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT)
+    coopmat2_decode_vector_support = false;
+#endif
+
+    std::string matrix_cores = coopmat2_support ? (coopmat2_decode_vector_support ? "NV_coopmat2v" : "NV_coopmat2")
+                             : coopmat_support  ? "KHR_coopmat"
+                             : "none";
 
     std::string device_name = props2.properties.deviceName.data();
     GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | bf16: %d | warp size: %zu | shared memory: %d | int dot: %d | matrix cores: %s\n",
@@ -9455,10 +9740,23 @@ static vk_conv_shapes ggml_vk_conv_select_shape(ggml_backend_vk_context * ctx, u
     // so small convolutions will still choose a smaller tile.
     const uint32_t shader_core_count = ctx->device->shader_core_count > 0 ? ctx->device->shader_core_count : 32;
 
-    if (K > 64 && n_tiles(CONV_SHAPE_128x128) >= shader_core_count * 2) {
+    // 128x128 isn't used with cm1 due to shared memory size; fall through to a smaller tile.
+    bool allow_128x128 = true;
+#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+    if (!ctx->device->coopmat2 && ctx->device->coopmat_support && ctx->device->coopmat_support_16x16x16_f16acc) {
+        allow_128x128 = false;
+    }
+#endif
+
+    if (allow_128x128 && K > 64 && n_tiles(CONV_SHAPE_128x128) >= shader_core_count * 2) {
         return CONV_SHAPE_128x128;
     } else if (K <= 32 && n_tiles(CONV_SHAPE_32x256) >= shader_core_count * 2) {
         return CONV_SHAPE_32x256;
+    } else if (K <= 64 && n_tiles(CONV_SHAPE_64x128) >= shader_core_count * 2) {
+        return CONV_SHAPE_64x128;
+    } else if (!allow_128x128 && K > 64 && n_tiles(CONV_SHAPE_64x128) >= shader_core_count * 2) {
+        // cm1 fallback for large K when 128x128 isn't available
+        return CONV_SHAPE_64x128;
     } else {
         return CONV_SHAPE_64x32;
     }
@@ -9630,7 +9928,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         return nullptr;
     case GGML_OP_REPEAT:
         if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) {
-            return ctx->device->pipeline_repeat_f32;
+            return ctx->device->pipeline_repeat_i32;
+        }
+        if (ggml_type_size(src0->type) == 2 && ggml_type_size(dst->type) == 2) {
+            return ctx->device->pipeline_repeat_i16;
         }
         return nullptr;
     case GGML_OP_REPEAT_BACK:
@@ -9990,7 +10291,18 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             uint32_t p1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 3) : 0;
             uint32_t d0 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 4) : 1;
             uint32_t d1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 5) : 1;
-            vk_conv2d_pipeline_state conv2d_pipeline_state(s0, s1, p0, p1, d0, d1, KW, KH);
+
+            // tile-aligned shapes let the shader skip bounds checks
+            const uint32_t Cin = (uint32_t)src1->ne[2];
+            const uint32_t CRS = Cin * KW * KH;
+            const uint32_t BS_K   = vk_conv_block_sizes[shape].K;
+            const uint32_t BS_CRS = vk_conv_block_sizes[shape].CRS;
+            const uint32_t BS_NPQ = vk_conv_block_sizes[shape].NPQ;
+            const uint32_t aligned = ((K   % BS_K   == 0) &&
+                                      (CRS % BS_CRS == 0) &&
+                                      (NPQ % BS_NPQ == 0)) ? 1u : 0u;
+
+            vk_conv2d_pipeline_state conv2d_pipeline_state(s0, s1, p0, p1, d0, d1, KW, KH, aligned);
 
             std::map<vk_conv2d_pipeline_state, vk_pipeline> *pipelines = nullptr;
             if (op == GGML_OP_CONV_2D) {
@@ -12110,6 +12422,45 @@ static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context&
     ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p));
 }
 
+// Dispatch the fused snake activation: y = x + sin^2(a * x) * inv_b.
+// Match the naive mul -> sin -> sqr -> mul -> add chain and run the
+// dedicated kernel directly. The pattern is validated by
+// ggml_vk_can_fuse_snake before this call.
+static void ggml_vk_snake_dispatch_fused(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) {
+    const ggml_tensor * mul0 = cgraph->nodes[node_idx + 0];
+    const ggml_tensor * sqr  = cgraph->nodes[node_idx + 2];
+    const ggml_tensor * mul1 = cgraph->nodes[node_idx + 3];
+    ggml_tensor *       add  = cgraph->nodes[node_idx + 4];
+
+    // x carries the full activation shape, a is the broadcast operand
+    const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1];
+    const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0];
+
+    // mul1 reads sqr and inv_b in either operand order
+    const ggml_tensor * inv_b = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0];
+
+    vk_pipeline pipeline = nullptr;
+    switch (x->type) {
+        case GGML_TYPE_F32:  pipeline = ctx->device->pipeline_snake_f32;  break;
+        case GGML_TYPE_F16:  pipeline = ctx->device->pipeline_snake_f16;  break;
+        case GGML_TYPE_BF16: pipeline = ctx->device->pipeline_snake_bf16; break;
+        default:             GGML_ABORT("unsupported type");
+    }
+    ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
+
+    vk_subbuffer x_buf     = ggml_vk_tensor_subbuffer(ctx, x);
+    vk_subbuffer a_buf     = ggml_vk_tensor_subbuffer(ctx, a);
+    vk_subbuffer inv_b_buf = ggml_vk_tensor_subbuffer(ctx, inv_b);
+    vk_subbuffer dst_buf   = ggml_vk_tensor_subbuffer(ctx, add);
+
+    vk_op_snake_push_constants pc{};
+    pc.ne0 = static_cast<uint32_t>(x->ne[0]);
+    pc.ne1 = static_cast<uint32_t>(x->ne[1]);
+
+    std::array<uint32_t, 3> elements = { pc.ne0, pc.ne1, 1 };
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { x_buf, a_buf, inv_b_buf, dst_buf }, pc, elements);
+}
+
 static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
     uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
     const int32_t k1 = dst->op_params[1];
@@ -13318,7 +13669,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
 
         break;
     case GGML_OP_MUL:
-        ggml_vk_mul(ctx, compute_ctx, src0, src1, node);
+        if (ctx->num_additional_fused_ops) {
+            ggml_vk_snake_dispatch_fused(ctx, compute_ctx, cgraph, node_idx);
+        } else {
+            ggml_vk_mul(ctx, compute_ctx, src0, src1, node);
+        }
 
         break;
     case GGML_OP_DIV:
@@ -14691,6 +15046,65 @@ static bool ggml_vk_can_fuse_rope_set_rows(ggml_backend_vk_context * ctx, const
     return true;
 }
 
+// Pattern check for the 5-op Snake fusion: mul -> sin -> sqr -> mul -> add.
+// Verifies the chain shape, the closure x_in_add == x_in_mul0, and that
+// the broadcast operands a and inv_b share a [1, C] layout.
+static bool ggml_vk_can_fuse_snake(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx) {
+    GGML_UNUSED(ctx);
+    if (!ggml_can_fuse(cgraph, node_idx, snake_pattern)) {
+        return false;
+    }
+
+    const ggml_tensor * mul0     = cgraph->nodes[node_idx + 0];
+    const ggml_tensor * sin_node = cgraph->nodes[node_idx + 1];
+    const ggml_tensor * sqr      = cgraph->nodes[node_idx + 2];
+    const ggml_tensor * mul1     = cgraph->nodes[node_idx + 3];
+    const ggml_tensor * add      = cgraph->nodes[node_idx + 4];
+
+    const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1];
+    const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0];
+
+    const ggml_tensor * inv_b    = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0];
+    const ggml_tensor * x_in_add = (add->src[0] == mul1) ? add->src[1] : add->src[0];
+
+    if (x_in_add != x) {
+        return false;
+    }
+    if (x->type != GGML_TYPE_F32 && x->type != GGML_TYPE_F16 && x->type != GGML_TYPE_BF16) {
+        return false;
+    }
+    // Shader bindings: data_a is A_TYPE so it follows x's precision, while
+    // data_b and data_c are hardcoded float, so the broadcast operands must
+    // be F32 regardless of x's type.
+    if (a->type     != GGML_TYPE_F32) return false;
+    if (inv_b->type != GGML_TYPE_F32) return false;
+    // Chain intermediates and output share x's precision (single A_TYPE / D_TYPE pipeline).
+    if (mul0->type     != x->type) return false;
+    if (sin_node->type != x->type) return false;
+    if (sqr->type      != x->type) return false;
+    if (mul1->type     != x->type) return false;
+    if (add->type      != x->type) return false;
+    if (!ggml_are_same_shape(a, inv_b)) {
+        return false;
+    }
+    if (a->ne[0] != 1 || a->ne[1] != x->ne[1]) {
+        return false;
+    }
+    // Dispatch is 2D over (ne0, ne1), so x and add must be 2D and a / inv_b
+    // must collapse to [1, C, 1, 1]. Higher dims are not handled by the shader.
+    if (x->ne[2]     != 1 || x->ne[3]     != 1) return false;
+    if (add->ne[2]   != 1 || add->ne[3]   != 1) return false;
+    if (a->ne[2]     != 1 || a->ne[3]     != 1) return false;
+    if (inv_b->ne[2] != 1 || inv_b->ne[3] != 1) return false;
+    // Shader uses idx = i0 + i1 * ne0 and reads data_b[i1] / data_c[i1],
+    // so every operand must be contiguous.
+    if (!ggml_is_contiguous(x) || !ggml_is_contiguous(add) ||
+        !ggml_is_contiguous(a) || !ggml_is_contiguous(inv_b)) {
+        return false;
+    }
+    return true;
+}
+
 // Check whether the tensors overlap in memory.
 // Fusions can potentially overwrite src tensors in ways that are not prevented
 // by ggml-alloc. If the fusion src is being applied in a way that's elementwise
@@ -14998,6 +15412,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
                 op_srcs_fused_elementwise[0] = false;
                 op_srcs_fused_elementwise[1] = false;
                 op_srcs_fused_elementwise[2] = false;
+            } else if (ggml_vk_can_fuse_snake(ctx, cgraph, i)) {
+                ctx->num_additional_fused_ops = 4;
+                fusion_string = "SNAKE";
+                // elementwise=true: snake.comp is safe under exact aliasing because each
+                // thread reads data_x[idx] into a register before writing data_d[idx]
+                // with a data dependency on that register. The overlap check still
+                // rejects partial overlaps (different base or size).
+                std::fill_n(op_srcs_fused_elementwise, 5, true);
             } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
                        ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
                        ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
@@ -15288,6 +15710,9 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
         if (keep_pattern(topk_moe_late_softmax)) {
             continue;
         }
+        if (keep_pattern(snake_pattern)) {
+            continue;
+        }
 
         // First, grab the next unused node.
         current_set.push_back(first_unused);
@@ -15310,7 +15735,8 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
             if (match_pattern(topk_moe_early_softmax_norm, j) ||
                 match_pattern(topk_moe_sigmoid_norm_bias, j) ||
                 match_pattern(topk_moe_early_softmax, j) ||
-                match_pattern(topk_moe_late_softmax, j)) {
+                match_pattern(topk_moe_late_softmax, j) ||
+                match_pattern(snake_pattern, j)) {
                 continue;
             }
             bool ok = true;
@@ -16020,7 +16446,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                 return false;
             }
         case GGML_OP_REPEAT:
-            return ggml_type_size(op->type) == sizeof(float) && ggml_type_size(op->src[0]->type) == sizeof(float);
+            return ggml_type_size(op->type) == ggml_type_size(op->src[0]->type) &&
+                  (ggml_type_size(op->type) == sizeof(float) || ggml_type_size(op->type) == 2);
         case GGML_OP_REPEAT_BACK:
             return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_ROPE:
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
index e1f613fb4f6..10a9ea21025 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
@@ -11,6 +11,10 @@ if (GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
     add_compile_definitions(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
     message(STATUS "Enabling coopmat2 glslc support")
 endif()
+if (GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT)
+    add_compile_definitions(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT)
+    message(STATUS "Enabling coopmat2 decode_vector glslc support")
+endif()
 if (GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
     add_compile_definitions(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
     message(STATUS "Enabling dot glslc support")
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp
index 875c012cd3b..1428ef68d81 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp
@@ -7,6 +7,13 @@
 #extension GL_KHR_memory_scope_semantics : enable
 #endif
 
+#ifdef COOPMAT
+#extension GL_KHR_cooperative_matrix : enable
+#extension GL_KHR_shader_subgroup_basic : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#extension GL_KHR_memory_scope_semantics : enable
+#endif
+
 #ifdef USE_COLLECTIVES
 #    extension GL_KHR_shader_subgroup_shuffle : enable
 #endif
@@ -77,6 +84,39 @@ layout(constant_id = 12) const uint d1             = 1;
 // Kernel spatial sizes
 layout(constant_id = 13) const uint KW             = 1;
 layout(constant_id = 14) const uint KH             = 1;
+// when set, skip bounds checks and address clamps (K/CRS/NPQ are tile-aligned)
+layout(constant_id = 15) const uint aligned        = 0;
+// stage cm2 result through shmem (Csh) for coalesced stores. cm1 always does this.
+layout(constant_id = 16) const uint csh_store      = 0;
+
+#ifdef COOPMAT
+// cm1 subgroup tile: each subgroup computes a WM x WN region as a grid of
+// TM x TN x TK fragments. Requires WM%TM == WN%TN == BS_K%WM == BS_NPQ%WN ==
+// BS_CRS%TK == 0, and WG_SIZE == (BS_K/WM) * (BS_NPQ/WN) * subgroup_size.
+layout(constant_id = 17) const uint WM             = 32;
+layout(constant_id = 18) const uint WN             = 32;
+const uint TM = 16;
+const uint TN = 16;
+const uint TK = 16;
+const uint cms_per_row = WM / TM;
+const uint cms_per_col = WN / TN;
+const uint warps_M     = BS_K / WM;
+const uint warps_N     = BS_NPQ / WN;
+#endif
+
+// without padding, H_idx/W_idx are in bounds by construction (non-TRANSPOSE only)
+#ifdef TRANSPOSE
+const bool hw_in_bounds = false;
+#else
+const bool hw_in_bounds = (p0 == 0) && (p1 == 0);
+#endif
+
+// TRANSPOSE stride alignment is trivially satisfied for stride 1
+#ifdef TRANSPOSE
+const bool stride_in_bounds = (s0 == 1) && (s1 == 1);
+#else
+const bool stride_in_bounds = true;
+#endif
 
 uint32_t       tid     = gl_LocalInvocationID.x;
 const uint32_t WG_SIZE = gl_WorkGroupSize.x;
@@ -94,7 +134,7 @@ uint32_t n_elems_out = K * NPQ;
 // Number of blocktiles per input
 uint32_t NB_CRS = splitWork(CRS, BS_CRS);
 
-#ifdef COOPMAT2
+#if defined(COOPMAT2) || defined(COOPMAT)
 #define SHMEM_TYPE float16_t
 #else
 #define SHMEM_TYPE float
@@ -112,6 +152,17 @@ const uint32_t Bsh_len = BS_CRS * Bsh_stride;
 shared SHMEM_TYPE Ash[Ash_len];  // K x CRS
 shared SHMEM_TYPE Bsh[Bsh_len];  // CRS x NPQ
 
+#if defined(COOPMAT2) || defined(COOPMAT)
+// stage matC through shmem so global stores are row-major (NPQ-contiguous)
+const uint32_t Csh_stride = BS_NPQ;
+#ifdef COOPMAT
+const uint32_t Csh_len    = BS_K * Csh_stride;
+#else
+const uint32_t Csh_len    = csh_store != 0 ? BS_K * Csh_stride : 1;
+#endif
+shared SHMEM_TYPE Csh[Csh_len];  // K x NPQ
+#endif
+
 // Threadtile sizes
 const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K;
 
@@ -161,7 +212,7 @@ ACC_TYPE perElemOpStore(const in uint32_t r, const in uint32_t c, const in ACC_T
     uint32_t OH_idx  = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL); // divide by p.OW;
     uint32_t OW_idx  = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
     uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
-    if (K_idx < K && NPQ_idx < NPQ) {
+    if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) {
         dst_data[dst_idx] = D_TYPE(elem);
     }
     return elem;
@@ -176,6 +227,13 @@ void main() {
 #ifdef COOPMAT2
     coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator> matC;
     matC = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator>(0.0);
+#elif defined(COOPMAT)
+    coopmat<float16_t, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator> sums[cms_per_row * cms_per_col];
+    [[unroll]] for (uint i = 0; i < cms_per_row * cms_per_col; i++) {
+        sums[i] = coopmat<float16_t, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator>(0.0);
+    }
+    const uint warp_r = gl_SubgroupID / warps_N;
+    const uint warp_c = gl_SubgroupID % warps_N;
 #else
     float regC[TS_K][TS_NPQ];
     for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
@@ -228,12 +286,15 @@ void main() {
             uint32_t B_lx    = Ac;
             uint32_t K_idx   = B_idx_K * BS_K + B_ly; /* Global K_idx (row index of A)*/
 #ifdef TRANSPOSE
-            uint32_t knl_idx = min(KW_idx_a + KH_idx_a * p.nb01 + K_idx * p.nb02 + Cin_idx_a * p.nb03, K * CRS - 1);
+            uint32_t knl_idx = KW_idx_a + KH_idx_a * p.nb01 + K_idx * p.nb02 + Cin_idx_a * p.nb03;
 #else
-            uint32_t knl_idx = min(KW_idx_a + KH_idx_a * p.nb01 + Cin_idx_a * p.nb02 + K_idx * p.nb03, K * CRS - 1);
+            uint32_t knl_idx = KW_idx_a + KH_idx_a * p.nb01 + Cin_idx_a * p.nb02 + K_idx * p.nb03;
 #endif
+            if (aligned == 0) {
+                knl_idx = min(knl_idx, K * CRS - 1);
+            }
             float    val     = knl_data[knl_idx];
-            if (K_idx >= K || CRS_idx_a >= CRS) {
+            if (aligned == 0 && (K_idx >= K || CRS_idx_a >= CRS)) {
                 val = 0.0;
             }
             Ash[B_ly * Ash_stride + B_lx] = SHMEM_TYPE(val);
@@ -282,15 +343,27 @@ void main() {
             uint32_t H_idx = OH_idx * s1 + KH_idx_b * d1 - p1;
             uint32_t W_idx = OW_idx * s0 + KW_idx_b * d0 - p0;
 #endif
-            uint32_t src_idx =
-                min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1);
+            uint32_t src_idx = W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13;
+            // skip clamp when address can't go OOB
+            if (aligned == 0 || !hw_in_bounds || !stride_in_bounds) {
+                src_idx = min(max(src_idx, 0), p.Cin * p.N * p.W * p.H - 1);
+            }
             float val = src_data[src_idx];
-            if (CRS_idx_b >= CRS || NPQ_idx >= NPQ
-                || H_idx >= p.H || W_idx >= p.W // Lower bound checks aren't necessary. (idx >= 0x80000000 for such case)
+            bool oob = false;
+            if (aligned == 0 && (CRS_idx_b >= CRS || NPQ_idx >= NPQ)) {
+                oob = true;
+            }
+            // also catches lower-bound underflow (idx wraps to 0x80000000+)
+            if (!hw_in_bounds && (H_idx >= p.H || W_idx >= p.W)) {
+                oob = true;
+            }
 #ifdef TRANSPOSE
-                || (H_idx_x_s1 - H_idx * s1 != 0) || (W_idx_x_s0 - W_idx * s0 != 0)
+            if (!stride_in_bounds &&
+                ((H_idx_x_s1 - H_idx * s1 != 0) || (W_idx_x_s0 - W_idx * s0 != 0))) {
+                oob = true;
+            }
 #endif
-                ) {
+            if (oob) {
                 val = 0.0;
             }
             Bsh[B_ly * Bsh_stride + B_lx] = SHMEM_TYPE(val);
@@ -303,6 +376,23 @@ void main() {
         coopMatLoad(matA, Ash, 0, Ash_stride, gl_CooperativeMatrixLayoutRowMajor);
         coopMatLoad(matB, Bsh, 0, Bsh_stride, gl_CooperativeMatrixLayoutRowMajor);
         matC = coopMatMulAdd(matA, matB, matC);
+#elif defined(COOPMAT)
+        // each subgroup multiplies its grid of fragments per TK-sized CRS chunk
+        [[unroll]] for (uint k_step = 0; k_step < BS_CRS / TK; k_step++) {
+            coopmat<float16_t, gl_ScopeSubgroup, TM, TK, gl_MatrixUseA> cache_a[cms_per_row];
+            [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
+                const uint a_off = (warp_r * WM + cm_row * TM) * Ash_stride + k_step * TK;
+                coopMatLoad(cache_a[cm_row], Ash, a_off, Ash_stride, gl_CooperativeMatrixLayoutRowMajor);
+            }
+            [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
+                coopmat<float16_t, gl_ScopeSubgroup, TK, TN, gl_MatrixUseB> cache_b;
+                const uint b_off = k_step * TK * Bsh_stride + warp_c * WN + cm_col * TN;
+                coopMatLoad(cache_b, Bsh, b_off, Bsh_stride, gl_CooperativeMatrixLayoutRowMajor);
+                [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
+                    sums[cm_col * cms_per_row + cm_row] = coopMatMulAdd(cache_a[cm_row], cache_b, sums[cm_col * cms_per_row + cm_row]);
+                }
+            }
+        }
 #else
         if (T_y * TS_K < K) {
             UNROLL for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) {
@@ -325,8 +415,51 @@ void main() {
         barrier();
     }
     /* Save C* */
+#if defined(COOPMAT2) || defined(COOPMAT)
+    // stage matC into Csh, then write to dst with coalesced NPQ-contiguous stores
+#ifdef COOPMAT
+    const bool use_staged_store = true;
+#else
+    const bool use_staged_store = (csh_store != 0);
+#endif
+    if (use_staged_store) {
+#ifdef COOPMAT
+        // cm1: each subgroup stores its fragment grid into its Csh slot
+        [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
+            [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
+                const uint csh_off = (warp_r * WM + cm_row * TM) * Csh_stride + warp_c * WN + cm_col * TN;
+                coopMatStore(sums[cm_col * cms_per_row + cm_row], Csh, csh_off, Csh_stride, gl_CooperativeMatrixLayoutRowMajor);
+            }
+        }
+#else
+        coopMatStore(matC, Csh, 0, Csh_stride, gl_CooperativeMatrixLayoutRowMajor);
+#endif
+        barrier();
+
+        // cooperative shmem->global: WG threads spread across BS_NPQ (the
+        // contiguous direction of dst), each iter covers store_rows_per_iter K-rows
+        const uint32_t store_rows_per_iter = WG_SIZE / BS_NPQ;
+        const uint32_t store_iters         = BS_K / store_rows_per_iter;
+        const uint32_t k_thread_offset     = tid / BS_NPQ;
+        const uint32_t npq_thread          = tid % BS_NPQ;
+        [[unroll]] for (uint32_t i = 0; i < store_iters; i++) {
+            uint32_t k_local = i * store_rows_per_iter + k_thread_offset;
+            uint32_t K_idx   = B_idx_K * BS_K + k_local;
+            uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + npq_thread;
+            uint32_t N_idx   = fastdiv(NPQ_idx, p.OWOHmp, p.OWOHL);
+            uint32_t OH_idx  = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL);
+            uint32_t OW_idx  = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
+            uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
+            if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) {
+                dst_data[dst_idx] = D_TYPE(Csh[k_local * Csh_stride + npq_thread]);
+            }
+        }
+    }
 #ifdef COOPMAT2
-    coopMatPerElementNV(matC, matC, perElemOpStore);
+    else {
+        coopMatPerElementNV(matC, matC, perElemOpStore);
+    }
+#endif
 #else
     if (T_y * TS_K < K) {
         for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
@@ -337,7 +470,7 @@ void main() {
                 uint32_t OH_idx  = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL); // divide by p.OW;
                 uint32_t OW_idx  = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
                 uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
-                if (K_idx < K && NPQ_idx < NPQ) {
+                if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) {
                     dst_data[dst_idx] = regC[T_ly][T_lx];
                 }
             }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl
index 88d07d2dfd5..e67299fdeca 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl
@@ -5,21 +5,60 @@
 #include "types.glsl"
 
 #if defined(DATA_A_F32)
+FLOAT_TYPE dequantize1(uint ib, uint iqs, uint a_offset) {
+    return data_a[a_offset + ib];
+}
 vec2 dequantize(uint ib, uint iqs, uint a_offset) {
     return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]);
 }
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    return vec4(data_a[a_offset + ib    ], data_a[a_offset + ib + 1],
+                data_a[a_offset + ib + 2], data_a[a_offset + ib + 3]);
+}
+vec4 dequantize4_2aligned(uint ib, uint iqs, uint a_offset) {
+    return vec4(data_a[a_offset + ib    ], data_a[a_offset + ib + 1],
+                data_a[a_offset + ib + 2], data_a[a_offset + ib + 3]);
+}
+
 #endif
 
 #if defined(DATA_A_F16)
+FLOAT_TYPE dequantize1(uint ib, uint iqs, uint a_offset) {
+    return data_a[a_offset + ib];
+}
 vec2 dequantize(uint ib, uint iqs, uint a_offset) {
     return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]);
 }
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    return vec4(data_a[a_offset + ib    ], data_a[a_offset + ib + 1],
+                data_a[a_offset + ib + 2], data_a[a_offset + ib + 3]);
+}
+vec4 dequantize4_2aligned(uint ib, uint iqs, uint a_offset) {
+    const vec2 a = data_a_packed32[(a_offset + ib)/2];
+    const vec2 b = data_a_packed32[(a_offset + ib)/2 + 1];
+    return vec4(a, b);
+}
 #endif
 
 #if defined(DATA_A_BF16)
+FLOAT_TYPE dequantize1(uint ib, uint iqs, uint a_offset) {
+    return bf16_to_fp32(data_a[a_offset + ib]);
+}
 vec2 dequantize(uint ib, uint iqs, uint a_offset) {
     return vec2(bf16_to_fp32(data_a[a_offset + ib]), bf16_to_fp32(data_a[a_offset + ib + 1]));
 }
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    return vec4(bf16_to_fp32(data_a[a_offset + ib    ]), bf16_to_fp32(data_a[a_offset + ib + 1]),
+                bf16_to_fp32(data_a[a_offset + ib + 2]), bf16_to_fp32(data_a[a_offset + ib + 3]));
+}
+vec4 dequantize4_2aligned(uint ib, uint iqs, uint a_offset) {
+    const uint a = data_a_packed32[(a_offset + ib)/2];
+    const uint b = data_a_packed32[(a_offset + ib)/2 + 1];
+    return vec4(uintBitsToFloat((a & 0x0000ffff) << 16),
+                uintBitsToFloat( a & 0xffff0000),
+                uintBitsToFloat((b & 0x0000ffff) << 16),
+                uintBitsToFloat( b & 0xffff0000));
+}
 #endif
 
 #if defined(DATA_A_Q4_0)
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl
index c582aba87dc..7171cbfa559 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl
@@ -1,4 +1,12 @@
 
+// Each format defines a scalar dequantFunc<T> plus a V=4 dequantFunc<T>_v
+// passed as the optional vector decoder to coopMatLoadTensorNV via
+// GL_NV_cooperative_matrix_decode_vector. When the driver doesn't support
+// the extension, ggml-vulkan.cpp strips it from the compiled SPIR-V.
+#ifdef GL_NV_cooperative_matrix_decode_vector
+#extension GL_NV_cooperative_matrix_decode_vector : enable
+#endif
+
 #include "types.glsl"
 
 layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufF32 {
@@ -25,6 +33,19 @@ float16_t dequantFuncQ1_0(const in decodeBufQ1_0 bl, const in uint blockCoords[2
     return bit != 0u ? d : -d;
 }
 
+f16vec4 dequantFuncQ1_0_v(const in decodeBufQ1_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const float16_t d  = bl.block.d;
+    const float16_t md = -d;
+    const uint idx = coordInBlock[1];
+    const uint qs_nib = uint(bl.block.qs[idx >> 3]) >> (idx & 0x4u);
+    return f16vec4(
+        (qs_nib & 1u) != 0u ? d : md,
+        (qs_nib & 2u) != 0u ? d : md,
+        (qs_nib & 4u) != 0u ? d : md,
+        (qs_nib & 8u) != 0u ? d : md);
+}
+
 layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ4_0 {
    block_q4_0_packed16 block;
 };
@@ -42,10 +63,28 @@ float16_t dequantFuncQ4_0(const in decodeBufQ4_0 bl, const in uint blockCoords[2
     return ret;
 }
 
+f16vec4 dequantFuncQ4_0_v(const in decodeBufQ4_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const float16_t d = bl.block.d;
+    const uint idx = coordInBlock[1];
+    const uint shift = (idx & 0x10) >> 2;     // 0 or 4
+    const uint qs_i = (idx & 0xE) >> 1;       // even, in {0,2,4,6}
+    const uint qsw = uint32_t(bl.block.qs[qs_i    ])
+                   | (uint32_t(bl.block.qs[qs_i + 1u]) << 16);
+    // shift in {0,4}: per-byte mask 0x0F isolates the wanted nibble in each byte.
+    const uint q4   = (qsw >> shift) & 0x0F0F0F0Fu;
+    const u8vec4 q  = unpack8(q4);
+    return f16vec4((vec4(q) - vec4(8.0)) * vec4(float(d)));
+}
+
 layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ4_1 {
    block_q4_1 block;
 };
 
+layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ4_1_packed32 {
+   block_q4_1_packed32 block;
+};
+
 float16_t dequantFuncQ4_1(const in decodeBufQ4_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
     const float16_t d = bl.block.d;
@@ -60,10 +99,27 @@ float16_t dequantFuncQ4_1(const in decodeBufQ4_1 bl, const in uint blockCoords[2
     return ret;
 }
 
+f16vec4 dequantFuncQ4_1_v(const in decodeBufQ4_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufQ4_1_packed32 bl32 = decodeBufQ4_1_packed32(bl);
+    const float16_t d = bl.block.d;
+    const float16_t m = bl.block.m;
+    const uint idx = coordInBlock[1];
+    const uint shift = (idx & 0x10) >> 2;     // 0 or 4
+    const uint qs_w  = (idx & 0xC) >> 2;      // iqs / 4 in [0,4)
+    const uint qsw   = uint32_t(bl32.block.qs[qs_w]);
+    const u8vec4 q   = unpack8((qsw >> shift) & 0x0F0F0F0Fu);
+    return f16vec4(vec4(q) * vec4(float(d)) + vec4(float(m)));
+}
+
 layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ5_0 {
    block_q5_0 block;
 };
 
+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ5_0_packed16 {
+   block_q5_0_packed16 block;
+};
+
 float16_t dequantFuncQ5_0(const in decodeBufQ5_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
     const float16_t d = bl.block.d;
@@ -82,10 +138,32 @@ float16_t dequantFuncQ5_0(const in decodeBufQ5_0 bl, const in uint blockCoords[2
     return ret;
 }
 
+f16vec4 dequantFuncQ5_0_v(const in decodeBufQ5_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufQ5_0_packed16 bl16 = decodeBufQ5_0_packed16(bl);
+    const float16_t d = bl.block.d;
+    const uint idx = coordInBlock[1];
+    const uint shift = (idx & 0x10) >> 2;     // 0 or 4
+    const uint qs_i  = (idx & 0xC) >> 1;      // packed16 word index, in {0,2,4,6}
+    const uint qsw = uint32_t(bl16.block.qs[qs_i    ])
+                   | (uint32_t(bl16.block.qs[qs_i + 1u]) << 16);
+    const u8vec4 ql = unpack8((qsw >> shift) & 0x0F0F0F0Fu);
+
+    const uint uint_qh = uint(bl16.block.qh[1]) << 16 | uint(bl16.block.qh[0]);
+    const uint qh_pack = uint_qh >> idx;      // bits 0..3 = element idx..idx+3 high bits
+    const uvec4 qh_high = (uvec4(qh_pack, qh_pack >> 1u, qh_pack >> 2u, qh_pack >> 3u) & uvec4(0x01u)) << 4u;
+
+    return f16vec4((vec4(ql) + vec4(qh_high) - vec4(16.0)) * vec4(float(d)));
+}
+
 layout(buffer_reference, std430, buffer_reference_align = 8) buffer decodeBufQ5_1 {
    block_q5_1 block;
 };
 
+layout(buffer_reference, std430, buffer_reference_align = 8) buffer decodeBufQ5_1_packed32 {
+   block_q5_1_packed32 block;
+};
+
 float16_t dequantFuncQ5_1(const in decodeBufQ5_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
     const float16_t d = bl.block.d;
@@ -105,6 +183,23 @@ float16_t dequantFuncQ5_1(const in decodeBufQ5_1 bl, const in uint blockCoords[2
     return ret;
 }
 
+f16vec4 dequantFuncQ5_1_v(const in decodeBufQ5_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufQ5_1_packed32 bl32 = decodeBufQ5_1_packed32(bl);
+    const float16_t d = bl.block.d;
+    const float16_t m = bl.block.m;
+    const uint idx = coordInBlock[1];
+    const uint shift = (idx & 0x10) >> 2;     // 0 or 4
+    const uint qs_w  = (idx & 0xC) >> 2;      // iqs / 4 in [0,4)
+    const uint qsw   = uint32_t(bl32.block.qs[qs_w]);
+    const u8vec4 ql  = unpack8((qsw >> shift) & 0x0F0F0F0Fu);
+
+    const uint qh_pack = bl.block.qh >> idx;  // bits 0..3 = element idx..idx+3 high bits
+    const uvec4 qh_high = (uvec4(qh_pack, qh_pack >> 1u, qh_pack >> 2u, qh_pack >> 3u) & uvec4(0x01u)) << 4u;
+
+    return f16vec4((vec4(ql) + vec4(qh_high)) * vec4(float(d)) + vec4(float(m)));
+}
+
 layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ8_0 {
    block_q8_0_packed16 block;
 };
@@ -121,6 +216,17 @@ float16_t dequantFuncQ8_0(const in decodeBufQ8_0 bl, const in uint blockCoords[2
     return ret;
 }
 
+f16vec4 dequantFuncQ8_0_v(const in decodeBufQ8_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const float16_t d = bl.block.d;
+    const uint idx = coordInBlock[1];
+    const uint base = idx >> 1u;
+    const uint w =  uint(uint16_t(bl.block.qs[base]))
+                 | (uint(uint16_t(bl.block.qs[base + 1u])) << 16u);
+    const i8vec4 qi = unpack8(int32_t(w));
+    return f16vec4(vec4(qi) * vec4(float(d)));
+}
+
 layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ2_K {
    block_q2_K block;
 };
@@ -129,6 +235,10 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ2
    block_q2_K_packed16 block;
 };
 
+layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ2_K_packed32 {
+   block_q2_K_packed32 block;
+};
+
 float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
     decodeBufQ2_K_packed16 bl16 = decodeBufQ2_K_packed16(bl);
@@ -147,10 +257,36 @@ float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2
     return ret;
 }
 
+f16vec4 dequantFuncQ2_K_v(const in decodeBufQ2_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufQ2_K_packed32 bl32 = decodeBufQ2_K_packed32(bl);
+    const f16vec2 dm = bl.block.dm;
+    const uint idx = coordInBlock[1];
+
+    const uint scalesi = idx >> 4;                      // 0..15
+    const uint qsshift = (idx & 0x60) >> 4;             // 0,2,4,6
+
+    // qs_i (packed16) = ((idx & 0x80) >> 3) + ((idx & 0x1E) >> 1) is even for idx % 4 == 0,
+    // so qs_w (packed32) = qs_i / 2 = ((idx & 0x80) >> 4) + ((idx & 0x1Cu) >> 2).
+    const uint qs_w   = ((idx & 0x80) >> 4) + ((idx & 0x1Cu) >> 2);
+    const uint qsw    = uint32_t(bl32.block.qs[qs_w]);
+    const uint qs4    = (qsw >> qsshift) & 0x03030303u;
+    const u8vec4 qi   = unpack8(qs4);
+
+    const uint scales      = bl.block.scales[scalesi];
+    const float16_t d_sub  = dm.x * float16_t(scales & 0xF);
+    const float16_t m_sub  = dm.y * float16_t(scales >> 4);
+    return f16vec4(vec4(qi) * vec4(float(d_sub)) - vec4(float(m_sub)));
+}
+
 layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ3_K {
    block_q3_K block;
 };
 
+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ3_K_packed16 {
+   block_q3_K_packed16 block;
+};
+
 float16_t dequantFuncQ3_K(const in decodeBufQ3_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
     const uint idx = coordInBlock[1];
@@ -179,6 +315,47 @@ float16_t dequantFuncQ3_K(const in decodeBufQ3_K bl, const in uint blockCoords[2
     return ret;
 }
 
+f16vec4 dequantFuncQ3_K_v(const in decodeBufQ3_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufQ3_K_packed16 bl16 = decodeBufQ3_K_packed16(bl);
+    const uint idx = coordInBlock[1];
+
+    const uint n         = idx >> 7;             // 0,1
+    const uint is        = idx >> 4;             // 0..15
+    const uint halfsplit = (idx & 0x60) >> 5;    // 0,1,2,3
+    const uint qsshift   = halfsplit << 1;       // 0,2,4,6
+    const uint hbit      = (n << 2) + halfsplit; // 0..7   (bit position in hmask byte)
+
+    uint32_t scaleidx0      = (is < 8) ? is : (is - 8);
+    uint32_t scaleidx0shift = (is < 8) ? 0u : 4u;
+    uint32_t scaleidx1      = is + 8 - (is / 4) * 4;
+    uint32_t scaleidx1shift = (is / 4) * 2;
+
+    const int8_t us = int8_t(
+        ((bl.block.scales[scaleidx0] >> scaleidx0shift) & 0xF) |
+        (((bl.block.scales[scaleidx1] >> scaleidx1shift) & 3) << 4));
+    const float16_t dl = bl.block.d * float16_t(int(us) - 32);
+
+    // For idx % 4 == 0: (idx & 0x1F) == (idx & 0x1C) is a multiple of 4.
+    const uint qsi = (n << 5) + (idx & 0x1Cu);
+    const uint hmi =             (idx & 0x1Cu);
+
+    // Two adjacent uint16 packed16 reads, combined into a uint32 in registers.
+    // After this: byte j of qsw / hmw holds the data for element idx+j.
+    const uint qsw = uint32_t(bl16.block.qs[qsi >> 1])
+                   | (uint32_t(bl16.block.qs[(qsi >> 1) + 1u]) << 16);
+    const uint hmw = uint32_t(bl16.block.hmask[hmi >> 1])
+                   | (uint32_t(bl16.block.hmask[(hmi >> 1) + 1u]) << 16);
+
+    // qsshift in {0,2,4,6} and hbit in {0..7}: per-byte masks isolate the wanted bits
+    // with no inter-byte leakage.
+    const uint ql4 = (qsw >> qsshift) & 0x03030303u;
+    const uint qh4 = (hmw >> hbit)    & 0x01010101u;
+
+    const ivec4 q = ivec4(unpack8(ql4 | (qh4 << 2))) - ivec4(4);
+    return f16vec4(vec4(q) * vec4(float(dl)));
+}
+
 layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K {
    block_q4_K block;
 };
@@ -187,6 +364,10 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4
    block_q4_K_packed16 block;
 };
 
+layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K_packed32 {
+   block_q4_K_packed32 block;
+};
+
 layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K_packed128 {
    block_q4_K_packed128 block;
 };
@@ -334,6 +515,55 @@ float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2
     return float16_t(ret);
 }
 
+f16vec4 dequantFuncQ4_K_v(const in decodeBufQ4_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufQ4_K_packed32 bl32 = decodeBufQ4_K_packed32(bl);
+    decodeBufQ4_K_packed128 bl128 = decodeBufQ4_K_packed128(bl);
+    const uint idx = coordInBlock[1];
+
+    const uint is = idx >> 5;                    // 0..7
+
+#if defined(IS_MUL_MM2) && defined(DATA_A_Q4_K)
+    vec2 v = shAscales[is * shAscales_stride + (blockCoords[0] % BM)];
+    float d = v.x;
+    float m = v.y;
+#else
+    uvec4 v = bl128.block.q4k[0];
+    const vec2 loadd = vec2(unpackFloat2x16(v.x));
+
+    uint32_t sc;
+    uint32_t mbyte;
+
+    uint32_t scale0 = v.y;
+    uint32_t scale4 = v.z;
+    uint32_t scale8 = v.w;
+
+    uint32_t sc_lo = scale0;
+    uint32_t mb_lo = scale4;
+    uint32_t sc_hi = (scale8 & 0x0F0F0F0F) | ((scale0 & 0xC0C0C0C0) >> 2);
+    uint32_t mb_hi = ((scale8 & 0xF0F0F0F0) >> 4) | ((scale4 & 0xC0C0C0C0) >> 2);
+
+    sc = is < 4 ? sc_lo : sc_hi;
+    mbyte = is < 4 ? mb_lo : mb_hi;
+    sc = sc >> (8 * (is & 3));
+    mbyte = mbyte >> (8 * (is & 3));
+    sc &= 0x3F;
+    mbyte &= 0x3F;
+
+    const float d = loadd.x * float(sc);
+    const float m = loadd.y * float(mbyte);
+#endif
+
+    // idx in [0,256); vector decode uses idx a multiple of 4. packed32 word index:
+    // (qs_i >> 1) == (idx >> 6) * 8 + ((idx & 0x1E) >> 2). sh is 0 or 4 only, so a
+    // single (w >> sh) & 0x0F0F0F0F isolates all four nibbles without inter-byte leakage.
+    const uint sh = (idx & 0x20u) >> 3u;
+    const uint w = uint32_t(bl32.block.qs[(idx >> 6) * 8u + ((idx & 0x1Eu) >> 2)]);
+    const u8vec4 q = unpack8((w >> sh) & 0x0F0F0F0Fu);
+
+    return f16vec4(vec4(d) * vec4(q) - vec4(m));
+}
+
 layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K {
    block_q5_K block;
 };
@@ -346,6 +576,10 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5
    block_q5_K_packed128 block;
 };
 
+layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K_packed32 {
+   block_q5_K_packed32 block;
+};
+
 float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
     decodeBufQ5_K_packed16 bl16 = decodeBufQ5_K_packed16(bl);
@@ -399,6 +633,58 @@ float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2
     return float16_t(ret);
 }
 
+f16vec4 dequantFuncQ5_K_v(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufQ5_K_packed32 bl32 = decodeBufQ5_K_packed32(bl);
+    decodeBufQ5_K_packed128 bl128 = decodeBufQ5_K_packed128(bl);
+    const uint idx = coordInBlock[1];
+    const uint is = idx >> 5;
+
+#if defined(IS_MUL_MM2) && defined(DATA_A_Q5_K)
+    vec2 v = shAscales[is * shAscales_stride + (blockCoords[0] % BM)];
+    float d = v.x;
+    float m = v.y;
+#else
+    uvec4 v = bl128.block.q5k[0];
+
+    const f16vec2 loadd = unpackFloat2x16(v.x);
+
+    uint32_t sc;
+    uint32_t mbyte;
+
+    uint32_t scale0 = v.y;
+    uint32_t scale4 = v.z;
+    uint32_t scale8 = v.w;
+
+    uint32_t sc_lo = scale0;
+    uint32_t mb_lo = scale4;
+    uint32_t sc_hi = (scale8 & 0x0F0F0F0F) | ((scale0 & 0xC0C0C0C0) >> 2);
+    uint32_t mb_hi = ((scale8 & 0xF0F0F0F0) >> 4) | ((scale4 & 0xC0C0C0C0) >> 2);
+
+    sc = is < 4 ? sc_lo : sc_hi;
+    mbyte = is < 4 ? mb_lo : mb_hi;
+    sc = sc >> (8 * (is & 3));
+    mbyte = mbyte >> (8 * (is & 3));
+    sc &= 0x3F;
+    mbyte &= 0x3F;
+
+    const float16_t d = loadd.x * float16_t(sc);
+    const float16_t m = loadd.y * float16_t(mbyte);
+#endif
+
+    // sh is 0 or 4; mask 0x0F0F0F0F covers the four nibbles regardless (no inter-byte leakage).
+    const uint sh = (idx & 0x20u) >> 3u;
+    const uint qs_w = (idx >> 6) * 8u + ((idx & 0x1Eu) >> 2);
+    const uint qh_w = (idx & 0x1Eu) >> 2;
+
+    const uint ql4 = (uint32_t(bl32.block.qs[qs_w]) >> sh) & 0x0F0F0F0Fu;
+    // qh stores bit `is` per element across 4 consecutive bytes; one shift+mask handles all 4.
+    const uint qh4 = ((uint32_t(bl32.block.qh[qh_w]) >> is) & 0x01010101u) << 4u;
+
+    const u8vec4 qi = unpack8(ql4 | qh4);
+    return f16vec4(vec4(qi) * vec4(d) - vec4(m));
+}
+
 layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ6_K {
    block_q6_K block;
 };
@@ -431,6 +717,35 @@ float16_t dequantFuncQ6_K(const in decodeBufQ6_K bl, const in uint blockCoords[2
     return ret;
 }
 
+f16vec4 dequantFuncQ6_K_v(const in decodeBufQ6_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufQ6_K_packed16 bl16 = decodeBufQ6_K_packed16(bl);
+    const uint idx = coordInBlock[1];
+
+    const uint b = (idx & 0x40) >> 6;
+    const uint qhshift = (idx & 0x60) >> 4;          // 0,2,4,6
+    const uint is = idx >> 4;
+    const uint sh = b * 4;                            // 0 or 4
+
+    const float16_t dscale = bl.block.d * float16_t(bl.block.scales[is]);
+
+    const uint ql_i = ((idx & 0x80) >> 2) + ((idx & 0x3E) >> 1);
+    const uint qh_i = ((idx & 0x80) >> 3) + ((idx & 0x1E) >> 1);
+
+    // Two adjacent uint16 packed16 reads, combined into a uint32 in registers.
+    // After this: byte j of qlw / qhw holds the data for element idx+j.
+    const uint qlw = uint32_t(bl16.block.ql[ql_i    ]) | (uint32_t(bl16.block.ql[ql_i + 1]) << 16);
+    const uint qhw = uint32_t(bl16.block.qh[qh_i    ]) | (uint32_t(bl16.block.qh[qh_i + 1]) << 16);
+
+    // sh in {0,4} and qhshift in {0,2,4,6}: per-byte masks 0x0F / 0x03 keep only the
+    // wanted bits with no inter-byte leakage; place qh's 2 bits at nibble high position.
+    const uint ql4 = (qlw >> sh) & 0x0F0F0F0Fu;
+    const uint qh4 = ((qhw >> qhshift) & 0x03030303u) << 4u;
+
+    const ivec4 qi = ivec4(unpack8(ql4 | qh4));
+    return f16vec4((vec4(qi) - vec4(32.0f)) * vec4(float(dscale)));
+}
+
 #if defined(DATA_A_IQ1_S)
 layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ1_S {
    block_iq1_s block;
@@ -453,6 +768,29 @@ float16_t dequantFuncIQ1_S(const in decodeBufIQ1_S bl, const in uint blockCoords
     float16_t ret = float16_t(dl) * (float16_t(bitfieldExtract(int(grid), 2 * int(idx % 8), 2)) + float16_t(delta));
     return ret;
 }
+
+f16vec4 dequantFuncIQ1_S_v(const in decodeBufIQ1_S bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const float16_t d = bl.block.d;
+    const uint idx = coordInBlock[1];
+
+    const uint ib32 = idx >> 5;
+    const uint ib8  = idx >> 3;
+    const int  i8b  = int(idx & 4);              // 0 or 4
+
+    const uint qh = bl.block.qh[ib32];
+    const uint qs = bl.block.qs[ib8];
+    const float dl    = float(d) * float(2 * bitfieldExtract(qh, 12, 3) + 1);
+    const float delta = ((qh & 0x8000u) != 0u) ? -IQ1S_DELTA : IQ1S_DELTA;
+    const uint  grid  = iq1s_grid[qs | (bitfieldExtract(qh, 3 * int(ib8 & 3), 3) << 8)];
+
+    const ivec4 q = ivec4(
+        bitfieldExtract(int(grid), 2 * (i8b + 0), 2),
+        bitfieldExtract(int(grid), 2 * (i8b + 1), 2),
+        bitfieldExtract(int(grid), 2 * (i8b + 2), 2),
+        bitfieldExtract(int(grid), 2 * (i8b + 3), 2));
+    return f16vec4((vec4(q) + vec4(delta)) * dl);
+}
 #endif
 
 #if defined(DATA_A_IQ1_M)
@@ -485,6 +823,33 @@ float16_t dequantFuncIQ1_M(const in decodeBufIQ1_M bl, const in uint blockCoords
     float16_t ret = d * float16_t(dl) * (float16_t(bitfieldExtract(int(grid), 2 * i8, 2)) + float16_t(delta));
     return ret;
 }
+
+f16vec4 dequantFuncIQ1_M_v(const in decodeBufIQ1_M bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufIQ1_M_packed64 bl64 = decodeBufIQ1_M_packed64(bl);
+    const uint idx = coordInBlock[1];
+
+    uvec2 scales = unpack32(bl64.block.scales);
+    const float16_t d = uint16BitsToHalf(uint16_t(((scales.x & 0xF000) >> 12) | ((scales.x & 0xF0000000) >> 24) | ((scales.y & 0xF000) >> 4) | ((scales.y & 0xF0000000) >> 16)));
+
+    const uint ib8  = idx >> 3;
+    const uint ib16 = idx >> 4;
+    const int  i8b  = int(idx & 4);   // 0 or 4 -- i8 base for the V=4 group
+
+    const uint sc = bl.block.scales[ib8 / 8];
+    const uint qs = bl.block.qs[ib8];
+    const uint qh = bl.block.qh[ib16] >> (4 * (ib8 & 1));
+    const float dl    = 2.0 * float(bitfieldExtract(sc, 3 * int(ib16 & 3), 3)) + 1.0;
+    const float delta = ((qh & 8u) != 0u) ? -IQ1S_DELTA : IQ1S_DELTA;
+    const uint  grid  = iq1s_grid[qs | ((qh & 7u) << 8)];
+
+    const ivec4 q = ivec4(
+        bitfieldExtract(int(grid), 2 * (i8b + 0), 2),
+        bitfieldExtract(int(grid), 2 * (i8b + 1), 2),
+        bitfieldExtract(int(grid), 2 * (i8b + 2), 2),
+        bitfieldExtract(int(grid), 2 * (i8b + 3), 2));
+    return f16vec4((vec4(q) + vec4(delta)) * (float(d) * dl));
+}
 #endif
 
 #if defined(DATA_A_IQ2_XXS)
@@ -520,6 +885,33 @@ float16_t dequantFuncIQ2_XXS(const in decodeBufIQ2_XXS bl, const in uint blockCo
     vec2 ret = dscale * g * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf);
     return float16_t(ret[idx & 1]);
 }
+
+f16vec4 dequantFuncIQ2_XXS_v(const in decodeBufIQ2_XXS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufIQ2_XXS_packed16 bl16 = decodeBufIQ2_XXS_packed16(bl);
+    const uint idx = coordInBlock[1];
+
+    const uint ib32 = idx >> 5;
+    const uint ib8  = (idx & 0x18) >> 3;
+    const uint iqs  = 8 * ib32 + ib8;
+
+    const uint qs        = bl.block.qs[iqs];
+    const uint signscale = pack32(u16vec2(bl16.block.qs[4*ib32+2], bl16.block.qs[4*ib32+3]));
+    const float dscale   = float(bl.block.d) * 0.25 * (0.5 + float(signscale >> 28));
+
+    uint sign = bitfieldExtract(signscale, 7 * int(ib8), 7);
+    sign |= bitCount(sign) << 7;
+    const uint sb = sign >> (idx & 7u);
+
+    const uint   g2 = iq2xxs_grid[qs][(idx & 4) >> 2];
+    const u8vec4 g  = unpack8(g2);
+
+    return f16vec4(
+        dscale * float(g.x) * ((sb & 1u) != 0u ? -1.0 : 1.0),
+        dscale * float(g.y) * ((sb & 2u) != 0u ? -1.0 : 1.0),
+        dscale * float(g.z) * ((sb & 4u) != 0u ? -1.0 : 1.0),
+        dscale * float(g.w) * ((sb & 8u) != 0u ? -1.0 : 1.0));
+}
 #endif
 
 #if defined(DATA_A_IQ2_XS)
@@ -548,6 +940,31 @@ float16_t dequantFuncIQ2_XS(const in decodeBufIQ2_XS bl, const in uint blockCoor
     vec2 ret = dscale * g * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf);
     return float16_t(ret[idx & 1]);
 }
+
+f16vec4 dequantFuncIQ2_XS_v(const in decodeBufIQ2_XS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const uint idx = coordInBlock[1];
+
+    const uint is     = idx >> 5;
+    const uint sshift = (idx & 0x10) >> 2;
+    const uint iqs    = idx >> 3;
+
+    const uint16_t qs     = bl.block.qs[iqs];
+    const float    dscale = float(bl.block.d) * 0.25 * (0.5 + float((bl.block.scales[is] >> sshift) & 0xF));
+
+    uint sign = uint(qs >> 9);
+    sign |= bitCount(sign) << 7;
+    const uint sb = sign >> (idx & 7u);
+
+    const uint   g2 = iq2xs_grid[qs & 0x1FF][(idx & 4) >> 2];
+    const u8vec4 g  = unpack8(g2);
+
+    return f16vec4(
+        dscale * float(g.x) * ((sb & 1u) != 0u ? -1.0 : 1.0),
+        dscale * float(g.y) * ((sb & 2u) != 0u ? -1.0 : 1.0),
+        dscale * float(g.z) * ((sb & 4u) != 0u ? -1.0 : 1.0),
+        dscale * float(g.w) * ((sb & 8u) != 0u ? -1.0 : 1.0));
+}
 #endif
 
 #if defined(DATA_A_IQ2_S)
@@ -576,6 +993,32 @@ float16_t dequantFuncIQ2_S(const in decodeBufIQ2_S bl, const in uint blockCoords
     const vec2 v = db * vec2(sign01) * vec2(unpack8(g2));
     return float16_t(v[idx & 1]);
 }
+
+f16vec4 dequantFuncIQ2_S_v(const in decodeBufIQ2_S bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const uint idx = coordInBlock[1];
+
+    const uint ib32    = idx >> 5;
+    const uint ib8     = idx >> 3;
+    const uint qhshift = 2 * (ib8 % 4);
+
+    const uint scale = (bl.block.scales[ib32] >> ((idx & 0x10) >> 2)) & 0xf;
+    const uint qs    = bl.block.qs[ib8];
+    const uint qh    = bl.block.qh[ib32];
+    const uint sb    = uint(bl.block.qs[QUANT_K / 8 + ib8]) >> (idx & 0x6u);
+
+    const float d  = float(bl.block.d);
+    const float db = d * 0.25 * (0.5 + scale);
+
+    const uint   g2 = iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)][(idx & 4) >> 2];
+    const u8vec4 g  = unpack8(g2);
+
+    return f16vec4(
+        db * float(g.x) * ((sb & 1u) != 0u ? -1.0 : 1.0),
+        db * float(g.y) * ((sb & 2u) != 0u ? -1.0 : 1.0),
+        db * float(g.z) * ((sb & 4u) != 0u ? -1.0 : 1.0),
+        db * float(g.w) * ((sb & 8u) != 0u ? -1.0 : 1.0));
+}
 #endif
 
 #if defined(DATA_A_IQ3_XXS)
@@ -609,6 +1052,32 @@ float16_t dequantFuncIQ3_XXS(const in decodeBufIQ3_XXS bl, const in uint blockCo
     const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
     return float16_t(v[idx & 1]);
 }
+
+f16vec4 dequantFuncIQ3_XXS_v(const in decodeBufIQ3_XXS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufIQ3_XXS_packed16 bl16 = decodeBufIQ3_XXS_packed16(bl);
+    const uint idx = coordInBlock[1];
+
+    const uint iqs = idx >> 2;
+    const uint is  = QUANT_K / 4 + ((idx & 0xE0) >> 3);
+
+    const float d     = float(bl.block.d);
+    const uint  qs    = bl.block.qs[iqs];
+    const uint  signs = pack32(u16vec2(bl16.block.qs[is/2+0], bl16.block.qs[is/2+1]));
+    const float db    = d * 0.5 * (0.5 + (signs >> 28));
+
+    const uint sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7);
+    const uint sb    = (sign7 | (bitCount(sign7) << 7)) >> (idx & 0x6u);
+
+    const uint   grid = iq3xxs_grid[qs];
+    const u8vec4 g    = unpack8(grid);
+
+    return f16vec4(
+        db * float(g.x) * ((sb & 1u) != 0u ? -1.0 : 1.0),
+        db * float(g.y) * ((sb & 2u) != 0u ? -1.0 : 1.0),
+        db * float(g.z) * ((sb & 4u) != 0u ? -1.0 : 1.0),
+        db * float(g.w) * ((sb & 8u) != 0u ? -1.0 : 1.0));
+}
 #endif
 
 #if defined(DATA_A_IQ3_S)
@@ -635,6 +1104,30 @@ float16_t dequantFuncIQ3_S(const in decodeBufIQ3_S bl, const in uint blockCoords
 
     return float16_t(v[idx & 1]);
 }
+
+f16vec4 dequantFuncIQ3_S_v(const in decodeBufIQ3_S bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const uint idx = coordInBlock[1];
+
+    const uint iqs = idx >> 2;
+    const uint iqh = idx >> 5;
+
+    const float d     = float(bl.block.d);
+    const uint  qs    = bl.block.qs[iqs];
+    const uint  qh    = bl.block.qh[iqh];
+    const uint  sb    = uint(bl.block.signs[iqs / 2]) >> (idx & 0x6u);
+    const uint  scale = bl.block.scales[iqs / 16];
+    const float db    = d * (1 + 2 * ((scale >> (4 * (iqh & 1))) & 0xf));
+
+    const uint   grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)];
+    const u8vec4 g    = unpack8(grid);
+
+    return f16vec4(
+        db * float(g.x) * ((sb & 1u) != 0u ? -1.0 : 1.0),
+        db * float(g.y) * ((sb & 2u) != 0u ? -1.0 : 1.0),
+        db * float(g.z) * ((sb & 4u) != 0u ? -1.0 : 1.0),
+        db * float(g.w) * ((sb & 8u) != 0u ? -1.0 : 1.0));
+}
 #endif
 
 #if defined(DATA_A_IQ4_XS)
@@ -642,6 +1135,10 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4
    block_iq4_xs block;
 };
 
+layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufIQ4_XS_packed32 {
+   block_iq4_xs_packed32 block;
+};
+
 float16_t dequantFuncIQ4_XS(const in decodeBufIQ4_XS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
     const float16_t d = bl.block.d;
@@ -657,6 +1154,30 @@ float16_t dequantFuncIQ4_XS(const in decodeBufIQ4_XS bl, const in uint blockCoor
     float16_t ret = d * float16_t(int(sl | (sh << 4)) - 32) * float16_t(kvalues_iq4nl[q]);
     return ret;
 }
+
+f16vec4 dequantFuncIQ4_XS_v(const in decodeBufIQ4_XS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufIQ4_XS_packed32 bl32 = decodeBufIQ4_XS_packed32(bl);
+    const float16_t d = bl.block.d;
+    const uint idx = coordInBlock[1];
+
+    const uint ib32   = idx >> 5;                                   // 0..7
+    const uint sl     = (bl32.block.scales_l >> (4 * ib32)) & 0xF;
+    const uint sh     = (uint(bl32.block.scales_h) >> (2 * ib32)) & 0x3;
+    const uint qshift = (idx & 0x10) >> 2;                          // {0, 4}
+    const uint qs_w   = 4 * ib32 + ((idx & 0xC) >> 2);              // iqs / 4, in [0,32)
+
+    const float16_t dl = d * float16_t(int(sl | (sh << 4)) - 32);
+
+    const uint qsw  = bl32.block.qs[qs_w];
+    const u8vec4 qv = unpack8((qsw >> qshift) & 0x0F0F0F0Fu);
+    const vec4 ret = vec4(
+        float(kvalues_iq4nl[qv.x]),
+        float(kvalues_iq4nl[qv.y]),
+        float(kvalues_iq4nl[qv.z]),
+        float(kvalues_iq4nl[qv.w])) * float(dl);
+    return f16vec4(ret);
+}
 #endif
 
 #if defined(DATA_A_IQ4_NL)
@@ -664,6 +1185,10 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4
    block_iq4_nl block;
 };
 
+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_NL_packed16 {
+   block_iq4_nl_packed16 block;
+};
+
 float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
     const float16_t d = bl.block.d;
@@ -676,6 +1201,24 @@ float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoor
     float16_t ret = float16_t(kvalues_iq4nl[qs]) * d;
     return ret;
 }
+
+f16vec4 dequantFuncIQ4_NL_v(const in decodeBufIQ4_NL bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufIQ4_NL_packed16 bl16 = decodeBufIQ4_NL_packed16(bl);
+    const float16_t d = bl.block.d;
+    const uint idx = coordInBlock[1];
+    const uint shift = (idx & 0x10) >> 2;     // 0 or 4
+    const uint qs_i  = (idx & 0xC) >> 1;      // packed16 word index, in {0,2,4,6}
+    const uint qsw = uint32_t(bl16.block.qs[qs_i    ])
+                   | (uint32_t(bl16.block.qs[qs_i + 1u]) << 16);
+    // shift in {0,4}: per-byte mask 0x0F isolates the wanted nibble in each byte.
+    const u8vec4 q = unpack8((qsw >> shift) & 0x0F0F0F0Fu);
+    return f16vec4(
+        float(d) * float(kvalues_iq4nl[q.x]),
+        float(d) * float(kvalues_iq4nl[q.y]),
+        float(d) * float(kvalues_iq4nl[q.z]),
+        float(d) * float(kvalues_iq4nl[q.w]));
+}
 #endif
 
 #if defined(DATA_A_MXFP4)
@@ -695,6 +1238,26 @@ float16_t dequantFuncMXFP4(const in decodeBufMXFP4 bl, const in uint blockCoords
     float16_t ret = float16_t(kvalues_mxfp4[qs] * d * 0.5);
     return ret;
 }
+
+f16vec4 dequantFuncMXFP4_v(const in decodeBufMXFP4 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const float d = e8m0_to_fp32(bl.block.e);
+    const uint idx = coordInBlock[1];
+    const uint iqs = idx & 0xF;
+    const uint shift = (idx & 0x10) >> 2;
+    uvec4 qv = uvec4(
+        uint(bl.block.qs[iqs]),
+        uint(bl.block.qs[iqs + 1u]),
+        uint(bl.block.qs[iqs + 2u]),
+        uint(bl.block.qs[iqs + 3u]));
+    qv = (qv >> shift) & 0xFu;
+    const vec4 ret = vec4(
+        float(kvalues_mxfp4[qv.x]),
+        float(kvalues_mxfp4[qv.y]),
+        float(kvalues_mxfp4[qv.z]),
+        float(kvalues_mxfp4[qv.w])) * d * 0.5f;
+    return f16vec4(ret);
+}
 #endif
 
 #if defined(DATA_A_NVFP4)
@@ -702,6 +1265,10 @@ layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufNVF
    block_nvfp4 block;
 };
 
+layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufNVFP4_packed32 {
+   block_nvfp4_packed32 block;
+};
+
 float16_t dequantFuncNVFP4(const in decodeBufNVFP4 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
     const uint idx = coordInBlock[1];
@@ -713,56 +1280,97 @@ float16_t dequantFuncNVFP4(const in decodeBufNVFP4 bl, const in uint blockCoords
     qs = (qs >> shift) & 0xF;
     return float16_t(kvalues_mxfp4[qs] * d * 0.5);
 }
+
+f16vec4 dequantFuncNVFP4_v(const in decodeBufNVFP4 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufNVFP4_packed32 bl32 = decodeBufNVFP4_packed32(bl);
+    const uint idx = coordInBlock[1];
+    const uint sub   = idx >> 4;
+    const uint qs_w  = ((idx & 0x30) >> 3) + ((idx & 0x4u) >> 2);  // iqs / 4, in [0,8)
+    const uint shift = (idx & 0x8) >> 1;
+    const float d    = ue4m3_to_fp32(bl.block.d[sub]);
+
+    const uint qsw  = uint32_t(bl32.block.qs[qs_w]);
+    const u8vec4 qv = unpack8((qsw >> shift) & 0x0F0F0F0Fu);
+    const vec4 ret = vec4(
+        float(kvalues_mxfp4[qv.x]),
+        float(kvalues_mxfp4[qv.y]),
+        float(kvalues_mxfp4[qv.z]),
+        float(kvalues_mxfp4[qv.w])) * d * 0.5f;
+    return f16vec4(ret);
+}
 #endif
 
 #if defined(DATA_A_Q1_0)
 #define dequantFuncA dequantFuncQ1_0
+#define dequantFuncA_v dequantFuncQ1_0_v
 #elif defined(DATA_A_Q4_0)
 #define dequantFuncA dequantFuncQ4_0
+#define dequantFuncA_v dequantFuncQ4_0_v
 #elif defined(DATA_A_Q4_1)
 #define dequantFuncA dequantFuncQ4_1
+#define dequantFuncA_v dequantFuncQ4_1_v
 #elif defined(DATA_A_Q5_0)
 #define dequantFuncA dequantFuncQ5_0
+#define dequantFuncA_v dequantFuncQ5_0_v
 #elif defined(DATA_A_Q5_1)
 #define dequantFuncA dequantFuncQ5_1
+#define dequantFuncA_v dequantFuncQ5_1_v
 #elif defined(DATA_A_Q8_0)
 #define dequantFuncA dequantFuncQ8_0
+#define dequantFuncA_v dequantFuncQ8_0_v
 #elif defined(DATA_A_Q2_K)
 #define dequantFuncA dequantFuncQ2_K
+#define dequantFuncA_v dequantFuncQ2_K_v
 #elif defined(DATA_A_Q3_K)
 #define dequantFuncA dequantFuncQ3_K
+#define dequantFuncA_v dequantFuncQ3_K_v
 #elif defined(DATA_A_Q4_K)
 #define dequantFuncA dequantFuncQ4_K
+#define dequantFuncA_v dequantFuncQ4_K_v
 #define fetch_scales fetch_scalesQ4_K
 #define store_scales store_scalesQ4_K
 #elif defined(DATA_A_Q5_K)
 #define dequantFuncA dequantFuncQ5_K
+#define dequantFuncA_v dequantFuncQ5_K_v
 #define fetch_scales fetch_scalesQ5_K
 #define store_scales store_scalesQ4_K
 #elif defined(DATA_A_Q6_K)
 #define dequantFuncA dequantFuncQ6_K
+#define dequantFuncA_v dequantFuncQ6_K_v
 #elif defined(DATA_A_IQ1_S)
 #define dequantFuncA dequantFuncIQ1_S
+#define dequantFuncA_v dequantFuncIQ1_S_v
 #elif defined(DATA_A_IQ1_M)
 #define dequantFuncA dequantFuncIQ1_M
+#define dequantFuncA_v dequantFuncIQ1_M_v
 #elif defined(DATA_A_IQ2_XXS)
 #define dequantFuncA dequantFuncIQ2_XXS
+#define dequantFuncA_v dequantFuncIQ2_XXS_v
 #elif defined(DATA_A_IQ2_XS)
 #define dequantFuncA dequantFuncIQ2_XS
+#define dequantFuncA_v dequantFuncIQ2_XS_v
 #elif defined(DATA_A_IQ2_S)
 #define dequantFuncA dequantFuncIQ2_S
+#define dequantFuncA_v dequantFuncIQ2_S_v
 #elif defined(DATA_A_IQ3_XXS)
 #define dequantFuncA dequantFuncIQ3_XXS
+#define dequantFuncA_v dequantFuncIQ3_XXS_v
 #elif defined(DATA_A_IQ3_S)
 #define dequantFuncA dequantFuncIQ3_S
+#define dequantFuncA_v dequantFuncIQ3_S_v
 #elif defined(DATA_A_IQ4_XS)
 #define dequantFuncA dequantFuncIQ4_XS
+#define dequantFuncA_v dequantFuncIQ4_XS_v
 #elif defined(DATA_A_IQ4_NL)
 #define dequantFuncA dequantFuncIQ4_NL
+#define dequantFuncA_v dequantFuncIQ4_NL_v
 #elif defined(DATA_A_MXFP4)
 #define dequantFuncA dequantFuncMXFP4
+#define dequantFuncA_v dequantFuncMXFP4_v
 #elif defined(DATA_A_NVFP4)
 #define dequantFuncA dequantFuncNVFP4
+#define dequantFuncA_v dequantFuncNVFP4_v
 #elif defined(DATA_A_F32)
 #define dequantFuncA dequantFuncF32
 #endif
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp b/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp
new file mode 100644
index 00000000000..65e9c678401
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp
@@ -0,0 +1,7 @@
+#version 460
+
+#extension GL_NV_cooperative_matrix_decode_vector : require
+
+void main()
+{
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
index 141bb870883..6d45b4931df 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
@@ -11,6 +11,9 @@
 #extension GL_KHR_memory_scope_semantics : enable
 #extension GL_KHR_cooperative_matrix : enable
 #extension GL_NV_cooperative_matrix2 : enable
+#ifdef GL_NV_cooperative_matrix_decode_vector
+#extension GL_NV_cooperative_matrix_decode_vector : enable
+#endif
 #extension GL_EXT_buffer_reference : enable
 #extension GL_KHR_shader_subgroup_ballot : enable
 #extension GL_KHR_shader_subgroup_vote : enable
@@ -54,6 +57,41 @@ float16_t faDecodeV(const decodeBufFA_V bl_in, const uint blockCoords[2], const
     }
 }
 
+// V=4 vector decode for K/V; dispatches to per-format _v decoders.
+f16vec4 faDecodeKVector(const decodeBufFA_K bl_in, const uint blockCoords[2], const uint coordInBlock[2]) {
+    switch (FaTypeK) {
+        case 0u: return f16vec4(decodeBufF32(bl_in).block);
+        case 2u: return dequantFuncQ4_0_v(decodeBufQ4_0(bl_in), blockCoords, coordInBlock);
+        case 3u: return dequantFuncQ4_1_v(decodeBufQ4_1(bl_in), blockCoords, coordInBlock);
+        case 6u: return dequantFuncQ5_0_v(decodeBufQ5_0(bl_in), blockCoords, coordInBlock);
+        case 7u: return dequantFuncQ5_1_v(decodeBufQ5_1(bl_in), blockCoords, coordInBlock);
+        case 8u: return dequantFuncQ8_0_v(decodeBufQ8_0(bl_in), blockCoords, coordInBlock);
+        case 41u: return dequantFuncQ1_0_v(decodeBufQ1_0(bl_in), blockCoords, coordInBlock);
+        default: return f16vec4(0);
+    }
+}
+
+f16vec4 faDecodeVVector(const decodeBufFA_V bl_in, const uint blockCoords[2], const uint coordInBlock[2]) {
+    switch (FaTypeV) {
+        case 0u: return f16vec4(decodeBufF32(bl_in).block);
+        case 2u: return dequantFuncQ4_0_v(decodeBufQ4_0(bl_in), blockCoords, coordInBlock);
+        case 3u: return dequantFuncQ4_1_v(decodeBufQ4_1(bl_in), blockCoords, coordInBlock);
+        case 6u: return dequantFuncQ5_0_v(decodeBufQ5_0(bl_in), blockCoords, coordInBlock);
+        case 7u: return dequantFuncQ5_1_v(decodeBufQ5_1(bl_in), blockCoords, coordInBlock);
+        case 8u: return dequantFuncQ8_0_v(decodeBufQ8_0(bl_in), blockCoords, coordInBlock);
+        case 41u: return dequantFuncQ1_0_v(decodeBufQ1_0(bl_in), blockCoords, coordInBlock);
+        default: return f16vec4(0);
+    }
+}
+
+#ifdef GL_NV_cooperative_matrix_decode_vector
+#define FADECODEK , faDecodeK, faDecodeKVector
+#define FADECODEV , faDecodeV, faDecodeVVector
+#else
+#define FADECODEK , faDecodeK
+#define FADECODEV , faDecodeV
+#endif
+
 layout (binding = 0) readonly buffer Q {uint8_t data_q[];};
 layout (binding = 1) readonly buffer K {uint8_t data_k[];};
 layout (binding = 2) readonly buffer V {uint8_t data_v[];};
@@ -259,7 +297,7 @@ void main() {
         // F16: bs_k==1 (direct load). F32: bs_k==4 (vec4 / dequantFuncF32). Q4/Q8 family: bs_k==32. Q1_0: bs_k==128.
         const bool k_use_decode = (bs_k > 1u);
         if (k_use_decode) {
-            coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose, faDecodeK);
+            coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose FADECODEK);
         } else {
             coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose);
         }
@@ -325,7 +363,7 @@ void main() {
         uint32_t v_offset = iv2*p.nb22 + iv3*p.nb23;
         const bool v_use_decode = (bs_v > 1u);
         if (v_use_decode) {
-            coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad), faDecodeV);
+            coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad) FADECODEV);
         } else {
             coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad));
         }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp
index ba4c2103f0c..f4130d223b1 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp
@@ -44,36 +44,81 @@ void im2col(const uint ow, const uint z_idx) {
 
     const uint KHKW = p.KH * p.KW;
 
+    // Precompute base input coordinates
+    const int base_iw = int(ow * p.s0) - p.p0;
+    const int base_ih = int(oh * p.s1) - p.p1;
+
+    // Precompute step deltas
+    const uint delta_ic  = BLOCK_SIZE / KHKW;
+    const uint delta_rem = BLOCK_SIZE % KHKW;
+
+    const uint delta_ky  = delta_rem / p.KW;
+    const uint delta_kx  = delta_rem % p.KW;
+
+    const uint delta_ic_offset = delta_ic * p.offset_delta;
+
+    // If using BDA mode, precompute the base pointer  and step size
+#if BDA
+    const BDA_STORAGE_T base_dst_addr = p.dst_addr + D_SIZE * dst_row;
+    const uint bda_step = D_SIZE * BLOCK_SIZE;
+#endif
+
     uint wg_x = gl_WorkGroupID.x;
     do {
         const uint wg_offset = wg_x * 512;
 
-        [[unroll]] for (uint i = 0; i < NUM_ITER; ++i) {
-            const uint chw_idx = wg_offset + gidx + i * BLOCK_SIZE;
+        uint chw_idx = wg_offset + gidx;
+
+        uint ic  = chw_idx / KHKW;
+        uint rem = chw_idx % KHKW;
+
+        uint ky  = rem / p.KW;
+        uint kx  = rem % p.KW;
 
+        uint ic_offset = src_batch + ic * p.offset_delta;
+
+        // Initialize running pointer/index for the destination buffer
+#if BDA
+        BDA_STORAGE_T current_dst_addr = base_dst_addr + D_SIZE * chw_idx;
+#else
+        uint current_dst_idx = dst_row + chw_idx;
+#endif
+
+        [[unroll]] for (uint i = 0; i < NUM_ITER; ++i) {
             if (chw_idx >= p.CHW) {
                 return;
             }
 
-            const uint ic = chw_idx / KHKW;
-            const uint rem = chw_idx - ic * KHKW;
-            const uint ky = rem / p.KW;
-            const uint kx = rem - ky * p.KW;
-
-            const uint iiw = ow * p.s0 + kx * p.d0 - p.p0;
-            const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
+            const int iiw = base_iw + int(kx * p.d0);
+            const int iih = base_ih + int(ky * p.d1);
 
             A_TYPE val = A_TYPE(0);
-            if (iih < p.IH && iiw < p.IW) {
-                val = data_a[src_batch + ic * p.offset_delta + iih * p.IW + iiw];
+            if (uint(iih) < p.IH && uint(iiw) < p.IW) {
+                val = data_a[ic_offset + uint(iih) * p.IW + uint(iiw)];
             }
 
 #if BDA
-            D_ptr out_ptr = D_ptr(p.dst_addr + D_SIZE * (dst_row + chw_idx));
-            out_ptr.d = D_TYPE(val);
+            D_ptr(current_dst_addr).d = D_TYPE(val);
+            current_dst_addr += bda_step;
 #else
-            data_d[dst_row + chw_idx] = D_TYPE(val);
+            data_d[current_dst_idx] = D_TYPE(val);
+            current_dst_idx += BLOCK_SIZE;
 #endif
+
+            chw_idx   += BLOCK_SIZE;
+            ic_offset += delta_ic_offset;
+            kx        += delta_kx;
+            ky        += delta_ky;
+
+            // Handle X axis wrap
+            uint kx_wrap = uint(kx >= p.KW);
+            kx          -= kx_wrap * p.KW;
+            ky          += kx_wrap;
+
+            // Handle Y axis wrap
+            uint ky_wrap = uint(ky >= p.KH);
+            ky          -= ky_wrap * p.KH;
+            ic_offset   += ky_wrap * p.offset_delta;
         }
 
         wg_x += gl_NumWorkGroups.x;
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
index 2271be4021b..5a9d0e778fd 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
@@ -10,12 +10,38 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
 #if !defined(DATA_A_F32) && !defined(DATA_A_F16) && !defined(DATA_A_BF16)
 #define K_PER_ITER 8
 #else
-#define K_PER_ITER 2
+#define K_PER_ITER 4
 #endif
 
 
 uint a_offset, b_offset, d_offset, y_offset;
 
+vec4 load_b(const uint j, const uint iybs, const uint iqs, const bool lastiter, out bool OOB_y, out bool OOB_z, out bool OOB_w) {
+    // Check if the latter elements are OOB, and don't fetch B or accumulate it.
+    OOB_y = lastiter && (iybs + iqs + y_offset >= p.ncols);
+    OOB_z = lastiter && (iybs + iqs + y_offset*2 >= p.ncols);
+    OOB_w = lastiter && (iybs + iqs + y_offset*3 >= p.ncols);
+
+    if (!OOB_w) {
+        return vec4(FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]),
+                 FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]),
+                 FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset*2]),
+                 FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset*3]));
+    } else if (!OOB_z) {
+        return vec4(FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]),
+                 FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]),
+                 FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset*2]),
+                 0);
+    } else if (!OOB_y) {
+        return vec4(FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]),
+                 FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]),
+                 0, 0);
+    } else {
+        return vec4(FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]),
+                 0, 0, 0);
+    }
+}
+
 void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i, bool lastiter)
 {
     [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
@@ -25,6 +51,8 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const
 
 #if K_PER_ITER == 8
 #if QUANT_R == 2
+        // Note that we end up fetching bogus elements here, but its fine as they'll be
+        // within an accessible block.
         const vec4 bv02 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4]);
         const vec4 bv13 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs + y_offset) / 4]);
         const vec4 bv0 = vec4(bv02.x, bv13.x, bv02.y, bv13.y);
@@ -34,18 +62,11 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const
         const vec4 bv1 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4 + 1]);
 #endif
 #else
-        // Check if the second of the pair of elements is OOB, and don't fetch B or
-        // accumulate it. We still fetch a pair of elements for A, which is fine for
-        // quantized formats since they'll be within the same block. We should
-        // probably skip fetching the second element for F16/F32, but as of now we
-        // still do.
-        const bool OOB = lastiter && (iybs + iqs + y_offset >= p.ncols);
-
-        FLOAT_TYPE b0 = 0, b1 = 0;
-        b0 = FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]);
-        if (!OOB) {
-            b1 = FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]);
-        }
+        bool OOB_y;
+        bool OOB_z;
+        bool OOB_w;
+
+        const vec4 b = load_b(j, iybs, iqs, lastiter, OOB_y, OOB_z, OOB_w);
 #endif
         uint ibi = first_row*p.ncols;
         [[unroll]] for (uint n = 0; n < num_rows; ++n) {
@@ -71,22 +92,60 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const
 
             temp[j][n] += rowtmp;
 #else
-            const vec2 v = dequantize(ib, iqs, a_offset);
-
-            // matrix multiplication
-            temp[j][n] = fma(FLOAT_TYPE(v.x), b0, temp[j][n]);
-            if (!OOB) {
-                temp[j][n] = fma(FLOAT_TYPE(v.y), b1, temp[j][n]);
+            if (!OOB_w) {
+                const vec4 v = dequantize4(ib, iqs, a_offset);
+                temp[j][n] += dot(v, b);
+            } else if (!OOB_z) {
+                const vec2 v0 = dequantize(ib, iqs, a_offset);
+                const FLOAT_TYPE v1 = dequantize1(ib + 2/QUANT_R, iqs, a_offset);
+                const vec3 v = vec3(v0.x, v0.y, v1);
+                const vec3 b0 = vec3(b.x, b.y, b.z);
+                temp[j][n] += dot(v, b0);
+            } else if (!OOB_y) {
+                const vec2 v0 = dequantize(ib, iqs, a_offset);
+                const vec2 b0 = vec2(b.x, b.y);
+                temp[j][n] += dot(v0, b0);
+            } else {
+                const FLOAT_TYPE v = dequantize1(ib, iqs, a_offset);
+                temp[j][n] = fma(v, b.x, temp[j][n]);
             }
 #endif
         }
     }
 }
 
+#if defined(DATA_A_F32) || defined(DATA_A_F16) || defined(DATA_A_BF16)
+void iter_aligned_nonquant(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i)
+{
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        const uint col = i*BLOCK_SIZE + K_PER_ITER*tid;
+        const uint iqs = 0; // quant index
+        const uint iybs = col; // y block start index
+
+        const vec4 b = data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4];
+
+        uint ibi = first_row*p.ncols;
+        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+            const uint ib = (ibi + col)/QUANT_K; // block index
+            ibi += p.ncols;
+
+            const vec4 v = dequantize4_2aligned(ib, iqs, a_offset);
+
+            // matrix multiplication
+            temp[j][n] += dot(v, b);
+        }
+    }
+}
+#endif
+
 void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
     const uint tid = gl_LocalInvocationID.x;
 
     get_offsets(a_offset, b_offset, d_offset);
+    const bool is_aligned_nonquant =
+        p.batch_stride_b % 4 == 0 && b_offset % 4 == 0 &&
+        p.ncols % 4 == 0 && BLOCK_SIZE % 4 == 0 &&
+        K_PER_ITER == 4;
 
     y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
 
@@ -105,17 +164,26 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
     int unroll_count = 4;
     uint unrolled_iters = num_iters & ~(unroll_count - 1);
 
-#if K_PER_ITER == 2
+    uint i = 0;
+
+#if K_PER_ITER == 4
     // If the K dimension is odd, we need lastiter==true on the last iteration
     // so OOB is computed correctly. Skip some unrolling to make that happen.
-    if ((p.ncols & 1) != 0 &&
+    if ((p.ncols & 3) != 0 &&
         unrolled_iters == num_iters &&
         unrolled_iters > 0) {
         unrolled_iters -= unroll_count;
     }
+    if (is_aligned_nonquant) {
+        while (i < unrolled_iters) {
+            // Manually partially unroll the loop
+            [[unroll]] for (uint k = 0; k < unroll_count; ++k) {
+                iter_aligned_nonquant(temp, first_row, num_rows, tid, i*K_PER_ITER);
+                i++;
+            }
+        }
+    } else {
 #endif
-
-    uint i = 0;
     while (i < unrolled_iters) {
         // Manually partially unroll the loop
         [[unroll]] for (uint k = 0; k < unroll_count; ++k) {
@@ -123,18 +191,30 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
             i++;
         }
     }
+#if K_PER_ITER == 4
+    }
+#endif
 
     unroll_count = 2;
     unrolled_iters = num_iters & ~(unroll_count - 1);
 
-#if K_PER_ITER == 2
-    if ((p.ncols & 1) != 0 &&
+#if K_PER_ITER == 4
+    if ((p.ncols & 3) != 0 &&
         unrolled_iters == num_iters &&
         unrolled_iters > 0) {
         unrolled_iters -= unroll_count;
     }
-#endif
 
+    if (is_aligned_nonquant) {
+        while (i < unrolled_iters && is_aligned_nonquant) {
+            // Manually partially unroll the loop
+            [[unroll]] for (uint k = 0; k < unroll_count; ++k) {
+                iter_aligned_nonquant(temp, first_row, num_rows, tid, i*K_PER_ITER);
+                i++;
+            }
+        }
+    } else {
+#endif
     while (i < unrolled_iters) {
         // Manually partially unroll the loop
         [[unroll]] for (uint k = 0; k < unroll_count; ++k) {
@@ -142,10 +222,25 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
             i++;
         }
     }
+#if K_PER_ITER == 4
+    }
+#endif
+
+#if K_PER_ITER == 4
+    if (is_aligned_nonquant) {
+        while (i < num_iters) {
+            iter_aligned_nonquant(temp, first_row, num_rows, tid, i*K_PER_ITER);
+            i++;
+        }
+    } else {
+#endif
     while (i < num_iters) {
         iter(temp, first_row, num_rows, tid, i*K_PER_ITER, true);
         i++;
     }
+#if K_PER_ITER == 4
+    }
+#endif
 
     reduce_result(temp, d_offset, first_row, num_rows, tid);
 }
@@ -164,6 +259,6 @@ void main() {
         if (first_row >= p.stride_d) {
             return;
         }
-        compute_outputs(first_row, p.stride_d - first_row);
+        compute_outputs(first_row, min(NUM_ROWS, p.stride_d - first_row));
     }
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp
index 497a18ff8a7..250d708479b 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp
@@ -71,10 +71,12 @@ layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
 layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
 
 #if QUANT_K > 1
-#define DECODEFUNCA , dequantFuncA
-
 #include "dequant_funcs_cm2.glsl"
-
+#if defined(dequantFuncA_v) && defined(GL_NV_cooperative_matrix_decode_vector)
+#define DECODEFUNCA , dequantFuncA, dequantFuncA_v
+#else
+#define DECODEFUNCA , dequantFuncA
+#endif
 #else
 #define DECODEFUNCA
 #endif
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp b/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp
new file mode 100644
index 00000000000..8585538cbb0
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp
@@ -0,0 +1,49 @@
+#version 450
+
+#include "types.glsl"
+
+// Fused snake activation: y = x + sin(b * x)^2 * c
+//   data_a [ne0, ne1]   per element activation x    (A_TYPE)
+//   data_b [1,   ne1]   per channel multiplier      (float)
+//   data_c [1,   ne1]   per channel inverse scale   (float, precomputed as 1 / freq)
+//   data_d [ne0, ne1]   output y                    (D_TYPE)
+layout (binding = 0) readonly  buffer A {A_TYPE data_a[];};
+layout (binding = 1) readonly  buffer B {float  data_b[];};
+layout (binding = 2) readonly  buffer C {float  data_c[];};
+layout (binding = 3) writeonly buffer D {D_TYPE data_d[];};
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (push_constant) uniform parameter {
+    uint32_t ne0;
+    uint32_t ne1;
+} p;
+
+// Load A_TYPE to float
+float load_val(uint32_t idx) {
+#if defined(DATA_A_BF16)
+    return bf16_to_fp32(uint32_t(data_a[idx]));
+#else
+    return float(data_a[idx]);
+#endif
+}
+
+// Store float as D_TYPE
+void store_val(uint32_t idx, float v) {
+#if defined(DATA_D_BF16)
+    data_d[idx] = D_TYPE(fp32_to_bf16(v));
+#else
+    data_d[idx] = D_TYPE(v);
+#endif
+}
+
+void main() {
+    const uint32_t i0 = gl_GlobalInvocationID.x;
+    const uint32_t i1 = gl_GlobalInvocationID.y;
+    if (i0 >= p.ne0 || i1 >= p.ne1) return;
+
+    const uint32_t idx = i0 + i1 * p.ne0;
+    const float xi = load_val(idx);
+    const float s  = sin(data_b[i1] * xi);
+    store_val(idx, xi + s * s * data_c[i1]);
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl
index 4bcd97756fd..f84d6f87334 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl
@@ -31,6 +31,7 @@
 #else
 #define A_TYPE float16_t
 #endif
+#define A_TYPE_PACKED32 f16vec2
 #endif
 
 #if defined(DATA_A_BF16)
@@ -44,6 +45,7 @@
 #else
 #define A_TYPE uint16_t
 #endif
+#define A_TYPE_PACKED32 uint32_t
 #endif
 
 #define QUANT_K_Q4_0 32
@@ -1722,11 +1724,18 @@ struct block_nvfp4
     uint8_t qs[QUANT_K_NVFP4 / 2];
 };
 
+struct block_nvfp4_packed32
+{
+    uint32_t d[QUANT_K_NVFP4 / 16 / 4];
+    uint32_t qs[QUANT_K_NVFP4 / 2 / 4];
+};
+
 #if defined(DATA_A_NVFP4)
 #define QUANT_K QUANT_K_NVFP4
 #define QUANT_R QUANT_R_NVFP4
 #define QUANT_AUXF 1
 #define A_TYPE block_nvfp4
+#define A_TYPE_PACKED32 block_nvfp4_packed32
 #endif
 
 #if defined(DATA_A_IQ4_NL) || defined(DATA_A_IQ4_XS)
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
index e3a9d61a558..24b9d25f733 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -798,9 +798,11 @@ void process_shaders() {
 
     string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
-    string_to_spv("repeat_f32", "repeat.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("repeat_i32", "repeat.comp", {{"A_TYPE", "int32_t"}, {"D_TYPE", "int32_t"}});
     string_to_spv("repeat_back_f32", "repeat_back.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 
+    string_to_spv("repeat_i16", "repeat.comp", {{"A_TYPE", "int16_t"}, {"D_TYPE", "int16_t"}});
+
     string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
     string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
@@ -952,6 +954,10 @@ void process_shaders() {
 
     string_to_spv("conv_transpose_1d_f32", "conv_transpose_1d.comp", {{"A_TYPE", "float"},  {"B_TYPE", "float"}, {"D_TYPE", "float"}});
 
+    string_to_spv("snake_f32",  "snake.comp", {{"DATA_A_F32", "1"},  {"A_TYPE", "float"},     {"D_TYPE", "float"}});
+    string_to_spv("snake_f16",  "snake.comp", {{"DATA_A_F16", "1"},  {"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("snake_bf16", "snake.comp", {{"DATA_A_BF16", "1"}, {"DATA_D_BF16", "1"}, {"A_TYPE", "uint16_t"},  {"D_TYPE", "uint16_t"}});
+
     string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
 
     string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
@@ -980,8 +986,16 @@ void process_shaders() {
                 string_to_spv(name + (unroll ? "_unroll" : ""), "conv2d_mm.comp", defines);
 #if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
                 if (unroll) {
-                    defines["COOPMAT2"] = "1";
-                    string_to_spv(name, "conv2d_mm.comp", defines, true, false, true);
+                    auto cm2_defines = defines;
+                    cm2_defines["COOPMAT2"] = "1";
+                    string_to_spv(name, "conv2d_mm.comp", cm2_defines, true, false, true);
+                }
+#endif
+#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+                if (unroll) {
+                    auto cm1_defines = defines;
+                    cm1_defines["COOPMAT"] = "1";
+                    string_to_spv(name, "conv2d_mm.comp", cm1_defines, true, true, false);
                 }
 #endif
             }
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
index 4c4eda1cbe5..60e98a60741 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
@@ -52,7 +52,7 @@
 #define WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG 4
 #define WEBGPU_MUL_MAT_VEC_K_Q_OUTPUTS_PER_WG      4
 
-// default size for legacy matrix multiplication
+// default size for reg-tile matrix multiplication
 #define WEBGPU_MUL_MAT_WG_SIZE 256
 
 // Same hash combine function as in boost
@@ -93,6 +93,8 @@ struct ggml_webgpu_shader_lib_context {
     uint32_t sg_mat_k                 = 0;
     uint32_t min_subgroup_size        = 0;
     uint32_t max_subgroup_size        = 0;
+    bool     supports_dot_product     = false;
+    std::string vendor;
 };
 
 struct webgpu_pipeline {
@@ -850,31 +852,15 @@ inline ggml_webgpu_flash_attn_decisions ggml_webgpu_flash_attn_get_decisions(
 
 /** Matrix Multiplication **/
 
-struct ggml_webgpu_legacy_mul_mat_pipeline_key {
-    ggml_type src0_type;
-    ggml_type src1_type;
-
-    bool operator==(const ggml_webgpu_legacy_mul_mat_pipeline_key & other) const {
-        return src0_type == other.src0_type && src1_type == other.src1_type;
-    }
-};
-
-struct ggml_webgpu_legacy_mul_mat_pipeline_key_hash {
-    size_t operator()(const ggml_webgpu_legacy_mul_mat_pipeline_key & key) const {
-        size_t seed = 0;
-        ggml_webgpu_hash_combine(seed, key.src0_type);
-        ggml_webgpu_hash_combine(seed, key.src1_type);
-        return seed;
-    }
-};
-
 struct ggml_webgpu_mul_mat_vec_pipeline_key {
     ggml_type src0_type;
     ggml_type src1_type;
     int       vectorized;
+    bool      use_mmvq;
 
     bool operator==(const ggml_webgpu_mul_mat_vec_pipeline_key & other) const {
-        return src0_type == other.src0_type && src1_type == other.src1_type && vectorized == other.vectorized;
+        return src0_type == other.src0_type && src1_type == other.src1_type && vectorized == other.vectorized &&
+               use_mmvq == other.use_mmvq;
     }
 };
 
@@ -884,6 +870,7 @@ struct ggml_webgpu_mul_mat_vec_pipeline_key_hash {
         ggml_webgpu_hash_combine(seed, key.src0_type);
         ggml_webgpu_hash_combine(seed, key.src1_type);
         ggml_webgpu_hash_combine(seed, key.vectorized);
+        ggml_webgpu_hash_combine(seed, key.use_mmvq);
         return seed;
     }
 };
@@ -894,6 +881,20 @@ struct ggml_webgpu_mul_mat_vec_shader_decisions {
     uint32_t vec_size;
 };
 
+struct ggml_webgpu_quantize_q8_pipeline_key {
+    ggml_type src0_type;
+
+    bool operator==(const ggml_webgpu_quantize_q8_pipeline_key & other) const { return src0_type == other.src0_type; }
+};
+
+struct ggml_webgpu_quantize_q8_pipeline_key_hash {
+    size_t operator()(const ggml_webgpu_quantize_q8_pipeline_key & key) const {
+        size_t seed = 0;
+        ggml_webgpu_hash_combine(seed, key.src0_type);
+        return seed;
+    }
+};
+
 struct ggml_webgpu_mul_mat_pipeline_key {
     ggml_type src0_type;
     ggml_type src1_type;
@@ -1051,6 +1052,36 @@ struct ggml_webgpu_soft_max_pipeline_key_hash {
     }
 };
 
+/** MMVQ **/
+
+inline bool ggml_webgpu_can_use_mmvq(const ggml_tensor * src0,
+                                     const ggml_tensor * src1,
+                                     bool                supports_dot_product,
+                                     const std::string & vendor) {
+    if (src1->ne[1] == 1) {
+        bool supports_dp4a = vendor == "amd" || vendor == "intel" || vendor == "nvidia";
+        if (supports_dp4a && supports_dot_product) {
+            switch (src1->type) {
+                case GGML_TYPE_F32:
+                    switch (src0->type) {
+                        case GGML_TYPE_Q4_0:
+                        case GGML_TYPE_Q4_1:
+                        case GGML_TYPE_Q8_0:
+                        case GGML_TYPE_Q2_K:
+                        case GGML_TYPE_Q4_K:
+                            return src0->ne[0] % 4 == 0;
+                        default:
+                            break;
+                    }
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    return false;
+}
+
 class ggml_webgpu_shader_lib {
     wgpu::Device           device;
     pre_wgsl::Preprocessor preprocessor;
@@ -1099,14 +1130,12 @@ class ggml_webgpu_shader_lib {
                        webgpu_pipeline,
                        ggml_webgpu_flash_attn_blk_pipeline_key_hash>
         flash_attn_blk_pipelines;
-    std::unordered_map<ggml_webgpu_legacy_mul_mat_pipeline_key,
-                       webgpu_pipeline,
-                       ggml_webgpu_legacy_mul_mat_pipeline_key_hash>
-        mul_mat_legacy_pipelines;  // legacy mul_mat (non-subgroup/non-regtile/non-vec)
     std::unordered_map<ggml_webgpu_mul_mat_vec_pipeline_key, webgpu_pipeline, ggml_webgpu_mul_mat_vec_pipeline_key_hash>
         mul_mat_vec_pipelines;     // fast mat-vec (n==1)
     std::unordered_map<ggml_webgpu_mul_mat_pipeline_key, webgpu_pipeline, ggml_webgpu_mul_mat_pipeline_key_hash>
                                              mul_mat_fast_pipelines;       // fast mat-mat (reg-tile or subgroup)
+    std::unordered_map<ggml_webgpu_quantize_q8_pipeline_key, webgpu_pipeline, ggml_webgpu_quantize_q8_pipeline_key_hash>
+                                             quantize_q8_pipelines;
     std::unordered_map<int, webgpu_pipeline> mul_mat_id_gather_pipelines;  // key is fixed
     std::unordered_map<ggml_webgpu_mul_mat_id_pipeline_key, webgpu_pipeline, ggml_webgpu_mul_mat_id_pipeline_key_hash>
         mul_mat_id_pipelines;                                              // src0_type/src1_type
@@ -1631,7 +1660,7 @@ class ggml_webgpu_shader_lib {
         key.type                              = context.dst->type;
         key.d_state                           = (int) context.src0->ne[0];
         key.xbc_overlap                       = ggml_webgpu_tensor_overlap(context.src1, context.src4) &&
-                          ggml_webgpu_tensor_overlap(context.src1, context.src5);
+                                                ggml_webgpu_tensor_overlap(context.src1, context.src5);
 
         auto it = ssm_scan_pipelines.find(key);
         if (it != ssm_scan_pipelines.end()) {
@@ -1744,6 +1773,44 @@ class ggml_webgpu_shader_lib {
         return pad_pipelines[key];
     }
 
+    webgpu_pipeline get_quantize_q8_pipeline(const ggml_webgpu_shader_lib_context & context) {
+        ggml_webgpu_quantize_q8_pipeline_key key = {};
+        key.src0_type                            = context.src0->type;
+
+        auto it = quantize_q8_pipelines.find(key);
+        if (it != quantize_q8_pipelines.end()) {
+            return it->second;
+        }
+        const char *             shader_src = wgsl_quantize_q8;
+        std::vector<std::string> defines;
+        std::string              variant = "quantize_q8";
+
+        uint32_t wg_size = WEBGPU_MUL_MAT_VEC_WG_SIZE;
+
+        defines.push_back("SRC1_INNER_TYPE=f32");
+        defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
+
+        const struct ggml_type_traits * src0_traits = ggml_get_type_traits(context.src0->type);
+        std::string                     src0_name   = src0_traits->type_name;
+        std::string                     type_upper  = src0_name;
+        variant += "_" + src0_name;
+        std::transform(type_upper.begin(), type_upper.end(), type_upper.begin(), ::toupper);
+
+        defines.push_back("MUL_ACC_" + type_upper);
+        defines.push_back("Q8_1_T");
+
+        defines.push_back(context.supports_subgroups ? "USE_SUBGROUP_REDUCTION" : "USE_WORKGROUP_REDUCTION");
+        variant += context.supports_subgroups ? "_sg_reduce" : "_wg_reduce";
+
+        auto processed             = preprocessor.preprocess(shader_src, defines);
+        auto decisions             = std::make_shared<ggml_webgpu_generic_shader_decisions>();
+        decisions->wg_size         = wg_size;
+        webgpu_pipeline pipeline   = ggml_webgpu_create_pipeline(device, processed, variant);
+        pipeline.context           = decisions;
+        quantize_q8_pipelines[key] = pipeline;
+        return quantize_q8_pipelines[key];
+    }
+
     webgpu_pipeline get_mul_mat_vec_pipeline(const ggml_webgpu_shader_lib_context & context) {
         ggml_webgpu_mul_mat_vec_pipeline_key key = {};
         key.src0_type                            = context.src0->type;
@@ -1752,6 +1819,8 @@ class ggml_webgpu_shader_lib {
                           (context.src0->type == GGML_TYPE_F32 || context.src0->type == GGML_TYPE_F16)) ?
                                                        1 :
                                                        0;
+        key.use_mmvq                             =
+            ggml_webgpu_can_use_mmvq(context.src0, context.src1, context.supports_dot_product, context.vendor);
 
         auto it = mul_mat_vec_pipelines.find(key);
         if (it != mul_mat_vec_pipelines.end()) {
@@ -1788,6 +1857,19 @@ class ggml_webgpu_shader_lib {
                     defines.push_back("U32_DEQUANT_HELPERS");
                     defines.push_back("SRC0_INNER_TYPE=u32");
                     switch (context.src0->type) {
+                        case GGML_TYPE_Q8_0:
+                        case GGML_TYPE_Q4_0:
+                        case GGML_TYPE_Q4_1:
+                            if (key.use_mmvq) {
+                                defines.push_back("LEGACY_QUANTS");
+                            }
+                            break;
+                        case GGML_TYPE_Q2_K:
+                        case GGML_TYPE_Q4_K:
+                            if (key.use_mmvq) {
+                                defines.push_back("K_QUANTS");
+                            }
+                            break;
                         case GGML_TYPE_IQ1_S:
                         case GGML_TYPE_IQ1_M:
                         case GGML_TYPE_IQ2_S:
@@ -1840,6 +1922,11 @@ class ggml_webgpu_shader_lib {
             outputs_per_wg = WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG;
         }
 
+        if (key.use_mmvq) {
+            defines.push_back("MMVQ");
+            defines.push_back("Q8_1_T");
+        }
+
         defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
         defines.push_back(std::string("OUTPUTS_PER_WG=") + std::to_string(outputs_per_wg));
         defines.push_back(context.supports_subgroups ? "USE_SUBGROUP_REDUCTION" : "USE_WORKGROUP_REDUCTION");
@@ -2018,100 +2105,6 @@ class ggml_webgpu_shader_lib {
         return mul_mat_fast_pipelines[key];
     }
 
-    webgpu_pipeline get_mul_mat_legacy_pipeline(const ggml_webgpu_shader_lib_context & context) {
-        ggml_webgpu_legacy_mul_mat_pipeline_key key = {};
-        key.src0_type                               = context.src0->type;
-        key.src1_type                               = context.src1->type;
-
-        auto it = mul_mat_legacy_pipelines.find(key);
-        if (it != mul_mat_legacy_pipelines.end()) {
-            return it->second;
-        }
-
-        std::vector<std::string> defines;
-        std::string              variant = "mul_mat";
-
-        switch (context.src1->type) {
-            case GGML_TYPE_F32:
-                defines.push_back("SRC1_TYPE=f32");
-                variant += "_f32";
-                break;
-            case GGML_TYPE_F16:
-                defines.push_back("SRC1_TYPE=f16");
-                variant += "_f16";
-                break;
-            default:
-                GGML_ABORT("Unsupported src1 type for mul_mat legacy shader");
-        }
-
-        const struct ggml_type_traits * src0_traits = ggml_get_type_traits(context.src0->type);
-        const char *                    src0_name   = src0_traits->type_name;
-
-        switch (context.src0->type) {
-            case GGML_TYPE_F32:
-                defines.push_back("SRC0_TYPE=f32");
-                defines.push_back("FLOAT");
-                variant += "_f32";
-                break;
-            case GGML_TYPE_F16:
-                defines.push_back("SRC0_TYPE=f16");
-                defines.push_back("FLOAT");
-                variant += "_f16";
-                break;
-            default:
-                {
-                    std::string type_upper = src0_name;
-                    std::transform(type_upper.begin(), type_upper.end(), type_upper.begin(), ::toupper);
-
-                    switch (context.src0->type) {
-                        case GGML_TYPE_Q4_0:
-                        case GGML_TYPE_Q5_0:
-                        case GGML_TYPE_Q8_0:
-                        case GGML_TYPE_Q3_K:
-                        case GGML_TYPE_Q6_K:
-                        case GGML_TYPE_IQ2_XXS:
-                        case GGML_TYPE_IQ2_XS:
-                        case GGML_TYPE_IQ2_S:
-                        case GGML_TYPE_IQ3_XXS:
-                        case GGML_TYPE_IQ3_S:
-                        case GGML_TYPE_IQ1_S:
-                        case GGML_TYPE_IQ4_NL:
-                        case GGML_TYPE_MXFP4:
-                            {
-                                // Quantized types using u32 buffers for portability.
-                                defines.push_back("SRC0_TYPE=u32");
-                                defines.push_back("U32_DEQUANT_HELPERS");
-                                break;
-                            }
-                        default:
-                            {
-                                defines.push_back(std::string("SRC0_TYPE=") + src0_name);
-                            }
-                    }
-
-                    defines.push_back("BYTE_HELPERS");
-                    defines.push_back(type_upper + "_T");
-                    defines.push_back(type_upper);
-                    defines.push_back(type_upper + "_SCALE_MIN");
-                    defines.push_back(type_upper + "_TABLES");
-                    defines.push_back(type_upper + "_GRID");
-
-                    variant += std::string("_") + src0_name;
-                    break;
-                }
-        }
-
-        auto processed = preprocessor.preprocess(wgsl_mul_mat, defines);
-
-        auto decisions     = std::make_shared<ggml_webgpu_generic_shader_decisions>();
-        decisions->wg_size = WEBGPU_MUL_MAT_WG_SIZE;
-
-        webgpu_pipeline pipeline      = ggml_webgpu_create_pipeline(device, processed, variant);
-        pipeline.context              = decisions;
-        mul_mat_legacy_pipelines[key] = pipeline;
-        return mul_mat_legacy_pipelines[key];
-    }
-
     webgpu_pipeline get_mul_mat_id_gather_pipeline(const ggml_webgpu_shader_lib_context & context) {
         auto it = mul_mat_id_gather_pipelines.find(1);
         if (it != mul_mat_id_gather_pipelines.end()) {
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 921c12b41ac..1846886db4e 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -94,14 +94,6 @@ static inline uint32_t ggml_webgpu_u32_from_f32(float value) {
 #define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES     4
 #define WEBGPU_STORAGE_BUF_BINDING_MULT          4    // a storage buffer binding size must be a multiple of 4
 
-// For operations which process a row in parallel, this seems like a reasonable
-// default
-#define WEBGPU_ROW_SPLIT_WG_SIZE 64
-
-// Track https://github.com/gpuweb/gpuweb/issues/5315 for fixes to
-// implementations so this can be removed, necessary only for get_rows right now
-#define WEBGPU_MAX_WG_SIZE 288
-
 /* End Constants */
 
 // This is a "fake" base pointer, since WebGPU buffers do not have pointers to
@@ -181,6 +173,7 @@ struct webgpu_capabilities {
     wgpu::Limits limits;
     bool         supports_subgroups       = false;
     bool         supports_subgroup_matrix = false;
+    bool         supports_dot_product     = false;
 
     uint32_t sg_mat_m = 0;
     uint32_t sg_mat_n = 0;
@@ -210,6 +203,8 @@ struct webgpu_global_context_struct {
     wgpu::Buffer    memset_params_buf;
     webgpu_pipeline memset_pipeline;
 
+    std::string vendor;
+
     // TODO: We should rework the CPU profiling time handling to make it more useful. ref: https://github.com/ggml-org/llama.cpp/pull/22050
 #ifdef GGML_WEBGPU_CPU_PROFILE
     // Profiling: labeled CPU time in ms (total)
@@ -259,6 +254,7 @@ struct webgpu_context_struct {
     wgpu::Buffer             set_rows_host_error_buf;
     wgpu::CommandEncoder     active_command_encoder;
     wgpu::ComputePassEncoder active_compute_pass;
+    bool                     batch_compute_passes = true;
 
     size_t memset_bytes_per_thread;
 
@@ -590,9 +586,18 @@ static webgpu_encoded_op ggml_backend_webgpu_build_multi(webgpu_context &
     }
 #else
     for (size_t i = 0; i < dispatches.size(); i++) {
-        ctx->active_compute_pass.SetPipeline(dispatches[i].pipeline.pipeline);
-        ctx->active_compute_pass.SetBindGroup(0, bind_groups[i]);
-        ctx->active_compute_pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second, 1);
+        if (ctx->batch_compute_passes) {
+            ctx->active_compute_pass.SetPipeline(dispatches[i].pipeline.pipeline);
+            ctx->active_compute_pass.SetBindGroup(0, bind_groups[i]);
+            ctx->active_compute_pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second,
+                                                        1);
+        } else {
+            wgpu::ComputePassEncoder pass = ctx->active_command_encoder.BeginComputePass();
+            pass.SetPipeline(dispatches[i].pipeline.pipeline);
+            pass.SetBindGroup(0, bind_groups[i]);
+            pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second, 1);
+            pass.End();
+        }
     }
 #endif
 
@@ -618,7 +623,7 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_global_context & ctx,
                                               size_t                  size) {
     std::vector<uint32_t>             params       = { (uint32_t) offset, (uint32_t) size, value };
     std::vector<wgpu::BindGroupEntry> entries      = { ggml_webgpu_make_bind_group_entry(0, buf, 0, buf.GetSize()) };
-    size_t                            bytes_per_wg = WEBGPU_MAX_WG_SIZE * ctx->capabilities.memset_bytes_per_thread;
+    size_t                            bytes_per_wg = ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup * ctx->capabilities.memset_bytes_per_thread;
     uint32_t                          wg_x         = CEIL_DIV(size + 3, bytes_per_wg);
 
     ctx->queue.WriteBuffer(ctx->memset_params_buf, 0, params.data(), params.size() * sizeof(uint32_t));
@@ -736,8 +741,11 @@ static webgpu_encoded_op ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src
         ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, dst),
     };
 
-    uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
+    uint32_t wg_x;
+    uint32_t wg_y;
+    uint32_t total_wg = CEIL_DIV(ne, decisions->wg_size);
+    compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
 }
 
 static webgpu_encoded_op ggml_webgpu_set(webgpu_context & ctx,
@@ -961,9 +969,10 @@ static webgpu_encoded_op ggml_webgpu_conv_2d(webgpu_context & ctx,
 
     auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
 
+    uint32_t wg_x;
+    uint32_t wg_y;
     uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size);
-    uint32_t wg_x     = std::min(ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, total_wg);
-    uint32_t wg_y     = CEIL_DIV(total_wg, wg_x);
+    compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y);
 
     return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
 }
@@ -1051,9 +1060,10 @@ static webgpu_encoded_op ggml_webgpu_im2col(webgpu_context & ctx,
 
     auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
 
+    uint32_t wg_x;
+    uint32_t wg_y;
     uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size);
-    uint32_t wg_x     = std::min(ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, total_wg);
-    uint32_t wg_y     = CEIL_DIV(total_wg, wg_x);
+    compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y);
 
     return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
 }
@@ -1348,7 +1358,7 @@ static webgpu_encoded_op ggml_webgpu_get_rows(webgpu_context & ctx,
     shader_lib_ctx.src0                           = src;
     shader_lib_ctx.src1                           = nullptr;
     shader_lib_ctx.dst                            = dst;
-    shader_lib_ctx.max_wg_size                    = WEBGPU_MAX_WG_SIZE;
+    shader_lib_ctx.max_wg_size                    = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
 
     webgpu_pipeline pipeline  = ctx->shader_lib->get_get_rows_pipeline(shader_lib_ctx);
     auto *          decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
@@ -1384,6 +1394,58 @@ static webgpu_encoded_op ggml_webgpu_get_rows(webgpu_context & ctx,
     return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
 }
 
+static void ggml_webgpu_quantize_q8_dispatch(webgpu_context &                    ctx,
+                                             ggml_tensor *                       src0,
+                                             ggml_tensor *                       src1,
+                                             ggml_tensor *                       dst,
+                                             std::vector<webgpu_dispatch_desc> & dispatches) {
+    ggml_webgpu_shader_lib_context shader_lib_ctx = {};
+
+    shader_lib_ctx.src0               = src0;
+    shader_lib_ctx.src1               = src1;
+    shader_lib_ctx.dst                = dst;
+    shader_lib_ctx.max_wg_size        = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
+    shader_lib_ctx.supports_subgroups = ctx->global_ctx->capabilities.supports_subgroups;
+
+    webgpu_pipeline qq8_pipeline = ctx->shader_lib->get_quantize_q8_pipeline(shader_lib_ctx);
+
+    // quantize_q8 pipeline
+    const size_t dst_offset           = ggml_webgpu_tensor_offset(dst);
+    const size_t q8_src1_align_offset = ROUNDUP_POW2(
+        dst_offset + ggml_nbytes(dst), ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment);
+    const size_t q8_src1_binding_size =
+        ROUNDUP_POW2(src1->ne[3] * src1->ne[2] * (36 /* sizeof(q8_1) */ * (src1->ne[0] / /* block_size */ 32)),
+                     WEBGPU_STORAGE_BUF_BINDING_MULT);
+
+    std::vector<uint32_t> q8_params = {
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
+        (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
+        (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)),
+        (uint32_t) src1->ne[0],
+        (uint32_t) src1->ne[2],
+        (uint32_t) src1->ne[3],
+    };
+
+    std::vector<wgpu::BindGroupEntry> q8_entries = {
+        ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src1),
+        ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(dst), q8_src1_align_offset, q8_src1_binding_size)
+    };
+
+    auto q8_decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(qq8_pipeline.context.get());
+
+    uint32_t       q8_wg_size     = q8_decisions->wg_size;
+    uint32_t       q8_wg_x        = 1;
+    uint32_t       q8_wg_y        = 1;
+    const uint32_t wg_per_vec     = (src0->ne[0] / 4 + (q8_wg_size - 1)) / q8_wg_size;
+    const uint32_t q8_total_wg    = src1->ne[2] * src1->ne[3] * wg_per_vec;
+    const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
+    compute_2d_workgroups(q8_total_wg, max_wg_per_dim, q8_wg_x, q8_wg_y);
+
+    dispatches.push_back({
+        qq8_pipeline, std::move(q8_params), std::move(q8_entries), { q8_wg_x, q8_wg_y }
+    });
+}
+
 static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
                                              ggml_tensor *    src0,
                                              ggml_tensor *    src1,
@@ -1391,47 +1453,9 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
     // Determine if this is a mat-vec operation
     bool is_vec = (dst->ne[1] == 1);
 
-    // Determine if we should use fast path
-    bool use_fast = false;
-    switch (src1->type) {
-        case GGML_TYPE_F16:
-            use_fast = (src0->type == GGML_TYPE_F16);
-            break;
-        case GGML_TYPE_F32:
-            // TODO: implement better mat-mat for k-quants, mat-vec for all k-quants except q6_K
-            switch (src0->type) {
-                case GGML_TYPE_F32:
-                case GGML_TYPE_F16:
-                case GGML_TYPE_Q4_0:
-                case GGML_TYPE_Q4_1:
-                case GGML_TYPE_Q5_0:
-                case GGML_TYPE_Q5_1:
-                case GGML_TYPE_Q8_0:
-                case GGML_TYPE_Q6_K:
-                case GGML_TYPE_Q4_K:
-                case GGML_TYPE_Q5_K:
-                case GGML_TYPE_Q3_K:
-                case GGML_TYPE_Q2_K:
-                case GGML_TYPE_Q1_0:
-                case GGML_TYPE_IQ1_S:
-                case GGML_TYPE_IQ1_M:
-                case GGML_TYPE_IQ2_XXS:
-                case GGML_TYPE_IQ2_XS:
-                case GGML_TYPE_IQ2_S:
-                case GGML_TYPE_IQ3_XXS:
-                case GGML_TYPE_IQ3_S:
-                case GGML_TYPE_IQ4_NL:
-                case GGML_TYPE_IQ4_XS:
-                case GGML_TYPE_MXFP4:
-                    use_fast = true;
-                    break;
-                default:
-                    break;
-            }
-            break;
-        default:
-            break;
-    }
+    // use MMVQ path for mat-vec
+    bool use_mmvq = ggml_webgpu_can_use_mmvq(src0, src1, ctx->global_ctx->capabilities.supports_dot_product,
+                                             ctx->global_ctx->vendor);
 
     ggml_webgpu_shader_lib_context shader_lib_ctx = {};
 
@@ -1446,16 +1470,20 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
     shader_lib_ctx.sg_mat_k                 = ctx->global_ctx->capabilities.sg_mat_k;
     shader_lib_ctx.min_subgroup_size        = ctx->global_ctx->capabilities.min_subgroup_size;
     shader_lib_ctx.max_subgroup_size        = ctx->global_ctx->capabilities.max_subgroup_size;
+    shader_lib_ctx.supports_dot_product     = ctx->global_ctx->capabilities.supports_dot_product;
+    shader_lib_ctx.vendor                   = ctx->global_ctx->vendor;
 
     // Get or create pipeline
-    webgpu_pipeline pipeline;
+    webgpu_pipeline                   pipeline;
+    std::vector<webgpu_dispatch_desc> dispatches;
 
-    if (use_fast && is_vec) {
+    if (is_vec) {
+        if (use_mmvq) {
+            ggml_webgpu_quantize_q8_dispatch(ctx, src0, src1, dst, dispatches);
+        }
         pipeline = ctx->shader_lib->get_mul_mat_vec_pipeline(shader_lib_ctx);
-    } else if (use_fast) {
-        pipeline = ctx->shader_lib->get_mul_mat_fast_pipeline(shader_lib_ctx);
     } else {
-        pipeline = ctx->shader_lib->get_mul_mat_legacy_pipeline(shader_lib_ctx);
+        pipeline = ctx->shader_lib->get_mul_mat_fast_pipeline(shader_lib_ctx);
     }
 
     // Build params
@@ -1479,25 +1507,31 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
     };
 
     // Build bind group entries
-    std::vector<wgpu::BindGroupEntry> entries = {
-        ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src0),
-        ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, src1),
-        ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst),
-    };
+    std::vector<wgpu::BindGroupEntry> entries = {};
+
+    entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src0));
+    if (use_mmvq) {
+        auto & mmvq_qq8_entry = dispatches[0].bind_group_entries[1];
+        entries.push_back(ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(dst), mmvq_qq8_entry.offset,
+                                                            mmvq_qq8_entry.size));
+    } else {
+        entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, src1));
+    }
+    entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst));
 
     // Calculate workgroup dimensions
     uint32_t       wg_x           = 1;
     uint32_t       wg_y           = 1;
     const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
 
-    if (use_fast && is_vec) {
+    if (is_vec) {
         auto * decisions = static_cast<ggml_webgpu_mul_mat_vec_shader_decisions *>(pipeline.context.get());
 
         uint32_t batches       = dst->ne[2] * dst->ne[3];
         uint32_t output_groups = CEIL_DIV(dst->ne[0], decisions->outputs_per_wg);
         uint32_t total_wg      = output_groups * batches;
         compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
-    } else if (use_fast) {
+    } else {
         auto * decisions = static_cast<ggml_webgpu_mul_mat_shader_decisions *>(pipeline.context.get());
 
         // Fast-path tiled/subgroup calculations
@@ -1518,15 +1552,13 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
         }
         uint32_t total_wg = wg_m * wg_n * dst->ne[2] * dst->ne[3];
         compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
-
-    } else {  // legacy
-        auto *   decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
-        uint32_t wg_size   = decisions->wg_size;
-        uint32_t total_wg  = CEIL_DIV(dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3], wg_size);
-        compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
     }
 
-    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
+    dispatches.push_back({
+        pipeline, std::move(params), std::move(entries), { wg_x, wg_y }
+    });
+
+    return ggml_backend_webgpu_build_multi(ctx, dispatches);
 }
 
 static webgpu_encoded_op ggml_webgpu_mul_mat_id_vec(webgpu_context & ctx,
@@ -1654,14 +1686,11 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx,
                                           gathered_count_ids_binding_size),
     };
 
-    const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
-
-    const uint32_t gather_total_wg = param_n_expert;
-    const uint32_t gather_wg_x     = std::min(gather_total_wg, max_wg_per_dim);
-    const uint32_t gather_wg_y     = CEIL_DIV(gather_total_wg, gather_wg_x);
+    // n_expert is much less than maxComputeWorkgroupsPerDimension (e.g., n_exeprt=256 at Qwen3.5-35B-A3B)
+    const uint32_t gather_wg_x = param_n_expert;
 
     dispatches.push_back({
-        gather_pipeline, std::move(gather_params), std::move(gather_entries), { gather_wg_x, gather_wg_y }
+        gather_pipeline, std::move(gather_params), std::move(gather_entries), { gather_wg_x, 1 }
     });
 
     // params for mul_mat_id.wgsl
@@ -1713,7 +1742,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx,
     uint32_t max_wg_n           = CEIL_DIV(total_gathered, tile_n_s) + max_active_experts;
     uint32_t total_wg           = wg_m * max_wg_n;
 
-    compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
+    compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y);
 
     dispatches.push_back({
         main_pipeline, std::move(main_params), std::move(main_entries), { wg_x, wg_y }
@@ -1956,10 +1985,10 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
     std::vector<wgpu::BindGroupEntry> reduce_entries;
     if (use_vec_reduce) {
         const uint32_t reduce_sg_size = ctx->global_ctx->capabilities.max_subgroup_size;
-        const uint32_t reduce_wg_size =
-            std::max(reduce_sg_size, (uint32_t) std::min<uint64_t>(
-                                         (uint64_t) nwg * reduce_sg_size,
-                                         ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup));
+        const uint32_t reduce_wg_size = std::max(
+            reduce_sg_size,
+            (uint32_t) std::min<uint64_t>((uint64_t) nwg * reduce_sg_size,
+                                          ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup));
         ggml_webgpu_shader_lib_context reduce_shader_ctx = shader_lib_ctx;
         reduce_shader_ctx.max_wg_size                    = reduce_wg_size;
         reduce_pipeline = ctx->shader_lib->get_flash_attn_vec_reduce_pipeline(reduce_shader_ctx);
@@ -2736,10 +2765,12 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, ggml_tensor *
         block_size,  npr,         nrows
     };
 
-    const uint32_t                    total_wg_init = npr * nrows;
-    const uint32_t                    max_wg    = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
-    const uint32_t                    wg_x_init = std::min(total_wg_init, max_wg);
-    const uint32_t                    wg_y_init = CEIL_DIV(total_wg_init, wg_x_init);
+    uint32_t       wg_x_init;
+    uint32_t       wg_y_init;
+    const uint32_t total_wg_init  = npr * nrows;
+    const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
+    compute_2d_workgroups(total_wg_init, max_wg_per_dim, wg_x_init, wg_y_init);
+
     std::vector<wgpu::BindGroupEntry> init_entries = {
         ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src),
         ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(dst), init_align_offset, init_binding_size)
@@ -2796,9 +2827,11 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, ggml_tensor *
             ggml_webgpu_make_bind_group_entry(2, ggml_webgpu_tensor_buf(dst), align_out, size_out)
         };
 
+        uint32_t       wg_x_merge;
+        uint32_t       wg_y_merge;
         const uint32_t total_wg_merge = nm * nrows;
-        const uint32_t wg_x_merge     = std::min(total_wg_merge, max_wg);
-        const uint32_t wg_y_merge     = CEIL_DIV(total_wg_merge, wg_x_merge);
+        compute_2d_workgroups(total_wg_merge, max_wg_per_dim, wg_x_merge, wg_y_merge);
+
         dispatches.push_back({
             argsort_merge_pipeline, std::move(merge_params), std::move(merge_entries), { wg_x_merge, wg_y_merge }
         });
@@ -2918,9 +2951,12 @@ static webgpu_encoded_op ggml_webgpu_upscale(webgpu_context ctx, ggml_tensor * s
 
     webgpu_pipeline pipeline  = ctx->shader_lib->get_upscale_pipeline(shader_lib_ctx);
     auto *          decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
-    uint32_t        total_wg  = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size);
-    uint32_t        wg_x = std::min(ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, total_wg);
-    uint32_t        wg_y = CEIL_DIV(total_wg, wg_x);
+
+    uint32_t wg_x;
+    uint32_t wg_y;
+    uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size);
+    compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y);
+
     return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
 }
 
@@ -3110,18 +3146,16 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
     uint32_t num_batched_kernels  = 0;
     uint32_t num_inflight_batches = 0;
     bool     contains_set_rows    = false;
-    bool     batch_compute_passes = true;
     int      num_encoded_ops      = 1;
     int      node_idx             = 0;
 
 #ifdef GGML_WEBGPU_GPU_PROFILE
     ctx->profile_timestamp_query_count = 0;
-    batch_compute_passes               = false;
     std::vector<std::string> profile_pipeline_names;
 #endif
 
     ctx->active_command_encoder = ctx->global_ctx->device.CreateCommandEncoder();
-    if (batch_compute_passes) {
+    if (ctx->batch_compute_passes) {
         ctx->active_compute_pass = ctx->active_command_encoder.BeginComputePass();
     }
 
@@ -3148,7 +3182,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
 
             // reset state for next batch
             ctx->active_command_encoder = ctx->global_ctx->device.CreateCommandEncoder();
-            if (batch_compute_passes) {
+            if (ctx->batch_compute_passes) {
                 ctx->active_compute_pass = ctx->active_command_encoder.BeginComputePass();
             }
             ctx->param_arena.reset();
@@ -3548,8 +3582,8 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer
                         const uint32_t kv_tile = decisions.kv_tile;
 
                         const uint32_t vec_nwg_cap = ctx->webgpu_global_ctx->capabilities.min_subgroup_size;
-                        uint32_t       nwg     = 1u;
-                        const uint64_t kv_span = (uint64_t) std::max(1u, kv_tile);
+                        uint32_t       nwg         = 1u;
+                        const uint64_t kv_span     = (uint64_t) std::max(1u, kv_tile);
                         while ((2u * nwg * kv_span) < (uint64_t) K->ne[1] && nwg < vec_nwg_cap) {
                             nwg <<= 1;
                         }
@@ -3582,6 +3616,22 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer
                 }
             }
             break;
+        case GGML_OP_MUL_MAT:
+            {
+                const ggml_tensor * src0 = tensor->src[0];
+                const ggml_tensor * src1 = tensor->src[1];
+                bool                use_mmvq =
+                    ggml_webgpu_can_use_mmvq(src0, src1, ctx->webgpu_global_ctx->capabilities.supports_dot_product,
+                                             ctx->webgpu_global_ctx->vendor);
+                if (use_mmvq) {
+                    const size_t q8_src1_size =
+                        src1->ne[3] * src1->ne[2] * (36 /* sizeof(q8_1) */ * (src1->ne[0] / /* block_size */ 32));
+                    res = ROUNDUP_POW2(res + q8_src1_size +
+                                           ctx->webgpu_global_ctx->capabilities.limits.minStorageBufferOffsetAlignment,
+                                       WEBGPU_STORAGE_BUF_BINDING_MULT);
+                }
+            }
+            break;
         case GGML_OP_MUL_MAT_ID:
             {
                 const ggml_tensor * src0 = tensor->src[0];
@@ -3658,13 +3708,13 @@ static ggml_guid_t ggml_backend_webgpu_guid(void) {
 
 static void ggml_webgpu_init_memset_pipeline(webgpu_global_context & ctx) {
     // we use the maximum workgroup size for the memset pipeline
-    size_t max_threads = WEBGPU_MAX_WG_SIZE * ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
+    size_t max_threads = ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup * ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
     // Size the bytes_per_thread so that the largest buffer size can be handled
     ctx->capabilities.memset_bytes_per_thread =
         CEIL_DIV(ctx->capabilities.limits.maxStorageBufferBindingSize, max_threads);
     std::vector<wgpu::ConstantEntry> constants(2);
     constants[0].key     = "wg_size";
-    constants[0].value   = WEBGPU_MAX_WG_SIZE;
+    constants[0].value   = ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
     constants[1].key     = "bytes_per_thread";
     constants[1].value   = ctx->capabilities.memset_bytes_per_thread;
     ctx->memset_pipeline = ggml_webgpu_create_pipeline(ctx->device, wgsl_memset, "memset", constants);
@@ -3707,12 +3757,16 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
     ctx->webgpu_global_ctx->adapter.GetInfo(&info);
     ctx->webgpu_global_ctx->command_submit_batch_size = ggml_backend_webgpu_get_command_submit_batch_size();
     ctx->webgpu_global_ctx->max_inflight_batches      = ggml_backend_webgpu_get_max_inflight_batches();
+    ctx->webgpu_global_ctx->vendor                    = info.vendor;
     wgpu::SupportedFeatures features;
     ctx->webgpu_global_ctx->adapter.GetFeatures(&features);
     // we require f16 support
     GGML_ASSERT(ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::ShaderF16));
     ctx->webgpu_global_ctx->capabilities.supports_subgroups =
         ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::Subgroups);
+    // for dot4I8packed
+    ctx->webgpu_global_ctx->capabilities.supports_dot_product = ctx->webgpu_global_ctx->instance.HasWGSLLanguageFeature(
+        wgpu::WGSLLanguageFeatureName::Packed4x8IntegerDotProduct);
 
     bool valid_subgroup_matrix_config = false;
 #ifndef __EMSCRIPTEN__
@@ -3839,6 +3893,7 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) {
                               wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "set_rows_host_error_buf");
 
 #ifdef GGML_WEBGPU_GPU_PROFILE
+    webgpu_ctx->batch_compute_passes = false;
     ggml_webgpu_create_buffer(
         webgpu_ctx->global_ctx->device, webgpu_ctx->profile_timestamp_dev_buf, WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES,
         wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc, "profile_timestamp_dev_buf");
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl b/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl
index 372ea79bf9d..758efa17d77 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl
@@ -95,11 +95,10 @@ struct q5_1 {
 };
 #endif
 
-
 #ifdef Q8_1_T
 struct q8_1 {
     d: f16,
-    m: f16,
+    s: f16, // d * sum(qs[i])
     qs: array<u32, 8>
 };
 #endif
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl
index fa3bdf4e393..e268adfb16b 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl
@@ -49,12 +49,14 @@ struct Params{
 var<uniform> params: Params;
 
 @compute @workgroup_size(WG_SIZE)
-fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    if (gid.x >= params.ne) {
+fn main(
+    @builtin(global_invocation_index) gindex: u32,
+) {
+    if (gindex >= params.ne) {
         return;
     }
 
-    var i = gid.x;
+    var i = gindex;
     let i3 = i / (params.src_ne2 * params.src_ne1 * params.src_ne0);
     i = i % (params.src_ne2 * params.src_ne1 * params.src_ne0);
     let i2 = i / (params.src_ne1 * params.src_ne0);
@@ -62,7 +64,7 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
     let i1 = i / params.src_ne0;
     let i0 = i % params.src_ne0;
 
-    var j = gid.x;
+    var j = gindex;
     let j3 = j / (params.dst_ne2 * params.dst_ne1 * params.dst_ne0);
     j = j % (params.dst_ne2 * params.dst_ne1 * params.dst_ne0);
     let j2 = j / (params.dst_ne1 * params.dst_ne0);
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl
index ae8036b9ac5..4133f0ab564 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl
@@ -122,9 +122,9 @@ const V_CHUNKS: u32 = HEAD_DIM_V / 4u;
 const SCORE_REGS_PER_LANE: u32 = (KV_TILE + MIN_SUBGROUP_SIZE - 1u) / MIN_SUBGROUP_SIZE;
 const OUT_REGS_PER_LANE: u32 = (V_CHUNKS + MIN_SUBGROUP_SIZE - 1u) / MIN_SUBGROUP_SIZE;
 
-var<workgroup> q_shmem: array<f32, Q_TILE * HEAD_DIM_QK>;
-var<workgroup> kv_shmem: array<f32, KV_TILE * KV_STAGE_STRIDE>;
-var<workgroup> p_shmem: array<f32, Q_TILE * KV_TILE>;
+var<workgroup> q_shmem: array<Q_TYPE, Q_TILE * HEAD_DIM_QK>;
+var<workgroup> kv_shmem: array<KV_TYPE, KV_TILE * KV_STAGE_STRIDE>;
+var<workgroup> p_shmem: array<KV_TYPE, Q_TILE * KV_TILE>;
 
 @compute @workgroup_size(WG_SIZE)
 fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
@@ -169,10 +169,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
 
     let head = f32(head_idx);
     let slope = select(1.0,
-                       select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0),
-                              pow(params.m0, head + 1.0),
-                              head < params.n_head_log2),
-                       params.max_bias > 0.0);
+        select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0),
+                pow(params.m0, head + 1.0),
+                head < params.n_head_log2),
+        params.max_bias > 0.0);
 
     for (var elem_idx = local_id.x; elem_idx < Q_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE) {
         let q_tile_row = elem_idx / HEAD_DIM_QK;
@@ -181,7 +181,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
         let global_q_row_offset = q_head_offset + head_q_row * params.stride_q1;
         q_shmem[elem_idx] = select(
             0.0,
-            f32(Q[global_q_row_offset + q_col]) * params.scale,
+            Q_TYPE(Q[global_q_row_offset + q_col]) * params.scale,
             head_q_row < params.seq_len_q);
     }
 
@@ -213,10 +213,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
             let k_vec_index = (k_head_offset + global_k_row * params.stride_k1 + chunk * 4u) >> 2u;
             let k4 = K[k_vec_index];
             let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
-            kv_shmem[kv_off + 0u] = f32(k4.x);
-            kv_shmem[kv_off + 1u] = f32(k4.y);
-            kv_shmem[kv_off + 2u] = f32(k4.z);
-            kv_shmem[kv_off + 3u] = f32(k4.w);
+            kv_shmem[kv_off + 0u] = KV_TYPE(k4.x);
+            kv_shmem[kv_off + 1u] = KV_TYPE(k4.y);
+            kv_shmem[kv_off + 2u] = KV_TYPE(k4.z);
+            kv_shmem[kv_off + 3u] = KV_TYPE(k4.w);
         }
 
         workgroupBarrier();
@@ -233,18 +233,18 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
                 var dot_val = 0.0;
                 for (var chunk = 0u; chunk < Q_CHUNKS; chunk += 1u) {
                     let q_off = q_base + chunk * 4u;
-                    let qv = vec4<f32>(
+                    let qv = vec4<Q_TYPE>(
                         q_shmem[q_off + 0u],
                         q_shmem[q_off + 1u],
                         q_shmem[q_off + 2u],
                         q_shmem[q_off + 3u]);
                     let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
-                    let kv = vec4<f32>(
+                    let kv = vec4<KV_TYPE>(
                         kv_shmem[kv_off + 0u],
                         kv_shmem[kv_off + 1u],
                         kv_shmem[kv_off + 2u],
                         kv_shmem[kv_off + 3u]);
-                    dot_val += dot(qv, kv);
+                    dot_val += dot(vec4<f32>(qv), vec4<f32>(kv));
                 }
 #ifdef LOGIT_SOFTCAP
                 dot_val = params.logit_softcap * tanh(dot_val);
@@ -271,7 +271,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
             let kv_local = sg_inv_id + slot * subgroup_size;
             if (row_active && kv_local < kv_count) {
                 let p = exp(local_scores[slot] - new_max);
-                p_shmem[subgroup_p_offset + kv_local] = p;
+                p_shmem[subgroup_p_offset + kv_local] = KV_TYPE(p);
                 local_sum += p;
             }
         }
@@ -285,10 +285,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
             let v_vec_index = (v_head_offset + global_v_row * params.stride_v1 + chunk * 4u) >> 2u;
             let v4 = V[v_vec_index];
             let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
-            kv_shmem[kv_off + 0u] = f32(v4.x);
-            kv_shmem[kv_off + 1u] = f32(v4.y);
-            kv_shmem[kv_off + 2u] = f32(v4.z);
-            kv_shmem[kv_off + 3u] = f32(v4.w);
+            kv_shmem[kv_off + 0u] = KV_TYPE(v4.x);
+            kv_shmem[kv_off + 1u] = KV_TYPE(v4.y);
+            kv_shmem[kv_off + 2u] = KV_TYPE(v4.z);
+            kv_shmem[kv_off + 3u] = KV_TYPE(v4.w);
         }
 
         workgroupBarrier();
@@ -308,12 +308,12 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
                 for (var kv_local = 0u; kv_local < kv_count; kv_local += 1u) {
                     let p = p_shmem[subgroup_p_offset + kv_local];
                     let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
-                    let v4 = vec4<f32>(
+                    let v4 = vec4<KV_TYPE>(
                         kv_shmem[kv_off + 0u],
                         kv_shmem[kv_off + 1u],
                         kv_shmem[kv_off + 2u],
                         kv_shmem[kv_off + 3u]);
-                    acc += p * v4;
+                    acc += f32(p) * vec4<f32>(v4);
                 }
                 out_regs[reg_idx] = acc;
             }
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl
deleted file mode 100644
index fcbefdeb802..00000000000
--- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl
+++ /dev/null
@@ -1,747 +0,0 @@
-enable f16;
-
-#define DECLARE_BYTE_LOADERS_SRC0
-#include "common_decls.tmpl"
-
-
-#ifdef FLOAT
-const BLOCK_SIZE = 1u;
-
-#elif defined(Q4_0) || defined(Q4_1) || defined(Q5_0) || defined(Q5_1) || defined(Q8_0) || defined(Q8_1) || defined(IQ4_NL)
-const BLOCK_SIZE = 32u;
-
-#elif defined(Q2_K) || defined(Q3_K) || defined(Q4_K) || defined(Q5_K) || defined(Q6_K) || defined(IQ2_XXS) || defined(IQ2_XS) || defined(IQ2_S) || defined(IQ3_XXS) || defined(IQ3_S) || defined(IQ1_S) || defined(IQ1_M) || defined(IQ4_XS)
-const BLOCK_SIZE = 256u;
-#endif
-
-#ifdef FLOAT
-fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
-    return f32(src0[src0_idx_base + offset]) * f32(src1[src1_idx_base + offset]);
-}
-#endif
-
-#ifdef Q4_0
-fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
-    let block_byte_base = (src0_idx_base + offset) * 18; // Block stride: 18 bytes
-    let d = load_f16_as_f32_at_src0(block_byte_base);
-    var sum: f32 = 0.0;
-    for (var j: u32 = 0; j < 4; j++) {
-        let q_byte_offset = block_byte_base + 2 + j * 4;
-        let q_packed = load_u32_at_src0(q_byte_offset);
-        for (var k: u32 = 0; k < 4; k++) {
-            let q_byte = get_byte(q_packed, k);
-            let q_hi = (f32((q_byte >> 4) & 0xF) - 8.0f) * d;
-            let q_lo = (f32(q_byte & 0xF) - 8.0f) * d;
-            let src1_offset = src1_idx_base + offset * 32 + j * 4 + k;
-            sum += q_lo * f32(src1[src1_offset]);
-            sum += q_hi * f32(src1[src1_offset + 16]);
-        }
-    }
-    return sum;
-}
-#endif
-
-#ifdef Q4_1
-fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
-    let block_q4_1 = src0[src0_idx_base + offset];
-    let d = f32(block_q4_1.d);
-    let m = f32(block_q4_1.m);
-    var sum: f32 = 0.0;
-    for (var j: u32 = 0; j < 4; j++) {
-        let q_packed = block_q4_1.qs[j];
-        for (var k: u32 = 0; k < 4; k++) {
-            let q_byte = get_byte(q_packed, k);
-            let q_hi = f32((q_byte >> 4) & 0xF) * d + m;
-            let q_lo = f32(q_byte & 0xF) * d + m;
-            let src1_offset = src1_idx_base + offset * 32 + j * 4 + k;
-            sum += q_lo * f32(src1[src1_offset]);
-            sum += q_hi * f32(src1[src1_offset + 16]);
-        }
-    }
-    return sum;
-}
-#endif
-
-#ifdef Q5_0
-fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
-    let block_byte_base = (src0_idx_base + offset) * 22; // Block stride: 22 bytes
-    let d = load_f16_as_f32_at_src0(block_byte_base);
-    var sum: f32 = 0.0;
-    let qh_packed = load_u32_at_src0(block_byte_base + 2);
-    for (var j: u32 = 0; j < 4; j++) {
-        let q_byte_offset = block_byte_base + 6 + j * 4;
-        let q_packed = load_u32_at_src0(q_byte_offset);
-        for (var k: u32 = 0; k < 4; k++) {
-            let q_byte = get_byte(q_packed, k);
-            let qh_hi = (qh_packed >> (j * 4 + k + 12)) & 0x10;
-            let q_hi = (f32(((q_byte >> 4) & 0xF) | qh_hi) - 16.0) * d;
-            let qh_lo = ((qh_packed >> (j * 4 + k)) << 4) & 0x10;
-            let q_lo = (f32((q_byte & 0xF) | qh_lo) - 16.0) * d;
-            let src1_offset = src1_idx_base + offset * 32 + j * 4 + k;
-            sum += q_lo * f32(src1[src1_offset]);
-            sum += q_hi * f32(src1[src1_offset + 16]);
-        }
-    }
-    return sum;
-}
-#endif
-
-#ifdef Q5_1
-fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
-    let block_q5_1 = src0[src0_idx_base + offset];
-    let d = f32(block_q5_1.d);
-    let m = f32(block_q5_1.m);
-    var sum: f32 = 0.0;
-    for (var j: u32 = 0; j < 4; j++) {
-        let q_packed = block_q5_1.qs[j];
-        for (var k: u32 = 0; k < 4; k++) {
-            let q_byte = get_byte(q_packed, k);
-            let qh_hi = (block_q5_1.qh >> (j * 4 + k + 12)) & 0x10;
-            let q_hi = f32(((q_byte >> 4) & 0xF) | qh_hi) * d + m;
-            let qh_lo = ((block_q5_1.qh >> (j * 4 + k)) << 4) & 0x10;
-            let q_lo = f32((q_byte & 0xF) | qh_lo) * d + m;
-            let src1_offset = src1_idx_base + offset * 32 + j * 4 + k;
-            sum += q_lo * f32(src1[src1_offset]);
-            sum += q_hi * f32(src1[src1_offset + 16]);
-        }
-    }
-    return sum;
-}
-#endif
-
-#ifdef Q8_0
-fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
-    let block_byte_base = (src0_idx_base + offset) * 34; // Block stride: 34 bytes
-    let d = load_f16_as_f32_at_src0(block_byte_base);
-    var sum: f32 = 0.0;
-    for (var j: u32 = 0; j < 8; j++) {
-        let q_byte_offset = block_byte_base + 2 + j * 4;
-        let q_packed = load_u32_at_src0(q_byte_offset);
-        for (var k: u32 = 0u; k < 4u; k++) {
-            let q_byte = get_byte_i32(q_packed, k);
-            let q_val = f32(q_byte) * d;
-            let src1_offset = src1_idx_base + offset * 32 + j * 4 + k;
-            sum += q_val * f32(src1[src1_offset]);
-        }
-    }
-    return sum;
-}
-#endif
-
-#ifdef Q8_1
-fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
-    let block_q8_1 = src0[src0_idx_base + offset];
-    let d = f32(block_q8_1.d);
-    let m = f32(block_q8_1.m);
-    var sum: f32 = 0.0;
-    for (var j: u32 = 0; j < 8; j++) {
-        let q_packed = block_q8_1.qs[j];
-        for (var k: u32 = 0; k < 4; k++) {
-            let q_byte = get_byte_i32(q_packed, k);
-            let q_val = f32(q_byte) * d + m;
-            let src1_offset = src1_idx_base + offset * 32 + j * 4 + k;
-            sum += q_val * f32(src1[src1_offset]);
-        }
-    }
-    return sum;
-}
-#endif
-
-#ifdef Q2_K
-// 16 blocks of 16 elements each
-fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
-    let block = src0[src0_idx_base + offset];
-    let d = f32(block.d);
-    let m = f32(block.dmin);
-    var sum = 0.0;
-    var src1_i = src1_idx_base + offset * 256;
-    var is: u32 = 0;
-    // 2 halves of the block (128 elements each)
-    for (var q_b_idx: u32 = 0; q_b_idx < 64; q_b_idx += 32) {
-        // 4 groups (each group has 2 blocks of 16 elements)
-        for (var shift: u32 = 0; shift < 8; shift += 2) {
-            // 2 blocks
-            for (var k: u32 = 0; k < 32; k += 16) {
-                let sc = get_byte(block.scales[is / 4], is % 4);
-                is++;
-                let dl = d * f32(sc & 0xF);
-                let ml = m * f32(sc >> 4);
-                for (var l: u32 = 0u; l < 16; l++) {
-                    let q_idx = q_b_idx + k + l;
-                    let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4);
-                    let qs_val = (q_byte >> shift) & 3;
-                    sum += (f32(qs_val) * dl - ml) * src1[src1_i];
-                    src1_i++;
-                }
-            }
-        }
-    }
-    return sum;
-}
-#endif
-
-#ifdef Q3_K
-// 16 blocks of 16 elements each
-fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
-    let block_byte_base = (src0_idx_base + offset) * 110; // Block stride: 110 bytes
-
-    // Bytes 108-109: f16 scale 'd'
-    let d = load_f16_as_f32_at_src0(block_byte_base + 108);
-
-    // extract 6-bit scales, which consist of 4-bits from first 8 bytes of scale,
-    // and 2-bits from the last 4 bytes
-    // Bytes 96-107: 12 bytes of scales (3 u32s)
-    let kmask1: u32 = 0x03030303;
-    let kmask2: u32 = 0x0f0f0f0f;
-    var scale_vals: array<u32, 4>;
-    scale_vals[0] = load_u32_at_src0(block_byte_base + 96);
-    scale_vals[1] = load_u32_at_src0(block_byte_base + 100);
-    scale_vals[2] = load_u32_at_src0(block_byte_base + 104);
-
-    var tmp: u32 = scale_vals[2];
-    scale_vals[2] = ((scale_vals[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
-    scale_vals[3] = ((scale_vals[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
-    scale_vals[0] = (scale_vals[0] & kmask2) | ((tmp & kmask1) << 4);
-    scale_vals[1] = (scale_vals[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
-
-    // Bytes 0-31: 32 bytes of hmask (8 u32s)
-    var hmask_vals: array<u32, 8>;
-    for (var i: u32 = 0; i < 8; i++) {
-        hmask_vals[i] = load_u32_at_src0(block_byte_base + i * 4);
-    }
-
-    // Bytes 32-95: 64 bytes of qs (16 u32s)
-    var qs_vals: array<u32, 16>;
-    for (var i: u32 = 0u; i < 16; i++) {
-        qs_vals[i] = load_u32_at_src0(block_byte_base + 32 + i * 4);
-    }
-
-    var sum = 0.0;
-    var src1_i = src1_idx_base + offset * 256;
-    var is: u32 = 0;
-    var m: u32 = 1;
-    // 2 halves of the block (128 elements each)
-    for (var q_b_idx: u32 = 0; q_b_idx < 64; q_b_idx += 32) {
-        // 4 groups (each group has 2 blocks of 16 elements)
-        for (var shift: u32 = 0; shift < 8; shift += 2) {
-            // 2 blocks
-            for (var k: u32 = 0; k < 32; k += 16) {
-                let sc = get_byte(scale_vals[is / 4], is % 4);
-                is++;
-                let dl = d * (f32(sc) - 32.0);
-                for (var l: u32 = 0u; l < 16u; l++) {
-                    let q_idx = q_b_idx + k + l;
-                    let hm_idx = k + l;
-                    let q_byte = get_byte(qs_vals[q_idx / 4], q_idx % 4);
-                    let hmask_byte = get_byte(hmask_vals[hm_idx / 4], hm_idx % 4);
-                    let hm = select(4.0, 0.0, (hmask_byte & m) != 0);
-                    let qs_val = (q_byte >> shift) & 3;
-                    sum += ((f32(qs_val) - hm) * dl) * src1[src1_i];
-                    src1_i++;
-                }
-            }
-            m <<= 1;
-        }
-    }
-    return sum;
-}
-#endif
-
-#ifdef Q4_K
-// 8 blocks of 32 elements each
-fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
-    let block = src0[src0_idx_base + offset];
-    let d = f32(block.d);
-    let m = f32(block.dmin);
-    var sum = 0.0;
-    var src1_i = src1_idx_base + offset * 256;
-    var is: u32 = 0;
-    // 2 blocks each iteration
-    for (var q_b_idx: u32 = 0; q_b_idx < 128; q_b_idx += 32) {
-        for (var shift: u32 = 0; shift < 8; shift += 4) {
-            let scale_min = get_scale_min(is, block.scales);
-            is++;
-            let dl = d * scale_min.x;
-            let ml = m * scale_min.y;
-            for (var l: u32 = 0; l < 32; l++) {
-                let q_idx = q_b_idx + l;
-                let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4);
-                let qs_val = (q_byte >> shift) & 0xF;
-                sum += (f32(qs_val) * dl - ml) * src1[src1_i];
-                src1_i++;
-            }
-        }
-    }
-    return sum;
-}
-#endif
-
-#ifdef Q5_K
-// 8 blocks of 32 elements each
-fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
-    let block = src0[src0_idx_base + offset];
-    let d = f32(block.d);
-    let m = f32(block.dmin);
-    var sum = 0.0;
-    var src1_i = src1_idx_base + offset * 256;
-    var is: u32 = 0;
-    var u: u32 = 1;
-    // 2 blocks each iteration
-    for (var q_b_idx: u32 = 0; q_b_idx < 128; q_b_idx += 32) {
-        for (var shift: u32 = 0; shift < 8; shift += 4) {
-            let scale_min = get_scale_min(is, block.scales);
-            is++;
-            let dl = d * scale_min.x;
-            let ml = m * scale_min.y;
-            for (var l: u32 = 0; l < 32; l++) {
-                let q_idx = q_b_idx + l;
-                let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4);
-                let qh_byte = get_byte(block.qh[l / 4], l % 4);
-                let qs_val = (q_byte >> shift) & 0xF;
-                let qh_val = select(0.0, 16.0, (qh_byte & u) != 0);
-                sum += ((f32(qs_val) + qh_val) * dl - ml) * src1[src1_i];
-               src1_i++;
-            }
-            u <<= 1;
-        }
-    }
-    return sum;
-}
-#endif
-
-#ifdef Q6_K
-// 16 blocks of 16 elements each
-fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
-    let block_byte_base = (src0_idx_base + offset) * 210; // Block stride: 210 bytes
-
-    // Bytes 208-209: f16 scale 'd'
-    let d = load_f16_as_f32_at_src0(block_byte_base + 208);
-
-    // Bytes 0-127: 128 bytes of ql (32 u32s)
-    var ql_vals: array<u32, 32>;
-    for (var i: u32 = 0; i < 32; i++) {
-        ql_vals[i] = load_u32_at_src0(block_byte_base + i * 4);
-    }
-
-    // Bytes 128-191: 64 bytes of qh (16 u32s)
-    var qh_vals: array<u32, 16>;
-    for (var i: u32 = 0; i < 16; i++) {
-        qh_vals[i] = load_u32_at_src0(block_byte_base + 128 + i * 4);
-    }
-
-    // Bytes 192-207: 16 bytes of scales (4 u32s)
-    var scale_vals: array<u32, 4>;
-    for (var i: u32 = 0; i < 4; i++) {
-        scale_vals[i] = load_u32_at_src0(block_byte_base + 192 + i * 4);
-    }
-
-    var sum = 0.0;
-    var src1_i = src1_idx_base + offset * 256;
-    var qh_b_idx: u32 = 0;
-    var sc_b_idx: u32 = 0;
-    for (var ql_b_idx: u32 = 0; ql_b_idx < 128; ql_b_idx += 64) {
-        for (var l: u32 = 0; l < 32; l++) {
-            let ql13_b = get_byte(ql_vals[(ql_b_idx + l) / 4], (ql_b_idx + l) % 4);
-            let ql24_b = get_byte(ql_vals[(ql_b_idx + l + 32) / 4], (ql_b_idx + l + 32) % 4);
-            let qh_b = get_byte(qh_vals[(qh_b_idx + l) / 4], (qh_b_idx + l) % 4);
-
-            let q1 = f32((ql13_b & 0xF) | ((qh_b & 3) << 4)) - 32.0;
-            let q2 = f32((ql24_b & 0xF) | (((qh_b >> 2) & 3) << 4)) - 32.0;
-            let q3 = f32((ql13_b >> 4) | (((qh_b >> 4) & 3) << 4)) - 32.0;
-            let q4 = f32((ql24_b >> 4) | (((qh_b >> 6) & 3) << 4)) - 32.0;
-
-            let is = l/16;
-            let is1 = sc_b_idx + is;
-            let sc1 = get_byte_i32(scale_vals[is1 / 4], is1 % 4);
-            let is2 = sc_b_idx + is + 2;
-            let sc2 = get_byte_i32(scale_vals[is2 / 4], is2 % 4);
-            let is3 = sc_b_idx + is + 4;
-            let sc3 = get_byte_i32(scale_vals[is3 / 4], is3 % 4);
-            let is4 = sc_b_idx + is + 6;
-            let sc4 = get_byte_i32(scale_vals[is4 / 4], is4 % 4);
-
-            sum += d * f32(sc1) * q1 * src1[src1_i + l];
-            sum += d * f32(sc2) * q2 * src1[src1_i + l + 32];
-            sum += d * f32(sc3) * q3 * src1[src1_i + l + 64];
-            sum += d * f32(sc4) * q4 * src1[src1_i + l + 96];
-        }
-        src1_i += 128;
-        qh_b_idx += 32;
-        sc_b_idx += 8;
-    }
-    return sum;
-}
-#endif
-
-#ifdef IQ2_XXS
-fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
-    let block_byte_base = (src0_idx_base + offset) * 66; // Block stride: 66 bytes
-    let d = load_f16_as_f32_at_src0(block_byte_base);
-    var src1_i = src1_idx_base + offset * 256;
-    var sum = 0.0;
-    for (var ib: u32 = 0; ib < 32; ib += 4) {
-        let aux0_offset = block_byte_base + 2 + ib * 2;
-        let aux1_offset = block_byte_base + 2 + (ib + 2) * 2;
-        let aux0 = load_u32_at_src0(aux0_offset);
-        let aux1 = load_u32_at_src0(aux1_offset);
-        let db = d * (0.5 + f32(aux1 >> 28)) * 0.25;
-        for (var l: u32 = 0; l < 4; l++) {
-            let ig = get_byte(aux0, l) * 8;
-            let is = (aux1 >> (7 * l)) & 127;
-            let signs = get_byte(ksigns_iq2xs[is / 4], is % 4);
-            for (var j: u32 = 0; j < 8; j++) {
-                let g = get_byte(iq2xxs_grid[(ig + j) / 4], (ig + j) % 4);
-                let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0);
-                sum += db * f32(g) * m * src1[src1_i];
-                src1_i++;
-            }
-        }
-    }
-    return sum;
-}
-#endif
-
-#ifdef IQ2_XS
-fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
-    let block_byte_base = (src0_idx_base + offset) * 74; // Block stride: 74 bytes
-    let d = load_f16_as_f32_at_src0(block_byte_base);
-    var src1_i = src1_idx_base + offset * 256;
-
-    var scale_vals = array<u32, 2>(
-        load_u32_at_src0(block_byte_base + 66),
-        load_u32_at_src0(block_byte_base + 70)
-    );
-
-    var sum = 0.0;
-    for (var ib: u32 = 0; ib < 32; ib += 4) {
-        let s = get_byte(scale_vals[ib / 16], (ib % 16) / 4);
-        let db = array<f32, 2>(
-            d * (0.5 + f32(s & 0xF)) * 0.25,
-            d * (0.5 + f32(s >> 4)) * 0.25
-        );
-        for (var l: u32 = 0; l < 4; l++) {
-            let qs_offset = block_byte_base + 2 + (ib + l) * 2;
-            let qs_val = load_u32_at_src0(qs_offset) & 0xFFFF;
-            let ig = (qs_val & 511) * 8;
-            let is = qs_val >> 9;
-            let signs = get_byte(ksigns_iq2xs[is / 4], is % 4);
-            let dl = db[l/2];
-            for (var j: u32 = 0; j < 8; j++) {
-                let g = get_byte(iq2xs_grid[(ig + j) / 4], (ig + j) % 4);
-                let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0);
-                sum += dl * f32(g) * m * src1[src1_i];
-                src1_i++;
-            }
-        }
-    }
-    return sum;
-}
-#endif
-
-#ifdef IQ2_S
-fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
-    let block_byte_base = (src0_idx_base + offset) * 82; // Block stride: 82 bytes
-    let d = load_f16_as_f32_at_src0(block_byte_base);
-    var src1_i = src1_idx_base + offset * 256;
-
-    var qs_vals : array<u32, 16>;
-    for (var i: u32 = 0; i < 16; i++) {
-        qs_vals[i] = load_u32_at_src0(block_byte_base + 2 + i * 4);
-    }
-
-    var qh_vals: array<u32, 2>;
-    qh_vals[0] = load_u32_at_src0(block_byte_base + 66);
-    qh_vals[1] = load_u32_at_src0(block_byte_base + 70);
-
-    var scale_vals: array<u32, 2>;
-    scale_vals[0] = load_u32_at_src0(block_byte_base + 74);
-    scale_vals[1] = load_u32_at_src0(block_byte_base + 78);
-
-    var sum = 0.0;
-    for (var ib: u32 = 0; ib < 8; ib ++) {
-        let s = get_byte(scale_vals[ib / 4], ib % 4);
-        let db = array<f32, 2>(
-            d * (0.5 + f32(s & 0xF)) * 0.25,
-            d * (0.5 + f32(s >> 4)) * 0.25
-        );
-        let qs_w = qs_vals[ib];
-        for (var l: u32 = 0; l < 4; l++) {
-            let qh_b = (get_byte(qh_vals[ib / 4], ib % 4) << (8 - 2 * l)) & 0x300;
-            let ig = (get_byte(qs_w, l) | qh_b) * 8;
-            let signs = get_byte(qs_vals[ib + 8], l);
-            let dl = db[l/2];
-            for (var j: u32 = 0; j < 8; j++) {
-                let g = get_byte(iq2s_grid[(ig + j) / 4], (ig + j) % 4);
-                let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0);
-                sum += dl * f32(g) * m * src1[src1_i];
-                src1_i++;
-            }
-        }
-    }
-    return sum;
-}
-#endif
-
-#ifdef IQ3_XXS
-fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
-    let block_byte_base = (src0_idx_base + offset) * 98; // Block stride: 98 bytes
-    let d = load_f16_as_f32_at_src0(block_byte_base);
-    var src1_i = src1_idx_base + offset * 256;
-    var sum = 0.0;
-    for (var ib: u32 = 0; ib < 16; ib += 2) {
-        let sc_sign_offset = block_byte_base + 2 + (ib + 32) * 2;
-        let sc_sign = load_u32_at_src0(sc_sign_offset);
-        let db = d * (0.5 + f32(sc_sign >> 28)) * 0.5;
-        for (var l: u32 = 0; l < 4; l++) {
-            let is = (sc_sign >> (7 * l)) & 127;
-            let signs = get_byte(ksigns_iq2xs[is / 4], is % 4);
-            let ig_val = load_u32_at_src0(block_byte_base + 2 + (ib * 2 + l) * 2) & 0xFFFF;
-            let ig1 = get_byte(ig_val, 0);
-            let ig2 = get_byte(ig_val, 1);
-            for (var j: u32 = 0; j < 4; j++) {
-                let g1 = get_byte(iq3xxs_grid[ig1], j);
-                let g2 = get_byte(iq3xxs_grid[ig2], j);
-                let m1 = select(1.0, -1.0, (get_byte(kmask_iq2xs[0], j) & signs) != 0);
-                let m2 = select(1.0, -1.0, (get_byte(kmask_iq2xs[1], j) & signs) != 0);
-                sum += db * f32(g1) * m1 * src1[src1_i];
-                sum += db * f32(g2) * m2 * src1[src1_i + 4];
-                src1_i++;
-            }
-            src1_i += 4;
-        }
-    }
-    return sum;
-}
-#endif
-
-#ifdef IQ3_S
-fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
-    let block_byte_base = (src0_idx_base + offset) * 110; // Block stride: 110 bytes
-    let d = load_f16_as_f32_at_src0(block_byte_base);
-    var src1_i = src1_idx_base + offset * 256;
-
-    var qh_vals = array<u32, 2>(
-        load_u32_at_src0(block_byte_base + 66),
-        load_u32_at_src0(block_byte_base + 70)
-    );
-
-    var sign_vals: array<u32, 8>;
-    for (var i: u32 = 0; i < 8; i++) {
-        sign_vals[i] = load_u32_at_src0(block_byte_base + 74 + i * 4);
-    }
-
-    var scale_vals = load_u32_at_src0(block_byte_base + 106);
-
-    var sum = 0.0;
-    for (var ib: u32 = 0; ib < 4; ib++) {
-        let s = get_byte(scale_vals, ib);
-        let db = array<f32, 2>(
-            d * (1.0 + 2.0 * f32(s & 0xF)),
-            d * (1.0 + 2.0 * f32(s >> 4))
-        );
-        for (var k: u32 = 0; k < 2; k++) {
-            let dl = db[k];
-            let qh_byte = get_byte(qh_vals[ib / 2], (ib % 2) * 2 + k);
-            let sign_w = sign_vals[ib * 2 + k];
-            for (var l: u32 = 0; l < 4; l++) {
-                let signs = get_byte(sign_w, l);
-                let ig_val = load_u32_at_src0(block_byte_base + 2 + (ib * 8 + k * 4 + l) * 2) & 0xFFFF;
-                let ig1 = get_byte(ig_val, 0) | ((qh_byte << ((8 - (2 * l)))) & 256);
-                let ig2 = get_byte(ig_val, 1) | ((qh_byte << ((7 - (2 * l)))) & 256);
-                for (var j: u32 = 0; j < 4; j++) {
-                    let g1 = get_byte(iq3s_grid[ig1], j);
-                    let g2 = get_byte(iq3s_grid[ig2], j);
-                    let m1 = select(1.0, -1.0, (get_byte(kmask_iq2xs[0], j) & signs) != 0);
-                    let m2 = select(1.0, -1.0, (get_byte(kmask_iq2xs[1], j) & signs) != 0);
-                    sum += dl * f32(g1) * m1 * src1[src1_i];
-                    sum += dl * f32(g2) * m2 * src1[src1_i + 4];
-                    src1_i++;
-                }
-                src1_i += 4;
-            }
-        }
-    }
-    return sum;
-}
-#endif
-
-#ifdef IQ1_S
-fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
-    let block_byte_base = (src0_idx_base + offset) * 50; // Block stride: 50 bytes
-    let d = load_f16_as_f32_at_src0(block_byte_base);
-    var src1_i = src1_idx_base + offset * 256;
-    var sum = 0.0;
-    for (var ib: u32 = 0; ib < 8; ib++) {
-        let qh = load_u32_at_src0(block_byte_base + 34 + ib * 2) & 0xFFFF;
-        let dl = d * (2.0 * f32((qh >> 12) & 7) + 1.0);
-        let delta = select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x8000) != 0);
-        let qs_w = load_u32_at_src0(block_byte_base + 2 + ib * 4);
-        for (var l: u32 = 0; l < 4; l++) {
-            let ig = (get_byte(qs_w, l) | (((qh >> (3 * l)) & 7) << 8)) * 8;
-            for (var j: u32 = 0; j < 8; j++) {
-                let gw = iq1_grid[(ig + j) / 16];
-                let g = (gw >> (((ig + j) % 16) * 2)) & 3;
-                let gs = bitcast<i32>(g << 30) >> 30;
-                sum += dl * (f32(gs) + delta) * src1[src1_i];
-                src1_i++;
-            }
-        }
-    }
-    return sum;
-}
-#endif
-
-
-#ifdef IQ1_M
-fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
-    let block = src0[src0_idx_base + offset];
-
-    let scale = ((block.scales[0] >> 12) & 0xF) | ((block.scales[0] >> 24) & 0x00F0) | ((block.scales[1] >> 4) & 0x0F00) | ((block.scales[1] >> 16) & 0xF000);
-    let d = f32(bitcast<vec2<f16>>(scale).x);
-    var src1_i = src1_idx_base + offset * 256;
-    var sum = 0.0;
-    for (var ib: u32 = 0; ib < 8; ib++) {
-        let sw = (block.scales[ib / 4] >> (16 * ((ib / 2) % 2))) & 0xFFFF;
-        let s1 : u32 = (sw >> (6 * (ib % 2))) & 0x7;
-        let s2 : u32 = (sw >> (6 * (ib % 2) + 3)) & 0x7;
-        var dl = array<f32, 2>(
-            d * f32(2 * s1 + 1),
-            d * f32(2 * s2 + 1)
-        );
-
-        let qh = block.qh[ib / 2] >> (16 * (ib % 2));
-        var idx = array<u32, 4>(
-            get_byte(block.qs[ib], 0) | ((qh << 8) & 0x700),
-            get_byte(block.qs[ib], 1) | ((qh << 4) & 0x700),
-            get_byte(block.qs[ib], 2) | ((qh) & 0x700),
-            get_byte(block.qs[ib], 3) | ((qh >> 4) & 0x700)
-        );
-        var delta = array<f32, 4>(
-            select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x08) != 0),
-            select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x80) != 0),
-            select(IQ1_DELTA, -IQ1_DELTA, ((qh >> 8) & 0x08) != 0),
-            select(IQ1_DELTA, -IQ1_DELTA, ((qh >> 8) & 0x80) != 0)
-        );
-        for (var l: u32 = 0; l < 4; l++) {
-            let ig = idx[l] * 8;
-            for (var j: u32 = 0; j < 8; j++) {
-                let gw = iq1_grid[(ig + j) / 16];
-                let g = (gw >> (((ig + j) % 16) * 2)) & 3;
-                let gs = bitcast<i32>(g << 30) >> 30;
-                sum += dl[l/2] * (f32(gs) + delta[l]) * src1[src1_i];
-                src1_i++;
-            }
-        }
-    }
-    return sum;
-}
-#endif
-
-#ifdef IQ4_NL
-fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
-    let block_byte_base = (src0_idx_base + offset) * 18; // Block stride: 18 bytes
-    let d = load_f16_as_f32_at_src0(block_byte_base);
-    var src1_i = src1_idx_base + offset * 32;
-    var sum = 0.0;
-    var qs: array<u32, 4>;
-    for (var i: u32 = 0; i < 4; i++) {
-        qs[i] = load_u32_at_src0(block_byte_base + 2 + i * 4);
-    }
-    for (var j: u32 = 0; j < 16; j++) {
-        let qsb = get_byte(qs[j / 4], j % 4);
-        sum += d * f32(kvalues_iq4nl[qsb & 0xF]) * src1[src1_i];
-        sum += d * f32(kvalues_iq4nl[qsb >> 4]) * src1[src1_i + 16];
-        src1_i++;
-    }
-    return sum;
-}
-#endif
-
-#ifdef IQ4_XS
-fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
-    let block = src0[src0_idx_base + offset];
-    let d = unpack2x16float(block.d_scales_h)[0];
-    let scales_h = block.d_scales_h >> 16;
-    var src1_i = src1_idx_base + offset * 256;
-    var sum = 0.0;
-    for (var ib: u32 = 0; ib < 8; ib++) {
-        let ls = ((get_byte(block.scales_l, ib / 2) >> (4 * (ib % 2))) & 0xF) | (((scales_h >> (2 * ib)) & 3) << 4);
-        let dl = d * (f32(ls) - 32.0);
-        for (var j: u32 = 0; j < 16; j++) {
-            let iqs = ib * 16 + j;
-            let qsb = get_byte(block.qs[iqs / 4], iqs % 4);
-            sum += dl * f32(kvalues_iq4nl[qsb & 0xF]) * src1[src1_i];
-            sum += dl * f32(kvalues_iq4nl[qsb >> 4]) * src1[src1_i + 16];
-            src1_i++;
-        }
-        src1_i += 16;
-    }
-    return sum;
-}
-#endif
-
-struct MulMatParams {
-    offset_src0: u32, // in elements/blocks
-    offset_src1: u32, // in elements/blocks
-    offset_dst: u32, // in elements/blocks
-    m: u32,
-    n: u32,
-    k: u32,
-    // all strides are in elements/blocks
-    stride_01: u32,
-    stride_11: u32,
-    stride_02: u32,
-    stride_12: u32,
-    stride_03: u32,
-    stride_13: u32,
-
-    bs02: u32,
-    bs03: u32,
-    broadcast2: u32,
-    broadcast3: u32
-};
-
-@group(0) @binding(0) var<storage, read_write> src0: array<SRC0_TYPE>; // M rows, K columns
-@group(0) @binding(1) var<storage, read_write> src1: array<SRC1_TYPE>; // K rows, N columns (transposed)
-@group(0) @binding(2) var<storage, read_write> dst: array<f32>; // M rows, N columns
-
-@group(0) @binding(3) var<uniform> params: MulMatParams;
-
-@compute @workgroup_size(256)
-fn main(@builtin(local_invocation_id) local_id: vec3<u32>,
-        @builtin(workgroup_id) wg_id: vec3<u32>,
-        @builtin(num_workgroups) num_wg: vec3<u32>) {
-    let wg_linear = wg_id.y * num_wg.x + wg_id.x;
-    let global_idx = wg_linear * 256u + local_id.x;
-
-    let total = params.m * params.n * params.bs02 * params.broadcast2 * params.bs03 * params.broadcast3;
-    if (global_idx >= total) {
-        return;
-    }
-
-    let dst2_stride = params.m * params.n;
-    let dst3_stride = dst2_stride * params.bs02 * params.broadcast2;
-
-    let dst3_idx = global_idx / dst3_stride;
-    let src03_idx = dst3_idx / params.broadcast3; // src0 may be broadcast along the third dimension
-    let src13_idx = dst3_idx; // src1 is not broadcast
-    let dst3_rem = global_idx % dst3_stride;
-
-    let dst2_idx = dst3_rem / dst2_stride;
-    let src02_idx = dst2_idx / params.broadcast2; // src0 may also be broadcast along the second dimension
-    let src12_idx = dst2_idx; // src1 is not broadcast
-
-    let dst2_rem = dst3_rem % dst2_stride;
-
-    let row = dst2_rem / params.m; // output row
-    let col = dst2_rem % params.m; // output column
-
-    let src0_idx_base = params.offset_src0 + src03_idx * params.stride_03 + src02_idx * params.stride_02 + col * params.stride_01;
-    let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12 + row * params.stride_11;
-
-    var sum = 0.0;
-    for (var i: u32 = 0u; i < params.k/BLOCK_SIZE; i = i + 1u) {
-        sum += multiply_add(src0_idx_base, src1_idx_base, i);
-    }
-    dst[params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.m + col] = sum;
-}
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl
index d79d5f3f282..581e922709d 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl
@@ -21,35 +21,32 @@ var<workgroup> count:atomic<u32>;
 
 @compute @workgroup_size(WG_SIZE)
 fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
-        @builtin(local_invocation_id) local_id: vec3<u32>,
-        @builtin(num_workgroups) num_wg: vec3<u32>) {
+        @builtin(local_invocation_id) local_id: vec3<u32>) {
 
     let thread_id = local_id.x;
-    let own_expert = wg_id.y * num_wg.x + wg_id.x; // the expert assigned to this workgroup
+    let own_expert = wg_id.x; // the expert assigned to this workgroup
 
-    if (own_expert < params.n_expert) {
-        if (thread_id == 0u) {
-            atomicStore(&count, 0);
-        }
+    if (thread_id == 0u) {
+        atomicStore(&count, 0);
+    }
 
-        workgroupBarrier();
-
-        for (var i = thread_id;i < params.n_expert_used * params.n_tokens;i += WG_SIZE) {
-            let row = i / params.n_expert_used;
-            let col = i % params.n_expert_used;
-            let expert = u32(ids[params.offset_ids + row * params.stride_ids_1 + col]);
-            if (own_expert == expert) {
-                let pos = atomicAdd(&count, 1u);
-                let gathered_id = own_expert * params.n_tokens + pos;
-                global_gathered_expert_used[gathered_id] = col;
-                global_gathered_tokens[gathered_id] = row;
-            }
+    workgroupBarrier();
+
+    for (var i = thread_id;i < params.n_expert_used * params.n_tokens;i += WG_SIZE) {
+        let row = i / params.n_expert_used;
+        let col = i % params.n_expert_used;
+        let expert = u32(ids[params.offset_ids + row * params.stride_ids_1 + col]);
+        if (own_expert == expert) {
+            let pos = atomicAdd(&count, 1u);
+            let gathered_id = own_expert * params.n_tokens + pos;
+            global_gathered_expert_used[gathered_id] = col;
+            global_gathered_tokens[gathered_id] = row;
         }
+    }
 
-        workgroupBarrier();
+    workgroupBarrier();
 
-        if (thread_id == 0u) {
-            gathered_count_ids[own_expert] = atomicLoad(&count);
-        }
+    if (thread_id == 0u) {
+        gathered_count_ids[own_expert] = atomicLoad(&count);
     }
 }
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl
index a194cf40468..f0a7fbd059a 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl
@@ -3,10 +3,18 @@ enable subgroups;
 #endif
 enable f16;
 
+#ifdef MMVQ
+requires packed_4x8_integer_dot_product;
+#endif
+
 #define DECLARE_BYTE_LOADERS_SRC0
 #include "common_decls.tmpl"
 
+#ifdef MMVQ
+#include "mul_mat_vec_q_acc.tmpl"
+#else
 #include "mul_mat_vec_acc.tmpl"
+#endif
 
 struct MulMatParams {
     offset_src0: u32,
@@ -28,9 +36,14 @@ struct MulMatParams {
 };
 
 @group(0) @binding(0) var<storage, read_write> src0: array<SRC0_TYPE>;
+
+#ifdef MMVQ
+@group(0) @binding(1) var<storage, read_write> src1q: array<q8_1>;
+#else
 @group(0) @binding(1) var<storage, read_write> src1: array<SRC1_TYPE>;
-@group(0) @binding(2) var<storage, read_write> dst: array<f32>;
+#endif
 
+@group(0) @binding(2) var<storage, read_write> dst: array<f32>;
 // "mul_mat_vec_acc.tmpl" requires params.k, params.m, params.stride_01
 @group(0) @binding(3) var<uniform> params: MulMatParams;
 
@@ -75,10 +88,15 @@ fn main(
     let src12_idx = dst2_idx;
 
     let src0_batch_offset = params.offset_src0 + src03_idx * params.stride_03 + src02_idx * params.stride_02;
-    let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12;
     let dst_idx_base = params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + row_base;
 
+#ifdef MMVQ
+    let src1q_idx_base = (src13_idx * params.bs02 * params.broadcast2 + src12_idx) * (params.k / 32u);
+    let acc = accumulate_vec_q_dot(thread_id, row_base, src0_batch_offset, src1q_idx_base);
+#else
+    let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12;
     let acc = accumulate_vec_dot(thread_id, row_base, src0_batch_offset, src1_idx_base);
+#endif
 
 #ifdef USE_SUBGROUP_REDUCTION
     for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl
index 711c7e829d8..08753b9d643 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl
@@ -436,7 +436,6 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src
 }
 #endif
 
-
 #ifdef MUL_ACC_Q3_K
 #define BLOCK_SIZE 256
 #define BLOCK_SIZE_BYTES 110
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl
new file mode 100644
index 00000000000..3ef2f77ebe0
--- /dev/null
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl
@@ -0,0 +1,303 @@
+#ifdef U32_DEQUANT_HELPERS
+#define SRC0_TYPE u32
+
+fn byte_of(v: u32, b: u32) -> u32 {
+    return (v >> (b * 8u)) & 0xFFu;
+}
+
+fn sbyte_of(v: u32, b: u32) -> i32 {
+    let raw = i32((v >> (b * 8u)) & 0xFFu);
+    return select(raw, raw - 256, raw >= 128);
+}
+#endif
+
+#define SRC0_TYPE SRC0_INNER_TYPE
+#define SRC1_TYPE SRC1_INNER_TYPE
+
+#ifdef LEGACY_QUANTS
+#define BLOCK_SIZE 32
+#define THREADS_PER_BLOCK 4
+#elif K_QUANTS
+#define BLOCK_SIZE 256
+#define THREADS_PER_BLOCK 16
+#endif
+
+#define ELEMS_PER_THREAD (BLOCK_SIZE/THREADS_PER_BLOCK)
+#define Q8_BLOCK_SIZE 32
+
+#ifdef MUL_ACC_Q4_0
+#define BLOCK_SIZE_BYTES 18
+#define B_DS_TYPE vec2<f32>
+fn repack_a(block_byte_base: u32, inner_id: u32) -> vec2<u32> {
+    let qs_packed = load_u32_at_src0(block_byte_base + 2u + 4u * inner_id);
+
+    return vec2<u32>(
+        qs_packed & 0x0F0F0F0Fu,
+        (qs_packed >> 4u) & 0x0F0F0F0Fu
+    );
+}
+fn repack_b_qs(block:u32, inner_id: u32) -> vec2<u32> {
+    return vec2<u32>(
+            src1q[block].qs[inner_id],
+            src1q[block].qs[inner_id + 4u],
+        );
+}
+fn repack_b_dm(block: u32) -> B_DS_TYPE {
+    return B_DS_TYPE(
+        f32(src1q[block].d),
+        f32(src1q[block].s)
+    );
+}
+fn get_dm(block_byte_base: u32) -> f32 {
+    return f32(load_f16_at_src0(block_byte_base));
+}
+fn mul_q8_1(row_sum: i32, da: f32, b_ds: B_DS_TYPE) -> f32 {
+    return f32(row_sum) * (da * b_ds.x) - 8.0 * da * b_ds.y / THREADS_PER_BLOCK;
+}
+#endif
+
+#ifdef MUL_ACC_Q4_1
+#define BLOCK_SIZE_BYTES 20
+#define B_DS_TYPE vec2<f32>
+fn repack_a(block_byte_base: u32, inner_id: u32) -> vec2<u32> {
+    let qs_packed = load_u32_at_src0(block_byte_base + 4u + 4u * inner_id);
+
+    return vec2<u32>(
+        qs_packed & 0x0F0F0F0Fu,
+        (qs_packed >> 4u) & 0x0F0F0F0Fu
+    );
+}
+fn repack_b_qs(block:u32, inner_id: u32) -> vec2<u32> {
+    return vec2<u32>(
+            src1q[block].qs[inner_id],
+            src1q[block].qs[inner_id + 4u],
+        );
+}
+fn repack_b_dm(block: u32) -> B_DS_TYPE {
+    return B_DS_TYPE(
+        f32(src1q[block].d),
+        f32(src1q[block].s)
+    );
+}
+fn get_dm(block_byte_base: u32) -> vec2<f32> {
+    return vec2<f32>(
+        f32(load_f16_at_src0(block_byte_base)),
+        f32(load_f16_at_src0(block_byte_base + 2u))
+    );
+}
+fn mul_q8_1(row_sum: i32, dma: vec2<f32>, b_ds: B_DS_TYPE) -> f32 {
+    return f32(row_sum) * (dma.x * b_ds.x) + dma.y * b_ds.y / THREADS_PER_BLOCK;
+}
+#endif
+
+#ifdef MUL_ACC_Q8_0
+#define BLOCK_SIZE_BYTES 34
+#define B_DS_TYPE f32
+fn repack_a(block_byte_base: u32, inner_id: u32) -> vec2<u32> {
+    return vec2<u32>(
+        load_u32_at_src0(block_byte_base + 2u + 4u * (inner_id * 2u)),
+        load_u32_at_src0(block_byte_base + 2u + 4u * (inner_id * 2u + 1))
+    );
+}
+fn repack_b_qs(block:u32, inner_id: u32) -> vec2<u32> {
+    return vec2<u32>(
+            src1q[block].qs[inner_id * 2u],
+            src1q[block].qs[inner_id * 2u + 1],
+        );
+}
+fn repack_b_dm(block: u32) -> B_DS_TYPE {
+    return B_DS_TYPE(src1q[block].d);
+}
+fn get_dm(block_byte_base: u32) -> f32 {
+    return f32(load_f16_at_src0(block_byte_base));
+}
+fn mul_q8_1(row_sum: i32, da: f32, b_ds: B_DS_TYPE) -> f32 {
+    return f32(row_sum) * (da * b_ds);
+}
+#endif
+
+#ifdef LEGACY_QUANTS
+fn mmvq_dot_product(a_byte_base: u32, b_inner_id: u32, b_repacked: vec2<u32>, b_ds: B_DS_TYPE) -> f32 {
+    var row_sum = 0;
+    let a_repacked = repack_a(a_byte_base, b_inner_id);
+
+    row_sum += dot4I8Packed(a_repacked[0], b_repacked[0]);
+    row_sum += dot4I8Packed(a_repacked[1], b_repacked[1]);
+
+    return mul_q8_1(row_sum, get_dm(a_byte_base), b_ds);
+}
+
+fn accumulate_vec_q_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1q_idx_base: u32) -> array<f32, OUTPUTS_PER_WG> {
+    var acc: array<f32, OUTPUTS_PER_WG>;
+
+    let num_blocks = params.k / BLOCK_SIZE;
+
+    for (var block = thread_id / THREADS_PER_BLOCK; block < num_blocks; block += WG_SIZE / THREADS_PER_BLOCK) {
+        let b_inner_id = thread_id % THREADS_PER_BLOCK;
+        let b_block_idx = src1q_idx_base + block;
+
+        let b_repacked = repack_b_qs(b_block_idx, b_inner_id);
+        let b_ds = repack_b_dm(b_block_idx);
+
+        for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
+            let output_row = row_base + row;
+            if (output_row < params.m) {
+                let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES;
+                acc[row] += mmvq_dot_product(block_byte_base, b_inner_id, b_repacked, b_ds);
+            }
+        }
+    }
+
+    return acc;
+}
+#endif
+
+#ifdef MUL_ACC_Q2_K
+#define BLOCK_SIZE_BYTES 84
+#define B_DS_TYPE f32
+fn repack_a(block_byte_base: u32, tid: u32) -> vec4<u32> {
+    let ih2 = tid / 8u;
+    let phase = tid % 2u;
+    let iq4_idx = 2u * ih2 + phase;
+    let qs_byte_base = block_byte_base + 16u + 16u * iq4_idx;
+    let qs_shift = tid & 6u;
+    return vec4<u32>(
+        (load_u32_at_src0_aligned(qs_byte_base) >> qs_shift) & 0x03030303u,
+        (load_u32_at_src0_aligned(qs_byte_base + 4u) >> qs_shift) & 0x03030303u,
+        (load_u32_at_src0_aligned(qs_byte_base + 8u) >> qs_shift) & 0x03030303u,
+        (load_u32_at_src0_aligned(qs_byte_base + 12u) >> qs_shift) & 0x03030303u,
+    );
+}
+fn repack_b_qs(q8_block_idx: u32, tid: u32) -> vec4<u32> {
+    let phase = tid % 2u;
+    return vec4<u32>(
+        src1q[q8_block_idx].qs[4u * phase],
+        src1q[q8_block_idx].qs[4u * phase + 1u],
+        src1q[q8_block_idx].qs[4u * phase + 2u],
+        src1q[q8_block_idx].qs[4u * phase + 3u],
+    );
+}
+fn repack_b_dm(q8_block_idx: u32) -> B_DS_TYPE {
+    return B_DS_TYPE(src1q[q8_block_idx].d);
+}
+fn get_dm(block_byte_base: u32) -> vec2<f32> {
+    return vec2<f32>(
+        f32(load_f16_at_src0(block_byte_base + 80u)),
+        f32(load_f16_at_src0(block_byte_base + 82u)),
+    );
+}
+fn get_scale_min(block_byte_base: u32, tid: u32) -> vec2<f32> {
+    let scale_byte = block_byte_base + tid;
+    let scale = byte_of(load_u32_at_src0_aligned(scale_byte), scale_byte & 3u);
+    return vec2<f32>(f32(scale & 0xFu), f32(scale >> 4u));
+}
+fn mmvq_dot_product(a_byte_base: u32, tid: u32, b_repacked: vec4<u32>, b_ds: B_DS_TYPE) -> f32 {
+    let a_repacked = repack_a(a_byte_base, tid);
+    let dm = get_dm(a_byte_base);
+    let scale_min = get_scale_min(a_byte_base, tid);
+
+    let scale_q = i32(scale_min.x);
+    let scale_m_i8x4 = u32(scale_min.y) * 0x01010101u;
+
+    let row_sum_d = (dot4I8Packed(b_repacked[0], a_repacked[0]) + dot4I8Packed(b_repacked[1], a_repacked[1])
+                   + dot4I8Packed(b_repacked[2], a_repacked[2]) + dot4I8Packed(b_repacked[3], a_repacked[3])) * scale_q;
+    let row_sum_m = dot4I8Packed(b_repacked[0], scale_m_i8x4) + dot4I8Packed(b_repacked[1], scale_m_i8x4)
+                  + dot4I8Packed(b_repacked[2], scale_m_i8x4) + dot4I8Packed(b_repacked[3], scale_m_i8x4);
+
+    return b_ds * (dm.x * f32(row_sum_d) - dm.y * f32(row_sum_m));
+}
+#endif
+
+#ifdef MUL_ACC_Q4_K
+#define BLOCK_SIZE_BYTES 144
+#define B_DS_TYPE vec2<f32>
+fn repack_a(block_byte_base: u32, tid: u32) -> vec4<u32> {
+    let iq4 = tid / 4u;
+    let phase = tid % 2u;
+    let nibble = (tid >> 1u) % 2u;
+    let q_qs_byte_base = block_byte_base + 16u + 32u * iq4 + 16u * phase;
+    let qs_shift = 4u * nibble;
+    return vec4<u32>(
+        (load_u32_at_src0_aligned(q_qs_byte_base) >> qs_shift) & 0x0F0F0F0Fu,
+        (load_u32_at_src0_aligned(q_qs_byte_base + 4u) >> qs_shift) & 0x0F0F0F0Fu,
+        (load_u32_at_src0_aligned(q_qs_byte_base + 8u) >> qs_shift) & 0x0F0F0F0Fu,
+        (load_u32_at_src0_aligned(q_qs_byte_base + 12u) >> qs_shift) & 0x0F0F0F0Fu,
+    );
+}
+fn repack_b_qs(q8_block_idx: u32, tid: u32) -> vec4<u32> {
+    let phase = tid % 2u;
+    return vec4<u32>(
+        src1q[q8_block_idx].qs[4u * phase],
+        src1q[q8_block_idx].qs[4u * phase + 1u],
+        src1q[q8_block_idx].qs[4u * phase + 2u],
+        src1q[q8_block_idx].qs[4u * phase + 3u],
+    );
+}
+fn repack_b_dm(q8_block_idx: u32) -> B_DS_TYPE {
+    return B_DS_TYPE(
+        f32(src1q[q8_block_idx].d),
+        f32(src1q[q8_block_idx].s),
+    );
+}
+fn get_dm(block_byte_base: u32) -> vec2<f32> {
+    return vec2<f32>(
+        f32(load_f16_at_src0(block_byte_base + 0u)),
+        f32(load_f16_at_src0(block_byte_base + 2u)),
+    );
+}
+fn get_scale_min(block_byte_base: u32, tid: u32) -> vec2<f32> {
+    let sc_m_idx = tid / 2u;
+    let scales_byte_base = block_byte_base + 4u;
+    let scales0_3  = load_u32_at_src0_aligned(scales_byte_base);
+    let scales4_7  = load_u32_at_src0_aligned(scales_byte_base + 4u);
+    let scales8_11 = load_u32_at_src0_aligned(scales_byte_base + 8u);
+
+    let byte_idx = sc_m_idx & 3u;
+    let is_high = sc_m_idx >= 4u;
+
+    let sc_low  = byte_of(scales0_3, byte_idx) & 0x3Fu;
+    let sc_high = (byte_of(scales8_11, byte_idx) & 0x0Fu) | ((byte_of(scales0_3, byte_idx) & 0xC0u) >> 2u);
+    let scale = f32(select(sc_low, sc_high, is_high));
+
+    let mn_low  = byte_of(scales4_7, byte_idx) & 0x3Fu;
+    let mn_high = (byte_of(scales8_11, byte_idx) >> 4u) | ((byte_of(scales4_7, byte_idx) & 0xC0u) >> 2u);
+    let min_val = f32(select(mn_low, mn_high, is_high));
+
+    return vec2<f32>(scale, min_val);
+}
+fn mmvq_dot_product(a_byte_base: u32, tid: u32, b_repacked: vec4<u32>, b_ds: B_DS_TYPE) -> f32 {
+    let a_repacked = repack_a(a_byte_base, tid);
+    let dm = get_dm(a_byte_base);
+    let scale_min = get_scale_min(a_byte_base, tid);
+
+    let row_sum = dot4I8Packed(a_repacked[0], b_repacked[0]) + dot4I8Packed(a_repacked[1], b_repacked[1])
+                + dot4I8Packed(a_repacked[2], b_repacked[2]) + dot4I8Packed(a_repacked[3], b_repacked[3]);
+
+    // Each thread covers half of the Q8_1 block, so add only b_ds.y/2.
+    return b_ds.x * dm.x * scale_min.x * f32(row_sum) - dm.y * scale_min.y * (b_ds.y / (Q8_BLOCK_SIZE / ELEMS_PER_THREAD));
+}
+#endif
+
+#ifdef K_QUANTS
+fn accumulate_vec_q_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1q_idx_base: u32) -> array<f32, OUTPUTS_PER_WG> {
+    var acc: array<f32, OUTPUTS_PER_WG>;
+
+    let tid = thread_id % THREADS_PER_BLOCK;
+
+    for (var block = thread_id / THREADS_PER_BLOCK; block < params.k / BLOCK_SIZE; block += WG_SIZE / THREADS_PER_BLOCK) {
+        let src1q_idx = src1q_idx_base + (block * BLOCK_SIZE + ELEMS_PER_THREAD * tid) / Q8_BLOCK_SIZE;
+        let b_repacked = repack_b_qs(src1q_idx, tid);
+        let b_ds = repack_b_dm(src1q_idx);
+
+        for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
+            let output_row = row_base + row;
+            if (output_row < params.m) {
+                let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES;
+                acc[row] += mmvq_dot_product(block_byte_base, tid, b_repacked, b_ds);
+            }
+        }
+    }
+
+    return acc;
+}
+#endif
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl
new file mode 100644
index 00000000000..b3f1fa04b80
--- /dev/null
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl
@@ -0,0 +1,173 @@
+#ifdef USE_SUBGROUP_REDUCTION
+enable subgroups;
+#endif
+enable f16;
+
+requires packed_4x8_integer_dot_product;
+
+#include "common_decls.tmpl"
+
+struct Params {
+    offset_src1: u32,
+    stride_12: u32,
+    stride_13: u32,
+    ne0: u32,
+    ne2: u32,
+    ne3: u32,
+};
+
+#define SRC1_TYPE vec4<SRC1_INNER_TYPE>
+
+@group(0) @binding(0) var<storage, read_write> src1: array<SRC1_TYPE>;
+@group(0) @binding(1) var<storage, read_write> src1q: array<q8_1>;
+
+@group(0) @binding(2) var<uniform> params: Params;
+
+#ifdef USE_SUBGROUP_REDUCTION
+fn cluster_max_8(v: f32) -> f32 {
+    var r = v;
+    r = max(r, subgroupShuffleXor(r, 1u));
+    r = max(r, subgroupShuffleXor(r, 2u));
+    r = max(r, subgroupShuffleXor(r, 4u));
+    return r;
+}
+
+#if defined(MUL_ACC_Q4_0) || defined(MUL_ACC_Q4_1) || defined(MUL_ACC_Q4_K)
+fn cluster_add_i4x8(v: i32) -> i32 {
+    var r= v;
+    r += subgroupShuffleXor(r, 1u);
+    r += subgroupShuffleXor(r, 2u);
+    r += subgroupShuffleXor(r, 4u);
+    return r;
+}
+#endif
+#endif
+
+#ifdef USE_WORKGROUP_REDUCTION
+#define CLUSTER_SIZE 8
+
+var<workgroup> partial_amaxs: array<array<f32, CLUSTER_SIZE>, WG_SIZE / CLUSTER_SIZE>;
+var<workgroup> partial_sums:  array<array<i32, CLUSTER_SIZE>, WG_SIZE / CLUSTER_SIZE>;
+#endif
+
+@compute @workgroup_size(WG_SIZE)
+fn main(
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+    @builtin(workgroup_id) wg_id: vec3<u32>,
+    @builtin(num_workgroups) num_wg: vec3<u32>
+) {
+    let thread_id = local_id.x;
+    let num_vec4 = params.ne0 / 4u;
+
+    let wg_per_vec = (num_vec4 + (WG_SIZE - 1u)) / WG_SIZE;
+    let total_batches = wg_per_vec * params.ne2 * params.ne3;
+
+    let wg_linear = wg_id.y * num_wg.x + wg_id.x;
+    if (wg_linear >= total_batches) {
+        return;
+    }
+
+    let src13_idx = wg_linear / (params.ne2 * wg_per_vec);
+    let src12_idx = (wg_linear - src13_idx * (params.ne2 * wg_per_vec)) / wg_per_vec;
+    let src11_wg_idx = wg_linear % wg_per_vec;
+    let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12;
+    let src1_idx_vec4_base = src1_idx_base / 4u;
+
+    let blocks_per_row = params.ne0 / 32u;
+    let blocks_per_wg = (WG_SIZE * 4u) / 32u;
+    let src1q_idx_base = (src13_idx * params.ne2 + src12_idx) * blocks_per_row;
+    let src1q_idx = src1q_idx_base + src11_wg_idx * blocks_per_wg + thread_id / 8u;
+    let qs_idx = thread_id % 8u;
+
+    // reduction
+    var q4 = vec4<f32>(0.0);
+    var q4_quants = 0u;
+    var thread_amax = 0.0;
+
+    let src11_vec4_idx = src11_wg_idx * WG_SIZE + thread_id;
+    let is_valid = src11_vec4_idx < num_vec4;
+
+#ifdef USE_SUBGROUP_REDUCTION
+
+    var d = 0.0;
+
+    if (is_valid) {
+        q4 = src1[src1_idx_vec4_base + src11_vec4_idx];
+        let abs_q4 = abs(q4);
+        thread_amax = max(max(abs_q4[0u], abs_q4[1u]), max(abs_q4[2], abs_q4[3]));
+    }
+
+    d = cluster_max_8(thread_amax) / 127.0;
+
+    if (is_valid) {
+        let id = select(0.0, 1.0 / d, d > 0.0);
+        q4_quants = pack4xI8(vec4<i32>(round(q4 * id)));
+        if (qs_idx == 0u) {
+            src1q[src1q_idx].d = f16(d);
+        }
+        src1q[src1q_idx].qs[qs_idx] = q4_quants;
+    }
+
+#if defined(MUL_ACC_Q4_0) || defined(MUL_ACC_Q4_1) || defined(MUL_ACC_Q4_K)
+    let q4_quants_sum = dot4I8Packed(q4_quants, 0x01010101u);
+    let s = f16(d * f32(cluster_add_i4x8(q4_quants_sum)));
+
+    if (is_valid) {
+        if (qs_idx == 0u) {
+            src1q[src1q_idx].s = s;
+        }
+    }
+#endif
+#endif
+
+#ifdef USE_WORKGROUP_REDUCTION
+
+    var d = 0.0;
+    let cluster_id = thread_id / 8u;
+
+    if (is_valid) {
+        q4 = src1[src1_idx_vec4_base + src11_vec4_idx];
+        let abs_q4 = abs(q4);
+        thread_amax = max(max(abs_q4[0], abs_q4[1]), max(abs_q4[2], abs_q4[3]));
+        partial_amaxs[cluster_id][qs_idx] = thread_amax;
+    }
+
+    workgroupBarrier();
+
+    if (is_valid) {
+        let amax = max(
+                    max(
+                        max(partial_amaxs[cluster_id][0], partial_amaxs[cluster_id][1]), max(partial_amaxs[cluster_id][2], partial_amaxs[cluster_id][3])),
+                    max(
+                        max(partial_amaxs[cluster_id][4], partial_amaxs[cluster_id][5]), max(partial_amaxs[cluster_id][6], partial_amaxs[cluster_id][7]))
+                );
+
+        d = amax / 127.0;
+        let id = select(0.0f, 1.0f / d, d > 0.0f);
+
+        q4_quants = pack4xI8(vec4<i32>(round(q4 * id)));
+        src1q[src1q_idx].qs[qs_idx] = q4_quants;
+
+        if (qs_idx == 0u) {
+            src1q[src1q_idx].d = f16(d);
+        }
+    }
+
+#if defined(MUL_ACC_Q4_0) || defined(MUL_ACC_Q4_1) || defined(MUL_ACC_Q4_K)
+
+    partial_sums[cluster_id][qs_idx] = dot4I8Packed(q4_quants, 0x01010101u);
+
+    workgroupBarrier();
+
+    if (is_valid) {
+        if (qs_idx == 0u) {
+            let s = d * f32(partial_sums[cluster_id][0] + partial_sums[cluster_id][1] + partial_sums[cluster_id][2] + partial_sums[cluster_id][3]
+                                    + partial_sums[cluster_id][4] + partial_sums[cluster_id][5] + partial_sums[cluster_id][6] + partial_sums[cluster_id][7]);
+            src1q[src1q_idx].s = f16(s);
+        }
+    }
+
+#endif
+#endif
+
+}
diff --git a/ggml/src/ggml-zendnn/CMakeLists.txt b/ggml/src/ggml-zendnn/CMakeLists.txt
index f1e4f991fae..e4ba9cfbd0f 100644
--- a/ggml/src/ggml-zendnn/CMakeLists.txt
+++ b/ggml/src/ggml-zendnn/CMakeLists.txt
@@ -28,7 +28,7 @@ if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF")
     ExternalProject_Add(
         zendnn
         GIT_REPOSITORY https://github.com/amd/ZenDNN.git
-        GIT_TAG ac9e580d9434b7b98985f2627a7ebfb5eba4bb0d    # ZenDNN-2026-WW17
+        GIT_TAG 253b94ce0d7e9284c265fefb485714944caff9d3    # ZenDNN-2026-WW19
         PREFIX      ${ZENDNN_PREFIX}
         SOURCE_DIR  ${ZENDNN_SOURCE_DIR}
         BINARY_DIR  ${ZENDNN_BUILD_DIR}
diff --git a/ggml/src/ggml-zendnn/ggml-zendnn.cpp b/ggml/src/ggml-zendnn/ggml-zendnn.cpp
index 6a83bb6b1ec..3c33dcb11a0 100644
--- a/ggml/src/ggml-zendnn/ggml-zendnn.cpp
+++ b/ggml/src/ggml-zendnn/ggml-zendnn.cpp
@@ -2,6 +2,10 @@
 
 #include "ggml-backend-impl.h"
 #include "ggml-impl.h"
+
+#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
+
 #include "zendnnl.hpp"
 
 #include <cstring>
@@ -19,6 +23,8 @@ zendnnl::common::data_type_t ggml_to_zendnn_type() {
         return zendnnl::common::data_type_t::f32;
     } else if constexpr (std::is_same_v<T, ggml_bf16_t>) {
         return zendnnl::common::data_type_t::bf16;
+    } else if constexpr (std::is_same_v<T, block_q8_0>) {
+        return zendnnl::common::data_type_t::s8;
     } else {
         return zendnnl::common::data_type_t::none;
     }
@@ -48,6 +54,17 @@ static bool ggml_zendnn_matmul(ggml_backend_zendnn_context * ctx, int64_t m, int
     params.num_threads = ctx->n_threads;
 
     zendnnl::lowoha::matmul::matmul_batch_params_t batch_params;
+
+    if constexpr (std::is_same_v<TA, block_q8_0>) {
+        params.dtypes.compute = zendnnl::common::data_type_t::s8;
+        const int64_t num_groups = k / QK8_0;
+        params.dynamic_quant = true;
+        params.quant_params.src_scale.buff = nullptr;
+        params.quant_params.src_scale.dt   = zendnnl::common::data_type_t::bf16;
+        params.quant_params.src_scale.dims = {n, num_groups};
+        params.packing.pack_format_b = 1;
+    }
+
     zendnnl::error_handling::status_t status = zendnnl::lowoha::matmul::matmul_direct(
         'r', false, true,   // row-major, don't transpose B, transpose A (because it's column-major)
         n,                  // M: rows of B and C
@@ -71,7 +88,7 @@ static bool ggml_zendnn_matmul(ggml_backend_zendnn_context * ctx, int64_t m, int
     return true;
 }
 
-static bool ggml_zendnn_sgemm(ggml_backend_zendnn_context * ctx, int64_t m, int64_t n, int64_t k,
+static bool ggml_zendnn_gemm(ggml_backend_zendnn_context * ctx, int64_t m, int64_t n, int64_t k,
                               const void * A, int64_t lda, const void * B, int64_t ldb, void * C,
                               int64_t ldc, int Atype, int Btype, int Ctype) {
 
@@ -108,6 +125,14 @@ static bool ggml_zendnn_sgemm(ggml_backend_zendnn_context * ctx, int64_t m, int6
                     (const ggml_bf16_t *)B, ldb,
                     (float *)C, ldc);
             return false;
+        case GGML_TYPE_Q8_0:
+            if (Btype != GGML_TYPE_F32 || Ctype != GGML_TYPE_F32)
+                return false;
+            return ggml_zendnn_matmul<block_q8_0, float, float>(
+                ctx, m, n, k,
+                (const block_q8_0 *)A, lda,
+                (const float *)B, ldb,
+                (float *)C, ldc);
         default:
             return false; // unsupported type
     }
@@ -145,7 +170,9 @@ static void ggml_zendnn_compute_forward_mul_mat(
     const int64_t r3 = ne13/ne03;
 
     void * work_data = ctx->work_data.get();
-    if (src1->type != vec_dot_type) {
+
+    // ZenDNN requires FP32 for dynamic quantization, so conversion is skipped
+    if (src1->type != vec_dot_type && src0->type != GGML_TYPE_Q8_0) {
         const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
         const size_t nbw2 = nbw1 * ne11;
         const size_t nbw3 = nbw2 * ne12;
@@ -171,9 +198,9 @@ static void ggml_zendnn_compute_forward_mul_mat(
 
     for (int64_t i13 = 0; i13 < ne13; i13++) {
         for (int64_t i12 = 0; i12 < ne12; i12++) {
-            const void* wdata = src1->type == vec_dot_type ? src1->data : work_data;
+            const void* wdata = (src1->type == vec_dot_type || src0->type == GGML_TYPE_Q8_0) ? src1->data : work_data;
             const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-            if (!ggml_zendnn_sgemm(ctx,
+            if (!ggml_zendnn_gemm(ctx,
                                   ne01,     // m
                                   ne11,     // n
                                   ne10,     // k
@@ -184,9 +211,9 @@ static void ggml_zendnn_compute_forward_mul_mat(
                                   static_cast<char *>(dst->data) + i12*nb2 + i13*nb3,
                                   ne01,     // ldc
                                   src0->type,
-                                  vec_dot_type,
+                                  src0->type == GGML_TYPE_Q8_0 ? GGML_TYPE_F32 : vec_dot_type,
                                   dst->type))
-                GGML_ABORT("%s: ZenDNN sgemm failed\n", __func__);
+                GGML_ABORT("%s: ZenDNN gemm failed\n", __func__);
         }
     }
 }
@@ -261,10 +288,15 @@ static void ggml_zendnn_compute_forward_mul_mat_id(
     const size_t nbw1 = row_size;
     const size_t nbw2 = nbw1 * ne11;
     const size_t nbw3 = nbw2 * ne12;
-    const size_t src1_conv_size = (src1->type != vec_dot_type) ? ne13 * nbw3 : 0;
+    const size_t src1_conv_size = (src1->type != vec_dot_type && src0->type != GGML_TYPE_Q8_0) ? ne13 * nbw3 : 0;
+
+    // For Q8_0, src1 is always F32; the gather buffer must hold F32 rows (ne10*4 bytes),
+    // not Q8_0-encoded rows (row_size ≈ ne10/32*34 bytes) — they differ by ~4x.
+    const size_t f32_row_size = (size_t)ne10 * sizeof(float);
+    const size_t gather_row_size = (src0->type == GGML_TYPE_Q8_0) ? f32_row_size : row_size;
 
     // size for MoE gather/scatter buffers
-    const size_t wdata_cur_size = max_rows * row_size;
+    const size_t wdata_cur_size = max_rows * gather_row_size;
     const size_t dst_cur_size = max_rows * ggml_row_size(dst->type, ne01);
 
     // allocate single buffer for all needs
@@ -279,7 +311,8 @@ static void ggml_zendnn_compute_forward_mul_mat_id(
     char * wdata_cur = work_data + src1_conv_size;
     char * dst_cur = wdata_cur + wdata_cur_size;
 
-    if (src1->type != vec_dot_type) {
+    // ZenDNN requires FP32 for dynamic quantization, so conversion is skipped
+    if (src1->type != vec_dot_type && src0->type != GGML_TYPE_Q8_0) {
         GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
         #pragma omp parallel for collapse(3) num_threads(ctx->n_threads) schedule(static)
@@ -294,7 +327,7 @@ static void ggml_zendnn_compute_forward_mul_mat_id(
         }
     }
 
-    const void * wdata = src1->type == vec_dot_type ? src1->data : work_data;
+    const void * wdata = (src1->type == vec_dot_type || src0->type == GGML_TYPE_Q8_0) ? src1->data : work_data;
 
     // process each expert with gather -> gemm -> scatter pattern
     for (int64_t cur_a = 0; cur_a < n_as; ++cur_a) {
@@ -315,14 +348,14 @@ static void ggml_zendnn_compute_forward_mul_mat_id(
             const int64_t i12 = row_mapping.i2;
 
             std::memcpy(
-                wdata_cur + ir1 * row_size,
-                (const char *) wdata + (i11 + i12*ne11) * row_size,
-                row_size
+                wdata_cur + ir1 * gather_row_size,
+                (const char *) wdata + (i11 + i12*ne11) * gather_row_size,
+                gather_row_size
             );
         }
 
         // batched gemm for all tokens in this expert
-        if (!ggml_zendnn_sgemm(ctx,
+        if (!ggml_zendnn_gemm(ctx,
                               ne01,       // m
                               cne1,       // n
                               ne10,       // k
@@ -333,9 +366,9 @@ static void ggml_zendnn_compute_forward_mul_mat_id(
                               dst_cur,
                               ne01,       // ldc
                               src0->type,
-                              vec_dot_type,
+                              src0->type == GGML_TYPE_Q8_0 ? GGML_TYPE_F32 : vec_dot_type,
                               dst->type)) {
-            GGML_ABORT("%s: ZenDNN sgemm failed\n", __func__);
+            GGML_ABORT("%s: ZenDNN gemm failed\n", __func__);
         }
 
         // scatter output rows to destination
@@ -577,6 +610,7 @@ static bool ggml_backend_zendnn_device_supports_op(ggml_backend_dev_t dev, const
             switch (weights->type) {
                 case GGML_TYPE_F32:
                 case GGML_TYPE_BF16:
+                case GGML_TYPE_Q8_0:
                     return true;
                 default:
                     return false;
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index ab3cc974867..5e198618251 100644
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -228,9 +228,18 @@ struct gguf_context {
 };
 
 struct gguf_reader {
-    gguf_reader(FILE * file) : file(file) {
-        // read the remaining bytes once and update on each read
-        nbytes_remain = file_remain(file);
+    gguf_reader(
+            gguf_reader_callback_t callback,
+            void * userdata,
+            size_t max_chunk_read,
+            uint64_t data_offset = 0,
+            uint64_t nbytes_remain = 0)
+        : callback(callback),
+          userdata(userdata),
+          max_chunk_read(max_chunk_read),
+          data_offset(data_offset),
+          nbytes_remain(nbytes_remain) {
+        GGML_ASSERT(max_chunk_read > 0);
     }
 
     // helper for remaining bytes in a file
@@ -257,12 +266,10 @@ struct gguf_reader {
     template <typename T>
     bool read(T & dst) const {
         const size_t size = sizeof(dst);
-        if (nbytes_remain < size) {
+        if (size > nbytes_remain) {
             return false;
         }
-        const size_t nread = fread(&dst, 1, size, file);
-        nbytes_remain -= nread;
-        return nread == size;
+        return read_raw(&dst, size) == size;
     }
 
     template <typename T>
@@ -344,24 +351,71 @@ struct gguf_reader {
             return false;
         }
         dst.resize(static_cast<size_t>(size));
-        const size_t nread = fread(dst.data(), 1, size, file);
-        nbytes_remain -= nread;
-        return nread == size;
+        return read_raw(dst.data(), static_cast<size_t>(size)) == size;
     }
 
     bool read(void * dst, const size_t size) const {
         if (size > nbytes_remain) {
             return false;
         }
-        const size_t nread = fread(dst, 1, size, file);
-        nbytes_remain -= nread;
-        return nread == size;
+        return read_raw(dst, size) == size;
+    }
+
+    uint64_t tell() const {
+        return data_offset;
+    }
+
+    bool seek(uint64_t absolute_offset) const {
+        const uint64_t end_offset = uint64_t(data_offset) + nbytes_remain;
+        if (absolute_offset > end_offset) {
+            return false;
+        }
+
+        data_offset = absolute_offset;
+        nbytes_remain = end_offset - absolute_offset;
+
+        return true;
     }
 
 private:
-    FILE * file;
+    size_t read_raw(void * dst, size_t size) const {
+        if (callback == nullptr || size == 0) {
+            return 0;
+        }
+
+        uint8_t * data = static_cast<uint8_t *>(dst);
+        size_t total_nread = 0;
+        bool reached_eof = false;
 
-    mutable uint64_t nbytes_remain;
+        while (total_nread < size) {
+            const size_t chunk_size = std::min(max_chunk_read, size - total_nread);
+            if (data_offset + total_nread < data_offset) {
+                break;
+            }
+            const size_t nread = callback(userdata, static_cast<void *>(data + total_nread), data_offset + total_nread, chunk_size);
+            total_nread += nread;
+            if (nread != chunk_size) {
+                reached_eof = true;
+                break;
+            }
+        }
+
+        data_offset += total_nread;
+        GGML_ASSERT(total_nread <= nbytes_remain);
+        nbytes_remain -= total_nread;
+
+        if (reached_eof) {
+            nbytes_remain = 0;
+        }
+
+        return total_nread;
+    }
+
+    gguf_reader_callback_t callback = nullptr;
+    void * userdata = nullptr;
+    size_t max_chunk_read = 0;
+    mutable uint64_t data_offset = 0;
+    mutable uint64_t nbytes_remain = 0;
 };
 
 struct gguf_context * gguf_init_empty(void) {
@@ -394,12 +448,7 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct
     return true;
 }
 
-struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) {
-    if (!file) {
-        return nullptr;
-    }
-
-    const struct gguf_reader gr(file);
+static struct gguf_context * gguf_init_from_reader(const struct gguf_reader & gr, struct gguf_init_params params) {
     struct gguf_context * ctx = new gguf_context;
 
     bool ok = true;
@@ -700,14 +749,14 @@ struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_para
     GGML_ASSERT(int64_t(ctx->info.size()) == n_tensors);
 
     // we require the data section to be aligned, so take into account any padding
-    if (gguf_fseek(file, GGML_PAD(gguf_ftell(file), ctx->alignment), SEEK_SET) != 0) {
+    if (n_tensors > 0 && !gr.seek(GGML_PAD(gr.tell(), ctx->alignment))) {
         GGML_LOG_ERROR("%s: failed to seek to beginning of data section\n", __func__);
         gguf_free(ctx);
         return nullptr;
     }
 
     // store the current file offset - this is where the data section starts
-    ctx->offset = gguf_ftell(file);
+    ctx->offset = gr.tell();
 
     // compute the total size of the data section, taking into account the alignment
     {
@@ -844,6 +893,89 @@ struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_para
     return ctx;
 }
 
+struct gguf_context * gguf_init_from_callback(gguf_reader_callback_t callback, void * userdata, size_t max_chunk_read, uint64_t max_expected_size, struct gguf_init_params params) {
+    if (callback == nullptr) {
+        return nullptr;
+    }
+
+    const struct gguf_reader gr(callback, userdata, max_chunk_read == 0 ? SIZE_MAX : max_chunk_read, 0, max_expected_size);
+    return gguf_init_from_reader(gr, params);
+}
+
+struct gguf_file_reader {
+    FILE * file;
+    uint64_t offset;
+};
+
+static size_t gguf_file_reader_callback(void * userdata, void * output, uint64_t offset, size_t len) {
+    GGML_ASSERT(len > 0);
+
+    gguf_file_reader & reader = *static_cast<gguf_file_reader *>(userdata);
+
+    if (reader.offset != offset) {
+        if (offset > INT64_MAX || gguf_fseek(reader.file, static_cast<int64_t>(offset), SEEK_SET) != 0) {
+            return 0;
+        }
+
+        reader.offset = offset;
+    }
+
+    const size_t nread = fread(static_cast<uint8_t *>(output), 1, len, reader.file);
+    reader.offset += nread;
+    return nread;
+}
+
+struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) {
+    if (!file) {
+        return nullptr;
+    }
+
+    const int64_t cur = gguf_ftell(file);
+    if (cur < 0) {
+        return nullptr;
+    }
+
+    gguf_file_reader reader = {
+        /*.file   = */ file,
+        /*.offset = */ static_cast<uint64_t>(cur),
+    };
+    const struct gguf_reader gr(gguf_file_reader_callback, &reader, SIZE_MAX, reader.offset, gguf_reader::file_remain(file));
+    return gguf_init_from_reader(gr, params);
+}
+
+struct gguf_buffer_reader {
+    const uint8_t * data;
+    size_t          size;
+};
+
+static size_t gguf_buffer_reader_callback(void * userdata, void * output, uint64_t offset, size_t len) {
+    GGML_ASSERT(len > 0);
+
+    const gguf_buffer_reader & reader = *static_cast<gguf_buffer_reader *>(userdata);
+
+    if (offset > reader.size || len > reader.size - offset) {
+        return 0;
+    }
+
+    const size_t data_offset = static_cast<size_t>(offset);
+    const size_t nread = std::min(len, reader.size - data_offset);
+    memcpy(static_cast<uint8_t *>(output), reader.data + data_offset, nread);
+    return nread;
+}
+
+struct gguf_context * gguf_init_from_buffer(const void * data, size_t size, struct gguf_init_params params) {
+    if (data == nullptr || size == 0) {
+        return nullptr;
+    }
+
+    gguf_buffer_reader reader = {
+        /*.data = */ static_cast<const uint8_t *>(data),
+        /*.size = */ size,
+    };
+    const struct gguf_reader gr(gguf_buffer_reader_callback, &reader, SIZE_MAX, 0, size);
+    return gguf_init_from_reader(gr, params);
+}
+
 struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
     FILE * file = ggml_fopen(fname, "rb");
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index c25f217f990..b0624030c04 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -430,6 +430,7 @@ class MODEL_ARCH(IntEnum):
     GEMMA3           = auto()
     GEMMA3N          = auto()
     GEMMA4           = auto()
+    GEMMA4_ASSISTANT = auto()
     GEMMA_EMBEDDING  = auto()
     STARCODER2       = auto()
     RWKV6            = auto()
@@ -505,6 +506,7 @@ class MODEL_ARCH(IntEnum):
     LLAMA_EMBED      = auto()
     MAINCODER        = auto()
     KIMI_LINEAR      = auto()
+    TALKIE           = auto()
 
 
 class VISION_PROJECTOR_TYPE(IntEnum):
@@ -747,7 +749,7 @@ class MODEL_TENSOR(IntEnum):
     V_LAYER_OUT_SCALE    = auto()
     V_PRE_NORM           = auto()
     V_POST_NORM          = auto()
-    V_MM_PRE_NORM        = auto() # hunyuanocr
+    V_MM_PRE_NORM        = auto() # hunyuanvl
     V_MM_POST_NORM       = auto()
     V_MM_INP_NORM        = auto()
     V_MM_INP_PROJ        = auto() # gemma3
@@ -791,8 +793,8 @@ class MODEL_TENSOR(IntEnum):
     V_MM_GATE            = auto() # cogvlm
     V_TOK_BOI            = auto() # cogvlm
     V_TOK_EOI            = auto() # cogvlm
-    V_TOK_IMG_BEGIN      = auto() # hunyuanocr
-    V_TOK_IMG_END        = auto() # hunyuanocr
+    V_TOK_IMG_BEGIN      = auto() # hunyuanvl
+    V_TOK_IMG_END        = auto() # hunyuanvl
     V_STD_BIAS           = auto() # gemma4
     V_STD_SCALE          = auto() # gemma4
     V_SAM_POS_EMBD       = auto() # Deepseek-OCR
@@ -856,6 +858,8 @@ class MODEL_TENSOR(IntEnum):
     A_PER_DIM_K_SCALE     = auto() # gemma4
     A_PER_DIM_SCALE       = auto() # gemma4
     # nextn/mtp
+    NEXTN_PRE_PROJ       = auto()
+    NEXTN_POST_PROJ      = auto()
     NEXTN_EH_PROJ        = auto()
     NEXTN_EMBED_TOKENS   = auto()
     NEXTN_ENORM          = auto()
@@ -945,6 +949,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.GEMMA3:           "gemma3",
     MODEL_ARCH.GEMMA3N:          "gemma3n",
     MODEL_ARCH.GEMMA4:           "gemma4",
+    MODEL_ARCH.GEMMA4_ASSISTANT: "gemma4-assistant",
     MODEL_ARCH.GEMMA_EMBEDDING:  "gemma-embedding",
     MODEL_ARCH.STARCODER2:       "starcoder2",
     MODEL_ARCH.RWKV6:            "rwkv6",
@@ -1021,6 +1026,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.LLAMA_EMBED:      "llama-embed",
     MODEL_ARCH.MAINCODER:        "maincoder",
     MODEL_ARCH.KIMI_LINEAR:      "kimi-linear",
+    MODEL_ARCH.TALKIE:           "talkie",
 }
 
 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -1401,6 +1407,8 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.A_QF_FFN_DOWN:             "a.proj_blk.{bid}.ffn_down",
     MODEL_TENSOR.A_QF_FFN_NORM:             "a.proj_blk.{bid}.ffn_norm",
     # NextN/MTP
+    MODEL_TENSOR.NEXTN_PRE_PROJ:            "nextn.pre_projection",
+    MODEL_TENSOR.NEXTN_POST_PROJ:           "nextn.post_projection",
     MODEL_TENSOR.NEXTN_EH_PROJ:             "blk.{bid}.nextn.eh_proj",
     MODEL_TENSOR.NEXTN_EMBED_TOKENS:        "blk.{bid}.nextn.embed_tokens",
     MODEL_TENSOR.NEXTN_ENORM:               "blk.{bid}.nextn.enorm",
@@ -2481,6 +2489,24 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.PER_LAYER_PROJ_NORM,
         MODEL_TENSOR.PER_LAYER_POST_NORM,
     ],
+    MODEL_ARCH.GEMMA4_ASSISTANT: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.NEXTN_PRE_PROJ,
+        MODEL_TENSOR.NEXTN_POST_PROJ,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_POST_NORM,
+        MODEL_TENSOR.FFN_PRE_NORM,
+        MODEL_TENSOR.FFN_POST_NORM,
+        MODEL_TENSOR.LAYER_OUT_SCALE,
+    ],
     MODEL_ARCH.GEMMA_EMBEDDING: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT,
@@ -4013,6 +4039,19 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
     ],
+    MODEL_ARCH.TALKIE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.LAYER_OUT_SCALE,
+    ],
     # TODO
 }
 
@@ -4273,7 +4312,6 @@ class VisionProjectorType:
     GLM4V = "glm4v"
     YOUTUVL = "youtuvl"
     NEMOTRON_V2_VL = "nemotron_v2_vl"
-    HUNYUANOCR     = "hunyuanocr"
     HUNYUANVL      = "hunyuanvl"
     MINICPMV4_6    = "minicpmv4_6"
     GRANITE_SPEECH = "granite_speech"  # audio
diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py
index 1d9d9ab7d70..80966b6ef15 100644
--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@@ -28,6 +28,7 @@ def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizati
 # This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
 def _apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
     rows = arr.reshape((-1, arr.shape[-1]))
+    assert len(rows.shape)
     osize = 1
     for dim in oshape:
         osize *= dim
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index f40cb828201..66746f08d13 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -34,6 +34,7 @@ class TensorNameMap:
             "encoder",                                   # neobert
             "model.transformer.wte",                     # llada
             "embed_tokens",                              # qwen3-embedding
+            "model.embed",                               # talkie
         ),
 
         # Token type embeddings
@@ -259,6 +260,7 @@ class TensorNameMap:
             "model.transformer.blocks.{bid}.q_proj",                     # llada
             "layers.{bid}.self_attn.q_proj",                             # qwen3-embedding
             "backbone.layers.{bid}.mixer.q_proj",                        # nemotron-h
+            "model.blocks.{bid}.attn.attn_query",                        # talkie
         ),
 
         # Attention key
@@ -279,6 +281,7 @@ class TensorNameMap:
             "model.transformer.blocks.{bid}.k_proj",                   # llada
             "layers.{bid}.self_attn.k_proj",                           # qwen3-embedding
             "backbone.layers.{bid}.mixer.k_proj",                      # nemotron-h
+            "model.blocks.{bid}.attn.attn_key",                        # talkie
         ),
 
         # Attention value
@@ -298,6 +301,7 @@ class TensorNameMap:
             "model.transformer.blocks.{bid}.v_proj",                     # llada
             "layers.{bid}.self_attn.v_proj",                             # qwen3-embedding
             "backbone.layers.{bid}.mixer.v_proj",                        # nemotron-h
+            "model.blocks.{bid}.attn.attn_value",                        # talkie
         ),
 
         # Attention output
@@ -336,6 +340,7 @@ class TensorNameMap:
             "layers.{bid}.self_attn.o_proj",                                # qwen3-embedding
             "backbone.layers.{bid}.mixer.o_proj",                           # nemotron-h
             "model.layers.{bid}.self_attn.language_expert_dense",           # cogvlm
+            "model.blocks.{bid}.attn.attn_resid",                           # talkie
         ),
 
         # Attention output norm
@@ -508,6 +513,7 @@ class TensorNameMap:
             "layers.{bid}.mlp.up_proj",                               # qwen3-embedding
             "backbone.layers.{bid}.mixer.up_proj",                    # nemotron-h
             "model.layers.{bid}.mlp.language_mlp.up_proj",            # cogvlm
+            "model.blocks.{bid}.mlp.mlp_linear",                      # talkie
         ),
 
         MODEL_TENSOR.FFN_UP_EXP: (
@@ -561,6 +567,7 @@ class TensorNameMap:
             "model.transformer.blocks.{bid}.ff_proj",         # llada
             "layers.{bid}.mlp.gate_proj",                     # qwen3-embedding
             "model.layers.{bid}.mlp.language_mlp.gate_proj",  # cogvlm
+            "model.blocks.{bid}.mlp.mlp_gate",                # talkie
         ),
 
         MODEL_TENSOR.FFN_GATE_EXP: (
@@ -636,6 +643,7 @@ class TensorNameMap:
             "layers.{bid}.mlp.down_proj",                             # qwen3-embedding
             "backbone.layers.{bid}.mixer.down_proj",                  # nemotron-h
             "model.layers.{bid}.mlp.language_mlp.down_proj",          # cogvlm
+            "model.blocks.{bid}.mlp.mlp_resid",                       # talkie
         ),
 
         MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -682,6 +690,7 @@ class TensorNameMap:
             "model.layers.layers.{bid}.mixer.q_norm",                         # plamo3
             "layers.{bid}.self_attn.q_norm",                                  # qwen3-embedding
             "model.layers.{bid}.attention.query_layernorm",                   # apertus
+            "model.blocks.{bid}.attn.head_gain.head_g",                       # talkie
         ),
 
         MODEL_TENSOR.ATTN_K_NORM: (
@@ -716,6 +725,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.LAYER_OUT_SCALE: (
             "model.layers.{bid}.layer_scalar", # gemma4
+            "model.blocks.{bid}.embed_skip.a_g", # talkie
         ),
 
         MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: (
@@ -1366,7 +1376,7 @@ class TensorNameMap:
             "mlp_AR.linear_{bid}", # PaddleOCR-VL
             "merger.mlp.{bid}",
             "vision_tower.merger.mlp.{bid}", # dots.ocr
-            "vit.perceive.proj.{bid}", # HunyuanOCR (proj.0 = conv1, proj.2 = conv2)
+            "vit.perceive.proj.{bid}", # HunyuanVL (proj.0 = conv1, proj.2 = conv2)
         ),
 
         MODEL_TENSOR.V_MMPROJ_FC: (
@@ -1374,7 +1384,7 @@ class TensorNameMap:
             "model.vision.linear_proj.linear_proj", # cogvlm
             "model.projector.layers", # Deepseek-OCR
             "visual.merger.proj", # glm4v
-            "vit.perceive.mlp", # HunyuanOCR
+            "vit.perceive.mlp", # HunyuanVL
         ),
 
         MODEL_TENSOR.V_MMPROJ_MLP: (
@@ -1403,7 +1413,7 @@ class TensorNameMap:
             "model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
             "vpm.embeddings.patch_embedding",
             "model.vision_model.embeddings.patch_embedding", # SmolVLM
-            "vit.embeddings.patch_embedding", # HunyuanOCR
+            "vit.embeddings.patch_embedding", # HunyuanVL
             "vision_tower.patch_conv", # pixtral-hf
             "vision_encoder.patch_conv", # pixtral
             "vision_model.patch_embedding.linear", # llama 4
@@ -1429,7 +1439,7 @@ class TensorNameMap:
             "model.vision_tower.embeddings.position_embeddings", # Intern-S1
             "vpm.embeddings.position_embedding",
             "model.vision_model.embeddings.position_embedding", # SmolVLM
-            "vit.embeddings.position_embedding", # HunyuanOCR
+            "vit.embeddings.position_embedding", # HunyuanVL
             "vision_model.positional_embedding_vlm", # llama 4
             "vision_tower.patch_embed.pos_emb", # kimi-vl
             "visual.pos_embed", # qwen3vl
@@ -1442,12 +1452,12 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
             "model.image_newline",  # Deepseek-OCR
-            "vit.perceive.image_newline", # HunyuanOCR
+            "vit.perceive.image_newline", # HunyuanVL
         ),
 
         MODEL_TENSOR.V_ENC_EMBD_VSEP: (
             "model.view_seperator",  # Deepseek-OCR
-            "vit.perceive.image_sep", # HunyuanOCR
+            "vit.perceive.image_sep", # HunyuanVL
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_QKV: (
@@ -1466,7 +1476,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1
             "vpm.encoder.layers.{bid}.self_attn.q_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
-            "vit.layers.{bid}.self_attn.q_proj", # HunyuanOCR
+            "vit.layers.{bid}.self_attn.q_proj", # HunyuanVL
             "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
             "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
@@ -1490,7 +1500,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1
             "vpm.encoder.layers.{bid}.self_attn.k_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
-            "vit.layers.{bid}.self_attn.k_proj", # HunyuanOCR
+            "vit.layers.{bid}.self_attn.k_proj", # HunyuanVL
             "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
             "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
@@ -1514,7 +1524,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1
             "vpm.encoder.layers.{bid}.self_attn.v_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
-            "vit.layers.{bid}.self_attn.v_proj", # HunyuanOCR
+            "vit.layers.{bid}.self_attn.v_proj", # HunyuanVL
             "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
             "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
@@ -1532,7 +1542,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1
             "vpm.encoder.layers.{bid}.layer_norm1",
             "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
-            "vit.layers.{bid}.input_layernorm", # HunyuanOCR
+            "vit.layers.{bid}.input_layernorm", # HunyuanVL
             "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral
             "vision_model.model.layers.{bid}.input_layernorm", # llama4, gemma4
@@ -1553,7 +1563,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
             "vpm.encoder.layers.{bid}.self_attn.out_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
-            "vit.layers.{bid}.self_attn.o_proj", # HunyuanOCR
+            "vit.layers.{bid}.self_attn.o_proj", # HunyuanVL
             "model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro
             "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
             "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
@@ -1580,7 +1590,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1
             "vpm.encoder.layers.{bid}.layer_norm2",
             "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
-            "vit.layers.{bid}.post_attention_layernorm", # HunyuanOCR
+            "vit.layers.{bid}.post_attention_layernorm", # HunyuanVL
             "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
             "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral
@@ -1601,7 +1611,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1
             "vpm.encoder.layers.{bid}.mlp.fc1",
             "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
-            "vit.layers.{bid}.mlp.dense_h_to_4h", # HunyuanOCR
+            "vit.layers.{bid}.mlp.dense_h_to_4h", # HunyuanVL
             "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral
             "vision_model.model.layers.{bid}.mlp.fc1", # llama4
@@ -1630,7 +1640,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1
             "vpm.encoder.layers.{bid}.mlp.fc2",
             "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
-            "vit.layers.{bid}.mlp.dense_4h_to_h", # HunyuanOCR
+            "vit.layers.{bid}.mlp.dense_4h_to_h", # HunyuanVL
             "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral
             "vision_model.model.layers.{bid}.mlp.fc2", # llama4
@@ -1694,7 +1704,7 @@ class TensorNameMap:
         MODEL_TENSOR.V_MM_POST_NORM: (
             "visual.merger.post_projection_norm", # glm4v
             "vision_tower.post_trunk_norm", # dots.ocr
-            "vit.perceive.after_rms", # HunyuanOCR
+            "vit.perceive.after_rms", # HunyuanVL
         ),
 
         MODEL_TENSOR.V_MM_INP_PROJ: (
@@ -1899,15 +1909,15 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.V_MM_PRE_NORM: (
-            "vit.perceive.before_rms", # HunyuanOCR
+            "vit.perceive.before_rms", # HunyuanVL
         ),
 
         MODEL_TENSOR.V_TOK_IMG_BEGIN: (
-            "vit.perceive.image_begin", # HunyuanOCR
+            "vit.perceive.image_begin", # HunyuanVL
         ),
 
         MODEL_TENSOR.V_TOK_IMG_END: (
-            "vit.perceive.image_end", # HunyuanOCR
+            "vit.perceive.image_end", # HunyuanVL
         ),
 
         MODEL_TENSOR.V_STD_BIAS: (
@@ -2245,6 +2255,14 @@ class TensorNameMap:
         ),
 
         # NextN/MTP tensors
+        MODEL_TENSOR.NEXTN_PRE_PROJ: (
+            "pre_projection",
+        ),
+
+        MODEL_TENSOR.NEXTN_POST_PROJ: (
+            "post_projection",
+        ),
+
         MODEL_TENSOR.NEXTN_EH_PROJ: (
             "model.layers.{bid}.eh_proj",
         ),
diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml
index b2a36460ad8..d11c34a2186 100644
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -37,7 +37,7 @@ packages = [
 [tool.poetry.dependencies]
 python = ">=3.10"
 
-[tool.poetry.dev-dependencies]
+[tool.poetry.group.dev.dependencies]
 pytest = "^5.2"
 
 [build-system]
diff --git a/include/llama.h b/include/llama.h
index 75095b22d08..e8374c53b70 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -874,7 +874,8 @@ extern "C" {
 // work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba)
 #define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1
 
-// keeps the tensor data on device buffers (i.e. not accessible in host memory, but faster save/load)
+// Keeps the tensor data on device buffers (i.e. not accessible in host memory, but faster save/load).
+// Getting the state for a seq_id with this flag invalidates all prior states gotten for that seq_id with this flag.
 #define LLAMA_STATE_SEQ_FLAGS_ON_DEVICE 2
 
     typedef uint32_t llama_state_seq_flags;
diff --git a/pyproject.toml b/pyproject.toml
index a39c4f010bb..e4f8c86b951 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,13 +6,13 @@ version = "0.0.0"
 dynamic = ["classifiers"]
 readme = "README.md"
 authors = [{name = "GGML", email = "ggml@ggml.ai"}]
-requires-python = '>=3.10'
+requires-python = '>=3.10,<3.15'
 dependencies = [
-    'numpy (>=1.25.0,<2.0.0)',
+    'numpy (>=1.26.4,<3.0.0)',
     'sentencepiece (>=0.1.98,<0.3.0)',
     'transformers (==5.5.1)',
-    'protobuf (>=4.21.0)',
-    'torch (>=2.2.0,<3.0.0)',
+    'protobuf (>=4.21.0,<5.0.0)',
+    'torch (>=2.6.0,<3.0.0)',
     'gguf @ ./gguf-py',
 ]
 classifiers = [
@@ -32,17 +32,20 @@ llama-convert-llama-ggml-to-gguf = "convert_llama_ggml_to_gguf:main"
 llama-ggml-vk-generate-shaders = "ggml_vk_generate_shaders:main"
 
 [tool.poetry]
-packages = [{ include = "*.py", from = "." }]
+packages = [
+    { include = "*.py", from = "." },
+    { include = "conversion", from = "." },
+]
 
 [tool.poetry.dependencies]
 torch = [
-    { version = "~=2.6.0", source = "pypi", markers = "sys_platform == 'darwin'" },
-    { version = "~=2.6.0+cpu", source = "pytorch", markers = "sys_platform == 'linux'" },
-    { version = "~=2.6.0", source = "pypi", markers = "sys_platform == 'win32'" }
+    { version = "==2.11.0", source = "pypi", markers = "sys_platform == 'darwin'" },
+    { version = "==2.11.0+cpu", source = "pytorch", markers = "sys_platform == 'linux'" },
+    { version = "==2.11.0", source = "pypi", markers = "sys_platform == 'win32'" },
 ]
 
 [tool.poetry.group.dev.dependencies]
-pytest = "^5.2"
+pytest = "~=8.3.3"
 
 # Force wheel + cpu
 # For discussion and context see https://github.com/python-poetry/poetry#6409
diff --git a/requirements/requirements-convert_hf_to_gguf.txt b/requirements/requirements-convert_hf_to_gguf.txt
index 122b4788d91..f80fdc1f640 100644
--- a/requirements/requirements-convert_hf_to_gguf.txt
+++ b/requirements/requirements-convert_hf_to_gguf.txt
@@ -1,8 +1,8 @@
 -r ./requirements-convert_legacy_llama.txt
 --extra-index-url https://download.pytorch.org/whl/cpu
 
-## Embedding Gemma requires PyTorch 2.6.0 or later
-torch~=2.6.0; platform_machine != "s390x"
+## Embedding Gemma requires PyTorch 2.6.0 or later, bumped to 2.11.0 for compatibility
+torch==2.11.0; platform_machine != "s390x"
 
 # torch s390x packages can only be found from nightly builds
 --extra-index-url https://download.pytorch.org/whl/nightly
diff --git a/scripts/snapdragon/adb/run-bench.sh b/scripts/snapdragon/adb/run-bench.sh
index 27459df241b..bbe7146b444 100755
--- a/scripts/snapdragon/adb/run-bench.sh
+++ b/scripts/snapdragon/adb/run-bench.sh
@@ -45,5 +45,5 @@ adb $adbserial $adbhost shell " \
   ADSP_LIBRARY_PATH=$basedir/$branch/lib \
     $ndev $nhvx $opmask $verbose $profile $hb ./$branch/bin/llama-bench --device $device --mmap 0 -m $basedir/../gguf/$model \
         --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
-        --ubatch-size 256 -fa 1 -ngl 99 $cli_opts $@    \
+        --ubatch-size 1024 -fa 1 -ngl 99 $cli_opts $@   \
 "
diff --git a/scripts/snapdragon/adb/run-cli.sh b/scripts/snapdragon/adb/run-cli.sh
index 3f89a7777c6..48127dfa252 100755
--- a/scripts/snapdragon/adb/run-cli.sh
+++ b/scripts/snapdragon/adb/run-cli.sh
@@ -73,6 +73,6 @@ adb $adbserial $adbhost shell " \
     $verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $opflt $vmem $mbuf \
       ./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model \
          --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1           \
-         --ctx-size 8192 --ubatch-size 256 -fa on                  \
+         --ctx-size 8192 --ubatch-size 1024 -fa on                 \
          -ngl 99 --device $device $cli_opts $@                     \
 "
diff --git a/scripts/snapdragon/adb/run-completion.sh b/scripts/snapdragon/adb/run-completion.sh
index 1b904c77b3f..fe14bb14225 100755
--- a/scripts/snapdragon/adb/run-completion.sh
+++ b/scripts/snapdragon/adb/run-completion.sh
@@ -51,6 +51,9 @@ opbatch=
 opqueue=
 [ "$OQ" != "" ] && opqueue="GGML_HEXAGON_OPQUEUE=$OQ"
 
+oppoll=
+[ "$OP" != "" ] && oppoll="GGML_HEXAGON_OPPOLL=$OP"
+
 opflt=
 [ "$OF" != "" ] && opflt="GGML_HEXAGON_OPFILTER=$OF"
 
@@ -66,9 +69,9 @@ adb $adbserial $adbhost shell " \
   cd $basedir; ulimit -c unlimited;        \
     LD_LIBRARY_PATH=$basedir/$branch/lib   \
     ADSP_LIBRARY_PATH=$basedir/$branch/lib \
-    $verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $opflt $vmem $mbuf \
+    $verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll $opflt $vmem $mbuf \
       ./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \
          --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1                  \
-         --ctx-size 8192 --ubatch-size 256 -fa on                         \
-         -ngl 99 --device $device $cli_opts $@                    \
+         --ctx-size 8192 --ubatch-size 1024 -fa on                        \
+         -ngl 99 --device $device $cli_opts $@                            \
 "
diff --git a/scripts/snapdragon/adb/run-mtmd.sh b/scripts/snapdragon/adb/run-mtmd.sh
index 38467beba3d..992045cb9b3 100755
--- a/scripts/snapdragon/adb/run-mtmd.sh
+++ b/scripts/snapdragon/adb/run-mtmd.sh
@@ -66,6 +66,6 @@ adb $adbserial $adbhost shell " \
          --mmproj $basedir/../gguf/$mmproj                                   \
          --image $basedir/../gguf/$image                                     \
          --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1                     \
-         --ctx-size 8192 --ubatch-size 256 -fa on                            \
+         --ctx-size 8192 --ubatch-size 1024 -fa on                           \
          -ngl 99 --device $device -v $cli_opts $@                            \
 "
diff --git a/scripts/snapdragon/adb/run-tool.sh b/scripts/snapdragon/adb/run-tool.sh
index 27cbb2b6d05..6d7e32b3218 100755
--- a/scripts/snapdragon/adb/run-tool.sh
+++ b/scripts/snapdragon/adb/run-tool.sh
@@ -42,6 +42,15 @@ ndev=
 hb=
 [ "$HB" != "" ] && hb="GGML_HEXAGON_HOSTBUF=$HB"
 
+opbatch=
+[ "$OB" != "" ] && opbatch="GGML_HEXAGON_OPBATCH=$OB"
+
+opqueue=
+[ "$OQ" != "" ] && opqueue="GGML_HEXAGON_OPQUEUE=$OQ"
+
+oppoll=
+[ "$OP" != "" ] && oppoll="GGML_HEXAGON_OPPOLL=$OP"
+
 set -x
 
 tool=$1; shift
@@ -50,5 +59,5 @@ adb $adbserial $adbhost shell " \
   cd $basedir; ulimit -c unlimited;        \
     LD_LIBRARY_PATH=$basedir/$branch/lib   \
     ADSP_LIBRARY_PATH=$basedir/$branch/lib \
-    $verbose $sched $opmask $profile $nhvx $hmx $ndev $hb ./$branch/bin/$tool $@ \
+    $verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll ./$branch/bin/$tool $@ \
 "
diff --git a/scripts/snapdragon/windows/run-bench.ps1 b/scripts/snapdragon/windows/run-bench.ps1
index 8bf6939d2c0..5ee81df6889 100644
--- a/scripts/snapdragon/windows/run-bench.ps1
+++ b/scripts/snapdragon/windows/run-bench.ps1
@@ -45,4 +45,4 @@ $env:ADSP_LIBRARY_PATH="$basedir\lib"
 & "$basedir\bin\llama-bench.exe" `
     --mmap 0 -m $basedir\..\..\gguf\$model `
     --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
-    --batch-size 128 -ngl 99 --device $device $cli_opts
+    --ubatch-size 1024 -ngl 99 --device $device $cli_opts
diff --git a/scripts/snapdragon/windows/run-cli.ps1 b/scripts/snapdragon/windows/run-cli.ps1
index 104452f9ba7..b51149bec25 100644
--- a/scripts/snapdragon/windows/run-cli.ps1
+++ b/scripts/snapdragon/windows/run-cli.ps1
@@ -49,5 +49,5 @@ $env:ADSP_LIBRARY_PATH="$basedir\lib"
 & "$basedir\bin\llama-cli.exe" `
     --no-mmap -m $basedir\..\..\gguf\$model `
     --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
-    --ctx-size 8192 --ubatch-size 256 -fa on `
+    --ctx-size 8192 --ubatch-size 1024 -fa on `
     -ngl 99 --device $device $cli_opts
diff --git a/scripts/snapdragon/windows/run-completion.ps1 b/scripts/snapdragon/windows/run-completion.ps1
index 5841a82fa99..ffce8184dc0 100644
--- a/scripts/snapdragon/windows/run-completion.ps1
+++ b/scripts/snapdragon/windows/run-completion.ps1
@@ -49,5 +49,5 @@ $env:ADSP_LIBRARY_PATH="$basedir\lib"
 & "$basedir\bin\llama-completion.exe" `
     --no-mmap -m $basedir\..\..\gguf\$model `
     --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
-    --ctx-size 8192 --batch-size 256 -fa on `
+    --ctx-size 8192 --ubatch-size 1024 -fa on `
     -ngl 99 -no-cnv --device $device $cli_opts
diff --git a/scripts/snapdragon/windows/run-mtmd.ps1 b/scripts/snapdragon/windows/run-mtmd.ps1
index be817875142..b38fae35fe4 100644
--- a/scripts/snapdragon/windows/run-mtmd.ps1
+++ b/scripts/snapdragon/windows/run-mtmd.ps1
@@ -64,5 +64,5 @@ $env:ADSP_LIBRARY_PATH="$basedir\lib"
     --mmproj $basedir\..\..\gguf\$mmproj `
     --image $basedir\..\..\gguf\$image `
     --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
-    --ctx-size 8192 --ubatch-size 256 -fa on `
+    --ctx-size 8192 --ubatch-size 1024 -fa on `
     -ngl 99 --device $device -v $cli_opts
diff --git a/scripts/snapdragon/windows/setup-build.ps1 b/scripts/snapdragon/windows/setup-build.ps1
index 0f3244cc9d2..d8ef24d4413 100644
--- a/scripts/snapdragon/windows/setup-build.ps1
+++ b/scripts/snapdragon/windows/setup-build.ps1
@@ -7,10 +7,10 @@ $ErrorActionPreference = "Stop"
 $BaseDir = "C:\Qualcomm"
 
 # SDK 1: Hexagon
-$HexagonUrl     = "https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz"
+$HexagonUrl     = "https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.6.0.0/hexagon-sdk-v6.6.0.0-arm64-wos.tar.xz"
 $HexagonParent  = Join-Path $BaseDir "Hexagon_SDK"
-$HexagonSdkVersion   = "6.4.0.2"
-$HexagonToolsVersion = "19.0.04"
+$HexagonSdkVersion   = "6.6.0.0"
+$HexagonToolsVersion = "19.0.07"
 $HexagonSdkTarget    = Join-Path $HexagonParent $HexagonSdkVersion
 $HexagonToolsTarget  = Join-Path $HexagonSdkTarget "\tools\HEXAGON_Tools\$HexagonToolsVersion"
 
diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 0fa47782fd9..a4f87b2b9ae 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-0ce7ad348a3151e1da9f65d962044546bcaad421
+e705c5fed490514458bdd2eaddc43bd098fcce9b
diff --git a/scripts/sync_vendor.py b/scripts/sync_vendor.py
index 658f7326b96..fec05d97db2 100755
--- a/scripts/sync_vendor.py
+++ b/scripts/sync_vendor.py
@@ -5,7 +5,7 @@
 import sys
 import subprocess
 
-HTTPLIB_VERSION = "refs/tags/v0.45.0"
+HTTPLIB_VERSION = "refs/tags/v0.46.0"
 
 vendor = {
     "https://github.com/nlohmann/json/releases/latest/download/json.hpp":     "vendor/nlohmann/json.hpp",
diff --git a/scripts/ui-assets.cmake b/scripts/ui-assets.cmake
new file mode 100644
index 00000000000..ae7a1cc26d3
--- /dev/null
+++ b/scripts/ui-assets.cmake
@@ -0,0 +1,342 @@
+# Provision UI assets and generate ui.cpp/ui.h.
+#
+# Asset provisioning priority:
+#   1. Pre-built assets in SRC_DIST_DIR (manually built by user)
+#   2. If BUILD_UI=ON: npm build
+#   3. If above did not produce assets and HF_ENABLED=ON: HF Bucket download
+
+cmake_minimum_required(VERSION 3.16)
+
+set(UI_SOURCE_DIR     "" CACHE STRING "UI source directory (to run npm build)")
+set(UI_BINARY_DIR     "" CACHE STRING "UI binary directory (to store generated files)")
+set(LLAMA_SOURCE_DIR  "" CACHE STRING "Project source root (to resolve version from git)")
+set(HF_BUCKET         "" CACHE STRING "Hugging Face bucket name")
+set(HF_VERSION        "" CACHE STRING "Version to download (empty = resolve from git)")
+set(HF_ENABLED        "" CACHE STRING "Whether to allow HF Bucket download (ON/OFF)")
+set(BUILD_UI          "" CACHE STRING "Build UI via npm (ON/OFF)")
+set(LLAMA_UI_EMBED    "" CACHE STRING "Path to llama-ui-embed helper")
+
+set(ASSETS
+    bundle.css
+    bundle.js
+    index.html
+    loading.html
+)
+
+set(DIST_DIR     "${UI_BINARY_DIR}/dist")
+set(SRC_DIST_DIR "${UI_SOURCE_DIR}/dist")
+set(STAMP_FILE   "${UI_BINARY_DIR}/.ui-stamp")
+set(UI_CPP       "${UI_BINARY_DIR}/ui.cpp")
+set(UI_H         "${UI_BINARY_DIR}/ui.h")
+
+function(assets_present out_var)
+    set(present TRUE)
+    foreach(asset ${ASSETS})
+        if(NOT EXISTS "${DIST_DIR}/${asset}")
+            set(present FALSE)
+            break()
+        endif()
+    endforeach()
+    set(${out_var} ${present} PARENT_SCOPE)
+endfunction()
+
+function(copy_src_dist out_var)
+    set(${out_var} FALSE PARENT_SCOPE)
+
+    foreach(asset ${ASSETS})
+        if(NOT EXISTS "${SRC_DIST_DIR}/${asset}")
+            return()
+        endif()
+    endforeach()
+
+    file(MAKE_DIRECTORY "${DIST_DIR}")
+    message(STATUS "UI: using pre-built assets from ${SRC_DIST_DIR}")
+    foreach(asset ${ASSETS})
+        execute_process(
+            COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                "${SRC_DIST_DIR}/${asset}" "${DIST_DIR}/${asset}"
+        )
+    endforeach()
+    set(${out_var} TRUE PARENT_SCOPE)
+endfunction()
+
+function(npm_build_should_skip out_var)
+    set(${out_var} FALSE PARENT_SCOPE)
+
+    assets_present(present)
+    if(NOT present)
+        return()
+    endif()
+
+    if(EXISTS "${STAMP_FILE}")
+        return()
+    endif()
+
+    if(NOT EXISTS "${UI_SOURCE_DIR}/sources.cmake")
+        return()
+    endif()
+    include("${UI_SOURCE_DIR}/sources.cmake")
+
+    set(globs "")
+    foreach(g ${UI_SOURCE_GLOBS})
+        list(APPEND globs "${UI_SOURCE_DIR}/${g}")
+    endforeach()
+    file(GLOB_RECURSE sources ${globs})
+    foreach(f ${UI_SOURCE_FILES})
+        list(APPEND sources "${UI_SOURCE_DIR}/${f}")
+    endforeach()
+
+    file(TIMESTAMP "${DIST_DIR}/index.html" out_ts)
+
+    foreach(s ${sources})
+        if(NOT EXISTS "${s}")
+            continue()
+        endif()
+        file(TIMESTAMP "${s}" s_ts)
+        if(s_ts STRGREATER out_ts)
+            return()
+        endif()
+    endforeach()
+
+    set(${out_var} TRUE PARENT_SCOPE)
+endfunction()
+
+function(npm_build out_var)
+    set(${out_var} FALSE PARENT_SCOPE)
+
+    if(NOT EXISTS "${UI_SOURCE_DIR}/package.json")
+        message(STATUS "UI: ${UI_SOURCE_DIR}/package.json not found, skipping npm")
+        return()
+    endif()
+
+    npm_build_should_skip(skip)
+    if(skip)
+        message(STATUS "UI: npm output up-to-date, skipping build")
+        set(${out_var} TRUE PARENT_SCOPE)
+        return()
+    endif()
+
+    if(CMAKE_HOST_WIN32)
+        find_program(NPM_EXECUTABLE NAMES npm.cmd npm.bat npm)
+    else()
+        find_program(NPM_EXECUTABLE npm)
+    endif()
+    if(NOT NPM_EXECUTABLE)
+        message(STATUS "UI: npm not found, skipping npm build")
+        return()
+    endif()
+
+    if(NOT EXISTS "${UI_SOURCE_DIR}/node_modules")
+        message(STATUS "UI: running npm install (first time)")
+        execute_process(
+            COMMAND ${NPM_EXECUTABLE} install
+            WORKING_DIRECTORY "${UI_SOURCE_DIR}"
+            RESULT_VARIABLE rc
+            ERROR_VARIABLE  err
+        )
+        if(NOT rc EQUAL 0)
+            message(STATUS "UI: npm install failed (${rc})")
+            message(STATUS "  stderr: ${err}")
+            return()
+        endif()
+    endif()
+
+    file(MAKE_DIRECTORY "${DIST_DIR}")
+
+    message(STATUS "UI: running npm run build, output -> ${DIST_DIR}")
+    execute_process(
+        COMMAND ${CMAKE_COMMAND} -E env "LLAMA_UI_OUT_DIR=${DIST_DIR}"
+                ${NPM_EXECUTABLE} run build
+        WORKING_DIRECTORY "${UI_SOURCE_DIR}"
+        RESULT_VARIABLE rc
+        ERROR_VARIABLE  err
+    )
+    if(NOT rc EQUAL 0)
+        message(STATUS "UI: npm run build failed (${rc})")
+        message(STATUS "  stderr: ${err}")
+        return()
+    endif()
+
+    assets_present(present)
+    if(NOT present)
+        message(STATUS "UI: npm build finished but assets missing in ${DIST_DIR}")
+        return()
+    endif()
+
+    message(STATUS "UI: npm build succeeded")
+    file(REMOVE "${STAMP_FILE}")
+    set(${out_var} TRUE PARENT_SCOPE)
+endfunction()
+
+function(resolve_version out_var)
+    if(NOT "${HF_VERSION}" STREQUAL "")
+        set(${out_var} "${HF_VERSION}" PARENT_SCOPE)
+        return()
+    endif()
+
+    if(EXISTS "${LLAMA_SOURCE_DIR}/cmake/build-info.cmake")
+        include("${LLAMA_SOURCE_DIR}/cmake/build-info.cmake")
+        if(NOT "${BUILD_NUMBER}" STREQUAL "" AND NOT BUILD_NUMBER EQUAL 0)
+            set(${out_var} "b${BUILD_NUMBER}" PARENT_SCOPE)
+            return()
+        endif()
+    endif()
+
+    set(${out_var} "" PARENT_SCOPE)
+endfunction()
+
+function(hf_download version out_var out_resolved)
+    set(${out_var}      FALSE PARENT_SCOPE)
+    set(${out_resolved} ""    PARENT_SCOPE)
+
+    file(MAKE_DIRECTORY "${DIST_DIR}")
+
+    set(candidates "")
+    if(NOT "${version}" STREQUAL "")
+        list(APPEND candidates "${version}")
+    endif()
+    list(APPEND candidates "latest")
+
+    foreach(resolved ${candidates})
+        set(base "https://huggingface.co/buckets/ggml-org/${HF_BUCKET}/resolve/${resolved}")
+
+        message(STATUS "UI: downloading from ${resolved}: ${base}")
+
+        set(ok TRUE)
+        foreach(asset ${ASSETS})
+            file(DOWNLOAD "${base}/${asset}?download=true" "${DIST_DIR}/${asset}"
+                STATUS status TIMEOUT 60
+            )
+            list(GET status 0 rc)
+            if(NOT rc EQUAL 0)
+                list(GET status 1 errmsg)
+                message(STATUS "UI: download ${asset} from ${resolved} failed: ${errmsg}")
+                set(ok FALSE)
+                break()
+            endif()
+            message(STATUS "UI: downloaded ${asset}")
+        endforeach()
+
+        if(NOT ok)
+            continue()
+        endif()
+
+        # Best-effort checksum verification
+        file(DOWNLOAD "${base}/checksums.txt?download=true" "${DIST_DIR}/checksums.txt"
+            STATUS cs_status TIMEOUT 30
+        )
+        list(GET cs_status 0 cs_rc)
+        if(cs_rc EQUAL 0)
+            message(STATUS "UI: verifying checksums")
+            file(STRINGS "${DIST_DIR}/checksums.txt" cs_lines)
+            foreach(asset ${ASSETS})
+                file(SHA256 "${DIST_DIR}/${asset}" h)
+                string(TOLOWER "${h}" h)
+                string(REGEX MATCH "${h}[ \t]+${asset}" m "${cs_lines}")
+                if(NOT m)
+                    message(WARNING "UI: checksum verification failed for ${asset}")
+                    set(ok FALSE)
+                    break()
+                endif()
+            endforeach()
+            if(ok)
+                message(STATUS "UI: all checksums verified")
+            endif()
+        endif()
+
+        if(ok)
+            set(${out_var}      TRUE         PARENT_SCOPE)
+            set(${out_resolved} "${resolved}" PARENT_SCOPE)
+            return()
+        endif()
+    endforeach()
+endfunction()
+
+function(emit_files)
+    assets_present(present)
+
+    set(args "${UI_CPP}" "${UI_H}")
+    if(present)
+        foreach(asset ${ASSETS})
+            list(APPEND args "${asset}" "${DIST_DIR}/${asset}")
+        endforeach()
+    endif()
+
+    execute_process(
+        COMMAND "${LLAMA_UI_EMBED}" ${args}
+        RESULT_VARIABLE rc
+    )
+    if(NOT rc EQUAL 0)
+        message(FATAL_ERROR "UI: llama-ui-embed failed (${rc})")
+    endif()
+endfunction()
+
+# ---------------------------------------------------------------------------
+# 1. Priority 1: pre-built assets supplied in tools/ui/dist
+# ---------------------------------------------------------------------------
+copy_src_dist(SRC_OK)
+if(SRC_OK)
+    emit_files()
+    return()
+endif()
+
+# ---------------------------------------------------------------------------
+# 2. Priority 2: npm build (if BUILD_UI=ON)
+# ---------------------------------------------------------------------------
+set(provisioned FALSE)
+
+if(BUILD_UI)
+    npm_build(NPM_OK)
+    if(NPM_OK)
+        set(provisioned TRUE)
+    endif()
+endif()
+
+# ---------------------------------------------------------------------------
+# 3. Priority 3: HF Bucket download (if npm did not produce assets and HF_ENABLED=ON)
+# ---------------------------------------------------------------------------
+if(NOT provisioned AND HF_ENABLED)
+    resolve_version(VERSION)
+
+    set(stamp_ok FALSE)
+    if(EXISTS "${STAMP_FILE}" AND NOT "${VERSION}" STREQUAL "")
+        file(READ "${STAMP_FILE}" stamped)
+        string(STRIP "${stamped}" stamped)
+        if("${stamped}" STREQUAL "${VERSION}")
+            set(stamp_ok TRUE)
+        endif()
+    endif()
+
+    assets_present(have_assets)
+    if(stamp_ok AND have_assets)
+        message(STATUS "UI: HF stamp '${stamped}' matches version, skipping HF fetch")
+        set(provisioned TRUE)
+    else()
+        hf_download("${VERSION}" HF_OK HF_RESOLVED)
+        if(HF_OK)
+            file(WRITE "${STAMP_FILE}" "${HF_RESOLVED}")
+            message(STATUS "UI: HF download succeeded, stamp updated (${HF_RESOLVED})")
+            set(provisioned TRUE)
+        else()
+            message(STATUS "UI: HF download failed")
+        endif()
+    endif()
+endif()
+
+# ---------------------------------------------------------------------------
+# 4. Fallback: warn about stale or missing assets, then emit whatever we have
+# ---------------------------------------------------------------------------
+if(NOT provisioned)
+    assets_present(have_assets)
+    if(have_assets)
+        message(WARNING "UI: provisioning failed; embedding stale assets from ${DIST_DIR}")
+    else()
+        message(WARNING "UI: no assets available - building without an embedded UI. "
+                        "In a disconnected environment, download the pre-built UI "
+                        "from a llama.cpp release at "
+                        "https://github.com/ggml-org/llama.cpp/releases and "
+                        "extract to tools/ui/dist.")
+    endif()
+endif()
+
+emit_files()
diff --git a/scripts/ui-download.cmake b/scripts/ui-download.cmake
deleted file mode 100644
index 65143642a2c..00000000000
--- a/scripts/ui-download.cmake
+++ /dev/null
@@ -1,223 +0,0 @@
-# Download UI assets from Hugging Face Bucket at build time
-# Usage: cmake -DPUBLIC_DIR=... -DHF_BUCKET=... -DHF_VERSION=... -DASSETS="a;b;c" -P scripts/ui-download.cmake
-#
-# Asset provisioning priority:
-#   1. Pre-built assets already in PUBLIC_DIR (cached from a previous run)
-#   2. Local npm build (if NPM_DIR is provided and has package.json)
-#   3. Hugging Face Bucket download (version-specific, then 'latest' fallback)
-
-cmake_minimum_required(VERSION 3.16)
-
-set(PUBLIC_DIR   "" CACHE STRING "Directory to store/download assets")
-set(HF_BUCKET    "" CACHE STRING "Hugging Face bucket name")
-set(HF_VERSION   "" CACHE STRING "Version to download (empty = resolve from git)")
-set(ASSETS       "" CACHE STRING "Plus-separated list of asset filenames (+)")
-set(STAMP_FILE   "" CACHE STRING "Stamp file to create on success (optional)")
-set(SOURCE_DIR   "" CACHE STRING "Project source root (to resolve version from git)")
-set(NPM_DIR      "" CACHE STRING "UI source directory (to run npm build)")
-set(HF_ENABLED   "" CACHE STRING "Whether to allow HF Bucket download (ON/OFF)")
-
-# ---------------------------------------------------------------------------
-# 1. Resolve version from git if not provided at configure time
-# ---------------------------------------------------------------------------
-set(RESOLVED_VERSION "${HF_VERSION}")
-if("${RESOLVED_VERSION}" STREQUAL "" AND NOT "${SOURCE_DIR}" STREQUAL "")
-    if(EXISTS "${SOURCE_DIR}/cmake/build-info.cmake")
-        include("${SOURCE_DIR}/cmake/build-info.cmake")
-        if(NOT "${BUILD_NUMBER}" STREQUAL "" AND NOT BUILD_NUMBER EQUAL 0)
-            set(RESOLVED_VERSION "b${BUILD_NUMBER}")
-            message(STATUS "UI: resolved version from git: ${RESOLVED_VERSION}")
-        endif()
-    endif()
-endif()
-
-# Convert + back to CMake list (+ is used as separator instead of ; to
-# avoid platform-specific escaping issues when passing via -D arguments)
-string(REGEX REPLACE "\\+" ";" ASSETS "${ASSETS}")
-
-# ---------------------------------------------------------------------------
-# 2. Check stamp freshness — re-download if resolved version changed
-# ---------------------------------------------------------------------------
-set(FORCE_REBUILD FALSE)
-if(NOT "${STAMP_FILE}" STREQUAL "" AND EXISTS "${STAMP_FILE}")
-    file(READ "${STAMP_FILE}" STAMPED_VERSION)
-    string(STRIP "${STAMPED_VERSION}" STAMPED_VERSION)
-    if(NOT "${STAMPED_VERSION}" STREQUAL "${RESOLVED_VERSION}")
-        message(STATUS "UI: version changed (${STAMPED_VERSION} -> ${RESOLVED_VERSION}), re-building")
-        set(FORCE_REBUILD TRUE)
-    endif()
-endif()
-
-# ---------------------------------------------------------------------------
-# 3. Check if assets already exist (cached from a previous run)
-# ---------------------------------------------------------------------------
-set(ALL_EXISTS TRUE)
-foreach(asset ${ASSETS})
-    if(NOT EXISTS "${PUBLIC_DIR}/${asset}")
-        set(ALL_EXISTS FALSE)
-        break()
-    endif()
-endforeach()
-
-if(ALL_EXISTS AND NOT FORCE_REBUILD)
-    message(STATUS "UI: all assets already exist in ${PUBLIC_DIR}, skipping")
-    return()
-endif()
-
-file(MAKE_DIRECTORY "${PUBLIC_DIR}")
-
-# ---------------------------------------------------------------------------
-# 4. Priority 2: build from source via npm (fast path for developers)
-# ---------------------------------------------------------------------------
-set(PROVISION_SUCCESS FALSE)
-
-if(NOT PROVISION_SUCCESS AND NOT "${NPM_DIR}" STREQUAL "")
-    if(EXISTS "${NPM_DIR}/package.json")
-        # Check if npm is available before attempting npm build
-        find_program(NPM_EXECUTABLE npm)
-        if(NPM_EXECUTABLE)
-            message(STATUS "UI: building from source in ${NPM_DIR}")
-
-            # Run npm install if node_modules is missing
-            if(NOT EXISTS "${NPM_DIR}/node_modules")
-                message(STATUS "UI: running npm install (first time)")
-                execute_process(
-                    COMMAND ${NPM_EXECUTABLE} install
-                    WORKING_DIRECTORY "${NPM_DIR}"
-                    RESULT_VARIABLE NPM_INSTALL_RESULT
-                    OUTPUT_VARIABLE NPM_OUT
-                    ERROR_VARIABLE  NPM_ERR
-                )
-                if(NOT NPM_INSTALL_RESULT EQUAL 0)
-                    message(STATUS "UI: npm install failed (${NPM_INSTALL_RESULT}), falling back to download")
-                    message(STATUS "  stderr: ${NPM_ERR}")
-                endif()
-            endif()
-
-            # Run the build
-            execute_process(
-                COMMAND ${NPM_EXECUTABLE} run build
-                WORKING_DIRECTORY "${NPM_DIR}"
-                RESULT_VARIABLE NPM_BUILD_RESULT
-                OUTPUT_VARIABLE NPM_OUT
-                ERROR_VARIABLE  NPM_ERR
-            )
-
-            if(NPM_BUILD_RESULT EQUAL 0)
-                # Verify that the expected assets were produced
-                set(ALL_BUILT TRUE)
-                foreach(asset ${ASSETS})
-                    if(NOT EXISTS "${PUBLIC_DIR}/${asset}")
-                        set(ALL_BUILT FALSE)
-                        break()
-                    endif()
-                endforeach()
-
-                if(ALL_BUILT)
-                    message(STATUS "UI: local npm build succeeded")
-                    set(PROVISION_SUCCESS TRUE)
-                else()
-                    message(STATUS "UI: npm build completed but assets missing from ${PUBLIC_DIR}, falling back to download")
-                endif()
-            else()
-                message(STATUS "UI: npm build failed (${NPM_BUILD_RESULT}), falling back to download")
-                message(STATUS "  stderr: ${NPM_ERR}")
-            endif()
-        else()
-            message(STATUS "UI: npm not found, skipping npm build and trying HF Bucket download")
-        endif()
-    else()
-        message(STATUS "UI: NPM_DIR (${NPM_DIR}) has no package.json, skipping npm build")
-    endif()
-endif()
-
-# ---------------------------------------------------------------------------
-# 5. Priority 3: download from Hugging Face Bucket (if enabled)
-# ---------------------------------------------------------------------------
-if(NOT PROVISION_SUCCESS AND HF_ENABLED)
-    # Build list of URLs to try — version-specific first, then 'latest'
-    set(URL_ENTRIES "")
-    if(NOT "${RESOLVED_VERSION}" STREQUAL "")
-        list(APPEND URL_ENTRIES
-            "version:https://huggingface.co/buckets/ggml-org/${HF_BUCKET}/resolve/${RESOLVED_VERSION}")
-    endif()
-    list(APPEND URL_ENTRIES
-        "latest:https://huggingface.co/buckets/ggml-org/${HF_BUCKET}/resolve/latest")
-
-    foreach(entry ${URL_ENTRIES})
-        string(REGEX REPLACE "^([^:]+):.*$" "\\1" url_label "${entry}")
-        string(REGEX REPLACE "^[^:]+:(.*)$" "\\1" base_url "${entry}")
-
-        message(STATUS "UI: downloading assets from ${url_label}: ${base_url}")
-
-        # Download each asset
-        set(ALL_OK TRUE)
-        foreach(asset ${ASSETS})
-            set(download_url "${base_url}/${asset}?download=true")
-            set(download_path "${PUBLIC_DIR}/${asset}")
-            file(DOWNLOAD "${download_url}" "${download_path}"
-                STATUS download_status TIMEOUT 60
-            )
-            list(GET download_status 0 download_result)
-            if(NOT download_result EQUAL 0)
-                list(GET download_status 1 error_message)
-                message(STATUS "UI: failed to download ${asset} from ${url_label}: ${error_message}")
-                set(ALL_OK FALSE)
-                break()
-            endif()
-            message(STATUS "UI: downloaded ${asset}")
-        endforeach()
-
-        if(NOT ALL_OK)
-            continue()
-        endif()
-
-        # Verify checksums if the server provides them
-        file(DOWNLOAD "${base_url}/checksums.txt?download=true"
-            "${PUBLIC_DIR}/checksums.txt"
-            STATUS checksum_status TIMEOUT 30
-        )
-        list(GET checksum_status 0 checksum_result)
-        if(checksum_result EQUAL 0)
-            message(STATUS "UI: verifying checksums...")
-            file(STRINGS "${PUBLIC_DIR}/checksums.txt" CHECKSUMS_CONTENT)
-            foreach(asset ${ASSETS})
-                set(download_path "${PUBLIC_DIR}/${asset}")
-                file(SHA256 "${download_path}" asset_hash)
-                string(TOLOWER "${asset_hash}" EXPECTED_HASH_LOWER)
-                string(REGEX MATCH "${EXPECTED_HASH_LOWER}[ \\t]+${asset}" CHECKSUM_LINE "${CHECKSUMS_CONTENT}")
-                if(NOT CHECKSUM_LINE)
-                    message(WARNING "UI: checksum verification failed for ${asset}")
-                    set(ALL_OK FALSE)
-                    break()
-                endif()
-            endforeach()
-            if(ALL_OK)
-                message(STATUS "UI: all checksums verified")
-            endif()
-        endif()
-
-        if(ALL_OK)
-            set(PROVISION_SUCCESS TRUE)
-            break()
-        endif()
-    endforeach()
-
-    if(PROVISION_SUCCESS)
-        message(STATUS "UI: provisioning complete")
-    else()
-        message(WARNING "UI: failed to download assets from HF Bucket (${HF_BUCKET})")
-    endif()
-endif()
-
-# ---------------------------------------------------------------------------
-# 6. Write stamp file on success (stores resolved version for freshness check)
-# ---------------------------------------------------------------------------
-if(PROVISION_SUCCESS)
-    if(NOT "${STAMP_FILE}" STREQUAL "")
-        file(WRITE "${STAMP_FILE}" "${RESOLVED_VERSION}")
-    endif()
-else()
-    message(WARNING "UI: no source available. Neither local build (${NPM_DIR}) nor HF Bucket download succeeded.")
-    message(WARNING "UI: building server without embedded UI. Set LLAMA_BUILD_UI=OFF to suppress this warning.")
-endif()
diff --git a/scripts/xxd.cmake b/scripts/xxd.cmake
deleted file mode 100644
index 73f6cfff7f2..00000000000
--- a/scripts/xxd.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMake equivalent of `xxd -i ${INPUT} ${OUTPUT}`
-# Usage: cmake -DINPUT=build/tools/ui/dist/index.html -DOUTPUT=build/tools/ui/dist/index.html.hpp -P scripts/xxd.cmake
-
-SET(INPUT "" CACHE STRING "Input File")
-SET(OUTPUT "" CACHE STRING "Output File")
-
-get_filename_component(filename "${INPUT}" NAME)
-string(REGEX REPLACE "\\.|-" "_" name "${filename}")
-
-file(READ "${INPUT}" hex_data HEX)
-string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," hex_sequence "${hex_data}")
-
-string(LENGTH ${hex_data} hex_len)
-math(EXPR len "${hex_len} / 2")
-
-file(WRITE "${OUTPUT}" "unsigned char ${name}[] = {${hex_sequence}};\nunsigned int ${name}_len = ${len};\n")
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index c9eead18aa3..96620dc30a2 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -57,6 +57,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GEMMA3,           "gemma3"           },
     { LLM_ARCH_GEMMA3N,          "gemma3n"          },
     { LLM_ARCH_GEMMA4,           "gemma4"           },
+    { LLM_ARCH_GEMMA4_ASSISTANT, "gemma4-assistant" },
     { LLM_ARCH_GEMMA_EMBEDDING,  "gemma-embedding"  },
     { LLM_ARCH_STARCODER2,       "starcoder2"       },
     { LLM_ARCH_MAMBA,            "mamba"            },
@@ -133,6 +134,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA_EMBED,      "llama-embed"      },
     { LLM_ARCH_MAINCODER,        "maincoder"        },
     { LLM_ARCH_KIMI_LINEAR,      "kimi-linear"      },
+    { LLM_ARCH_TALKIE,           "talkie"           },
     { LLM_ARCH_UNKNOWN,          "(unknown)"        },
 };
 
@@ -445,6 +447,8 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_FFN_NORM_EXPS,                          "blk.%d.ffn_norm_exps" },
     { LLM_TENSOR_ATTN_K_B,                               "blk.%d.attn_k_b" },
     { LLM_TENSOR_ATTN_V_B,                               "blk.%d.attn_v_b" },
+    { LLM_TENSOR_NEXTN_PRE_PROJ,                         "nextn.pre_projection" },
+    { LLM_TENSOR_NEXTN_POST_PROJ,                        "nextn.post_projection" },
     { LLM_TENSOR_NEXTN_EH_PROJ,                          "blk.%d.nextn.eh_proj" },
     { LLM_TENSOR_NEXTN_EMBED_TOKENS,                     "blk.%d.nextn.embed_tokens" },
     { LLM_TENSOR_NEXTN_ENORM,                            "blk.%d.nextn.enorm" },
@@ -757,6 +761,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_INDEXER_PROJ,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_INDEXER_ATTN_K,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_INDEXER_ATTN_Q_B,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_PRE_PROJ,             {LLM_TENSOR_LAYER_INPUT,     GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_POST_PROJ,            {LLM_TENSOR_LAYER_INPUT,     GGML_OP_MUL_MAT}},
     // NextN/MTP tensors are stored per-block (blk.%d.nextn.*) even though only the
     // last nextn_predict_layers blocks carry them. Classify as LAYER_REPEATING so
     // the model loader doesn't fault on the block index.
@@ -767,8 +773,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     // Nemotron 3 Super
-    {LLM_TENSOR_FFN_LATENT_DOWN,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_FFN_LATENT_UP,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    // latent projections feed ggml_mul_mat, the buft probe must use MUL_MAT to keep them on GPU
+    {LLM_TENSOR_FFN_LATENT_DOWN,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_LATENT_UP,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
 };
 
 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 89cf16cc37c..35a235bd5ed 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -61,6 +61,7 @@ enum llm_arch {
     LLM_ARCH_GEMMA3,
     LLM_ARCH_GEMMA3N,
     LLM_ARCH_GEMMA4,
+    LLM_ARCH_GEMMA4_ASSISTANT,
     LLM_ARCH_GEMMA_EMBEDDING,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
@@ -137,6 +138,7 @@ enum llm_arch {
     LLM_ARCH_LLAMA_EMBED,
     LLM_ARCH_MAINCODER,
     LLM_ARCH_KIMI_LINEAR,
+    LLM_ARCH_TALKIE,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -549,6 +551,8 @@ enum llm_tensor {
     LLM_TENSOR_INDEXER_PROJ,
     LLM_TENSOR_INDEXER_ATTN_K,
     LLM_TENSOR_INDEXER_ATTN_Q_B,
+    LLM_TENSOR_NEXTN_PRE_PROJ,
+    LLM_TENSOR_NEXTN_POST_PROJ,
     LLM_TENSOR_NEXTN_EH_PROJ,
     LLM_TENSOR_NEXTN_EMBED_TOKENS,
     LLM_TENSOR_NEXTN_ENORM,
diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp
index 6554a89b28a..f10397747b0 100644
--- a/src/llama-chat.cpp
+++ b/src/llama-chat.cpp
@@ -73,7 +73,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "hunyuan-moe",       LLM_CHAT_TEMPLATE_HUNYUAN_MOE       },
     { "gpt-oss",           LLM_CHAT_TEMPLATE_OPENAI_MOE        },
     { "hunyuan-dense",     LLM_CHAT_TEMPLATE_HUNYUAN_DENSE     },
-    { "hunyuan-ocr",       LLM_CHAT_TEMPLATE_HUNYUAN_OCR       },
+    { "hunyuan-vl",        LLM_CHAT_TEMPLATE_HUNYUAN_VL        },
     { "kimi-k2",           LLM_CHAT_TEMPLATE_KIMI_K2           },
     { "seed_oss",          LLM_CHAT_TEMPLATE_SEED_OSS          },
     { "grok-2",            LLM_CHAT_TEMPLATE_GROK_2            },
@@ -218,7 +218,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
     } else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
         return LLM_CHAT_TEMPLATE_OPENAI_MOE;
     } else if (tmpl_contains("<｜hy_Assistant｜>") && tmpl_contains("<｜hy_begin▁of▁sentence｜>")) {
-        return LLM_CHAT_TEMPLATE_HUNYUAN_OCR;
+        return LLM_CHAT_TEMPLATE_HUNYUAN_VL;
     } else if (tmpl_contains("<｜hy_Assistant｜>") && tmpl_contains("<｜hy_place▁holder▁no▁3｜>")) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
     } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
@@ -825,8 +825,8 @@ int32_t llm_chat_apply_template(
                 ss << "<｜hy_User｜>" << chat[i]->content << "<｜hy_Assistant｜>";
             }
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_OCR) {
-        // tencent/HunyuanOCR
+    } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_VL) {
+        // tencent/HunyuanOCR & tencent/HunyuanVL
         ss << "<｜hy_begin▁of▁sentence｜>";
         for (size_t i = 0; i < chat.size(); i++) {
             std::string role(chat[i]->role);
diff --git a/src/llama-chat.h b/src/llama-chat.h
index 13f936a946c..ea6540c0be7 100644
--- a/src/llama-chat.h
+++ b/src/llama-chat.h
@@ -53,7 +53,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
     LLM_CHAT_TEMPLATE_OPENAI_MOE,
     LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
-    LLM_CHAT_TEMPLATE_HUNYUAN_OCR,
+    LLM_CHAT_TEMPLATE_HUNYUAN_VL,
     LLM_CHAT_TEMPLATE_KIMI_K2,
     LLM_CHAT_TEMPLATE_SEED_OSS,
     LLM_CHAT_TEMPLATE_GROK_2,
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 3cc8ffa6668..a1135698053 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -30,6 +30,70 @@ static llm_graph_type ctx_type_to_graph_type(llama_context_type ctx_type) {
     throw std::runtime_error("Unsupported ctx type");
 }
 
+static uint32_t ctx_type_to_embd_inp(const llama_hparams & hparams, llama_context_type ctx_type) {
+    switch (ctx_type) {
+        case LLAMA_CONTEXT_TYPE_DEFAULT: return hparams.n_embd_inp();
+        case LLAMA_CONTEXT_TYPE_MTP    : return hparams.n_embd_out();
+    }
+    throw std::runtime_error("Unsupported ctx type");
+}
+
+namespace {
+struct src_mctx_reset_on_exit {
+    llama_memory_context_ptr * slot;
+    ~src_mctx_reset_on_exit() { if (slot) slot->reset(); }
+};
+
+static void llama_assert_gemma4_mtp_source_placement(
+        const llama_context * ctx,
+        const llama_context * src) {
+    if (!ctx || !src) {
+        return;
+    }
+
+    const auto & model_dft = ctx->get_model();
+    const auto & model_tgt = src->get_model();
+
+    if (model_dft.arch != LLM_ARCH_GEMMA4_ASSISTANT || model_tgt.arch != LLM_ARCH_GEMMA4) {
+        return;
+    }
+
+    if (model_tgt.split_mode() == LLAMA_SPLIT_MODE_TENSOR) {
+        return;
+    }
+
+    const auto & hparams_dft = model_dft.hparams;
+    const auto & hparams_tgt = model_tgt.hparams;
+
+    const int32_t il_tgt_full = (int32_t) hparams_tgt.n_layer - 1;
+    const int32_t il_tgt_swa  = (int32_t) hparams_tgt.n_layer - 2;
+
+    ggml_backend_dev_t dev_cpu = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (!dev_cpu) {
+        throw std::runtime_error("Gemma 4 assistant MTP placement check failed: no CPU backend found");
+    }
+
+    const bool kv_offload = src->get_cparams().offload_kqv;
+
+    for (uint32_t il_dft = 0; il_dft < hparams_dft.n_layer; ++il_dft) {
+        const int32_t il_tgt = hparams_dft.is_swa(il_dft) ? il_tgt_swa : il_tgt_full;
+
+        ggml_backend_dev_t dev_dft = model_dft.dev_layer(il_dft);
+        ggml_backend_dev_t dev_kv  = kv_offload ? model_tgt.dev_layer(il_tgt) : dev_cpu;
+
+        if (dev_dft != dev_kv) {
+            throw std::runtime_error(format(
+                    "Gemma 4 assistant MTP placement mismatch: draft layer %d is on %s, "
+                    "but shared target KV layer %d is on %s",
+                    (int) il_dft,
+                    ggml_backend_dev_name(dev_dft),
+                    (int) il_tgt,
+                    ggml_backend_dev_name(dev_kv)));
+        }
+    }
+}
+}
+
 llama_context::llama_context(
         const llama_model & model,
               llama_context_params params) :
@@ -368,7 +432,11 @@ llama_context::llama_context(
             LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);
         }
 
-        sched_reserve();
+        // MTP draft contexts can't reserve until the source context is wired
+        // via llama_set_mtp_source — defer to the first decode.
+        if (cparams.ctx_type != LLAMA_CONTEXT_TYPE_MTP) {
+            sched_reserve();
+        }
 
         if (!cparams.flash_attn) {
             if (ggml_is_quantized(params.type_v)) {
@@ -442,6 +510,23 @@ void llama_context::sched_reserve() {
         }
     }
 
+    // When called from decode(), src_mctx_for_decode is already populated and
+    // we must not drop it on exit (process_ubatch still needs it). Snapshot
+    // only when sched_reserve runs standalone (e.g. lazy first-decode reserve
+    // when set_mtp_source flipped sched_need_reserve).
+    const bool owns_src_snapshot = src_ctx && !src_mctx_for_decode;
+    if (owns_src_snapshot) {
+        auto * src_memory = src_ctx->get_memory();
+        if (!src_memory) {
+            throw std::runtime_error("MTP source context has no memory module");
+        }
+        src_mctx_for_decode = src_memory->init_full();
+        if (!src_mctx_for_decode) {
+            throw std::runtime_error("failed to initialize MTP source memory snapshot");
+        }
+    }
+    src_mctx_reset_on_exit reserve_src_drop{owns_src_snapshot ? &src_mctx_for_decode : nullptr};
+
     // avoid reserving graphs with zero outputs - assume one output per sequence
     const int n_outputs = n_seqs;
 
@@ -896,10 +981,9 @@ float * llama_context::get_embeddings_pre_norm_ith(int32_t i) {
             throw std::runtime_error("no pre-norm embeddings");
         }
 
-        const uint32_t n_embd = model.hparams.n_embd;
+        const uint32_t n_embd = model.hparams.n_embd_out();
 
         if (!cparams.embeddings_pre_norm_masked) {
-            // unmasked: pre-norm rows are stored densely, indexed by raw token position.
             if (i < 0 || (size_t)(i + 1) * n_embd > embd_pre_norm.size) {
                 throw std::runtime_error(format("out of range [0, %zu)", embd_pre_norm.size / n_embd));
             }
@@ -1105,6 +1189,18 @@ void llama_context::set_embeddings_pre_norm(bool value, bool masked) {
     cparams.embeddings_pre_norm_masked = masked;
 }
 
+void llama_context::set_mtp_source(llama_context * src) {
+    if (src_ctx == src) {
+        return;
+    }
+    llama_assert_gemma4_mtp_source_placement(this, src);
+    src_ctx = src;
+    src_mctx_for_decode.reset();
+    // worst-case compute buffers were reserved without knowing about the source
+    // memory; force a re-reserve so the next decode sees src views
+    sched_need_reserve = true;
+}
+
 void llama_context::set_causal_attn(bool value) {
     LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
 
@@ -1137,6 +1233,19 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
 
     LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler);
 
+    if (sampler && model.split_mode() == LLAMA_SPLIT_MODE_TENSOR) {
+        static bool warned = false;
+        if (!warned) {
+            LLAMA_LOG_WARN("%s: backend sampling not supported with SPLIT_MODE_TENSOR; using CPU\n", __func__);
+            warned = true;
+        }
+        if (sampling.samplers.count(seq_id) > 0) {
+            sched_need_reserve = true;
+        }
+        sampling.samplers.erase(seq_id);
+        return false;
+    }
+
     const bool can_offload =
         sampler &&
         sampler->iface->backend_init &&
@@ -1317,7 +1426,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
 
     const auto & hparams = model.hparams;
 
-    const int64_t n_embd  = hparams.n_embd_inp();
+    const int64_t n_embd  = ctx_type_to_embd_inp(hparams, cparams.ctx_type);
     const int64_t n_vocab = model.vocab.n_tokens();
 
     // note: during encode, we always pass the full sequence starting from pos = 0
@@ -1452,7 +1561,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
         ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_pre_norm);
         GGML_ASSERT(backend_h != nullptr);
 
-        const uint32_t n_embd = hparams.n_embd;
+        const uint32_t n_embd = hparams.n_embd_out();
         GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_pre_norm.size);
         ggml_backend_tensor_get_async(backend_h, t_h_pre_norm, embd_pre_norm.data, 0, n_tokens*n_embd*sizeof(float));
     }
@@ -1627,7 +1736,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     const auto & hparams = model.hparams;
 
     const int64_t n_vocab = vocab.n_tokens();
-    const int64_t n_embd  = hparams.n_embd_inp();
+    const int64_t n_embd  = ctx_type_to_embd_inp(hparams, cparams.ctx_type);
 
     // when computing embeddings, all tokens are output
     const bool output_all   = cparams.embeddings;
@@ -1689,6 +1798,20 @@ int llama_context::decode(const llama_batch & batch_inp) {
     embd_seq.clear();
     output_swaps.clear();
 
+    src_mctx_reset_on_exit decode_src_drop{&src_mctx_for_decode};
+    if (src_ctx) {
+        auto * src_memory = src_ctx->get_memory();
+        if (!src_memory) {
+            LLAMA_LOG_ERROR("%s: MTP source context has no memory module\n", __func__);
+            return -2;
+        }
+        src_mctx_for_decode = src_memory->init_full();
+        if (!src_mctx_for_decode) {
+            LLAMA_LOG_ERROR("%s: failed to snapshot MTP source memory\n", __func__);
+            return -2;
+        }
+    }
+
     sched_reserve();
 
     bool did_optimize = false;
@@ -1903,7 +2026,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
                 ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_pre_norm);
                 GGML_ASSERT(backend_h != nullptr);
 
-                const uint32_t n_embd = hparams.n_embd;
+                const uint32_t n_embd = hparams.n_embd_out();
                 float * embd_pre_norm_out = embd_pre_norm.data + offset*n_embd;
 
                 GGML_ASSERT((offset + n_rows)*n_embd <= (int64_t) embd_pre_norm.size);
@@ -1996,7 +2119,6 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 
     const auto n_batch    = cparams.n_batch;
     const auto n_vocab    = vocab.n_tokens();
-    const auto n_embd     = hparams.n_embd;
     const auto n_embd_out = hparams.n_embd_out();
 
     bool has_logits        = true;
@@ -2015,12 +2137,10 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 
     logits.size        = has_logits        ? n_vocab*n_outputs_max     : 0;
     embd.size          = has_embd          ? n_embd_out*n_outputs_max  : 0;
-    embd_pre_norm.size = has_embd_pre_norm ? n_embd*n_outputs_max      : 0;
+    embd_pre_norm.size = has_embd_pre_norm ? n_embd_out*n_outputs_max  : 0;
 
     if (has_embd_pre_norm && !cparams.embeddings_pre_norm_masked) {
-        // unmasked: pre-norm row exists for every token in the batch, not just
-        // those flagged via batch.logits[i] -> size by token count instead.
-        embd_pre_norm.size = (size_t) n_embd * n_batch;
+        embd_pre_norm.size = (size_t) n_embd_out * n_batch;
     }
 
     // Allocate backend sampling output buffers if there are backend samplers configured.
@@ -2283,6 +2403,8 @@ llm_graph_params llama_context::graph_params(
         /*.cvec        =*/ cvec.get(),
         /*.loras       =*/ loras.get(),
         /*.mctx        =*/ mctx,
+        /*.src_mctx    =*/ src_mctx_for_decode.get(),
+        /*.src_model   =*/ src_ctx ? &src_ctx->get_model() : nullptr,
         /*.cross       =*/ &cross,
         /*.samplers    =*/ sampling.samplers,
         /*.n_outputs   =*/ n_outputs,
@@ -3575,6 +3697,10 @@ void llama_set_embeddings_pre_norm(llama_context * ctx, bool value, bool masked)
     ctx->set_embeddings_pre_norm(value, masked);
 }
 
+void llama_set_mtp_source(llama_context * ctx, llama_context * src) {
+    ctx->set_mtp_source(src);
+}
+
 float * llama_get_embeddings_pre_norm(llama_context * ctx) {
     ctx->synchronize();
 
diff --git a/src/llama-context.h b/src/llama-context.h
index d03f681d4a1..cffd8a83a1c 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -6,6 +6,7 @@
 #include "llama-graph.h"
 #include "llama-adapter.h"
 #include "llama-impl.h"
+#include "llama-memory.h"
 
 #include "ggml-cpp.h"
 #include "ggml-opt.h"
@@ -111,6 +112,7 @@ struct llama_context {
 
     void set_embeddings (bool value);
     void set_embeddings_pre_norm(bool value, bool masked);
+    void set_mtp_source(llama_context * src);
     void set_causal_attn(bool value);
     void set_warmup(bool value);
 
@@ -275,6 +277,12 @@ struct llama_context {
 
     std::unique_ptr<llama_memory_i> memory;
 
+    // external KV source used by MTP draft contexts. src_ctx is the target
+    // context whose memory we read; src_mctx_for_decode is a per-decode
+    // snapshot held for the duration of one decode/sched_reserve call.
+    llama_context *           src_ctx              = nullptr;
+    llama_memory_context_ptr  src_mctx_for_decode;
+
     // decode output (2-dimensional array: [n_outputs][n_vocab])
     buffer_view<float> logits = {nullptr, 0};
 
diff --git a/src/llama-ext.h b/src/llama-ext.h
index edfa71c207c..9e1cf727996 100644
--- a/src/llama-ext.h
+++ b/src/llama-ext.h
@@ -85,6 +85,11 @@ using llama_memory_breakdown = std::map<ggml_backend_buffer_type_t, llama_memory
 LLAMA_API int32_t llama_model_n_expert (const struct llama_model * model);
 LLAMA_API int32_t llama_model_n_devices(const struct llama_model * model);
 
+// number of layers that own KV (i.e. layers whose graph writes K/V).
+// 0 means the model owns no KV — e.g. a Gemma4-style MTP draft that reads
+// trunk KV via llama_set_mtp_source.
+LLAMA_API int32_t llama_model_n_layer_kv(const struct llama_model * model);
+
 LLAMA_API ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i);
 
 LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx);
@@ -97,6 +102,7 @@ LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_c
 // If masked == true,  output the embeddings only for the tokens with batch.logits != 0
 // If masked == false, output the embeddings for all tokens in the batch regardless of batch.logits
 LLAMA_API void llama_set_embeddings_pre_norm(struct llama_context * ctx, bool value, bool masked);
+LLAMA_API void llama_set_mtp_source(struct llama_context * ctx, struct llama_context * src);
 
 // mirrors:
 // LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 31cf41a1c2d..c894cb04544 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -500,15 +500,21 @@ bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) {
 }
 
 void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
-    mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
-    mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
+    // base tensors may not be allocated if there are no non-SWA attention layers
+    if (self_k_idxs && self_k_idxs->buffer) {
+        mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
+        mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
 
-    mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+        mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+    }
 
-    mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
-    mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
+    // swa tensors may not be allocated if there are no SWA attention layers
+    if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
+        mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
+        mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
 
-    mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
+        mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
+    }
 
     if (self_k_rot) {
         mctx->get_base()->set_input_k_rot(self_k_rot);
@@ -534,15 +540,51 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
 
     bool res = true;
 
-    res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
-  //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+    // base tensors may not be allocated if there are no non-SWA attention layers
+    if (self_k_idxs && self_k_idxs->buffer) {
+        res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
+      //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+
+        res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
+    }
+
+    // swa tensors may not be allocated if there are no SWA attention layers
+    if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
+        res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
+      //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+
+        res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
+    }
+
+    return res;
+}
+
+void llm_graph_input_attn_src_kv_iswa::set_input(const llama_ubatch * ubatch) {
+    src_mctx->get_base()->set_input_kq_mask(self_kq_mask,     ubatch, cparams.causal_attn);
+    src_mctx->get_swa() ->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
+
+    if (self_k_rot) {
+        src_mctx->get_base()->set_input_k_rot(self_k_rot);
+    }
+    if (self_v_rot) {
+        src_mctx->get_base()->set_input_v_rot(self_v_rot);
+    }
+    if (self_k_rot_swa) {
+        src_mctx->get_swa()->set_input_k_rot(self_k_rot_swa);
+    }
+    if (self_v_rot_swa) {
+        src_mctx->get_swa()->set_input_v_rot(self_v_rot_swa);
+    }
+}
 
-    res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
-  //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+bool llm_graph_input_attn_src_kv_iswa::can_reuse(const llm_graph_params & params) {
+    const auto * mctx = static_cast<const llama_kv_cache_iswa_context *>(params.src_mctx);
 
+    this->src_mctx = mctx;
+
+    bool res = true;
     res &= can_reuse_kq_mask(self_kq_mask,     mctx->get_base(), params.ubatch, params.cparams);
     res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(),  params.ubatch, params.cparams);
-
     return res;
 }
 
@@ -953,6 +995,8 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     cvec             (params.cvec),
     loras            (params.loras),
     mctx             (params.mctx),
+    src_mctx         (params.src_mctx),
+    src_model        (params.src_model),
     cross            (params.cross),
     samplers         (params.samplers),
     cb_func          (params.cb),
@@ -2441,6 +2485,98 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
     return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
 }
 
+llm_graph_input_attn_src_kv_iswa * llm_graph_context::build_attn_inp_src_kv_iswa() const {
+    GGML_ASSERT(src_mctx && "MTP draft graph requires src_mctx (set via llama_set_mtp_source)");
+
+    const auto * src_iswa = static_cast<const llama_kv_cache_iswa_context *>(src_mctx);
+
+    auto inp = std::make_unique<llm_graph_input_attn_src_kv_iswa>(hparams, cparams, src_iswa);
+
+    inp->self_kq_mask     = build_attn_inp_kq_mask(ctx0, src_iswa->get_base(), ubatch, cparams);
+    inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+
+    inp->self_kq_mask_swa     = build_attn_inp_kq_mask(ctx0, src_iswa->get_swa(), ubatch, cparams);
+    inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+
+    inp->self_k_rot     = src_iswa->get_base()->build_input_k_rot(ctx0);
+    inp->self_v_rot     = src_iswa->get_base()->build_input_v_rot(ctx0);
+    inp->self_k_rot_swa = src_iswa->get_swa()->build_input_k_rot(ctx0);
+    inp->self_v_rot_swa = src_iswa->get_swa()->build_input_v_rot(ctx0);
+
+    return (llm_graph_input_attn_src_kv_iswa *) res->add_input(std::move(inp));
+}
+
+ggml_tensor * llm_graph_context::build_attn(
+        llm_graph_input_attn_src_kv_iswa * inp,
+        ggml_tensor * wo,
+        ggml_tensor * wo_b,
+        ggml_tensor * wo_s,
+        ggml_tensor * q_cur,
+        ggml_tensor * kq_b,
+        ggml_tensor * sinks,
+        ggml_tensor * v_mla,
+            float     kq_scale,
+            int       il_assist,
+            int       il_src) const {
+    const bool is_swa = hparams.is_swa(il_assist);
+
+    const auto * src_iswa = inp->src_mctx;
+    const auto * src_cur  = is_swa ? src_iswa->get_swa() : src_iswa->get_base();
+
+    const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
+
+    auto * k_rot = is_swa ? inp->self_k_rot_swa : inp->self_k_rot;
+    auto * v_rot = is_swa ? inp->self_v_rot_swa : inp->self_v_rot;
+
+    if (k_rot) {
+        q_cur = ggml_mul_mat_aux(ctx0, q_cur, k_rot);
+    }
+
+    ggml_build_forward_expand(gf, q_cur);
+
+    ggml_tensor * q = q_cur;
+    ggml_tensor * k = src_cur->get_k(ctx0, il_src);
+    ggml_tensor * v = src_cur->get_v(ctx0, il_src);
+
+    // build_attn_mha splits q across k->ne[3] (the trunk's stream count). When the
+    // trunk runs kv_unified=false the assistant's ubatch only references a subset
+    // of streams (one per active draft seq); q->ne[2] is not divisible by the full
+    // n_stream and the view collapses tokens. Slice k/v down to exactly the streams
+    // referenced by this ubatch. Requires those streams to form a contiguous range.
+    if (k->ne[3] > 1 && (uint32_t) k->ne[3] != ubatch.n_seqs_unq) {
+        GGML_ASSERT(ubatch.n_seqs_unq > 0 && ubatch.seq_id_unq);
+        llama_seq_id min_s = ubatch.seq_id_unq[0];
+        llama_seq_id max_s = ubatch.seq_id_unq[0];
+        for (uint32_t s = 1; s < ubatch.n_seqs_unq; ++s) {
+            min_s = std::min(min_s, ubatch.seq_id_unq[s]);
+            max_s = std::max(max_s, ubatch.seq_id_unq[s]);
+        }
+        GGML_ASSERT((uint32_t)(max_s - min_s + 1) == ubatch.n_seqs_unq &&
+                "MTP src-kv attn requires the active draft seq_ids to be contiguous");
+        GGML_ASSERT((int64_t) max_s < k->ne[3] && "MTP assistant seq_id beyond trunk stream count");
+
+        k = ggml_view_4d(ctx0, k, k->ne[0], k->ne[1], k->ne[2], (int64_t) ubatch.n_seqs_unq,
+                k->nb[1], k->nb[2], k->nb[3], (size_t) min_s * k->nb[3]);
+        v = ggml_view_4d(ctx0, v, v->ne[0], v->ne[1], v->ne[2], (int64_t) ubatch.n_seqs_unq,
+                v->nb[1], v->nb[2], v->nb[3], (size_t) min_s * v->nb[3]);
+    }
+
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il_assist);
+    cb(cur, "kqv_out", il_assist);
+
+    if (v_rot) {
+        cur = ggml_mul_mat_aux(ctx0, cur, v_rot);
+    }
+
+    if (wo) {
+        cur = build_lora_mm(wo, cur, wo_s);
+    }
+    if (wo_b) {
+        cur = ggml_add(ctx0, cur, wo_b);
+    }
+    return cur;
+}
+
 ggml_tensor * llm_graph_context::build_attn(
         llm_graph_input_attn_cross * inp,
         ggml_tensor * wo,
diff --git a/src/llama-graph.h b/src/llama-graph.h
index bf6778237e6..c6e44749af3 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -402,6 +402,42 @@ class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
     const llama_kv_cache_iswa_context * mctx;
 };
 
+// mask-only input for attention against an external (read-only) ISWA KV cache.
+// used by MTP draft graphs that attend to the target's KV without owning any.
+class llm_graph_input_attn_src_kv_iswa : public llm_graph_input_i {
+public:
+    llm_graph_input_attn_src_kv_iswa(
+            const llama_hparams & hparams,
+            const llama_cparams & cparams,
+            const llama_kv_cache_iswa_context * src_mctx) :
+        hparams(hparams),
+        cparams(cparams),
+        src_mctx(src_mctx) {
+    }
+    ~llm_graph_input_attn_src_kv_iswa() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+    bool can_reuse(const llm_graph_params & params) override;
+
+    ggml_tensor * get_kq_mask()       const { return self_kq_mask_cnv; }
+    ggml_tensor * get_kq_mask_swa()   const { return self_kq_mask_swa_cnv; }
+
+    ggml_tensor * self_kq_mask         = nullptr;
+    ggml_tensor * self_kq_mask_cnv     = nullptr;
+    ggml_tensor * self_kq_mask_swa     = nullptr;
+    ggml_tensor * self_kq_mask_swa_cnv = nullptr;
+
+    ggml_tensor * self_k_rot     = nullptr;
+    ggml_tensor * self_v_rot     = nullptr;
+    ggml_tensor * self_k_rot_swa = nullptr;
+    ggml_tensor * self_v_rot_swa = nullptr;
+
+    const llama_hparams hparams;
+    const llama_cparams cparams;
+
+    const llama_kv_cache_iswa_context * src_mctx;
+};
+
 class llm_graph_input_attn_cross : public llm_graph_input_i {
 public:
     llm_graph_input_attn_cross(const llama_cross * cross) : cross(cross) {}
@@ -544,6 +580,11 @@ struct llm_graph_params {
     const llama_adapter_cvec     * cvec;
     const llama_adapter_loras    * loras;
     const llama_memory_context_i * mctx;
+    // per-decode snapshot of an external memory module the graph reads from
+    // (never writes) — e.g. ctx_dft reading target KV during MTP draft.
+    // nullptr for a main decode. Rebound inside reuse-aware input classes.
+    const llama_memory_context_i * src_mctx;
+    const llama_model            * src_model;
     const llama_cross            * cross;
 
     std::map<llama_seq_id, llama_sampler *> samplers;
@@ -761,6 +802,8 @@ struct llm_graph_context {
     const llama_adapter_cvec     * cvec;
     const llama_adapter_loras    * loras;
     const llama_memory_context_i * mctx;
+    const llama_memory_context_i * src_mctx;
+    const llama_model            * src_model;
     const llama_cross            * cross;
 
     std::map<llama_seq_id, llama_sampler *> samplers;
@@ -973,6 +1016,24 @@ struct llm_graph_context {
                   float   kq_scale,
                     int   il) const;
 
+    llm_graph_input_attn_src_kv_iswa * build_attn_inp_src_kv_iswa() const;
+
+    // Q-only attention against an external ISWA KV cache (no K/V projections,
+    // no writes). il_assist labels the attention block in the local graph for
+    // logging; il_src indexes the source K/V layer to attend to.
+    ggml_tensor * build_attn(
+            llm_graph_input_attn_src_kv_iswa * inp,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * wo_s,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * kq_b,
+            ggml_tensor * sinks, // [n_head_q]
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+                  float   kq_scale,
+                    int   il_assist,
+                    int   il_src) const;
+
     llm_graph_input_attn_cross * build_attn_inp_cross() const;
 
     ggml_tensor * build_attn(
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index a49a055a630..c460d7822cd 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -2433,6 +2433,10 @@ uint32_t llama_kv_cache_context::get_n_kv() const {
     return n_kv;
 }
 
+llama_pos llama_kv_cache_context::seq_pos_max(llama_seq_id seq_id) const {
+    return kv->seq_pos_max(seq_id);
+}
+
 ggml_type llama_kv_cache_context::type_k() const {
     return kv->type_k();
 }
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 0b62dc7b232..b658d5e664b 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -350,6 +350,11 @@ class llama_kv_cache_context : public llama_memory_context_i {
 
     uint32_t get_n_kv() const;
 
+    // last position recorded in the cache for this sequence; -1 if absent.
+    // exposed for cross-context KV consumers (e.g. MTP draft) that need to
+    // anchor the source position without owning a memory module of their own.
+    llama_pos seq_pos_max(llama_seq_id seq_id) const;
+
     ggml_type type_k() const;
     ggml_type type_v() const;
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 8bf20a716eb..e533ce2ce11 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -44,6 +44,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
             return new llama_model_llama_embed(params);
         case LLM_ARCH_MAINCODER:
             return new llama_model_maincoder(params);
+        case LLM_ARCH_TALKIE:
+            return new llama_model_talkie(params);
         case LLM_ARCH_DECI:
             return new llama_model_deci(params);
         case LLM_ARCH_BAICHUAN:
@@ -134,6 +136,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
             return new llama_model_gemma3n(params);
         case LLM_ARCH_GEMMA4:
             return new llama_model_gemma4(params);
+        case LLM_ARCH_GEMMA4_ASSISTANT:
+            return new llama_model_gemma4_assistant(params);
         case LLM_ARCH_GEMMA_EMBEDDING:
             return new llama_model_gemma_embedding(params);
         case LLM_ARCH_STARCODER2:
@@ -1334,6 +1338,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
             if (!layer.ssm_beta_s && layer.ssm_beta) {
                 layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
             }
+            if (!layer.nextn.eh_proj_s && layer.nextn.eh_proj) {
+                layer.nextn.eh_proj_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.nextn.shared_head_head_s && layer.nextn.shared_head_head) {
+                layer.nextn.shared_head_head_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
 
             // input scales
             if (!layer.wq_in_s && layer.wq) {
@@ -1393,6 +1403,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
             if (!layer.ssm_beta_in_s && layer.ssm_beta) {
                 layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
             }
+            if (!layer.nextn.eh_proj_in_s && layer.nextn.eh_proj) {
+                layer.nextn.eh_proj_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.nextn.shared_head_head_in_s && layer.nextn.shared_head_head) {
+                layer.nextn.shared_head_head_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
         }
         // output scales
         if (output && output->type == GGML_TYPE_NVFP4) {
@@ -2311,6 +2327,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GEMMA3:
         case LLM_ARCH_GEMMA3N:
         case LLM_ARCH_GEMMA4:
+        case LLM_ARCH_GEMMA4_ASSISTANT:
         case LLM_ARCH_GEMMA_EMBEDDING:
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_OPENELM:
@@ -2341,6 +2358,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_QWEN3NEXT:
         case LLM_ARCH_MIMO2:
         case LLM_ARCH_STEP35:
+        case LLM_ARCH_TALKIE:
             return LLAMA_ROPE_TYPE_NEOX;
 
         case LLM_ARCH_QWEN2VL:
@@ -2503,6 +2521,10 @@ int32_t llama_model_n_devices(const struct llama_model * model) {
     return (int32_t)model->devices.size();
 }
 
+int32_t llama_model_n_layer_kv(const struct llama_model * model) {
+    return (int32_t) model->hparams.n_layer_kv();
+}
+
 ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i) {
     if (i < 0 || i >= (int)model->devices.size()) {
         return nullptr;
diff --git a/src/llama-model.h b/src/llama-model.h
index 01c87a75271..73324183616 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -202,12 +202,16 @@ struct llama_layer_shortconv {
 };
 
 struct llama_layer_nextn {
-    struct ggml_tensor * eh_proj          = nullptr;
-    struct ggml_tensor * embed_tokens     = nullptr;
-    struct ggml_tensor * enorm            = nullptr;
-    struct ggml_tensor * hnorm            = nullptr;
-    struct ggml_tensor * shared_head_head = nullptr;
-    struct ggml_tensor * shared_head_norm = nullptr;
+    struct ggml_tensor * eh_proj               = nullptr;
+    struct ggml_tensor * eh_proj_s             = nullptr;
+    struct ggml_tensor * eh_proj_in_s          = nullptr;
+    struct ggml_tensor * embed_tokens          = nullptr;
+    struct ggml_tensor * enorm                 = nullptr;
+    struct ggml_tensor * hnorm                 = nullptr;
+    struct ggml_tensor * shared_head_head      = nullptr;
+    struct ggml_tensor * shared_head_head_s    = nullptr;
+    struct ggml_tensor * shared_head_head_in_s = nullptr;
+    struct ggml_tensor * shared_head_norm      = nullptr;
 };
 
 struct llama_layer {
@@ -484,7 +488,7 @@ struct llama_layer {
     struct ggml_tensor * indexer_attn_k   = nullptr;
     struct ggml_tensor * indexer_attn_q_b = nullptr; // note: for lora a/b, not bias
 
-    // gemma4 layer output scale
+    // gemma4 layer output scale, reused for talkie embedding skip scale
     struct ggml_tensor * out_scale = nullptr;
 
     struct llama_layer_posnet posnet;
@@ -538,6 +542,10 @@ struct llama_model {
     struct ggml_tensor * output_s    = nullptr;
     struct ggml_tensor * output_in_s = nullptr;
 
+    // NextN/MTP model-level projections
+    struct ggml_tensor * nextn_pre_proj  = nullptr;
+    struct ggml_tensor * nextn_post_proj = nullptr;
+
     // classifier
     struct ggml_tensor * cls       = nullptr;
     struct ggml_tensor * cls_b     = nullptr;
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index f43cf546ca0..473becade82 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -511,6 +511,14 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                 };
                 byte_encode = false;
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_MINICPM5:
+                regex_exprs = {
+                    // original regex from tokenizer.json (openbmb/MiniCPM5-1B)
+                    "\\p{N}{1,3}",
+                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -530,6 +538,8 @@ struct llm_tokenizer_bpe : llm_tokenizer {
 struct llm_tokenizer_bpe_session {
     llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
 
+    virtual ~llm_tokenizer_bpe_session() = default;
+
     static void append(const llama_token token_id, std::vector<llama_token> & output)  {
         output.push_back(token_id);
     }
@@ -567,7 +577,7 @@ struct llm_tokenizer_bpe_session {
         }
     }
 
-    void tokenize(const std::string & text, std::vector<llama_token> & output) {
+    virtual void tokenize(const std::string & text, std::vector<llama_token> & output) {
         int final_prev_index = -1;
         const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs, tokenizer.byte_encode);
 
@@ -1579,6 +1589,88 @@ struct llm_tokenizer_plamo2_session {
     const llm_tokenizer_plamo2 & tokenizer;
 };
 
+// reserved suffix (U+E000) that keeps DNA k-mers distinct from identical
+// base-vocab BPE tokens (e.g. CCCCCC) in token_to_id; erased from id_to_token
+// text at load
+static const std::string dna_kmer_marker = "\xee\x80\x80";
+
+struct llm_tokenizer_hybriddna_session : llm_tokenizer_bpe_session {
+    llm_tokenizer_hybriddna_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {}
+
+    void tokenize(const std::string & text, std::vector<llama_token> & output) override {
+        static const std::string open_tag  = "<dna>";
+        static const std::string close_tag = "</dna>";
+
+        const auto dna_begin_id = vocab.text_to_token(open_tag);
+        const auto dna_end_id   = vocab.text_to_token(close_tag);
+        const auto dna_oov_id   = vocab.text_to_token("<oov>");
+
+        // Fall back to plain BPE if the DNA pieces aren't in the vocab.
+        if (dna_begin_id == LLAMA_TOKEN_NULL || dna_end_id == LLAMA_TOKEN_NULL || dna_oov_id == LLAMA_TOKEN_NULL) {
+            llm_tokenizer_bpe_session::tokenize(text, output);
+            return;
+        }
+
+        const size_t k = 6;
+        size_t pos = 0;
+
+        while (pos < text.size()) {
+            const size_t start = text.find(open_tag, pos);
+            if (start == std::string::npos) {
+                if (pos < text.size()) {
+                    llm_tokenizer_bpe_session::tokenize(text.substr(pos), output);
+                }
+                break;
+            }
+            if (start > pos) {
+                llm_tokenizer_bpe_session::tokenize(text.substr(pos, start - pos), output);
+            }
+            output.push_back(dna_begin_id);
+
+            const size_t content_start = start + open_tag.size();
+            const size_t end           = text.find(close_tag, content_start);
+            const size_t content_end   = (end == std::string::npos) ? text.size() : end;
+
+            emit_dna_kmers(text.substr(content_start, content_end - content_start), k, dna_oov_id, output);
+
+            if (end == std::string::npos) {
+                break;
+            }
+            output.push_back(dna_end_id);
+            pos = end + close_tag.size();
+        }
+    }
+
+private:
+    void emit_dna_kmers(const std::string & raw, size_t k, llama_token oov_id, std::vector<llama_token> & output) {
+        std::string seq = raw;
+        for (char & c : seq) {
+            if (c >= 'a' && c <= 'z') {
+                c = char(c - 32);
+            }
+        }
+
+        // k-mers carry the reserved marker suffix; a non-ACGT k-mer simply
+        // isn't in the vocab and falls back to <oov>
+        auto kmer_token = [&](const std::string & kmer) {
+            const auto tok = vocab.text_to_token(kmer + dna_kmer_marker);
+            return tok != LLAMA_TOKEN_NULL ? tok : oov_id;
+        };
+
+        size_t i = 0;
+        for (; i + k <= seq.size(); i += k) {
+            output.push_back(kmer_token(seq.substr(i, k)));
+        }
+        if (i < seq.size()) {
+            std::string kmer = seq.substr(i);
+            kmer.append(k - kmer.size(), 'A');
+            output.push_back(kmer_token(kmer));
+        }
+    }
+
+    const llama_vocab & vocab;
+};
+
 //
 // impl
 //
@@ -1808,7 +1900,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             special_mask_id = 103;
 
             add_sep = true;
-        } else if (tokenizer_model == "gpt2") {
+        } else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna") {
             type = LLAMA_VOCAB_TYPE_BPE;
 
             // read bpe merges and populate bpe ranks
@@ -1955,6 +2047,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             } else if (tokenizer_pre == "default") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (tokenizer_pre == "minicpm5") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_MINICPM5;
+                ignore_merges = true;
             } else if (
                     tokenizer_pre == "llama3"   ||
                     tokenizer_pre == "llama-v3" ||
@@ -2112,7 +2207,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             } else if (
                 tokenizer_pre == "gpt-4o" ||
                 tokenizer_pre == "llama4" ||
-                tokenizer_pre == "kanana2") {
+                tokenizer_pre == "kanana2" ||
+                tokenizer_pre == "talkie") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
                 clean_spaces = false;
             } else if (
@@ -2266,6 +2362,23 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
     }
     GGML_ASSERT(id_to_token.size() == token_to_id.size());
 
+    // hybriddna: the marker suffix kept k-mer ids distinct in token_to_id; erase
+    // it from id_to_token so the k-mers detokenize to the bare DNA sequence. The
+    // k-mers are the block right after <oov>, so only scan from there.
+    if (tokenizer_model == "hybriddna") {
+        const auto idx = token_to_id.find("<oov>");
+        if (idx != token_to_id.end()) {
+            auto it = id_to_token.begin() + idx->second + 1;
+            for (; it != id_to_token.end(); ++it) {
+                std::string & text = it->text;
+                if (text.size() > dna_kmer_marker.size()
+                        && text.compare(text.size() - dna_kmer_marker.size(), dna_kmer_marker.size(), dna_kmer_marker) == 0) {
+                    text.erase(text.size() - dna_kmer_marker.size());
+                }
+            }
+        }
+    }
+
     init_tokenizer(type);
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
@@ -3144,11 +3257,19 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                llm_tokenizer_bpe_session session(vocab, *static_cast<const llm_tokenizer_bpe *>(tokenizer.get()));
                 // it calls some other methods that are not exist in llm_tokenizer,
                 // here just cast it to bpe tokenizer object
+                const llm_tokenizer_bpe * tok_bpe = static_cast<const llm_tokenizer_bpe *>(tokenizer.get());
+
+                std::unique_ptr<llm_tokenizer_bpe_session> session;
+                if (vocab.get_tokenizer_model() == "hybriddna") {
+                    session = std::make_unique<llm_tokenizer_hybriddna_session>(vocab, *tok_bpe);
+                } else {
+                    session = std::make_unique<llm_tokenizer_bpe_session>(vocab, *tok_bpe);
+                }
+
                 if (add_special) {
-                    session.append_bos(output);
+                    session->append_bos(output);
                 }
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
@@ -3161,15 +3282,15 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
 #endif
-                        session.tokenize(text, output);
+                        session->tokenize(text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                        session.append(fragment.token, output);
+                        session->append(fragment.token, output);
                     }
                 }
 
                 if (add_special) {
-                    session.append_eos(output);
-                    session.check_double_bos_eos(output);
+                    session->append_eos(output);
+                    session->check_double_bos_eos(output);
                 }
             } break;
         case LLAMA_VOCAB_TYPE_WPM:
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index 8b040b912e2..8ab77594284 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -60,6 +60,7 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_JAIS2           = 49,
     LLAMA_VOCAB_PRE_TYPE_GEMMA4          = 50,
     LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE      = 51,
+    LLAMA_VOCAB_PRE_TYPE_MINICPM5        = 52,
 };
 
 struct LLM_KV;
diff --git a/src/models/delta-net-base.cpp b/src/models/delta-net-base.cpp
index a67238383ed..4f4c7cac7a8 100644
--- a/src/models/delta-net-base.cpp
+++ b/src/models/delta-net-base.cpp
@@ -562,13 +562,13 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn(
     }
 
     const int64_t D = S_v * S_v * H_v;
-    const int64_t K = (int64_t) cparams.n_rs_seq + 1;
+    const int64_t K = cparams.n_rs_seq + 1;
 
     // TODO: remove pad + simplify
-    ggml_tensor * state_in_3d = ggml_reshape_3d(ctx0, s, D, 1, n_seqs);
-    ggml_tensor * state_3d    = ggml_pad(ctx0, state_in_3d, 0, K - 1, 0, 0);
+    ggml_tensor * s_3d     = ggml_reshape_3d(ctx0, s, D, 1, n_seqs);
+    ggml_tensor * s_3d_pad = ggml_pad       (ctx0, s_3d, 0, K - 1, 0, 0);
 
-    ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, state_3d);
+    ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, s_3d_pad);
     if (n_seq_tokens > 1) {
         cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, il);
     } else {
diff --git a/src/models/gemma4-assistant.cpp b/src/models/gemma4-assistant.cpp
new file mode 100644
index 00000000000..78b22714250
--- /dev/null
+++ b/src/models/gemma4-assistant.cpp
@@ -0,0 +1,208 @@
+#include "models.h"
+
+void llama_model_gemma4_assistant::load_arch_hparams(llama_model_loader & ml) {
+    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
+
+    uint32_t n_kv_shared_layers = 0;
+    ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false);
+
+    hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t) n_kv_shared_layers;
+    hparams.f_attention_scale     = 1.0f;
+
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,         hparams.nextn_predict_layers, false);
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,           hparams.rope_freq_base_train_swa, false);
+    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,     hparams.n_swa);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,  hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA,     hparams.n_embd_head_k_swa);
+    ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA,   hparams.n_embd_head_v_swa);
+
+    if (hparams.n_layer == 4) {
+        type = LLM_TYPE_31B;
+    }
+}
+
+void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    if (n_embd_head_k != n_embd_head_v) {
+        throw std::runtime_error("Gemma 4 assistant requires n_embd_head_k == n_embd_head_v");
+    }
+    if (hparams.n_embd_head_k_swa != hparams.n_embd_head_v_swa) {
+        throw std::runtime_error("Gemma 4 assistant requires n_embd_head_k_swa == n_embd_head_v_swa");
+    }
+    if (hparams.n_embd_out() == n_embd) {
+        throw std::runtime_error("Gemma 4 assistant requires embedding_length_out to carry the target hidden size");
+    }
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+    output   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+
+    const int64_t n_embd_backbone = hparams.n_embd_out();
+    nextn_pre_proj  = create_tensor(tn(LLM_TENSOR_NEXTN_PRE_PROJ,  "weight"), { 2*n_embd_backbone, n_embd }, 0);
+    nextn_post_proj = create_tensor(tn(LLM_TENSOR_NEXTN_POST_PROJ, "weight"), { n_embd, n_embd_backbone }, 0);
+
+    int rope_freqs_flag = 0;
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        const int64_t n_head      = hparams.n_head(i);
+        const int64_t n_embd_head = hparams.n_embd_head_k(i);
+        const int64_t n_ff        = hparams.n_ff(i);
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+        layer.wq        = create_tensor(tn(LLM_TENSOR_ATTN_Q,    "weight", i), { n_embd, n_embd_head*n_head }, 0);
+        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,  "weight", i), { n_embd_head*n_head, n_embd }, 0);
+
+        layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM,    "weight", i), { n_embd_head }, 0);
+        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
+
+        layer.out_scale = create_tensor(tn(LLM_TENSOR_LAYER_OUT_SCALE, "weight", i), { 1u }, 0);
+
+        if (!hparams.is_swa(i)) {
+            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_embd_head/2 }, rope_freqs_flag);
+            rope_freqs_flag = TENSOR_DUPLICATED;
+        }
+
+        layer.ffn_norm      = create_tensor(tn(LLM_TENSOR_FFN_NORM,      "weight", i), { n_embd }, 0);
+        layer.ffn_gate      = create_tensor(tn(LLM_TENSOR_FFN_GATE,      "weight", i), { n_embd, n_ff }, 0);
+        layer.ffn_up        = create_tensor(tn(LLM_TENSOR_FFN_UP,        "weight", i), { n_embd, n_ff }, 0);
+        layer.ffn_down      = create_tensor(tn(LLM_TENSOR_FFN_DOWN,      "weight", i), { n_ff, n_embd }, 0);
+        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), { n_embd }, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_gemma4_assistant::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_gemma4_assistant::graph::graph(const llama_model & model, const llm_graph_params & params) :
+        llm_graph_context(params) {
+    GGML_ASSERT(src_mctx  && "Gemma 4 assistant graph requires an MTP source (llama_set_mtp_source)");
+    GGML_ASSERT(src_model && "Gemma 4 assistant graph requires a source model");
+    GGML_ASSERT(src_model->tok_embd && "source model missing tok_embd");
+
+    const auto & src_hparams = src_model->hparams;
+
+    // By convention the MTP draft reads from the trunk's final SWA and full layers.
+    const int32_t src_layer_full = (int32_t) src_hparams.n_layer - 1;
+    const int32_t src_layer_swa  = (int32_t) src_hparams.n_layer - 2;
+    GGML_ASSERT(!src_hparams.is_swa(src_layer_full) && "trunk's last layer must be full attention");
+    GGML_ASSERT( src_hparams.is_swa(src_layer_swa)  && "trunk's penultimate layer must be SWA");
+
+    const int64_t n_embd_backbone = hparams.n_embd_out();
+
+    ggml_tensor * inp_tokens;
+    ggml_tensor * inp_h;
+    {
+        auto inp = std::make_unique<llm_graph_input_embd>(n_embd_backbone);
+
+        inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
+        cb(inp->tokens, "inp_tokens", -1);
+        ggml_set_input(inp->tokens);
+        inp_tokens = inp->tokens;
+        res->t_inp_tokens = inp->tokens;
+
+        inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_backbone, ubatch.n_tokens);
+        cb(inp->embd, "inp_h", -1);
+        ggml_set_input(inp->embd);
+        inp_h = inp->embd;
+        res->t_inp_embd = inp->embd;
+
+        res->add_input(std::move(inp));
+    }
+
+    ggml_tensor * x = ggml_get_rows(ctx0, src_model->tok_embd, inp_tokens);
+    x = ggml_scale(ctx0, x, sqrtf((float) n_embd_backbone));
+    cb(x, "inp_embd_target", -1);
+
+    ggml_tensor * xh = ggml_concat(ctx0, x, inp_h, 0);
+    cb(xh, "inp_xh", -1);
+
+    ggml_tensor * cur = ggml_mul_mat(ctx0, model.nextn_pre_proj, xh);
+    cb(cur, "pre_proj", -1);
+
+    auto *        inp_attn    = build_attn_inp_src_kv_iswa();
+    ggml_tensor * inp_pos     = build_inp_pos();
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    ggml_tensor * inpL = cur;
+
+    for (int il = 0; il < n_layer; ++il) {
+        const bool    is_swa = hparams.is_swa(il);
+        const int32_t il_src = is_swa ? src_layer_swa : src_layer_full;
+
+        const int64_t n_embd_head = hparams.n_embd_head_k(il);
+        const int64_t n_head      = hparams.n_head(il);
+
+        const float freq_base_l  = model.get_rope_freq_base(cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+        const int   n_rot_l      = hparams.n_rot(il);
+
+        ggml_tensor * cur_norm = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur_norm, "attn_norm", il);
+
+        ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur_norm);
+        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+        Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
+        cb(Qcur, "Qcur_normed", il);
+
+        ggml_tensor * freq_factors = is_swa ? nullptr : model.layers[il].rope_freqs;
+        Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig,
+                             freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow);
+        cb(Qcur, "Qcur_pos", il);
+
+        cur = build_attn(inp_attn, model.layers[il].wo, nullptr, nullptr,
+                Qcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il, il_src);
+
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur  = ggml_get_rows(ctx0, cur,  inp_out_ids);
+            inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+        }
+
+        cur = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "attn_post_norm", il);
+
+        ggml_tensor * attn_out = ggml_add(ctx0, cur, inpL);
+        cb(attn_out, "attn_out", il);
+
+        cur = build_norm(attn_out, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up,   nullptr, nullptr,
+                model.layers[il].ffn_gate, nullptr, nullptr,
+                model.layers[il].ffn_down, nullptr, nullptr,
+                nullptr,
+                LLM_FFN_GELU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        cur = build_norm(cur, model.layers[il].ffn_post_norm, nullptr, LLM_NORM_RMS, -1);
+        cb(cur, "ffn_post_norm", il);
+
+        cur = ggml_add(ctx0, cur, attn_out);
+
+        cur = ggml_mul(ctx0, cur, model.layers[il].out_scale);
+        cb(cur, "out_scaled", il);
+
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+
+    ggml_tensor * logits = build_lora_mm(model.output, cur);
+    cb(logits, "result_output", -1);
+    res->t_logits = logits;
+
+    ggml_tensor * h_next = ggml_mul_mat(ctx0, model.nextn_post_proj, cur);
+    cb(h_next, "result_h_pre_norm", -1);
+    res->t_h_pre_norm = h_next;
+
+    ggml_build_forward_expand(gf, logits);
+    ggml_build_forward_expand(gf, h_next);
+}
diff --git a/src/models/gemma4.cpp b/src/models/gemma4.cpp
index 4f9d8b18bc7..e0c7911d246 100644
--- a/src/models/gemma4.cpp
+++ b/src/models/gemma4.cpp
@@ -245,7 +245,7 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para
         }
 
         // TODO @ngxson : strip unused token right after the last KV layer to speed up prompt processing
-        if (il == n_layer - 1 && inp_out_ids) {
+        if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_pre_norm_masked) {
             cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
             inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
         }
@@ -345,7 +345,7 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para
             ggml_tensor * inp_this_layer = ggml_view_2d_slice(ctx0, inp_per_layer, il); // [n_embd_per_layer, n_tokens]
 
             // TODO @ngxson : improve this
-            if (il == n_layer - 1 && inp_out_ids) {
+            if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_pre_norm_masked) {
                 inp_this_layer = ggml_get_rows(ctx0, inp_this_layer, inp_out_ids);
             }
 
@@ -372,6 +372,12 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para
     }
     cur = inpL;
 
+    res->t_h_pre_norm = cur;
+
+    if (!cparams.embeddings_pre_norm_masked && inp_out_ids) {
+        cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+    }
+
     cur = build_norm(cur,
             model.output_norm, nullptr,
             LLM_NORM_RMS, -1);
diff --git a/src/models/mistral3.cpp b/src/models/mistral3.cpp
index 4e6ebef82cb..1ac5a95ccdc 100644
--- a/src/models/mistral3.cpp
+++ b/src/models/mistral3.cpp
@@ -177,9 +177,9 @@ llama_model_mistral3::graph::graph(const llama_model & model, const llm_graph_pa
             cb(cur, "ffn_norm", il);
 
             cur = build_ffn(cur,
-                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
-                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
-                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   model.layers[il].ffn_up_s,
+                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, model.layers[il].ffn_gate_s,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down_s,
                     NULL,
                     LLM_FFN_SILU, LLM_FFN_PAR, il);
             cb(cur, "ffn_out", il);
@@ -200,7 +200,11 @@ llama_model_mistral3::graph::graph(const llama_model & model, const llm_graph_pa
                     LLM_FFN_SILU, true,
                     hparams.expert_weights_scale,
                     LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
-                    il);
+                    il,
+                    nullptr, nullptr,
+                    model.layers[il].ffn_up_exps_s,
+                    model.layers[il].ffn_gate_exps_s,
+                    model.layers[il].ffn_down_exps_s);
             cb(cur, "ffn_moe_out", il);
         }
         cur = ggml_add(ctx0, cur, ffn_inp);
diff --git a/src/models/models.h b/src/models/models.h
index 7e551eb965b..fd5cb52752b 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -186,6 +186,19 @@ struct llama_model_maincoder : public llama_model_base {
 };
 
 
+struct llama_model_talkie : public llama_model_base {
+    llama_model_talkie(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
+
+
 struct llama_model_deci : public llama_model_base {
     llama_model_deci(const struct llama_model_params & params) : llama_model_base(params) {}
     void load_arch_hparams(llama_model_loader & ml) override;
@@ -797,6 +810,19 @@ struct llama_model_gemma4 : public llama_model_base {
 };
 
 
+struct llama_model_gemma4_assistant : public llama_model_base {
+    llama_model_gemma4_assistant(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
+
+
 struct llama_model_gemma_embedding : public llama_model_base {
     llama_model_gemma_embedding(const struct llama_model_params & params) : llama_model_base(params) {}
     void load_arch_hparams(llama_model_loader & ml) override;
diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp
index 361d7538a03..04ecc18fcdc 100644
--- a/src/models/qwen35.cpp
+++ b/src/models/qwen35.cpp
@@ -496,7 +496,8 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
     const int64_t n_embd_head = hparams.n_embd_head_v();
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
 
-    // The MTP block lives at the source file's original layer index.
+    // hparams.n_layer includes both main model layers and MTP layers. The MTP
+    // layer is stored immediately after the main layers in model.layers[].
     const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers;
     const auto & layer = model.layers[il];
 
@@ -524,8 +525,9 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
 
     res->add_input(std::move(inp));
 
-    ggml_tensor * inp_pos = build_inp_pos();
-    auto * inp_attn       = build_attn_inp_kv();
+    ggml_tensor * inp_pos     = build_inp_pos();
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+    auto * inp_attn           = build_attn_inp_kv();
 
     ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
     cb(h_norm, "mtp_hnorm", il);
@@ -536,7 +538,7 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
     ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0);
     cb(concat, "mtp_concat", il);
 
-    ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat);
+    ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s);
     cb(cur, "mtp_eh_proj", il);
 
     ggml_tensor * inpSA = cur;
@@ -614,6 +616,8 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
     cb(cur, "h_pre_norm", -1);
     res->t_h_pre_norm = cur;
 
+    cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+
     ggml_tensor * head_norm_w = layer.nextn.shared_head_norm
             ? layer.nextn.shared_head_norm
             : model.output_norm;
@@ -622,8 +626,9 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
     cb(cur, "mtp_shared_head_norm", -1);
 
     ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output;
+    ggml_tensor * head_s = layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : model.output_s;
     GGML_ASSERT(head_w && "QWEN35 MTP: missing LM head (nextn.shared_head_head or model.output)");
-    cur = build_lora_mm(head_w, cur);
+    cur = build_lora_mm(head_w, cur, head_s);
     cb(cur, "result_output", -1);
 
     res->t_logits = cur;
diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp
index 4f63c410d66..dc24f6ed537 100644
--- a/src/models/qwen35moe.cpp
+++ b/src/models/qwen35moe.cpp
@@ -588,8 +588,10 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
 
     res->add_input(std::move(inp));
 
-    ggml_tensor * inp_pos = build_inp_pos();
-    auto * inp_attn       = build_attn_inp_kv();
+    ggml_tensor * inp_pos     = build_inp_pos();
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+    auto * inp_attn           = build_attn_inp_kv();
+
 
     ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
     cb(h_norm, "mtp_hnorm", il);
@@ -600,7 +602,7 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
     ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0);
     cb(concat, "mtp_concat", il);
 
-    ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat);
+    ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s);
     cb(cur, "mtp_eh_proj", il);
 
     ggml_tensor * inpSA = cur;
@@ -710,6 +712,8 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
     cb(cur, "h_pre_norm", -1);
     res->t_h_pre_norm = cur;
 
+    cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+
     ggml_tensor * head_norm_w = layer.nextn.shared_head_norm
             ? layer.nextn.shared_head_norm
             : model.output_norm;
@@ -718,8 +722,9 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
     cb(cur, "mtp_shared_head_norm", -1);
 
     ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output;
+    ggml_tensor * head_s = layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : model.output_s;
     GGML_ASSERT(head_w && "QWEN35MOE MTP: missing LM head (nextn.shared_head_head or model.output)");
-    cur = build_lora_mm(head_w, cur);
+    cur = build_lora_mm(head_w, cur, head_s);
     cb(cur, "result_output", -1);
 
     res->t_logits = cur;
diff --git a/src/models/talkie.cpp b/src/models/talkie.cpp
new file mode 100644
index 00000000000..1258eeb19b6
--- /dev/null
+++ b/src/models/talkie.cpp
@@ -0,0 +1,149 @@
+#include "models.h"
+
+void llama_model_talkie::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_LOGIT_SCALE,                 hparams.f_logit_scale);
+
+    switch (hparams.n_layer) {
+        case 40: type = LLM_TYPE_13B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_talkie::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+    output   = create_tensor(tn(LLM_TENSOR_OUTPUT,     "weight"), {n_embd, n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        // no k gain
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {1, n_head}, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+
+        layer.out_scale = create_tensor(tn(LLM_TENSOR_LAYER_OUT_SCALE, "weight", i), {1}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_talkie::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_talkie::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_k();
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_v());
+    GGML_ASSERT(n_embd_head == n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+    inpL = build_norm(inpL, nullptr, nullptr, LLM_NORM_RMS, -1);
+    cb(inpL, "inp_norm", -1);
+
+    ggml_tensor * embd_skip = inpL;
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+        ggml_tensor * inp_skip = embd_skip;
+
+        cur = build_norm(inpL, nullptr, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
+                    n_embd_head, n_head, n_head_kv, il);
+
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            // reference applies qknorm after rope
+            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
+            cb(Qcur, "Qcur_norm", il);
+
+            Kcur = build_norm(Kcur, nullptr, nullptr, LLM_NORM_RMS, il);
+            cb(Kcur, "Kcur_norm", il);
+
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, nullptr, model.layers[il].wo_s,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur      = ggml_get_rows(ctx0, cur,      inp_out_ids);
+            inpSA    = ggml_get_rows(ctx0, inpSA,    inp_out_ids);
+            inp_skip = ggml_get_rows(ctx0, inp_skip, inp_out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        cur = build_norm(ffn_inp, nullptr, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up,   nullptr, nullptr,
+                model.layers[il].ffn_gate, nullptr, nullptr,
+                model.layers[il].ffn_down, nullptr, model.layers[il].ffn_down_s,
+                nullptr,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        ggml_tensor * skip = ggml_mul(ctx0, inp_skip, model.layers[il].out_scale);
+        cb(skip, "embd_skip", il);
+
+        cur = ggml_add(ctx0, cur, skip);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    cur = build_norm(cur, nullptr, nullptr, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+
+    res->t_embd = cur;
+
+    cur = build_lora_mm(model.output, cur);
+    cur = ggml_scale(ctx0, cur, hparams.f_logit_scale);
+    cb(cur, "result_output", -1);
+
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 0fdbd39c94a..33ae3b303cf 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -255,6 +255,10 @@ set_tests_properties(test-state-restore-fragmented PROPERTIES FIXTURES_REQUIRED
 llama_build_and_test(test-recurrent-state-rollback.cpp LABEL "model" ARGS -m "${MODEL_DEST}")
 set_tests_properties(test-recurrent-state-rollback PROPERTIES FIXTURES_REQUIRED test-download-model)
 
+# Test state save/load functionality
+llama_build_and_test(test-save-load-state.cpp LABEL "model" ARGS -m "${MODEL_DEST}")
+set_tests_properties(test-save-load-state PROPERTIES FIXTURES_REQUIRED test-download-model)
+
 if (NOT GGML_BACKEND_DL)
     # these tests use the backends directly and cannot be built with dynamic loading
     llama_build_and_test(test-barrier.cpp)
diff --git a/tests/gguf-model-data.cpp b/tests/gguf-model-data.cpp
index d277173c466..fe8b4ca76e7 100644
--- a/tests/gguf-model-data.cpp
+++ b/tests/gguf-model-data.cpp
@@ -630,10 +630,11 @@ std::optional<gguf_remote_model> gguf_fetch_model_meta(
         }
 
         for (int i = 2; i <= model.n_split; i++) {
-            char num_buf[6], total_buf[6];
-            snprintf(num_buf,   sizeof(num_buf),   "%05d", i);
-            snprintf(total_buf, sizeof(total_buf), "%05d", (int)model.n_split);
-            std::string shard_name = split_prefix + "-" + num_buf + "-of-" + total_buf + ".gguf";
+            char buf_num[32];
+            char buf_tot[32];
+            snprintf(buf_num, sizeof(buf_num), "%05d", i);
+            snprintf(buf_tot, sizeof(buf_tot), "%05d", (int)model.n_split);
+            std::string shard_name = split_prefix + "-" + buf_num + "-of-" + buf_tot + ".gguf";
 
             auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part, verbose);
             if (!shard.has_value()) {
@@ -704,10 +705,11 @@ gguf_context_ptr gguf_fetch_gguf_ctx(
         }
 
         for (int i = 2; i <= model.n_split; i++) {
-            char num_buf[6], total_buf[6];
-            snprintf(num_buf,   sizeof(num_buf),   "%05d", i);
-            snprintf(total_buf, sizeof(total_buf), "%05d", (int)model.n_split);
-            std::string shard_name = split_prefix + "-" + num_buf + "-of-" + total_buf + ".gguf";
+            char buf_num[32];
+            char buf_tot[32];
+            snprintf(buf_num, sizeof(buf_num), "%05d", i);
+            snprintf(buf_tot, sizeof(buf_tot), "%05d", (int)model.n_split);
+            std::string shard_name = split_prefix + "-" + buf_num + "-of-" + buf_tot + ".gguf";
 
             auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part, verbose);
             if (!shard.has_value()) {
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index f54ab41c195..3853f03297b 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -21,6 +21,7 @@
 #include <ggml-cpp.h>
 
 #include <algorithm>
+#include <atomic>
 #include <array>
 #include <cfloat>
 #include <cinttypes>
@@ -33,6 +34,7 @@
 #include <future>
 #include <fstream>
 #include <memory>
+#include <mutex>
 #include <random>
 #include <regex>
 #include <set>
@@ -55,33 +57,24 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
     {
         // parallel initialization
         static const size_t n_threads = N_THREADS;
-        // static RNG initialization (revisit if n_threads stops being constant)
-        static std::vector<std::default_random_engine> generators = []() {
-            std::random_device rd;
-            std::vector<std::default_random_engine> vec;
-            vec.reserve(n_threads);
-            //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
-            for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
-            return vec;
-        }();
-
-        auto init_thread = [&](size_t ith, size_t start, size_t end) {
+
+        auto init_thread = [&](size_t start, size_t end) {
+            thread_local std::default_random_engine gen(std::random_device{}());
             std::uniform_real_distribution<float> distribution(min, max);
-            auto & gen = generators[ith];
             for (size_t i = start; i < end; i++) {
                 data[i] = distribution(gen);
             }
         };
 
         if (n_threads == 1) {
-            init_thread(0, 0, nels);
+            init_thread(0, nels);
         } else {
             std::vector<std::future<void>> tasks;
             tasks.reserve(n_threads);
             for (size_t i = 0; i < n_threads; i++) {
                 size_t start =     i*nels/n_threads;
                 size_t end   = (i+1)*nels/n_threads;
-                tasks.push_back(std::async(std::launch::async, init_thread, i, start, end));
+                tasks.push_back(std::async(std::launch::async, init_thread, start, end));
             }
             for (auto & t : tasks) {
                 t.get();
@@ -516,6 +509,25 @@ static bool output_format_from_str(const std::string & s, output_formats & forma
     return true;
 }
 
+static std::string test_time_now() {
+    time_t t = time(NULL);
+    struct tm tm_buf;
+#ifdef _WIN32
+    if (gmtime_s(&tm_buf, &t) != 0) {
+        return "";
+    }
+#else
+    if (gmtime_r(&t, &tm_buf) == nullptr) {
+        return "";
+    }
+#endif
+    char buf[32];
+    if (std::strftime(buf, sizeof(buf), "%FT%TZ", &tm_buf) == 0) {
+        return "";
+    }
+    return buf;
+}
+
 // Test result structure for SQL output
 struct test_result {
     std::string test_time;
@@ -545,11 +557,7 @@ struct test_result {
         supported      = false;
         passed         = false;
 
-        // Set test time
-        time_t t = time(NULL);
-        char   buf[32];
-        std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
-        test_time = buf;
+        test_time = test_time_now();
 
         // Set build info
         build_commit = ggml_commit();
@@ -573,11 +581,7 @@ struct test_result {
         n_runs(n_runs),
         device_description(device_description),
         backend_reg_name(backend_reg_name) {
-        // Set test time
-        time_t t = time(NULL);
-        char   buf[32];
-        std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
-        test_time = buf;
+        test_time = test_time_now();
 
         // Set build info
         build_commit = ggml_commit();
@@ -1110,6 +1114,17 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
     GGML_ABORT("invalid output format");
 }
 
+static std::mutex g_test_output_mutex;
+
+static void print_test_result_locked(printer * output_printer, const test_result & result) {
+    if (output_printer == nullptr) {
+        return;
+    }
+
+    std::lock_guard<std::mutex> guard(g_test_output_mutex);
+    output_printer->print_test_result(result);
+}
+
 struct test_case {
     virtual ~test_case() {}
 
@@ -1338,9 +1353,7 @@ struct test_case {
             test_result result(ggml_backend_name(backend1), current_op_name, vars(), "test",
                              false, false, "not supported");
 
-            if (output_printer) {
-                output_printer->print_test_result(result);
-            }
+            print_test_result_locked(output_printer, result);
 
             ggml_free(ctx);
             return test_status_t::NOT_SUPPORTED;
@@ -1462,9 +1475,7 @@ struct test_case {
         test_result result(ggml_backend_name(backend1), current_op_name, vars(), "test", supported, test_passed,
                            error_msg);
 
-        if (output_printer) {
-            output_printer->print_test_result(result);
-        }
+        print_test_result_locked(output_printer, result);
 
         return test_passed ? test_status_t::OK : test_status_t::FAIL;
     }
@@ -2866,15 +2877,24 @@ struct test_set : public test_case {
 struct test_cpy : public test_case {
     const ggml_type type_src;
     const ggml_type type_dst;
-    const std::array<int64_t, 4> ne;
+    const std::array<int64_t, 4> ne_src;
+    const std::array<int64_t, 4> ne_dst;
     const std::array<int64_t, 4> permute_src;
     const std::array<int64_t, 4> permute_dst;
     bool _src_use_permute;
     bool _dst_use_permute;
     bool _src_transpose;
+    bool _use_dst_shape;
 
     std::string vars() override {
-        return VARS_TO_STR6(type_src, type_dst, ne, permute_src, permute_dst, _src_transpose);
+        if (_use_dst_shape) {
+            return VARS_TO_STR7(type_src, type_dst, ne_src, ne_dst, permute_src, permute_dst, _src_transpose);
+        }
+        return VARS_TO_STR6(type_src, type_dst, ne_src, permute_src, permute_dst, _src_transpose);
+    }
+
+    int64_t total_elements() const {
+        return ne_src[0] * ne_src[1] * ne_src[2] * ne_src[3];
     }
 
     double max_nmse_err() override {
@@ -2899,7 +2919,7 @@ struct test_cpy : public test_case {
                 err_estimate /= 8.0f;
             }
             err_estimate *= err_estimate;
-            err_estimate /= (150.0f*150.0f*0.25f)*float(ne[0] * ne[1] * ne[2] * ne[3]);
+            err_estimate /= (150.0f*150.0f*0.25f)*float(total_elements());
             return err_estimate;
         }
         return 1e-6;
@@ -2910,17 +2930,19 @@ struct test_cpy : public test_case {
     }
 
     test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 1},
+            std::array<int64_t, 4> ne_src = {10, 10, 10, 1},
+            std::array<int64_t, 4> ne_dst = {-1, -1, -1, -1},
             std::array<int64_t, 4> permute_src = {0, 0, 0, 0},
             std::array<int64_t, 4> permute_dst = {0, 0, 0, 0},
             bool transpose_src = false)
-        : type_src(type_src), type_dst(type_dst), ne(ne), permute_src(permute_src), permute_dst(permute_dst),
+        : type_src(type_src), type_dst(type_dst), ne_src(ne_src), ne_dst(ne_dst), permute_src(permute_src), permute_dst(permute_dst),
           _src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0),
           _dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0),
-          _src_transpose(transpose_src){}
+          _src_transpose(transpose_src),
+          _use_dst_shape(ne_dst[0] >= 0 && ne_dst[1] >= 0 && ne_dst[2] >= 0 && ne_dst[3] >= 0){}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
+        ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne_src.data());
         ggml_set_param(src);
         ggml_set_name(src, "src");
 
@@ -2934,7 +2956,8 @@ struct test_cpy : public test_case {
             ggml_set_name(src, "src_transposed");
         }
 
-        ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
+        std::array<int64_t, 4> dst_ne = _use_dst_shape ? ne_dst : std::array<int64_t, 4>{src->ne[0], src->ne[1], src->ne[2], src->ne[3]};
+        ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, dst_ne.data());
         ggml_set_name(dst, "dst");
 
         if (_dst_use_permute) {
@@ -8040,42 +8063,72 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
 
         for (int k = 1; k < 4; ++k) {
             test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}));
-            test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 2, 1, 3}));
-            test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 3, 1, 2}, {0, 2, 1, 3}));
+            test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3}));
+            test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {-1,-1,-1,-1}, {0, 3, 1, 2}, {0, 2, 1, 3}));
         }
     }
 
     for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) {
         for (ggml_type type_dst : all_types) {
             test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
-            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
+            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); // cpy by rows
         }
     }
     for (ggml_type type_src : all_types) {
         for (ggml_type type_dst : {GGML_TYPE_F32}) {
             test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
-            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
+            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); // cpy by rows
         }
     }
     for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
         for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) {
-            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous
+            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {-1,-1,-1,-1}, {1, 0, 2, 3})); // cpy not-contiguous
         }
     }
     test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}, {1, 0, 2, 3}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}, {-1,-1,-1,-1}, {1, 0, 2, 3}));
     test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}, {1, 0, 2, 3}));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 3}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}, {-1,-1,-1,-1}, {1, 0, 2, 3}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 3, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 3}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 3, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
+
+    // CPY - different src/dst shapes (reshaping via CPY)
+    // Use permutations of {3, 5, 7, 32}. Total elements: 3*5*7*32 = 3360.
+    // Each src permutation is tested against canonical sorted and reverse dst (skip self).
+    {
+        std::array<int64_t, 4> dims = {3, 5, 7, 32};
+        std::sort(dims.begin(), dims.end());
+        std::array<int64_t, 4> canonical = dims;
+        std::array<int64_t, 4> reversed  = {32, 7, 5, 3};
+        for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+            std::array<int64_t, 4> cur = dims;
+            do {
+                if (cur != canonical) {
+                    test_cases.emplace_back(new test_cpy(type, type, cur, canonical));
+                }
+                if (cur != reversed) {
+                    test_cases.emplace_back(new test_cpy(type, type, cur, reversed));
+                }
+                if (cur[0] == 32 && type == GGML_TYPE_F32) {
+                    if (canonical[0] == 32) {
+                        test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0, cur, canonical));
+                    }
+                    if (reversed[0] == 32) {
+                        test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0, cur, reversed));
+                    }
+                }
+                std::next_permutation(cur.begin(), cur.end());
+            } while (cur != canonical);
+        }
+    }
 
     for (ggml_type type_dst : { GGML_TYPE_F32, GGML_TYPE_I32, GGML_TYPE_F16, GGML_TYPE_BF16 }) {
         for (bool use_view_slice : { true, false }) {
@@ -8266,6 +8319,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_mul_mat_hadamard(GGML_TYPE_F32, GGML_TYPE_F32, 64, 1, 64));
     test_cases.emplace_back(new test_mul_mat_hadamard(GGML_TYPE_F32, GGML_TYPE_F32, 256, 1, 256));
     test_cases.emplace_back(new test_mul_mat_hadamard(GGML_TYPE_F32, GGML_TYPE_F32, 128, 32, 128));
+    test_cases.emplace_back(new test_mul_mat_hadamard(GGML_TYPE_F32, GGML_TYPE_F32, 128, 4, 128, {2, 3}));
 
 #if 0
     // > 4GB A matrix. Too slow to be enabled by default.
@@ -8830,9 +8884,24 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {256, 16, 2, 3}, 1));
     test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {128, 16, 2, 3}, 2));
     test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {64, 16, 2, 3}, 3));
+
     test_cases.emplace_back(new test_pad());
     test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {33, 17, 2, 1}, 4, 3, true)); // circular
     test_cases.emplace_back(new test_pad_ext());
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 1, 1, 1}, 1, 0, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 2, 1, 1}, 1, 0, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 16, 1, 1}, 0, 1, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1023, 1, 1, 1}, 1, 0, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1023, 8, 1, 1}, 1, 0, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1025, 1, 1, 1}, 1, 0, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1025, 8, 1, 1}, 1, 0, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2048, 1, 1, 1}, 1, 0, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2048, 4, 1, 1}, 1, 0, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2049, 1, 1, 1}, 1, 0, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 1, 1, 1}, 100, 0, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 1, 1, 1}, 0, 100, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 100, 1, 1}, 50, 50, false));
+
     test_cases.emplace_back(new test_pad_reflect_1d());
     test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 4, 1}));
     test_cases.emplace_back(new test_roll());
@@ -9132,22 +9201,21 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
 
     test_cases.emplace_back(new test_cpy(GGML_TYPE_F32,  GGML_TYPE_F16,  {512, 3072, 1, 1}));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32,  GGML_TYPE_F32,  {8192, 512, 2, 1}, {0, 2, 1, 3}));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32,  GGML_TYPE_F32,  {3072, 512, 2, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32,  GGML_TYPE_F32,  {8192, 512, 2, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32,  GGML_TYPE_F32,  {3072, 512, 2, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3}));
     test_cases.emplace_back(new test_cpy(GGML_TYPE_F32,  GGML_TYPE_Q4_0, {8192, 512, 2, 1}));
     test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_F32,  {8192, 512, 2, 1}));
 
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
-
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
 
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
 
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f));
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {12888, 256, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f));
@@ -9337,6 +9405,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
 
     // Examples from granite-4.0-h-1b/ggml-model-Q8_0.gguf
     test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {515, 3328, 1, 1}, {4, 3328, 1, 1})); // prefill
+    test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {937, 8192, 1, 1}, {4, 8192, 1, 1})); // prefill
     test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4,   3328, 1, 1}, {4, 3328, 1, 1})); // generate
     test_cases.emplace_back(new test_ssm_conv_bias_silu(GGML_TYPE_F32, {515, 3328, 1, 1}, {4, 3328, 1, 1}, true));  // prefill
     test_cases.emplace_back(new test_ssm_conv_bias_silu(GGML_TYPE_F32, {4,   3328, 1, 1}, {4, 3328, 1, 1}, true));  // generate
@@ -9435,8 +9504,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_from_file(const c
     return test_cases;
 }
 
-static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_names_filter, const char * params_filter,
-                         printer * output_printer, const char * test_file_path) {
+static bool test_backend(ggml_backend_t backend, ggml_backend_dev_t dev, test_mode mode, const char * op_names_filter, const char * params_filter,
+                         printer * output_printer, const char * test_file_path, int parallel_workers) {
     auto filter_test_cases = [](std::vector<std::unique_ptr<test_case>> & test_cases, const char * params_filter) {
         if (params_filter == nullptr) {
             return;
@@ -9489,21 +9558,90 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
             set_use_ref(backend_cpu, true);
         }
 
-        size_t n_ok = 0;
-        size_t                   tests_run = 0;
+        std::atomic<size_t> n_ok = 0;
+        std::atomic<size_t> tests_run = 0;
         std::vector<std::string> failed_tests;
-        for (auto & test : test_cases) {
-            test_status_t status = test->eval(backend, backend_cpu, op_names_filter, output_printer);
-            if (status == test_status_t::SKIPPED || status == test_status_t::NOT_SUPPORTED) {
-                continue;
+        std::mutex failed_tests_mutex;
+
+        // Each worker grabs a chunk of cases at a time. The chunk shrinks as we
+        // run out of work so that a few slow tests at the tail get spread across
+        // workers instead of landing on one unlucky thread.
+        constexpr size_t MAX_TESTS_PER_ITER = 100;
+        std::atomic<size_t> test_idx = 0;
+
+        const auto & next_chunk = [&](size_t & my_begin, size_t & my_end) {
+            const size_t cur = test_idx.load(std::memory_order_relaxed);
+            const size_t remaining = cur < test_cases.size() ? test_cases.size() - cur : 0;
+            const size_t chunk = std::max<size_t>(1, std::min<size_t>(MAX_TESTS_PER_ITER, remaining / parallel_workers));
+            my_begin = test_idx.fetch_add(chunk);
+            my_end = std::min(my_begin + chunk, test_cases.size());
+        };
+
+        const auto & run_tests = [&](ggml_backend_t b, ggml_backend_t b_cpu) {
+            size_t my_begin, my_end;
+            next_chunk(my_begin, my_end);
+            while (my_begin < test_cases.size()) {
+                for (size_t i = my_begin; i < my_end; ++i) {
+                    auto & test = test_cases[i];
+                    test_status_t status = test->eval(b, b_cpu, op_names_filter, output_printer);
+                    if (status == test_status_t::SKIPPED || status == test_status_t::NOT_SUPPORTED) {
+                        continue;
+                    }
+                    tests_run++;
+                    if (status == test_status_t::OK) {
+                        n_ok++;
+                    } else if (status == test_status_t::FAIL) {
+                        std::lock_guard<std::mutex> guard(failed_tests_mutex);
+                        failed_tests.push_back(test->current_op_name + "(" + test->vars() + ")");
+                    }
+                }
+                next_chunk(my_begin, my_end);
             }
-            tests_run++;
-            if (status == test_status_t::OK) {
-                n_ok++;
-            } else if (status == test_status_t::FAIL) {
-                failed_tests.push_back(test->current_op_name + "(" + test->vars() + ")");
+        };
+
+        if (parallel_workers <= 1) {
+            // Reuse the outer backend / backend_cpu so we don't pay an
+            // extra CPU backend init.
+            run_tests(backend, backend_cpu);
+        } else {
+            std::atomic<size_t> workers_started = 0;
+
+            const auto & eval_worker = [&]() {
+                ggml_backend_t b = ggml_backend_dev_init(dev, NULL);
+                if (b == NULL) {
+                    return;
+                }
+
+                ggml_backend_t b_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
+                if (b_cpu == NULL) {
+                    ggml_backend_free(b);
+                    return;
+                }
+
+                if (set_use_ref) {
+                    set_use_ref(b_cpu, true);
+                }
+                workers_started++;
+                run_tests(b, b_cpu);
+                ggml_backend_free(b_cpu);
+                ggml_backend_free(b);
+            };
+
+            std::vector<std::thread> threads;
+            threads.reserve(parallel_workers);
+            for (int i = 0; i < parallel_workers; ++i) {
+                threads.emplace_back(eval_worker);
+            }
+            for (auto & t : threads) {
+                t.join();
+            }
+
+            if (workers_started == 0 && !test_cases.empty()) {
+                ggml_backend_free(backend_cpu);
+                return false;
             }
         }
+
         output_printer->print_summary(test_summary_info(n_ok, tests_run, false));
         output_printer->print_failed_tests(failed_tests);
 
@@ -9651,7 +9789,7 @@ static void show_test_coverage() {
 
 static void usage(char ** argv) {
     printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>] [--list-ops]", argv[0]);
-    printf(" [--show-coverage] [--test-file <path>]\n");
+    printf(" [--show-coverage] [--test-file <path>] [-j <n>]\n");
     printf("    valid modes:\n");
     printf("      - test (default, compare with CPU backend for correctness)\n");
     printf("      - grad (compare gradients from backpropagation with method of finite differences)\n");
@@ -9663,6 +9801,7 @@ static void usage(char ** argv) {
     printf("    --list-ops lists all available GGML operations\n");
     printf("    --show-coverage shows test coverage\n");
     printf("    --test-file reads test operators from a test file generated by llama-export-graph-ops\n");
+    printf("    -j <n> runs tests using <n> parallel worker threads (default: 1, test mode only)\n");
 }
 
 int main(int argc, char ** argv) {
@@ -9672,6 +9811,7 @@ int main(int argc, char ** argv) {
     const char * backend_filter = nullptr;
     const char * params_filter = nullptr;
     const char * test_file_path = nullptr;
+    int parallel_workers = 1;
 
     for (int i = 1; i < argc; i++) {
         if (strcmp(argv[i], "test") == 0) {
@@ -9726,6 +9866,17 @@ int main(int argc, char ** argv) {
                 usage(argv);
                 return 1;
             }
+        } else if (strcmp(argv[i], "-j") == 0) {
+            if (i + 1 < argc) {
+                parallel_workers = atoi(argv[++i]);
+                if (parallel_workers < 1) {
+                    usage(argv);
+                    return 1;
+                }
+            } else {
+                usage(argv);
+                return 1;
+            }
         } else {
             usage(argv);
             return 1;
@@ -9778,7 +9929,7 @@ int main(int argc, char ** argv) {
                                                              false, "", ggml_backend_dev_description(dev),
                                                              total / 1024 / 1024, free / 1024 / 1024, true));
 
-        bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get(), test_file_path);
+        bool ok = test_backend(backend, dev, mode, op_names_filter, params_filter, output_printer.get(), test_file_path, parallel_workers);
 
         if (ok) {
             n_ok++;
diff --git a/tests/test-chat-auto-parser.cpp b/tests/test-chat-auto-parser.cpp
index 1d96de718e2..6f8e957489c 100644
--- a/tests/test-chat-auto-parser.cpp
+++ b/tests/test-chat-auto-parser.cpp
@@ -81,6 +81,8 @@ static void test_normalize_quotes_with_embedded_quotes(testing & t);
 // TAG_WITH_TAGGED argument parsing tests
 static void test_tagged_args_with_embedded_quotes(testing & t);
 
+static void test_role_markers_all_templates(testing & t);
+
 int main(int argc, char * argv[]) {
     testing t(std::cout);
     t.verbose = true;
@@ -103,6 +105,7 @@ int main(int argc, char * argv[]) {
     t.test("standard_json_tools", test_standard_json_tools_formats);
     t.test("normalize_quotes_to_json", test_normalize_quotes_to_json);
     t.test("tagged_args_embedded_quotes", test_tagged_args_with_embedded_quotes);
+    t.test("role_markers_all_templates", test_role_markers_all_templates);
 
     return t.summary();
 }
@@ -714,7 +717,7 @@ static void test_compare_variants_both_modifiers(testing & t) {
 static void test_compare_variants_template_failure(testing & t) {
     // Test with template that causes failure during application (not construction)
     // We use a valid template syntax but one that will fail during application
-    common_chat_template tmpl("{{ messages[0]['nonexistent_field'] }}", "", "");
+    common_chat_template tmpl("{{ messages.cahoot()[0]['nonexistent_field'] }}", "", "");
 
     template_params params;
     params.messages = json::array({
@@ -1848,6 +1851,128 @@ static json build_edit_tool() {
     });
 }
 
+// ============================================================================
+// Role marker detection tests for all autoparser-handled templates
+//
+// Verifies that detect_user_start_marker / detect_assistant_start_marker
+// return the correct boundary text between turns for every template that
+// falls through to the differential autoparser (i.e. is not handled by a
+// dedicated specialized template in common_chat_try_specialized_template).
+//
+// Markers were deduced manually from the jinja sources in models/templates/.
+// ============================================================================
+struct role_marker_case {
+    std::string template_file;
+    std::string expected_user_start;
+    std::string expected_assistant_start;
+};
+
+static void test_role_markers_all_templates(testing & t) {
+    // Each entry is { template filename, user_start, assistant_start } as
+    // produced when rendering the standard chatml-like sequences. The values
+    // come from reading each jinja template and tracing what text precedes
+    // a user/assistant message body once the autoparser strips any reasoning
+    // markers it detected first.
+    const std::vector<role_marker_case> cases = {
+        // ChatML family: <|im_start|>{role} ... <|im_end|>
+        { "Bielik-11B-v3.0-Instruct.jinja",                  "<|im_start|>user",       "<|im_start|>assistant"      },
+        { "HuggingFaceTB-SmolLM3-3B.jinja",                  "<|im_start|>user",       "<|im_start|>assistant"      },
+        { "MiMo-VL.jinja",                                   "<|im_start|>user",       "<|im_start|>assistant"      },
+        { "NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja", "<|im_start|>user",   "<|im_start|>assistant"      },
+        { "NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja",   "<|im_start|>user",   "<|im_start|>assistant"      },
+        { "NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja",       "<|im_start|>user",       "<|im_start|>assistant"      },
+        { "Qwen3.5-4B.jinja",                                "<|im_start|>user",       "<|im_start|>assistant"      },
+        { "Qwen3-Coder.jinja",                               "<|im_start|>user",       "<|im_start|>assistant"      },
+        { "Qwen-Qwen2.5-7B-Instruct.jinja",                  "<|im_start|>user",       "<|im_start|>assistant"      },
+        { "Qwen-Qwen3-0.6B.jinja",                           "<|im_start|>user",       "<|im_start|>assistant"      },
+        { "Qwen-QwQ-32B.jinja",                              "<|im_start|>user",       "<|im_start|>assistant"      },
+        { "StepFun3.5-Flash.jinja",                          "<|im_start|>user",       "<|im_start|>assistant"      },
+        { "stepfun-ai-Step-3.5-Flash.jinja",                 "<|im_start|>user",       "<|im_start|>assistant"      },
+
+        // DeepSeek family
+        { "deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja",  "<｜User｜>",                "<｜Assistant｜>"             },
+        { "deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja",  "<｜User｜>",                "<｜Assistant｜>"             },
+        { "deepseek-ai-DeepSeek-V3.1.jinja",                 "<｜User｜>",                "<｜Assistant｜>"             },
+        { "llama-cpp-deepseek-r1.jinja",                     "<｜User｜>",                "<｜Assistant｜>"             },
+
+        // Llama 3 header family
+        { "meetkai-functionary-medium-v3.1.jinja",           "<|start_header_id|>user<|end_header_id|>", "<|start_header_id|>assistant<|end_header_id|>" },
+        { "meta-llama-Llama-3.1-8B-Instruct.jinja",          "<|start_header_id|>user<|end_header_id|>", "<|start_header_id|>assistant<|end_header_id|>" },
+        { "meta-llama-Llama-3.2-3B-Instruct.jinja",          "<|start_header_id|>user<|end_header_id|>", "<|start_header_id|>assistant<|end_header_id|>" },
+        { "meta-llama-Llama-3.3-70B-Instruct.jinja",         "<|start_header_id|>user<|end_header_id|>", "<|start_header_id|>assistant<|end_header_id|>" },
+        // fireworks-ai forces a trailing assistant header even without add_generation_prompt,
+        // so the marker is absorbed into the common suffix and assistant_start is detected as empty.
+        { "fireworks-ai-llama-3-firefunction-v2.jinja",      "<|start_header_id|>user<|end_header_id|>", "<|start_header_id|>assistant<|end_header_id|>" },
+
+        // Phi/GLM/Apriel-style: <|user|> / <|assistant|>
+        { "microsoft-Phi-3.5-mini-instruct.jinja",           "<|user|>",               "<|assistant|>"              },
+        { "GLM-4.6.jinja",                                   "<|user|>",               "<|assistant|>"              },
+        { "unsloth-Apriel-1.5.jinja",                        "<|user|>",               "<|assistant|>"              },
+        { "GLM-4.7-Flash.jinja",                             "<|user|>",                 "<|assistant|>"                },
+
+        // Gemma 2: <start_of_turn>{user|model}
+        { "google-gemma-2-2b-it.jinja",                      "<start_of_turn>user",    "<start_of_turn>model"       },
+
+        // IBM Granite
+        { "ibm-granite-granite-3.3-2B-Instruct.jinja",       "<|start_of_role|>user<|end_of_role|>", "<|start_of_role|>assistant<|end_of_role|>" },
+        { "ibm-granite-granite-4.0.jinja",                   "<|start_of_role|>user<|end_of_role|>", "<|start_of_role|>assistant<|end_of_role|>" },
+
+        // Cohere R-series
+        { "CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja",
+            "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "<|START_RESPONSE|>" },
+        { "CohereForAI-c4ai-command-r-plus-tool_use.jinja",
+            "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" },
+
+        // Mistral: assistant content follows [/INST] immediately, no header
+        { "mistralai-Mistral-Nemo-Instruct-2407.jinja",      "[INST]",                   "" },
+        { "Mistral-Small-3.2-24B-Instruct-2506.jinja",       "[INST]",                   "" },
+
+        // Apertus uses <|user_start|> / <|assistant_start|> but the user diff
+        // carries the preceding <|assistant_end|> from the previous turn.
+        { "Apertus-8B-Instruct.jinja",                       "<|user_start|>", "<|assistant_start|>" },
+
+        // Apriel 1.6 wraps the assistant body with <|begin_assistant|>, but
+        // <|begin_assistant|> is also the detected reasoning start, so the
+        // assistant_start is trimmed back to the preceding newline.
+        { "Apriel-1.6-15b-Thinker-fixed.jinja",              "<|begin_user|>", "<|begin_assistant|>" },
+
+        // ByteDance Seed-OSS: <seed:bos>{role}
+        { "ByteDance-Seed-OSS.jinja",                        "<seed:bos>user",         "<seed:bos>assistant"        },
+
+        // GigaChat 3.1: {role}<|role_sep|>
+        { "GigaChat3.1-10B-A1.8B.jinja",                     "user<|role_sep|>",       "assistant<|role_sep|>"      },
+
+        // MiniMax M2: ]~b]{user|ai}
+        { "MiniMax-M2.jinja",                                "]~b]user",               "]~b]ai"                     },
+
+        // Nemotron Nano v2: <SPECIAL_11>{User|Assistant}; assistant marker
+        // is followed by a prefilled <think> block that gets included.
+        { "NVIDIA-Nemotron-Nano-v2.jinja",                   "<SPECIAL_11>User",       "<SPECIAL_11>Assistant" },
+
+        // Reka Edge: "human: " / "assistant: " — but the rendered preamble
+        // depends on enable_thinking, which currently confuses the user-start
+        // diff and trims the marker down. Lock in the observed value.
+        { "Reka-Edge.jinja",                                 "human:",                     "assistant:"       },
+
+        // RWKV-world chat preset: "User: " / "Assistant: "
+        { "llama-cpp-rwkv-world.jinja",                      "User:",               "Assistant:"              },
+
+        // Upstage Solar 100B: <|begin|>{role}... but reasoning marker absorbs
+        // the "<|begin|>assistant" prefix from assistant_start.
+        { "upstage-Solar-Open-100B.jinja",                   "<|begin|>user<|content|>", "<|begin|>assistant"           },
+    };
+
+    for (const auto & c : cases) {
+        t.test(c.template_file, [&](testing & t) {
+            common_chat_template tmpl = load_template(t, "models/templates/" + c.template_file);
+            struct autoparser ap;
+            ap.analyze_template(tmpl);
+            t.assert_equal("user_start",      c.expected_user_start,      ap.user_start);
+            t.assert_equal("assistant_start", c.expected_assistant_start, ap.assistant_start);
+        });
+    }
+}
+
 // Test that reproduces the Seed-OSS template issue with embedded quotes
 static void test_tagged_args_with_embedded_quotes(testing & t) {
     json tools = build_edit_tool();
diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp
index a428ef35c18..1a5161cc101 100644
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@@ -1548,6 +1548,40 @@ static void test_msgs_oaicompat_json_conversion() {
     }
 }
 
+static void test_split_by_role() {
+    LOG_DBG("%s\n", __func__);
+
+    // Empty inputs
+    assert_equals<size_t>(0, common_chat_split_by_role("", {}).size());
+    assert_equals<size_t>(0, common_chat_split_by_role("hello", {}).size());
+    assert_equals<size_t>(0, common_chat_split_by_role("", { { "user", "<|user|>" } }).size());
+
+    // Multi-role conversation, no leading/trailing content
+    {
+        const std::string prompt = "<|user|>Hi<|assistant|>Hello<|user|>Bye";
+        const auto splits = common_chat_split_by_role(prompt, {
+            { "user",      "<|user|>"      },
+            { "assistant", "<|assistant|>" },
+        });
+        assert_equals<size_t>(3, splits.size());
+
+        assert_equals<std::string>("user", splits[0].role);
+        assert_equals<size_t>(0, splits[0].pos);
+        assert_equals<size_t>(10, splits[0].len);
+        assert_equals<std::string>("<|user|>Hi", prompt.substr(splits[0].pos, splits[0].len));
+
+        assert_equals<std::string>("assistant", splits[1].role);
+        assert_equals<size_t>(10, splits[1].pos);
+        assert_equals<size_t>(18, splits[1].len);
+        assert_equals<std::string>("<|assistant|>Hello", prompt.substr(splits[1].pos, splits[1].len));
+
+        assert_equals<std::string>("user", splits[2].role);
+        assert_equals<size_t>(28, splits[2].pos);
+        assert_equals<size_t>(11, splits[2].len);
+        assert_equals<std::string>("<|user|>Bye", prompt.substr(splits[2].pos, splits[2].len));
+    }
+}
+
 static void test_tools_oaicompat_json_conversion() {
     LOG_DBG("%s\n", __func__);
     std::vector<common_chat_tool> tools{
@@ -4338,16 +4372,19 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
     // Format: <TOOLCALL>[{"name": "func", "arguments": {...}}]</TOOLCALL>
     {
         auto tst = peg_tester("models/templates/NVIDIA-Nemotron-Nano-v2.jinja", detailed_debug);
-        tst.test("<TOOLCALL>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]</TOOLCALL><SPECIAL_12>")
+        tst.test("<TOOLCALL>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]</TOOLCALL>")
             .tools({ special_function_tool })
             .expect(message_assist_call)
             .run();
 
         // Continuation tests
         tst.test("world!\nWhat's up?")
+            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+            .enable_thinking(true)
             .messages({ message_user, message_assist_prefill_content })
             .add_generation_prompt(false)
             .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT)
+            .expect_reasoning("I'm thinking")
             .expect_content("Hello, world!\nWhat's up?")
             .run();
     }
@@ -5593,6 +5630,7 @@ int main(int argc, char ** argv) {
     {
         test_msg_diffs_compute();
         test_msgs_oaicompat_json_conversion();
+        test_split_by_role();
         test_tools_oaicompat_json_conversion();
         test_convert_responses_to_chatcmpl();
         test_developer_role_to_system_workaround();
diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp
index ed3070dc4de..1ae468fbd65 100644
--- a/tests/test-gguf.cpp
+++ b/tests/test-gguf.cpp
@@ -162,6 +162,42 @@ static void helper_write(FILE * file, const void * data, const size_t nbytes) {
     GGML_ASSERT(fwrite(data, 1, nbytes, file) == nbytes);
 }
 
+static std::vector<uint8_t> read_file_to_buffer(FILE * file) {
+    GGML_ASSERT(file != nullptr);
+    GGML_ASSERT(fseek(file, 0, SEEK_END) == 0);
+
+    const long size = ftell(file);
+    GGML_ASSERT(size >= 0);
+
+    rewind(file);
+
+    std::vector<uint8_t> data(static_cast<size_t>(size));
+    GGML_ASSERT(fread(data.data(), 1, data.size(), file) == data.size());
+
+    rewind(file);
+    return data;
+}
+
+struct callback_reader_data {
+    const uint8_t * data;
+    size_t size;
+};
+
+static size_t read_buffer_callback(void * userdata, void * output, uint64_t offset, size_t len) {
+    GGML_ASSERT(len > 0);
+
+    const callback_reader_data & reader = *static_cast<callback_reader_data *>(userdata);
+
+    if (offset > reader.size || len > reader.size - offset) {
+        return 0;
+    }
+
+    const size_t data_offset = static_cast<size_t>(offset);
+    const size_t nread = std::min(len, reader.size - data_offset);
+    memcpy(static_cast<uint8_t *>(output), reader.data + data_offset, nread);
+    return nread;
+}
+
 static FILE * get_handcrafted_file(const unsigned int seed, const enum handcrafted_file_type hft, const int extra_bytes = 0) {
     FILE * file = tmpfile();
 
@@ -1095,10 +1131,29 @@ static bool same_tensor_data(const struct ggml_context * orig, const struct ggml
     return ok;
 }
 
-static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta) {
+enum roundtrip_read_mode {
+    ROUNDTRIP_READ_MODE_FILE,
+    ROUNDTRIP_READ_MODE_BUFFER,
+    ROUNDTRIP_READ_MODE_CALLBACK,
+};
+
+static const char * roundtrip_read_mode_name(const roundtrip_read_mode mode) {
+    switch (mode) {
+        case ROUNDTRIP_READ_MODE_FILE:     return "file";
+        case ROUNDTRIP_READ_MODE_BUFFER:   return "buffer";
+        case ROUNDTRIP_READ_MODE_CALLBACK: return "callback";
+    }
+
+    GGML_ABORT("fatal error");
+}
+
+static std::pair<int, int> test_roundtrip(
+        ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta,
+        const roundtrip_read_mode read_mode) {
     ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
-    printf("%s: device=%s, backend=%s, only_meta=%s\n",
-        __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend), only_meta ? "yes" : "no");
+    printf("%s: device=%s, backend=%s, only_meta=%s, read_mode=%s\n",
+        __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend),
+        only_meta ? "yes" : "no", roundtrip_read_mode_name(read_mode));
 
     int npass = 0;
     int ntest = 0;
@@ -1133,7 +1188,22 @@ static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned
         /*no_alloc =*/ false,
         /*ctx      =*/ only_meta ? nullptr : &ctx_1,
     };
-    struct gguf_context * gguf_ctx_1 = gguf_init_from_file_ptr(file, gguf_params);
+    struct gguf_context * gguf_ctx_1 = nullptr;
+    const std::vector<uint8_t> data = read_mode == ROUNDTRIP_READ_MODE_FILE
+        ? std::vector<uint8_t>()
+        : read_file_to_buffer(file);
+
+    if (read_mode == ROUNDTRIP_READ_MODE_BUFFER) {
+        gguf_ctx_1 = gguf_init_from_buffer(data.data(), data.size(), gguf_params);
+    } else if (read_mode == ROUNDTRIP_READ_MODE_CALLBACK) {
+        callback_reader_data reader = {
+            /*.data = */ data.data(),
+            /*.size = */ data.size(),
+        };
+        gguf_ctx_1 = gguf_init_from_callback(read_buffer_callback, &reader, 4096, 4ull << 30 /* 4GB */, gguf_params);
+    } else {
+        gguf_ctx_1 = gguf_init_from_file_ptr(file, gguf_params);
+    }
 
     printf("%s: same_version: ", __func__);
     if (gguf_get_version(gguf_ctx_0) == gguf_get_version(gguf_ctx_1)) {
@@ -1343,7 +1413,17 @@ int main(int argc, char ** argv) {
         ggml_backend_dev_t dev = ggml_backend_dev_get(i);
 
         for (bool only_meta : {true, false}) {
-            std::pair<int, int> result = test_roundtrip(dev, seed, only_meta);
+            std::pair<int, int> result = test_roundtrip(dev, seed, only_meta, ROUNDTRIP_READ_MODE_FILE);
+            npass += result.first;
+            ntest += result.second;
+        }
+        {
+            std::pair<int, int> result = test_roundtrip(dev, seed, /*only_meta=*/false, ROUNDTRIP_READ_MODE_BUFFER);
+            npass += result.first;
+            ntest += result.second;
+        }
+        {
+            std::pair<int, int> result = test_roundtrip(dev, seed, /*only_meta=*/false, ROUNDTRIP_READ_MODE_CALLBACK);
             npass += result.first;
             ntest += result.second;
         }
diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp
index 16af11a2862..d1138da0f6f 100644
--- a/tests/test-llama-archs.cpp
+++ b/tests/test-llama-archs.cpp
@@ -387,7 +387,7 @@ static bool arch_supported(const llm_arch arch) {
     if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
         return false; // FIXME CUDA backend crashes.
     }
-    if (arch == LLM_ARCH_GEMMA4) {
+    if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) {
         return false; // FIXME @ngxson
     }
     if (arch == LLM_ARCH_LLAMA_EMBED || arch == LLM_ARCH_GEMMA_EMBEDDING || arch == LLM_ARCH_T5ENCODER) {
@@ -442,7 +442,7 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml
         if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
             continue;
         }
-        if (arch == LLM_ARCH_GEMMA4) {
+        if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) {
             continue; // FIXME: ISWA KV cache initialization needs more fixture params
         }
         for (bool moe : {false, true}) {
@@ -526,7 +526,7 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
         if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
             continue;
         }
-        if (arch == LLM_ARCH_GEMMA4) {
+        if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) {
             continue; // FIXME: ISWA KV cache initialization needs more fixture params
         }
 
diff --git a/examples/save-load-state/save-load-state.cpp b/tests/test-save-load-state.cpp
similarity index 100%
rename from examples/save-load-state/save-load-state.cpp
rename to tests/test-save-load-state.cpp
diff --git a/tools/batched-bench/CMakeLists.txt b/tools/batched-bench/CMakeLists.txt
index f9ffd2d4ce7..f6ed257f556 100644
--- a/tools/batched-bench/CMakeLists.txt
+++ b/tools/batched-bench/CMakeLists.txt
@@ -1,6 +1,23 @@
+# llama-batched-bench-impl: batched-bench logic, reusable by app
+
+set(TARGET llama-batched-bench-impl)
+
+add_library(${TARGET} batched-bench.cpp)
+set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
+
+target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} LIBRARY)
+endif()
+
+# llama-batched-bench executable
+
 set(TARGET llama-batched-bench)
-add_executable(${TARGET} batched-bench.cpp)
-target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
+
+add_executable(${TARGET} main.cpp)
+target_link_libraries(${TARGET} PRIVATE llama-batched-bench-impl)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 if(LLAMA_TOOLS_INSTALL)
diff --git a/tools/batched-bench/batched-bench.cpp b/tools/batched-bench/batched-bench.cpp
index 3964ef25955..e2dcd0b2e71 100644
--- a/tools/batched-bench/batched-bench.cpp
+++ b/tools/batched-bench/batched-bench.cpp
@@ -15,7 +15,10 @@ static void print_usage(int, char ** argv) {
     LOG("\n");
 }
 
-int main(int argc, char ** argv) {
+// satisfies -Wmissing-declarations
+int llama_batched_bench(int argc, char ** argv);
+
+int llama_batched_bench(int argc, char ** argv) {
     std::setlocale(LC_NUMERIC, "C");
 
     common_params params;
diff --git a/tools/batched-bench/main.cpp b/tools/batched-bench/main.cpp
new file mode 100644
index 00000000000..958cfc5b31c
--- /dev/null
+++ b/tools/batched-bench/main.cpp
@@ -0,0 +1,5 @@
+int llama_batched_bench(int argc, char ** argv);
+
+int main(int argc, char ** argv) {
+    return llama_batched_bench(argc, argv);
+}
diff --git a/tools/cli/CMakeLists.txt b/tools/cli/CMakeLists.txt
index 7e01abb81b9..a3e635719b6 100644
--- a/tools/cli/CMakeLists.txt
+++ b/tools/cli/CMakeLists.txt
@@ -1,9 +1,24 @@
+# llama-cli-impl: CLI logic, reusable by app
+
+set(TARGET llama-cli-impl)
+
+add_library(${TARGET} cli.cpp)
+set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
+
+target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ../server)
+target_link_libraries(${TARGET} PUBLIC server-context llama-common ${CMAKE_THREAD_LIBS_INIT})
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} LIBRARY)
+endif()
+
+# llama-cli executable
+
 set(TARGET llama-cli)
-add_executable(${TARGET} cli.cpp)
-target_link_libraries(${TARGET} PRIVATE server-context PUBLIC llama-common ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
-include_directories(../server)
+add_executable(${TARGET} main.cpp)
+target_link_libraries(${TARGET} PRIVATE llama-cli-impl)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 if(LLAMA_TOOLS_INSTALL)
     install(TARGETS ${TARGET} RUNTIME)
diff --git a/tools/cli/README.md b/tools/cli/README.md
index 38bc78a3fdf..04aef018870 100644
--- a/tools/cli/README.md
+++ b/tools/cli/README.md
@@ -88,11 +88,11 @@
 | `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_FILE_V) |
 | `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
 | `--log-disable` | Log disable |
-| `--log-file FNAME` | Log to file<br/>(env: LLAMA_LOG_FILE) |
-| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal<br/>(env: LLAMA_LOG_COLORS) |
+| `--log-file FNAME` | Log to file<br/>(env: LLAMA_ARG_LOG_FILE) |
+| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal<br/>(env: LLAMA_ARG_LOG_COLORS) |
 | `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
-| `--offline` | Offline mode: forces use of cache, prevents network access<br/>(env: LLAMA_OFFLINE) |
-| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/> - 0: generic output<br/> - 1: error<br/> - 2: warning<br/> - 3: info<br/> - 4: debug<br/>(default: 3)<br/><br/>(env: LLAMA_LOG_VERBOSITY) |
+| `--offline` | Offline mode: forces use of cache, prevents network access<br/>(env: LLAMA_ARG_OFFLINE) |
+| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/> - 0: generic output<br/> - 1: error<br/> - 2: warning<br/> - 3: info<br/> - 4: trace (more info)<br/> - 5: debug<br/>(default: 3)<br/><br/>(env: LLAMA_ARG_LOG_VERBOSITY) |
 | `--log-prefix, --no-log-prefix` | Enable prefix in log messages<br/>(env: LLAMA_ARG_LOG_PREFIX) |
 | `--log-timestamps, --no-log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_ARG_LOG_TIMESTAMPS) |
 | `--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K) |
@@ -147,7 +147,6 @@
 | `--display-prompt, --no-display-prompt` | whether to print prompt at generation (default: true) |
 | `-co, --color [on\|off\|auto]` | Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal |
 | `-ctxcp, --ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
-| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)<br/>(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) |
 | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
 | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
 | `-sys, --system-prompt PROMPT` | system prompt to use with model (if applicable, depending on chat template) |
@@ -166,14 +165,14 @@
 | `--image, --audio FILE` | path to an image or audio file. use with multimodal models, use comma-separated values for multiple files |
 | `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
 | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
-| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
+| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) |
 | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)<br/>(env: LLAMA_ARG_JINJA) |
 | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
 | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
 | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
 | `--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) |
@@ -195,6 +194,7 @@
 | `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) |
 | `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) |
 | `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.00)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) |
+| `--spec-draft-backend-sampling, --no-spec-draft-backend-sampling` | offload draft sampling to the backend (default: enabled)<br/>(env: LLAMA_ARG_SPEC_DRAFT_BACKEND_SAMPLING) |
 | `--spec-draft-device, -devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
 | `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
 | `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_MODEL) |
diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp
index 369c24216b7..af40adbb4ce 100644
--- a/tools/cli/cli.cpp
+++ b/tools/cli/cli.cpp
@@ -342,7 +342,10 @@ static std::vector<std::pair<std::string, size_t>> auto_completion_callback(std:
 
 static constexpr size_t FILE_GLOB_MAX_RESULTS = 100;
 
-int main(int argc, char ** argv) {
+// satisfies -Wmissing-declarations
+int llama_cli(int argc, char ** argv);
+
+int llama_cli(int argc, char ** argv) {
     common_params params;
 
     params.verbosity = LOG_LEVEL_ERROR; // by default, less verbose logs
diff --git a/tools/cli/main.cpp b/tools/cli/main.cpp
new file mode 100644
index 00000000000..cb7d795b666
--- /dev/null
+++ b/tools/cli/main.cpp
@@ -0,0 +1,5 @@
+int llama_cli(int argc, char ** argv);
+
+int main(int argc, char ** argv) {
+    return llama_cli(argc, argv);
+}
diff --git a/tools/completion/CMakeLists.txt b/tools/completion/CMakeLists.txt
index 2c7df80652c..a310251eff6 100644
--- a/tools/completion/CMakeLists.txt
+++ b/tools/completion/CMakeLists.txt
@@ -1,6 +1,23 @@
+# llama-completion-impl: completion logic, reusable by app
+
+set(TARGET llama-completion-impl)
+
+add_library(${TARGET} completion.cpp)
+set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
+
+target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} LIBRARY)
+endif()
+
+# llama-completion executable
+
 set(TARGET llama-completion)
-add_executable(${TARGET} completion.cpp)
-target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
+
+add_executable(${TARGET} main.cpp)
+target_link_libraries(${TARGET} PRIVATE llama-completion-impl)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 if(LLAMA_TOOLS_INSTALL)
diff --git a/tools/completion/README.md b/tools/completion/README.md
index e5dd7f6f4e7..e8a1287f3a1 100644
--- a/tools/completion/README.md
+++ b/tools/completion/README.md
@@ -171,11 +171,11 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_FILE_V) |
 | `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
 | `--log-disable` | Log disable |
-| `--log-file FNAME` | Log to file<br/>(env: LLAMA_LOG_FILE) |
-| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal<br/>(env: LLAMA_LOG_COLORS) |
+| `--log-file FNAME` | Log to file<br/>(env: LLAMA_ARG_LOG_FILE) |
+| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal<br/>(env: LLAMA_ARG_LOG_COLORS) |
 | `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
-| `--offline` | Offline mode: forces use of cache, prevents network access<br/>(env: LLAMA_OFFLINE) |
-| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/> - 0: generic output<br/> - 1: error<br/> - 2: warning<br/> - 3: info<br/> - 4: debug<br/>(default: 3)<br/><br/>(env: LLAMA_LOG_VERBOSITY) |
+| `--offline` | Offline mode: forces use of cache, prevents network access<br/>(env: LLAMA_ARG_OFFLINE) |
+| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/> - 0: generic output<br/> - 1: error<br/> - 2: warning<br/> - 3: info<br/> - 4: trace (more info)<br/> - 5: debug<br/>(default: 3)<br/><br/>(env: LLAMA_ARG_LOG_VERBOSITY) |
 | `--log-prefix, --no-log-prefix` | Enable prefix in log messages<br/>(env: LLAMA_ARG_LOG_PREFIX) |
 | `--log-timestamps, --no-log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_ARG_LOG_TIMESTAMPS) |
 | `--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K) |
@@ -254,8 +254,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
 | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
 | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
 
diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp
index 1dc5df1afa2..dffcadd4131 100644
--- a/tools/completion/completion.cpp
+++ b/tools/completion/completion.cpp
@@ -84,7 +84,10 @@ static void sigint_handler(int signo) {
 }
 #endif
 
-int main(int argc, char ** argv) {
+// satisfies -Wmissing-declarations
+int llama_completion(int argc, char ** argv);
+
+int llama_completion(int argc, char ** argv) {
     std::setlocale(LC_NUMERIC, "C");
 
     common_params params;
diff --git a/tools/completion/main.cpp b/tools/completion/main.cpp
new file mode 100644
index 00000000000..bea9a0ec9aa
--- /dev/null
+++ b/tools/completion/main.cpp
@@ -0,0 +1,5 @@
+int llama_completion(int argc, char ** argv);
+
+int main(int argc, char ** argv) {
+    return llama_completion(argc, argv);
+}
diff --git a/tools/fit-params/CMakeLists.txt b/tools/fit-params/CMakeLists.txt
index 25c40966333..8acdaef3712 100644
--- a/tools/fit-params/CMakeLists.txt
+++ b/tools/fit-params/CMakeLists.txt
@@ -1,6 +1,23 @@
+# llama-fit-params-impl: fit-params logic, reusable by app
+
+set(TARGET llama-fit-params-impl)
+
+add_library(${TARGET} fit-params.cpp)
+set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
+
+target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} LIBRARY)
+endif()
+
+# llama-fit-params executable
+
 set(TARGET llama-fit-params)
-add_executable(${TARGET} fit-params.cpp)
-target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
+
+add_executable(${TARGET} main.cpp)
+target_link_libraries(${TARGET} PRIVATE llama-fit-params-impl)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 if(LLAMA_TOOLS_INSTALL)
diff --git a/tools/fit-params/fit-params.cpp b/tools/fit-params/fit-params.cpp
index bcdf4404016..5d897bc4669 100644
--- a/tools/fit-params/fit-params.cpp
+++ b/tools/fit-params/fit-params.cpp
@@ -12,7 +12,10 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-int main(int argc, char ** argv) {
+// satisfies -Wmissing-declarations
+int llama_fit_params(int argc, char ** argv);
+
+int llama_fit_params(int argc, char ** argv) {
     common_params params;
 
     common_init();
@@ -30,7 +33,7 @@ int main(int argc, char ** argv) {
     if (!params.fit_params_print) {
         const common_params_fit_status status = common_fit_params(params.model.path.c_str(), &mparams, &cparams,
                 params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
-                params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
+                params.verbosity >= LOG_LEVEL_DEBUG ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
         if (status != COMMON_PARAMS_FIT_STATUS_SUCCESS) {
             LOG_ERR("%s: failed to fit CLI arguments to free memory, exiting...\n", __func__);
             exit(1);
diff --git a/tools/fit-params/main.cpp b/tools/fit-params/main.cpp
new file mode 100644
index 00000000000..b7271d4756a
--- /dev/null
+++ b/tools/fit-params/main.cpp
@@ -0,0 +1,5 @@
+int llama_fit_params(int argc, char ** argv);
+
+int main(int argc, char ** argv) {
+    return llama_fit_params(argc, argv);
+}
diff --git a/tools/llama-bench/CMakeLists.txt b/tools/llama-bench/CMakeLists.txt
index 93d6a3aa2e7..b1c35ee88a5 100644
--- a/tools/llama-bench/CMakeLists.txt
+++ b/tools/llama-bench/CMakeLists.txt
@@ -1,6 +1,23 @@
+# llama-bench-impl: benchmark logic, reusable by app
+
+set(TARGET llama-bench-impl)
+
+add_library(${TARGET} llama-bench.cpp)
+set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
+
+target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} LIBRARY)
+endif()
+
+# llama-bench executable
+
 set(TARGET llama-bench)
-add_executable(${TARGET} llama-bench.cpp)
-target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
+
+add_executable(${TARGET} main.cpp)
+target_link_libraries(${TARGET} PRIVATE llama-bench-impl)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 if(LLAMA_TOOLS_INSTALL)
diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index 07198fb164c..d9732096866 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -2136,7 +2136,10 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
     GGML_ABORT("fatal error");
 }
 
-int main(int argc, char ** argv) {
+// satisfies -Wmissing-declarations
+int llama_bench(int argc, char ** argv);
+
+int llama_bench(int argc, char ** argv) {
     std::setlocale(LC_NUMERIC, "C");
     // try to set locale for unicode characters in markdown
     std::setlocale(LC_CTYPE, ".UTF-8");
diff --git a/tools/llama-bench/main.cpp b/tools/llama-bench/main.cpp
new file mode 100644
index 00000000000..0c18bb0c9d4
--- /dev/null
+++ b/tools/llama-bench/main.cpp
@@ -0,0 +1,5 @@
+int llama_bench(int argc, char ** argv);
+
+int main(int argc, char ** argv) {
+    return llama_bench(argc, argv);
+}
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index a76adc9b80b..ffd30c7e6a1 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -22,7 +22,7 @@ add_library(mtmd
             models/gemma4v.cpp
             models/glm4v.cpp
             models/granite-speech.cpp
-            models/hunyuanocr.cpp
+            models/hunyuanvl.cpp
             models/internvl.cpp
             models/kimivl.cpp
             models/kimik25.cpp
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index c359851999f..ef4c342ba86 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -170,7 +170,7 @@
 #define TN_TOK_BOI         "v.boi"
 #define TN_TOK_EOI         "v.eoi"
 
-// hunyuanocr / hunyuanvl (shared GGUF tensor names)
+// hunyuanvl (shared GGUF tensor names)
 #define TN_MM_PRE_NORM     "mm.pre_norm.%s"
 #define TN_TOK_IMG_BEGIN   "mm.image_begin"
 #define TN_TOK_IMG_END     "mm.image_end"
@@ -343,7 +343,6 @@ enum projector_type {
     PROJECTOR_TYPE_YASA2,
     PROJECTOR_TYPE_KIMIK25,
     PROJECTOR_TYPE_NEMOTRON_V2_VL,
-    PROJECTOR_TYPE_HUNYUANOCR,
     PROJECTOR_TYPE_HUNYUANVL,
     PROJECTOR_TYPE_MINICPMV4_6,
     PROJECTOR_TYPE_GRANITE_SPEECH,
@@ -393,7 +392,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_YASA2,     "yasa2"},
     { PROJECTOR_TYPE_KIMIK25,   "kimik25"},
     { PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
-    { PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"},
     { PROJECTOR_TYPE_HUNYUANVL,  "hunyuanvl"},
     { PROJECTOR_TYPE_MINICPMV4_6, "minicpmv4_6"},
     { PROJECTOR_TYPE_GRANITE_SPEECH, "granite_speech"},
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h
index ce15dbcd11e..e0de41e0b5b 100644
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -35,6 +35,16 @@ enum resize_algo {
     // RESIZE_ALGO_LANCZOS, // TODO
 };
 
+// Padding style for img_tool::resize
+//   PAD_NONE    - no padding; direct resize to target dimensions
+//   PAD_CEIL    - aspect-preserving pad (default)
+//   PAD_NEAREST - aspect-preserving pad with nearest-integer rounding (Pillow byte-parity)
+enum pad_style {
+    PAD_NONE,
+    PAD_CEIL,
+    PAD_NEAREST,
+};
+
 struct clip_hparams {
     int32_t image_size = 0;
     int32_t patch_size = 0;
@@ -52,7 +62,7 @@ struct clip_hparams {
     int32_t image_min_pixels = -1;
     int32_t image_max_pixels = -1;
     resize_algo image_resize_algo = RESIZE_ALGO_BICUBIC;
-    bool image_resize_pad = true; // if false, center-crop will be applied when resizing
+    pad_style image_resize_pad = PAD_CEIL; // padding style when resizing
     std::array<uint8_t, 3> image_pad_color = {0, 0, 0};
 
     // (preprocessor) for llava-uhd style models
@@ -61,8 +71,8 @@ struct clip_hparams {
     int32_t preproc_max_tiles = 0;
     resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
     resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
-    bool image_pad_rf = true;  // if true, refined image will be padded (e.g. llava-1.6)
-    bool image_pad_ov = false; // if true, overview image will be padded (e.g. llava-1.6)
+    pad_style image_pad_rf = PAD_CEIL;  // padding style for the refined image (e.g. llava-1.6)
+    pad_style image_pad_ov = PAD_NONE;  // padding style for the overview image (e.g. llava-1.6)
     std::array<uint8_t, 3> image_pad_color_rf = {0, 0, 0}; // padding color for refined image
     std::array<uint8_t, 3> image_pad_color_ov = {0, 0, 0}; // padding color for overview image
 
@@ -510,7 +520,7 @@ struct clip_model {
     ggml_tensor * mm_boi = nullptr;
     ggml_tensor * mm_eoi = nullptr;
 
-    // hunyuanocr perceiver
+    // hunyuanvl perceiver
     ggml_tensor * mm_pre_norm_w  = nullptr;
     ggml_tensor * mm_img_begin   = nullptr;
     ggml_tensor * mm_img_end     = nullptr;
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 9727a738ed8..5fd583d40bc 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -162,8 +162,14 @@ struct clip_ctx {
 
     bool debug_output_embeddings = false;
 
+    // for measuring memory usage
+    bool no_alloc = false;
+    std::map<ggml_backend_dev_t, size_t> mem_usage;
+    std::map<ggml_backend_dev_t, size_t> mem_compute;
+
     clip_ctx(clip_context_params & ctx_params) {
         flash_attn_type = ctx_params.flash_attn_type;
+        no_alloc = ctx_params.no_alloc;
         backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
         if (!backend_cpu) {
             throw std::runtime_error("failed to initialize CPU backend");
@@ -930,10 +936,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
             } break;
-        case PROJECTOR_TYPE_HUNYUANOCR:
         case PROJECTOR_TYPE_HUNYUANVL:
             {
-                builder = std::make_unique<clip_graph_hunyuanocr>(ctx, img);
+                builder = std::make_unique<clip_graph_hunyuanvl>(ctx, img);
             } break;
         case PROJECTOR_TYPE_MLP:
         case PROJECTOR_TYPE_MLP_NORM:
@@ -1227,12 +1232,12 @@ struct clip_model_loader {
                         hparams.has_llava_projector = model.proj_type != PROJECTOR_TYPE_COGVLM;
                         hparams.image_pad_color     = {122, 116, 104};
                         if (!hparams.image_res_candidates.empty()) {
-                            hparams.image_resize_pad  = true;
+                            hparams.image_resize_pad  = PAD_CEIL;
                             hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
                         } else {
                             // llava-1.6 default params
-                            hparams.image_pad_ov         = false;
-                            hparams.image_pad_rf         = true;
+                            hparams.image_pad_ov         = PAD_NONE;
+                            hparams.image_pad_rf         = PAD_CEIL;
                             hparams.image_pad_color_rf   = {122, 116, 104};
                             hparams.image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
                             hparams.image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
@@ -1240,7 +1245,7 @@ struct clip_model_loader {
                     } break;
                 case PROJECTOR_TYPE_GLM_EDGE:
                     {
-                        hparams.image_resize_pad  = true;
+                        hparams.image_resize_pad  = PAD_CEIL;
                         hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
                     } break;
                 case PROJECTOR_TYPE_MINICPMV:
@@ -1435,7 +1440,7 @@ struct clip_model_loader {
                     {
                         hparams.n_merge = 2;
                         hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
-                        hparams.image_resize_pad  = false;
+                        hparams.image_resize_pad  = PAD_NONE;
                         get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                         get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
                         std::vector<int> wa_layer_indexes_vec;
@@ -1455,7 +1460,7 @@ struct clip_model_loader {
 
                         // reka model performs better when using resize_bicubic, which stretches
                         // the image to fit fixed square size
-                        hparams.image_resize_pad = false;
+                        hparams.image_resize_pad = PAD_NONE;
                     } break;
                 case PROJECTOR_TYPE_GLM4V:
                     {
@@ -1510,31 +1515,23 @@ struct clip_model_loader {
                         hparams.image_size = 1024;
                         hparams.warmup_image_size = 1024;
                         hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
-                        hparams.image_pad_color[0] = hparams.image_mean[0];
-                        hparams.image_pad_color[1] = hparams.image_mean[1];
-                        hparams.image_pad_color[2] = hparams.image_mean[2];
+                        hparams.image_pad_color = {127, 127, 127};
 
                         get_u32(KEY_SAM_N_BLOCK, hparams.sam_n_layer, true);
                         get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true);
                         get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
                         get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
                      } break;
-                case PROJECTOR_TYPE_HUNYUANOCR:
-                    {
-                        hparams.n_merge = 2;
-                        get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
-                        get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
-                        get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
-                        hparams.set_warmup_n_tokens(28*28);
-                    } break;
                 case PROJECTOR_TYPE_HUNYUANVL:
                     {
                         hparams.n_merge = 2;
                         hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
-                        hparams.image_resize_pad = false;
+                        hparams.image_resize_pad = PAD_NONE;
                         hparams.ffn_op = FFN_GELU;
-                        get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                         hparams.set_limit_image_tokens(256, 16384);
+                        get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
+                        get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels, false);
+                        get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels, false);
                         hparams.set_warmup_n_tokens(32*32);
                     } break;
                 case PROJECTOR_TYPE_LFM2A:
@@ -1688,6 +1685,8 @@ struct clip_model_loader {
                 ggml_set_name(data_tensor, cur->name);
                 loaded_tensor_names.insert(name);
                 cur = data_tensor;
+                // add to weight memory counter
+                ctx_clip.mem_usage[ggml_backend_get_device(ctx_clip.backend)] += ggml_nbytes(cur);
             }
             return cur;
         };
@@ -2337,7 +2336,6 @@ struct clip_model_loader {
                     model.mm_boi            = get_tensor(TN_TOK_BOI);
                     model.mm_eoi            = get_tensor(TN_TOK_EOI);
                 } break;
-            case PROJECTOR_TYPE_HUNYUANOCR:
             case PROJECTOR_TYPE_HUNYUANVL:
                 {
                     // proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear)
@@ -2602,7 +2600,7 @@ struct clip_model_loader {
         }
 
         // load data
-        {
+        if (!ctx_clip.no_alloc) {
             std::vector<uint8_t> read_buf;
 
             // alloc memory and offload data
@@ -2676,7 +2674,7 @@ struct clip_model_loader {
         if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) {
             // try to enable flash attention to see if it's supported
             ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED;
-            info = alloc_compute_meta(ctx_clip, batch);
+            info = reserve_compute_meta(ctx_clip, batch);
             if (!info.fattn && info.fattn_op) {
                 auto op = info.fattn_op;
                 LOG_WRN("%s: *****************************************************************\n", __func__);
@@ -2695,10 +2693,10 @@ struct clip_model_loader {
                 LOG_WRN("%s: please report this on github as an issue\n", __func__);
                 LOG_WRN("%s: *****************************************************************\n", __func__);
                 ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED;
-                alloc_compute_meta(ctx_clip, batch);
+                reserve_compute_meta(ctx_clip, batch);
             }
         } else {
-            info = alloc_compute_meta(ctx_clip, batch);
+            info = reserve_compute_meta(ctx_clip, batch);
             if (!info.fattn && ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
                 LOG_WRN("%s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n", __func__);
             }
@@ -2737,12 +2735,14 @@ struct clip_model_loader {
         }
     }
 
-    static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
+    // only initialize backend buffers, but do not allocate them yet
+    static support_info_graph reserve_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
         ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
 
         ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch);
         ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
 
+        ctx_clip.mem_compute.clear();
         for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) {
             ggml_backend_t backend = ctx_clip.backend_ptrs[i];
             ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i];
@@ -2752,6 +2752,7 @@ struct clip_model_loader {
                         ggml_backend_buft_name(buft),
                         size / 1024.0 / 1024.0);
             }
+            ctx_clip.mem_compute[ggml_backend_get_device(backend)] += size;
         }
 
         const int n_splits = ggml_backend_sched_get_n_splits(ctx_clip.sched.get());
@@ -3062,7 +3063,6 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
         case PROJECTOR_TYPE_MIMOVL:
         case PROJECTOR_TYPE_GLM4V:
         case PROJECTOR_TYPE_PADDLEOCR:
-        case PROJECTOR_TYPE_HUNYUANOCR:
         case PROJECTOR_TYPE_HUNYUANVL:
         case PROJECTOR_TYPE_YOUTUVL:
             return (img->nx / params.patch_size) / 2;
@@ -3279,7 +3279,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             int h = static_cast<int>(std::sqrt(static_cast<float>(n_patches)));
             n_patches = h * (h + 1) + 1;
         } break;
-        case PROJECTOR_TYPE_HUNYUANOCR:
         case PROJECTOR_TYPE_HUNYUANVL:
             {
                 int merge = ctx->model.hparams.n_merge;
@@ -3915,7 +3914,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_JANUS_PRO:
         case PROJECTOR_TYPE_PHI4:
         case PROJECTOR_TYPE_COGVLM:
-        case PROJECTOR_TYPE_HUNYUANOCR:
         case PROJECTOR_TYPE_YASA2:
             {
                 // do nothing
@@ -3925,7 +3923,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 // Compute the HunyuanVL 2D position embedding on CPU (with the
                 // custom sf=(target+0.1)/n_grid bilinear sampling that the
                 // reference implementation uses) and upload it to the graph
-                // input declared in clip_graph_hunyuanocr::build().
+                // input declared in clip_graph_hunyuanvl::build().
                 GGML_ASSERT(model.position_embeddings != nullptr);
                 ggml_tensor * src_t   = model.position_embeddings;
                 const int64_t n_embd  = src_t->ne[0];
@@ -4246,7 +4244,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_KIMIK25:
         case PROJECTOR_TYPE_YASA2:
             return ctx->model.mm_2_w->ne[1];
-        case PROJECTOR_TYPE_HUNYUANOCR:
         case PROJECTOR_TYPE_HUNYUANVL:
             return ctx->model.mm_model_proj->ne[1];
         case PROJECTOR_TYPE_COGVLM:
@@ -4266,22 +4263,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     }
 }
 
-int clip_is_minicpmv(const struct clip_ctx * ctx) {
-    // TODO: remove this function
-    if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
-        return ctx->model.hparams.minicpmv_version;
-    }
-    if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV4_6) {
-        return 46;
-    }
-    return 0;
-}
-
-bool clip_is_glm(const struct clip_ctx * ctx) {
-    // TODO: remove this function
-    return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
-}
-
 bool clip_is_llava(const struct clip_ctx * ctx) {
     return ctx->model.hparams.has_llava_projector;
 }
@@ -4330,6 +4311,14 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
     return &ctx->model.hparams;
 }
 
+std::map<ggml_backend_dev_t, size_t> clip_get_mem_usage(const struct clip_ctx * ctx) {
+    std::map<ggml_backend_dev_t, size_t> result = ctx->mem_usage;
+    for (auto & [dev, size] : ctx->mem_compute) {
+        result[dev] += size;
+    }
+    return result;
+}
+
 //
 // API for debugging
 //
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index f643ed6e979..9b807ffa77b 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -6,6 +6,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include <map>
+
 // !!! Internal header, to be used by mtmd only !!!
 
 #define MTMD_INTERNAL_HEADER
@@ -40,6 +42,7 @@ struct clip_context_params {
     bool warmup;
     ggml_backend_sched_eval_callback cb_eval;
     void * cb_eval_user_data;
+    bool no_alloc;
 };
 
 struct clip_init_result {
@@ -102,8 +105,6 @@ struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
 bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
 bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
 
-int clip_is_minicpmv(const struct clip_ctx * ctx);
-bool clip_is_glm(const struct clip_ctx * ctx);
 bool clip_is_llava(const struct clip_ctx * ctx);
 // note for contributor: this clip_is_(model) pattern is deprecated
 //                       do NOT add new functions like this
@@ -116,6 +117,8 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
 bool clip_has_vision_encoder(const struct clip_ctx * ctx);
 bool clip_has_audio_encoder(const struct clip_ctx * ctx);
 
+std::map<ggml_backend_dev_t, size_t> clip_get_mem_usage(const struct clip_ctx * ctx);
+
 struct clip_cap {
     bool has_vision;
     bool has_audio;
diff --git a/tools/mtmd/models/deepseekocr.cpp b/tools/mtmd/models/deepseekocr.cpp
index b1f6ead5b5e..8419d496a5b 100644
--- a/tools/mtmd/models/deepseekocr.cpp
+++ b/tools/mtmd/models/deepseekocr.cpp
@@ -88,165 +88,169 @@ static ggml_tensor * get_rel_pos(ggml_context * ctx0,
     return cur;  // [C, k_size, q_size]
 }
 
-ggml_cgraph * clip_graph_deepseekocr::build() {
-    // patch embedding
-    ggml_tensor * inp_raw = build_inp_raw();
 
-    ggml_tensor * sam_out;
+ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
     // Building SAM
-    {
-        const int n_embd  = hparams.sam_n_embd;
-        const int n_layer = hparams.sam_n_layer;
-        const int n_heads = hparams.sam_n_head;
-        const int d_heads = n_embd / n_heads;
-        const int window  = hparams.attn_window_size;
-
-        ggml_tensor * inpL;
-
-        inpL = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw);
-        inpL = ggml_add(ctx0, inpL, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd));
-        inpL = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 1, 2, 0, 3));
-
-        ggml_tensor * rel_pos_indices_local;
-        ggml_tensor * rel_pos_indices_global;
-
-        rel_pos_indices_local  = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, window, window);
-        rel_pos_indices_global = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]);
-        ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local");
-        ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global");
-        ggml_set_input(rel_pos_indices_local);
-        ggml_set_input(rel_pos_indices_global);
-
-        ggml_tensor * cur;
-        const auto    tgt_size = inpL->ne[1];
-        const auto    str_size = model.pos_embed->ne[1];
-
-        if (str_size != tgt_size) {
-            ggml_tensor * old_pos_embed = nullptr;
-            old_pos_embed               = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3));
-            ggml_tensor * new_pos_embed =
-                ggml_interpolate(ctx0, old_pos_embed, tgt_size, tgt_size, n_embd, 1, GGML_SCALE_MODE_BICUBIC);
-            new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3));
-            cur           = ggml_add(ctx0, inpL, new_pos_embed);
-        } else {
-            cur = ggml_add(ctx0, inpL, model.pos_embed);
-        }
+    const int n_embd  = hparams.sam_n_embd;
+    const int n_layer = hparams.sam_n_layer;
+    const int n_heads = hparams.sam_n_head;
+    const int d_heads = n_embd / n_heads;
+    const int window  = hparams.attn_window_size;
 
-        // loop over layers
-        for (int il = 0; il < n_layer; il++) {
-            auto &        layer    = model.sam_layers[il];
-            ggml_tensor * shortcut = cur;
+    ggml_tensor * inpL;
 
-            // layernorm1
-            cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
+    inpL = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw);
+    inpL = ggml_add(ctx0, inpL, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd));
+    inpL = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 1, 2, 0, 3));
 
-            const int64_t w0 = cur->ne[1];
-            const int64_t h0 = cur->ne[2];
+    ggml_tensor * rel_pos_indices_local;
+    ggml_tensor * rel_pos_indices_global;
 
-            ggml_tensor * indices;
+    rel_pos_indices_local  = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, window, window);
+    rel_pos_indices_global = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]);
+    ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local");
+    ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global");
+    ggml_set_input(rel_pos_indices_local);
+    ggml_set_input(rel_pos_indices_global);
 
-            if (hparams.is_global_attn(il)) {
-                indices = rel_pos_indices_global;
-            } else {
-                // local attention layer - apply window partition
-                cur     = window_partition(ctx0, cur, window);
-                indices = rel_pos_indices_local;
-            }
+    ggml_tensor * cur;
+    const auto    tgt_size = inpL->ne[1];
+    const auto    str_size = model.pos_embed->ne[1];
+
+    if (str_size != tgt_size) {
+        ggml_tensor * old_pos_embed = nullptr;
+        old_pos_embed               = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3));
+        ggml_tensor * new_pos_embed =
+            ggml_interpolate(ctx0, old_pos_embed, tgt_size, tgt_size, n_embd, 1, GGML_SCALE_MODE_BICUBIC);
+        new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3));
+        cur           = ggml_add(ctx0, inpL, new_pos_embed);
+    } else {
+        cur = ggml_add(ctx0, inpL, model.pos_embed);
+    }
 
-            const int64_t W = cur->ne[1];
-            const int64_t H = cur->ne[2];
-            // self-attention
-            {
-                const int B = cur->ne[3];
-
-                cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
-                cur = ggml_add(ctx0, cur, layer.qkv_b);
-                cur = ggml_cont(ctx0, cur);  // Ensure tensor is contiguous before reshape
-                cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B);
-
-                ggml_tensor * Q;
-                ggml_tensor * K;
-                ggml_tensor * V;
-
-                Q = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 0 * cur->nb[1]);
-                Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), d_heads, n_heads, W * H, B);
-
-                K = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 1 * cur->nb[1]);
-                K = ggml_reshape_4d(ctx0, ggml_cont(ctx0, K), d_heads, n_heads, W * H, B);
-
-                V = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 2 * cur->nb[1]);
-                V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), d_heads, n_heads, W * H, B);
-
-                ggml_tensor * mask;
-                ggml_tensor * rw;
-                ggml_tensor * rh;
-                ggml_tensor * qr;
-
-                rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W);  // [W, W, C]
-                rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H);  // [H, H, C]
-                qr = ggml_permute(ctx0, Q, 0, 2, 1, 3);
-                qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads);
-
-                rw   = ggml_mul_mat(ctx0, rw,
-                                    ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3)));  // [B*n_heads, W, H, W]
-                rw   = ggml_cont(ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3));                // [B*n_heads, H, W, W]
-                rw   = ggml_reshape_4d(ctx0, rw, W, 1, W * H, n_heads * B);
-                rw   = ggml_repeat_4d(ctx0, rw, W, H, W * H, n_heads * B);
-                rh   = ggml_mul_mat(ctx0, rh, qr);  // [B*n_heads, H, W, H]
-                rh   = ggml_reshape_4d(ctx0, rh, 1, H, W * H, n_heads * B);
-                mask = ggml_add(ctx0, rw, rh);      // [B*n_heads, H*W, H, W]
-                mask = ggml_reshape_4d(ctx0, mask, W * H, W * H, n_heads, B);
-                mask = ggml_cast(ctx0, mask, GGML_TYPE_F16);
+    // loop over layers
+    for (int il = 0; il < n_layer; il++) {
+        auto &        layer    = model.sam_layers[il];
+        ggml_tensor * shortcut = cur;
 
-                const float scale = 1.0f / sqrtf(static_cast<float>(d_heads));
+        // layernorm1
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
 
-                cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale,
-                                 il);  // [B, H*W, n_embd]
-                cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), n_embd, W, H, B);
-            }
+        const int64_t w0 = cur->ne[1];
+        const int64_t h0 = cur->ne[2];
 
-            if (hparams.is_global_attn(il) == false) {
-                // local attention layer - reverse window partition
-                cur = window_unpartition(ctx0, cur, w0, h0, window);
-            }
+        ggml_tensor * indices;
 
-            // re-add the layer input, e.g., residual
-            cur = ggml_add(ctx0, cur, shortcut);
+        if (hparams.is_global_attn(il)) {
+            indices = rel_pos_indices_global;
+        } else {
+            // local attention layer - apply window partition
+            cur     = window_partition(ctx0, cur, window);
+            indices = rel_pos_indices_local;
+        }
 
-            ggml_tensor * inpFF = cur;
+        const int64_t W = cur->ne[1];
+        const int64_t H = cur->ne[2];
+        // self-attention
+        {
+            const int B = cur->ne[3];
+
+            cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+            cur = ggml_add(ctx0, cur, layer.qkv_b);
+            cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape
+            cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B);
+
+            ggml_tensor * Q;
+            ggml_tensor * K;
+            ggml_tensor * V;
+
+            Q = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 0 * cur->nb[1]);
+            Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), d_heads, n_heads, W * H, B);
+
+            K = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 1 * cur->nb[1]);
+            K = ggml_reshape_4d(ctx0, ggml_cont(ctx0, K), d_heads, n_heads, W * H, B);
+
+            V = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 2 * cur->nb[1]);
+            V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), d_heads, n_heads, W * H, B);
+
+            ggml_tensor * mask;
+            ggml_tensor * rw;
+            ggml_tensor * rh;
+            ggml_tensor * qr;
+
+            rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W); // [W, W, C]
+            rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H); // [H, H, C]
+            qr = ggml_permute(ctx0, Q, 0, 2, 1, 3);
+            qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads);
+
+            rw = ggml_mul_mat(ctx0, rw,
+                              ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W]
+            rw   = ggml_cont(ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W]
+            rw   = ggml_reshape_4d(ctx0, rw, W, 1, W * H, n_heads * B);
+            rw   = ggml_repeat_4d(ctx0, rw, W, H, W * H, n_heads * B);
+            rh   = ggml_mul_mat(ctx0, rh, qr); // [B*n_heads, H, W, H]
+            rh   = ggml_reshape_4d(ctx0, rh, 1, H, W * H, n_heads * B);
+            mask = ggml_add(ctx0, rw, rh); // [B*n_heads, H*W, H, W]
+            mask = ggml_reshape_4d(ctx0, mask, W * H, W * H, n_heads, B);
+            // casting mask to F16 only required when flash-attn is enabled
+            if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
+                mask = ggml_cast(ctx0, mask, GGML_TYPE_F16);
+            }
 
-            // layernorm2
-            cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
+            const float scale = 1.0f / sqrtf(static_cast<float>(d_heads));
 
-            // ffn
-            cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b,
-                            hparams.ffn_op, il);
+            cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale,
+                             il); // [B, H*W, n_embd]
+            cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), n_embd, W, H, B);
+        }
 
-            // residual 2
-            cur = ggml_add(ctx0, cur, inpFF);
-            cb(cur, "sam_layer_out", il);
+        if (hparams.is_global_attn(il) == false) {
+            // local attention layer - reverse window partition
+            cur = window_unpartition(ctx0, cur, w0, h0, window);
         }
 
-        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
+        // re-add the layer input, e.g., residual
+        cur = ggml_add(ctx0, cur, shortcut);
 
-        cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1);
-        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
-        cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1);
-        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
+        ggml_tensor * inpFF = cur;
 
-        cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1);
-        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
-        cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1);
-        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
+        // layernorm2
+        cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
 
-        cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1);
-        cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1);
-        cb(cur, "sam_output", -1);
+        // ffn
+        cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b,
+                        hparams.ffn_op, il);
 
-        ggml_build_forward_expand(gf, cur);
-        sam_out = cur;
+        // residual 2
+        cur = ggml_add(ctx0, cur, inpFF);
+        cb(cur, "sam_layer_out", il);
     }
 
+    cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
+
+    cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1);
+    cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
+    cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1);
+    cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
+
+    cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1);
+    cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
+    cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1);
+    cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
+
+    cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1);
+    cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1);
+    cb(cur, "sam_output", -1);
+
+    ggml_build_forward_expand(gf, cur);
+    return cur;
+}
+
+ggml_cgraph * clip_graph_deepseekocr::build() {
+    // patch embedding
+    ggml_tensor * inp_raw = build_inp_raw();
+    ggml_tensor * sam_out = build_sam(inp_raw);
+
     ggml_tensor * clip_out;
     // Building DS-OCR CLIP
     {
diff --git a/tools/mtmd/models/hunyuanocr.cpp b/tools/mtmd/models/hunyuanvl.cpp
similarity index 70%
rename from tools/mtmd/models/hunyuanocr.cpp
rename to tools/mtmd/models/hunyuanvl.cpp
index 45ed684f70d..2c670979d76 100644
--- a/tools/mtmd/models/hunyuanocr.cpp
+++ b/tools/mtmd/models/hunyuanvl.cpp
@@ -1,25 +1,15 @@
 #include "models.h"
 
-ggml_cgraph * clip_graph_hunyuanocr::build() {
+ggml_cgraph * clip_graph_hunyuanvl::build() {
     const int merge = hparams.n_merge;
     const int pw    = n_patches_x;
     const int ph    = n_patches_y;
 
-    // Position embedding interpolation.
-    // HunyuanVL needs scale factors sf=(target+0.1)/n_grid, which the standard
-    // ggml_interpolate cannot express. To avoid adding a new ggml op, the
-    // resize is computed on CPU in clip_image_batch_encode and uploaded here
-    // as a graph input (named "hunyuanvl_pos_embd").
-    // HunyuanOCR uses the same square layout and the standard ratio-based
-    // interpolation provided by resize_position_embeddings().
-    ggml_tensor * pos_embd = nullptr;
-    if (proj_type == PROJECTOR_TYPE_HUNYUANVL && model.position_embeddings) {
-        pos_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ph * pw);
-        ggml_set_name(pos_embd, "hunyuanvl_pos_embd");
-        ggml_set_input(pos_embd);
-    } else {
-        pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR);
-    }
+    // position embedding: declared as a graph input, filled on CPU
+    // by clip_image_batch_encode (see PROJECTOR_TYPE_HUNYUANVL branch there).
+    ggml_tensor * pos_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ph * pw);
+    ggml_set_name(pos_embd, "hunyuanvl_pos_embd");
+    ggml_set_input(pos_embd);
 
     ggml_tensor * inp = build_inp();
     ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr);
diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h
index 955daa6d6d3..119c2d541b5 100644
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@@ -118,6 +118,7 @@ struct clip_graph_whisper_enc : clip_graph {
 struct clip_graph_deepseekocr : clip_graph {
     clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
+    ggml_tensor * build_sam(ggml_tensor * inp); // build the SAM model
 };
 
 struct clip_graph_conformer : clip_graph {
@@ -141,8 +142,8 @@ struct clip_graph_glm4v : clip_graph {
     ggml_cgraph * build() override;
 };
 
-struct clip_graph_hunyuanocr : clip_graph {
-    clip_graph_hunyuanocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+struct clip_graph_hunyuanvl : clip_graph {
+    clip_graph_hunyuanvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
 };
 
diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp
index 1b058e02601..37c271d18a8 100644
--- a/tools/mtmd/mtmd-image.cpp
+++ b/tools/mtmd/mtmd-image.cpp
@@ -38,7 +38,7 @@ struct img_tool {
             clip_image_u8 & dst,
             const clip_image_size & target_resolution,
             resize_algo algo,
-            bool add_padding = true, // TODO: define the behavior for add_padding = false
+            pad_style padding = PAD_CEIL,
             std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
         dst.nx = target_resolution.width;
         dst.ny = target_resolution.height;
@@ -50,7 +50,7 @@ struct img_tool {
             return;
         }
 
-        if (!add_padding) {
+        if (padding == PAD_NONE) {
             // direct resize
             switch (algo) {
                 case RESIZE_ALGO_BILINEAR:
@@ -71,8 +71,15 @@ struct img_tool {
             float scale_w = static_cast<float>(target_resolution.width) / src.nx;
             float scale_h = static_cast<float>(target_resolution.height) / src.ny;
             float scale = std::min(scale_w, scale_h);
-            int new_width  = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width);
-            int new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height);
+
+            int new_width, new_height;
+            if (padding == PAD_NEAREST) {
+                new_width  = std::min(static_cast<int>(std::round(src.nx * scale)), target_resolution.width);
+                new_height = std::min(static_cast<int>(std::round(src.ny * scale)), target_resolution.height);
+            } else {
+                new_width  = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width);
+                new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height);
+            }
 
             switch (algo) {
                 case RESIZE_ALGO_BILINEAR:
@@ -91,9 +98,14 @@ struct img_tool {
             // fill dst with pad_color
             fill(dst, pad_color);
 
-            int offset_x = (target_resolution.width  - new_width)  / 2;
-            int offset_y = (target_resolution.height - new_height) / 2;
-
+            int offset_x, offset_y;
+            if (padding == PAD_NEAREST) {
+                offset_x = static_cast<int>(std::round((target_resolution.width  - new_width)  / 2.0f));
+                offset_y = static_cast<int>(std::round((target_resolution.height - new_height) / 2.0f));
+            } else {
+                offset_x = (target_resolution.width  - new_width)  / 2;
+                offset_y = (target_resolution.height - new_height) / 2;
+            }
             composite(dst, resized_image, offset_x, offset_y);
         }
     }
@@ -356,10 +368,10 @@ struct img_tool {
             GGML_ASSERT(inSize > 0 && outSize > 0);
             double support, scale, filterscale;
             double center, ww, ss;
-            int xx, x, ksize, xmin, xmax, xcnt;
+            int xx, x, ksize, xmin, xmax;
 
             // Calculate scaling factor: ratio of input range to output size
-            filterscale = scale = (double)inSize / outSize;
+            filterscale = scale = static_cast<double>(inSize) / outSize;
             // For upsampling (scale < 1), keep filterscale = 1 to maintain filter sharpness
             // For downsampling (scale > 1), widen filter to prevent aliasing
             if (filterscale < 1.0) {
@@ -373,6 +385,7 @@ struct img_tool {
             std::vector<double> pre_weights(outSize * ksize);  // Temporary weights
             bounds.resize(outSize * 2);
 
+
             // For each output pixel, compute its filter coefficients
             for (xx = 0; xx < outSize; xx++) {
                 // Calculate the center position in input space (pixel-center convention: +0.5)
@@ -391,10 +404,10 @@ struct img_tool {
                     xmax = inSize;
                 }
 
-                xcnt = xmax - xmin;
+                xmax -= xmin;
 
                 // Compute filter weights for each contributing input pixel
-                for (x = 0; x < xcnt; x++) {
+                for (x = 0; x < xmax; x++) {
                     // Distance from input pixel center to output pixel center in input space
                     double w = bicubic_filter((x + xmin - center + 0.5) * ss);
                     pre_weights[xx * ksize + x] = w;
@@ -402,7 +415,7 @@ struct img_tool {
                 }
 
                 // Normalize weights to sum to 1.0 (preserves brightness)
-                for (x = 0; x < xcnt; x++) {
+                for (x = 0; x < xmax; x++) {
                     if (ww != 0.0) {
                         pre_weights[xx * ksize + x] /= ww;
                     }
@@ -415,18 +428,27 @@ struct img_tool {
 
                 // Store input pixel range for this output pixel
                 bounds[xx * 2 + 0] = xmin;
-                bounds[xx * 2 + 1] = xcnt;
+                bounds[xx * 2 + 1] = xmax;
             }
 
             // Convert floating-point coefficients to fixed-point integers
             // Formula: int32 = round(float * 2^PRECISION_BITS)
             weights.resize(outSize * ksize);
+
+            const double fxp_scale = std::ldexp(1.0, PRECISION_BITS); // 1.0 * 2^PRECISION_BITS
+
             for (int i = 0; i < outSize * ksize; i++) {
+                double tmp_val = pre_weights[i] * fxp_scale;
                 if (pre_weights[i] < 0) {
-                    weights[i] = static_cast<int32_t>(-0.5 + pre_weights[i] * (1 << PRECISION_BITS));
+                    tmp_val -= 0.5;
                 } else {
-                    weights[i] = static_cast<int32_t>(0.5 + pre_weights[i] * (1 << PRECISION_BITS));
+                    tmp_val += 0.5;
                 }
+                tmp_val = std::round(tmp_val);
+                tmp_val = std::clamp(tmp_val,
+                                     static_cast<double>(std::numeric_limits<int32_t>::min()),
+                                     static_cast<double>(std::numeric_limits<int32_t>::max()));
+                weights[i] = static_cast<int32_t>(tmp_val);
             }
 
             return ksize;
@@ -1083,35 +1105,31 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli
 //
 
 bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
-    const std::vector native_resolutions = {
-        /*512 tiny , 640 small, */ 1024 /* base */, 1280 /* large */
-    };
-    // original image size
-    const clip_image_size original_size{img.nx, img.ny};
-    const int orig_w = original_size.width;
-    const int orig_h = original_size.height;
-    const int orig_area = orig_h * orig_w;
-
-    size_t mode_i = 0;
-    int min_diff = orig_area;
-
-    for (size_t i = 0; i < native_resolutions.size(); i++) {
-        int r = native_resolutions[i];
-        if (std::abs(orig_area - r * r) < min_diff) {
-            mode_i = i;
-            min_diff = std::abs(orig_area - r * r);
+    static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ };
+    // TODO: support 512 (tiny) and 640 (small) once we have eval data for them
+
+    const int64_t orig_area = static_cast<int64_t>(img.nx) * img.ny;
+
+    size_t  mode_i   = 0;
+    int64_t min_diff = std::numeric_limits<int64_t>::max();
+    for (size_t i = 0; i < std::size(native_resolutions); i++) {
+        const int64_t r    = native_resolutions[i];
+        const int64_t diff = std::abs(orig_area - r * r);
+        if (diff < min_diff) {
+            mode_i   = i;
+            min_diff = diff;
         }
     }
-
-    /* Native Resolution (Base/Large) */
     const int image_size = native_resolutions[mode_i];
 
-    // scaled and padded image
-    clip_image_u8_ptr scaled_img(clip_image_u8_init());
-    img_tool::resize(img, *scaled_img, clip_image_size{image_size, image_size}, hparams.image_resize_algo);
+    // Aspect-preserving fit-and-pad. Pillow bicubic + PAD_NEAREST for
+    // byte-parity with the upstream deepseek-ai/DeepSeek-OCR HF preprocessor.
+    clip_image_u8 padded;
+    img_tool::resize(img, padded, {image_size, image_size}, RESIZE_ALGO_BICUBIC_PILLOW,
+                     PAD_NEAREST, hparams.image_pad_color);
 
     clip_image_f32_ptr res(clip_image_f32_init());
-    img_u8_to_f32(*scaled_img, *res, hparams.image_mean, hparams.image_std);
+    img_u8_to_f32(padded, *res, hparams.image_mean, hparams.image_std);
     output.entries.push_back(std::move(res));
 
     output.grid_x = 1;
@@ -1246,7 +1264,7 @@ clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8
             std::max(1, static_cast<int>(std::floor(resized.ny * scale))),
         };
         clip_image_u8 scaled;
-        img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, false);
+        img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, PAD_NONE);
         resized = std::move(scaled);
     }
 
@@ -1347,7 +1365,7 @@ bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip
     clip_image_u8 img_for_crop = prepared;
     if (instructions.refined_size.width != prepared.nx || instructions.refined_size.height != prepared.ny) {
         clip_image_u8 refined;
-        img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, false);
+        img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, PAD_NONE);
         img_for_crop = std::move(refined);
     }
 
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 8f12d0b43ea..63b7e4d052a 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -21,6 +21,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <climits>
 #include <vector>
 
 // represents raw image data, layout is RGBRGBRGB...
@@ -139,13 +140,13 @@ mtmd_context_params mtmd_context_params_default() {
 struct mtmd_context {
     struct clip_ctx * ctx_v; // vision
     struct clip_ctx * ctx_a; // audio
-    const struct llama_model * text_model;
     std::vector<float> image_embd_v; // image embedding vector
 
     bool print_timings;
     int n_threads;
     std::string media_marker;
-    const int n_embd_text;
+    const int n_embd_text = -1; // -1 means llm context not provided, skip checking this
+    const llama_vocab * vocab = nullptr; // can be nullptr if text_model is not provided
     mtmd_pos_type pos_type;
 
     // these are not token, but strings used to mark the beginning and end of image/audio embeddings
@@ -178,12 +179,13 @@ struct mtmd_context {
 
     mtmd_context(const char * mmproj_fname,
                    const llama_model * text_model,
-                   const mtmd_context_params & ctx_params) :
-        text_model   (text_model),
+                   const mtmd_context_params & ctx_params,
+                   bool no_alloc = false) :
         print_timings(ctx_params.print_timings),
         n_threads    (ctx_params.n_threads),
         media_marker (ctx_params.media_marker),
-        n_embd_text  (llama_model_n_embd_inp(text_model))
+        n_embd_text  (text_model ? llama_model_n_embd_inp(text_model) : -1),
+        vocab        (text_model ? llama_model_get_vocab(text_model) : nullptr)
     {
         if (ctx_params.image_marker != nullptr) {
             throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
@@ -193,21 +195,23 @@ struct mtmd_context {
             throw std::runtime_error("media_marker must not be empty");
         }
 
-        auto decoder_rope_type = llama_model_rope_type(text_model);
-        switch (decoder_rope_type) {
-            case LLAMA_ROPE_TYPE_NONE:
-            case LLAMA_ROPE_TYPE_NORM:
-            case LLAMA_ROPE_TYPE_NEOX:
-                {
-                    pos_type = MTMD_POS_TYPE_NORMAL;
-                } break;
-            case LLAMA_ROPE_TYPE_MROPE:
-            case LLAMA_ROPE_TYPE_IMROPE:
-                {
-                    pos_type = MTMD_POS_TYPE_MROPE;
-                } break;
-            default:
-                throw std::runtime_error(string_format("unsupported decoder rope type: %d\n", decoder_rope_type));
+        if (text_model) {
+            auto decoder_rope_type = llama_model_rope_type(text_model);
+            switch (decoder_rope_type) {
+                case LLAMA_ROPE_TYPE_NONE:
+                case LLAMA_ROPE_TYPE_NORM:
+                case LLAMA_ROPE_TYPE_NEOX:
+                    {
+                        pos_type = MTMD_POS_TYPE_NORMAL;
+                    } break;
+                case LLAMA_ROPE_TYPE_MROPE:
+                case LLAMA_ROPE_TYPE_IMROPE:
+                    {
+                        pos_type = MTMD_POS_TYPE_MROPE;
+                    } break;
+                default:
+                    throw std::runtime_error(string_format("unsupported decoder rope type: %d\n", decoder_rope_type));
+            }
         }
 
         clip_context_params ctx_clip_params {
@@ -218,6 +222,7 @@ struct mtmd_context {
             /* warmup            */ ctx_params.warmup,
             /* cb_eval           */ ctx_params.cb_eval,
             /* cb_eval_user_data */ ctx_params.cb_eval_user_data,
+            /* no_alloc          */ no_alloc,
         };
 
         auto res = clip_init(mmproj_fname, ctx_clip_params);
@@ -241,7 +246,7 @@ struct mtmd_context {
         // since we already validate n_embd of vision and audio mmproj,
         // we can safely assume that they are the same
         int n_embd_clip = clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a);
-        if (n_embd_text != n_embd_clip) {
+        if (n_embd_text > 0 && n_embd_text != n_embd_clip) {
             throw std::runtime_error(string_format(
                 "mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
                 "hint: you may be using wrong mmproj\n",
@@ -279,7 +284,7 @@ struct mtmd_context {
                 } break;
             case PROJECTOR_TYPE_MINICPMV:
                 {
-                    int minicpmv_version = clip_is_minicpmv(ctx_v);
+                    int minicpmv_version = clip_get_hparams(ctx_v)->minicpmv_version;
                     if (minicpmv_version == 2) {
                         // minicpmv 2.5 format:
                         // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
@@ -488,7 +493,6 @@ struct mtmd_context {
                     img_end = "\n"; // prevent empty batch on llama-server
                     image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
                 } break;
-            case PROJECTOR_TYPE_HUNYUANOCR:
             case PROJECTOR_TYPE_HUNYUANVL:
                 {
                     // note: these use fullwidth ｜ (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
@@ -594,7 +598,11 @@ struct mtmd_context {
 
 private:
     llama_token lookup_token(const std::string & token_text) {
-        const llama_vocab * vocab = llama_model_get_vocab(text_model);
+        if (vocab == nullptr) {
+            // TODO @ngxson : this case is currently hit by mtmd_get_memory_usage
+            // but we should reconsider this if this case is needed in other places in the future
+            return LLAMA_TOKEN_NULL;
+        }
         const int n_vocab = llama_vocab_n_tokens(vocab);
         for (int i = 0; i < n_vocab; i++) {
             if (token_to_piece(vocab, i, true) == token_text) {
@@ -605,6 +613,9 @@ struct mtmd_context {
     }
 
     std::string token_to_piece(const llama_vocab * vocab, llama_token token, bool special) {
+        if (vocab == nullptr) {
+            throw std::runtime_error("llama_vocab is not provided");
+        }
         std::string piece;
         piece.resize(piece.capacity());  // using string internal cache, 15 bytes + '\n'
         const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
@@ -653,7 +664,7 @@ struct mtmd_tokenizer {
         add_special   = text->add_special;
         parse_special = text->parse_special;
         input_text    = text->text;
-        vocab         = llama_model_get_vocab(ctx->text_model);
+        vocab         = ctx->vocab;
     }
 
     int32_t tokenize(mtmd_input_chunks * output) {
@@ -679,27 +690,29 @@ struct mtmd_tokenizer {
             }
         }
 
-        if (add_special && llama_vocab_get_add_bos(vocab)) {
-            // if first chunk is text, we add BOS token to first text chunk
-            // otherwise, create a new text chunk with BOS token
-            if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-                // add BOS token to the beginning of first text chunk
-                cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab));
-            } else {
-                // create a new text chunk with BOS token at the beginning
-                mtmd_input_chunk bos_chunk{
-                    MTMD_INPUT_CHUNK_TYPE_TEXT,
-                    {llama_vocab_bos(vocab)},
-                    nullptr, // image tokens
-                    nullptr, // audio tokens
-                };
-                cur.entries.insert(cur.entries.begin(), std::move(bos_chunk));
+        if (vocab != nullptr) {
+            if (add_special && llama_vocab_get_add_bos(vocab)) {
+                // if first chunk is text, we add BOS token to first text chunk
+                // otherwise, create a new text chunk with BOS token
+                if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+                    // add BOS token to the beginning of first text chunk
+                    cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab));
+                } else {
+                    // create a new text chunk with BOS token at the beginning
+                    mtmd_input_chunk bos_chunk{
+                        MTMD_INPUT_CHUNK_TYPE_TEXT,
+                        {llama_vocab_bos(vocab)},
+                        nullptr, // image tokens
+                        nullptr, // audio tokens
+                    };
+                    cur.entries.insert(cur.entries.begin(), std::move(bos_chunk));
+                }
             }
-        }
 
-        if (add_special && llama_vocab_get_add_eos(vocab)) {
-            // if last chunk is text, we add EOS token to it
-            add_text({llama_vocab_eos(vocab)});
+            if (add_special && llama_vocab_get_add_eos(vocab)) {
+                // if last chunk is text, we add EOS token to it
+                add_text({llama_vocab_eos(vocab)});
+            }
         }
 
         if (i_bm != bitmaps.size()) {
@@ -714,6 +727,9 @@ struct mtmd_tokenizer {
     }
 
     void add_text(const std::string & txt, bool parse_special) {
+        if (vocab == nullptr) {
+            throw std::runtime_error("llama_vocab is not provided");
+        }
         LOG_DBG("%s: %s\n", __func__, txt.c_str());
         auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special);
         add_text(tokens);
@@ -1002,10 +1018,16 @@ struct mtmd_tokenizer {
                const std::string & text,
                             bool   add_special,
                             bool   parse_special) {
+        if (vocab == nullptr) {
+            throw std::runtime_error("llama_vocab is not provided");
+        }
         // upper limit for the number of tokens
         int n_tokens = text.length() + 2 * add_special;
         std::vector<llama_token> result(n_tokens);
         n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        if (n_tokens == std::numeric_limits<int32_t>::min()) {
+            throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+        }
         if (n_tokens < 0) {
             result.resize(-n_tokens);
             int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
@@ -1067,8 +1089,8 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
     bool ok = false;
 
     if (clip_is_llava(ctx_clip)
-        || clip_is_minicpmv(ctx_clip)
-        || clip_is_glm(ctx_clip)
+        || proj_type == PROJECTOR_TYPE_MINICPMV
+        || proj_type == PROJECTOR_TYPE_GLM_EDGE
         || proj_type == PROJECTOR_TYPE_INTERNVL) {
         // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
         const auto & entries = image_tokens->batch_f32.entries;
@@ -1542,3 +1564,36 @@ void mtmd_debug_preprocess_audio(mtmd_context * ctx, const std::vector<float> &
         }
     }
 }
+
+static void stub_log_callback(enum ggml_log_level, const char *, void *) {
+    // do nothing
+}
+
+std::map<ggml_backend_dev_t, size_t> mtmd_get_memory_usage(const char * mmproj_fname,
+                                                            struct mtmd_context_params ctx_params) {
+    mtmd::context_ptr ctx;
+    auto saved_log_callback = g_logger_state.log_callback;
+    auto saved_log_user_data = g_logger_state.log_callback_user_data;
+    try {
+        mtmd_log_set(stub_log_callback, nullptr); // suppress logging
+        ctx.reset(new mtmd_context(mmproj_fname, nullptr, ctx_params));
+        mtmd_log_set(saved_log_callback, saved_log_user_data); // restore log callback
+        std::map<ggml_backend_dev_t, size_t> total_mem;
+        auto merge = [&](const struct clip_ctx * c) {
+            for (auto & [dev, size] : clip_get_mem_usage(c)) {
+                total_mem[dev] += size;
+            }
+        };
+        if (ctx->ctx_v) {
+            merge(ctx->ctx_v);
+        }
+        if (ctx->ctx_a) {
+            merge(ctx->ctx_a);
+        }
+        return total_mem;
+    } catch (const std::exception & e) {
+        mtmd_log_set(saved_log_callback, saved_log_user_data); // restore log callback
+        LOG_ERR("%s: error: %s\n", __func__, e.what());
+        return {};
+    }
+}
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index 54b9515a3ea..5d518df799e 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -9,6 +9,7 @@
 #include <stdbool.h>
 
 #ifdef __cplusplus
+#include <map>
 #include <string>
 #include <vector>
 #include <cinttypes>
@@ -261,6 +262,14 @@ MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
 } // extern "C"
 #endif
 
+// Get memory usage of the current model in bytes, per backend device
+// Note: this is an unstable API, used internally by fit_params; it WILL be removed or changed without deprecation
+#ifdef __cplusplus
+MTMD_API std::map<ggml_backend_dev_t, size_t> mtmd_get_memory_usage(
+    const char * mmproj_fname,
+    struct mtmd_context_params ctx_params);
+#endif
+
 //
 // C++ wrappers
 //
diff --git a/tools/mtmd/requirements.txt b/tools/mtmd/requirements.txt
index 0a1f4e86477..f26d8e912a3 100644
--- a/tools/mtmd/requirements.txt
+++ b/tools/mtmd/requirements.txt
@@ -1,5 +1,12 @@
 -r ../../requirements/requirements-convert_legacy_llama.txt
 --extra-index-url https://download.pytorch.org/whl/cpu
 pillow~=11.3.0
-torch~=2.6.0
-torchvision~=0.21.0
+
+## Embedding Gemma requires PyTorch 2.6.0 or later, bumped to 2.11.0 for compatibility
+torch==2.11.0; platform_machine != "s390x" # check_requirements: ignore "=="
+torchvision==0.26.0; platform_machine != "s390x" # check_requirements: ignore "=="
+
+# torch s390x packages can only be found from nightly builds
+--extra-index-url https://download.pytorch.org/whl/nightly
+torch>=0.0.0.dev0; platform_machine == "s390x" # check_requirements: ignore "=="
+torchvision>=0.0.0.dev0; platform_machine == "s390x" # check_requirements: ignore "=="
diff --git a/tools/mtmd/tests/test-1-extracted.md b/tools/mtmd/tests/test-1-extracted.md
deleted file mode 100644
index a92dcd95916..00000000000
--- a/tools/mtmd/tests/test-1-extracted.md
+++ /dev/null
@@ -1,85 +0,0 @@
-<|ref|>title<|/ref|><|det|>[[61, 255, 907, 533]]<|/det|>
-# MEN WALK ON MOON
-ASTRONAUTS LAND ON PLAIN;
-COLLECT ROCKS, PLANT FLAG
-
-<|ref|>text<|/ref|><|det|>[[56, 559, 268, 629]]<|/det|>
-Voice From Moon:
-Eagle Has Landed'
-
-<|ref|>text<|/ref|><|det|>[[74, 645, 262, 675]]<|/det|>
-EAGLE (the lunar surface, Houston, Truesquily)
-Base here, The Eagle has landed.
-
-<|ref|>text<|/ref|><|det|>[[74, 675, 262, 720]]<|/det|>
-BOOTHROOM: Lounge, Truesquily, we enjoy you on the ground. You've got a bunch of guys about to toss bikes. We're breaking again. Thanks a lot.
-
-<|ref|>text<|/ref|><|det|>[[74, 720, 262, 750]]<|/det|>
-TRAVELLING MADE: Time you. BOOTHROOM: You're looking good here.
-
-<|ref|>text<|/ref|><|det|>[[74, 750, 262, 780]]<|/det|>
-TRAVELLING MADE: A very smooth touchdown. BEDROOM: Eagle, you are very far. I'll. (The first sign in the lunar appearance) (Over.)
-
-<|ref|>text<|/ref|><|det|>[[74, 780, 262, 810]]<|/det|>
-TRAVELLING MADE: Eagle, stay for I'll. BOOTHROOM: Bumper and we are you waiting the cue.
-
-<|ref|>text<|/ref|><|det|>[[74, 810, 262, 830]]<|/det|>
-TRAVELLING MADE: Eagle, and service mobility.
-
-<|ref|>text<|/ref|><|det|>[[74, 830, 262, 850]]<|/det|>
-How do you read me?
-
-<|ref|>text<|/ref|><|det|>[[74, 850, 262, 880]]<|/det|>
-TRAVELLING COLUMBIA, he has landed Truesquily. Base, Eagle is at Truesquily. I read you first by. Over.
-
-<|ref|>text<|/ref|><|det|>[[74, 880, 262, 900]]<|/det|>
-COLUMBIA: Yes, I heard the whole thing.
-
-<|ref|>text<|/ref|><|det|>[[74, 900, 262, 920]]<|/det|>
-BOOTHROOM: Well, it's a good show.
-
-<|ref|>text<|/ref|><|det|>[[74, 920, 262, 940]]<|/det|>
-COLUMBIA: Fantastic.
-
-<|ref|>text<|/ref|><|det|>[[74, 940, 262, 960]]<|/det|>
-TRAVELLING MADE: I'll read that.
-
-<|ref|>text<|/ref|><|det|>[[74, 960, 262, 980]]<|/det|>
-APOLLO CONTROL: The most major sky to sky will be for the 23 event, that is at 21 minutes 26 sec-
-
-<|ref|>text<|/ref|><|det|>[[74, 980, 262, 990]]<|/det|>
-tion of lunar descent.
-
-<|ref|>image<|/ref|><|det|>[[270, 545, 697, 990]]<|/det|>
-
-
-<|ref|>text<|/ref|><|det|>[[715, 559, 911, 629]]<|/det|>
-A Powdery Surface
-Is Closely Explored
-
-<|ref|>text<|/ref|><|det|>[[733, 645, 851, 665]]<|/det|>
-BY JOHN NOBLE WILFORD
-
-<|ref|>text<|/ref|><|det|>[[715, 669, 911, 700]]<|/det|>
-HOUSTON, Monday, July 21—New hires landed and walked on the moon.
-
-<|ref|>text<|/ref|><|det|>[[715, 700, 911, 750]]<|/det|>
-Two Americans, astronauts of Apollo 11, steered their Eagle-shaped lunar module safely and smoothly to the lunar landing yesterday at 4:17:40 P.M., Eastern day-light time.
-
-<|ref|>text<|/ref|><|det|>[[715, 750, 911, 780]]<|/det|>
-Neil A. Armstrong, the 38-year-old civilian commander, radioed to earth and the landing team here.
-
-<|ref|>text<|/ref|><|det|>[[715, 780, 911, 830]]<|/det|>
-"Boom, Truesquily! Base here. The Eagle has landed," the first man to reach the moon—Neil Armstrong and his engineer, Capt. Charles E. Alder, of the Jet Propulsion Laboratory, the space agency's rocket and space program manager.
-
-<|ref|>text<|/ref|><|det|>[[715, 830, 911, 880]]<|/det|>
-About six and a half hours later, Mr. Armstrong opened the landing craft's hatch, stepped slowly down the ladder and descended as he pointed his first landing footguard on the lunar crater.
-
-<|ref|>text<|/ref|><|det|>[[715, 880, 911, 920]]<|/det|>
-"That's one small step for man, one giant leap for mankind."
-
-<|ref|>text<|/ref|><|det|>[[715, 920, 911, 960]]<|/det|>
-His first step on the moon came on 10:56:29 P.M., as a television camera recorded the craft's transmitted his every word to an aerial and excited audiences of hundreds of millions of people on earth.
-
-<|ref|>text<|/ref|><|det|>[[749, 960, 861, 974]]<|/det|>
-Testable Slope Test Soil
diff --git a/tools/mtmd/tests/test-1-extracted.txt b/tools/mtmd/tests/test-1-extracted.txt
deleted file mode 100644
index 4fe273e31b6..00000000000
--- a/tools/mtmd/tests/test-1-extracted.txt
+++ /dev/null
@@ -1,42 +0,0 @@
-MEN WALK ON MOON
-ASTRONAUTS LAND ON PLAIN;
-COLLECT ROCKS, PLANT FLAG
-
-Voice From Moon:
-'Eagle Has Landed'
-
-A Powder Surface
-Is Closely Explored
-
-By JOHN NOBLE WILFORD
-NOVEMBER, Monday, July 21—New York Herald and
-wished on the moon.
-
-Two American astronauts of Apollo 11, steered their
-frigate Eagle toward the moon's surface and smoothly to
-the lunar landing yesterday at 4:17:40 P.M., Eastern day-
-light time.
-
-Neil A. Armstrong, the 38-year-old civilian commander,
-landed on the soft sand of the moon's surface here.
-
-"Beautiful, Triumph!" he said. "The Eagle has landed."
-
-The first man to reach the moon—Neil Armstrong and
-his co-pilot, Charles E. "Pete" Conrad, 26, of the Pentagon,
-brought their ship to rest on a level, rock-strewn plain near
-the moon's surface. The two men and two of the three
-astronauts on board, Armstrong, Conrad and Edwin E.
-Aldrin, 38, of Houston, stepped slowly down the ladder
-and descended as he pointed his first full-flaming footpad
-at the lunar crater.
-
-"That's one small step for man, one giant leap for
-mankind."
-
-His first step on the moon came at 10:56:20 P.M., as
-a television camera rolled the earth's thousandth line every
-second to an aerial and studied audiences of hundreds of
-millions of people on earth.
-
-Textile Slope Test Soil
diff --git a/tools/mtmd/tests/test-1-ground-truth.txt b/tools/mtmd/tests/test-1-ground-truth.txt
new file mode 100644
index 00000000000..fd85b6485f7
--- /dev/null
+++ b/tools/mtmd/tests/test-1-ground-truth.txt
@@ -0,0 +1,24 @@
+
+ A Powdery Surface
+  Is Closely Explored
+
+By JOHN NOBLE WILFORD
+Special to The New York Times
+
+HOUSTON, Monday, July 21—Men have landed and walked on the moon.
+
+Two Americans, astronauts of Apollo 11, steered their fragile four-legged lunar module safely and smoothly to the historic landing yesterday at 4:17:40 P.M., Eastern daylight time.
+
+Neil A. Armstrong, the 38-year-old civilian commander, radioed to earth and the mission control room here:
+
+"Houston, Tranquility Base here. The Eagle has landed."
+
+The first men to reach the moon—Mr. Armstrong and his co-pilot, Col. Edwin E. Aldrin Jr. of the Air Force—brought their ship to rest on a level, rock-strewn plain near the southwestern shore of the arid Sea of Tranquility.
+
+About six and a half hours later, Mr. Armstrong opened the landing craft's hatch, stepped slowly down the ladder and declared as he planted the first human footprint on the lunar crust:
+
+"That's one small step for man, one giant leap for mankind."
+
+His first step on the moon came at 10:56:20 P.M., as a television camera outside the craft transmitted his every move to an awed and excited audience of hundreds of millions of people on earth.
+
+Tentative Steps Test Soil
diff --git a/tools/mtmd/tests/test-deepseek-ocr.py b/tools/mtmd/tests/test-deepseek-ocr.py
index 674a3500151..5c1980271b8 100644
--- a/tools/mtmd/tests/test-deepseek-ocr.py
+++ b/tools/mtmd/tests/test-deepseek-ocr.py
@@ -1,186 +1,220 @@
 #!/usr/bin/env python3
 """
-Test script to compare llama.cpp mtmd-cli output with HuggingFace reference implementation
-for DeepSeek-OCR model using embedding similarity.
+Evaluates llama.cpp's DeepSeek-OCR by comparing its output for a test
+image to the actual text in part of that image.
+
+Runs the test image through mtmd-cli, calculates CER and chrF for
+its output, and holds them against the HF model's scores.
 """
 
 import argparse
+import logging
 import subprocess
 import sys
+import unicodedata
 from pathlib import Path
 
-from sentence_transformers import SentenceTransformer
-from sentence_transformers import util
+logger = logging.getLogger("deepseek-ocr-test")
+
+DEFAULT_IMAGE = "test-1.jpeg"
+DEFAULT_EXPECTED_TEXT = "test-1-ground-truth.txt"
+RUN_TIMEOUT = 300
+
+# DeepSeek-OCR reference scores on the test image.
+# This is the baseline the implementation should keep up with.
+HF_REFERENCE_CER = 0.3030
+HF_REFERENCE_CHRF = 67.52
+
+CER_TOLERANCE = 0.02
+CHRF_TOLERANCE = 2.0
+
+CER_MAX = HF_REFERENCE_CER + CER_TOLERANCE
+CHRF_MIN = HF_REFERENCE_CHRF - CHRF_TOLERANCE
+
+
+def verdict(ok: bool) -> str:
+    return "PASS" if ok else "FAIL"
+
 
+def normalize_text(text: str) -> str:
+    """NFC-normalize and collapse whitespace, so line-wrap and spacing
+    don't count as CER errors."""
+    return " ".join(unicodedata.normalize("NFC", text).split())
 
-def run_mtmd_deepseek_ocr(
-        model_path: str,
-        mmproj_path: str,
-        image_path: str,
-        bin_path: str,
-        prompt: str = "Free OCR."
-) -> str:
+
+def locally_align(expected: str, ocr_out: str) -> str:
+    """Return the span of `ocr_out` that best matches `expected`.
+
+    The ground truth covers part of the article body.
+    But the test image includes half of the newspaper's front page.
+    Fuzzy partial-ratio matching picks out
+    the body so the unrelated text doesn't disturb CER / chrF.
     """
-    Run inference using llama.cpp mtmd-cli.
+    from rapidfuzz import fuzz
+    alignment = fuzz.partial_ratio_alignment(expected, ocr_out)
+    if alignment is None or alignment.dest_end <= alignment.dest_start:
+        return ocr_out
+    return ocr_out[alignment.dest_start:alignment.dest_end]
+
+
+def compute_cer(expected: str, ocr_out: str) -> float:
+    """Character Error Rate. Lower is better.
+    CER: fraction of characters you'd insert/delete/substitute to fix the output; 0 = perfect."""
+    import jiwer
+    return jiwer.cer(expected, ocr_out)
+
+
+def compute_chrf(expected: str, ocr_out: str) -> float:
+    """chrF score on 0-100. Higher is better.
+    chrF: F-score over shared character n-grams; more forgiving of small word/spacing drift than CER.
     """
+    from sacrebleu.metrics import CHRF
+    return CHRF().sentence_score(ocr_out, [expected]).score
+
+
+def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
+    """Run mtmd-cli on the image and return its output."""
     cmd = [
-        bin_path,
-        "-m", model_path,
-        "--mmproj", mmproj_path,
-        "--image", image_path,
-        # "-p", "<|grounding|>Convert the document to markdown.",
-        "-p", prompt,
+        str(bin_path),
+        "-m", str(model_path),
+        "--mmproj", str(mmproj_path),
+        "--image", str(image_path),
+        "-p", "Free OCR. ",
         "--chat-template", "deepseek-ocr",
         "--temp", "0",
-        "-n", "1024",
-        # "--verbose"
+        "--flash-attn", "off",  # match the HF "eager" attention reference
+        "--no-warmup",
     ]
+    logger.debug(f"  command: {' '.join(cmd)}")
 
-    print(f"Running llama.cpp command: {' '.join(cmd)}")
-
-    result = subprocess.run(
-        cmd,
-        capture_output=True,
-        text=False,
-        timeout=300
-    )
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=False, timeout=RUN_TIMEOUT)
+    except subprocess.TimeoutExpired as e:
+        if e.stderr:
+            logger.error("llama.cpp stderr:\n%s", e.stderr.decode("utf-8", errors="replace"))
+        raise RuntimeError(f"llama-mtmd-cli timed out after {RUN_TIMEOUT}s")
 
     if result.returncode != 0:
-        stderr = result.stderr.decode('utf-8', errors='replace')
-        print(f"llama.cpp stderr: {stderr}")
+        logger.error("llama.cpp stderr:\n%s", result.stderr.decode("utf-8", errors="replace"))
         raise RuntimeError(f"llama-mtmd-cli failed with code {result.returncode}")
 
-    output = result.stdout.decode('utf-8', errors='replace').strip()
-    print(f"llama.cpp output length: {len(output)} chars")
+    output = result.stdout.decode("utf-8", errors="replace").strip()
+    if not output:
+        raise RuntimeError("llama-mtmd-cli produced no output on stdout")
+    logger.info(f"  output: {len(output)} chars")
     return output
 
 
-def compute_embedding_similarity(text1: str, text2: str, model_name: str) -> float:
-    """
-    Compute cosine similarity between two texts using embedding model.
-    """
-    print(f"Loading embedding model: {model_name}")
+def read_expected_text(file_path: Path) -> str:
+    with open(file_path, "r", encoding="utf-8") as f:
+        return f.read().strip()
 
-    # Use sentence-transformers for easier embedding extraction
-    embed_model = SentenceTransformer(model_name)
 
-    print("Computing embeddings...")
-    embeddings = embed_model.encode([text1, text2], convert_to_numpy=True)
+def evaluate(expected: str, ocr_out: str) -> bool:
+    expected = normalize_text(expected)
+    ocr_out = normalize_text(ocr_out)
+    aligned = locally_align(expected, ocr_out)
+
+    logger.debug(f"\n--- expected (normalized) ---\n{expected}")
+    logger.debug(f"\n--- OCR output (normalized) ---\n{ocr_out}")
+    logger.debug(f"\n--- aligned span ---\n{aligned}")
+
+    cer = compute_cer(expected, aligned)
+    chrf = compute_chrf(expected, aligned)
+
+    cer_pass = cer <= CER_MAX
+    chrf_pass = chrf >= CHRF_MIN
+    passed = cer_pass and chrf_pass
+
+    logger.info("")
+    logger.info("=" * 60)
+    logger.info("Free OCR evaluation:")
+    logger.info("=" * 60)
+    logger.info(f"  CER               {cer:>7.4f}    (<= {CER_MAX:>7.4f}  -> {verdict(cer_pass)})")
+    logger.info(f"  chrF (0-100)      {chrf:>7.2f}    (>= {CHRF_MIN:>7.2f}  -> {verdict(chrf_pass)})")
+    logger.info(f"  Expected chars    {len(expected):>7}")
+    logger.info(f"  Aligned chars     {len(aligned):>7} (of {len(ocr_out)} OCR chars)")
+    logger.info("")
+    logger.info(f"  Result: {verdict(passed)}")
+    logger.info("=" * 60)
+    return passed
+
+
+def argument_parser() -> argparse.ArgumentParser:
+    ap = argparse.ArgumentParser(description="Compare llama.cpp DeepSeek-OCR output with a ground-truth transcript")
+    ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-bf16.gguf",
+                    help="Path to llama.cpp GGUF model (relative to repo root or absolute)")
+    ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-bf16.gguf",
+                    help="Path to mmproj GGUF file (relative to repo root or absolute)")
+    ap.add_argument("--llama-bin", default="build/bin/llama-mtmd-cli",
+                    help="Path to llama-mtmd-cli binary (relative to repo root or absolute)")
+    ap.add_argument("--verbose", action="store_true",
+                    help="Also log the expected, OCR, and aligned text")
+    return ap
 
-    similarity = util.similarity.cos_sim([embeddings[0]], [embeddings[1]])[0][0]
-    return float(similarity)
 
+def configure_logging(verbose: bool) -> None:
+    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO,
+                        format="%(message)s")
 
-def read_expected_output(file_path: str) -> str:
-    """
-    Read expected OCR output from file.
-    """
-    cur_path = Path(__file__).parent
-    expected_path = str(cur_path / file_path)
-    with open(expected_path, "r", encoding="utf-8") as f:
-        return f.read().strip()
 
+def resolve_path(path: str, base: Path) -> Path:
+    p = Path(path)
+    return p if p.is_absolute() else base / p
 
-def main():
-    ap = argparse.ArgumentParser(description="Compare llama.cpp and HuggingFace DeepSeek-OCR outputs")
-    ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-f16.gguf",
-                    help="Path to llama.cpp GGUF model")
-    ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-f16.gguf",
-                    help="Path to mmproj GGUF file")
-    ap.add_argument("--image", default="test-1.jpeg",
-                    help="Path to test image")
-    ap.add_argument("--llama-bin", default="build/bin/llama-mtmd-cli",
-                    help="Path to llama-mtmd-cli binary")
-    ap.add_argument("--embedding-model", default="Qwen/Qwen3-Embedding-0.6B",
-                    help="Embedding model for similarity computation")
-    ap.add_argument("--threshold", type=float, default=0.7,
-                    help="Minimum similarity threshold for pass")
-    args = ap.parse_args()
-
-    # Validate paths
-    # script directory + image
-    mtmd_dir = Path(__file__).parent.parent
-    args.image = str(mtmd_dir / args.image)
-    # project directory + llama model
-    args.llama_model = str(mtmd_dir.parent.parent / args.llama_model)
-    # project directory + mmproj
-    args.mmproj = str(mtmd_dir.parent.parent / args.mmproj)
-    args.llama_bin = str(mtmd_dir.parent.parent / args.llama_bin)
-    if not Path(args.image).exists():
-        print(f"Error: Image not found: {args.image}")
-        sys.exit(1)
-    if not Path(args.llama_model).exists():
-        print(f"Error: Model not found: {args.llama_model}")
-        sys.exit(1)
-    if not Path(args.mmproj).exists():
-        print(f"Error: mmproj not found: {args.mmproj}")
-        sys.exit(1)
-
-    print("=" * 60)
-    print("DeepSeek-OCR: llama.cpp vs HuggingFace Comparison")
-    print("=" * 60)
-
-    # Default paths based on your command
-
-    # Run llama.cpp inference
-    print("\n[2/3] Running llama.cpp implementation...")
-    llama_free_ocr = run_mtmd_deepseek_ocr(
-        args.llama_model,
-        args.mmproj,
-        args.image,
-        args.llama_bin
-    )
-
-    llama_md_ocr = run_mtmd_deepseek_ocr(
-        args.llama_model,
-        args.mmproj,
-        args.image,
-        args.llama_bin,
-        prompt="<|grounding|>Convert the document to markdown."
-    )
-
-    expected_free_ocr = read_expected_output("test-1-extracted.txt")
-    expected_md_ocr = read_expected_output("test-1-extracted.md")
-
-    # Compute similarity
-    print("\n[3/3] Computing embedding similarity...")
-    free_ocr_similarity = compute_embedding_similarity(
-        expected_free_ocr,
-        llama_free_ocr,
-        args.embedding_model
-    )
-
-    md_ocr_similarity = compute_embedding_similarity(
-        expected_md_ocr,
-        llama_md_ocr,
-        args.embedding_model
-    )
-
-    # Results
-    print("\n" + "=" * 60)
-    print("RESULTS")
-    print("=" * 60)
-    print(f"\nReference Model output:\n{'-' * 40}")
-    print(expected_free_ocr)
-    print(f"\nDeepSeek-OCR output:\n{'-' * 40}")
-    print(llama_free_ocr)
-    print(f"\n{'=' * 60}")
-    print(f"Cosine Similarity: {free_ocr_similarity:.4f}")
-    print(f"Threshold: {args.threshold}")
-    print(f"Result: {'PASS' if free_ocr_similarity >= args.threshold else 'FAIL'}")
-    print("=" * 60)
-
-    # Markdown OCR results
-    print(f"\nReference Model Markdown output:\n{'-' * 40}")
-    print(expected_md_ocr)
-    print(f"\nDeepSeek-OCR Markdown output:\n{'-' * 40}")
-    print(llama_md_ocr)
-    print(f"\n{'=' * 60}")
-    print(f"Cosine Similarity (Markdown): {md_ocr_similarity:.4f}")
-    print(f"Threshold: {args.threshold}")
-    print(f"Result: {'PASS' if md_ocr_similarity >= args.threshold else 'FAIL'}")
-    print("=" * 60)
+
+def main() -> int:
+    args = argument_parser().parse_args()
+    configure_logging(args.verbose)
+
+    tests_dir = Path(__file__).parent  # tools/mtmd/tests
+    mtmd_dir = tests_dir.parent  # tools/mtmd
+    repo_root = mtmd_dir.parent.parent  # repo root
+
+    inputs = [
+        ("image", resolve_path(DEFAULT_IMAGE, mtmd_dir)),
+        ("expected-text", resolve_path(DEFAULT_EXPECTED_TEXT, tests_dir)),
+        ("model", resolve_path(args.llama_model, repo_root)),
+        ("mmproj", resolve_path(args.mmproj, repo_root)),
+        ("binary", resolve_path(args.llama_bin, repo_root)),
+    ]
+    for label, path in inputs:
+        if not path.exists():
+            logger.error(f"Error: {label} not found: {path}")
+            return 1
+    paths = dict(inputs)
+
+    logger.info("=" * 60)
+    logger.info("DeepSeek-OCR: llama.cpp vs ground-truth comparison")
+    logger.info("=" * 60)
+    logger.info(f"HF baselines: CER {HF_REFERENCE_CER:.4f}, chrF {HF_REFERENCE_CHRF:.2f}")
+    logger.info(f"Test thresholds: CER <= {CER_MAX:.4f}, chrF >= {CHRF_MIN:.2f}")
+
+    logger.debug("")
+    logger.debug("Resolved test inputs:")
+    for label, path in inputs:
+        logger.debug(f"  {label:<14} {path}")
+
+    logger.info("")
+    logger.info("[1/3] Running llama.cpp 'Free OCR'")
+    try:
+        ocr_out = run_mtmd_cli(paths["model"], paths["mmproj"],
+                               paths["image"], paths["binary"])
+    except RuntimeError as e:
+        logger.error(f"Error: {e}")
+        return 1
+
+    logger.info("")
+    logger.info("[2/3] Reading expected output")
+    expected = read_expected_text(paths["expected-text"])
+    logger.info(f"  expected: {len(expected)} chars")
+
+    logger.info("")
+    logger.info("[3/3] Computing OCR metrics")
+    ok = evaluate(expected, ocr_out)
+
+    return 0 if ok else 1
 
 
 if __name__ == "__main__":
-    main()
+    sys.exit(main())
diff --git a/tools/mtmd/tests/tests-requirements.txt b/tools/mtmd/tests/tests-requirements.txt
index 3134d098d62..f6645a70422 100644
--- a/tools/mtmd/tests/tests-requirements.txt
+++ b/tools/mtmd/tests/tests-requirements.txt
@@ -1,5 +1,3 @@
-sentence-transformers
-transformers
-tokenizers
-torch
-torchvision
+jiwer
+sacrebleu
+rapidfuzz
diff --git a/tools/perplexity/CMakeLists.txt b/tools/perplexity/CMakeLists.txt
index 0c194ee7f08..0eee9acd406 100644
--- a/tools/perplexity/CMakeLists.txt
+++ b/tools/perplexity/CMakeLists.txt
@@ -1,6 +1,23 @@
+# llama-perplexity-impl: perplexity logic, reusable by app
+
+set(TARGET llama-perplexity-impl)
+
+add_library(${TARGET} perplexity.cpp)
+set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
+
+target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} LIBRARY)
+endif()
+
+# llama-perplexity executable
+
 set(TARGET llama-perplexity)
-add_executable(${TARGET} perplexity.cpp)
-target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
+
+add_executable(${TARGET} main.cpp)
+target_link_libraries(${TARGET} PRIVATE llama-perplexity-impl)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 if(LLAMA_TOOLS_INSTALL)
diff --git a/tools/perplexity/main.cpp b/tools/perplexity/main.cpp
new file mode 100644
index 00000000000..13a9940e9ee
--- /dev/null
+++ b/tools/perplexity/main.cpp
@@ -0,0 +1,5 @@
+int llama_perplexity(int argc, char ** argv);
+
+int main(int argc, char ** argv) {
+    return llama_perplexity(argc, argv);
+}
diff --git a/tools/perplexity/perplexity.cpp b/tools/perplexity/perplexity.cpp
index 75defd7c87b..027cd53f29e 100644
--- a/tools/perplexity/perplexity.cpp
+++ b/tools/perplexity/perplexity.cpp
@@ -157,7 +157,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
                 break;
             }
             lock.unlock();
-            const double v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
+            const double v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, log_probs.data() + size_t(i)*nv, tokens[i+1]);
             local_nll += v;
             local_nll2 += v*v;
         }
@@ -169,7 +169,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
     for (auto & w : workers) {
         w.join();
     }
-    out.write((const char *)log_probs.data(), n_token*nv*sizeof(uint16_t));
+    out.write((const char *)log_probs.data(), size_t(n_token)*nv*sizeof(uint16_t));
 }
 
 struct kl_divergence_result {
@@ -279,7 +279,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
                 break;
             }
             lock.unlock();
-            std::pair<double, float> v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
+            std::pair<double, float> v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, base_log_probs.data() + size_t(i)*nv, tokens[i+1], local_kld);
             kld_values[i]    = (float)v.first;
             p_diff_values[i] = v.second;
         }
@@ -524,7 +524,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
         logits_stream.write((const char *)&n_chunk, sizeof(n_chunk));
         logits_stream.write((const char *)tokens.data(), n_chunk*n_ctx*sizeof(tokens[0]));
         const int nv = 2*((n_vocab + 1)/2) + 4;
-        log_probs.resize(n_ctx * nv);
+        log_probs.resize(size_t(n_ctx) * nv);
     }
 
     // We get the logits for all the tokens in the context window (params.n_ctx)
@@ -2005,7 +2005,10 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
     LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
 }
 
-int main(int argc, char ** argv) {
+// satisfies -Wmissing-declarations
+int llama_perplexity(int argc, char ** argv);
+
+int llama_perplexity(int argc, char ** argv) {
     std::setlocale(LC_NUMERIC, "C");
 
     common_params params;
diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt
index 965adc0059b..eead4c85951 100644
--- a/tools/quantize/CMakeLists.txt
+++ b/tools/quantize/CMakeLists.txt
@@ -1,7 +1,23 @@
+# llama-quantize-impl: quantize logic, reusable by app
+
+set(TARGET llama-quantize-impl)
+
+add_library(${TARGET} quantize.cpp)
+set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
+
+target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} LIBRARY)
+endif()
+
+# llama-quantize executable
+
 set(TARGET llama-quantize)
-add_executable(${TARGET} quantize.cpp)
-target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
-target_include_directories(${TARGET} PRIVATE ../../common)
+
+add_executable(${TARGET} main.cpp)
+target_link_libraries(${TARGET} PRIVATE llama-quantize-impl)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 if(LLAMA_TOOLS_INSTALL)
diff --git a/tools/quantize/main.cpp b/tools/quantize/main.cpp
new file mode 100644
index 00000000000..fc247190c83
--- /dev/null
+++ b/tools/quantize/main.cpp
@@ -0,0 +1,5 @@
+int llama_quantize(int argc, char ** argv);
+
+int main(int argc, char ** argv) {
+    return llama_quantize(argc, argv);
+}
diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 3d33d47d98b..7292bda6f4e 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -490,7 +490,10 @@ static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers
     return true;
 }
 
-int main(int argc, char ** argv) {
+// satisfies -Wmissing-declarations
+int llama_quantize(int argc, char ** argv);
+
+int llama_quantize(int argc, char ** argv) {
     std::setlocale(LC_NUMERIC, "C");
     if (argc < 3) {
         usage(argv[0]);
diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt
index 57d3e871d99..7d427431db9 100644
--- a/tools/server/CMakeLists.txt
+++ b/tools/server/CMakeLists.txt
@@ -27,24 +27,33 @@ target_include_directories(${TARGET} PRIVATE ../mtmd)
 target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
 target_link_libraries(${TARGET} PUBLIC llama-common mtmd ${CMAKE_THREAD_LIBS_INIT})
 
+# llama-server-impl: server logic, reusable by app
 
-# llama-server executable
-
-set(TARGET llama-server)
+set(TARGET llama-server-impl)
 
-set(TARGET_SRCS
+add_library(${TARGET}
     server.cpp
     server-http.cpp
     server-http.h
     server-models.cpp
     server-models.h
 )
+set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 
-add_executable(${TARGET} ${TARGET_SRCS})
-install(TARGETS ${TARGET} RUNTIME)
+target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_include_directories(${TARGET} PRIVATE ../mtmd ${CMAKE_SOURCE_DIR})
+target_link_libraries(${TARGET} PUBLIC server-context llama-ui cpp-httplib ${CMAKE_THREAD_LIBS_INIT})
 
-target_include_directories(${TARGET} PRIVATE ../mtmd)
-target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
-target_link_libraries(${TARGET} PRIVATE server-context llama-ui PUBLIC llama-common cpp-httplib ${CMAKE_THREAD_LIBS_INIT})
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} LIBRARY)
+endif()
+
+# llama-server executable
+
+set(TARGET llama-server)
+
+add_executable(${TARGET} main.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 
+target_link_libraries(${TARGET} PRIVATE llama-server-impl)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/server/README.md b/tools/server/README.md
index 9b413423975..0d20ced879f 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -105,11 +105,11 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_FILE_V) |
 | `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
 | `--log-disable` | Log disable |
-| `--log-file FNAME` | Log to file<br/>(env: LLAMA_LOG_FILE) |
-| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal<br/>(env: LLAMA_LOG_COLORS) |
+| `--log-file FNAME` | Log to file<br/>(env: LLAMA_ARG_LOG_FILE) |
+| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal<br/>(env: LLAMA_ARG_LOG_COLORS) |
 | `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
-| `--offline` | Offline mode: forces use of cache, prevents network access<br/>(env: LLAMA_OFFLINE) |
-| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/> - 0: generic output<br/> - 1: error<br/> - 2: warning<br/> - 3: info<br/> - 4: debug<br/>(default: 3)<br/><br/>(env: LLAMA_LOG_VERBOSITY) |
+| `--offline` | Offline mode: forces use of cache, prevents network access<br/>(env: LLAMA_ARG_OFFLINE) |
+| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/> - 0: generic output<br/> - 1: error<br/> - 2: warning<br/> - 3: info<br/> - 4: trace (more info)<br/> - 5: debug<br/>(default: 3)<br/><br/>(env: LLAMA_ARG_LOG_VERBOSITY) |
 | `--log-prefix, --no-log-prefix` | Enable prefix in log messages<br/>(env: LLAMA_ARG_LOG_PREFIX) |
 | `--log-timestamps, --no-log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_ARG_LOG_TIMESTAMPS) |
 | `--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K) |
@@ -163,7 +163,7 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) |
 | `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) |
 | `-ctxcp, --ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
-| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)<br/>(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) |
+| `-cms, --checkpoint-min-step N` | minimum spacing between context checkpoints in tokens (default: 256, 0 = no minimum)<br/>(env: LLAMA_ARG_CHECKPOINT_MIN_SPACING_NT) |
 | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
 | `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
 | `--cache-idle-slots, --no-cache-idle-slots` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)<br/>(env: LLAMA_ARG_CACHE_IDLE_SLOTS) |
@@ -204,7 +204,7 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `--api-key-file FNAME` | path to file containing API keys (default: none) |
 | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
 | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
-| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
+| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) |
 | `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
 | `--cache-prompt, --no-cache-prompt` | whether to enable prompt caching (default: enabled)<br/>(env: LLAMA_ARG_CACHE_PROMPT) |
@@ -223,8 +223,8 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
 | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
 | `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_PREFILL_ASSISTANT) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) |
@@ -249,6 +249,7 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) |
 | `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) |
 | `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.00)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) |
+| `--spec-draft-backend-sampling, --no-spec-draft-backend-sampling` | offload draft sampling to the backend (default: enabled)<br/>(env: LLAMA_ARG_SPEC_DRAFT_BACKEND_SAMPLING) |
 | `--spec-draft-device, -devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
 | `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
 | `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_MODEL) |
diff --git a/tools/server/main.cpp b/tools/server/main.cpp
new file mode 100644
index 00000000000..7f17c56a8c2
--- /dev/null
+++ b/tools/server/main.cpp
@@ -0,0 +1,5 @@
+int llama_server(int argc, char ** argv);
+
+int main(int argc, char ** argv) {
+    return llama_server(argc, argv);
+}
diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index dc00edfa82a..fb71792fe6d 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -1110,6 +1110,16 @@ json oaicompat_chat_params_parse(
         llama_params["chat_parser"] = chat_params.parser;
     }
 
+    llama_params["message_spans"] = json::array();
+
+    for (const auto & span : chat_params.message_spans) {
+        llama_params["message_spans"].push_back({
+            { "role", span.role },
+            { "pos",  span.pos  },
+            { "len",  span.len  },
+        });
+    }
+
     // Reasoning budget: pass parameters through to sampling layer
     {
         int reasoning_budget = opt.reasoning_budget;
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index dc3189e1705..db635e5521c 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -8,7 +8,10 @@
 
 #include "build-info.h"
 #include "common.h"
+#include "fit.h"
 #include "llama.h"
+#include "../../src/llama-ext.h" // staging API: llama_set_mtp_source
+#include "ggml-cpp.h"
 #include "log.h"
 #include "sampling.h"
 #include "speculative.h"
@@ -506,6 +509,9 @@ struct server_slot {
 
         if (ptask) {
             res["id_task"] = ptask->id;
+            res["n_prompt_tokens"]           = (int32_t) prompt.tokens.size();
+            res["n_prompt_tokens_processed"] = n_prompt_tokens_processed;
+            res["n_prompt_tokens_cache"]     = n_prompt_tokens_cache;
             res["params"] = ptask->params.to_json(only_metrics);
             res["next_token"] = {
                 {
@@ -701,6 +707,10 @@ struct server_context_impl {
     bool sleeping = false;
 
     void destroy() {
+        spec.reset();
+        ctx_dft.reset();
+        model_dft.reset();
+
         llama_init.reset();
 
         ctx_tgt = nullptr;
@@ -746,6 +756,140 @@ struct server_context_impl {
 
         params_base = params;
 
+        std::string & mmproj_path = params_base.mmproj.path;
+        bool has_mmproj = !mmproj_path.empty();
+        mtmd_context_params mparams = mtmd_context_params_default();
+        if (has_mmproj) {
+            mparams.use_gpu          = params_base.mmproj_use_gpu;
+            mparams.print_timings    = false;
+            mparams.n_threads        = params_base.cpuparams.n_threads;
+            mparams.flash_attn_type  = params_base.flash_attn_type;
+            mparams.warmup           = params_base.warmup;
+            mparams.image_min_tokens = params_base.image_min_tokens;
+            mparams.image_max_tokens = params_base.image_max_tokens;
+            mparams.media_marker     = get_media_marker();
+        }
+
+        // optionally get the memory usage of mmproj
+        if (has_mmproj && params_base.fit_params) {
+            auto mmproj_mem = mtmd_get_memory_usage(mmproj_path.c_str(), mparams);
+            if (!mmproj_mem.empty()) {
+                size_t total = 0;
+                for (auto & [dev, size] : mmproj_mem) {
+                    total += size;
+                }
+                SRV_INF("[mtmd] estimated worst-case memory usage of mmproj is %.2f MiB\n", total / (1024.0 * 1024.0));
+                GGML_ASSERT(!params_base.fit_params_target.empty());
+                for (auto & [dev, size] : mmproj_mem) {
+                    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+                        if (ggml_backend_dev_get(i) == dev) {
+                            if (i < params_base.fit_params_target.size()) {
+                                SRV_DBG("[mtmd] adding %.2f MiB to fit_params_target for device %s\n", size / (1024.0 * 1024.0), ggml_backend_dev_name(dev));
+                                params_base.fit_params_target[i] += size;
+                            }
+                            break;
+                        }
+                    }
+                }
+            } else {
+                SRV_ERR("%s", "[mtmd] failed to get memory usage of mmproj\n");
+            }
+        }
+
+        // optionally reserve VRAM for the draft / MTP context before fitting the target model
+        if (params_base.fit_params) {
+            const bool spec_mtp = std::find(params_base.speculative.types.begin(),
+                                            params_base.speculative.types.end(),
+                                            COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params_base.speculative.types.end();
+            const bool has_draft = params_base.speculative.has_dft();
+
+            if (has_draft || spec_mtp) {
+                common_params params_dft = params_base;
+                bool measure_model_bytes = true;
+
+                if (has_draft) {
+                    const auto & params_spec = params_base.speculative.draft;
+                    params_dft.devices               = params_spec.devices;
+                    params_dft.model                 = params_spec.mparams;
+                    params_dft.n_gpu_layers          = params_spec.n_gpu_layers;
+                    params_dft.cache_type_k          = params_spec.cache_type_k;
+                    params_dft.cache_type_v          = params_spec.cache_type_v;
+                    params_dft.tensor_buft_overrides = params_spec.tensor_buft_overrides;
+                } else {
+                    // MTP draft context lives on the target model, only context+compute are new
+                    measure_model_bytes = false;
+                }
+
+                auto mparams_dft = common_model_params_to_llama(params_dft);
+                auto cparams_dft = common_context_params_to_llama(params_dft);
+                if (spec_mtp) {
+                    cparams_dft.ctx_type = LLAMA_CONTEXT_TYPE_MTP;
+                    cparams_dft.type_k   = params_base.speculative.draft.cache_type_k;
+                    cparams_dft.type_v   = params_base.speculative.draft.cache_type_v;
+                }
+                cparams_dft.n_rs_seq = 0;
+
+                bool skip_measure = false;
+                //TODO: remove this
+                if (spec_mtp && has_draft) {
+                    struct gguf_init_params meta_params = {
+                        /* .no_alloc = */ true,
+                        /* .ctx      = */ nullptr,
+                    };
+                    gguf_context_ptr meta(gguf_init_from_file(params_dft.model.path.c_str(), meta_params));
+
+                    if (std::string(gguf_get_val_str(meta.get(), gguf_find_key(meta.get(), "general.architecture"))) == "gemma4-assistant") {
+                        skip_measure = true;
+                        SRV_WRN("[spec] skipping --fit memory measurement for Gemma 4 assistant draft model '%s'\n",
+                                params_dft.model.path.c_str());
+                    }
+                }
+
+                std::vector<ggml_backend_dev_t> devs;
+                uint32_t hp_ngl = 0;
+                uint32_t hp_nct = 0;
+                uint32_t hp_nex = 0;
+                if (!skip_measure) try {
+                    auto dmd = common_get_device_memory_data(
+                        params_dft.model.path.c_str(), &mparams_dft, &cparams_dft,
+                        devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);
+
+                    GGML_ASSERT(!params_base.fit_params_target.empty());
+                    size_t total = 0;
+
+                    std::vector<ggml_backend_dev_t> tgt_devices = params.devices;
+
+                    if (tgt_devices.empty()) {
+                        for(size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                           tgt_devices.push_back(ggml_backend_dev_get(i));
+                        }
+                    }
+
+                    for (size_t j = 0; j < devs.size(); ++j) {
+                        const size_t bytes =
+                            (measure_model_bytes ? dmd[j].mb.model : 0) +
+                            dmd[j].mb.context +
+                            dmd[j].mb.compute;
+                        total += bytes;
+                        for (size_t i = 0; i < tgt_devices.size(); i++) {
+                            if (tgt_devices[i] == devs[j]) {
+                                SRV_DBG("[spec] adding %.2f MiB to fit_params_target for device %s\n",
+                                        bytes / (1024.0 * 1024.0), ggml_backend_dev_name(devs[j]));
+                                params_base.fit_params_target[i] += bytes;
+                                break;
+                            }
+                        }
+                    }
+                    SRV_INF("[spec] estimated memory usage of %s is %.2f MiB\n",
+                            has_draft ? "draft model" : "MTP context",
+                            total / (1024.0 * 1024.0));
+                } catch (const std::exception & e) {
+                    SRV_ERR("[spec] failed to measure %s memory: %s\n",
+                            has_draft ? "draft model" : "MTP context", e.what());
+                }
+            }
+        }
+
         llama_init = common_init_from_params(params_base);
 
         model_tgt = llama_init->model();
@@ -768,13 +912,23 @@ struct server_context_impl {
 
             SRV_INF("loading draft model '%s'\n", params_spec.mparams.path.c_str());
 
+            const bool spec_mtp = std::find(params_base.speculative.types.begin(),
+                                            params_base.speculative.types.end(),
+                                            COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params_base.speculative.types.end();
+
             auto params_dft = params_base;
 
             params_dft.devices      = params_spec.devices;
             params_dft.model        = params_spec.mparams;
             params_dft.n_gpu_layers = params_spec.n_gpu_layers;
-            params_dft.cache_type_k = params_spec.cache_type_k;
-            params_dft.cache_type_v = params_spec.cache_type_v;
+            // TODO: find a better way to expose that the cache is shared
+            if (spec_mtp) {
+                params_dft.cache_type_k = params_base.cache_type_k;
+                params_dft.cache_type_v = params_base.cache_type_v;
+            } else {
+                params_dft.cache_type_k = params_spec.cache_type_k;
+                params_dft.cache_type_v = params_spec.cache_type_v;
+            }
 
             if (params_spec.cpuparams.n_threads > 0) {
                 params_dft.cpuparams.n_threads       = params_spec.cpuparams.n_threads;
@@ -793,9 +947,6 @@ struct server_context_impl {
 
             auto cparams = common_context_params_to_llama(params_dft);
 
-            const bool spec_mtp = std::find(params_base.speculative.types.begin(),
-                                            params_base.speculative.types.end(),
-                                            COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params_base.speculative.types.end();
             if (spec_mtp) {
                 cparams.ctx_type = LLAMA_CONTEXT_TYPE_MTP;
             }
@@ -805,6 +956,11 @@ struct server_context_impl {
             cparams.n_rs_seq = 0;
             ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams));
 
+            if (spec_mtp) {
+                // MTP draft must know its target before the first decode
+                llama_set_mtp_source(ctx_dft.get(), ctx_tgt);
+            }
+
             ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get());
 
             params_base.speculative.draft.ctx_tgt = ctx_tgt;
@@ -816,6 +972,8 @@ struct server_context_impl {
 
             auto cparams_mtp = common_context_params_to_llama(params_base);
             cparams_mtp.ctx_type = LLAMA_CONTEXT_TYPE_MTP;
+            cparams_mtp.type_k   = params_base.speculative.draft.cache_type_k;
+            cparams_mtp.type_v   = params_base.speculative.draft.cache_type_v;
             cparams_mtp.n_rs_seq = 0;
 
             ctx_dft.reset(llama_init_from_model(model_tgt, cparams_mtp));
@@ -824,24 +982,20 @@ struct server_context_impl {
                 return false;
             }
 
+            // wire the source before any decode (the seq-rm probe below
+            // triggers sched_reserve which needs src for Gemma4-style MTP)
+            llama_set_mtp_source(ctx_dft.get(), ctx_tgt);
+
             ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get());
 
             params_base.speculative.draft.ctx_tgt = ctx_tgt;
             params_base.speculative.draft.ctx_dft = ctx_dft.get();
         }
 
-        std::string & mmproj_path = params_base.mmproj.path;
-        if (!mmproj_path.empty()) {
-            mtmd_context_params mparams = mtmd_context_params_default();
-
-            mparams.use_gpu          = params_base.mmproj_use_gpu;
-            mparams.print_timings    = false;
-            mparams.n_threads        = params_base.cpuparams.n_threads;
-            mparams.flash_attn_type  = params_base.flash_attn_type;
-            mparams.warmup           = params_base.warmup;
-            mparams.image_min_tokens = params_base.image_min_tokens;
-            mparams.image_max_tokens = params_base.image_max_tokens;
-            mparams.media_marker     = get_media_marker();
+        if (has_mmproj) {
+            if (!is_resume) {
+                mtmd_helper_log_set(common_log_default_callback, nullptr);
+            }
 
             mctx = mtmd_init_from_file(mmproj_path.c_str(), model_tgt, mparams);
             if (mctx == nullptr) {
@@ -987,6 +1141,13 @@ struct server_context_impl {
         }
         SRV_INF("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
 
+        if (params_base.n_ctx_checkpoints > 0) {
+            SRV_INF("context checkpoints enabled, max = %d, min spacing = %d\n",
+                    params_base.n_ctx_checkpoints, params_base.checkpoint_min_step);
+        } else {
+            SRV_INF("%s", "context checkpoints disabled\n");
+        }
+
         if (!params_base.model_alias.empty()) {
             // backward compat: use first alias as model name
             model_name = *params_base.model_alias.begin();
@@ -2642,8 +2803,6 @@ struct server_context_impl {
                                 }
 
                                 if (pos_min >= pos_min_thold) {
-                                    SLT_WRN(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min, n_swa);
-
                                     // search for a context checkpoint
                                     const auto it = std::find_if(
                                         slot.prompt.checkpoints.rbegin(),
@@ -2660,7 +2819,6 @@ struct server_context_impl {
 
                                     if (!do_reset) {
                                         // restore the context checkpoint
-
                                         it->load_tgt(ctx_tgt,       slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
                                         it->load_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
 
@@ -2796,6 +2954,9 @@ struct server_context_impl {
                         has_mtmd = true;
                     }
 
+                    const int32_t n_before_user = slot.task->params.n_before_user;
+                    const bool n_before_user_known = n_before_user > 0;
+
                     // add prompt tokens for processing in the current batch
                     while (slot.prompt.n_tokens() < slot.task->n_tokens() && batch.n_tokens < n_batch) {
                         // get next token to process
@@ -2824,6 +2985,13 @@ struct server_context_impl {
 
                         slot.n_prompt_tokens_processed++;
 
+                        // stop the prompt batch exactly before the latest user input, so a checkpoint
+                        // can be created after the previous messages
+                        if (n_before_user_known &&
+                            slot.prompt.n_tokens() == n_before_user) {
+                            break;
+                        }
+
                         // process the last few tokens of the prompt separately in order to allow for a checkpoint to be created.
                         // create checkpoints that many tokens before the end of the prompt:
                         //  - 4 + n_ubatch
@@ -2849,6 +3017,8 @@ struct server_context_impl {
                     // the number of tokens added to the batch for the current slot
                     const auto n_tokens_cur = batch.n_tokens - n_tokens_prev;
 
+                    const bool near_prompt_end = slot.task->n_tokens() < slot.prompt.n_tokens() + n_ubatch;
+
                     // entire prompt has been processed
                     if (slot.prompt.n_tokens() == slot.task->n_tokens()) {
                         slot.state = SLOT_STATE_DONE_PROMPT;
@@ -2863,39 +3033,49 @@ struct server_context_impl {
 
                         slot.init_sampler();
                     } else {
-                        if (slot.task->n_tokens() < slot.prompt.n_tokens() + n_ubatch) {
-                            // near the end of the prompt
-                            do_checkpoint = do_checkpoint && true;
-                        } else {
-                            // only do non-end checkpoints if the "checkpoint every n tokens" option is set
-                            do_checkpoint = do_checkpoint && params_base.checkpoint_every_nt > 0;
-
-                            if (do_checkpoint) {
-                                llama_pos last_checkpoint = 0;
-                                if (!slot.prompt.checkpoints.empty()) {
-                                    last_checkpoint = slot.prompt.checkpoints.back().n_tokens;
-                                }
-
-                                do_checkpoint = do_checkpoint && slot.prompt.n_tokens() - batch.n_tokens - last_checkpoint >= params_base.checkpoint_every_nt;
-
-                                if (do_checkpoint) {
-                                    SLT_INF(slot, "%d tokens since last checkpoint at %d, creating new checkpoint during processing at position %d\n", params_base.checkpoint_every_nt, last_checkpoint, slot.prompt.n_tokens());
-                                }
-                            }
+                        // skip ordinary mid-prompt checkpoints
+                        if (!n_before_user_known && !near_prompt_end) {
+                            do_checkpoint = false;
                         }
                     }
 
                     const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id);
                     const auto pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_tgt), slot.id);
 
-                    // no need for empty or small checkpoints
-                    do_checkpoint = do_checkpoint && (pos_min >= 0 && slot.prompt.n_tokens() >= 64);
+                    // checkpoints are created before the current batch is decoded, so
+                    // their token position is the batch start rather than the prompt end
+                    const int32_t n_tokens_start = slot.prompt.n_tokens() - n_tokens_cur;
+
+                    {
+                        const bool is_on_user =
+                            n_before_user_known &&
+                            n_tokens_start == n_before_user;
+
+                        const bool is_after_user =
+                            n_before_user_known &&
+                            n_tokens_start > n_before_user;
+
+                        const bool is_allowed =
+                            !n_before_user_known ||
+                            is_on_user ||
+                            (is_after_user && near_prompt_end);
+
+                        if (do_checkpoint && !is_allowed) {
+                            do_checkpoint = false;
+                        }
+                    }
+
+                    // nothing to checkpoint yet
+                    // TODO: is this check needed?
+                    if (do_checkpoint && pos_min < 0) {
+                        do_checkpoint = false;
+                    }
 
                     // do not checkpoint after mtmd chunks
                     do_checkpoint = do_checkpoint && !has_mtmd;
 
                     // no need to create checkpoints that are too close together
-                    do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || slot.prompt.n_tokens() - n_tokens_cur > slot.prompt.checkpoints.back().n_tokens + 64);
+                    do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || n_tokens_start > slot.prompt.checkpoints.back().n_tokens + params_base.checkpoint_min_step);
                     SLT_DBG(slot, "main/do_checkpoint = %s, pos_min = %d, pos_max = %d\n", do_checkpoint ? "yes" : "no", pos_min, pos_max);
 
                     // note: we create the checkpoint before calling llama_decode(), so the current batch is not
@@ -3412,6 +3592,53 @@ void server_context::on_sleeping_changed(std::function<void(bool)> callback) {
     impl->queue_tasks.on_sleeping_state(std::move(callback));
 }
 
+// compute the number of tokens before the last user message in the prompt
+static int32_t prompt_get_n_before_user(
+        const json & message_spans,
+        const std::string & prompt,
+        const std::vector<raw_buffer> & files,
+        const llama_vocab * vocab,
+        mtmd_context * mctx) {
+    int32_t result = -1;
+    int32_t byte_pos = -1;
+
+    for (const auto & span : message_spans) {
+        const std::string role = json_value(span, "role", std::string());
+
+        if (role == "user") {
+            byte_pos = json_value(span, "pos", -1);
+        }
+    }
+
+    if (byte_pos >= 0) {
+        GGML_ASSERT((size_t) byte_pos <= prompt.size());
+
+        const std::string prefix = prompt.substr(0, (size_t) byte_pos);
+
+        const std::string marker = get_media_marker();
+        size_t n_prefix_media = 0;
+        for (size_t pos = 0; (pos = prefix.find(marker, pos)) != std::string::npos; pos += marker.size()) {
+            n_prefix_media++;
+        }
+
+        GGML_ASSERT(n_prefix_media <= files.size());
+
+        if (mctx != nullptr && n_prefix_media > 0) {
+            // TODO: this makes a copy - avoid it
+            std::vector<raw_buffer> prefix_files(files.begin(), files.begin() + n_prefix_media);
+
+            result = (int32_t) process_mtmd_prompt(mctx, prefix, prefix_files).size();
+        } else {
+            result = (int32_t) tokenize_input_prompts(vocab, nullptr, prefix, true, true)[0].size();
+        }
+
+        SRV_TRC("message_spans: last user message: byte_pos=%d, media=%zu, n_before_user=%d\n",
+                byte_pos, n_prefix_media, result);
+    }
+
+    return result;
+}
+
 
 //
 // server_routes
@@ -3461,6 +3688,18 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
                     meta->slot_n_ctx,
                     meta->logit_bias_eog,
                     data);
+
+            const auto message_spans = json_value(data, "message_spans", json::array());
+            if (prompt.is_string() && message_spans.is_array()) {
+                task.params.n_before_user =
+                    prompt_get_n_before_user(
+                        message_spans,
+                        prompt.get<std::string>(),
+                        files,
+                        ctx_server.vocab,
+                        ctx_server.mctx);
+            }
+
             task.id_slot = json_value(data, "id_slot", -1);
 
             // OAI-compat
diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp
index 9d008fc94c2..c0a9af8f092 100644
--- a/tools/server/server-http.cpp
+++ b/tools/server/server-http.cpp
@@ -99,6 +99,7 @@ bool server_http_context::init(const common_params & params) {
         srv.reset(
             new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str())
         );
+        is_ssl = true;
     } else {
         SRV_INF("%s", "running without SSL\n");
         srv.reset(new httplib::Server());
@@ -231,16 +232,19 @@ bool server_http_context::init(const common_params & params) {
     };
 
     auto middleware_server_state = [this](const httplib::Request & req, httplib::Response & res) {
-        (void)req; // suppress unused parameter warning when LLAMA_BUILD_UI is not defined
         bool ready = is_ready.load();
         if (!ready) {
-#if defined(LLAMA_BUILD_UI)
+#if defined(LLAMA_UI_HAS_ASSETS)
             auto tmp = string_split<std::string>(req.path, '.');
             if (req.path == "/" || (tmp.size() > 0 && tmp.back() == "html")) {
-                res.status = 503;
-                res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
-                return false;
+                if (const llama_ui_asset * a = llama_ui_find_asset("loading.html")) {
+                    res.status = 503;
+                    res.set_content(reinterpret_cast<const char*>(a->data), a->size, "text/html; charset=utf-8");
+                    return false;
+                }
             }
+#else
+            (void)req;
 #endif
             // no endpoints are allowed to be accessed when the server is not ready
             // this is to prevent any data races or inconsistent states
@@ -312,23 +316,27 @@ bool server_http_context::init(const common_params & params) {
                 return 1;
             }
         } else {
-#if defined(LLAMA_BUILD_UI)
-            // using embedded static index.html
-            srv->Get(params.api_prefix + "/", [](const httplib::Request & /*req*/, httplib::Response & res) {
-                // COEP and COOP headers, required by pyodide (python interpreter)
-                res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
-                res.set_header("Cross-Origin-Opener-Policy", "same-origin");
-                res.set_content(reinterpret_cast<const char*>(index_html), index_html_len, "text/html; charset=utf-8");
-                return false;
-            });
-            srv->Get(params.api_prefix + "/bundle.js", [](const httplib::Request & /*req*/, httplib::Response & res) {
-                res.set_content(reinterpret_cast<const char*>(bundle_js), bundle_js_len, "application/javascript; charset=utf-8");
-                return false;
-            });
-            srv->Get(params.api_prefix + "/bundle.css", [](const httplib::Request & /*req*/, httplib::Response & res) {
-                res.set_content(reinterpret_cast<const char*>(bundle_css), bundle_css_len, "text/css; charset=utf-8");
-                return false;
-            });
+#if defined(LLAMA_UI_HAS_ASSETS)
+            auto serve_asset = [](const std::string & name, const char * mime, bool with_isolation_headers) {
+                return [name, mime, with_isolation_headers](const httplib::Request & /*req*/, httplib::Response & res) {
+                    const llama_ui_asset * a = llama_ui_find_asset(name.c_str());
+                    if (!a) {
+                        res.status = 404;
+                        return false;
+                    }
+                    if (with_isolation_headers) {
+                        // COEP and COOP headers, required by pyodide (python interpreter)
+                        res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
+                        res.set_header("Cross-Origin-Opener-Policy", "same-origin");
+                    }
+                    res.set_content(reinterpret_cast<const char*>(a->data), a->size, mime);
+                    return false;
+                };
+            };
+
+            srv->Get(params.api_prefix + "/",           serve_asset("index.html", "text/html; charset=utf-8",              true));
+            srv->Get(params.api_prefix + "/bundle.js",  serve_asset("bundle.js",  "application/javascript; charset=utf-8", false));
+            srv->Get(params.api_prefix + "/bundle.css", serve_asset("bundle.css", "text/css; charset=utf-8",               false));
 #endif
         }
     }
@@ -371,8 +379,8 @@ bool server_http_context::start() {
     thread = std::thread([this]() { pimpl->srv->listen_after_bind(); });
     srv->wait_until_ready();
 
-    listening_address = is_sock ? string_format("unix://%s",    hostname.c_str())
-                                : string_format("http://%s:%d", hostname.c_str(), port);
+    listening_address = is_sock ? string_format("unix://%s", hostname.c_str())
+                                : string_format("%s://%s:%d", is_ssl ? "https" : "http", hostname.c_str(), port);
     return true;
 }
 
diff --git a/tools/server/server-http.h b/tools/server/server-http.h
index 66ee555f501..099b5e1cc6f 100644
--- a/tools/server/server-http.h
+++ b/tools/server/server-http.h
@@ -74,6 +74,7 @@ struct server_http_context {
     std::string path_prefix;
     std::string hostname;
     int port;
+    bool is_ssl = false;
 
     server_http_context();
     ~server_http_context();
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index ccf42320f77..47b6c2a4ec0 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -14,6 +14,7 @@
 #include <mutex>
 #include <condition_variable>
 #include <cstring>
+#include <cstdlib>
 #include <atomic>
 #include <chrono>
 #include <queue>
@@ -159,6 +160,13 @@ void server_model_meta::update_args(common_preset_context & ctx_preset, std::str
     // TODO: maybe validate preset before rendering ?
     // render args
     args = preset.to_args(bin_path);
+
+    // unified binary dispatches by subcommand, re-inject it right after the
+    // binary path so the child starts as 'llama serve ...' not 'llama ...'
+    const char * app_cmd = std::getenv("LLAMA_APP_CMD");
+    if (app_cmd != nullptr && app_cmd[0] != '\0' && !bin_path.empty()) {
+        args.insert(args.begin() + 1, app_cmd);
+    }
 }
 
 void server_model_meta::update_caps() {
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index d45513dbeba..abc00c82bdb 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -149,7 +149,7 @@ task_result_state::task_result_state(const common_chat_parser_params & chat_pars
     , oai_resp_id("resp_" + random_string())
     , oai_resp_reasoning_id("rs_" + random_string())
     , oai_resp_message_id("msg_" + random_string()) {
-    if (!chat_parser_params.echo) {
+    if (chat_parser_params.is_continuation && !chat_parser_params.echo) {
         // initialize chat_msg to avoid emitting a delta containing the assistant prefill
         chat_msg = common_chat_parse("", true, chat_parser_params);
     }
@@ -432,6 +432,10 @@ task_params server_task::params_from_json_cmpl(
         if (data.contains("chat_parser")) {
             params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());
         }
+        if (data.contains("continue_final_message")) {
+            auto continuation = common_chat_continuation_parse(data.at("continue_final_message"));
+            params.chat_parser_params.is_continuation = continuation != COMMON_CHAT_CONTINUATION_NONE;
+        }
         params.chat_parser_params.echo = json_value(data, "echo", false);
     }
 
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
index 0978bb6ff16..60e216e7927 100644
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@@ -61,6 +61,9 @@ struct task_params {
 
     int32_t n_cache_reuse = 0; // min chunk size to attempt reusing from the cache via KV shifting (0 = disabled)
 
+    // number of prompt tokens before the latest user message
+    int32_t n_before_user = -1;
+
     int64_t t_max_prompt_ms  = -1; // TODO: implement
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
 
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index c82f1179431..4d56d45e83c 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -71,7 +71,10 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t
     };
 }
 
-int main(int argc, char ** argv) {
+// satisfies -Wmissing-declarations
+int llama_server(int argc, char ** argv);
+
+int llama_server(int argc, char ** argv) {
     std::setlocale(LC_NUMERIC, "C");
 
     // own arguments required by this example
diff --git a/tools/ui/CMakeLists.txt b/tools/ui/CMakeLists.txt
index 383940cb636..60d9020da38 100644
--- a/tools/ui/CMakeLists.txt
+++ b/tools/ui/CMakeLists.txt
@@ -1,150 +1,104 @@
 set(TARGET llama-ui)
 
-# Deprecated: use LLAMA_UI_HF_BUCKET instead
-set(LLAMA_WEBUI_HF_BUCKET "llama-ui" CACHE STRING "Hugging Face bucket name for prebuilt webui assets (deprecated: use LLAMA_UI_HF_BUCKET)")
 set(LLAMA_UI_HF_BUCKET "llama-ui" CACHE STRING "Hugging Face bucket name for prebuilt UI assets")
 
 # Backward compat: forward old var to new one
-if(DEFINED LLAMA_WEBUI_HF_BUCKET AND NOT DEFINED LLAMA_UI_HF_BUCKET)
+if(DEFINED LLAMA_BUILD_WEBUI)
+    set(LLAMA_BUILD_UI ${LLAMA_BUILD_WEBUI})
+    message(DEPRECATION "LLAMA_BUILD_WEBUI is deprecated, use LLAMA_BUILD_UI instead")
+endif()
+if(DEFINED LLAMA_USE_PREBUILT_WEBUI)
+    set(LLAMA_USE_PREBUILT_UI ${LLAMA_USE_PREBUILT_WEBUI})
+    message(DEPRECATION "LLAMA_USE_PREBUILT_WEBUI is deprecated, use LLAMA_USE_PREBUILT_UI instead")
+endif()
+if(DEFINED LLAMA_WEBUI_HF_BUCKET)
     set(LLAMA_UI_HF_BUCKET ${LLAMA_WEBUI_HF_BUCKET})
-elseif(DEFINED LLAMA_WEBUI_HF_BUCKET AND NOT "${LLAMA_WEBUI_HF_BUCKET}" STREQUAL "${LLAMA_UI_HF_BUCKET}")
     message(DEPRECATION "LLAMA_WEBUI_HF_BUCKET is deprecated, use LLAMA_UI_HF_BUCKET instead")
 endif()
 
-set(TARGET_SRCS "")
-set(UI_COMPILE_DEFS "")
-
-if(LLAMA_BUILD_UI)
-    set(PUBLIC_ASSETS
-        index.html
-        bundle.js
-        bundle.css
-        loading.html
-    )
-
-    # Determine source of UI assets (priority: local > HF Bucket)
-    set(UI_SOURCE "")
-    set(UI_SOURCE_DIR "")
+# Resolve HF asset version: explicit env var > derived from build number > unset
+if(DEFINED ENV{HF_WEBUI_VERSION})
+    set(HF_UI_VERSION "$ENV{HF_WEBUI_VERSION}")
+    message(DEPRECATION "HF_WEBUI_VERSION env var is deprecated, use HF_UI_VERSION instead")
+elseif(DEFINED ENV{HF_UI_VERSION})
+    set(HF_UI_VERSION "$ENV{HF_UI_VERSION}")
+elseif(DEFINED LLAMA_BUILD_NUMBER)
+    set(HF_UI_VERSION "b${LLAMA_BUILD_NUMBER}")
+else()
+    set(HF_UI_VERSION "")
+endif()
 
-    # Priority 1: Check for local build output
-    set(LOCAL_UI_DIR "${PROJECT_SOURCE_DIR}/build/tools/ui/dist")
+if(NOT "${HF_UI_VERSION}" STREQUAL "" AND NOT HF_UI_VERSION MATCHES "^[A-Za-z0-9._-]+$")
+    message(FATAL_ERROR "UI: invalid HF_UI_VERSION='${HF_UI_VERSION}' - must match ^[A-Za-z0-9._-]+$")
+endif()
 
-    # Verify all required assets exist before declaring local source valid
-    set(ALL_ASSETS_PRESENT TRUE)
-    foreach(asset ${PUBLIC_ASSETS})
-        if(NOT EXISTS "${LOCAL_UI_DIR}/${asset}")
-            set(ALL_ASSETS_PRESENT FALSE)
-            break()
-        endif()
-    endforeach()
+set(UI_CPP "${CMAKE_CURRENT_BINARY_DIR}/ui.cpp")
+set(UI_H   "${CMAKE_CURRENT_BINARY_DIR}/ui.h")
 
-    if(ALL_ASSETS_PRESENT)
-        set(UI_SOURCE "local")
-        set(UI_SOURCE_DIR "${LOCAL_UI_DIR}")
-        message(STATUS "UI: using local build from ${UI_SOURCE_DIR}")
+if(CMAKE_CROSSCOMPILING)
+    find_program(HOST_CXX_COMPILER NAMES g++ clang++ NO_CMAKE_FIND_ROOT_PATH)
+    if(NOT HOST_CXX_COMPILER)
+        message(FATAL_ERROR "UI: no host C++ compiler (g++/clang++) found to build llama-ui-embed; set -DHOST_CXX_COMPILER=<path>")
     endif()
+    message(STATUS "UI: building llama-ui-embed with host compiler ${HOST_CXX_COMPILER}")
 
-    # Priority 2: Build-time asset provisioning (npm build → HF Bucket fallback)
-    if(NOT UI_SOURCE_DIR)
-        # Environment variable takes precedence (e.g., from CI workflows)
-        # Deprecated: use HF_UI_VERSION instead
-        if(DEFINED ENV{HF_WEBUI_VERSION})
-            set(HF_UI_VERSION "$ENV{HF_WEBUI_VERSION}")
-            message(DEPRECATION "HF_WEBUI_VERSION env var is deprecated, use HF_UI_VERSION instead")
-            if(NOT HF_UI_VERSION MATCHES "^[A-Za-z0-9._-]+$")
-                message(FATAL_ERROR "UI: invalid HF_WEBUI_VERSION='${HF_UI_VERSION}' - must match ^[A-Za-z0-9._-]+$")
-            endif()
-        elseif(DEFINED ENV{HF_UI_VERSION})
-            set(HF_UI_VERSION "$ENV{HF_UI_VERSION}")
-            if(NOT HF_UI_VERSION MATCHES "^[A-Za-z0-9._-]+$")
-                message(FATAL_ERROR "UI: invalid HF_UI_VERSION='${HF_UI_VERSION}' - must match ^[A-Za-z0-9._-]+$")
-            endif()
-        elseif(DEFINED LLAMA_BUILD_NUMBER)
-            set(HF_UI_VERSION "b${LLAMA_BUILD_NUMBER}")
-            message(STATUS "UI: derived HF_UI_VERSION=b${LLAMA_BUILD_NUMBER}")
-        else()
-            set(HF_UI_VERSION "")
-            message(STATUS "UI: version not specified (will use HF 'latest')")
-        endif()
-
-        if("${HF_UI_VERSION}" STREQUAL "")
-            set(UI_VERSION_TAG "provisioned")
-        else()
-            set(UI_VERSION_TAG "${HF_UI_VERSION}")
-        endif()
-        set(UI_STAMP "${CMAKE_CURRENT_BINARY_DIR}/.ui-${UI_VERSION_TAG}.stamp")
+    if(CMAKE_HOST_WIN32)
+        set(LLAMA_UI_EMBED_EXE "${CMAKE_CURRENT_BINARY_DIR}/llama-ui-embed-host.exe")
+    else()
+        set(LLAMA_UI_EMBED_EXE "${CMAKE_CURRENT_BINARY_DIR}/llama-ui-embed-host")
+    endif()
 
-        string(REPLACE ";" "+" PUBLIC_ASSETS_JOINED "${PUBLIC_ASSETS}")
+    add_custom_command(
+        OUTPUT  "${LLAMA_UI_EMBED_EXE}"
+        COMMAND "${HOST_CXX_COMPILER}" -O2 -std=c++17
+                -o "${LLAMA_UI_EMBED_EXE}" "${CMAKE_CURRENT_SOURCE_DIR}/embed.cpp"
+        DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/embed.cpp"
+        COMMENT "Building llama-ui-embed (host)"
+        VERBATIM
+    )
 
-        add_custom_command(
-            OUTPUT ${UI_STAMP}
-            COMMAND ${CMAKE_COMMAND}
-                "-DSOURCE_DIR=${PROJECT_SOURCE_DIR}"
-                "-DPUBLIC_DIR=${PROJECT_SOURCE_DIR}/build/tools/ui/dist"
-                "-DHF_BUCKET=${LLAMA_UI_HF_BUCKET}"
-                "-DHF_VERSION=${HF_UI_VERSION}"
-                "-DHF_ENABLED=${LLAMA_USE_PREBUILT_UI}"
-                "-DASSETS=${PUBLIC_ASSETS_JOINED}"
-                "-DSTAMP_FILE=${UI_STAMP}"
-                "-DNPM_DIR=${PROJECT_SOURCE_DIR}/tools/ui"
-                -P ${PROJECT_SOURCE_DIR}/scripts/ui-download.cmake
-            COMMENT "Building/provisioning UI assets (npm build -> HF Bucket fallback)"
-        )
+    # phony target to tie it into the dependency graph
+    add_custom_target(llama-ui-embed DEPENDS "${LLAMA_UI_EMBED_EXE}")
+else()
+    add_executable(llama-ui-embed embed.cpp)
+    target_compile_features(llama-ui-embed PRIVATE cxx_std_17)
+    set_target_properties(llama-ui-embed PROPERTIES
+        RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
+    )
+    set(LLAMA_UI_EMBED_EXE "$<TARGET_FILE:llama-ui-embed>")
+endif()
 
-        set(UI_SOURCE "provisioned")
-        set(UI_SOURCE_DIR "${PROJECT_SOURCE_DIR}/build/tools/ui/dist")
-    endif()
+# Run the provisioning script every build so source changes in tools/ui/ are
+# always picked up. The script uses copy_if_different for ui.cpp/ui.h, so the
+# library only recompiles when contents actually change.
+add_custom_target(llama-ui-assets ALL
+    BYPRODUCTS ${UI_CPP} ${UI_H}
+    COMMAND ${CMAKE_COMMAND}
+        "-DUI_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR}"
+        "-DUI_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR}"
+        "-DLLAMA_SOURCE_DIR=${PROJECT_SOURCE_DIR}"
+        "-DHF_BUCKET=${LLAMA_UI_HF_BUCKET}"
+        "-DHF_VERSION=${HF_UI_VERSION}"
+        "-DHF_ENABLED=${LLAMA_USE_PREBUILT_UI}"
+        "-DBUILD_UI=${LLAMA_BUILD_UI}"
+        "-DLLAMA_UI_EMBED=${LLAMA_UI_EMBED_EXE}"
+        -P "${PROJECT_SOURCE_DIR}/scripts/ui-assets.cmake"
+    COMMENT "Provisioning UI assets"
+    VERBATIM
+)
 
-    # Process assets from the determined source
-    if(UI_SOURCE_DIR)
-        foreach(asset ${PUBLIC_ASSETS})
-            set(input "${UI_SOURCE_DIR}/${asset}")
-            set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
-            list(APPEND TARGET_SRCS ${output})
+add_dependencies(llama-ui-assets llama-ui-embed)
 
-            if(UI_SOURCE STREQUAL "local")
-                if(NOT EXISTS "${input}")
-                    message(FATAL_ERROR "UI asset not found: ${input}")
-                endif()
-                set(dependency "${input}")
-            else()
-                set(dependency "${UI_STAMP}")
-            endif()
+set_source_files_properties(${UI_CPP} ${UI_H} PROPERTIES GENERATED TRUE)
 
-            add_custom_command(
-                DEPENDS ${dependency}
-                OUTPUT "${output}"
-                COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
-            )
-            set_source_files_properties(${output} PROPERTIES GENERATED TRUE)
-        endforeach()
+add_library(${TARGET} STATIC ${UI_CPP} ${UI_H})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+add_dependencies(${TARGET} llama-ui-assets)
 
-        list(APPEND UI_COMPILE_DEFS
-            LLAMA_BUILD_UI
-            LLAMA_UI_DEFAULT_ENABLED=1
-        )
-        message(STATUS "UI: embedded with source: ${UI_SOURCE}")
-    else()
-        message(WARNING "UI: no source available. Neither local build (build/tools/ui/dist/) nor HF Bucket download succeeded.")
-        message(WARNING "UI: building server without embedded UI. Set LLAMA_BUILD_UI=OFF to suppress this warning.")
-        list(APPEND UI_COMPILE_DEFS LLAMA_UI_DEFAULT_ENABLED=0)
-    endif()
-else()
-    list(APPEND UI_COMPILE_DEFS LLAMA_UI_DEFAULT_ENABLED=0)
+if (BUILD_SHARED_LIBS)
+    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()
 
-# Build the static library
-add_library(${TARGET} STATIC ui.cpp)
-
 target_include_directories(${TARGET} PUBLIC
-    ${CMAKE_CURRENT_SOURCE_DIR}
     ${CMAKE_CURRENT_BINARY_DIR}
 )
-
-target_compile_definitions(${TARGET} PUBLIC ${UI_COMPILE_DEFS})
-
-if(TARGET_SRCS)
-    # List generated .hpp files as sources so CMake tracks them as build dependencies
-    target_sources(${TARGET} PRIVATE ${TARGET_SRCS})
-    set_source_files_properties(${TARGET_SRCS} PROPERTIES HEADER_FILE_ONLY TRUE)
-endif()
diff --git a/tools/ui/embed.cpp b/tools/ui/embed.cpp
new file mode 100644
index 00000000000..41227868e6b
--- /dev/null
+++ b/tools/ui/embed.cpp
@@ -0,0 +1,144 @@
+// llama-ui-embed: generate ui.cpp / ui.h that embed UI assets as C arrays.
+//
+// Usage:
+//   llama-ui-embed <out_cpp> <out_h> [<asset_name> <asset_path>]...
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <fstream>
+#include <string>
+#include <vector>
+
+static bool read_file(const std::string & path, std::vector<unsigned char> & out) {
+    std::ifstream f(path, std::ios::binary | std::ios::ate);
+    if (!f) {
+        fprintf(stderr, "embed: cannot open %s\n", path.c_str());
+        return false;
+    }
+    const auto sz = f.tellg();
+    if (sz < 0) {
+        return false;
+    }
+    f.seekg(0);
+    out.resize(static_cast<size_t>(sz));
+    if (sz > 0 && !f.read(reinterpret_cast<char *>(out.data()), sz)) {
+        return false;
+    }
+    return true;
+}
+
+static void append_bytes_hex(std::string & out, const std::vector<unsigned char> & bytes) {
+    static const char hex[] = "0123456789abcdef";
+    out.reserve(out.size() + bytes.size() * 5);
+    for (unsigned char b : bytes) {
+        out += '0';
+        out += 'x';
+        out += hex[b >> 4];
+        out += hex[b & 0xf];
+        out += ',';
+    }
+}
+
+static bool write_if_different(const std::string & path, const std::string & content) {
+    std::ifstream f(path, std::ios::binary | std::ios::ate);
+    if (f) {
+        const auto sz = f.tellg();
+        if (sz >= 0 && static_cast<size_t>(sz) == content.size()) {
+            std::string existing(static_cast<size_t>(sz), '\0');
+            f.seekg(0);
+            if (sz == 0 || f.read(existing.data(), sz)) {
+                if (existing == content) {
+                    return true;
+                }
+            }
+        }
+    }
+
+    std::ofstream out(path, std::ios::binary | std::ios::trunc);
+    if (!out) {
+        fprintf(stderr, "embed: cannot write %s\n", path.c_str());
+        return false;
+    }
+    if (!content.empty()) {
+        out.write(content.data(), static_cast<std::streamsize>(content.size()));
+    }
+    return out.good();
+}
+
+static std::string fmt(const char * pattern, ...) {
+    char tmp[512];
+    va_list ap;
+    va_start(ap, pattern);
+    const int n = vsnprintf(tmp, sizeof(tmp), pattern, ap);
+    va_end(ap);
+    return (n > 0) ? std::string(tmp, static_cast<size_t>(n)) : std::string();
+}
+
+int main(int argc, char ** argv) {
+    if (argc < 3 || ((argc - 3) % 2) != 0) {
+        fprintf(stderr, "usage: %s <out_cpp> <out_h> [<name> <path>]...\n", argv[0]);
+        return 1;
+    }
+
+    const std::string out_cpp = argv[1];
+    const std::string out_h   = argv[2];
+    const int n_assets = (argc - 3) / 2;
+
+    std::string h;
+    h += "#pragma once\n\n#include <stddef.h>\n\n";
+    if (n_assets > 0) {
+        h += "#define LLAMA_UI_HAS_ASSETS 1\n\n";
+    }
+    h +=
+        "struct llama_ui_asset {\n"
+        "    const char *          name;\n"
+        "    const unsigned char * data;\n"
+        "    size_t                size;\n"
+        "};\n\n"
+        "const llama_ui_asset * llama_ui_find_asset(const char * name);\n";
+
+    std::string cpp;
+    cpp += "#include \"ui.h\"\n\n#include <string.h>\n\n";
+
+    if (n_assets > 0) {
+        for (int i = 0; i < n_assets; i++) {
+            const char * path = argv[3 + i * 2 + 1];
+            std::vector<unsigned char> bytes;
+            if (!read_file(path, bytes)) {
+                return 1;
+            }
+            cpp += fmt("static const unsigned char asset_%d_data[] = {", i);
+            append_bytes_hex(cpp, bytes);
+            cpp += fmt("};\nstatic const size_t        asset_%d_size = %lu;\n\n",
+                       i, static_cast<unsigned long>(bytes.size()));
+        }
+
+        cpp += "static const llama_ui_asset g_assets[] = {\n";
+        for (int i = 0; i < n_assets; i++) {
+            const char * name = argv[3 + i * 2];
+            cpp += fmt("    { \"%s\", asset_%d_data, asset_%d_size },\n", name, i, i);
+        }
+        cpp += "};\n\n";
+
+        cpp +=
+            "const llama_ui_asset * llama_ui_find_asset(const char * name) {\n"
+            "    for (const auto & a : g_assets) {\n"
+            "        if (strcmp(a.name, name) == 0) {\n"
+            "            return &a;\n"
+            "        }\n"
+            "    }\n"
+            "    return nullptr;\n"
+            "}\n";
+    } else {
+        cpp +=
+            "const llama_ui_asset * llama_ui_find_asset(const char *) {\n"
+            "    return nullptr;\n"
+            "}\n";
+    }
+
+    bool ok = true;
+    ok = write_if_different(out_h,   h)   && ok;
+    ok = write_if_different(out_cpp, cpp) && ok;
+    return ok ? 0 : 1;
+}
diff --git a/tools/ui/scripts/dev.sh b/tools/ui/scripts/dev.sh
index 9256f255aec..7e1d3c15e5b 100644
--- a/tools/ui/scripts/dev.sh
+++ b/tools/ui/scripts/dev.sh
@@ -11,24 +11,28 @@
 
 cd ../../
 
+# Ensure node_modules are installed
+if [ ! -d "tools/ui/node_modules" ]; then
+    echo "📦 Installing npm dependencies..."
+    cd tools/ui && npm install && cd ../../
+fi
+
 # Check and install git hooks if missing
 check_and_install_hooks() {
     local hooks_missing=false
 
     # Check for required hooks
-    if [ ! -f ".git/hooks/pre-commit" ] || [ ! -f ".git/hooks/pre-push" ] || [ ! -f ".git/hooks/post-push" ]; then
+    if [ ! -f ".git/hooks/pre-commit" ] || [ ! -f ".git/hooks/pre-push" ]; then
         hooks_missing=true
     fi
 
     if [ "$hooks_missing" = true ]; then
         echo "🔧 Git hooks missing, installing them..."
-        cd tools/ui
-        if bash scripts/install-git-hooks.sh; then
+        if bash "$(dirname "$0")/git-hooks/install.sh"; then
             echo "✅ Git hooks installed successfully"
         else
             echo "⚠️  Failed to install git hooks, continuing anyway..."
         fi
-        cd ../../
     else
         echo "✅ Git hooks already installed"
     fi
diff --git a/tools/ui/scripts/git-hooks/install.sh b/tools/ui/scripts/git-hooks/install.sh
new file mode 100755
index 00000000000..1a17dcad6d3
--- /dev/null
+++ b/tools/ui/scripts/git-hooks/install.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+#
+# Install git hooks for llama-ui
+# Copies pre-commit and pre-push hooks into the repo's .git/hooks directory.
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../../../.." && pwd)"
+HOOKS_DIR="$REPO_ROOT/$(cd "$REPO_ROOT" && git rev-parse --git-path hooks)"
+
+# Verify package.json exists
+if [ ! -f "$REPO_ROOT/tools/ui/package.json" ]; then
+    echo "❌ package.json not found in tools/ui"
+    exit 1
+fi
+
+echo "Installing git hooks for llama-ui..."
+
+for hook in pre-commit pre-push; do
+    src="$SCRIPT_DIR/${hook}.sh"
+    dst="$HOOKS_DIR/$hook"
+
+    if cp "$src" "$dst" && chmod +x "$dst"; then
+        echo "  ✅ $hook"
+    else
+        echo "  ❌ Failed to install $hook"
+        exit 1
+    fi
+done
+
+echo ""
+echo "Pre-commit:  format (staged) + type-check"
+echo "Pre-push:    lint + test"
+echo ""
+echo "Hooks stash unstaged changes temporarily and restore them after."
+echo "Skip with:  git commit --no-verify / git push --no-verify"
diff --git a/tools/ui/scripts/git-hooks/pre-commit.sh b/tools/ui/scripts/git-hooks/pre-commit.sh
new file mode 100755
index 00000000000..1fa83efde59
--- /dev/null
+++ b/tools/ui/scripts/git-hooks/pre-commit.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+#
+# Pre-commit hook for llama-ui
+# Runs: format (staged files only) + type-check
+# Stashes unstaged changes temporarily and restores them after.
+
+# Only run when there are staged changes in tools/ui/
+if ! git diff --cached --name-only | grep -q "^tools/ui/"; then
+    exit 0
+fi
+
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT/tools/ui"
+
+# Check that node_modules exists
+if [ ! -d "node_modules" ]; then
+    echo "❌ node_modules not found. Run 'npm install' first."
+    exit 1
+fi
+
+# Stash unstaged changes in tools/ui/ so they don't interfere
+stash_name="pi-ui-precommit"
+git stash push --keep-index -u -m "$stash_name" -- tools/ui/ 2>/dev/null || true
+
+echo "Running pre-commit checks for llama-ui..."
+
+# Format only staged files
+staged_ui=$(git diff --cached --name-only -- tools/ui/)
+if [ -n "$staged_ui" ]; then
+    echo "$staged_ui" | xargs npx --no-install prettier --write
+    format_ok=$?
+    # Re-stage formatted files
+    git add tools/ui/
+else
+    format_ok=0
+fi
+
+# Type-check the clean tree
+npm run check
+check_ok=$?
+
+# Restore stashed changes
+if git stash list | grep -q "$stash_name"; then
+    git stash pop 2>/dev/null || true
+fi
+
+if [ $format_ok -ne 0 ]; then
+    echo "❌ Format failed"
+    exit 1
+fi
+if [ $check_ok -ne 0 ]; then
+    echo "❌ Type check failed"
+    exit 1
+fi
+
+echo "✅ Pre-commit checks passed"
+exit 0
diff --git a/tools/ui/scripts/git-hooks/pre-push.sh b/tools/ui/scripts/git-hooks/pre-push.sh
new file mode 100755
index 00000000000..953d3a22430
--- /dev/null
+++ b/tools/ui/scripts/git-hooks/pre-push.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+#
+# Pre-push hook for llama-ui
+# Runs: lint + test
+# Ignores unstaged changes (stashes them temporarily and restores after).
+
+needs_check=false
+
+# Read refs from stdin: local_ref local_sha remote_ref remote_sha
+while read local_ref local_sha remote_ref remote_sha; do
+    # New branch or force-push — always check
+    if [ "$local_sha" = "0000000000000000000000000000000000000000" ] || \
+       [ "$remote_sha" = "0000000000000000000000000000000000000000" ]; then
+        needs_check=true
+        continue
+    fi
+
+    # Check for changes in tools/ui/ between remote and local
+    if git diff --name-only "$remote_sha...$local_sha" -- tools/ui/ | grep -q .; then
+        needs_check=true
+    fi
+done
+
+if [ "$needs_check" = false ]; then
+    exit 0
+fi
+
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT/tools/ui"
+
+# Check that node_modules exists
+if [ ! -d "node_modules" ]; then
+    echo "❌ node_modules not found. Run 'npm install' first."
+    exit 1
+fi
+
+# Stash unstaged changes so they don't interfere with checks
+stash_name="pi-ui-prepush"
+git stash push -u -m "$stash_name" -- tools/ui/ 2>/dev/null || true
+
+echo "Running pre-push checks for llama-ui..."
+
+# Lint
+npm run lint
+lint_ok=$?
+
+# Test
+npm test
+test_ok=$?
+
+# Restore stashed changes
+if git stash list | grep -q "$stash_name"; then
+    git stash pop 2>/dev/null || true
+fi
+
+if [ $lint_ok -ne 0 ]; then
+    echo "❌ Lint failed"
+    exit 1
+fi
+if [ $test_ok -ne 0 ]; then
+    echo "❌ Tests failed"
+    exit 1
+fi
+
+echo "✅ Pre-push checks passed"
+exit 0
diff --git a/tools/ui/scripts/install-git-hooks.sh b/tools/ui/scripts/install-git-hooks.sh
deleted file mode 100755
index 213feb08dcf..00000000000
--- a/tools/ui/scripts/install-git-hooks.sh
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/bin/bash
-
-# Script to install pre-commit hook for llama-ui
-# Pre-commit: formats, checks, and builds the UI app
-
-REPO_ROOT=$(git rev-parse --show-toplevel)
-PRE_COMMIT_HOOK="$REPO_ROOT/.git/hooks/pre-commit"
-
-echo "Installing pre-commit hook for llama-ui..."
-
-# Create the pre-commit hook
-cat > "$PRE_COMMIT_HOOK" << 'EOF'
-#!/bin/bash
-
-# Check if there are any changes in the tools/ui directory
-if git diff --cached --name-only | grep -q "^tools/ui/"; then
-    REPO_ROOT=$(git rev-parse --show-toplevel)
-    cd "$REPO_ROOT/tools/ui"
-
-    # Check if package.json exists
-    if [ ! -f "package.json" ]; then
-        echo "Error: package.json not found in tools/ui"
-        exit 1
-    fi
-
-    echo "Formatting and checking llama-ui code..."
-
-    # Run the format command
-    npm run format
-    if [ $? -ne 0 ]; then
-        echo "Error: npm run format failed"
-        exit 1
-    fi
-
-    # Run the lint command
-    npm run lint
-    if [ $? -ne 0 ]; then
-        echo "Error: npm run lint failed"
-        exit 1
-    fi
-
-    # Run the check command
-    npm run check
-    if [ $? -ne 0 ]; then
-        echo "Error: npm run check failed"
-        exit 1
-    fi
-
-    echo "✅ llama-ui code formatted and checked successfully"
-
-    # Build the llama-ui
-    echo "Building llama-ui..."
-    npm run build
-    if [ $? -ne 0 ]; then
-        echo "❌ npm run build failed"
-        exit 1
-    fi
-
-    echo "✅ llama-ui built successfully"
-fi
-
-exit 0
-EOF
-
-# Make hook executable
-chmod +x "$PRE_COMMIT_HOOK"
-
-if [ $? -eq 0 ]; then
-    echo "✅ Git hook installed successfully!"
-    echo "   Pre-commit: $PRE_COMMIT_HOOK"
-    echo ""
-    echo "The hook will automatically:"
-    echo "  • Format, lint and check llama-ui code before commits"
-    echo "  • Build llama-ui"
-else
-    echo "❌ Failed to make hook executable"
-    exit 1
-fi
diff --git a/tools/ui/scripts/vite-plugin-llama-cpp-build.ts b/tools/ui/scripts/vite-plugin-llama-cpp-build.ts
index ddf6fa1e56e..01c714a241d 100644
--- a/tools/ui/scripts/vite-plugin-llama-cpp-build.ts
+++ b/tools/ui/scripts/vite-plugin-llama-cpp-build.ts
@@ -19,7 +19,7 @@ const GUIDE_FOR_FRONTEND = `
 -->
 `.trim();
 
-const OUTPUT_DIR = '../../build/tools/ui/dist';
+const OUTPUT_DIR = process.env.LLAMA_UI_OUT_DIR ?? './dist';
 
 export function llamaCppBuildPlugin(): Plugin {
 	return {
diff --git a/tools/ui/sources.cmake b/tools/ui/sources.cmake
new file mode 100644
index 00000000000..de9dbf78b79
--- /dev/null
+++ b/tools/ui/sources.cmake
@@ -0,0 +1,15 @@
+# Inputs used to decide whether the npm build output is up-to-date.
+
+set(UI_SOURCE_GLOBS
+    src/*
+    static/*
+)
+
+set(UI_SOURCE_FILES
+    package.json
+    package-lock.json
+    vite.config.ts
+    svelte.config.js
+    tsconfig.json
+    scripts/vite-plugin-llama-cpp-build.ts
+)
diff --git a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionsAdd.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionsAdd.svelte
index 54ddcf9b0a7..6a91bf90524 100644
--- a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionsAdd.svelte
+++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionsAdd.svelte
@@ -1,5 +1,5 @@
 <script lang="ts">
-	import { IsMobile } from '$lib/hooks/is-mobile.svelte';
+	import { isMobile } from '$lib/stores/viewport.svelte';
 	import ChatFormActionAddDropdown from './ChatFormActionAddDropdown.svelte';
 	import ChatFormActionAddSheet from './ChatFormActionAddSheet.svelte';
 	import ChatFormActionAddButton from './ChatFormActionAddButton.svelte';
@@ -31,8 +31,6 @@
 		onMcpSettingsClick,
 		onSystemPromptClick
 	}: Props = $props();
-
-	const isMobile = new IsMobile();
 </script>
 
 {#if isMobile.current}
diff --git a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte
index 29702060565..07f079f5b51 100644
--- a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte
+++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte
@@ -3,7 +3,7 @@
 	import { modelsStore, modelOptions, selectedModelId } from '$lib/stores/models.svelte';
 	import { isRouterMode, serverError } from '$lib/stores/server.svelte';
 	import { ModelsSelectorDropdown, ModelsSelectorSheet } from '$lib/components/app';
-	import { IsMobile } from '$lib/hooks/is-mobile.svelte';
+	import { isMobile } from '$lib/stores/viewport.svelte';
 	import { activeMessages } from '$lib/stores/conversations.svelte';
 
 	interface Props {
@@ -152,8 +152,6 @@
 	let selectorModelRef: ModelsSelectorDropdown | ModelsSelectorSheet | undefined =
 		$state(undefined);
 
-	let isMobile = new IsMobile();
-
 	export function open() {
 		selectorModelRef?.open();
 	}
diff --git a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte
index bd93a569ce6..e733a64a97a 100644
--- a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte
+++ b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte
@@ -1,8 +1,7 @@
 <script lang="ts">
-	import { Trash2, AlertTriangle, RefreshCw } from '@lucide/svelte';
+	import { Trash2 } from '@lucide/svelte';
 	import { afterNavigate } from '$app/navigation';
 	import { page } from '$app/state';
-	import { fadeInView } from '$lib/actions/fade-in-view.svelte';
 	import {
 		ChatScreenForm,
 		ChatMessages,
@@ -13,9 +12,9 @@
 		DialogFileUploadError,
 		DialogChatError,
 		ServerLoadingSplash,
-		DialogConfirmation
+		DialogConfirmation,
+		ChatScreenServerError
 	} from '$lib/components/app';
-	import * as Alert from '$lib/components/ui/alert';
 	import { setProcessingInfoContext } from '$lib/contexts';
 	import { ErrorDialogType } from '$lib/enums';
 	import { createAutoScrollController } from '$lib/hooks/use-auto-scroll.svelte';
@@ -35,11 +34,12 @@
 		activeConversation
 	} from '$lib/stores/conversations.svelte';
 	import { config } from '$lib/stores/settings.svelte';
-	import { serverLoading, serverError, serverStore, isRouterMode } from '$lib/stores/server.svelte';
+	import { serverLoading, serverError, isRouterMode } from '$lib/stores/server.svelte';
 	import { modelsStore, modelOptions, selectedModelId } from '$lib/stores/models.svelte';
 	import { isFileTypeSupported, filterFilesByModalities } from '$lib/utils';
 	import { parseFilesToMessageExtras, processFilesToChatUploaded } from '$lib/utils/browser-only';
 	import { onMount } from 'svelte';
+	import ChatScreenGreeting from './ChatScreenGreeting.svelte';
 
 	let { showCenteredEmpty = false } = $props();
 
@@ -68,6 +68,8 @@
 
 	let showEmptyFileDialog = $state(false);
 
+	let processingInfoVisible = $state(false);
+
 	let emptyFileNames = $state<string[]>([]);
 
 	let initialMessage = $state('');
@@ -175,6 +177,10 @@
 		showDeleteDialog = false;
 	}
 
+	function handleProcessingInfoVisibility(visible: boolean) {
+		processingInfoVisible = visible;
+	}
+
 	function handleDragEnter(event: DragEvent) {
 		event.preventDefault();
 
@@ -395,61 +401,32 @@
 			{#if !isEmpty}
 				<ChatMessages
 					messages={activeMessages()}
+					onMessagesReady={handleMessagesReady}
 					onUserAction={() => {
 						autoScroll.enable();
 						if (!autoScroll.userScrolledUp) {
 							autoScroll.scrollToBottom();
 						}
 					}}
-					onMessagesReady={handleMessagesReady}
 				/>
 			{/if}
 
 			<div
-				class="pointer-events-none {isEmpty
-					? 'absolute bottom-[calc(50dvh-7rem)]'
-					: 'sticky bottom-4'} right-4 left-4 mt-auto -mb-14 pt-16 transition-all duration-200"
+				class={[
+					'pointer-events-none sticky right-4 left-4 mt-auto transition-all duration-200',
+					isEmpty ? 'bottom-[calc(50dvh-7rem)]' : 'bottom-4 pt-24 md:pt-32'
+				]}
 			>
-				{#if isEmpty}
-					<div class="mb-8 px-4 text-center" use:fadeInView={{ duration: 300 }}>
-						<h1 class="mb-2 text-2xl font-semibold tracking-tight md:text-3xl">Hello there</h1>
-
-						<p class="text-muted-foreground md:text-lg">
-							{serverStore.props?.modalities?.audio
-								? 'Record audio, type a message '
-								: 'Type a message'} or upload files to get started
-						</p>
-					</div>
-				{/if}
-
-				<ChatScreenActionScrollDown container={chatScrollContainer} />
-
-				{#if page.params.id}
-					<ChatScreenProcessingInfo />
-				{/if}
-
-				{#if hasPropsError}
-					<div
-						class="pointer-events-auto mx-auto mb-4 max-w-[48rem] px-1"
-						use:fadeInView={{ y: 10, duration: 250 }}
-					>
-						<Alert.Root variant="destructive">
-							<AlertTriangle class="h-4 w-4" />
-							<Alert.Title class="flex items-center justify-between">
-								<span>Server unavailable</span>
-								<button
-									onclick={() => serverStore.fetch()}
-									disabled={isServerLoading}
-									class="flex items-center gap-1.5 rounded-lg bg-destructive/20 px-2 py-1 text-xs font-medium hover:bg-destructive/30 disabled:opacity-50"
-								>
-									<RefreshCw class="h-3 w-3 {isServerLoading ? 'animate-spin' : ''}" />
-									{isServerLoading ? 'Retrying...' : 'Retry'}
-								</button>
-							</Alert.Title>
-							<Alert.Description>{serverError()}</Alert.Description>
-						</Alert.Root>
-					</div>
-				{/if}
+				<ChatScreenGreeting {isEmpty} />
+
+				<ChatScreenActionScrollDown
+					container={chatScrollContainer}
+					hasProcessingInfoVisible={processingInfoVisible}
+				/>
+
+				<ChatScreenProcessingInfo onVisibilityChange={handleProcessingInfoVisibility} />
+
+				<ChatScreenServerError />
 
 				<div class="conversation-chat-form pointer-events-auto rounded-t-3xl">
 					<ChatScreenForm
diff --git a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte
index 3f3ee867710..c43bee3e3c3 100644
--- a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte
+++ b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte
@@ -2,10 +2,17 @@
 	import { ArrowDown } from '@lucide/svelte';
 	import { Button } from '$lib/components/ui/button';
 
-	let { container }: { container: HTMLDivElement | undefined } = $props();
+	interface Props {
+		container: HTMLDivElement | undefined;
+		hasProcessingInfoVisible: boolean;
+	}
+
+	let { container, hasProcessingInfoVisible }: Props = $props();
 
 	let show = $state(false);
 
+	let buttonBottom = $derived(hasProcessingInfoVisible ? '2rem' : '0');
+
 	function checkVisibility() {
 		if (!container) return;
 		const { scrollTop, scrollHeight, clientHeight } = container;
@@ -34,14 +41,20 @@
 	});
 </script>
 
-<div class="pointer-events-auto relative z-50 mx-auto mb-4 flex max-w-[48rem] justify-center">
+<div
+	class="pointer-events-{show
+		? 'auto'
+		: 'none'} relative z-50 mx-auto mb-4 flex max-w-[48rem] justify-center"
+>
 	<Button
 		onclick={scrollToBottom}
 		variant="secondary"
 		size="icon"
-		class="h-10 w-10 rounded-full bg-background/80 shadow-lg backdrop-blur-sm transition-all duration-200 hover:bg-muted/80"
+		class="pointer-events-all absolute h-10 w-10 rounded-full bg-background/80 shadow-lg backdrop-blur-sm transition-all duration-200 hover:bg-muted/80"
+		style="bottom: {buttonBottom}; transform: translateY({show ? '0' : '2rem'}); opacity: {show
+			? 1
+			: 0};"
 		aria-label="Scroll to bottom"
-		style="transform: translateY({show ? '0' : '20px'}); opacity: {show ? 1 : 0};"
 	>
 		<ArrowDown class="h-4 w-4" />
 	</Button>
diff --git a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenGreeting.svelte b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenGreeting.svelte
new file mode 100644
index 00000000000..141d4f4e43c
--- /dev/null
+++ b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenGreeting.svelte
@@ -0,0 +1,25 @@
+<script lang="ts">
+	import { fadeInView } from '$lib/actions/fade-in-view.svelte';
+	import { serverStore } from '$lib/stores/server.svelte';
+
+	interface Props {
+		isEmpty: boolean;
+	}
+
+	let { isEmpty = false }: Props = $props();
+</script>
+
+<div
+	class={[
+		'pointer-events-none mb-4 hidden px-4 text-center',
+		isEmpty && 'pointer-events-auto block!'
+	]}
+	use:fadeInView={{ duration: 300 }}
+>
+	<h1 class="mb-2 text-2xl font-semibold tracking-tight md:text-3xl">Hello there</h1>
+
+	<p class="text-muted-foreground md:text-lg">
+		{serverStore.props?.modalities?.audio ? 'Record audio, type a message ' : 'Type a message'} or upload
+		files to get started
+	</p>
+</div>
diff --git a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenProcessingInfo.svelte b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenProcessingInfo.svelte
index b5979db13c2..f38f3519c34 100644
--- a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenProcessingInfo.svelte
+++ b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenProcessingInfo.svelte
@@ -6,6 +6,7 @@
 	import { activeMessages, activeConversation } from '$lib/stores/conversations.svelte';
 	import { config } from '$lib/stores/settings.svelte';
 	import { getProcessingInfoContext } from '$lib/contexts';
+	import { page } from '$app/state';
 
 	const processingState = useProcessingState();
 	const processingInfoCtx = getProcessingInfoContext();
@@ -16,6 +17,14 @@
 	let isStreaming = $derived(isChatStreaming());
 	let processingDetails = $derived(processingState.getTechnicalDetails());
 
+	let processingVisible = $derived(processingDetails.length > 0);
+
+	let { onVisibilityChange }: { onVisibilityChange?: (visible: boolean) => void } = $props();
+
+	$effect(() => {
+		onVisibilityChange?.(processingVisible);
+	});
+
 	$effect(() => {
 		const conversation = activeConversation();
 
@@ -60,9 +69,12 @@
 </script>
 
 <div
-	class={['chat-processing-info-container pointer-events-none', showProcessingInfo && 'visible']}
+	class={[
+		'chat-processing-info-container pointer-events-none relative',
+		page.params.id && showProcessingInfo && 'visible'
+	]}
 >
-	<div class="chat-processing-info-content">
+	<div class="chat-processing-info-content absolute bottom-4 left-1/2 -translate-x-1/2">
 		{#each processingDetails as detail (detail)}
 			<span class="chat-processing-info-detail pointer-events-auto backdrop-blur-sm">{detail}</span>
 		{/each}
diff --git a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenServerError.svelte b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenServerError.svelte
new file mode 100644
index 00000000000..2a998dbebfa
--- /dev/null
+++ b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenServerError.svelte
@@ -0,0 +1,34 @@
+<script lang="ts">
+	import { AlertTriangle, RefreshCw } from '@lucide/svelte';
+	import { fadeInView } from '$lib/actions/fade-in-view.svelte';
+	import * as Alert from '$lib/components/ui/alert';
+	import { serverError, serverLoading, serverStore } from '$lib/stores/server.svelte';
+
+	let hasError = $derived(!!serverError());
+</script>
+
+{#if hasError}
+	<div
+		class="pointer-events-auto mx-auto mb-4 max-w-[48rem] px-1"
+		use:fadeInView={{ y: 10, duration: 250 }}
+	>
+		<Alert.Root variant="destructive">
+			<AlertTriangle class="h-4 w-4" />
+
+			<Alert.Title class="flex items-center justify-between">
+				<span>Server unavailable</span>
+
+				<button
+					onclick={() => serverStore.fetch()}
+					disabled={serverLoading()}
+					class="flex items-center gap-1.5 rounded-lg bg-destructive/20 px-2 py-1 text-xs font-medium hover:bg-destructive/30 disabled:opacity-50"
+				>
+					<RefreshCw class="h-3 w-3 {serverLoading() ? 'animate-spin' : ''}" />
+					{serverLoading() ? 'Retrying...' : 'Retry'}
+				</button>
+			</Alert.Title>
+
+			<Alert.Description>{serverError()}</Alert.Description>
+		</Alert.Root>
+	</div>
+{/if}
diff --git a/tools/ui/src/lib/components/app/chat/index.ts b/tools/ui/src/lib/components/app/chat/index.ts
index 9c7ce864e21..be553596024 100644
--- a/tools/ui/src/lib/components/app/chat/index.ts
+++ b/tools/ui/src/lib/components/app/chat/index.ts
@@ -674,3 +674,10 @@ export { default as ChatScreenProcessingInfo } from './ChatScreen/ChatScreenProc
  * Takes the chat container element as a prop to manage scroll state internally.
  */
 export { default as ChatScreenActionScrollDown } from './ChatScreen/ChatScreenActionScrollDown.svelte';
+
+/**
+ * Server error alert displayed when the server is unreachable.
+ * Shows the error message with a retry button.
+ * Rendered inside ChatScreen when `serverError` store has a value.
+ */
+export { default as ChatScreenServerError } from './ChatScreen/ChatScreenServerError.svelte';
diff --git a/tools/ui/src/lib/components/ui/sidebar/context.svelte.ts b/tools/ui/src/lib/components/ui/sidebar/context.svelte.ts
index 9d49ee1f0b3..2fa5cc25dcb 100644
--- a/tools/ui/src/lib/components/ui/sidebar/context.svelte.ts
+++ b/tools/ui/src/lib/components/ui/sidebar/context.svelte.ts
@@ -1,4 +1,4 @@
-import { IsMobile } from '$lib/hooks/is-mobile.svelte.js';
+import { isMobile } from '$lib/stores/viewport.svelte.js';
 import { getContext, setContext } from 'svelte';
 import { SIDEBAR_KEYBOARD_SHORTCUT, SIDEBAR_MIN_WIDTH } from './constants.js';
 
@@ -27,19 +27,17 @@ class SidebarState {
 	sidebarWidth = $state(SIDEBAR_MIN_WIDTH);
 	isResizing = $state(false);
 	setOpen: SidebarStateProps['setOpen'];
-	#isMobile: IsMobile;
 	state = $derived.by(() => (this.open ? 'expanded' : 'collapsed'));
 
 	constructor(props: SidebarStateProps) {
 		this.setOpen = props.setOpen;
-		this.#isMobile = new IsMobile();
 		this.props = props;
 	}
 
 	// Convenience getter for checking if the sidebar is mobile
 	// without this, we would need to use `sidebar.isMobile.current` everywhere
 	get isMobile() {
-		return this.#isMobile.current;
+		return isMobile.current;
 	}
 
 	// Event handler to apply to the `<svelte:window>`
diff --git a/tools/ui/src/lib/constants/image-size.ts b/tools/ui/src/lib/constants/image-size.ts
new file mode 100644
index 00000000000..bcc1da08b19
--- /dev/null
+++ b/tools/ui/src/lib/constants/image-size.ts
@@ -0,0 +1 @@
+export const MEGAPIXELS_TO_PIXELS = 1_000_000;
diff --git a/tools/ui/src/lib/constants/settings-keys.ts b/tools/ui/src/lib/constants/settings-keys.ts
index 92a57f88acf..53243992fa2 100644
--- a/tools/ui/src/lib/constants/settings-keys.ts
+++ b/tools/ui/src/lib/constants/settings-keys.ts
@@ -18,6 +18,7 @@ export const SETTINGS_KEYS = {
 	TITLE_GENERATION_USE_FIRST_LINE: 'titleGenerationUseFirstLine',
 	TITLE_GENERATION_USE_LLM: 'titleGenerationUseLLM',
 	TITLE_GENERATION_PROMPT: 'titleGenerationPrompt',
+	MAX_IMAGE_RESOLUTION: 'maxImageMPixels',
 	// Display
 	SHOW_MESSAGE_STATS: 'showMessageStats',
 	SHOW_THOUGHT_IN_PROGRESS: 'showThoughtInProgress',
diff --git a/tools/ui/src/lib/constants/settings-registry.ts b/tools/ui/src/lib/constants/settings-registry.ts
index 93b3cd5edb5..efef18fdeb6 100644
--- a/tools/ui/src/lib/constants/settings-registry.ts
+++ b/tools/ui/src/lib/constants/settings-registry.ts
@@ -193,6 +193,14 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 				defaultValue: TITLE_GENERATION.DEFAULT_PROMPT,
 				type: SettingsFieldType.TEXTAREA,
 				section: SETTINGS_SECTION_SLUGS.GENERAL
+			},
+			{
+				key: SETTINGS_KEYS.MAX_IMAGE_RESOLUTION,
+				label: 'Maximum image resolution (megapixels)',
+				help: 'Images larger than this will be resized before sending to server. Set to 0 to disable.',
+				defaultValue: 0,
+				type: SettingsFieldType.INPUT,
+				section: SETTINGS_SECTION_SLUGS.GENERAL
 			}
 		]
 	},
diff --git a/tools/ui/src/lib/constants/uri-template.ts b/tools/ui/src/lib/constants/uri-template.ts
index dc834aca27a..9b44d6ed375 100644
--- a/tools/ui/src/lib/constants/uri-template.ts
+++ b/tools/ui/src/lib/constants/uri-template.ts
@@ -55,3 +55,6 @@ export const VARIABLE_PREFIX_MODIFIER_REGEX = /:[\d]+$/;
 
 /** Regex to strip one or more leading slashes */
 export const LEADING_SLASHES_REGEX = /^\/+/;
+
+/** Regex to match base64-encoded image URIs (format: "data:image/[media type];base64,[data]")*/
+export const BASE64_IMAGE_URI_REGEX = /^data:(image\/[a-z0-9.\-+]+);base64/;
diff --git a/tools/ui/src/lib/enums/agentic.enums.ts b/tools/ui/src/lib/enums/agentic.enums.ts
index b96d244cd17..9ad7b4f1a87 100644
--- a/tools/ui/src/lib/enums/agentic.enums.ts
+++ b/tools/ui/src/lib/enums/agentic.enums.ts
@@ -16,3 +16,12 @@ export enum AgenticSectionType {
 	REASONING = 'reasoning',
 	REASONING_PENDING = 'reasoning_pending'
 }
+
+/**
+ * How a Continue click on an assistant message resumes generation.
+ */
+export enum ContinueIntentKind {
+	APPEND_TEXT = 'append_text',
+	RERUN_TURN = 'rerun_turn',
+	NEXT_TURN = 'next_turn'
+}
diff --git a/tools/ui/src/lib/enums/files.enums.ts b/tools/ui/src/lib/enums/files.enums.ts
index 5aef3955ebe..2f583d52eae 100644
--- a/tools/ui/src/lib/enums/files.enums.ts
+++ b/tools/ui/src/lib/enums/files.enums.ts
@@ -183,6 +183,10 @@ export enum MimeTypeAudio {
 	MP3 = 'audio/mp3',
 	MP4 = 'audio/mp4',
 	WAV = 'audio/wav',
+	WAVE = 'audio/wave',
+	X_WAV = 'audio/x-wav',
+	X_WAVE = 'audio/x-wave',
+	X_PN_WAV = 'audio/x-pn-wav',
 	WEBM = 'audio/webm',
 	WEBM_OPUS = 'audio/webm;codecs=opus'
 }
diff --git a/tools/ui/src/lib/enums/index.ts b/tools/ui/src/lib/enums/index.ts
index a17cca1d8e1..b80b5b61ed8 100644
--- a/tools/ui/src/lib/enums/index.ts
+++ b/tools/ui/src/lib/enums/index.ts
@@ -6,7 +6,7 @@ export {
 	AttachmentItemVisibleWhen
 } from './attachment.enums';
 
-export { AgenticSectionType, ToolCallType } from './agentic.enums';
+export { AgenticSectionType, ContinueIntentKind, ToolCallType } from './agentic.enums';
 
 export {
 	ChatMessageStatsView,
diff --git a/tools/ui/src/lib/hooks/is-mobile.svelte.ts b/tools/ui/src/lib/hooks/is-mobile.svelte.ts
deleted file mode 100644
index 6454fc5b58a..00000000000
--- a/tools/ui/src/lib/hooks/is-mobile.svelte.ts
+++ /dev/null
@@ -1,8 +0,0 @@
-import { DEFAULT_MOBILE_BREAKPOINT } from '$lib/constants';
-import { MediaQuery } from 'svelte/reactivity';
-
-export class IsMobile extends MediaQuery {
-	constructor(breakpoint: number = DEFAULT_MOBILE_BREAKPOINT) {
-		super(`max-width: ${breakpoint - 1}px`);
-	}
-}
diff --git a/tools/ui/src/lib/services/chat.service.ts b/tools/ui/src/lib/services/chat.service.ts
index 3e405e9e689..3c9ca74796d 100644
--- a/tools/ui/src/lib/services/chat.service.ts
+++ b/tools/ui/src/lib/services/chat.service.ts
@@ -5,12 +5,15 @@ import {
 	ATTACHMENT_LABEL_PDF_FILE,
 	ATTACHMENT_LABEL_MCP_PROMPT,
 	ATTACHMENT_LABEL_MCP_RESOURCE,
-	LEGACY_AGENTIC_REGEX
+	LEGACY_AGENTIC_REGEX,
+	SETTINGS_KEYS
 } from '$lib/constants';
 import {
 	AttachmentType,
 	ContentPartType,
+	FileTypeAudio,
 	MessageRole,
+	MimeTypeAudio,
 	ReasoningFormat,
 	UrlProtocol
 } from '$lib/enums';
@@ -19,8 +22,31 @@ import type {
 	ApiChatMessageData,
 	ApiChatCompletionToolCall
 } from '$lib/types/api';
-import type { DatabaseMessageExtraMcpPrompt, DatabaseMessageExtraMcpResource } from '$lib/types';
+import type {
+	AudioInputFormat,
+	DatabaseMessageExtraMcpPrompt,
+	DatabaseMessageExtraMcpResource
+} from '$lib/types';
 import { modelsStore } from '$lib/stores/models.svelte';
+import { settingsStore } from '../stores/settings.svelte';
+import { capImageDataURLSize } from '../utils/cap-img-size';
+import { MEGAPIXELS_TO_PIXELS } from '$lib/constants/image-size';
+
+function getAudioInputFormat(mimeType: string): AudioInputFormat {
+	const normalizedMimeType = mimeType.trim().toLowerCase();
+
+	if (
+		normalizedMimeType === MimeTypeAudio.WAV ||
+		normalizedMimeType === MimeTypeAudio.WAVE ||
+		normalizedMimeType === MimeTypeAudio.X_WAV ||
+		normalizedMimeType === MimeTypeAudio.X_WAVE ||
+		normalizedMimeType === MimeTypeAudio.X_PN_WAV
+	) {
+		return FileTypeAudio.WAV;
+	}
+
+	return FileTypeAudio.MP3;
+}
 
 export class ChatService {
 	/**
@@ -134,26 +160,28 @@ export class ChatService {
 			continueFinalMessage
 		} = options;
 
-		const normalizedMessages: ApiChatMessageData[] = messages
-			.map((msg) => {
-				if ('id' in msg && 'convId' in msg && 'timestamp' in msg) {
-					const dbMsg = msg as DatabaseMessage & { extra?: DatabaseMessageExtra[] };
-
-					return ChatService.convertDbMessageToApiChatMessageData(dbMsg);
-				} else {
-					return msg as ApiChatMessageData;
-				}
-			})
-			.filter((msg) => {
-				// Filter out empty system messages
-				if (msg.role === MessageRole.SYSTEM) {
-					const content = typeof msg.content === 'string' ? msg.content : '';
+		const normalizedMessages: ApiChatMessageData[] = (
+			await Promise.all(
+				messages.map((msg) => {
+					if ('id' in msg && 'convId' in msg && 'timestamp' in msg) {
+						const dbMsg = msg as DatabaseMessage & { extra?: DatabaseMessageExtra[] };
 
-					return content.trim().length > 0;
-				}
+						return ChatService.convertDbMessageToApiChatMessageData(dbMsg);
+					} else {
+						return msg as ApiChatMessageData;
+					}
+				})
+			)
+		).filter((msg: { role: ChatRole; content: string | ApiChatMessageContentPart[] }) => {
+			// Filter out empty system messages
+			if (msg.role === MessageRole.SYSTEM) {
+				const content = typeof msg.content === 'string' ? msg.content : '';
+
+				return content.trim().length > 0;
+			}
 
-				return true;
-			});
+			return true;
+		});
 
 		// Filter out image attachments if the model doesn't support vision
 		if (options.model && !modelsStore.modelSupportsVision(options.model)) {
@@ -382,25 +410,27 @@ export class ChatService {
 		excludeReasoning?: boolean,
 		signal?: AbortSignal
 	): Promise<void> {
-		const normalizedMessages: ApiChatMessageData[] = messages
-			.map((msg) => {
-				if ('id' in msg && 'convId' in msg && 'timestamp' in msg) {
-					return ChatService.convertDbMessageToApiChatMessageData(
-						msg as DatabaseMessage & { extra?: DatabaseMessageExtra[] }
-					);
-				}
+		const normalizedMessages: ApiChatMessageData[] = (
+			await Promise.all(
+				messages.map((msg) => {
+					if ('id' in msg && 'convId' in msg && 'timestamp' in msg) {
+						return ChatService.convertDbMessageToApiChatMessageData(
+							msg as DatabaseMessage & { extra?: DatabaseMessageExtra[] }
+						);
+					}
 
-				return msg as ApiChatMessageData;
-			})
-			.filter((msg) => {
-				if (msg.role === MessageRole.SYSTEM) {
-					const content = typeof msg.content === 'string' ? msg.content : '';
+					return msg as ApiChatMessageData;
+				})
+			)
+		).filter((msg: { role: ChatRole; content: string | ApiChatMessageContentPart[] }) => {
+			if (msg.role === MessageRole.SYSTEM) {
+				const content = typeof msg.content === 'string' ? msg.content : '';
 
-					return content.trim().length > 0;
-				}
+				return content.trim().length > 0;
+			}
 
-				return true;
-			});
+			return true;
+		});
 
 		const requestBody: Record<string, unknown> = {
 			messages: normalizedMessages.map((msg: ApiChatMessageData) => {
@@ -783,9 +813,9 @@ export class ChatService {
 	 * @returns {ApiChatMessageData} object formatted for the chat completion API
 	 * @static
 	 */
-	static convertDbMessageToApiChatMessageData(
+	static async convertDbMessageToApiChatMessageData(
 		message: DatabaseMessage & { extra?: DatabaseMessageExtra[] }
-	): ApiChatMessageData {
+	): Promise<ApiChatMessageData> {
 		// Handle tool result messages (role: 'tool')
 		if (message.role === MessageRole.TOOL && message.toolCallId) {
 			return {
@@ -824,26 +854,6 @@ export class ChatService {
 
 		const contentParts: ApiChatMessageContentPart[] = [];
 
-		if (message.content) {
-			contentParts.push({
-				type: ContentPartType.TEXT,
-				text: message.content
-			});
-		}
-
-		// Include images from all messages
-		const imageFiles = message.extra.filter(
-			(extra: DatabaseMessageExtra): extra is DatabaseMessageExtraImageFile =>
-				extra.type === AttachmentType.IMAGE
-		);
-
-		for (const image of imageFiles) {
-			contentParts.push({
-				type: ContentPartType.IMAGE_URL,
-				image_url: { url: image.base64Url }
-			});
-		}
-
 		const textFiles = message.extra.filter(
 			(extra: DatabaseMessageExtra): extra is DatabaseMessageExtraTextFile =>
 				extra.type === AttachmentType.TEXT
@@ -869,6 +879,23 @@ export class ChatService {
 			});
 		}
 
+		const imageFiles = message.extra.filter(
+			(extra: DatabaseMessageExtra): extra is DatabaseMessageExtraImageFile =>
+				extra.type === AttachmentType.IMAGE
+		);
+
+		for (const image of imageFiles) {
+			const maxImageResolution = settingsStore.getConfig(SETTINGS_KEYS.MAX_IMAGE_RESOLUTION);
+			let base64Url = image.base64Url;
+			if (maxImageResolution > 1 / MEGAPIXELS_TO_PIXELS) {
+				base64Url = await capImageDataURLSize(image.base64Url, maxImageResolution);
+			}
+			contentParts.push({
+				type: ContentPartType.IMAGE_URL,
+				image_url: { url: base64Url }
+			});
+		}
+
 		const audioFiles = message.extra.filter(
 			(extra: DatabaseMessageExtra): extra is DatabaseMessageExtraAudioFile =>
 				extra.type === AttachmentType.AUDIO
@@ -879,11 +906,18 @@ export class ChatService {
 				type: ContentPartType.INPUT_AUDIO,
 				input_audio: {
 					data: audio.base64Data,
-					format: audio.mimeType.includes('wav') ? 'wav' : 'mp3'
+					format: getAudioInputFormat(audio.mimeType)
 				}
 			});
 		}
 
+		if (message.content) {
+			contentParts.push({
+				type: ContentPartType.TEXT,
+				text: message.content
+			});
+		}
+
 		const videoFiles = message.extra.filter(
 			(extra: DatabaseMessageExtra): extra is DatabaseMessageExtraVideoFile =>
 				extra.type === AttachmentType.VIDEO
diff --git a/tools/ui/src/lib/stores/agentic.svelte.ts b/tools/ui/src/lib/stores/agentic.svelte.ts
index e8c0cc52307..4866995b4a3 100644
--- a/tools/ui/src/lib/stores/agentic.svelte.ts
+++ b/tools/ui/src/lib/stores/agentic.svelte.ts
@@ -416,21 +416,23 @@ class AgenticStore {
 
 		console.log(`[AgenticStore] Starting agentic flow with ${tools.length} tools`);
 
-		const normalizedMessages: ApiChatMessageData[] = messages
-			.map((msg) => {
-				if ('id' in msg && 'convId' in msg && 'timestamp' in msg)
-					return ChatService.convertDbMessageToApiChatMessageData(
-						msg as DatabaseMessage & { extra?: DatabaseMessageExtra[] }
-					);
-				return msg as ApiChatMessageData;
-			})
-			.filter((msg) => {
-				if (msg.role === MessageRole.SYSTEM) {
-					const content = typeof msg.content === 'string' ? msg.content : '';
-					return content.trim().length > 0;
-				}
-				return true;
-			});
+		const normalizedMessages: ApiChatMessageData[] = (
+			await Promise.all(
+				messages.map((msg) => {
+					if ('id' in msg && 'convId' in msg && 'timestamp' in msg)
+						return ChatService.convertDbMessageToApiChatMessageData(
+							msg as DatabaseMessage & { extra?: DatabaseMessageExtra[] }
+						);
+					return msg as ApiChatMessageData;
+				})
+			)
+		).filter((msg: { role: ChatRole; content: string | ApiChatMessageContentPart[] }) => {
+			if (msg.role === MessageRole.SYSTEM) {
+				const content = typeof msg.content === 'string' ? msg.content : '';
+				return content.trim().length > 0;
+			}
+			return true;
+		});
 
 		this.updateSession(conversationId, {
 			isRunning: true,
diff --git a/tools/ui/src/lib/stores/chat.svelte.ts b/tools/ui/src/lib/stores/chat.svelte.ts
index 61ea4c89296..5b264482602 100644
--- a/tools/ui/src/lib/stores/chat.svelte.ts
+++ b/tools/ui/src/lib/stores/chat.svelte.ts
@@ -33,6 +33,7 @@ import {
 	isAbortError,
 	generateConversationTitle
 } from '$lib/utils';
+import { classifyContinueIntent } from '$lib/utils/agentic';
 import {
 	MAX_INACTIVE_CONVERSATION_STATES,
 	INACTIVE_CONVERSATION_STATE_MAX_AGE_MS,
@@ -51,7 +52,7 @@ import type {
 	DatabaseMessage,
 	DatabaseMessageExtra
 } from '$lib/types';
-import { ErrorDialogType, MessageRole, MessageType } from '$lib/enums';
+import { ContinueIntentKind, ErrorDialogType, MessageRole, MessageType } from '$lib/enums';
 
 interface ConversationStateEntry {
 	lastAccessed: number;
@@ -1259,6 +1260,57 @@ class ChatStore {
 		}
 	}
 
+	/**
+	 * Open a fresh assistant turn anchored at the last tool result of a resolved
+	 * agentic round and let streamChatCompletion route through runAgenticFlow.
+	 * Used by continueAssistantMessage when classifyContinueIntent returns
+	 * next_turn, meaning the target assistant already has its tool_calls paired
+	 * with trailing tool results and the next thing to generate is a brand new
+	 * turn rather than a token level continuation.
+	 */
+	private async continueAsNextAgenticTurn(anchorIndex: number): Promise<void> {
+		const activeConv = conversationsStore.activeConversation;
+		if (!activeConv) return;
+		const anchor = conversationsStore.activeMessages[anchorIndex];
+		if (!anchor) return;
+		this.cancelPreEncode();
+		this.setChatLoading(activeConv.id, true);
+		this.clearChatStreaming(activeConv.id);
+		try {
+			const allMessages = await conversationsStore.getConversationMessages(activeConv.id);
+			const anchorMessage = findMessageById(allMessages, anchor.id);
+			if (!anchorMessage) {
+				this.setChatLoading(activeConv.id, false);
+				return;
+			}
+			const newAssistantMessage = await DatabaseService.createMessageBranch(
+				{
+					convId: activeConv.id,
+					type: MessageType.TEXT,
+					timestamp: Date.now(),
+					role: MessageRole.ASSISTANT,
+					content: '',
+					toolCalls: '',
+					children: [],
+					model: null
+				},
+				anchorMessage.id
+			);
+			await conversationsStore.updateCurrentNode(newAssistantMessage.id);
+			conversationsStore.updateConversationTimestamp();
+			await conversationsStore.refreshActiveMessages();
+			const conversationPath = filterByLeafNodeId(
+				allMessages,
+				anchorMessage.id,
+				false
+			) as DatabaseMessage[];
+			await this.streamChatCompletion(conversationPath, newAssistantMessage);
+		} catch (error) {
+			if (!isAbortError(error)) console.error('Failed to continue agentic turn:', error);
+			this.setChatLoading(activeConv.id, false);
+		}
+	}
+
 	async continueAssistantMessage(messageId: string): Promise<void> {
 		const activeConv = conversationsStore.activeConversation;
 		if (!activeConv || this.isChatLoadingInternal(activeConv.id)) return;
@@ -1268,6 +1320,18 @@ class ChatStore {
 
 		const { message: msg, index: idx } = result;
 
+		// Decide which resume path applies. tool_calls without tool results can
+		// not be resumed mid sequence by continue_final_message, branch instead.
+		// tool_calls already paired with tool results need a fresh next turn,
+		// not a token level continuation of the target assistant.
+		const intent = classifyContinueIntent(conversationsStore.activeMessages, idx);
+		if (intent.kind === ContinueIntentKind.RERUN_TURN) {
+			return this.regenerateMessageWithBranching(messageId);
+		}
+		if (intent.kind === ContinueIntentKind.NEXT_TURN) {
+			return this.continueAsNextAgenticTurn(intent.truncateAfter);
+		}
+
 		try {
 			this.showErrorDialog(null);
 			this.setChatLoading(activeConv.id, true);
@@ -1283,15 +1347,11 @@ class ChatStore {
 
 			const originalContent = dbMessage.content;
 			const originalReasoning = dbMessage.reasoningContent || '';
-			const conversationContext = conversationsStore.activeMessages.slice(0, idx);
-			const contextWithContinue = [
-				...conversationContext,
-				{
-					role: MessageRole.ASSISTANT as const,
-					content: originalContent,
-					reasoning_content: originalReasoning || undefined
-				}
-			];
+			// Hand the persisted DatabaseMessage straight to sendMessage so its
+			// internal converter preserves tool_calls and extras when present.
+			// Reconstructing a bare {role, content} here would drop those fields
+			// and break continue_final_message for messages with tool calls.
+			const contextWithContinue = conversationsStore.activeMessages.slice(0, idx + 1);
 
 			let appendedContent = '';
 			let appendedReasoning = '';
diff --git a/tools/ui/src/lib/stores/settings.svelte.ts b/tools/ui/src/lib/stores/settings.svelte.ts
index 58eea1ee6d3..8d3c711b82e 100644
--- a/tools/ui/src/lib/stores/settings.svelte.ts
+++ b/tools/ui/src/lib/stores/settings.svelte.ts
@@ -41,8 +41,7 @@ import {
 	SETTINGS_KEYS,
 	USER_OVERRIDES_LOCALSTORAGE_KEY
 } from '$lib/constants';
-
-import { IsMobile } from '$lib/hooks/is-mobile.svelte';
+import { isMobile } from '$lib/stores/viewport.svelte';
 import { ParameterSyncService } from '$lib/services/parameter-sync.service';
 import { serverStore } from '$lib/stores/server.svelte';
 import {
@@ -132,7 +131,7 @@ class SettingsStore {
 
 			// Default sendOnEnter to false on mobile when the user has no saved preference
 			if (!(SETTINGS_KEYS.SEND_ON_ENTER in savedVal)) {
-				if (new IsMobile().current) {
+				if (isMobile.current) {
 					this.config[SETTINGS_KEYS.SEND_ON_ENTER] = false;
 				}
 			}
diff --git a/tools/ui/src/lib/stores/viewport.svelte.ts b/tools/ui/src/lib/stores/viewport.svelte.ts
new file mode 100644
index 00000000000..dac241a0124
--- /dev/null
+++ b/tools/ui/src/lib/stores/viewport.svelte.ts
@@ -0,0 +1,9 @@
+import { browser } from '$app/environment';
+import { DEFAULT_MOBILE_BREAKPOINT } from '$lib/constants/viewport';
+import { MediaQuery } from 'svelte/reactivity';
+
+export const viewport = $state({
+	width: browser ? window.innerWidth : 0
+});
+
+export const isMobile = new MediaQuery(`max-width: ${DEFAULT_MOBILE_BREAKPOINT - 1}px`);
diff --git a/tools/ui/src/lib/types/api.d.ts b/tools/ui/src/lib/types/api.d.ts
index 5f0a38dd3c3..c7803518019 100644
--- a/tools/ui/src/lib/types/api.d.ts
+++ b/tools/ui/src/lib/types/api.d.ts
@@ -1,6 +1,8 @@
-import type { ContentPartType, ServerModelStatus, ServerRole } from '$lib/enums';
+import type { ContentPartType, FileTypeAudio, ServerModelStatus, ServerRole } from '$lib/enums';
 import type { ChatMessagePromptProgress, ChatRole } from './chat';
 
+export type AudioInputFormat = FileTypeAudio.WAV | FileTypeAudio.MP3;
+
 export interface ApiChatCompletionToolFunction {
 	name: string;
 	description?: string;
@@ -20,7 +22,7 @@ export interface ApiChatMessageContentPart {
 	};
 	input_audio?: {
 		data: string;
-		format: 'wav' | 'mp3';
+		format: AudioInputFormat;
 	};
 	input_video?: {
 		data: string;
diff --git a/tools/ui/src/lib/types/index.ts b/tools/ui/src/lib/types/index.ts
index d704a4b3c8a..0eb1e670133 100644
--- a/tools/ui/src/lib/types/index.ts
+++ b/tools/ui/src/lib/types/index.ts
@@ -29,7 +29,8 @@ export type {
 	ApiRouterModelsStatusResponse,
 	ApiRouterModelsListResponse,
 	ApiRouterModelsUnloadRequest,
-	ApiRouterModelsUnloadResponse
+	ApiRouterModelsUnloadResponse,
+	AudioInputFormat
 } from './api';
 
 // Chat types
diff --git a/tools/ui/src/lib/utils/agentic.ts b/tools/ui/src/lib/utils/agentic.ts
index 549a1c9a0e1..52ff3579306 100644
--- a/tools/ui/src/lib/utils/agentic.ts
+++ b/tools/ui/src/lib/utils/agentic.ts
@@ -1,4 +1,4 @@
-import { AgenticSectionType, MessageRole } from '$lib/enums';
+import { AgenticSectionType, ContinueIntentKind, MessageRole } from '$lib/enums';
 import { ATTACHMENT_SAVED_REGEX, NEWLINE_SEPARATOR } from '$lib/constants';
 import type { ApiChatCompletionToolCall } from '$lib/types/api';
 import type {
@@ -225,3 +225,62 @@ export function hasAgenticContent(
 
 	return toolMessages.length > 0;
 }
+
+/**
+ * Classification of how a Continue click on an assistant message should resume
+ * generation. The caller dispatches the resume path based on this value.
+ *
+ *   append_text  -> the target is a plain text turn, resume with
+ *                   continue_final_message and rehydrate the persisted
+ *                   tool_calls and attachments through the regular DB to API
+ *                   message converter.
+ *   rerun_turn   -> the target carries tool_calls that were never resolved by
+ *                   tool result messages. The agentic stream was cut mid turn,
+ *                   so we drop the target and rerun the loop from the previous
+ *                   history. truncateAfter is the last kept index, inclusive.
+ *   next_turn    -> the target's tool_calls were already resolved by trailing
+ *                   tool results. Hand the history up to and including the
+ *                   last consecutive tool result back to the agentic loop so it
+ *                   starts the next turn naturally. truncateAfter points at
+ *                   that last tool result.
+ */
+export type ContinueIntent =
+	| { kind: ContinueIntentKind.APPEND_TEXT }
+	| { kind: ContinueIntentKind.RERUN_TURN; truncateAfter: number }
+	| { kind: ContinueIntentKind.NEXT_TURN; truncateAfter: number };
+
+/**
+ * Decide how a Continue click on messages[idx] should resume generation.
+ * Pure function over the persisted history snapshot.
+ */
+export function classifyContinueIntent(messages: DatabaseMessage[], idx: number): ContinueIntent {
+	const target = messages[idx];
+
+	// Defensive default: callers already filter by role, stay deterministic.
+	if (!target || target.role !== MessageRole.ASSISTANT) {
+		return { kind: ContinueIntentKind.APPEND_TEXT };
+	}
+
+	const hasToolCalls = parseToolCalls(target.toolCalls).length > 0;
+	if (!hasToolCalls) {
+		return { kind: ContinueIntentKind.APPEND_TEXT };
+	}
+
+	// Walk consecutive trailing tool results. The agentic loop only emits tool
+	// messages directly after the assistant turn that owns them, so the first
+	// non tool message marks the boundary.
+	let lastTrailingTool = idx;
+	for (let i = idx + 1; i < messages.length; i++) {
+		if (messages[i].role === MessageRole.TOOL) {
+			lastTrailingTool = i;
+		} else {
+			break;
+		}
+	}
+
+	if (lastTrailingTool > idx) {
+		return { kind: ContinueIntentKind.NEXT_TURN, truncateAfter: lastTrailingTool };
+	}
+
+	return { kind: ContinueIntentKind.RERUN_TURN, truncateAfter: idx - 1 };
+}
diff --git a/tools/ui/src/lib/utils/cap-img-size.ts b/tools/ui/src/lib/utils/cap-img-size.ts
new file mode 100644
index 00000000000..1ba218316b7
--- /dev/null
+++ b/tools/ui/src/lib/utils/cap-img-size.ts
@@ -0,0 +1,72 @@
+import { MEGAPIXELS_TO_PIXELS } from '$lib/constants/image-size';
+import { BASE64_IMAGE_URI_REGEX } from '$lib/constants/uri-template';
+import { MimeTypeImage } from '$lib/enums';
+
+/**
+ * Converts an Image base64 data URL to another Image data URL with capped dimensions to reduce file size.
+ * @param base64UrlImage - The Image base64 data URL to convert
+ * @param maxMegapixels - The maximum image size in megapixels for the output Image
+ * @returns Promise resolving to Image data URL
+ */
+export function capImageDataURLSize(
+	base64UrlImage: string,
+	maxMegapixels: number
+): Promise<string> {
+	return new Promise((resolve, reject) => {
+		try {
+			const mimeMatch = base64UrlImage.match(BASE64_IMAGE_URI_REGEX);
+
+			if (!mimeMatch) {
+				return reject(new Error('Invalid data URL format.'));
+			}
+
+			const mimeType = mimeMatch[1] as MimeTypeImage;
+
+			if (!Object.values(MimeTypeImage).includes(mimeType)) {
+				return reject(new Error(`Unsupported image MIME type: ${mimeType}`));
+			}
+
+			const img = new Image();
+
+			img.onload = () => {
+				try {
+					const canvas = document.createElement('canvas');
+					const ctx = canvas.getContext('2d');
+
+					if (!ctx) {
+						throw new Error('Failed to get 2D canvas context.');
+					}
+
+					const targetWidth = img.naturalWidth;
+					const targetHeight = img.naturalHeight;
+					const totalPixels = targetWidth * targetHeight;
+					const maxPixels = Math.floor(maxMegapixels * MEGAPIXELS_TO_PIXELS);
+
+					if (maxPixels > 0 && totalPixels > maxPixels) {
+						const scaleFactor = Math.sqrt(maxPixels / totalPixels);
+						canvas.width = Math.floor(targetWidth * scaleFactor);
+						canvas.height = Math.floor(targetHeight * scaleFactor);
+					} else {
+						return resolve(base64UrlImage);
+					}
+
+					ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
+					resolve(canvas.toDataURL(mimeType));
+				} catch (err) {
+					reject(err instanceof Error ? err : new Error(String(err)));
+				}
+			};
+
+			img.onerror = () => {
+				reject(new Error('Failed to load image.'));
+			};
+
+			img.src = base64UrlImage;
+		} catch (error) {
+			const message = error instanceof Error ? error.message : String(error);
+			const errorMessage = `Error resizing image: ${message}`;
+			console.error(errorMessage, error);
+			reject(new Error(errorMessage));
+		}
+	});
+}
diff --git a/tools/ui/src/lib/utils/file-type.ts b/tools/ui/src/lib/utils/file-type.ts
index ae814e8059a..7495163d15d 100644
--- a/tools/ui/src/lib/utils/file-type.ts
+++ b/tools/ui/src/lib/utils/file-type.ts
@@ -18,8 +18,12 @@ import {
 	MimeTypeText
 } from '$lib/enums';
 
+function normalizeMimeType(mimeType: string): string {
+	return mimeType.trim().toLowerCase();
+}
+
 export function getFileTypeCategory(mimeType: string): FileTypeCategory | null {
-	switch (mimeType) {
+	switch (normalizeMimeType(mimeType)) {
 		// Images
 		case MimeTypeImage.JPEG:
 		case MimeTypeImage.PNG:
@@ -33,6 +37,10 @@ export function getFileTypeCategory(mimeType: string): FileTypeCategory | null {
 		case MimeTypeAudio.MP3:
 		case MimeTypeAudio.MP4:
 		case MimeTypeAudio.WAV:
+		case MimeTypeAudio.WAVE:
+		case MimeTypeAudio.X_WAV:
+		case MimeTypeAudio.X_WAVE:
+		case MimeTypeAudio.X_PN_WAV:
 		case MimeTypeAudio.WEBM:
 		case MimeTypeAudio.WEBM_OPUS:
 			return FileTypeCategory.AUDIO;
diff --git a/tools/ui/src/routes/+layout.svelte b/tools/ui/src/routes/+layout.svelte
index 78227df3ce7..2f1f5249722 100644
--- a/tools/ui/src/routes/+layout.svelte
+++ b/tools/ui/src/routes/+layout.svelte
@@ -26,18 +26,18 @@
 	import { modelsStore } from '$lib/stores/models.svelte';
 	import { mcpStore } from '$lib/stores/mcp.svelte';
 	import { TOOLTIP_DELAY_DURATION } from '$lib/constants';
-	import { IsMobile } from '$lib/hooks/is-mobile.svelte';
 	import { useKeyboardShortcuts } from '$lib/hooks/use-keyboard-shortcuts.svelte';
 	import { useSettingsNavigation } from '$lib/hooks/use-settings-navigation.svelte';
 	import { conversations } from '$lib/stores/conversations.svelte';
+	import { isMobile } from '$lib/stores/viewport.svelte';
 
 	let { children } = $props();
 	let alwaysShowSidebarOnDesktop = $derived(config().alwaysShowSidebarOnDesktop);
-	let isMobile = new IsMobile();
 	let isDesktop = $derived(!isMobile.current);
 	let sidebarOpen = $state(false);
 	let mounted = $state(false);
 	let innerHeight = $state<number | undefined>();
+	let innerWidth = $state(browser ? window.innerWidth : 0);
 
 	let chatSidebar:
 		| {
@@ -240,7 +240,7 @@
 	/>
 
 	<Sidebar.Provider bind:open={sidebarOpen}>
-		<div class="flex h-screen w-full" style:height="{innerHeight}px">
+		<div class="flex h-screen w-full">
 			<Sidebar.Root variant="floating" class="h-full"
 				><SidebarNavigation bind:this={chatSidebar} /></Sidebar.Root
 			>
@@ -278,4 +278,4 @@
 	</Sidebar.Provider>
 </Tooltip.Provider>
 
-<svelte:window onkeydown={handleKeydown} bind:innerHeight />
+<svelte:window onkeydown={handleKeydown} bind:innerHeight bind:innerWidth />
diff --git a/tools/ui/svelte.config.js b/tools/ui/svelte.config.js
index 4b14065dacd..1a7355b2d55 100644
--- a/tools/ui/svelte.config.js
+++ b/tools/ui/svelte.config.js
@@ -2,6 +2,10 @@ import { mdsvex } from 'mdsvex';
 import adapter from '@sveltejs/adapter-static';
 import { vitePreprocess } from '@sveltejs/vite-plugin-svelte';
 
+// CMake sets LLAMA_UI_OUT_DIR to the staging dir under the build tree; manual
+// `npm run build` runs without the env var default to ./dist.
+const outDir = process.env.LLAMA_UI_OUT_DIR ?? './dist';
+
 /** @type {import('@sveltejs/kit').Config} */
 const config = {
 	// Consult https://svelte.dev/docs/kit/integrations
@@ -14,8 +18,8 @@ const config = {
 		},
 		router: { type: 'hash' },
 		adapter: adapter({
-			pages: '../../build/tools/ui/dist',
-			assets: '../../build/tools/ui/dist',
+			pages: outDir,
+			assets: outDir,
 			fallback: 'index.html',
 			precompress: false,
 			strict: true
diff --git a/tools/ui/tests/unit/continue-intent.test.ts b/tools/ui/tests/unit/continue-intent.test.ts
new file mode 100644
index 00000000000..76539c76a9e
--- /dev/null
+++ b/tools/ui/tests/unit/continue-intent.test.ts
@@ -0,0 +1,166 @@
+import { describe, it, expect } from 'vitest';
+import { classifyContinueIntent } from '$lib/utils/agentic';
+import { ContinueIntentKind, MessageRole, MessageType } from '$lib/enums';
+import type { DatabaseMessage } from '$lib/types/database';
+
+/**
+ * Tests for the Continue button intent classifier.
+ *
+ * The classifier walks the persisted message history to decide which of three
+ * resume paths a Continue click should take:
+ *
+ *   A. append_text  -> plain text assistant turn, resume with
+ *                      continue_final_message.
+ *   B. rerun_turn   -> assistant turn with tool_calls but no tool results yet,
+ *                      the stream was cut mid turn and the tool_calls are
+ *                      unrecoverable as a token level continuation. Drop the
+ *                      target and rerun from the previous history.
+ *   C. next_turn    -> assistant turn with tool_calls that were already
+ *                      resolved by trailing tool results. Hand the history
+ *                      back to the agentic loop so it starts the next turn.
+ */
+
+let nextId = 0;
+function makeMsg(role: MessageRole, opts: Partial<DatabaseMessage> = {}): DatabaseMessage {
+	nextId++;
+	return {
+		id: `msg-${nextId}`,
+		convId: 'conv-1',
+		type: MessageType.TEXT,
+		timestamp: nextId,
+		role,
+		content: '',
+		parent: null,
+		children: [],
+		...opts
+	};
+}
+
+function toolCall(id: string, name: string, args: string = '{}'): string {
+	return JSON.stringify([{ id, type: 'function', function: { name, arguments: args } }]);
+}
+
+describe('classifyContinueIntent', () => {
+	it('returns append_text for a plain text assistant turn at the tail', () => {
+		const messages = [
+			makeMsg(MessageRole.USER, { content: 'hello' }),
+			makeMsg(MessageRole.ASSISTANT, { content: 'hi there' })
+		];
+
+		const intent = classifyContinueIntent(messages, 1);
+
+		expect(intent).toEqual({ kind: ContinueIntentKind.APPEND_TEXT });
+	});
+
+	it('returns append_text for a plain text assistant turn in the middle', () => {
+		const messages = [
+			makeMsg(MessageRole.USER, { content: 'q1' }),
+			makeMsg(MessageRole.ASSISTANT, { content: 'a1' }),
+			makeMsg(MessageRole.USER, { content: 'q2' }),
+			makeMsg(MessageRole.ASSISTANT, { content: 'a2' })
+		];
+
+		expect(classifyContinueIntent(messages, 1)).toEqual({ kind: ContinueIntentKind.APPEND_TEXT });
+	});
+
+	it('returns rerun_turn when the assistant has tool_calls without results', () => {
+		const messages = [
+			makeMsg(MessageRole.USER, { content: 'list files' }),
+			makeMsg(MessageRole.ASSISTANT, {
+				content: '',
+				toolCalls: toolCall('call_1', 'bash_tool', '{"command":"ls"}')
+			})
+		];
+
+		const intent = classifyContinueIntent(messages, 1);
+
+		expect(intent).toEqual({ kind: ContinueIntentKind.RERUN_TURN, truncateAfter: 0 });
+	});
+
+	it('returns next_turn when trailing tool results resolve the tool_calls', () => {
+		const messages = [
+			makeMsg(MessageRole.USER, { content: 'list files' }),
+			makeMsg(MessageRole.ASSISTANT, {
+				content: '',
+				toolCalls: toolCall('call_1', 'bash_tool')
+			}),
+			makeMsg(MessageRole.TOOL, { content: 'file1\nfile2', toolCallId: 'call_1' })
+		];
+
+		const intent = classifyContinueIntent(messages, 1);
+
+		expect(intent).toEqual({ kind: ContinueIntentKind.NEXT_TURN, truncateAfter: 2 });
+	});
+
+	it('next_turn keeps all consecutive trailing tool results, not just one', () => {
+		const messages = [
+			makeMsg(MessageRole.USER, { content: 'do many things' }),
+			makeMsg(MessageRole.ASSISTANT, {
+				content: '',
+				toolCalls: JSON.stringify([
+					{ id: 'call_1', type: 'function', function: { name: 'a', arguments: '{}' } },
+					{ id: 'call_2', type: 'function', function: { name: 'b', arguments: '{}' } }
+				])
+			}),
+			makeMsg(MessageRole.TOOL, { content: 'r1', toolCallId: 'call_1' }),
+			makeMsg(MessageRole.TOOL, { content: 'r2', toolCallId: 'call_2' })
+		];
+
+		const intent = classifyContinueIntent(messages, 1);
+
+		expect(intent).toEqual({ kind: ContinueIntentKind.NEXT_TURN, truncateAfter: 3 });
+	});
+
+	it('next_turn stops at the first non tool message after the target', () => {
+		const messages = [
+			makeMsg(MessageRole.USER, { content: 'go' }),
+			makeMsg(MessageRole.ASSISTANT, {
+				content: '',
+				toolCalls: toolCall('call_1', 'a')
+			}),
+			makeMsg(MessageRole.TOOL, { content: 'r1', toolCallId: 'call_1' }),
+			makeMsg(MessageRole.USER, { content: 'wait' }),
+			makeMsg(MessageRole.TOOL, { content: 'late', toolCallId: 'call_1' })
+		];
+
+		const intent = classifyContinueIntent(messages, 1);
+
+		// truncateAfter must point at the contiguous tool block, not jump over
+		// the user message to grab the dangling late tool.
+		expect(intent).toEqual({ kind: ContinueIntentKind.NEXT_TURN, truncateAfter: 2 });
+	});
+
+	it('returns append_text when toolCalls is set but parses to empty array', () => {
+		const messages = [
+			makeMsg(MessageRole.USER, { content: 'q' }),
+			makeMsg(MessageRole.ASSISTANT, { content: 'a', toolCalls: '[]' })
+		];
+
+		expect(classifyContinueIntent(messages, 1)).toEqual({ kind: ContinueIntentKind.APPEND_TEXT });
+	});
+
+	it('returns append_text when toolCalls is malformed JSON', () => {
+		const messages = [
+			makeMsg(MessageRole.USER, { content: 'q' }),
+			makeMsg(MessageRole.ASSISTANT, { content: 'a', toolCalls: '{not json' })
+		];
+
+		expect(classifyContinueIntent(messages, 1)).toEqual({ kind: ContinueIntentKind.APPEND_TEXT });
+	});
+
+	it('returns append_text defensively when idx points at a non assistant message', () => {
+		const messages = [
+			makeMsg(MessageRole.USER, { content: 'q' }),
+			makeMsg(MessageRole.ASSISTANT, { content: 'a' })
+		];
+
+		expect(classifyContinueIntent(messages, 0)).toEqual({ kind: ContinueIntentKind.APPEND_TEXT });
+	});
+
+	it('returns append_text defensively when idx is out of bounds', () => {
+		const messages = [makeMsg(MessageRole.ASSISTANT, { content: 'a' })];
+
+		expect(classifyContinueIntent(messages, 5)).toEqual({ kind: ContinueIntentKind.APPEND_TEXT });
+		expect(classifyContinueIntent([], 0)).toEqual({ kind: ContinueIntentKind.APPEND_TEXT });
+	});
+});
diff --git a/tools/ui/ui.cpp b/tools/ui/ui.cpp
deleted file mode 100644
index d02a62c2c71..00000000000
--- a/tools/ui/ui.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifdef LLAMA_BUILD_UI
-// auto generated files (see README.md for details)
-#include "index.html.hpp"
-#include "bundle.js.hpp"
-#include "bundle.css.hpp"
-#include "loading.html.hpp"
-#endif
diff --git a/tools/ui/ui.h b/tools/ui/ui.h
deleted file mode 100644
index 6f775ea3ada..00000000000
--- a/tools/ui/ui.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#pragma once
-
-// TODO @ngxson : refactor, wrap these in a function
-
-#ifdef LLAMA_BUILD_UI
-extern unsigned char index_html[];
-extern unsigned int  index_html_len;
-
-extern unsigned char bundle_js[];
-extern unsigned int  bundle_js_len;
-
-extern unsigned char bundle_css[];
-extern unsigned int  bundle_css_len;
-
-extern unsigned char loading_html[];
-extern unsigned int  loading_html_len;
-#endif
diff --git a/ty.toml b/ty.toml
index ad88ac7bdad..340b0649d33 100644
--- a/ty.toml
+++ b/ty.toml
@@ -14,6 +14,7 @@ exclude = [
 include = [
     "./tools/server/tests/**",
     "./scripts/snapdragon/qdc/**",
+    "./tools/mtmd/tests/**",
 ]
 
 [overrides.rules]
diff --git a/vendor/cpp-httplib/httplib.cpp b/vendor/cpp-httplib/httplib.cpp
index b28549607a2..f3555f2d415 100644
--- a/vendor/cpp-httplib/httplib.cpp
+++ b/vendor/cpp-httplib/httplib.cpp
@@ -1567,7 +1567,7 @@ void mmap::close() {
 #endif
   size_ = 0;
 }
-int close_socket(socket_t sock) {
+int close_socket(socket_t sock) noexcept {
 #ifdef _WIN32
   return closesocket(sock);
 #else
@@ -1794,7 +1794,7 @@ bool process_client_socket(
   return callback(strm);
 }
 
-int shutdown_socket(socket_t sock) {
+int shutdown_socket(socket_t sock) noexcept {
 #ifdef _WIN32
   return shutdown(sock, SD_BOTH);
 #else
@@ -6650,6 +6650,176 @@ make_host_and_port_string_always_port(const std::string &host, int port) {
   return prepare_host_string(host) + ":" + std::to_string(port);
 }
 
+bool parse_no_proxy_entry(const std::string &token, NoProxyEntry &out);
+NormalizedTarget normalize_target(const std::string &host);
+bool ip_in_cidr(const IPBytes &ip, const IPBytes &net, int prefix_bits);
+bool host_matches_no_proxy(const NormalizedTarget &target,
+                           const std::vector<NoProxyEntry> &entries);
+
+bool ip_in_cidr(const IPBytes &ip, const IPBytes &net, int prefix_bits) {
+  if (prefix_bits < 0 || prefix_bits > 128) { return false; }
+  if (prefix_bits == 0) { return true; }
+  int full_bytes = prefix_bits / 8;
+  int rem_bits = prefix_bits % 8;
+  if (full_bytes > 0 && std::memcmp(ip.data(), net.data(),
+                                    static_cast<size_t>(full_bytes)) != 0) {
+    return false;
+  }
+  if (rem_bits == 0) { return true; }
+  auto i = static_cast<size_t>(full_bytes);
+  auto mask = static_cast<uint8_t>(0xFFu << (8 - rem_bits));
+  return (ip[i] & mask) == (net[i] & mask);
+}
+
+bool parse_no_proxy_entry(const std::string &token, NoProxyEntry &out) {
+  if (token.empty()) { return false; }
+
+  if (token == "*") {
+    out.kind = NoProxyKind::Wildcard;
+    return true;
+  }
+
+  auto slash = token.find('/');
+  std::string addr_part =
+      (slash == std::string::npos) ? token : token.substr(0, slash);
+  std::string prefix_part =
+      (slash == std::string::npos) ? std::string() : token.substr(slash + 1);
+
+  // A bare slash or trailing-slash CIDR like "10.0.0.0/" is malformed;
+  // don't silently treat it as a /32 (or /128).
+  if (slash != std::string::npos && prefix_part.empty()) { return false; }
+
+  // Accept the bracketed IPv6 form ("[::1]", "[fe80::]/10") as well as the
+  // bare form. Brackets have no meaning for IPv4, so skip the IPv4 attempt
+  // when brackets are present.
+  bool bracketed = addr_part.size() >= 2 && addr_part.front() == '[' &&
+                   addr_part.back() == ']';
+  if (bracketed) { addr_part = addr_part.substr(1, addr_part.size() - 2); }
+
+  if (!bracketed) {
+    struct in_addr v4;
+    if (inet_pton(AF_INET, addr_part.c_str(), &v4) == 1) {
+      int prefix = 32;
+      if (!prefix_part.empty()) {
+        auto r = from_chars(prefix_part.data(),
+                            prefix_part.data() + prefix_part.size(), prefix);
+        if (r.ec != std::errc{} ||
+            r.ptr != prefix_part.data() + prefix_part.size()) {
+          return false;
+        }
+        if (prefix < 0 || prefix > 32) { return false; }
+      }
+      out.kind = NoProxyKind::IPv4Cidr;
+      std::memcpy(out.net.data(), &v4, sizeof(v4));
+      out.prefix_bits = prefix;
+      return true;
+    }
+  }
+
+  struct in6_addr v6;
+  if (inet_pton(AF_INET6, addr_part.c_str(), &v6) == 1) {
+    int prefix = 128;
+    if (!prefix_part.empty()) {
+      auto r = from_chars(prefix_part.data(),
+                          prefix_part.data() + prefix_part.size(), prefix);
+      if (r.ec != std::errc{} ||
+          r.ptr != prefix_part.data() + prefix_part.size()) {
+        return false;
+      }
+      if (prefix < 0 || prefix > 128) { return false; }
+    }
+    out.kind = NoProxyKind::IPv6Cidr;
+    std::memcpy(out.net.data(), &v6, sizeof(v6));
+    out.prefix_bits = prefix;
+    return true;
+  }
+
+  // Bracketed entries can only be IPv6. If the IPv6 parse above failed,
+  // the entry is malformed — don't fall through to the hostname branch.
+  if (bracketed) { return false; }
+
+  // A '/' on a non-IP token means a CIDR prefix without an address. Reject.
+  if (slash != std::string::npos) { return false; }
+  // Port-specific entries (host:port) are not supported.
+  if (token.find(':') != std::string::npos) { return false; }
+
+  std::string hostname = case_ignore::to_lower(token);
+  while (!hostname.empty() && hostname.front() == '.') {
+    hostname.erase(hostname.begin());
+  }
+  while (!hostname.empty() && hostname.back() == '.') {
+    hostname.pop_back();
+  }
+  if (hostname.empty()) { return false; }
+
+  out.kind = NoProxyKind::HostnameSuffix;
+  out.hostname_pattern = std::move(hostname);
+  return true;
+}
+
+NormalizedTarget normalize_target(const std::string &host) {
+  NormalizedTarget t;
+  std::string h = host;
+
+  if (h.size() >= 2 && h.front() == '[' && h.back() == ']') {
+    h = h.substr(1, h.size() - 2);
+  }
+
+  // Strip a single trailing dot so "example.com." canonicalizes to
+  // "example.com".
+  if (!h.empty() && h.back() == '.') { h.pop_back(); }
+
+  t.hostname = case_ignore::to_lower(h);
+
+  if (!t.hostname.empty()) {
+    struct in_addr v4;
+    struct in6_addr v6;
+    if (inet_pton(AF_INET, t.hostname.c_str(), &v4) == 1) {
+      t.is_ipv4 = true;
+      std::memcpy(t.ip.data(), &v4, sizeof(v4));
+    } else if (inet_pton(AF_INET6, t.hostname.c_str(), &v6) == 1) {
+      t.is_ipv6 = true;
+      std::memcpy(t.ip.data(), &v6, sizeof(v6));
+    }
+  }
+  return t;
+}
+
+bool host_matches_no_proxy(const NormalizedTarget &target,
+                                  const std::vector<NoProxyEntry> &entries) {
+  if (target.hostname.empty()) { return false; }
+  for (const auto &e : entries) {
+    switch (e.kind) {
+    case NoProxyKind::Wildcard: return true;
+    case NoProxyKind::IPv4Cidr:
+      if (target.is_ipv4 && ip_in_cidr(target.ip, e.net, e.prefix_bits)) {
+        return true;
+      }
+      break;
+    case NoProxyKind::IPv6Cidr:
+      if (target.is_ipv6 && ip_in_cidr(target.ip, e.net, e.prefix_bits)) {
+        return true;
+      }
+      break;
+    case NoProxyKind::HostnameSuffix:
+      if (target.is_ipv4 || target.is_ipv6) { break; }
+      if (target.hostname == e.hostname_pattern) { return true; }
+      // Dot-boundary suffix match: prevents "evilexample.com" from matching
+      // an entry of "example.com".
+      if (target.hostname.size() > e.hostname_pattern.size() + 1) {
+        auto offset = target.hostname.size() - e.hostname_pattern.size();
+        if (target.hostname[offset - 1] == '.' &&
+            target.hostname.compare(offset, e.hostname_pattern.size(),
+                                    e.hostname_pattern) == 0) {
+          return true;
+        }
+      }
+      break;
+    }
+  }
+  return false;
+}
+
 template <typename T>
 bool check_and_write_headers(Stream &strm, Headers &headers,
                                     T header_writer, Error &error) {
@@ -7149,7 +7319,7 @@ void Server::wait_until_ready() const {
   }
 }
 
-void Server::stop() {
+void Server::stop() noexcept {
   if (is_running_) {
     assert(svr_sock_ != INVALID_SOCKET);
     std::atomic<socket_t> sock(svr_sock_.exchange(INVALID_SOCKET));
@@ -8455,6 +8625,7 @@ void ClientImpl::copy_settings(const ClientImpl &rhs) {
   proxy_basic_auth_username_ = rhs.proxy_basic_auth_username_;
   proxy_basic_auth_password_ = rhs.proxy_basic_auth_password_;
   proxy_bearer_token_auth_token_ = rhs.proxy_bearer_token_auth_token_;
+  no_proxy_entries_ = rhs.no_proxy_entries_;
   logger_ = rhs.logger_;
   error_logger_ = rhs.error_logger_;
 
@@ -8470,8 +8641,25 @@ void ClientImpl::copy_settings(const ClientImpl &rhs) {
 #endif
 }
 
+bool
+ClientImpl::is_proxy_enabled_for_host(const std::string &host) const {
+  if (proxy_host_.empty() || proxy_port_ == -1) { return false; }
+  if (no_proxy_entries_.empty()) { return true; }
+  // host_ is const so its normalized form is invariant; cache it. The
+  // cross-host path (setup_redirect_client passing next_host) re-normalizes.
+  if (host == host_) {
+    if (!host_normalized_valid_) {
+      host_normalized_ = detail::normalize_target(host_);
+      host_normalized_valid_ = true;
+    }
+    return !detail::host_matches_no_proxy(host_normalized_, no_proxy_entries_);
+  }
+  auto target = detail::normalize_target(host);
+  return !detail::host_matches_no_proxy(target, no_proxy_entries_);
+}
+
 socket_t ClientImpl::create_client_socket(Error &error) const {
-  if (!proxy_host_.empty() && proxy_port_ != -1) {
+  if (is_proxy_enabled_for_host(host_)) {
     return detail::create_client_socket(
         proxy_host_, std::string(), proxy_port_, address_family_, tcp_nodelay_,
         ipv6_v6only_, socket_options_, connection_timeout_sec_,
@@ -8543,6 +8731,12 @@ void ClientImpl::close_socket(Socket &socket) {
   socket.sock = INVALID_SOCKET;
 }
 
+void ClientImpl::disconnect(bool gracefully) {
+  shutdown_ssl(socket_, gracefully);
+  shutdown_socket(socket_);
+  close_socket(socket_);
+}
+
 bool ClientImpl::read_response_line(Stream &strm, const Request &req,
                                            Response &res,
                                            bool skip_100_continue) const {
@@ -8614,14 +8808,8 @@ bool ClientImpl::send_(Request &req, Response &res, Error &error) {
 #endif
 
       if (!is_alive) {
-        // Attempt to avoid sigpipe by shutting down non-gracefully if it
-        // seems like the other side has already closed the connection Also,
-        // there cannot be any requests in flight from other threads since we
-        // locked request_mutex_, so safe to close everything immediately
-        const bool shutdown_gracefully = false;
-        shutdown_ssl(socket_, shutdown_gracefully);
-        shutdown_socket(socket_);
-        close_socket(socket_);
+        // Peer seems gone — non-graceful shutdown to avoid SIGPIPE.
+        disconnect(/*gracefully=*/false);
       }
     }
 
@@ -8671,9 +8859,7 @@ bool ClientImpl::send_(Request &req, Response &res, Error &error) {
 
     if (socket_should_be_closed_when_request_is_done_ || close_connection ||
         !ret) {
-      shutdown_ssl(socket_, true);
-      shutdown_socket(socket_);
-      close_socket(socket_);
+      disconnect(/*gracefully=*/true);
     }
   });
 
@@ -8786,11 +8972,7 @@ ClientImpl::open_stream(const std::string &method, const std::string &path,
         }
       }
 #endif
-      if (!is_alive) {
-        shutdown_ssl(socket_, false);
-        shutdown_socket(socket_);
-        close_socket(socket_);
-      }
+      if (!is_alive) { disconnect(/*gracefully=*/false); }
     }
 
     if (!is_alive) {
@@ -9082,7 +9264,7 @@ bool ClientImpl::handle_request(Stream &strm, Request &req,
 
   bool ret;
 
-  if (!is_ssl() && !proxy_host_.empty() && proxy_port_ != -1) {
+  if (!is_ssl() && is_proxy_enabled_for_host(host_)) {
     auto req2 = req;
     req2.path = "http://" +
                 detail::make_host_and_port_string(host_, port_, false) +
@@ -9106,9 +9288,7 @@ bool ClientImpl::handle_request(Stream &strm, Request &req,
     // to call it from a different thread since it's a thread-safety issue
     // to do these things to the socket if another thread is using the socket.
     std::lock_guard<std::mutex> guard(socket_mutex_);
-    shutdown_ssl(socket_, true);
-    shutdown_socket(socket_);
-    close_socket(socket_);
+    disconnect(/*gracefully=*/true);
   }
 
   if (300 < res.status && res.status < 400 && follow_location_) {
@@ -9121,6 +9301,14 @@ bool ClientImpl::handle_request(Stream &strm, Request &req,
        res.status == StatusCode::ProxyAuthenticationRequired_407) &&
       req.authorization_count_ < 5) {
     auto is_proxy = res.status == StatusCode::ProxyAuthenticationRequired_407;
+
+    // Only retry when the 407 actually came from a proxy hop: plain HTTP
+    // through an enabled proxy. HTTPS via CONNECT tunnels the 407 from the
+    // origin (#2457); direct/bypassed origins have no proxy hop at all.
+    if (is_proxy && !(!is_ssl() && is_proxy_enabled_for_host(host_))) {
+      return ret;
+    }
+
     const auto &username =
         is_proxy ? proxy_digest_auth_username_ : digest_auth_username_;
     const auto &password =
@@ -9288,13 +9476,13 @@ void ClientImpl::setup_redirect_client(ClientType &client) {
   // host. This function is only called for cross-host redirects; same-host
   // redirects are handled directly in ClientImpl::redirect().
 
-  // Setup proxy configuration (CRITICAL ORDER - proxy must be set
-  // before proxy auth)
+  // Copy the proxy configuration unconditionally; the per-target bypass is
+  // re-evaluated at send time, so a later hop to a non-bypassed host can
+  // still use the proxy.
+  client.no_proxy_entries_ = no_proxy_entries_;
   if (!proxy_host_.empty() && proxy_port_ != -1) {
-    // First set proxy host and port
     client.set_proxy(proxy_host_, proxy_port_);
 
-    // Then set proxy authentication (order matters!)
     if (!proxy_basic_auth_username_.empty()) {
       client.set_proxy_basic_auth(proxy_basic_auth_username_,
                                   proxy_basic_auth_password_);
@@ -9385,14 +9573,6 @@ bool ClientImpl::write_request(Stream &strm, Request &req,
     }
   }
 
-  if (!proxy_basic_auth_username_.empty() &&
-      !proxy_basic_auth_password_.empty()) {
-    if (!req.has_header("Proxy-Authorization")) {
-      req.headers.insert(make_basic_authentication_header(
-          proxy_basic_auth_username_, proxy_basic_auth_password_, true));
-    }
-  }
-
   if (!bearer_token_auth_token_.empty()) {
     if (!req.has_header("Authorization")) {
       req.headers.insert(make_bearer_token_authentication_header(
@@ -9400,8 +9580,18 @@ bool ClientImpl::write_request(Stream &strm, Request &req,
     }
   }
 
-  if (!proxy_bearer_token_auth_token_.empty()) {
-    if (!req.has_header("Proxy-Authorization")) {
+  // Proxy-Authorization is only sent when the proxy is actually used for
+  // this target — otherwise NO_PROXY-matched requests would leak proxy
+  // credentials directly to the destination server.
+  if (is_proxy_enabled_for_host(host_)) {
+    if (!proxy_basic_auth_username_.empty() &&
+        !proxy_basic_auth_password_.empty() &&
+        !req.has_header("Proxy-Authorization")) {
+      req.headers.insert(make_basic_authentication_header(
+          proxy_basic_auth_username_, proxy_basic_auth_password_, true));
+    }
+    if (!proxy_bearer_token_auth_token_.empty() &&
+        !req.has_header("Proxy-Authorization")) {
       req.headers.insert(make_bearer_token_authentication_header(
           proxy_bearer_token_auth_token_, true));
     }
@@ -9711,7 +9901,7 @@ bool ClientImpl::process_request(Stream &strm, Request &req,
 
 #ifdef CPPHTTPLIB_SSL_ENABLED
   if (is_ssl() && !expect_100_continue) {
-    auto is_proxy_enabled = !proxy_host_.empty() && proxy_port_ != -1;
+    auto is_proxy_enabled = is_proxy_enabled_for_host(host_);
     if (!is_proxy_enabled) {
       if (tls::is_peer_closed(socket_.ssl, socket_.sock)) {
         error = Error::SSLPeerCouldBeClosed_;
@@ -10718,10 +10908,7 @@ void ClientImpl::stop() {
     return;
   }
 
-  // Otherwise, still holding the mutex, we can shut everything down ourselves
-  shutdown_ssl(socket_, true);
-  shutdown_socket(socket_);
-  close_socket(socket_);
+  disconnect(/*gracefully=*/true);
 }
 
 std::string ClientImpl::host() const { return host_; }
@@ -10812,6 +10999,8 @@ void ClientImpl::set_interface(const std::string &intf) {
 void ClientImpl::set_proxy(const std::string &host, int port) {
   proxy_host_ = host;
   proxy_port_ = port;
+  std::lock_guard<std::mutex> guard(socket_mutex_);
+  disconnect(/*gracefully=*/true);
 }
 
 void ClientImpl::set_proxy_basic_auth(const std::string &username,
@@ -10824,6 +11013,22 @@ void ClientImpl::set_proxy_bearer_token_auth(const std::string &token) {
   proxy_bearer_token_auth_token_ = token;
 }
 
+void ClientImpl::set_no_proxy(const std::vector<std::string> &patterns) {
+  std::vector<detail::NoProxyEntry> parsed;
+  parsed.reserve(patterns.size());
+  for (const auto &p : patterns) {
+    auto trimmed = detail::trim_copy(p);
+    if (trimmed.empty()) { continue; }
+    detail::NoProxyEntry entry;
+    if (detail::parse_no_proxy_entry(trimmed, entry)) {
+      parsed.push_back(std::move(entry));
+    }
+  }
+  no_proxy_entries_ = std::move(parsed);
+  std::lock_guard<std::mutex> guard(socket_mutex_);
+  disconnect(/*gracefully=*/true);
+}
+
 #ifdef CPPHTTPLIB_SSL_ENABLED
 void ClientImpl::set_digest_auth(const std::string &username,
                                         const std::string &password) {
@@ -11525,6 +11730,9 @@ void Client::set_proxy_basic_auth(const std::string &username,
 void Client::set_proxy_bearer_token_auth(const std::string &token) {
   cli_->set_proxy_bearer_token_auth(token);
 }
+void Client::set_no_proxy(const std::vector<std::string> &patterns) {
+  cli_->set_no_proxy(patterns);
+}
 
 void Client::set_logger(Logger logger) {
   cli_->set_logger(std::move(logger));
@@ -11754,7 +11962,7 @@ bool SSLClient::setup_proxy_connection(
     Socket &socket,
     std::chrono::time_point<std::chrono::steady_clock> start_time,
     Response &res, bool &success, Error &error) {
-  if (proxy_host_.empty() || proxy_port_ == -1) { return true; }
+  if (!is_proxy_enabled_for_host(host_)) { return true; }
 
   if (!connect_with_proxy(socket, start_time, res, success, error)) {
     return false;
@@ -11867,7 +12075,7 @@ bool SSLClient::connect_with_proxy(
 bool SSLClient::ensure_socket_connection(Socket &socket, Error &error) {
   if (!ClientImpl::ensure_socket_connection(socket, error)) { return false; }
 
-  if (!proxy_host_.empty() && proxy_port_ != -1) { return true; }
+  if (is_proxy_enabled_for_host(host_)) { return true; }
 
   if (!initialize_ssl(socket, error)) {
     shutdown_socket(socket);
@@ -12290,9 +12498,18 @@ bool enumerate_windows_system_certs(Callback cb) {
 template <typename Callback>
 bool enumerate_macos_keychain_certs(Callback cb) {
   bool loaded = false;
-  CFArrayRef certs = nullptr;
-  OSStatus status = SecTrustCopyAnchorCertificates(&certs);
-  if (status == errSecSuccess && certs) {
+  const SecTrustSettingsDomain domains[] = {
+      kSecTrustSettingsDomainSystem,
+      kSecTrustSettingsDomainAdmin,
+      kSecTrustSettingsDomainUser,
+  };
+  for (auto domain : domains) {
+    CFArrayRef certs = nullptr;
+    OSStatus status = SecTrustSettingsCopyCertificates(domain, &certs);
+    if (status != errSecSuccess || !certs) {
+      if (certs) CFRelease(certs);
+      continue;
+    }
     CFIndex count = CFArrayGetCount(certs);
     for (CFIndex i = 0; i < count; i++) {
       SecCertificateRef cert =
@@ -12655,28 +12872,36 @@ bool load_system_certs(ctx_t ctx) {
   auto store = SSL_CTX_get_cert_store(ssl_ctx);
   if (!store) return false;
 
-  CFArrayRef certs = nullptr;
-  if (SecTrustCopyAnchorCertificates(&certs) != errSecSuccess || !certs) {
-    return SSL_CTX_set_default_verify_paths(ssl_ctx) == 1;
-  }
-
   bool loaded_any = false;
-  auto count = CFArrayGetCount(certs);
-  for (CFIndex i = 0; i < count; i++) {
-    auto cert = reinterpret_cast<SecCertificateRef>(
-        const_cast<void *>(CFArrayGetValueAtIndex(certs, i)));
-    CFDataRef der = SecCertificateCopyData(cert);
-    if (der) {
-      const unsigned char *data = CFDataGetBytePtr(der);
-      auto x509 = d2i_X509(nullptr, &data, CFDataGetLength(der));
-      if (x509) {
-        if (X509_STORE_add_cert(store, x509) == 1) { loaded_any = true; }
-        X509_free(x509);
+  const SecTrustSettingsDomain domains[] = {
+      kSecTrustSettingsDomainSystem,
+      kSecTrustSettingsDomainAdmin,
+      kSecTrustSettingsDomainUser,
+  };
+  for (auto domain : domains) {
+    CFArrayRef certs = nullptr;
+    if (SecTrustSettingsCopyCertificates(domain, &certs) != errSecSuccess ||
+        !certs) {
+      if (certs) CFRelease(certs);
+      continue;
+    }
+    auto count = CFArrayGetCount(certs);
+    for (CFIndex i = 0; i < count; i++) {
+      auto cert = reinterpret_cast<SecCertificateRef>(
+          const_cast<void *>(CFArrayGetValueAtIndex(certs, i)));
+      CFDataRef der = SecCertificateCopyData(cert);
+      if (der) {
+        const unsigned char *data = CFDataGetBytePtr(der);
+        auto x509 = d2i_X509(nullptr, &data, CFDataGetLength(der));
+        if (x509) {
+          if (X509_STORE_add_cert(store, x509) == 1) { loaded_any = true; }
+          X509_free(x509);
+        }
+        CFRelease(der);
       }
-      CFRelease(der);
     }
+    CFRelease(certs);
   }
-  CFRelease(certs);
   return loaded_any || SSL_CTX_set_default_verify_paths(ssl_ctx) == 1;
 #else
   return SSL_CTX_set_default_verify_paths(ssl_ctx) == 1;
diff --git a/vendor/cpp-httplib/httplib.h b/vendor/cpp-httplib/httplib.h
index af750cdd905..af856dd6316 100644
--- a/vendor/cpp-httplib/httplib.h
+++ b/vendor/cpp-httplib/httplib.h
@@ -8,8 +8,8 @@
 #ifndef CPPHTTPLIB_HTTPLIB_H
 #define CPPHTTPLIB_HTTPLIB_H
 
-#define CPPHTTPLIB_VERSION "0.45.0"
-#define CPPHTTPLIB_VERSION_NUM "0x002d00"
+#define CPPHTTPLIB_VERSION "0.46.0"
+#define CPPHTTPLIB_VERSION_NUM "0x002e00"
 
 #ifdef _WIN32
 #if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00
@@ -339,16 +339,26 @@ using socket_t = int;
 #include <utility>
 
 // On macOS with a TLS backend, enable Keychain root certificates by default
-// unless the user explicitly opts out.
+// unless the user explicitly opts out. Not enabled on iOS/tvOS/watchOS since
+// the SecTrustSettings APIs used to enumerate anchor certificates are macOS
+// only; on those platforms the user must provide a CA bundle explicitly.
 #if defined(__APPLE__) && defined(__clang__) &&                                \
     !defined(CPPHTTPLIB_DISABLE_MACOSX_AUTOMATIC_ROOT_CERTIFICATES) &&         \
     (defined(CPPHTTPLIB_OPENSSL_SUPPORT) ||                                    \
      defined(CPPHTTPLIB_MBEDTLS_SUPPORT) ||                                    \
      defined(CPPHTTPLIB_WOLFSSL_SUPPORT))
+#if TARGET_OS_OSX
 #ifndef CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN
 #define CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN
 #endif
 #endif
+#endif
+
+#if defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) &&                      \
+    defined(__APPLE__) && !TARGET_OS_OSX
+#error                                                                         \
+    "CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN is only supported on macOS. On iOS/tvOS/watchOS, supply a CA bundle via set_ca_cert_path()."
+#endif
 
 // On Windows, enable Schannel certificate verification by default
 // unless the user explicitly opts out.
@@ -382,7 +392,7 @@ using socket_t = int;
 #endif // _WIN32
 
 #ifdef CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN
-#if TARGET_OS_MAC
+#if TARGET_OS_OSX
 #include <Security/Security.h>
 #endif
 #endif
@@ -430,7 +440,7 @@ using socket_t = int;
 #endif
 #endif // _WIN32
 #ifdef CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN
-#if TARGET_OS_MAC
+#if TARGET_OS_OSX
 #include <Security/Security.h>
 #endif
 #endif
@@ -473,7 +483,7 @@ using socket_t = int;
 #endif
 #endif // _WIN32
 #ifdef CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN
-#if TARGET_OS_MAC
+#if TARGET_OS_OSX
 #include <Security/Security.h>
 #endif
 #endif
@@ -1597,7 +1607,7 @@ class RegexMatcher final : public MatcherBase {
   std::regex regex_;
 };
 
-int close_socket(socket_t sock);
+int close_socket(socket_t sock) noexcept;
 
 ssize_t write_headers(Stream &strm, const Headers &headers);
 
@@ -1734,7 +1744,7 @@ class Server {
 
   bool is_running() const;
   void wait_until_ready() const;
-  void stop();
+  void stop() noexcept;
   void decommission();
 
   std::function<TaskQueue *(void)> new_task_queue;
@@ -2014,6 +2024,31 @@ inline ssize_t read_body_content(Stream *stream, BodyReader &br, char *buf,
 
 class decompressor;
 
+enum class NoProxyKind {
+  Wildcard,       // "*"
+  HostnameSuffix, // "example.com" or ".example.com"
+  IPv4Cidr,       // "10.0.0.0/8" (or single IP, treated as /32)
+  IPv6Cidr,       // "fe80::/10" (or single IP, treated as /128)
+};
+
+// Unified 16-byte buffer holding either a v4 (first 4 bytes) or v6 address.
+// Lets one CIDR matcher cover both families.
+using IPBytes = std::array<uint8_t, 16>;
+
+struct NoProxyEntry {
+  NoProxyKind kind = NoProxyKind::Wildcard;
+  std::string hostname_pattern; // lowercased, leading/trailing dot stripped
+  IPBytes net{};
+  int prefix_bits = 0;
+};
+
+struct NormalizedTarget {
+  std::string hostname; // lowercase; brackets and trailing dot removed
+  bool is_ipv4 = false;
+  bool is_ipv6 = false;
+  IPBytes ip{};
+};
+
 } // namespace detail
 
 class ClientImpl {
@@ -2230,6 +2265,7 @@ class ClientImpl {
   void set_proxy_basic_auth(const std::string &username,
                             const std::string &password);
   void set_proxy_bearer_token_auth(const std::string &token);
+  void set_no_proxy(const std::vector<std::string> &patterns);
 
   void set_logger(Logger logger);
   void set_error_logger(ErrorLogger error_logger);
@@ -2255,16 +2291,19 @@ class ClientImpl {
       std::chrono::time_point<std::chrono::steady_clock> start_time,
       Response &res, bool &success, Error &error);
 
+  bool is_proxy_enabled_for_host(const std::string &host) const;
+
   // All of:
   //   shutdown_ssl
   //   shutdown_socket
   //   close_socket
-  // should ONLY be called when socket_mutex_ is locked.
-  // Also, shutdown_ssl and close_socket should also NOT be called concurrently
-  // with a DIFFERENT thread sending requests using that socket.
+  //   disconnect
+  // should ONLY be called when socket_mutex_ is locked, and only when
+  // no other thread is using the socket.
   virtual void shutdown_ssl(Socket &socket, bool shutdown_gracefully);
   void shutdown_socket(Socket &socket) const;
   void close_socket(Socket &socket);
+  void disconnect(bool gracefully);
 
   bool process_request(Stream &strm, Request &req, Response &res,
                        bool close_connection, Error &error);
@@ -2342,6 +2381,11 @@ class ClientImpl {
   std::string proxy_basic_auth_password_;
   std::string proxy_bearer_token_auth_token_;
 
+  std::vector<detail::NoProxyEntry> no_proxy_entries_;
+
+  mutable detail::NormalizedTarget host_normalized_;
+  mutable bool host_normalized_valid_ = false;
+
   mutable std::mutex logger_mutex_;
   Logger logger_;
   ErrorLogger error_logger_;
@@ -2602,6 +2646,7 @@ class Client {
   void set_proxy_basic_auth(const std::string &username,
                             const std::string &password);
   void set_proxy_bearer_token_auth(const std::string &token);
+  void set_no_proxy(const std::vector<std::string> &patterns);
   void set_logger(Logger logger);
   void set_error_logger(ErrorLogger error_logger);
 
@@ -3028,8 +3073,6 @@ bool parse_range_header(const std::string &s, Ranges &ranges);
 bool parse_accept_header(const std::string &s,
                          std::vector<std::string> &content_types);
 
-int close_socket(socket_t sock);
-
 ssize_t send_socket(socket_t sock, const void *ptr, size_t size, int flags);
 
 ssize_t read_socket(socket_t sock, void *ptr, size_t size, int flags);