From 5c256131b1deee2d950fc0e21a42ec20d551ba02 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Mon, 16 Mar 2026 02:32:29 +0200 Subject: [PATCH 01/10] feat(minor): customize `postinstall` behavior --- docs/guide/troubleshooting.md | 23 ++++++++ src/cli/commands/OnPostInstallCommand.ts | 50 +++++++++++++++-- src/cli/utils/packageJsonConfig.ts | 69 ++++++++++++++++++++++++ src/cli/utils/packageManager.ts | 16 ++++++ src/config.ts | 13 ++++- src/types.ts | 2 + 6 files changed, 168 insertions(+), 5 deletions(-) create mode 100644 src/cli/utils/packageJsonConfig.ts create mode 100644 src/cli/utils/packageManager.ts diff --git a/docs/guide/troubleshooting.md b/docs/guide/troubleshooting.md index f60a7745..96aeea63 100644 --- a/docs/guide/troubleshooting.md +++ b/docs/guide/troubleshooting.md @@ -164,3 +164,26 @@ Ensure you're not using the `Administrator` user for `npm install` nor to run th To do that, go to `Settings > Update & Security > For developers` and enable `Developer mode`. After that, delete the `.cache` folder under your user directory and try building the app again. + +## Customizing `postinstall` Behavior {#postinstall-behavior} +When installing `node-llama-cpp`, its `postinstall` script checks whether the prebuilt binaries +are compatible with current machine (which they almost always are, at least the CPU-only ones which are the last resort fallback), +and when not, attempts [building the native bindings from source](./building-from-source.md). + +When attempting to [build from source](./building-from-source.md), if the machine lacks the required build tools, +the build will fail and indicative error messages will direct you to the specific commands you need to run +or packages you need to install in order for the build process to succeed. + +If you want to customize the `postinstall` behavior, you can do so using any of the following methods: +* Passing the `--node-llama-cpp-postinstall=` flag to the `npm install` command. 
+* Setting the `NODE_LLAMA_CPP_POSTINSTALL` environment variable to `` before running `npm install`. +* Configuring `config.nodeLlamaCppPostinstall` on your project's `package.json` to ``. + +Where `` can be one of the following options: +* **`auto` (default)**: the default behavior explained above. +* **`ignoreFailedBuild`**: same as the default behavior, + but a failed build will not throw an error and will be ignored, which means the installation will succeed. + Using [`getLlama`](../api/functions/getLlama.md) for the first time will attempt building from source again by default. +* **`skip`**: skip the entire `postinstall` script. + If the prebuilt binaries are incompatible with the current machine, + using [`getLlama`](../api/functions/getLlama.md) for the first time will attempt building from source by default. diff --git a/src/cli/commands/OnPostInstallCommand.ts b/src/cli/commands/OnPostInstallCommand.ts index e81e9b0b..9731b71b 100644 --- a/src/cli/commands/OnPostInstallCommand.ts +++ b/src/cli/commands/OnPostInstallCommand.ts @@ -1,10 +1,16 @@ +import path from "path"; +import {fileURLToPath} from "url"; import {CommandModule} from "yargs"; import chalk from "chalk"; -import {defaultSkipDownload, documentationPageUrls} from "../../config.js"; +import {defaultSkipDownload, documentationPageUrls, defaultNodeLlamaCppPostinstall} from "../../config.js"; import {getLlamaForOptions} from "../../bindings/getLlama.js"; import {setForceShowConsoleLogPrefix} from "../../state.js"; import {isRunningUnderRosetta} from "../utils/isRunningUnderRosetta.js"; import {getConsoleLogPrefix} from "../../utils/getConsoleLogPrefix.js"; +import {parsePackageJsonConfig, resolvePackageJsonConfig} from "../utils/packageJsonConfig.js"; +import {detectCurrentPackageManager} from "../utils/packageManager.js"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); type OnPostInstallCommand = null; @@ -13,7 +19,22 @@ export const OnPostInstallCommand: CommandModule = 
describe: false, async handler() { if (defaultSkipDownload) - return; + return void process.exit(0); + + const nlcConfig = parsePackageJsonConfig(await resolvePackageJsonConfig(__dirname)); + const postinstallConfig = (defaultNodeLlamaCppPostinstall == null || defaultNodeLlamaCppPostinstall === "auto") + ? nlcConfig.nodeLlamaCppPostinstall ?? defaultNodeLlamaCppPostinstall + : defaultNodeLlamaCppPostinstall; + + // set via a `--node-llama-cpp-postinstall=skip` flag on an `npm install` command + // (prefer `--node-llama-cpp-postinstall=ignoreFailedBuild` if you really need it) + if (postinstallConfig === "skip") { + console.info( + getConsoleLogPrefix(false, false), + "Skipping node-llama-cpp postinstall due to a 'skip' configuration" + ); + return void process.exit(0); + } setForceShowConsoleLogPrefix(true); @@ -34,7 +55,10 @@ export const OnPostInstallCommand: CommandModule = "troubleshooting: " + documentationPageUrls.troubleshooting.RosettaIllegalHardwareInstruction ); - process.exit(1); + if (postinstallConfig === "ignoreFailedBuild") + process.exit(0); + else + process.exit(1); } try { @@ -47,7 +71,25 @@ export const OnPostInstallCommand: CommandModule = process.exit(0); } catch (err) { console.error(err); - process.exit(1); + + const packageManager = detectCurrentPackageManager(); + if (postinstallConfig === "auto" && packageManager === "npm") + console.info( + getConsoleLogPrefix(false, false), + "To disable node-llama-cpp's postinstall for this 'npm install', use the '--node-llama-cpp-postinstall=skip' flag when running 'npm install' command" + ); + + if (postinstallConfig === "auto") + console.info( + getConsoleLogPrefix(false, false), + "To customize node-llama-cpp's postinstall behavior, see the troubleshooting guide: " + + documentationPageUrls.troubleshooting.PostinstallBehavior + ); + + if (postinstallConfig === "ignoreFailedBuild") + process.exit(0); + else + process.exit(1); } } }; diff --git a/src/cli/utils/packageJsonConfig.ts 
b/src/cli/utils/packageJsonConfig.ts new file mode 100644 index 00000000..af08880c --- /dev/null +++ b/src/cli/utils/packageJsonConfig.ts @@ -0,0 +1,69 @@ +import path from "path"; +import fs from "fs-extra"; +import {NodeLlamaCppPostinstallBehavior} from "../../types.js"; + +export async function resolvePackageJsonConfig(startDir: string) { + const currentConfig: Record = {}; + + let currentDirPath = path.resolve(startDir); + while (true) { + const packageJsonPath = path.join(currentDirPath, "package.json"); + try { + if (await fs.pathExists(packageJsonPath)) + applyConfig(currentConfig, await readPackageJsonConfig(packageJsonPath)); + } catch (err) { + // do nothing + } + + const parentDirPath = path.dirname(currentDirPath); + if (parentDirPath === currentDirPath) + break; + + currentDirPath = parentDirPath; + } + + return currentConfig; +} + +export function parsePackageJsonConfig(config: Record) { + const res: NlcPackageJsonConfig = {}; + + const castedConfig = config as NlcPackageJsonConfig; + + if (castedConfig.nodeLlamaCppPostinstall === "auto" || + castedConfig.nodeLlamaCppPostinstall === "ignoreFailedBuild" || + castedConfig.nodeLlamaCppPostinstall === "skip" + ) + res.nodeLlamaCppPostinstall = castedConfig.nodeLlamaCppPostinstall; + else + void (castedConfig.nodeLlamaCppPostinstall satisfies undefined); + + return res; +} + +export type NlcPackageJsonConfig = { + nodeLlamaCppPostinstall?: NodeLlamaCppPostinstallBehavior +}; + +function readPackageJsonConfig(packageJsonPath: string) { + try { + const packageJsonContent = fs.readFileSync(packageJsonPath, "utf8"); + const packageJson = JSON.parse(packageJsonContent); + const config = packageJson?.config; + if (typeof config === "object") + return config; + + return {}; + } catch (err) { + return {}; + } +} + +function applyConfig(baseConfig: Record, newConfig: Record) { + for (const key in newConfig) { + if (key in baseConfig) + continue; + + baseConfig[key] = newConfig[key]; + } +} diff --git 
a/src/cli/utils/packageManager.ts b/src/cli/utils/packageManager.ts new file mode 100644 index 00000000..0b76eef1 --- /dev/null +++ b/src/cli/utils/packageManager.ts @@ -0,0 +1,16 @@ +export function detectCurrentPackageManager(): "npm" | "bun" | "pnpm" | "deno" | "yarn" | undefined { + const userAgent = (process.env["npm_config_user_agent"] ?? "").toLowerCase(); + + if (userAgent.startsWith("bun/")) + return "bun"; + else if (userAgent.startsWith("pnpm/")) + return "pnpm"; + else if (userAgent.startsWith("yarn/")) + return "yarn"; + else if (userAgent.startsWith("deno/")) + return "deno"; + else if (userAgent.startsWith("npm/")) + return "npm"; + + return undefined; +} diff --git a/src/config.ts b/src/config.ts index 631df45c..5337d012 100644 --- a/src/config.ts +++ b/src/config.ts @@ -8,6 +8,7 @@ import {getBinariesGithubRelease} from "./bindings/utils/binariesGithubRelease.j import { nodeLlamaCppGpuOptions, LlamaLogLevel, LlamaLogLevelValues, parseNodeLlamaCppGpuOption, nodeLlamaCppGpuOffStringOptions } from "./bindings/types.js"; +import type {NodeLlamaCppPostinstallBehavior} from "./types.js"; const __dirname = path.dirname(fileURLToPath(import.meta.url)); @@ -75,6 +76,15 @@ export const defaultLlamaCppDebugMode = env.get("NODE_LLAMA_CPP_DEBUG") export const defaultSkipDownload = env.get("NODE_LLAMA_CPP_SKIP_DOWNLOAD") .default("false") .asBool(); + +// set via a `--node-llama-cpp-postinstall=ignoreFailedBuild` flag on an `npm install` command +export const defaultNodeLlamaCppPostinstall = env.get("NODE_LLAMA_CPP_POSTINSTALL") + .default( + env.get("npm_config_node_llama_cpp_postinstall") + .default("auto") + .asEnum(["auto", "ignoreFailedBuild", "skip"] as const satisfies NodeLlamaCppPostinstallBehavior[]) + ) + .asEnum(["auto", "ignoreFailedBuild", "skip"] as const satisfies NodeLlamaCppPostinstallBehavior[]); export const defaultBindingTestLogLevel = env.get("NODE_LLAMA_CPP_BINDING_TEST_LOG_LEVEL") .default(LlamaLogLevel.error) .asEnum(LlamaLogLevelValues); 
@@ -125,7 +135,8 @@ export const documentationPageUrls = { } }, troubleshooting: { - RosettaIllegalHardwareInstruction: documentationUrl + "/guide/troubleshooting#illegal-hardware-instruction" + RosettaIllegalHardwareInstruction: documentationUrl + "/guide/troubleshooting#illegal-hardware-instruction", + PostinstallBehavior: documentationUrl + "/guide/troubleshooting#postinstall-behavior" } } as const; export const newGithubIssueUrl = "https://github.com/withcatai/node-llama-cpp/issues"; diff --git a/src/types.ts b/src/types.ts index 4d24d155..630da6c1 100644 --- a/src/types.ts +++ b/src/types.ts @@ -477,3 +477,5 @@ export type LLamaContextualDryRepeatPenalty = { */ sequenceBreakers?: string[] }; + +export type NodeLlamaCppPostinstallBehavior = "auto" | "ignoreFailedBuild" | "skip"; From 40e911def9fc348b6359c19ca26d46cc759cafca Mon Sep 17 00:00:00 2001 From: Gilad S Date: Mon, 16 Mar 2026 02:48:00 +0200 Subject: [PATCH 02/10] docs: `postinstall` configuration examples --- docs/guide/troubleshooting.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/docs/guide/troubleshooting.md b/docs/guide/troubleshooting.md index 96aeea63..8395256c 100644 --- a/docs/guide/troubleshooting.md +++ b/docs/guide/troubleshooting.md @@ -178,6 +178,8 @@ If you want to customize the `postinstall` behavior, you can do so using any of * Passing the `--node-llama-cpp-postinstall=` flag to the `npm install` command. * Setting the `NODE_LLAMA_CPP_POSTINSTALL` environment variable to `` before running `npm install`. * Configuring `config.nodeLlamaCppPostinstall` on your project's `package.json` to ``. +
+ This will only work when your module is installed globally using `npm -g` or for a non-library project when you run `npm install` in the project root; it will not work when your module is installed as a dependency of another module. Where `` can be one of the following options: * **`auto` (default)**: the default behavior explained above. @@ -187,3 +189,25 @@ Where `` can be one of the following options: * **`skip`**: skip the entire `postinstall` script. If the prebuilt binaries are incompatible with the current machine, using [`getLlama`](../api/functions/getLlama.md) for the first time will attempt building from source by default. + +::: code-group +```shell [npm install flag] +npm install --node-llama-cpp-postinstall=ignoreFailedBuild +``` + +```shell [env var (bash)] +NODE_LLAMA_CPP_POSTINSTALL=ignoreFailedBuild npm install +``` + +```shell [env var (using cross-env)] +npx --yes cross-env NODE_LLAMA_CPP_POSTINSTALL=ignoreFailedBuild npm install +``` + +```json [package.json] +{ + "config": { + "nodeLlamaCppPostinstall": "ignoreFailedBuild" + } +} +``` +::: From e0986fad977f89f2ae037478788a794ca64d3223 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Mon, 16 Mar 2026 02:51:33 +0200 Subject: [PATCH 03/10] fix: bugs --- src/cli/utils/packageJsonConfig.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cli/utils/packageJsonConfig.ts b/src/cli/utils/packageJsonConfig.ts index af08880c..9f4c740e 100644 --- a/src/cli/utils/packageJsonConfig.ts +++ b/src/cli/utils/packageJsonConfig.ts @@ -60,8 +60,8 @@ function readPackageJsonConfig(packageJsonPath: string) { } function applyConfig(baseConfig: Record, newConfig: Record) { - for (const key in newConfig) { - if (key in baseConfig) + for (const key of Object.keys(newConfig)) { + if (Object.hasOwn(baseConfig, key)) continue; baseConfig[key] = newConfig[key]; From 8bf19bcefd0af7422ed7f55aabb35d7baa94dc96 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Mon, 16 Mar 2026 03:09:59 +0200 Subject: [PATCH 
04/10] feat: support `NVFP4` quants

---
 src/gguf/types/GgufMetadataTypes.ts   | 3 ++-
 src/gguf/types/GgufTensorInfoTypes.ts | 3 ++-
 src/gguf/utils/ggufQuantNames.ts      | 1 +
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/gguf/types/GgufMetadataTypes.ts b/src/gguf/types/GgufMetadataTypes.ts
index b4c4b458..a2cd41b9 100644
--- a/src/gguf/types/GgufMetadataTypes.ts
+++ b/src/gguf/types/GgufMetadataTypes.ts
@@ -193,7 +193,8 @@ export enum GgufFileType {
     MOSTLY_Q4_0_8_8 = 35, // deprecated
     MOSTLY_TQ1_0 = 36,
     MOSTLY_TQ2_0 = 37,
-    MOSTLY_MXFP4_MOE = 38
+    MOSTLY_MXFP4_MOE = 38,
+    MOSTLY_NVFP4 = 39
 }
 
 
diff --git a/src/gguf/types/GgufTensorInfoTypes.ts b/src/gguf/types/GgufTensorInfoTypes.ts
index ed750329..1ada8204 100644
--- a/src/gguf/types/GgufTensorInfoTypes.ts
+++ b/src/gguf/types/GgufTensorInfoTypes.ts
@@ -60,5 +60,6 @@ export const enum GgmlType {
     IQ4_NL_4_4 = 36,
     IQ4_NL_4_8 = 37,
     IQ4_NL_8_8 = 38,
-    MXFP4 = 39 // MXFP4 (1 block)
+    MXFP4 = 39, // MXFP4 (1 block)
+    NVFP4 = 40 // NVFP4 (4 blocks, E4M3 scale)
 }
diff --git a/src/gguf/utils/ggufQuantNames.ts b/src/gguf/utils/ggufQuantNames.ts
index abff8a8f..3e2c5c65 100644
--- a/src/gguf/utils/ggufQuantNames.ts
+++ b/src/gguf/utils/ggufQuantNames.ts
@@ -4,6 +4,7 @@ export const ggufQuantNames = new Map([
     ["Q4_0", GgufFileType.MOSTLY_Q4_0],
     ["Q4_1", GgufFileType.MOSTLY_Q4_1],
     ["MXFP4", GgufFileType.MOSTLY_MXFP4_MOE],
+    ["NVFP4", GgufFileType.MOSTLY_NVFP4],
     ["Q5_0", GgufFileType.MOSTLY_Q5_0],
     ["Q5_1", GgufFileType.MOSTLY_Q5_1],
     ["IQ2_XXS", GgufFileType.MOSTLY_IQ2_XXS],
From 4a4701e5e9154497e211b00b8479c48ee81206de Mon Sep 17 00:00:00 2001
From: Gilad S
Date: Mon, 16 Mar 2026 06:11:21 +0200
Subject: [PATCH 05/10] docs: fix typo

---
 docs/guide/troubleshooting.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/guide/troubleshooting.md b/docs/guide/troubleshooting.md
index 8395256c..71865b1b 100644
--- a/docs/guide/troubleshooting.md
+++ b/docs/guide/troubleshooting.md
@@ -179,7 +179,7 @@ 
If you want to customize the `postinstall` behavior, you can do so using any of * Setting the `NODE_LLAMA_CPP_POSTINSTALL` environment variable to `` before running `npm install`. * Configuring `config.nodeLlamaCppPostinstall` on your project's `package.json` to ``.
- This will only work when your module is installed globally using `npm -g` or for a non-library project when you run `npm install` in the project root; it will not work when your module is installed as a dependency of another module. + This will only work when your module is installed globally using `npm install -g` or for a non-library project when you run `npm install` in the project root; it will not work when your module is installed as a dependency of another module. Where `` can be one of the following options: * **`auto` (default)**: the default behavior explained above. From 1257846f31a957cbe8b3399b4e4fcb0604df968d Mon Sep 17 00:00:00 2001 From: Gilad S Date: Mon, 16 Mar 2026 23:48:38 +0200 Subject: [PATCH 06/10] fix: bugs --- src/cli/utils/packageJsonConfig.ts | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/cli/utils/packageJsonConfig.ts b/src/cli/utils/packageJsonConfig.ts index 9f4c740e..f9ed820d 100644 --- a/src/cli/utils/packageJsonConfig.ts +++ b/src/cli/utils/packageJsonConfig.ts @@ -7,13 +7,7 @@ export async function resolvePackageJsonConfig(startDir: string) { let currentDirPath = path.resolve(startDir); while (true) { - const packageJsonPath = path.join(currentDirPath, "package.json"); - try { - if (await fs.pathExists(packageJsonPath)) - applyConfig(currentConfig, await readPackageJsonConfig(packageJsonPath)); - } catch (err) { - // do nothing - } + applyConfig(currentConfig, await readPackageJsonConfig(path.join(currentDirPath, "package.json"))); const parentDirPath = path.dirname(currentDirPath); if (parentDirPath === currentDirPath) @@ -22,6 +16,10 @@ export async function resolvePackageJsonConfig(startDir: string) { currentDirPath = parentDirPath; } + const npmPackageJsonPath = process.env["npm_package_json"] ?? 
""; + if (npmPackageJsonPath !== "") + applyConfig(currentConfig, await readPackageJsonConfig(npmPackageJsonPath)); + return currentConfig; } @@ -45,9 +43,12 @@ export type NlcPackageJsonConfig = { nodeLlamaCppPostinstall?: NodeLlamaCppPostinstallBehavior }; -function readPackageJsonConfig(packageJsonPath: string) { +async function readPackageJsonConfig(packageJsonPath: string) { try { - const packageJsonContent = fs.readFileSync(packageJsonPath, "utf8"); + if (!(await fs.pathExists(packageJsonPath))) + return {}; + + const packageJsonContent = await fs.readFile(packageJsonPath, "utf8"); const packageJson = JSON.parse(packageJsonContent); const config = packageJson?.config; if (typeof config === "object") From 47b678b9090396d0a8ac52178a5ad6c926a6f23c Mon Sep 17 00:00:00 2001 From: Gilad S Date: Tue, 17 Mar 2026 06:53:32 +0200 Subject: [PATCH 07/10] feat: context kv cache key and value type configurations --- llama/addon/AddonContext.cpp | 20 +++++- src/bindings/AddonTypes.ts | 2 + src/cli/commands/ChatCommand.ts | 37 ++++++++-- src/cli/commands/CompleteCommand.ts | 39 +++++++++-- src/cli/commands/InfillCommand.ts | 38 +++++++++-- .../commands/InspectEstimateCommand.ts | 30 +++++++- .../inspect/commands/InspectGgufCommand.ts | 9 ++- .../inspect/commands/InspectMeasureCommand.ts | 68 ++++++++++++++++--- src/cli/utils/interactivelyAskForModel.ts | 41 ++++++++--- src/cli/utils/printCommonInfoLines.ts | 9 +++ src/cli/utils/resolveCommandGgufPath.ts | 16 ++++- src/evaluator/LlamaContext/LlamaContext.ts | 35 +++++++++- src/evaluator/LlamaContext/types.ts | 23 ++++++- src/evaluator/LlamaModel/LlamaModel.ts | 51 +++++++++++++- src/gguf/insights/GgufInsights.ts | 64 ++++++++++++----- .../GgufInsightsConfigurationResolver.ts | 32 ++++++++- .../utils/resolveContextContextSizeOption.ts | 12 +++- .../utils/resolveModelGpuLayersOption.ts | 37 ++++++++-- src/gguf/types/GgufTensorInfoTypes.ts | 14 +++- 19 files changed, 508 insertions(+), 69 deletions(-) diff --git 
a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp index 2b70c08f..b8a22ef0 100644 --- a/llama/addon/AddonContext.cpp +++ b/llama/addon/AddonContext.cpp @@ -443,6 +443,20 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap().Value()); } + if (options.Has("kvCacheKeyType") && options.Get("kvCacheKeyType").IsNumber()) { + auto keyType = options.Get("kvCacheKeyType").As().Int32Value(); + if (keyType >= 0 && keyType < GGML_TYPE_COUNT) { + context_params.type_k = keyType; + } + } + + if (options.Has("kvCacheValueType") && options.Get("kvCacheValueType").IsNumber()) { + auto valueType = options.Get("kvCacheValueType").As().Int32Value(); + if (valueType >= 0 && valueType < GGML_TYPE_COUNT) { + context_params.type_v = valueType; + } + } + if (options.Has("swaFullCache")) { context_params.swa_full = options.Get("swaFullCache").As().Value(); } @@ -1063,7 +1077,7 @@ void AddonContext::init(Napi::Object exports) { } AddonContextSequenceCheckpoint::AddonContextSequenceCheckpoint(const Napi::CallbackInfo& info) : Napi::ObjectWrap(info) { - + } AddonContextSequenceCheckpoint::~AddonContextSequenceCheckpoint() { dispose(); @@ -1099,7 +1113,7 @@ class AddonContextSequenceCheckpointInitWorker : public Napi::AsyncWorker { checkpoint->minPos = llama_memory_seq_pos_min(llama_get_memory(context->ctx), checkpoint->sequenceId); checkpoint->maxPos = llama_memory_seq_pos_max(llama_get_memory(context->ctx), checkpoint->sequenceId); const size_t checkpointSize = llama_state_seq_get_size_ext(context->ctx, checkpoint->sequenceId, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); - + checkpoint->data.resize(checkpointSize, 0); llama_state_seq_get_data_ext(context->ctx, checkpoint->data.data(), checkpointSize, checkpoint->sequenceId, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); } catch (const std::exception& e) { @@ -1164,4 +1178,4 @@ void AddonContextSequenceCheckpoint::init(Napi::Object exports) { } ) ); -} \ No newline at end of file +} diff --git 
a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts index 9fbada1d..294749d5 100644 --- a/src/bindings/AddonTypes.ts +++ b/src/bindings/AddonTypes.ts @@ -31,6 +31,8 @@ export type BindingModule = { ranking?: boolean, threads?: number, performanceTracking?: boolean, + kvCacheKeyType?: number, + kvCacheValueType?: number, swaFullCache?: boolean }): AddonContext }, diff --git a/src/cli/commands/ChatCommand.ts b/src/cli/commands/ChatCommand.ts index 96227f65..d1dca381 100644 --- a/src/cli/commands/ChatCommand.ts +++ b/src/cli/commands/ChatCommand.ts @@ -31,6 +31,7 @@ import {withCliCommandDescriptionDocsUrl} from "../utils/withCliCommandDescripti import {ConsoleInteraction, ConsoleInteractionKey} from "../utils/ConsoleInteraction.js"; import {DraftSequenceTokenPredictor} from "../../evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js"; import {ParsedXtcArg, parseXtcArg} from "../utils/parseXtcArg.js"; +import {GgmlType} from "../../gguf/types/GgufTensorInfoTypes.js"; type ChatCommand = { modelPath?: string, @@ -46,6 +47,8 @@ type ChatCommand = { contextSize?: number, batchSize?: number, flashAttention?: boolean, + kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, + kvCacheValueType?: "currentQuant" | keyof typeof GgmlType, swaFullCache?: boolean, noTrimWhitespace: boolean, grammar: "text" | Parameters[1], @@ -172,6 +175,24 @@ export const ChatCommand: CommandModule = { default: false, description: "Enable flash attention" }) + .option("kvCacheKeyType", { + alias: "kvckt", + type: "string", + choices: [ + "currentQuant", + ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ] as const, + description: "The type of the key for the context KV cache tensors" + }) + .option("kvCacheValueType", { + alias: "kvcvt", + type: "string", + choices: [ + "currentQuant", + ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ] as const, + description: "The type of 
the value for the context KV cache tensors" + }) .option("swaFullCache", { alias: "noSwa", type: "boolean", @@ -379,7 +400,7 @@ export const ChatCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, - promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, swaFullCache, + promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, @@ -390,8 +411,8 @@ export const ChatCommand: CommandModule = { try { await RunChat({ modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, contextSize, - batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, - temperature, minP, topK, topP, seed, xtc, + batchSize, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, + threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, @@ -408,7 +429,7 @@ export const ChatCommand: CommandModule = { async function RunChat({ modelPath: modelArg, header: headerArg, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, - contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg, + contextSize, batchSize, 
kvCacheKeyType, kvCacheValueType, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg, jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, @@ -444,12 +465,16 @@ async function RunChat({ const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, swaFullCache, + kvCacheKeyType, + kvCacheValueType, useMmap }); const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "") ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, { flashAttention, swaFullCache, + kvCacheKeyType, + kvCacheValueType, useMmap, consoleTitle: "Draft model file" }) @@ -495,6 +520,8 @@ async function RunChat({ ? {fitContext: {contextSize}} : undefined, defaultContextFlashAttention: flashAttention, + defaultContextKvCacheKeyType: kvCacheKeyType, + defaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, @@ -530,6 +557,8 @@ async function RunChat({ return await llama.loadModel({ modelPath: resolvedDraftModelPath, defaultContextFlashAttention: flashAttention, + defaultContextKvCacheKeyType: kvCacheKeyType, + defaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, diff --git a/src/cli/commands/CompleteCommand.ts b/src/cli/commands/CompleteCommand.ts index aeeb7117..e5711c7e 100644 --- a/src/cli/commands/CompleteCommand.ts +++ b/src/cli/commands/CompleteCommand.ts @@ -23,6 +23,7 @@ import {documentationPageUrls} from "../../config.js"; import {ConsoleInteraction, ConsoleInteractionKey} from "../utils/ConsoleInteraction.js"; import {DraftSequenceTokenPredictor} from 
"../../evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js"; import {ParsedXtcArg, parseXtcArg} from "../utils/parseXtcArg.js"; +import {GgmlType} from "../../gguf/types/GgufTensorInfoTypes.js"; type CompleteCommand = { modelPath?: string, @@ -34,6 +35,8 @@ type CompleteCommand = { contextSize?: number, batchSize?: number, flashAttention?: boolean, + kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, + kvCacheValueType?: "currentQuant" | keyof typeof GgmlType, swaFullCache?: boolean, threads?: number, temperature: number, @@ -129,6 +132,24 @@ export const CompleteCommand: CommandModule = { default: false, description: "Enable flash attention" }) + .option("kvCacheKeyType", { + alias: "kvckt", + type: "string", + choices: [ + "currentQuant", + ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ] as const, + description: "The type of the key for the context KV cache tensors" + }) + .option("kvCacheValueType", { + alias: "kvcvt", + type: "string", + choices: [ + "currentQuant", + ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ] as const, + description: "The type of the value for the context KV cache tensors" + }) .option("swaFullCache", { alias: "noSwa", type: "boolean", @@ -299,7 +320,7 @@ export const CompleteCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, - flashAttention, swaFullCache, threads, temperature, minP, topK, + flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, @@ -307,7 +328,8 @@ export const 
CompleteCommand: CommandModule = { }) { try { await RunCompletion({ - modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache, + modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, + kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, @@ -323,7 +345,8 @@ export const CompleteCommand: CommandModule = { async function RunCompletion({ - modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache, + modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, + kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, @@ -356,13 +379,17 @@ async function RunCompletion({ const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, swaFullCache, - useMmap + useMmap, + kvCacheKeyType, + kvCacheValueType }); const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "") ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, { flashAttention, swaFullCache, useMmap, + kvCacheKeyType, + kvCacheValueType, consoleTitle: "Draft model file" }) : undefined; @@ -400,6 +427,8 @@ async function RunCompletion({ ? 
{fitContext: {contextSize}} : undefined, defaultContextFlashAttention: flashAttention, + defaultContextKvCacheKeyType: kvCacheKeyType, + defaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, @@ -435,6 +464,8 @@ async function RunCompletion({ return await llama.loadModel({ modelPath: resolvedDraftModelPath, defaultContextFlashAttention: flashAttention, + defaultContextKvCacheKeyType: kvCacheKeyType, + defaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, diff --git a/src/cli/commands/InfillCommand.ts b/src/cli/commands/InfillCommand.ts index 5a69a3a7..6b8eabea 100644 --- a/src/cli/commands/InfillCommand.ts +++ b/src/cli/commands/InfillCommand.ts @@ -23,6 +23,7 @@ import {documentationPageUrls} from "../../config.js"; import {ConsoleInteraction, ConsoleInteractionKey} from "../utils/ConsoleInteraction.js"; import {DraftSequenceTokenPredictor} from "../../evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js"; import {ParsedXtcArg, parseXtcArg} from "../utils/parseXtcArg.js"; +import {GgmlType} from "../../gguf/types/GgufTensorInfoTypes.js"; type InfillCommand = { modelPath?: string, @@ -36,6 +37,8 @@ type InfillCommand = { contextSize?: number, batchSize?: number, flashAttention?: boolean, + kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, + kvCacheValueType?: "currentQuant" | keyof typeof GgmlType, swaFullCache?: boolean, threads?: number, temperature: number, @@ -139,6 +142,24 @@ export const InfillCommand: CommandModule = { default: false, description: "Enable flash attention" }) + .option("kvCacheKeyType", { + alias: "kvckt", + type: "string", + choices: [ + "currentQuant", + ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ] as const, + description: "The type of the key for the context KV cache tensors" + }) + .option("kvCacheValueType", { + alias: "kvcvt", + type: 
"string", + choices: [ + "currentQuant", + ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ] as const, + description: "The type of the value for the context KV cache tensors" + }) .option("swaFullCache", { alias: "noSwa", type: "boolean", @@ -309,7 +330,7 @@ export const InfillCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, - flashAttention, swaFullCache, threads, temperature, minP, topK, + flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, @@ -318,7 +339,8 @@ export const InfillCommand: CommandModule = { try { await RunInfill({ modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, - swaFullCache, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, + kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, + lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, useDirectIo, printTimings @@ -334,7 +356,7 @@ export const InfillCommand: CommandModule = { async function RunInfill({ modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, - swaFullCache, threads, 
temperature, minP, topK, topP, seed, xtc, gpuLayers, + kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, useDirectIo, printTimings @@ -366,13 +388,17 @@ async function RunInfill({ const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, swaFullCache, - useMmap + useMmap, + kvCacheKeyType, + kvCacheValueType }); const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "") ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, { flashAttention, swaFullCache, useMmap, + kvCacheKeyType, + kvCacheValueType, consoleTitle: "Draft model file" }) : undefined; @@ -424,6 +450,8 @@ async function RunInfill({ ? 
{fitContext: {contextSize}} : undefined, defaultContextFlashAttention: flashAttention, + defaultContextKvCacheKeyType: kvCacheKeyType, + defaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, @@ -459,6 +487,8 @@ async function RunInfill({ return await llama.loadModel({ modelPath: resolvedDraftModelPath, defaultContextFlashAttention: flashAttention, + defaultContextKvCacheKeyType: kvCacheKeyType, + defaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, diff --git a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts index ffd5f65e..baaa2a5f 100644 --- a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts +++ b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts @@ -24,6 +24,7 @@ import {printModelDestination} from "../../../utils/printModelDestination.js"; import {toBytes} from "../../../utils/toBytes.js"; import {printDidYouMeanUri} from "../../../utils/resolveCommandGgufPath.js"; import {isModelUri} from "../../../../utils/parseModelUri.js"; +import {GgmlType, resolveGgmlTypeOption} from "../../../../gguf/types/GgufTensorInfoTypes.js"; type InspectEstimateCommand = { modelPath: string, @@ -33,6 +34,8 @@ type InspectEstimateCommand = { contextSize?: number | "train", embedding?: boolean, noMmap?: boolean, + kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, + kvCacheValueType?: "currentQuant" | keyof typeof GgmlType, swaFullCache?: boolean }; @@ -117,6 +120,24 @@ export const InspectEstimateCommand: CommandModule typeof key === "string") as (keyof typeof GgmlType)[] + ] as const, + description: "The type of the key for the context KV cache tensors" + }) + .option("kvCacheValueType", { + alias: "kvcvt", + type: "string", + choices: [ + "currentQuant", + ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ] as const, 
+ description: "The type of the value for the context KV cache tensors" + }) .option("swaFullCache", { alias: "noSwa", type: "boolean", @@ -125,7 +146,8 @@ export const InspectEstimateCommand: CommandModule = { if (parsedMetadata.splicedParts > 1) console.info(`${chalk.yellow("Spliced parts:")} ${parsedMetadata.splicedParts}`); + const dominantTensorType = getDominantTensorType(parsedMetadata.fullTensorInfo ?? []); + console.info(`${chalk.yellow("GGUF version:")} ${parsedMetadata.version}`); console.info(`${chalk.yellow("Tensor count:")} ${parsedMetadata.totalTensorCount.toLocaleString("en-US", numberLocaleFormattingOptions)}`); console.info(`${chalk.yellow("Metadata size:")} ${toBytes(parsedMetadata.totalMetadataSize)}`); console.info(`${chalk.yellow("Tensor info size:")} ${toBytes(parsedMetadata.totalTensorInfoSize!)}`); console.info(`${chalk.yellow("File type:")} ${fileTypeName ?? ""} ${chalk.white(`(${parsedMetadata.metadata.general?.file_type})`)}`); + + if (dominantTensorType != null) + console.info(`${chalk.yellow("Dominant tensor type:")} ${dominantTensorType} (${GgmlType[dominantTensorType]})`); + console.info(`${chalk.yellow("Metadata:")} ${prettyPrintObject(parsedMetadata.metadata, undefined, metadataPrettyPrintOptions)}`); console.info(`${chalk.yellow("Tensor info:")} ${prettyPrintObject(parsedMetadata.fullTensorInfo, undefined, tensorInfoPrettyPrintOptions)}`); } diff --git a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts index 482353f6..dd5fb29a 100644 --- a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts +++ b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts @@ -23,6 +23,7 @@ import {Llama} from "../../../../bindings/Llama.js"; import {toBytes} from "../../../utils/toBytes.js"; import {padSafeContextSize} from "../../../../evaluator/LlamaContext/utils/padSafeContextSize.js"; import {getPlatform} from "../../../../bindings/utils/getPlatform.js"; +import 
{GgmlType, resolveGgmlTypeOption} from "../../../../gguf/types/GgufTensorInfoTypes.js"; type InspectMeasureCommand = { modelPath?: string, @@ -33,6 +34,8 @@ type InspectMeasureCommand = { minContextSize: number, maxContextSize?: number, flashAttention?: boolean, + kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, + kvCacheValueType?: "currentQuant" | keyof typeof GgmlType, swaFullCache?: boolean, batchSize?: number, measures: number, @@ -109,6 +112,24 @@ export const InspectMeasureCommand: CommandModule default: false, description: "Enable flash attention for the context" }) + .option("kvCacheKeyType", { + alias: "kvckt", + type: "string", + choices: [ + "currentQuant", + ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ] as const, + description: "The type of the key for the context KV cache tensors" + }) + .option("kvCacheValueType", { + alias: "kvcvt", + type: "string", + choices: [ + "currentQuant", + ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ] as const, + description: "The type of the value for the context KV cache tensors" + }) .option("swaFullCache", { alias: "noSwa", type: "boolean", @@ -161,7 +182,8 @@ export const InspectMeasureCommand: CommandModule }); }, async handler({ - modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, swaFullCache, + modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, + kvCacheKeyType, kvCacheValueType, swaFullCache, batchSize, measures = 10, memory: measureMemoryType, noMmap, noDirectIo, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText }: InspectMeasureCommand) { @@ -186,7 +208,7 @@ export const InspectMeasureCommand: CommandModule const useMmap = !noMmap && llama.supportsMmap; const useDirectIo = !noDirectIo; const resolvedGgufPath = await resolveCommandGgufPath(ggufPath, llama, 
headers, { - flashAttention, swaFullCache, useMmap + flashAttention, swaFullCache, useMmap, kvCacheKeyType, kvCacheValueType }); console.info(`${chalk.yellow("File:")} ${getReadablePath(resolvedGgufPath)}`); @@ -221,6 +243,16 @@ export const InspectMeasureCommand: CommandModule let lastGpuLayers = maxLayers ?? ggufInsights.totalLayers; let previousContextSizeCheck: undefined | number = undefined; + const resolvedKvCacheKeyType = kvCacheKeyType === "currentQuant" + ? ggufInsights.dominantTensorType ?? GgmlType.F16 + : resolveGgmlTypeOption(kvCacheKeyType) ?? GgmlType.F16; + const resolvedKvCacheValueType = kvCacheValueType === "currentQuant" + ? ggufInsights.dominantTensorType ?? GgmlType.F16 + : resolveGgmlTypeOption(kvCacheValueType) ?? GgmlType.F16; + + if (resolvedKvCacheKeyType != GgmlType.F16 || resolvedKvCacheValueType != GgmlType.F16) + console.info(`${chalk.yellow("KV cache:")} ${GgmlType[resolvedKvCacheKeyType] + " " + GgmlType[resolvedKvCacheValueType]}`); + const measureTable = getMeasureTable(measureMemoryType); measureTable.logHeader({drawRowSeparator: !printHeaderBeforeEachLayer}); @@ -249,6 +281,8 @@ export const InspectMeasureCommand: CommandModule maxContextSize, minContextSize, flashAttention, + kvCacheKeyType: resolvedKvCacheKeyType, + kvCacheValueType: resolvedKvCacheValueType, swaFullCache, batchSize, tests: measures, @@ -533,7 +567,7 @@ const expectedFileName = "InspectMeasureCommand"; async function measureModel({ modelPath, useMmap, useDirectIo, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, - flashAttention, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false, onInfo + flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false, onInfo }: { modelPath: string, useMmap?: boolean, @@ -546,6 +580,8 @@ async function measureModel({ maxGpuLayers: number, minGpuLayers?: number, flashAttention?: boolean, + kvCacheKeyType?: 
GgmlType, + kvCacheValueType?: GgmlType, swaFullCache?: boolean, batchSize?: number, evaluateText?: string, @@ -656,6 +692,8 @@ async function measureModel({ maxGpuLayers, minGpuLayers, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache, batchSize, evaluateText, @@ -759,11 +797,12 @@ async function runTestWorkerLogic() { } async function testContextSizes({ - model, modelVramUsage, modelRamUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, swaFullCache, - batchSize, evaluateText, exitAfterMeasurement = false + model, modelVramUsage, modelRamUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, + kvCacheKeyType, kvCacheValueType, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false }: { model: LlamaModel, modelVramUsage: number, modelRamUsage: number, startContextSize?: number, maxContextSize?: number, - minContextSize?: number, tests: number, flashAttention?: boolean, swaFullCache?: boolean, batchSize?: number, evaluateText?: string, + minContextSize?: number, tests: number, flashAttention?: boolean, kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, + swaFullCache?: boolean, batchSize?: number, evaluateText?: string, exitAfterMeasurement?: boolean }) { let measurementsDone: number = 0; @@ -794,6 +833,8 @@ async function runTestWorkerLogic() { ), ignoreMemorySafetyChecks: currentContextSizeCheck != null, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache, batchSize, failedCreationRemedy: false @@ -849,11 +890,12 @@ async function runTestWorkerLogic() { } async function testWithGpuLayers({ - modelPath, useMmap, useDirectIo, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, swaFullCache, - batchSize, evaluateText, exitAfterMeasurement = false + modelPath, useMmap, useDirectIo, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, + kvCacheKeyType, kvCacheValueType, swaFullCache, batchSize, 
evaluateText, exitAfterMeasurement = false }: { modelPath: string, useMmap?: boolean, useDirectIo?: boolean, gpuLayers: number, tests: number, startContextSize?: number, - maxContextSize?: number, minContextSize?: number, flashAttention?: boolean, swaFullCache?: boolean, batchSize?: number, + maxContextSize?: number, minContextSize?: number, flashAttention?: boolean, kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, + swaFullCache?: boolean, batchSize?: number, evaluateText?: string, exitAfterMeasurement?: boolean }) { try { @@ -865,6 +907,8 @@ async function runTestWorkerLogic() { useDirectIo, gpuLayers, defaultContextFlashAttention: flashAttention, + defaultContextKvCacheKeyType: kvCacheKeyType, + defaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, ignoreMemorySafetyChecks: true }); @@ -888,6 +932,8 @@ async function runTestWorkerLogic() { maxContextSize, minContextSize, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache, batchSize, tests, @@ -939,6 +985,8 @@ async function runTestWorkerLogic() { maxContextSize: message.maxContextSize, minContextSize: message.minContextSize, flashAttention: message.flashAttention, + kvCacheKeyType: message.kvCacheKeyType, + kvCacheValueType: message.kvCacheValueType, swaFullCache: message.swaFullCache, batchSize: message.batchSize, evaluateText: message.evaluateText, @@ -1033,6 +1081,8 @@ type ParentToChildMessage = { maxGpuLayers: number, minGpuLayers?: number, flashAttention?: boolean, + kvCacheKeyType?: GgmlType, + kvCacheValueType?: GgmlType, swaFullCache?: boolean, batchSize?: number, initialMaxContextSize?: number, diff --git a/src/cli/utils/interactivelyAskForModel.ts b/src/cli/utils/interactivelyAskForModel.ts index 8238daec..bd1cfb71 100644 --- a/src/cli/utils/interactivelyAskForModel.ts +++ b/src/cli/utils/interactivelyAskForModel.ts @@ -15,6 +15,7 @@ import {getPrettyBuildGpuName} from "../../bindings/consts.js"; import {GgufInsightsConfigurationResolver} 
from "../../gguf/insights/GgufInsightsConfigurationResolver.js"; import {isUrl} from "../../utils/isUrl.js"; import {isModelUri, parseModelUri} from "../../utils/parseModelUri.js"; +import {GgmlType} from "../../gguf/types/GgufTensorInfoTypes.js"; import {resolveModelRecommendationFileOptions} from "./resolveModelRecommendationFileOptions.js"; import {getReadablePath} from "./getReadablePath.js"; import {basicChooseFromListConsoleInteraction} from "./basicChooseFromListConsoleInteraction.js"; @@ -61,7 +62,9 @@ export async function interactivelyAskForModel({ downloadIntent = true, flashAttention = false, swaFullCache = false, - useMmap + useMmap, + kvCacheKeyType, + kvCacheValueType }: { llama: Llama, modelsDirectory?: string, @@ -69,7 +72,9 @@ export async function interactivelyAskForModel({ downloadIntent?: boolean, flashAttention?: boolean, swaFullCache?: boolean, - useMmap?: boolean + useMmap?: boolean, + kvCacheKeyType?: "currentQuant" | GgmlType, + kvCacheValueType?: "currentQuant" | GgmlType }): Promise { let localModelFileOptions: (ModelOption & {type: "localModel"})[] = []; const recommendedModelOptions: (ModelOption & {type: "recommendedModel"})[] = []; @@ -123,7 +128,13 @@ export async function interactivelyAskForModel({ const compatibilityScore = await ggufInsights?.configurationResolver.scoreModelConfigurationCompatibility({ flashAttention: flashAttention && ggufInsights?.flashAttentionSupported, swaFullCache, - useMmap + useMmap, + kvCacheKeyType: kvCacheKeyType === "currentQuant" + ? ggufInsights?.dominantTensorType + : kvCacheKeyType, + kvCacheValueType: kvCacheValueType === "currentQuant" + ? 
ggufInsights?.dominantTensorType + : kvCacheValueType }); return { @@ -296,7 +307,8 @@ export async function interactivelyAskForModel({ items: options, renderItem(item, focused, rerender) { return renderSelectionItem( - item, focused, rerender, activeInteractionController.signal, llama, flashAttention, swaFullCache, useMmap + item, focused, rerender, activeInteractionController.signal, llama, flashAttention, swaFullCache, useMmap, + kvCacheKeyType, kvCacheValueType ); }, canFocusItem(item) { @@ -413,7 +425,8 @@ async function askForModelUriOrPath(allowLocalModels: boolean): Promise void, abortSignal: AbortSignal, llama: Llama, flashAttention: boolean, - swaFullCache: boolean, useMmap?: boolean + swaFullCache: boolean, useMmap: boolean | undefined, + kvCacheKeyType?: "currentQuant" | GgmlType, kvCacheValueType?: "currentQuant" | GgmlType ) { if (item.type === "localModel") { let modelText = item.title instanceof Function @@ -441,7 +454,9 @@ function renderSelectionItem( llama, flashAttention, swaFullCache, - useMmap + useMmap, + kvCacheKeyType, + kvCacheValueType }); } @@ -563,7 +578,7 @@ function renderRecommendedModelTechnicalInfo( } async function selectFileForModelRecommendation({ - recommendedModelOption, llama, abortSignal, rerenderOption, flashAttention, swaFullCache, useMmap + recommendedModelOption, llama, abortSignal, rerenderOption, flashAttention, swaFullCache, useMmap, kvCacheKeyType, kvCacheValueType }: { recommendedModelOption: ModelOption & {type: "recommendedModel"}, llama: Llama, @@ -571,7 +586,9 @@ async function selectFileForModelRecommendation({ rerenderOption(): void, flashAttention: boolean, swaFullCache: boolean, - useMmap?: boolean + useMmap?: boolean, + kvCacheKeyType?: "currentQuant" | GgmlType, + kvCacheValueType?: "currentQuant" | GgmlType }) { try { let bestScore: number | undefined = undefined; @@ -594,7 +611,13 @@ async function selectFileForModelRecommendation({ const compatibilityScore = await 
ggufInsights.configurationResolver.scoreModelConfigurationCompatibility({ flashAttention, swaFullCache, - useMmap + useMmap, + kvCacheKeyType: kvCacheKeyType === "currentQuant" + ? ggufInsights.dominantTensorType + : kvCacheKeyType, + kvCacheValueType: kvCacheValueType === "currentQuant" + ? ggufInsights.dominantTensorType + : kvCacheValueType }); if (bestScore == null || compatibilityScore.compatibilityScore > bestScore) { diff --git a/src/cli/utils/printCommonInfoLines.ts b/src/cli/utils/printCommonInfoLines.ts index 983a1056..47a81c42 100644 --- a/src/cli/utils/printCommonInfoLines.ts +++ b/src/cli/utils/printCommonInfoLines.ts @@ -2,6 +2,7 @@ import chalk from "chalk"; import {getPrettyBuildGpuName} from "../../bindings/consts.js"; import {LlamaContext} from "../../evaluator/LlamaContext/LlamaContext.js"; import {getPlatform} from "../../bindings/utils/getPlatform.js"; +import {GgmlType} from "../../gguf/types/GgufTensorInfoTypes.js"; import {printInfoLine} from "./printInfoLine.js"; import {toBytes} from "./toBytes.js"; @@ -125,6 +126,10 @@ export async function printCommonInfoLines({ show: tokenMeterEnabled, title: "Token meter", value: "enabled" + }, { + show: context.kvCacheKeyType !== GgmlType.F16 || context.kvCacheValueType !== GgmlType.F16, + title: "KV cache", + value: GgmlType[context.kvCacheKeyType] + " " + GgmlType[context.kvCacheValueType] }] }); @@ -180,6 +185,10 @@ export async function printCommonInfoLines({ show: tokenMeterEnabled, title: "Token meter", value: "enabled" + }, { + show: draftContext.kvCacheKeyType !== GgmlType.F16 || draftContext.kvCacheValueType !== GgmlType.F16, + title: "KV cache", + value: GgmlType[draftContext.kvCacheKeyType] + " " + GgmlType[draftContext.kvCacheValueType] }] }); } diff --git a/src/cli/utils/resolveCommandGgufPath.ts b/src/cli/utils/resolveCommandGgufPath.ts index 219d1808..a11cfa96 100644 --- a/src/cli/utils/resolveCommandGgufPath.ts +++ b/src/cli/utils/resolveCommandGgufPath.ts @@ -8,14 +8,18 @@ import 
{resolveModelDestination} from "../../utils/resolveModelDestination.js"; import {ggufQuantNames} from "../../gguf/utils/ggufQuantNames.js"; import {getConsoleLogPrefix} from "../../utils/getConsoleLogPrefix.js"; import {isModelUri} from "../../utils/parseModelUri.js"; +import {GgmlType, resolveGgmlTypeOption} from "../../gguf/types/GgufTensorInfoTypes.js"; import {ConsoleInteraction, ConsoleInteractionKey} from "./ConsoleInteraction.js"; import {getReadablePath} from "./getReadablePath.js"; import {interactivelyAskForModel} from "./interactivelyAskForModel.js"; export async function resolveCommandGgufPath(ggufPath: string | undefined, llama: Llama, fetchHeaders?: Record, { - targetDirectory = cliModelsDirectory, flashAttention = false, swaFullCache = false, useMmap, consoleTitle = "File" + targetDirectory = cliModelsDirectory, flashAttention = false, swaFullCache = false, useMmap, consoleTitle = "File", + kvCacheKeyType, kvCacheValueType }: { - targetDirectory?: string, flashAttention?: boolean, swaFullCache?: boolean, useMmap?: boolean, consoleTitle?: string + targetDirectory?: string, flashAttention?: boolean, swaFullCache?: boolean, useMmap?: boolean, consoleTitle?: string, + kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, + kvCacheValueType?: "currentQuant" | keyof typeof GgmlType } = {}) { if (ggufPath == null) ggufPath = await interactivelyAskForModel({ @@ -25,7 +29,13 @@ export async function resolveCommandGgufPath(ggufPath: string | undefined, llama downloadIntent: true, flashAttention, swaFullCache, - useMmap + useMmap, + kvCacheKeyType: kvCacheKeyType === "currentQuant" + ? "currentQuant" + : resolveGgmlTypeOption(kvCacheKeyType), + kvCacheValueType: kvCacheValueType === "currentQuant" + ? 
"currentQuant" + : resolveGgmlTypeOption(kvCacheValueType) }); const resolvedModelDestination = resolveModelDestination(ggufPath); diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index 43b205cc..7d013a51 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -15,6 +15,7 @@ import {pushAll} from "../../utils/pushAll.js"; import {safeEventCallback} from "../../utils/safeEventCallback.js"; import {GgufArchitectureType} from "../../gguf/types/GgufMetadataTypes.js"; import {LlamaLogLevel} from "../../bindings/types.js"; +import {GgmlType, resolveGgmlTypeOption} from "../../gguf/types/GgufTensorInfoTypes.js"; import { BatchingOptions, BatchItem, ContextShiftOptions, ContextTokensDeleteRange, ControlledEvaluateIndexOutput, ControlledEvaluateInputItem, EvaluationPriority, LlamaContextOptions, LlamaContextSequenceDryRepeatPenalty, LlamaContextSequenceRepeatPenalty, PrioritizedBatchItem, @@ -74,6 +75,8 @@ export class LlamaContext { /** @internal */ private readonly _idealThreads: number; /** @internal */ private readonly _minThreads: number; /** @internal */ private readonly _performanceTracking: boolean; + /** @internal */ private readonly _kvCacheKeyType: GgmlType; + /** @internal */ private readonly _kvCacheValueType: GgmlType; /** @internal */ private readonly _totalSequences: number; /** @internal */ private readonly _unusedSequenceIds: number[] = []; /** @internal */ private readonly _batchingOptions: Required; @@ -110,13 +113,17 @@ export class LlamaContext { } = {}, swaFullCache = _model.defaultContextSwaFullCache, performanceTracking = false, + kvCacheKeyType, + kvCacheValueType, _embeddings, _ranking }: LlamaContextOptions & { sequences: number, contextSize: number, batchSize: number, - flashAttention: boolean + flashAttention: boolean, + kvCacheKeyType: GgmlType, + kvCacheValueType: GgmlType }) { if (_model.disposed) throw new DisposedError(); @@ -145,6 
+152,8 @@ export class LlamaContext { : this._llama._threadsSplitter.normalizeThreadsValue(threads?.min ?? 1) ); this._performanceTracking = !!performanceTracking; + this._kvCacheKeyType = kvCacheKeyType; + this._kvCacheValueType = kvCacheValueType; this._swaFullCache = !!swaFullCache; this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({ contextSize: padSafeContextSize(this._contextSize * this._totalSequences, "up"), // each sequence needs its own of cells @@ -159,6 +168,8 @@ export class LlamaContext { embeddings: _embeddings, ranking: _ranking, performanceTracking: this._performanceTracking, + kvCacheKeyType: this._kvCacheKeyType, + kvCacheValueType: this._kvCacheValueType, swaFullCache: this._swaFullCache })); this._batchingOptions = { @@ -221,6 +232,14 @@ export class LlamaContext { return this._flashAttention; } + public get kvCacheKeyType() { + return this._kvCacheKeyType; + } + + public get kvCacheValueType() { + return this._kvCacheValueType; + } + /** * The actual size of the state in the memory in bytes. * This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context. @@ -872,6 +891,12 @@ export class LlamaContext { const flashAttention = _model.flashAttentionSupported ? Boolean(options.flashAttention ?? _model.defaultContextFlashAttention) : false; + const kvCacheKeyType = options.kvCacheKeyType === "currentQuant" + ? _model.fileInsights.dominantTensorType ?? _model.defaultContextKvCacheKeyType + : resolveGgmlTypeOption(options.kvCacheKeyType) ?? _model.defaultContextKvCacheKeyType; + const kvCacheValueType = options.kvCacheValueType === "currentQuant" + ? _model.fileInsights.dominantTensorType ?? _model.defaultContextKvCacheValueType + : resolveGgmlTypeOption(options.kvCacheValueType) ?? _model.defaultContextKvCacheValueType; const swaFullCache = options.swaFullCache ?? _model.defaultContextSwaFullCache; const loraOptions = typeof options.lora === "string" ? 
{adapters: [{filePath: options.lora}]} satisfies LlamaContextOptions["lora"] @@ -889,6 +914,8 @@ export class LlamaContext { modelGpuLayers: _model.gpuLayers, modelTrainContextSize: _model.trainContextSize, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache, getVramState: () => _model._llama._vramOrchestrator.getMemoryState(), llamaGpu: _model._llama.gpu, @@ -920,10 +947,14 @@ export class LlamaContext { modelGpuLayers: _model.gpuLayers, batchSize, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache }); - const context = new LlamaContext({_model}, {...options, contextSize, batchSize, sequences, flashAttention, swaFullCache}); + const context = new LlamaContext({_model}, { + ...options, contextSize, batchSize, sequences, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache + }); const contextCreationVramReservation = options.ignoreMemorySafetyChecks ? null : _model._llama._vramOrchestrator.reserveMemory(resourceRequirementsEstimation.gpuVram); diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts index fad4c435..7638f34c 100644 --- a/src/evaluator/LlamaContext/types.ts +++ b/src/evaluator/LlamaContext/types.ts @@ -1,4 +1,5 @@ -import {PickOptions} from "../../utils/utilTypes.js"; +import type {PickOptions} from "../../utils/utilTypes.js"; +import type {GgmlType} from "../../gguf/types/GgufTensorInfoTypes.js"; import type {LlamaGrammarEvaluationState} from "../LlamaGrammarEvaluationState.js"; import type {TokenBias} from "../TokenBias.js"; import type {Token} from "../../types.js"; @@ -104,6 +105,26 @@ export type LlamaContextOptions = { */ batching?: BatchingOptions, + /** + * The type of the key for the KV cache tensors used in this context. + * + * Set to `"currentQuant"` to use the same type as the current quantization of the model weights tensors. + * + * Defaults to `F16` (inherited from the model option `defaultContextKvCacheKeyType`). + * @experimental - this option is experimental. 
it may not work as intended, and may change in the future + */ + kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType | GgmlType, + + /** + * The type of the value for the KV cache tensors used in this context. + * + * Set to `"currentQuant"` to use the same type as the current quantization of the model weights tensors. + * + * Defaults to `F16` (inherited from the model option `defaultContextKvCacheValueType`). + * @experimental - this option is experimental. it may not work as intended, and may change in the future + */ + kvCacheValueType?: "currentQuant" | keyof typeof GgmlType | GgmlType, + /** * When using SWA (Sliding Window Attention) on a supported model, * extend the sliding window size to the current context size (meaning practically disabling SWA). diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts index 24c84375..480a9df9 100644 --- a/src/evaluator/LlamaModel/LlamaModel.ts +++ b/src/evaluator/LlamaModel/LlamaModel.ts @@ -19,6 +19,7 @@ import {GgufArchitectureType, GgufMetadata} from "../../gguf/types/GgufMetadataT import {OverridesObject} from "../../utils/OverridesObject.js"; import {maxRecentDetokenizerTokens} from "../../consts.js"; import {LlamaRankingContext, LlamaRankingContextOptions} from "../LlamaRankingContext.js"; +import {GgmlType, resolveGgmlTypeOption} from "../../gguf/types/GgufTensorInfoTypes.js"; import {TokenAttribute, TokenAttributes} from "./utils/TokenAttributes.js"; import type {Llama} from "../../bindings/Llama.js"; import type {BuiltinSpecialTokenValue} from "../../utils/LlamaText.js"; @@ -127,6 +128,26 @@ export type LlamaModelOptions = { */ defaultContextFlashAttention?: boolean, + /** + * The default type of the key for the KV cache tensors used for contexts created with this model. + * + * Set to `"currentQuant"` to use the same type as the current quantization of the model weights tensors. + * + * Defaults to `F16`. + * @experimental - this option is experimental. 
it may not work as intended, and may change in the future + */ + defaultContextKvCacheKeyType?: "currentQuant" | keyof typeof GgmlType | GgmlType, + + /** + * The default type of the value for the KV cache tensors used for contexts created with this model. + * + * Set to `"currentQuant"` to use the same type as the current quantization of the model weights tensors. + * + * Defaults to `F16`. + * @experimental - this option is experimental. it may not work as intended, and may change in the future + */ + defaultContextKvCacheValueType?: "currentQuant" | keyof typeof GgmlType | GgmlType, + /** * When using SWA (Sliding Window Attention) on a supported model, * extend the sliding window size to the current context size (meaning practically disabling SWA) @@ -187,6 +208,8 @@ export class LlamaModel { /** @internal */ private readonly _defaultContextFlashAttentionOptionEnabled: boolean; /** @internal */ private readonly _defaultContextFlashAttention: boolean; /** @internal */ private readonly _defaultContextSwaFullCache: boolean; + /** @internal */ private readonly _defaultContextKvCacheKeyType: GgmlType; + /** @internal */ private readonly _defaultContextKvCacheValueType: GgmlType; /** @internal */ private readonly _flashAttentionSupported: boolean; /** @internal */ private readonly _loraAdapters = new Map(); /** @internal */ private _typeDescription?: ModelTypeDescription; @@ -208,6 +231,8 @@ export class LlamaModel { _defaultContextFlashAttentionOptionEnabled, _defaultContextFlashAttention, _defaultContextSwaFullCache, + _defaultContextKvCacheKeyType, + _defaultContextKvCacheValueType, _flashAttentionSupported }: { _llama: Llama, @@ -216,6 +241,8 @@ export class LlamaModel { _defaultContextFlashAttentionOptionEnabled: boolean, _defaultContextFlashAttention: boolean, _defaultContextSwaFullCache: boolean, + _defaultContextKvCacheKeyType: GgmlType, + _defaultContextKvCacheValueType: GgmlType, _flashAttentionSupported: boolean }) { this._llama = _llama; @@ -229,6 +256,8 
@@ export class LlamaModel { this._defaultContextFlashAttentionOptionEnabled = _defaultContextFlashAttentionOptionEnabled; this._defaultContextFlashAttention = _defaultContextFlashAttention; this._defaultContextSwaFullCache = _defaultContextSwaFullCache; + this._defaultContextKvCacheKeyType = _defaultContextKvCacheKeyType; + this._defaultContextKvCacheValueType = _defaultContextKvCacheValueType; this._flashAttentionSupported = _flashAttentionSupported; const overridesList = ggufMetadataOverridesToList(metadataOverrides); this._model = new this._llama._bindings.AddonModel(this._modelPath, removeNullFields({ @@ -357,6 +386,14 @@ export class LlamaModel { return this._defaultContextSwaFullCache; } + public get defaultContextKvCacheKeyType() { + return this._defaultContextKvCacheKeyType; + } + + public get defaultContextKvCacheValueType() { + return this._defaultContextKvCacheValueType; + } + /** * Transform text into tokens that can be fed to the model * @param text - the text to tokenize @@ -707,7 +744,7 @@ export class LlamaModel { }: { _llama: Llama }) { - const {loadSignal, defaultContextFlashAttention} = modelOptions; + const {loadSignal, defaultContextFlashAttention, defaultContextKvCacheKeyType, defaultContextKvCacheValueType} = modelOptions; const useMmap = _llama.supportsMmap && (modelOptions.useMmap ?? defaultUseMmap); const useDirectIo = modelOptions.useDirectIo ?? defaultUseDirectIo; @@ -722,10 +759,18 @@ export class LlamaModel { ? (defaultContextFlashAttention ?? defaultContextFlashAttentionEnabled) : false; const resolvedDefaultContextSwaFullCache = modelOptions.defaultContextSwaFullCache ?? defaultContextSwaFullCache; + const resolvedDefaultContextKvCacheKeyType = defaultContextKvCacheKeyType === "currentQuant" + ? ggufInsights.dominantTensorType ?? GgmlType.F16 + : resolveGgmlTypeOption(defaultContextKvCacheKeyType) ?? GgmlType.F16; + const resolvedDefaultContextKvCacheValueType = defaultContextKvCacheValueType === "currentQuant" + ? 
ggufInsights.dominantTensorType ?? GgmlType.F16 + : resolveGgmlTypeOption(defaultContextKvCacheValueType) ?? GgmlType.F16; const gpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(modelOptions.gpuLayers, { ignoreMemorySafetyChecks: modelOptions.ignoreMemorySafetyChecks, defaultContextFlashAttention: resolvedDefaultContextFlashAttention, defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache, + defaultContextKvCacheKeyType: resolvedDefaultContextKvCacheKeyType, + defaultContextKvCacheValueType: resolvedDefaultContextKvCacheValueType, useMmap }); const resourceRequirementsEstimation = ggufInsights.estimateModelResourceRequirements({ @@ -740,7 +785,9 @@ export class LlamaModel { _defaultContextFlashAttentionOptionEnabled: defaultContextFlashAttention ?? false, _flashAttentionSupported: flashAttentionSupported, _defaultContextFlashAttention: resolvedDefaultContextFlashAttention, - _defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache + _defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache, + _defaultContextKvCacheKeyType: resolvedDefaultContextKvCacheKeyType, + _defaultContextKvCacheValueType: resolvedDefaultContextKvCacheValueType }); const modelCreationVramReservation = modelOptions.ignoreMemorySafetyChecks ? 
null diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts index cc594ace..ed364c35 100644 --- a/src/gguf/insights/GgufInsights.ts +++ b/src/gguf/insights/GgufInsights.ts @@ -2,7 +2,7 @@ import {Llama} from "../../bindings/Llama.js"; import {getLlamaWithoutBackend} from "../../bindings/utils/getLlamaWithoutBackend.js"; import {getDefaultContextBatchSize, getDefaultContextSequences} from "../../evaluator/LlamaContext/LlamaContext.js"; import {GgufFileInfo} from "../types/GgufFileInfoTypes.js"; -import {GgufTensorInfo} from "../types/GgufTensorInfoTypes.js"; +import {GgmlType, GgufTensorInfo} from "../types/GgufTensorInfoTypes.js"; import {GgufArchitectureType} from "../types/GgufMetadataTypes.js"; import {getReadablePath} from "../../cli/utils/getReadablePath.js"; import {padSafeContextSize} from "../../evaluator/LlamaContext/utils/padSafeContextSize.js"; @@ -19,6 +19,7 @@ export class GgufInsights { /** @internal */ private readonly _modelSize: number; /** @internal */ private _totalFileLayers: number | null = null; /** @internal */ private _supportsRanking?: boolean; + /** @internal */ private _dominantTensorType?: GgmlType; /** @internal */ public readonly _ggufFileInfo: GgufFileInfo; /** @internal */ private readonly _configurationResolver: GgufInsightsConfigurationResolver; /** @internal */ private readonly _tokens: GgufInsightsTokens; @@ -163,6 +164,16 @@ export class GgufInsights { return false; } + /** + * Get the dominant tensor type used in the model file + */ + public get dominantTensorType(): GgmlType | undefined { + if (this._dominantTensorType == null) + this._dominantTensorType = getDominantTensorType(this._ggufFileInfo.fullTensorInfo ?? 
[]); + + return this._dominantTensorType; + } + public get supportsRanking() { if (this._supportsRanking != null) return this._supportsRanking; @@ -223,10 +234,12 @@ export class GgufInsights { */ public estimateContextResourceRequirements({ contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, includeGraphOverhead = true, flashAttention = false, - swaFullCache = false + swaFullCache = false, + kvCacheKeyType = GgmlType.F16, kvCacheValueType = GgmlType.F16 }: { contextSize: number, modelGpuLayers: number, batchSize?: number, sequences?: number, isEmbeddingContext?: boolean, - flashAttention?: boolean, includeGraphOverhead?: boolean, swaFullCache?: boolean + flashAttention?: boolean, includeGraphOverhead?: boolean, swaFullCache?: boolean, + kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType }): GgufInsightsResourceRequirements { if (sequences == null) sequences = getDefaultContextSequences(); if (batchSize == null) batchSize = getDefaultContextBatchSize({contextSize, sequences}); @@ -277,7 +290,9 @@ export class GgufInsights { sequences, totalFileLayers, finalModelGpuLayers, - usingGpu + usingGpu, + kvCacheKeyType, + kvCacheValueType }); const vocabularySize = llmData.vocab_size ?? this._ggufFileInfo.metadata.tokenizer?.ggml?.tokens?.length ?? 0; @@ -569,13 +584,17 @@ export class GgufInsights { sequences, totalFileLayers, finalModelGpuLayers, - usingGpu + usingGpu, + kvCacheKeyType = GgmlType.F16, + kvCacheValueType = GgmlType.F16 }: { kvSize: number, sequences: number, totalFileLayers: number, finalModelGpuLayers: number, - usingGpu: boolean + usingGpu: boolean, + kvCacheKeyType?: GgmlType, + kvCacheValueType?: GgmlType }) { // source: `llama_kv_cache_init` in `llama.cpp` const architecture = this._ggufFileInfo.metadata.general?.architecture; @@ -584,16 +603,8 @@ export class GgufInsights { const nEmbdHeadK = this._ggufFileInfo.architectureMetadata.attention?.key_length ?? ((nHead == 0) ? 
0 : (nEmbd / nHead)); const nHeadKv: number | number[] = this._ggufFileInfo.architectureMetadata.attention?.head_count_kv ?? nHead; const nEmbdHeadV = this._ggufFileInfo.architectureMetadata.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead); - const keyTypeSize = architecture === GgufArchitectureType.mamba - // if `type_k` of `llama_context_params` changes to be configurable in `LlamaContext`, - // this would have to depend on that value - ? this._llama._consts.ggmlTypeF32Size - : this._llama._consts.ggmlTypeF16Size; - const valueTypeSize = architecture === GgufArchitectureType.mamba - // if `type_v` of `llama_context_params` changes to be configurable in `LlamaContext`, - // this would have to depend on that value - ? this._llama._consts.ggmlTypeF32Size - : this._llama._consts.ggmlTypeF16Size; + const keyTypeSize = this._llama._bindings.getTypeSizeForGgmlType(kvCacheKeyType) ?? this._llama._consts.ggmlTypeF16Size; + const valueTypeSize = this._llama._bindings.getTypeSizeForGgmlType(kvCacheValueType) ?? this._llama._consts.ggmlTypeF16Size; // source: `llama_model::load_tensors` in `llama-model.cpp` // repeating layers are assigned to GPU from `i_gpu_start = n_layer + 1 - n_gpu_layers` @@ -1088,3 +1099,24 @@ export function parseRankingTemplate(template: string | undefined | null): strin export function isRankingTemplateValid(template: string | undefined | null): boolean { return template != null && template.includes("{{query}}") && template.includes("{{document}}"); } + +export function getDominantTensorType(tensorInfo: GgufTensorInfo[]): GgmlType | undefined { + const tensorTypes: number[] = []; + for (const tensor of tensorInfo) + tensorTypes[tensor.ggmlType] = ( + (tensorTypes[tensor.ggmlType] ?? 
0) + + tensor.dimensions.map(((dim) => Number(dim))).reduce((a, b) => a * b, 1) + ); + + let dominantType: GgmlType | undefined = undefined; + let maxCount = 0; + + for (const [type, count] of tensorTypes.entries()) { + if (count > maxCount) { + maxCount = count; + dominantType = type; + } + } + + return dominantType; +} diff --git a/src/gguf/insights/GgufInsightsConfigurationResolver.ts b/src/gguf/insights/GgufInsightsConfigurationResolver.ts index b0179ae9..ea41dcfa 100644 --- a/src/gguf/insights/GgufInsightsConfigurationResolver.ts +++ b/src/gguf/insights/GgufInsightsConfigurationResolver.ts @@ -3,6 +3,7 @@ import {LlamaModelOptions} from "../../evaluator/LlamaModel/LlamaModel.js"; import {LlamaContextOptions} from "../../evaluator/LlamaContext/types.js"; import {getDefaultContextSequences} from "../../evaluator/LlamaContext/LlamaContext.js"; import {InsufficientMemoryError} from "../../utils/InsufficientMemoryError.js"; +import {GgmlType} from "../types/GgufTensorInfoTypes.js"; import {resolveModelGpuLayersOption} from "./utils/resolveModelGpuLayersOption.js"; import {resolveContextContextSizeOption} from "./utils/resolveContextContextSizeOption.js"; import {scoreLevels} from "./utils/scoreLevels.js"; @@ -39,6 +40,8 @@ export class GgufInsightsConfigurationResolver { targetContextSize, embeddingContext = false, flashAttention = false, + kvCacheKeyType, + kvCacheValueType, swaFullCache = false, useMmap = this._ggufInsights._llama.supportsMmap }: { @@ -46,6 +49,8 @@ export class GgufInsightsConfigurationResolver { targetContextSize?: number, embeddingContext?: boolean, flashAttention?: boolean, + kvCacheKeyType?: GgmlType, + kvCacheValueType?: GgmlType, swaFullCache?: boolean, useMmap?: boolean } = {}, { @@ -65,6 +70,8 @@ export class GgufInsightsConfigurationResolver { } = {}) { const compatibilityScore = await this.scoreModelConfigurationCompatibility({ flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache, contextSize: targetContextSize, 
embeddingContext, @@ -108,6 +115,8 @@ export class GgufInsightsConfigurationResolver { contextSize = Math.min(4096, this._ggufInsights.trainContextSize ?? 4096), embeddingContext = false, flashAttention = false, + kvCacheKeyType, + kvCacheValueType, swaFullCache = false, maximumFittedContextSizeMultiplier = 100, maximumUnfitConfigurationResourceMultiplier = 100, @@ -118,6 +127,8 @@ export class GgufInsightsConfigurationResolver { contextSize?: number, embeddingContext?: boolean, flashAttention?: boolean, + kvCacheKeyType?: GgmlType, + kvCacheValueType?: GgmlType, swaFullCache?: boolean, maximumFittedContextSizeMultiplier?: number, maximumUnfitConfigurationResourceMultiplier?: number, @@ -215,6 +226,8 @@ export class GgufInsightsConfigurationResolver { llamaSupportsGpuOffloading, defaultContextFlashAttention: flashAttention, defaultContextSwaFullCache: swaFullCache, + defaultContextKvCacheKeyType: kvCacheKeyType, + defaultContextKvCacheValueType: kvCacheValueType, ignoreMemorySafetyChecks: forceGpuLayers != null, useMmap } @@ -272,6 +285,8 @@ export class GgufInsightsConfigurationResolver { modelTrainContextSize: this._ggufInsights.trainContextSize ?? 
defaultTrainContextSizeForEstimationPurposes, ignoreMemorySafetyChecks: forceStrictContextSize, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache }); contextFitsMemory = true; @@ -292,7 +307,9 @@ export class GgufInsightsConfigurationResolver { isEmbeddingContext: embeddingContext, modelGpuLayers: resolvedGpuLayers, flashAttention, - swaFullCache + swaFullCache, + kvCacheKeyType, + kvCacheValueType }); const rankPoints = { @@ -388,12 +405,15 @@ export class GgufInsightsConfigurationResolver { llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu, llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading, defaultContextFlashAttention = false, + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, defaultContextSwaFullCache = false, useMmap = this._ggufInsights._llama.supportsMmap }: { ignoreMemorySafetyChecks?: boolean, getVramState?(): Promise<{total: number, free: number}>, llamaVramPaddingSize?: number, llamaGpu?: BuildGpu, llamaSupportsGpuOffloading?: boolean, defaultContextFlashAttention?: boolean, - defaultContextSwaFullCache?: boolean, useMmap?: boolean + defaultContextKvCacheKeyType?: GgmlType, defaultContextKvCacheValueType?: GgmlType, defaultContextSwaFullCache?: boolean, + useMmap?: boolean } = {}) { return resolveModelGpuLayersOption(gpuLayers, { ggufInsights: this._ggufInsights, @@ -403,6 +423,8 @@ export class GgufInsightsConfigurationResolver { llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, defaultContextSwaFullCache, useMmap }); @@ -418,6 +440,8 @@ export class GgufInsightsConfigurationResolver { batchSize, modelTrainContextSize, flashAttention = false, + kvCacheKeyType, + kvCacheValueType, swaFullCache = false, getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), getRamState = (async () => 
this._ggufInsights._llama._ramOrchestrator.getMemoryState()), @@ -430,6 +454,8 @@ export class GgufInsightsConfigurationResolver { modelGpuLayers: number, modelTrainContextSize: number, flashAttention?: boolean, + kvCacheKeyType?: GgmlType, + kvCacheValueType?: GgmlType, swaFullCache?: boolean, batchSize?: LlamaContextOptions["batchSize"], sequences?: number, @@ -448,6 +474,8 @@ export class GgufInsightsConfigurationResolver { modelGpuLayers, modelTrainContextSize, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache, getVramState, getRamState, diff --git a/src/gguf/insights/utils/resolveContextContextSizeOption.ts b/src/gguf/insights/utils/resolveContextContextSizeOption.ts index 49ace603..ba0e4ae7 100644 --- a/src/gguf/insights/utils/resolveContextContextSizeOption.ts +++ b/src/gguf/insights/utils/resolveContextContextSizeOption.ts @@ -5,11 +5,13 @@ import {minAllowedContextSizeInCalculations} from "../../../config.js"; import {getDefaultContextBatchSize, getDefaultModelContextSize} from "../../../evaluator/LlamaContext/LlamaContext.js"; import {InsufficientMemoryError} from "../../../utils/InsufficientMemoryError.js"; import {getRamUsageFromUnifiedVram} from "./getRamUsageFromUnifiedVram.js"; +import type {GgmlType} from "../../types/GgufTensorInfoTypes.js"; const defaultMaxContextSizeSwapUse = 2048; export async function resolveContextContextSizeOption({ - contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, flashAttention, swaFullCache, + contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, flashAttention, + kvCacheKeyType, kvCacheValueType, swaFullCache, getVramState, getRamState, getSwapState, ignoreMemorySafetyChecks = false, isEmbeddingContext = false, maxContextSizeSwapUse = defaultMaxContextSizeSwapUse }: { @@ -20,6 +22,8 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: number, modelTrainContextSize: number, flashAttention: boolean, + 
kvCacheKeyType?: GgmlType, + kvCacheValueType?: GgmlType, swaFullCache: boolean, getVramState(): Promise<{total: number, free: number, unifiedSize: number}>, getRamState(): Promise<{total: number, free: number}>, @@ -53,6 +57,8 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: modelGpuLayers, sequences, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache, isEmbeddingContext }); @@ -99,6 +105,8 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: modelGpuLayers, sequences, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache, isEmbeddingContext }); @@ -148,6 +156,8 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: modelGpuLayers, sequences, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache, isEmbeddingContext }); diff --git a/src/gguf/insights/utils/resolveModelGpuLayersOption.ts b/src/gguf/insights/utils/resolveModelGpuLayersOption.ts index 62d58141..5c544744 100644 --- a/src/gguf/insights/utils/resolveModelGpuLayersOption.ts +++ b/src/gguf/insights/utils/resolveModelGpuLayersOption.ts @@ -5,17 +5,21 @@ import {findBestOption} from "../../../utils/findBestOption.js"; import {getDefaultContextBatchSize, getDefaultModelContextSize} from "../../../evaluator/LlamaContext/LlamaContext.js"; import {minAllowedContextSizeInCalculations} from "../../../config.js"; import {scoreLevels} from "./scoreLevels.js"; +import type {GgmlType} from "../../types/GgufTensorInfoTypes.js"; import type {GgufInsights} from "../GgufInsights.js"; const fitContextExtraMemoryPaddingPercentage = 0.5; export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions["gpuLayers"], { ggufInsights, ignoreMemorySafetyChecks = false, getVramState, llamaVramPaddingSize, - llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, defaultContextSwaFullCache, useMmap + llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, + defaultContextKvCacheKeyType, 
defaultContextKvCacheValueType, defaultContextSwaFullCache, useMmap }: { ggufInsights: GgufInsights, ignoreMemorySafetyChecks?: boolean, getVramState(): Promise<{total: number, free: number}>, llamaVramPaddingSize: number, llamaGpu: BuildGpu, - llamaSupportsGpuOffloading: boolean, defaultContextFlashAttention: boolean, defaultContextSwaFullCache: boolean, useMmap?: boolean + llamaSupportsGpuOffloading: boolean, defaultContextFlashAttention: boolean, + defaultContextKvCacheKeyType?: GgmlType, defaultContextKvCacheValueType?: GgmlType, defaultContextSwaFullCache: boolean, + useMmap?: boolean }): Promise { if (gpuLayers == null) gpuLayers = "auto"; @@ -37,6 +41,8 @@ export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions[" ggufInsights, currentVram: vramState.free, defaultContextFlashAttention, + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, defaultContextSwaFullCache, useMmap }); @@ -74,6 +80,8 @@ export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions[" ? 
gpuLayers.max : undefined, defaultContextFlashAttention, + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, defaultContextSwaFullCache, useMmap }); @@ -97,6 +105,8 @@ function getBestGpuLayersForFreeVram({ minGpuLayers, maxGpuLayers, defaultContextFlashAttention, + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, defaultContextSwaFullCache, useMmap }: { @@ -106,6 +116,8 @@ function getBestGpuLayersForFreeVram({ minGpuLayers?: number, maxGpuLayers?: number, defaultContextFlashAttention: boolean, + defaultContextKvCacheKeyType?: GgmlType, + defaultContextKvCacheValueType?: GgmlType, defaultContextSwaFullCache: boolean, useMmap?: boolean }) { @@ -128,6 +140,8 @@ function getBestGpuLayersForFreeVram({ fitContext, defaultContextFlashAttention, defaultContextSwaFullCache, + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, useMmap }); @@ -187,10 +201,12 @@ function scoreGpuLayersAndContextCombination({gpuLayers, contextSize}: {gpuLayer } function getVramRequiredForGpuLayers({ - gpuLayers, ggufInsights, currentVram, fitContext, defaultContextFlashAttention = false, defaultContextSwaFullCache = false, useMmap + gpuLayers, ggufInsights, currentVram, fitContext, defaultContextFlashAttention = false, + defaultContextKvCacheKeyType, defaultContextKvCacheValueType, defaultContextSwaFullCache = false, useMmap }: { gpuLayers: number, ggufInsights: GgufInsights, currentVram: number, fitContext?: {contextSize?: number, embeddingContext?: boolean}, - defaultContextFlashAttention: boolean, defaultContextSwaFullCache: boolean, useMmap?: boolean + defaultContextFlashAttention: boolean, defaultContextKvCacheKeyType?: GgmlType, defaultContextKvCacheValueType?: GgmlType, + defaultContextSwaFullCache: boolean, useMmap?: boolean }) { const modelVram = ggufInsights.estimateModelResourceRequirements({ gpuLayers, @@ -208,6 +224,8 @@ function getVramRequiredForGpuLayers({ sequences: 1, isEmbeddingContext: fitContext.embeddingContext ?? 
false, flashAttention: defaultContextFlashAttention, + kvCacheKeyType: defaultContextKvCacheKeyType, + kvCacheValueType: defaultContextKvCacheValueType, swaFullCache: defaultContextSwaFullCache }).gpuVram; @@ -228,6 +246,8 @@ function getVramRequiredForGpuLayers({ vram: currentVram - modelVram, isEmbeddingContext: fitContext?.embeddingContext ?? false, flashAttention: defaultContextFlashAttention, + kvCacheKeyType: defaultContextKvCacheKeyType, + kvCacheValueType: defaultContextKvCacheValueType, swaFullCache: defaultContextSwaFullCache }); @@ -241,8 +261,11 @@ function getVramRequiredForGpuLayers({ }; } -function findMaxPossibleContextSizeForVram({gpuLayers, ggufInsights, vram, isEmbeddingContext, flashAttention, swaFullCache}: { - gpuLayers: number, ggufInsights: GgufInsights, vram: number, isEmbeddingContext: boolean, flashAttention: boolean, swaFullCache: boolean +function findMaxPossibleContextSizeForVram({ + gpuLayers, ggufInsights, vram, isEmbeddingContext, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache +}: { + gpuLayers: number, ggufInsights: GgufInsights, vram: number, isEmbeddingContext: boolean, flashAttention: boolean, + kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, swaFullCache: boolean }) { const maxContextSize = getDefaultModelContextSize({trainContextSize: ggufInsights.trainContextSize}); @@ -258,6 +281,8 @@ function findMaxPossibleContextSizeForVram({gpuLayers, ggufInsights, vram, isEmb sequences: 1, isEmbeddingContext, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache }).gpuVram; diff --git a/src/gguf/types/GgufTensorInfoTypes.ts b/src/gguf/types/GgufTensorInfoTypes.ts index 1ada8204..39e2b984 100644 --- a/src/gguf/types/GgufTensorInfoTypes.ts +++ b/src/gguf/types/GgufTensorInfoTypes.ts @@ -20,7 +20,7 @@ export type GgufTensorInfo = { readonly filePart: number }; -export const enum GgmlType { +export enum GgmlType { F32 = 0, F16 = 1, Q4_0 = 2, @@ -63,3 +63,15 @@ export const enum GgmlType { MXFP4 = 39, // 
MXFP4 (1 block) NVFP4 = 40 // NVFP4 (4 blocks, E4M3 scale) } + +export function resolveGgmlTypeOption(option?: keyof typeof GgmlType | GgmlType) { + if (option == null) + return undefined; + + if (typeof option === "number" && Object.hasOwn(GgmlType, option)) + return option as GgmlType; + else if (typeof option === "string" && Object.hasOwn(GgmlType, option)) + return GgmlType[option as keyof typeof GgmlType]; + + return undefined; +} From 30d5e2ddb0ec8825609bbf5e5cc18c8f17240f0c Mon Sep 17 00:00:00 2001 From: Gilad S Date: Tue, 17 Mar 2026 06:54:08 +0200 Subject: [PATCH 08/10] fix: bug --- src/utils/prettyPrintObject.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/prettyPrintObject.ts b/src/utils/prettyPrintObject.ts index d02b3fca..56891e88 100644 --- a/src/utils/prettyPrintObject.ts +++ b/src/utils/prettyPrintObject.ts @@ -64,7 +64,7 @@ function prettyPrintArray(arr: any[], indent: number = 4, options: PrettyPrintOb const arrayItems = slicedArray.map((item) => prettyPrintObject(item, indent, options)) .concat( hiddenItems > 0 - ? [chalk.white("..." + hiddenItems + " more item" + (hiddenItems !== 1 ? "s" : ""))] + ? [chalk.white("..." + hiddenItems.toLocaleString("en-US") + " more item" + (hiddenItems !== 1 ? 
"s" : ""))] : [] ); const oneLineJoinedArrayItems = arrayItems.join(chalk.whiteBright(", ")); From a0a46d5a0a6cb63fd56672521ae0e12df16246f2 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Tue, 17 Mar 2026 07:50:49 +0200 Subject: [PATCH 09/10] fix: bugs --- llama/addon/AddonContext.cpp | 4 +-- src/cli/commands/ChatCommand.ts | 18 +++++----- src/cli/commands/CompleteCommand.ts | 18 +++++----- src/cli/commands/InfillCommand.ts | 18 +++++----- .../commands/InspectEstimateCommand.ts | 10 +++--- .../inspect/commands/InspectMeasureCommand.ts | 18 +++++----- src/evaluator/LlamaContext/LlamaContext.ts | 29 +++++++++------ src/evaluator/LlamaContext/types.ts | 20 ++++++++--- src/evaluator/LlamaModel/LlamaModel.ts | 35 ++++++++++++++----- 9 files changed, 108 insertions(+), 62 deletions(-) diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp index b8a22ef0..017c6967 100644 --- a/llama/addon/AddonContext.cpp +++ b/llama/addon/AddonContext.cpp @@ -446,14 +446,14 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap().Int32Value(); if (keyType >= 0 && keyType < GGML_TYPE_COUNT) { - context_params.type_k = keyType; + context_params.type_k = static_cast(keyType); } } if (options.Has("kvCacheValueType") && options.Get("kvCacheValueType").IsNumber()) { auto valueType = options.Get("kvCacheValueType").As().Int32Value(); if (valueType >= 0 && valueType < GGML_TYPE_COUNT) { - context_params.type_v = valueType; + context_params.type_v = static_cast(valueType); } } diff --git a/src/cli/commands/ChatCommand.ts b/src/cli/commands/ChatCommand.ts index d1dca381..88ec258e 100644 --- a/src/cli/commands/ChatCommand.ts +++ b/src/cli/commands/ChatCommand.ts @@ -180,18 +180,20 @@ export const ChatCommand: CommandModule = { type: "string", choices: [ "currentQuant", - ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] 
as const, - description: "The type of the key for the context KV cache tensors" + default: "F16" as const, + description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors" }) .option("kvCacheValueType", { alias: "kvcvt", type: "string", choices: [ "currentQuant", - ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, - description: "The type of the value for the context KV cache tensors" + default: "F16" as const, + description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors" }) .option("swaFullCache", { alias: "noSwa", @@ -520,8 +522,8 @@ async function RunChat({ ? {fitContext: {contextSize}} : undefined, defaultContextFlashAttention: flashAttention, - defaultContextKvCacheKeyType: kvCacheKeyType, - defaultContextKvCacheValueType: kvCacheValueType, + experimentalDefaultContextKvCacheKeyType: kvCacheKeyType, + experimentalDefaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, @@ -557,8 +559,8 @@ async function RunChat({ return await llama.loadModel({ modelPath: resolvedDraftModelPath, defaultContextFlashAttention: flashAttention, - defaultContextKvCacheKeyType: kvCacheKeyType, - defaultContextKvCacheValueType: kvCacheValueType, + experimentalDefaultContextKvCacheKeyType: kvCacheKeyType, + experimentalDefaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, diff --git a/src/cli/commands/CompleteCommand.ts b/src/cli/commands/CompleteCommand.ts index e5711c7e..f3fca2a9 100644 --- a/src/cli/commands/CompleteCommand.ts +++ b/src/cli/commands/CompleteCommand.ts @@ -137,18 +137,20 
@@ export const CompleteCommand: CommandModule = { type: "string", choices: [ "currentQuant", - ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, - description: "The type of the key for the context KV cache tensors" + default: "F16" as const, + description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors" }) .option("kvCacheValueType", { alias: "kvcvt", type: "string", choices: [ "currentQuant", - ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, - description: "The type of the value for the context KV cache tensors" + default: "F16" as const, + description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors" }) .option("swaFullCache", { alias: "noSwa", @@ -427,8 +429,8 @@ async function RunCompletion({ ? 
{fitContext: {contextSize}} : undefined, defaultContextFlashAttention: flashAttention, - defaultContextKvCacheKeyType: kvCacheKeyType, - defaultContextKvCacheValueType: kvCacheValueType, + experimentalDefaultContextKvCacheKeyType: kvCacheKeyType, + experimentalDefaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, @@ -464,8 +466,8 @@ async function RunCompletion({ return await llama.loadModel({ modelPath: resolvedDraftModelPath, defaultContextFlashAttention: flashAttention, - defaultContextKvCacheKeyType: kvCacheKeyType, - defaultContextKvCacheValueType: kvCacheValueType, + experimentalDefaultContextKvCacheKeyType: kvCacheKeyType, + experimentalDefaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, diff --git a/src/cli/commands/InfillCommand.ts b/src/cli/commands/InfillCommand.ts index 6b8eabea..7bdb7bd6 100644 --- a/src/cli/commands/InfillCommand.ts +++ b/src/cli/commands/InfillCommand.ts @@ -147,18 +147,20 @@ export const InfillCommand: CommandModule = { type: "string", choices: [ "currentQuant", - ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, - description: "The type of the key for the context KV cache tensors" + default: "F16" as const, + description: "Experimental. The type of the key for the context KV cache tensors. 
Use `currentQuant` to use the same type as the current quantization of the model weights tensors" }) .option("kvCacheValueType", { alias: "kvcvt", type: "string", choices: [ "currentQuant", - ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, - description: "The type of the value for the context KV cache tensors" + default: "F16" as const, + description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors" }) .option("swaFullCache", { alias: "noSwa", @@ -450,8 +452,8 @@ async function RunInfill({ ? {fitContext: {contextSize}} : undefined, defaultContextFlashAttention: flashAttention, - defaultContextKvCacheKeyType: kvCacheKeyType, - defaultContextKvCacheValueType: kvCacheValueType, + experimentalDefaultContextKvCacheKeyType: kvCacheKeyType, + experimentalDefaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, @@ -487,8 +489,8 @@ async function RunInfill({ return await llama.loadModel({ modelPath: resolvedDraftModelPath, defaultContextFlashAttention: flashAttention, - defaultContextKvCacheKeyType: kvCacheKeyType, - defaultContextKvCacheValueType: kvCacheValueType, + experimentalDefaultContextKvCacheKeyType: kvCacheKeyType, + experimentalDefaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, diff --git a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts index baaa2a5f..5915632f 100644 --- a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts +++ b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts @@ -125,18 +125,20 @@ export const InspectEstimateCommand: CommandModule typeof key === "string") as (keyof 
typeof GgmlType)[] + ...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, - description: "The type of the key for the context KV cache tensors" + default: "F16" as const, + description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors" }) .option("kvCacheValueType", { alias: "kvcvt", type: "string", choices: [ "currentQuant", - ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, - description: "The type of the value for the context KV cache tensors" + default: "F16" as const, + description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors" }) .option("swaFullCache", { alias: "noSwa", diff --git a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts index dd5fb29a..60dcace7 100644 --- a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts +++ b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts @@ -117,18 +117,20 @@ export const InspectMeasureCommand: CommandModule type: "string", choices: [ "currentQuant", - ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, - description: "The type of the key for the context KV cache tensors" + default: "F16" as const, + description: "Experimental. The type of the key for the context KV cache tensors. 
Use `currentQuant` to use the same type as the current quantization of the model weights tensors" }) .option("kvCacheValueType", { alias: "kvcvt", type: "string", choices: [ "currentQuant", - ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, - description: "The type of the value for the context KV cache tensors" + default: "F16" as const, + description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors" }) .option("swaFullCache", { alias: "noSwa", @@ -833,8 +835,8 @@ async function runTestWorkerLogic() { ), ignoreMemorySafetyChecks: currentContextSizeCheck != null, flashAttention, - kvCacheKeyType, - kvCacheValueType, + experimentalKvCacheKeyType: kvCacheKeyType, + experimentalKvCacheValueType: kvCacheValueType, swaFullCache, batchSize, failedCreationRemedy: false @@ -907,8 +909,8 @@ async function runTestWorkerLogic() { useDirectIo, gpuLayers, defaultContextFlashAttention: flashAttention, - defaultContextKvCacheKeyType: kvCacheKeyType, - defaultContextKvCacheValueType: kvCacheValueType, + experimentalDefaultContextKvCacheKeyType: kvCacheKeyType, + experimentalDefaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, ignoreMemorySafetyChecks: true }); diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index 7d013a51..248c763e 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -113,8 +113,8 @@ export class LlamaContext { } = {}, swaFullCache = _model.defaultContextSwaFullCache, performanceTracking = false, - kvCacheKeyType, - kvCacheValueType, + experimentalKvCacheKeyType, + experimentalKvCacheValueType, _embeddings, _ranking }: LlamaContextOptions & { @@ -122,8 
+122,8 @@ export class LlamaContext { contextSize: number, batchSize: number, flashAttention: boolean, - kvCacheKeyType: GgmlType, - kvCacheValueType: GgmlType + experimentalKvCacheKeyType: GgmlType, + experimentalKvCacheValueType: GgmlType }) { if (_model.disposed) throw new DisposedError(); @@ -152,8 +152,8 @@ export class LlamaContext { : this._llama._threadsSplitter.normalizeThreadsValue(threads?.min ?? 1) ); this._performanceTracking = !!performanceTracking; - this._kvCacheKeyType = kvCacheKeyType; - this._kvCacheValueType = kvCacheValueType; + this._kvCacheKeyType = experimentalKvCacheKeyType; + this._kvCacheValueType = experimentalKvCacheValueType; this._swaFullCache = !!swaFullCache; this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({ contextSize: padSafeContextSize(this._contextSize * this._totalSequences, "up"), // each sequence needs its own of cells @@ -891,12 +891,12 @@ export class LlamaContext { const flashAttention = _model.flashAttentionSupported ? Boolean(options.flashAttention ?? _model.defaultContextFlashAttention) : false; - const kvCacheKeyType = options.kvCacheKeyType === "currentQuant" + const kvCacheKeyType = options.experimentalKvCacheKeyType === "currentQuant" ? _model.fileInsights.dominantTensorType ?? _model.defaultContextKvCacheKeyType - : resolveGgmlTypeOption(options.kvCacheKeyType) ?? _model.defaultContextKvCacheKeyType; - const kvCacheValueType = options.kvCacheValueType === "currentQuant" + : resolveGgmlTypeOption(options.experimentalKvCacheKeyType) ?? _model.defaultContextKvCacheKeyType; + const kvCacheValueType = options.experimentalKvCacheValueType === "currentQuant" ? _model.fileInsights.dominantTensorType ?? _model.defaultContextKvCacheValueType - : resolveGgmlTypeOption(options.kvCacheValueType) ?? _model.defaultContextKvCacheValueType; + : resolveGgmlTypeOption(options.experimentalKvCacheValueType) ?? _model.defaultContextKvCacheValueType; const swaFullCache = options.swaFullCache ?? 
_model.defaultContextSwaFullCache; const loraOptions = typeof options.lora === "string" ? {adapters: [{filePath: options.lora}]} satisfies LlamaContextOptions["lora"] @@ -953,7 +953,14 @@ export class LlamaContext { }); const context = new LlamaContext({_model}, { - ...options, contextSize, batchSize, sequences, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache + ...options, + contextSize, + batchSize, + sequences, + flashAttention, + experimentalKvCacheKeyType: kvCacheKeyType, + experimentalKvCacheValueType: kvCacheValueType, + swaFullCache }); const contextCreationVramReservation = options.ignoreMemorySafetyChecks ? null diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts index 7638f34c..a0a64f02 100644 --- a/src/evaluator/LlamaContext/types.ts +++ b/src/evaluator/LlamaContext/types.ts @@ -111,9 +111,15 @@ export type LlamaContextOptions = { * Set to `"currentQuant"` to use the same type as the current quantization of the model weights tensors. * * Defaults to `F16` (inherited from the model option `defaultContextKvCacheKeyType`). - * @experimental - this option is experimental. it may not work as intended, and may change in the future + * @deprecated - this option is experimental and highly unstable. + * Only use with a hard-coded model and on specific hardware that you verify where the type passed to this option works correctly. + * Avoid allowing end users to configure this option, as it's highly unstable. + * @experimental - this option is experimental and highly unstable. + * It may not work as intended or even crash the process. + * Use with caution. + * This option may change or get removed in the future without a breaking change version. */ - kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType | GgmlType, + experimentalKvCacheKeyType?: "currentQuant" | keyof typeof GgmlType | GgmlType, /** * The type of the value for the KV cache tensors used in this context. 
@@ -121,9 +127,15 @@ export type LlamaContextOptions = { * Set to `"currentQuant"` to use the same type as the current quantization of the model weights tensors. * * Defaults to `F16` (inherited from the model option `defaultContextKvCacheValueType`). - * @experimental - this option is experimental. it may not work as intended, and may change in the future + * @deprecated - this option is experimental and highly unstable. + * Only use with a hard-coded model and on specific hardware that you verify where the type passed to this option works correctly. + * Avoid allowing end users to configure this option, as it's highly unstable. + * @experimental - this option is experimental and highly unstable. + * It may not work as intended or even crash the process. + * Use with caution. + * This option may change or get removed in the future without a breaking change version. */ - kvCacheValueType?: "currentQuant" | keyof typeof GgmlType | GgmlType, + experimentalKvCacheValueType?: "currentQuant" | keyof typeof GgmlType | GgmlType, /** * When using SWA (Sliding Window Attention) on a supported model, diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts index 480a9df9..84dfa65e 100644 --- a/src/evaluator/LlamaModel/LlamaModel.ts +++ b/src/evaluator/LlamaModel/LlamaModel.ts @@ -134,9 +134,15 @@ export type LlamaModelOptions = { * Set to `"currentQuant"` to use the same type as the current quantization of the model weights tensors. * * Defaults to `F16`. - * @experimental - this option is experimental. it may not work as intended, and may change in the future + * @deprecated - this option is experimental and highly unstable. + * Only use with a hard-coded model and on specific hardware that you verify where the type passed to this option works correctly. + * Avoid allowing end users to configure this option, as it's highly unstable. + * @experimental - this option is experimental and highly unstable. 
+ * It may not work as intended or even crash the process. + * Use with caution. + * This option may change or get removed in the future without a breaking change version. */ - defaultContextKvCacheKeyType?: "currentQuant" | keyof typeof GgmlType | GgmlType, + experimentalDefaultContextKvCacheKeyType?: "currentQuant" | keyof typeof GgmlType | GgmlType, /** * The default type of the value for the KV cache tensors used for contexts created with this model. @@ -144,9 +150,15 @@ export type LlamaModelOptions = { * Set to `"currentQuant"` to use the same type as the current quantization of the model weights tensors. * * Defaults to `F16`. - * @experimental - this option is experimental. it may not work as intended, and may change in the future + * @deprecated - this option is experimental and highly unstable. + * Only use with a hard-coded model and on specific hardware that you verify where the type passed to this option works correctly. + * Avoid allowing end users to configure this option, as it's highly unstable. + * @experimental - this option is experimental and highly unstable. + * It may not work as intended or even crash the process. + * Use with caution. + * This option may change or get removed in the future without a breaking change version. */ - defaultContextKvCacheValueType?: "currentQuant" | keyof typeof GgmlType | GgmlType, + experimentalDefaultContextKvCacheValueType?: "currentQuant" | keyof typeof GgmlType | GgmlType, /** * When using SWA (Sliding Window Attention) on a supported model, @@ -744,7 +756,12 @@ export class LlamaModel { }: { _llama: Llama }) { - const {loadSignal, defaultContextFlashAttention, defaultContextKvCacheKeyType, defaultContextKvCacheValueType} = modelOptions; + const { + loadSignal, + defaultContextFlashAttention, + experimentalDefaultContextKvCacheKeyType, + experimentalDefaultContextKvCacheValueType + } = modelOptions; const useMmap = _llama.supportsMmap && (modelOptions.useMmap ?? 
defaultUseMmap); const useDirectIo = modelOptions.useDirectIo ?? defaultUseDirectIo; @@ -759,12 +776,12 @@ export class LlamaModel { ? (defaultContextFlashAttention ?? defaultContextFlashAttentionEnabled) : false; const resolvedDefaultContextSwaFullCache = modelOptions.defaultContextSwaFullCache ?? defaultContextSwaFullCache; - const resolvedDefaultContextKvCacheKeyType = defaultContextKvCacheKeyType === "currentQuant" + const resolvedDefaultContextKvCacheKeyType = experimentalDefaultContextKvCacheKeyType === "currentQuant" ? ggufInsights.dominantTensorType ?? GgmlType.F16 - : resolveGgmlTypeOption(defaultContextKvCacheKeyType) ?? GgmlType.F16; - const resolvedDefaultContextKvCacheValueType = defaultContextKvCacheValueType === "currentQuant" + : resolveGgmlTypeOption(experimentalDefaultContextKvCacheKeyType) ?? GgmlType.F16; + const resolvedDefaultContextKvCacheValueType = experimentalDefaultContextKvCacheValueType === "currentQuant" ? ggufInsights.dominantTensorType ?? GgmlType.F16 - : resolveGgmlTypeOption(defaultContextKvCacheValueType) ?? GgmlType.F16; + : resolveGgmlTypeOption(experimentalDefaultContextKvCacheValueType) ?? 
GgmlType.F16; const gpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(modelOptions.gpuLayers, { ignoreMemorySafetyChecks: modelOptions.ignoreMemorySafetyChecks, defaultContextFlashAttention: resolvedDefaultContextFlashAttention, From 28e6817128f51b8028df70bc3874b6e538a95eee Mon Sep 17 00:00:00 2001 From: Gilad S Date: Tue, 17 Mar 2026 08:34:59 +0200 Subject: [PATCH 10/10] chore: mistral 4 gguf arch type --- src/gguf/types/GgufMetadataTypes.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gguf/types/GgufMetadataTypes.ts b/src/gguf/types/GgufMetadataTypes.ts index a2cd41b9..249cad32 100644 --- a/src/gguf/types/GgufMetadataTypes.ts +++ b/src/gguf/types/GgufMetadataTypes.ts @@ -114,6 +114,7 @@ export const enum GgufArchitectureType { rnd1 = "rnd1", panguEmbedded = "pangu-embedded", mistral3 = "mistral3", + mistral4 = "mistral4", paddleocr = "paddleocr", mimo2 = "mimo2", step35 = "step35",