From 5c256131b1deee2d950fc0e21a42ec20d551ba02 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Mon, 16 Mar 2026 02:32:29 +0200 Subject: [PATCH 01/10] feat(minor): customize `postinstall` behavior --- docs/guide/troubleshooting.md | 23 ++++++++ src/cli/commands/OnPostInstallCommand.ts | 50 +++++++++++++++-- src/cli/utils/packageJsonConfig.ts | 69 ++++++++++++++++++++++++ src/cli/utils/packageManager.ts | 16 ++++++ src/config.ts | 13 ++++- src/types.ts | 2 + 6 files changed, 168 insertions(+), 5 deletions(-) create mode 100644 src/cli/utils/packageJsonConfig.ts create mode 100644 src/cli/utils/packageManager.ts diff --git a/docs/guide/troubleshooting.md b/docs/guide/troubleshooting.md index f60a7745..96aeea63 100644 --- a/docs/guide/troubleshooting.md +++ b/docs/guide/troubleshooting.md @@ -164,3 +164,26 @@ Ensure you're not using the `Administrator` user for `npm install` nor to run th To do that, go to `Settings > Update & Security > For developers` and enable `Developer mode`. After that, delete the `.cache` folder under your user directory and try building the app again. + +## Customizing `postinstall` Behavior {#postinstall-behavior} +When installing `node-llama-cpp`, its `postinstall` script checks whether the prebuilt binaries +are compatible with current machine (which they almost always are, at least the CPU-only ones which are the last resort fallback), +and when not, attempts [building the native bindings from source](./building-from-source.md). + +When attempting to [build from source](./building-from-source.md), if the machine lacks the required build tools, +the build will fail and indicative error messages will direct you to the specific commands you need to run +or packages you need to install in order for the build process to succeed. + +If you want to customize the `postinstall` behavior, you can do so using any of the following methods: +* Passing the `--node-llama-cpp-postinstall=` flag to the `npm install` command. 
+* Setting the `NODE_LLAMA_CPP_POSTINSTALL` environment variable to `` before running `npm install`. +* Configuring `config.nodeLlamaCppPostinstall` on your project's `package.json` to ``. + +Where `` can be one of the following options: +* **`auto` (default)**: the default behavior explained above. +* **`ignoreFailedBuild`**: same as the default behavior, + but a failed build will not throw an error and will be ignored, which means the installation will succeed. + Using [`getLlama`](../api/functions/getLlama.md) for the first time will attempt building from source again by default. +* **`skip`**: skip the entire `postinstall` script. + If the prebuilt binaries are incompatible with the current machine, + using [`getLlama`](../api/functions/getLlama.md) for the first time will attempt building from source by default. diff --git a/src/cli/commands/OnPostInstallCommand.ts b/src/cli/commands/OnPostInstallCommand.ts index e81e9b0b..9731b71b 100644 --- a/src/cli/commands/OnPostInstallCommand.ts +++ b/src/cli/commands/OnPostInstallCommand.ts @@ -1,10 +1,16 @@ +import path from "path"; +import {fileURLToPath} from "url"; import {CommandModule} from "yargs"; import chalk from "chalk"; -import {defaultSkipDownload, documentationPageUrls} from "../../config.js"; +import {defaultSkipDownload, documentationPageUrls, defaultNodeLlamaCppPostinstall} from "../../config.js"; import {getLlamaForOptions} from "../../bindings/getLlama.js"; import {setForceShowConsoleLogPrefix} from "../../state.js"; import {isRunningUnderRosetta} from "../utils/isRunningUnderRosetta.js"; import {getConsoleLogPrefix} from "../../utils/getConsoleLogPrefix.js"; +import {parsePackageJsonConfig, resolvePackageJsonConfig} from "../utils/packageJsonConfig.js"; +import {detectCurrentPackageManager} from "../utils/packageManager.js"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); type OnPostInstallCommand = null; @@ -13,7 +19,22 @@ export const OnPostInstallCommand: CommandModule = 
describe: false, async handler() { if (defaultSkipDownload) - return; + return void process.exit(0); + + const nlcConfig = parsePackageJsonConfig(await resolvePackageJsonConfig(__dirname)); + const postinstallConfig = (defaultNodeLlamaCppPostinstall == null || defaultNodeLlamaCppPostinstall === "auto") + ? nlcConfig.nodeLlamaCppPostinstall ?? defaultNodeLlamaCppPostinstall + : defaultNodeLlamaCppPostinstall; + + // set via a `--node-llama-cpp-postinstall=skip` flag on an `npm install` command + // (prefer `--node-llama-cpp-postinstall=ignoreFailedBuild` if you really need it) + if (postinstallConfig === "skip") { + console.info( + getConsoleLogPrefix(false, false), + "Skipping node-llama-cpp postinstall due to a 'skip' configuration" + ); + return void process.exit(0); + } setForceShowConsoleLogPrefix(true); @@ -34,7 +55,10 @@ export const OnPostInstallCommand: CommandModule = "troubleshooting: " + documentationPageUrls.troubleshooting.RosettaIllegalHardwareInstruction ); - process.exit(1); + if (postinstallConfig === "ignoreFailedBuild") + process.exit(0); + else + process.exit(1); } try { @@ -47,7 +71,25 @@ export const OnPostInstallCommand: CommandModule = process.exit(0); } catch (err) { console.error(err); - process.exit(1); + + const packageManager = detectCurrentPackageManager(); + if (postinstallConfig === "auto" && packageManager === "npm") + console.info( + getConsoleLogPrefix(false, false), + "To disable node-llama-cpp's postinstall for this 'npm install', use the '--node-llama-cpp-postinstall=skip' flag when running 'npm install' command" + ); + + if (postinstallConfig === "auto") + console.info( + getConsoleLogPrefix(false, false), + "To customize node-llama-cpp's postinstall behavior, see the troubleshooting guide: " + + documentationPageUrls.troubleshooting.PostinstallBehavior + ); + + if (postinstallConfig === "ignoreFailedBuild") + process.exit(0); + else + process.exit(1); } } }; diff --git a/src/cli/utils/packageJsonConfig.ts 
b/src/cli/utils/packageJsonConfig.ts new file mode 100644 index 00000000..af08880c --- /dev/null +++ b/src/cli/utils/packageJsonConfig.ts @@ -0,0 +1,69 @@ +import path from "path"; +import fs from "fs-extra"; +import {NodeLlamaCppPostinstallBehavior} from "../../types.js"; + +export async function resolvePackageJsonConfig(startDir: string) { + const currentConfig: Record = {}; + + let currentDirPath = path.resolve(startDir); + while (true) { + const packageJsonPath = path.join(currentDirPath, "package.json"); + try { + if (await fs.pathExists(packageJsonPath)) + applyConfig(currentConfig, await readPackageJsonConfig(packageJsonPath)); + } catch (err) { + // do nothing + } + + const parentDirPath = path.dirname(currentDirPath); + if (parentDirPath === currentDirPath) + break; + + currentDirPath = parentDirPath; + } + + return currentConfig; +} + +export function parsePackageJsonConfig(config: Record) { + const res: NlcPackageJsonConfig = {}; + + const castedConfig = config as NlcPackageJsonConfig; + + if (castedConfig.nodeLlamaCppPostinstall === "auto" || + castedConfig.nodeLlamaCppPostinstall === "ignoreFailedBuild" || + castedConfig.nodeLlamaCppPostinstall === "skip" + ) + res.nodeLlamaCppPostinstall = castedConfig.nodeLlamaCppPostinstall; + else + void (castedConfig.nodeLlamaCppPostinstall satisfies undefined); + + return res; +} + +export type NlcPackageJsonConfig = { + nodeLlamaCppPostinstall?: NodeLlamaCppPostinstallBehavior +}; + +function readPackageJsonConfig(packageJsonPath: string) { + try { + const packageJsonContent = fs.readFileSync(packageJsonPath, "utf8"); + const packageJson = JSON.parse(packageJsonContent); + const config = packageJson?.config; + if (typeof config === "object") + return config; + + return {}; + } catch (err) { + return {}; + } +} + +function applyConfig(baseConfig: Record, newConfig: Record) { + for (const key in newConfig) { + if (key in baseConfig) + continue; + + baseConfig[key] = newConfig[key]; + } +} diff --git 
a/src/cli/utils/packageManager.ts b/src/cli/utils/packageManager.ts new file mode 100644 index 00000000..0b76eef1 --- /dev/null +++ b/src/cli/utils/packageManager.ts @@ -0,0 +1,16 @@ +export function detectCurrentPackageManager(): "npm" | "bun" | "pnpm" | "deno" | "yarn" | undefined { + const userAgent = (process.env["npm_config_user_agent"] ?? "").toLowerCase(); + + if (userAgent.startsWith("bun/")) + return "bun"; + else if (userAgent.startsWith("pnpm/")) + return "pnpm"; + else if (userAgent.startsWith("yarn/")) + return "yarn"; + else if (userAgent.startsWith("deno/")) + return "deno"; + else if (userAgent.startsWith("npm/")) + return "npm"; + + return undefined; +} diff --git a/src/config.ts b/src/config.ts index 631df45c..5337d012 100644 --- a/src/config.ts +++ b/src/config.ts @@ -8,6 +8,7 @@ import {getBinariesGithubRelease} from "./bindings/utils/binariesGithubRelease.j import { nodeLlamaCppGpuOptions, LlamaLogLevel, LlamaLogLevelValues, parseNodeLlamaCppGpuOption, nodeLlamaCppGpuOffStringOptions } from "./bindings/types.js"; +import type {NodeLlamaCppPostinstallBehavior} from "./types.js"; const __dirname = path.dirname(fileURLToPath(import.meta.url)); @@ -75,6 +76,15 @@ export const defaultLlamaCppDebugMode = env.get("NODE_LLAMA_CPP_DEBUG") export const defaultSkipDownload = env.get("NODE_LLAMA_CPP_SKIP_DOWNLOAD") .default("false") .asBool(); + +// set via a `--node-llama-cpp-postinstall=ignoreFailedBuild` flag on an `npm install` command +export const defaultNodeLlamaCppPostinstall = env.get("NODE_LLAMA_CPP_POSTINSTALL") + .default( + env.get("npm_config_node_llama_cpp_postinstall") + .default("auto") + .asEnum(["auto", "ignoreFailedBuild", "skip"] as const satisfies NodeLlamaCppPostinstallBehavior[]) + ) + .asEnum(["auto", "ignoreFailedBuild", "skip"] as const satisfies NodeLlamaCppPostinstallBehavior[]); export const defaultBindingTestLogLevel = env.get("NODE_LLAMA_CPP_BINDING_TEST_LOG_LEVEL") .default(LlamaLogLevel.error) .asEnum(LlamaLogLevelValues); 
@@ -125,7 +135,8 @@ export const documentationPageUrls = { } }, troubleshooting: { - RosettaIllegalHardwareInstruction: documentationUrl + "/guide/troubleshooting#illegal-hardware-instruction" + RosettaIllegalHardwareInstruction: documentationUrl + "/guide/troubleshooting#illegal-hardware-instruction", + PostinstallBehavior: documentationUrl + "/guide/troubleshooting#postinstall-behavior" } } as const; export const newGithubIssueUrl = "https://github.com/withcatai/node-llama-cpp/issues"; diff --git a/src/types.ts b/src/types.ts index 4d24d155..630da6c1 100644 --- a/src/types.ts +++ b/src/types.ts @@ -477,3 +477,5 @@ export type LLamaContextualDryRepeatPenalty = { */ sequenceBreakers?: string[] }; + +export type NodeLlamaCppPostinstallBehavior = "auto" | "ignoreFailedBuild" | "skip"; From 40e911def9fc348b6359c19ca26d46cc759cafca Mon Sep 17 00:00:00 2001 From: Gilad S Date: Mon, 16 Mar 2026 02:48:00 +0200 Subject: [PATCH 02/10] docs: `postinstall` configuration examples --- docs/guide/troubleshooting.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/docs/guide/troubleshooting.md b/docs/guide/troubleshooting.md index 96aeea63..8395256c 100644 --- a/docs/guide/troubleshooting.md +++ b/docs/guide/troubleshooting.md @@ -178,6 +178,8 @@ If you want to customize the `postinstall` behavior, you can do so using any of * Passing the `--node-llama-cpp-postinstall=` flag to the `npm install` command. * Setting the `NODE_LLAMA_CPP_POSTINSTALL` environment variable to `` before running `npm install`. * Configuring `config.nodeLlamaCppPostinstall` on your project's `package.json` to ``. +
+ This will only work when your module is installed globally using `npm -g` or for a non-library project when you run `npm install` in the project root; it will not work when your module is installed as a dependency of another module. Where `` can be one of the following options: * **`auto` (default)**: the default behavior explained above. @@ -187,3 +189,25 @@ Where `` can be one of the following options: * **`skip`**: skip the entire `postinstall` script. If the prebuilt binaries are incompatible with the current machine, using [`getLlama`](../api/functions/getLlama.md) for the first time will attempt building from source by default. + +::: code-group +```shell [npm install flag] +npm install --node-llama-cpp-postinstall=ignoreFailedBuild +``` + +```shell [env var (bash)] +NODE_LLAMA_CPP_POSTINSTALL=ignoreFailedBuild npm install +``` + +```shell [env var (using cross-env)] +npx --yes cross-env NODE_LLAMA_CPP_POSTINSTALL=ignoreFailedBuild npm install +``` + +```json [package.json] +{ + "config": { + "nodeLlamaCppPostinstall": "ignoreFailedBuild" + } +} +``` +::: From e0986fad977f89f2ae037478788a794ca64d3223 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Mon, 16 Mar 2026 02:51:33 +0200 Subject: [PATCH 03/10] fix: bugs --- src/cli/utils/packageJsonConfig.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cli/utils/packageJsonConfig.ts b/src/cli/utils/packageJsonConfig.ts index af08880c..9f4c740e 100644 --- a/src/cli/utils/packageJsonConfig.ts +++ b/src/cli/utils/packageJsonConfig.ts @@ -60,8 +60,8 @@ function readPackageJsonConfig(packageJsonPath: string) { } function applyConfig(baseConfig: Record, newConfig: Record) { - for (const key in newConfig) { - if (key in baseConfig) + for (const key of Object.keys(newConfig)) { + if (Object.hasOwn(baseConfig, key)) continue; baseConfig[key] = newConfig[key]; From 8bf19bcefd0af7422ed7f55aabb35d7baa94dc96 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Mon, 16 Mar 2026 03:09:59 +0200 Subject: [PATCH 
04/10] feat: support `NVFP4` quants

---
 src/gguf/types/GgufMetadataTypes.ts   | 3 ++-
 src/gguf/types/GgufTensorInfoTypes.ts | 3 ++-
 src/gguf/utils/ggufQuantNames.ts      | 1 +
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/gguf/types/GgufMetadataTypes.ts b/src/gguf/types/GgufMetadataTypes.ts
index b4c4b458..a2cd41b9 100644
--- a/src/gguf/types/GgufMetadataTypes.ts
+++ b/src/gguf/types/GgufMetadataTypes.ts
@@ -193,7 +193,8 @@ export enum GgufFileType {
     MOSTLY_Q4_0_8_8 = 35, // deprecated
     MOSTLY_TQ1_0 = 36,
     MOSTLY_TQ2_0 = 37,
-    MOSTLY_MXFP4_MOE = 38
+    MOSTLY_MXFP4_MOE = 38,
+    MOSTLY_NVFP4 = 39
 }
 
 
diff --git a/src/gguf/types/GgufTensorInfoTypes.ts b/src/gguf/types/GgufTensorInfoTypes.ts
index ed750329..1ada8204 100644
--- a/src/gguf/types/GgufTensorInfoTypes.ts
+++ b/src/gguf/types/GgufTensorInfoTypes.ts
@@ -60,5 +60,6 @@ export const enum GgmlType {
     IQ4_NL_4_4 = 36,
     IQ4_NL_4_8 = 37,
     IQ4_NL_8_8 = 38,
-    MXFP4 = 39 // MXFP4 (1 block)
+    MXFP4 = 39, // MXFP4 (1 block)
+    NVFP4 = 40 // NVFP4 (4 blocks, E4M3 scale)
 }
diff --git a/src/gguf/utils/ggufQuantNames.ts b/src/gguf/utils/ggufQuantNames.ts
index abff8a8f..3e2c5c65 100644
--- a/src/gguf/utils/ggufQuantNames.ts
+++ b/src/gguf/utils/ggufQuantNames.ts
@@ -4,6 +4,7 @@ export const ggufQuantNames = new Map([
     ["Q4_0", GgufFileType.MOSTLY_Q4_0],
     ["Q4_1", GgufFileType.MOSTLY_Q4_1],
     ["MXFP4", GgufFileType.MOSTLY_MXFP4_MOE],
+    ["NVFP4", GgufFileType.MOSTLY_NVFP4],
     ["Q5_0", GgufFileType.MOSTLY_Q5_0],
     ["Q5_1", GgufFileType.MOSTLY_Q5_1],
     ["IQ2_XXS", GgufFileType.MOSTLY_IQ2_XXS],
From 4a4701e5e9154497e211b00b8479c48ee81206de Mon Sep 17 00:00:00 2001
From: Gilad S
Date: Mon, 16 Mar 2026 06:11:21 +0200
Subject: [PATCH 05/10] docs: fix typo

---
 docs/guide/troubleshooting.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/guide/troubleshooting.md b/docs/guide/troubleshooting.md
index 8395256c..71865b1b 100644
--- a/docs/guide/troubleshooting.md
+++ b/docs/guide/troubleshooting.md
@@ -179,7 +179,7 @@ 
If you want to customize the `postinstall` behavior, you can do so using any of * Setting the `NODE_LLAMA_CPP_POSTINSTALL` environment variable to `` before running `npm install`. * Configuring `config.nodeLlamaCppPostinstall` on your project's `package.json` to ``.
- This will only work when your module is installed globally using `npm -g` or for a non-library project when you run `npm install` in the project root; it will not work when your module is installed as a dependency of another module. + This will only work when your module is installed globally using `npm install -g` or for a non-library project when you run `npm install` in the project root; it will not work when your module is installed as a dependency of another module. Where `` can be one of the following options: * **`auto` (default)**: the default behavior explained above. From 1257846f31a957cbe8b3399b4e4fcb0604df968d Mon Sep 17 00:00:00 2001 From: Gilad S Date: Mon, 16 Mar 2026 23:48:38 +0200 Subject: [PATCH 06/10] fix: bugs --- src/cli/utils/packageJsonConfig.ts | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/cli/utils/packageJsonConfig.ts b/src/cli/utils/packageJsonConfig.ts index 9f4c740e..f9ed820d 100644 --- a/src/cli/utils/packageJsonConfig.ts +++ b/src/cli/utils/packageJsonConfig.ts @@ -7,13 +7,7 @@ export async function resolvePackageJsonConfig(startDir: string) { let currentDirPath = path.resolve(startDir); while (true) { - const packageJsonPath = path.join(currentDirPath, "package.json"); - try { - if (await fs.pathExists(packageJsonPath)) - applyConfig(currentConfig, await readPackageJsonConfig(packageJsonPath)); - } catch (err) { - // do nothing - } + applyConfig(currentConfig, await readPackageJsonConfig(path.join(currentDirPath, "package.json"))); const parentDirPath = path.dirname(currentDirPath); if (parentDirPath === currentDirPath) @@ -22,6 +16,10 @@ export async function resolvePackageJsonConfig(startDir: string) { currentDirPath = parentDirPath; } + const npmPackageJsonPath = process.env["npm_package_json"] ?? 
""; + if (npmPackageJsonPath !== "") + applyConfig(currentConfig, await readPackageJsonConfig(npmPackageJsonPath)); + return currentConfig; } @@ -45,9 +43,12 @@ export type NlcPackageJsonConfig = { nodeLlamaCppPostinstall?: NodeLlamaCppPostinstallBehavior }; -function readPackageJsonConfig(packageJsonPath: string) { +async function readPackageJsonConfig(packageJsonPath: string) { try { - const packageJsonContent = fs.readFileSync(packageJsonPath, "utf8"); + if (!(await fs.pathExists(packageJsonPath))) + return {}; + + const packageJsonContent = await fs.readFile(packageJsonPath, "utf8"); const packageJson = JSON.parse(packageJsonContent); const config = packageJson?.config; if (typeof config === "object") From 47b678b9090396d0a8ac52178a5ad6c926a6f23c Mon Sep 17 00:00:00 2001 From: Gilad S Date: Tue, 17 Mar 2026 06:53:32 +0200 Subject: [PATCH 07/10] feat: context kv cache key and value type configurations --- llama/addon/AddonContext.cpp | 20 +++++- src/bindings/AddonTypes.ts | 2 + src/cli/commands/ChatCommand.ts | 37 ++++++++-- src/cli/commands/CompleteCommand.ts | 39 +++++++++-- src/cli/commands/InfillCommand.ts | 38 +++++++++-- .../commands/InspectEstimateCommand.ts | 30 +++++++- .../inspect/commands/InspectGgufCommand.ts | 9 ++- .../inspect/commands/InspectMeasureCommand.ts | 68 ++++++++++++++++--- src/cli/utils/interactivelyAskForModel.ts | 41 ++++++++--- src/cli/utils/printCommonInfoLines.ts | 9 +++ src/cli/utils/resolveCommandGgufPath.ts | 16 ++++- src/evaluator/LlamaContext/LlamaContext.ts | 35 +++++++++- src/evaluator/LlamaContext/types.ts | 23 ++++++- src/evaluator/LlamaModel/LlamaModel.ts | 51 +++++++++++++- src/gguf/insights/GgufInsights.ts | 64 ++++++++++++----- .../GgufInsightsConfigurationResolver.ts | 32 ++++++++- .../utils/resolveContextContextSizeOption.ts | 12 +++- .../utils/resolveModelGpuLayersOption.ts | 37 ++++++++-- src/gguf/types/GgufTensorInfoTypes.ts | 14 +++- 19 files changed, 508 insertions(+), 69 deletions(-) diff --git 
a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp index 2b70c08f..b8a22ef0 100644 --- a/llama/addon/AddonContext.cpp +++ b/llama/addon/AddonContext.cpp @@ -443,6 +443,20 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap().Value()); } + if (options.Has("kvCacheKeyType") && options.Get("kvCacheKeyType").IsNumber()) { + auto keyType = options.Get("kvCacheKeyType").As().Int32Value(); + if (keyType >= 0 && keyType < GGML_TYPE_COUNT) { + context_params.type_k = keyType; + } + } + + if (options.Has("kvCacheValueType") && options.Get("kvCacheValueType").IsNumber()) { + auto valueType = options.Get("kvCacheValueType").As().Int32Value(); + if (valueType >= 0 && valueType < GGML_TYPE_COUNT) { + context_params.type_v = valueType; + } + } + if (options.Has("swaFullCache")) { context_params.swa_full = options.Get("swaFullCache").As().Value(); } @@ -1063,7 +1077,7 @@ void AddonContext::init(Napi::Object exports) { } AddonContextSequenceCheckpoint::AddonContextSequenceCheckpoint(const Napi::CallbackInfo& info) : Napi::ObjectWrap(info) { - + } AddonContextSequenceCheckpoint::~AddonContextSequenceCheckpoint() { dispose(); @@ -1099,7 +1113,7 @@ class AddonContextSequenceCheckpointInitWorker : public Napi::AsyncWorker { checkpoint->minPos = llama_memory_seq_pos_min(llama_get_memory(context->ctx), checkpoint->sequenceId); checkpoint->maxPos = llama_memory_seq_pos_max(llama_get_memory(context->ctx), checkpoint->sequenceId); const size_t checkpointSize = llama_state_seq_get_size_ext(context->ctx, checkpoint->sequenceId, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); - + checkpoint->data.resize(checkpointSize, 0); llama_state_seq_get_data_ext(context->ctx, checkpoint->data.data(), checkpointSize, checkpoint->sequenceId, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); } catch (const std::exception& e) { @@ -1164,4 +1178,4 @@ void AddonContextSequenceCheckpoint::init(Napi::Object exports) { } ) ); -} \ No newline at end of file +} diff --git 
a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts index 9fbada1d..294749d5 100644 --- a/src/bindings/AddonTypes.ts +++ b/src/bindings/AddonTypes.ts @@ -31,6 +31,8 @@ export type BindingModule = { ranking?: boolean, threads?: number, performanceTracking?: boolean, + kvCacheKeyType?: number, + kvCacheValueType?: number, swaFullCache?: boolean }): AddonContext }, diff --git a/src/cli/commands/ChatCommand.ts b/src/cli/commands/ChatCommand.ts index 96227f65..d1dca381 100644 --- a/src/cli/commands/ChatCommand.ts +++ b/src/cli/commands/ChatCommand.ts @@ -31,6 +31,7 @@ import {withCliCommandDescriptionDocsUrl} from "../utils/withCliCommandDescripti import {ConsoleInteraction, ConsoleInteractionKey} from "../utils/ConsoleInteraction.js"; import {DraftSequenceTokenPredictor} from "../../evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js"; import {ParsedXtcArg, parseXtcArg} from "../utils/parseXtcArg.js"; +import {GgmlType} from "../../gguf/types/GgufTensorInfoTypes.js"; type ChatCommand = { modelPath?: string, @@ -46,6 +47,8 @@ type ChatCommand = { contextSize?: number, batchSize?: number, flashAttention?: boolean, + kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, + kvCacheValueType?: "currentQuant" | keyof typeof GgmlType, swaFullCache?: boolean, noTrimWhitespace: boolean, grammar: "text" | Parameters[1], @@ -172,6 +175,24 @@ export const ChatCommand: CommandModule = { default: false, description: "Enable flash attention" }) + .option("kvCacheKeyType", { + alias: "kvckt", + type: "string", + choices: [ + "currentQuant", + ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ] as const, + description: "The type of the key for the context KV cache tensors" + }) + .option("kvCacheValueType", { + alias: "kvcvt", + type: "string", + choices: [ + "currentQuant", + ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ] as const, + description: "The type of 
the value for the context KV cache tensors" + }) .option("swaFullCache", { alias: "noSwa", type: "boolean", @@ -379,7 +400,7 @@ export const ChatCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, - promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, swaFullCache, + promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, @@ -390,8 +411,8 @@ export const ChatCommand: CommandModule = { try { await RunChat({ modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, contextSize, - batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, - temperature, minP, topK, topP, seed, xtc, + batchSize, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, + threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, @@ -408,7 +429,7 @@ export const ChatCommand: CommandModule = { async function RunChat({ modelPath: modelArg, header: headerArg, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, - contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg, + contextSize, batchSize, 
kvCacheKeyType, kvCacheValueType, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg, jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, @@ -444,12 +465,16 @@ async function RunChat({ const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, swaFullCache, + kvCacheKeyType, + kvCacheValueType, useMmap }); const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "") ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, { flashAttention, swaFullCache, + kvCacheKeyType, + kvCacheValueType, useMmap, consoleTitle: "Draft model file" }) @@ -495,6 +520,8 @@ async function RunChat({ ? {fitContext: {contextSize}} : undefined, defaultContextFlashAttention: flashAttention, + defaultContextKvCacheKeyType: kvCacheKeyType, + defaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, @@ -530,6 +557,8 @@ async function RunChat({ return await llama.loadModel({ modelPath: resolvedDraftModelPath, defaultContextFlashAttention: flashAttention, + defaultContextKvCacheKeyType: kvCacheKeyType, + defaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, diff --git a/src/cli/commands/CompleteCommand.ts b/src/cli/commands/CompleteCommand.ts index aeeb7117..e5711c7e 100644 --- a/src/cli/commands/CompleteCommand.ts +++ b/src/cli/commands/CompleteCommand.ts @@ -23,6 +23,7 @@ import {documentationPageUrls} from "../../config.js"; import {ConsoleInteraction, ConsoleInteractionKey} from "../utils/ConsoleInteraction.js"; import {DraftSequenceTokenPredictor} from 
"../../evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js"; import {ParsedXtcArg, parseXtcArg} from "../utils/parseXtcArg.js"; +import {GgmlType} from "../../gguf/types/GgufTensorInfoTypes.js"; type CompleteCommand = { modelPath?: string, @@ -34,6 +35,8 @@ type CompleteCommand = { contextSize?: number, batchSize?: number, flashAttention?: boolean, + kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, + kvCacheValueType?: "currentQuant" | keyof typeof GgmlType, swaFullCache?: boolean, threads?: number, temperature: number, @@ -129,6 +132,24 @@ export const CompleteCommand: CommandModule = { default: false, description: "Enable flash attention" }) + .option("kvCacheKeyType", { + alias: "kvckt", + type: "string", + choices: [ + "currentQuant", + ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ] as const, + description: "The type of the key for the context KV cache tensors" + }) + .option("kvCacheValueType", { + alias: "kvcvt", + type: "string", + choices: [ + "currentQuant", + ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ] as const, + description: "The type of the value for the context KV cache tensors" + }) .option("swaFullCache", { alias: "noSwa", type: "boolean", @@ -299,7 +320,7 @@ export const CompleteCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, - flashAttention, swaFullCache, threads, temperature, minP, topK, + flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, @@ -307,7 +328,8 @@ export const 
CompleteCommand: CommandModule = { }) { try { await RunCompletion({ - modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache, + modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, + kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, @@ -323,7 +345,8 @@ export const CompleteCommand: CommandModule = { async function RunCompletion({ - modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache, + modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, + kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, @@ -356,13 +379,17 @@ async function RunCompletion({ const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, swaFullCache, - useMmap + useMmap, + kvCacheKeyType, + kvCacheValueType }); const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "") ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, { flashAttention, swaFullCache, useMmap, + kvCacheKeyType, + kvCacheValueType, consoleTitle: "Draft model file" }) : undefined; @@ -400,6 +427,8 @@ async function RunCompletion({ ? 
{fitContext: {contextSize}} : undefined, defaultContextFlashAttention: flashAttention, + defaultContextKvCacheKeyType: kvCacheKeyType, + defaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, @@ -435,6 +464,8 @@ async function RunCompletion({ return await llama.loadModel({ modelPath: resolvedDraftModelPath, defaultContextFlashAttention: flashAttention, + defaultContextKvCacheKeyType: kvCacheKeyType, + defaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, diff --git a/src/cli/commands/InfillCommand.ts b/src/cli/commands/InfillCommand.ts index 5a69a3a7..6b8eabea 100644 --- a/src/cli/commands/InfillCommand.ts +++ b/src/cli/commands/InfillCommand.ts @@ -23,6 +23,7 @@ import {documentationPageUrls} from "../../config.js"; import {ConsoleInteraction, ConsoleInteractionKey} from "../utils/ConsoleInteraction.js"; import {DraftSequenceTokenPredictor} from "../../evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js"; import {ParsedXtcArg, parseXtcArg} from "../utils/parseXtcArg.js"; +import {GgmlType} from "../../gguf/types/GgufTensorInfoTypes.js"; type InfillCommand = { modelPath?: string, @@ -36,6 +37,8 @@ type InfillCommand = { contextSize?: number, batchSize?: number, flashAttention?: boolean, + kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, + kvCacheValueType?: "currentQuant" | keyof typeof GgmlType, swaFullCache?: boolean, threads?: number, temperature: number, @@ -139,6 +142,24 @@ export const InfillCommand: CommandModule = { default: false, description: "Enable flash attention" }) + .option("kvCacheKeyType", { + alias: "kvckt", + type: "string", + choices: [ + "currentQuant", + ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ] as const, + description: "The type of the key for the context KV cache tensors" + }) + .option("kvCacheValueType", { + alias: "kvcvt", + type: 
"string", + choices: [ + "currentQuant", + ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ] as const, + description: "The type of the value for the context KV cache tensors" + }) .option("swaFullCache", { alias: "noSwa", type: "boolean", @@ -309,7 +330,7 @@ export const InfillCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, - flashAttention, swaFullCache, threads, temperature, minP, topK, + flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, @@ -318,7 +339,8 @@ export const InfillCommand: CommandModule = { try { await RunInfill({ modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, - swaFullCache, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, + kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, + lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, useDirectIo, printTimings @@ -334,7 +356,7 @@ export const InfillCommand: CommandModule = { async function RunInfill({ modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, - swaFullCache, threads, 
temperature, minP, topK, topP, seed, xtc, gpuLayers, + kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, useDirectIo, printTimings @@ -366,13 +388,17 @@ async function RunInfill({ const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, swaFullCache, - useMmap + useMmap, + kvCacheKeyType, + kvCacheValueType }); const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "") ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, { flashAttention, swaFullCache, useMmap, + kvCacheKeyType, + kvCacheValueType, consoleTitle: "Draft model file" }) : undefined; @@ -424,6 +450,8 @@ async function RunInfill({ ? 
{fitContext: {contextSize}} : undefined, defaultContextFlashAttention: flashAttention, + defaultContextKvCacheKeyType: kvCacheKeyType, + defaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, @@ -459,6 +487,8 @@ async function RunInfill({ return await llama.loadModel({ modelPath: resolvedDraftModelPath, defaultContextFlashAttention: flashAttention, + defaultContextKvCacheKeyType: kvCacheKeyType, + defaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, diff --git a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts index ffd5f65e..baaa2a5f 100644 --- a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts +++ b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts @@ -24,6 +24,7 @@ import {printModelDestination} from "../../../utils/printModelDestination.js"; import {toBytes} from "../../../utils/toBytes.js"; import {printDidYouMeanUri} from "../../../utils/resolveCommandGgufPath.js"; import {isModelUri} from "../../../../utils/parseModelUri.js"; +import {GgmlType, resolveGgmlTypeOption} from "../../../../gguf/types/GgufTensorInfoTypes.js"; type InspectEstimateCommand = { modelPath: string, @@ -33,6 +34,8 @@ type InspectEstimateCommand = { contextSize?: number | "train", embedding?: boolean, noMmap?: boolean, + kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, + kvCacheValueType?: "currentQuant" | keyof typeof GgmlType, swaFullCache?: boolean }; @@ -117,6 +120,24 @@ export const InspectEstimateCommand: CommandModule typeof key === "string") as (keyof typeof GgmlType)[] + ] as const, + description: "The type of the key for the context KV cache tensors" + }) + .option("kvCacheValueType", { + alias: "kvcvt", + type: "string", + choices: [ + "currentQuant", + ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ] as const, 
+ description: "The type of the value for the context KV cache tensors" + }) .option("swaFullCache", { alias: "noSwa", type: "boolean", @@ -125,7 +146,8 @@ export const InspectEstimateCommand: CommandModule = { if (parsedMetadata.splicedParts > 1) console.info(`${chalk.yellow("Spliced parts:")} ${parsedMetadata.splicedParts}`); + const dominantTensorType = getDominantTensorType(parsedMetadata.fullTensorInfo ?? []); + console.info(`${chalk.yellow("GGUF version:")} ${parsedMetadata.version}`); console.info(`${chalk.yellow("Tensor count:")} ${parsedMetadata.totalTensorCount.toLocaleString("en-US", numberLocaleFormattingOptions)}`); console.info(`${chalk.yellow("Metadata size:")} ${toBytes(parsedMetadata.totalMetadataSize)}`); console.info(`${chalk.yellow("Tensor info size:")} ${toBytes(parsedMetadata.totalTensorInfoSize!)}`); console.info(`${chalk.yellow("File type:")} ${fileTypeName ?? ""} ${chalk.white(`(${parsedMetadata.metadata.general?.file_type})`)}`); + + if (dominantTensorType != null) + console.info(`${chalk.yellow("Dominant tensor type:")} ${dominantTensorType} (${GgmlType[dominantTensorType]})`); + console.info(`${chalk.yellow("Metadata:")} ${prettyPrintObject(parsedMetadata.metadata, undefined, metadataPrettyPrintOptions)}`); console.info(`${chalk.yellow("Tensor info:")} ${prettyPrintObject(parsedMetadata.fullTensorInfo, undefined, tensorInfoPrettyPrintOptions)}`); } diff --git a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts index 482353f6..dd5fb29a 100644 --- a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts +++ b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts @@ -23,6 +23,7 @@ import {Llama} from "../../../../bindings/Llama.js"; import {toBytes} from "../../../utils/toBytes.js"; import {padSafeContextSize} from "../../../../evaluator/LlamaContext/utils/padSafeContextSize.js"; import {getPlatform} from "../../../../bindings/utils/getPlatform.js"; +import 
{GgmlType, resolveGgmlTypeOption} from "../../../../gguf/types/GgufTensorInfoTypes.js"; type InspectMeasureCommand = { modelPath?: string, @@ -33,6 +34,8 @@ type InspectMeasureCommand = { minContextSize: number, maxContextSize?: number, flashAttention?: boolean, + kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, + kvCacheValueType?: "currentQuant" | keyof typeof GgmlType, swaFullCache?: boolean, batchSize?: number, measures: number, @@ -109,6 +112,24 @@ export const InspectMeasureCommand: CommandModule default: false, description: "Enable flash attention for the context" }) + .option("kvCacheKeyType", { + alias: "kvckt", + type: "string", + choices: [ + "currentQuant", + ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ] as const, + description: "The type of the key for the context KV cache tensors" + }) + .option("kvCacheValueType", { + alias: "kvcvt", + type: "string", + choices: [ + "currentQuant", + ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ] as const, + description: "The type of the value for the context KV cache tensors" + }) .option("swaFullCache", { alias: "noSwa", type: "boolean", @@ -161,7 +182,8 @@ export const InspectMeasureCommand: CommandModule }); }, async handler({ - modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, swaFullCache, + modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, + kvCacheKeyType, kvCacheValueType, swaFullCache, batchSize, measures = 10, memory: measureMemoryType, noMmap, noDirectIo, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText }: InspectMeasureCommand) { @@ -186,7 +208,7 @@ export const InspectMeasureCommand: CommandModule const useMmap = !noMmap && llama.supportsMmap; const useDirectIo = !noDirectIo; const resolvedGgufPath = await resolveCommandGgufPath(ggufPath, llama, 
headers, { - flashAttention, swaFullCache, useMmap + flashAttention, swaFullCache, useMmap, kvCacheKeyType, kvCacheValueType }); console.info(`${chalk.yellow("File:")} ${getReadablePath(resolvedGgufPath)}`); @@ -221,6 +243,16 @@ export const InspectMeasureCommand: CommandModule let lastGpuLayers = maxLayers ?? ggufInsights.totalLayers; let previousContextSizeCheck: undefined | number = undefined; + const resolvedKvCacheKeyType = kvCacheKeyType === "currentQuant" + ? ggufInsights.dominantTensorType ?? GgmlType.F16 + : resolveGgmlTypeOption(kvCacheKeyType) ?? GgmlType.F16; + const resolvedKvCacheValueType = kvCacheValueType === "currentQuant" + ? ggufInsights.dominantTensorType ?? GgmlType.F16 + : resolveGgmlTypeOption(kvCacheValueType) ?? GgmlType.F16; + + if (resolvedKvCacheKeyType != GgmlType.F16 || resolvedKvCacheValueType != GgmlType.F16) + console.info(`${chalk.yellow("KV cache:")} ${GgmlType[resolvedKvCacheKeyType] + " " + GgmlType[resolvedKvCacheValueType]}`); + const measureTable = getMeasureTable(measureMemoryType); measureTable.logHeader({drawRowSeparator: !printHeaderBeforeEachLayer}); @@ -249,6 +281,8 @@ export const InspectMeasureCommand: CommandModule maxContextSize, minContextSize, flashAttention, + kvCacheKeyType: resolvedKvCacheKeyType, + kvCacheValueType: resolvedKvCacheValueType, swaFullCache, batchSize, tests: measures, @@ -533,7 +567,7 @@ const expectedFileName = "InspectMeasureCommand"; async function measureModel({ modelPath, useMmap, useDirectIo, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, - flashAttention, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false, onInfo + flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false, onInfo }: { modelPath: string, useMmap?: boolean, @@ -546,6 +580,8 @@ async function measureModel({ maxGpuLayers: number, minGpuLayers?: number, flashAttention?: boolean, + kvCacheKeyType?: 
GgmlType, + kvCacheValueType?: GgmlType, swaFullCache?: boolean, batchSize?: number, evaluateText?: string, @@ -656,6 +692,8 @@ async function measureModel({ maxGpuLayers, minGpuLayers, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache, batchSize, evaluateText, @@ -759,11 +797,12 @@ async function runTestWorkerLogic() { } async function testContextSizes({ - model, modelVramUsage, modelRamUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, swaFullCache, - batchSize, evaluateText, exitAfterMeasurement = false + model, modelVramUsage, modelRamUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, + kvCacheKeyType, kvCacheValueType, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false }: { model: LlamaModel, modelVramUsage: number, modelRamUsage: number, startContextSize?: number, maxContextSize?: number, - minContextSize?: number, tests: number, flashAttention?: boolean, swaFullCache?: boolean, batchSize?: number, evaluateText?: string, + minContextSize?: number, tests: number, flashAttention?: boolean, kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, + swaFullCache?: boolean, batchSize?: number, evaluateText?: string, exitAfterMeasurement?: boolean }) { let measurementsDone: number = 0; @@ -794,6 +833,8 @@ async function runTestWorkerLogic() { ), ignoreMemorySafetyChecks: currentContextSizeCheck != null, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache, batchSize, failedCreationRemedy: false @@ -849,11 +890,12 @@ async function runTestWorkerLogic() { } async function testWithGpuLayers({ - modelPath, useMmap, useDirectIo, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, swaFullCache, - batchSize, evaluateText, exitAfterMeasurement = false + modelPath, useMmap, useDirectIo, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, + kvCacheKeyType, kvCacheValueType, swaFullCache, batchSize, 
evaluateText, exitAfterMeasurement = false }: { modelPath: string, useMmap?: boolean, useDirectIo?: boolean, gpuLayers: number, tests: number, startContextSize?: number, - maxContextSize?: number, minContextSize?: number, flashAttention?: boolean, swaFullCache?: boolean, batchSize?: number, + maxContextSize?: number, minContextSize?: number, flashAttention?: boolean, kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, + swaFullCache?: boolean, batchSize?: number, evaluateText?: string, exitAfterMeasurement?: boolean }) { try { @@ -865,6 +907,8 @@ async function runTestWorkerLogic() { useDirectIo, gpuLayers, defaultContextFlashAttention: flashAttention, + defaultContextKvCacheKeyType: kvCacheKeyType, + defaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, ignoreMemorySafetyChecks: true }); @@ -888,6 +932,8 @@ async function runTestWorkerLogic() { maxContextSize, minContextSize, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache, batchSize, tests, @@ -939,6 +985,8 @@ async function runTestWorkerLogic() { maxContextSize: message.maxContextSize, minContextSize: message.minContextSize, flashAttention: message.flashAttention, + kvCacheKeyType: message.kvCacheKeyType, + kvCacheValueType: message.kvCacheValueType, swaFullCache: message.swaFullCache, batchSize: message.batchSize, evaluateText: message.evaluateText, @@ -1033,6 +1081,8 @@ type ParentToChildMessage = { maxGpuLayers: number, minGpuLayers?: number, flashAttention?: boolean, + kvCacheKeyType?: GgmlType, + kvCacheValueType?: GgmlType, swaFullCache?: boolean, batchSize?: number, initialMaxContextSize?: number, diff --git a/src/cli/utils/interactivelyAskForModel.ts b/src/cli/utils/interactivelyAskForModel.ts index 8238daec..bd1cfb71 100644 --- a/src/cli/utils/interactivelyAskForModel.ts +++ b/src/cli/utils/interactivelyAskForModel.ts @@ -15,6 +15,7 @@ import {getPrettyBuildGpuName} from "../../bindings/consts.js"; import {GgufInsightsConfigurationResolver} 
from "../../gguf/insights/GgufInsightsConfigurationResolver.js"; import {isUrl} from "../../utils/isUrl.js"; import {isModelUri, parseModelUri} from "../../utils/parseModelUri.js"; +import {GgmlType} from "../../gguf/types/GgufTensorInfoTypes.js"; import {resolveModelRecommendationFileOptions} from "./resolveModelRecommendationFileOptions.js"; import {getReadablePath} from "./getReadablePath.js"; import {basicChooseFromListConsoleInteraction} from "./basicChooseFromListConsoleInteraction.js"; @@ -61,7 +62,9 @@ export async function interactivelyAskForModel({ downloadIntent = true, flashAttention = false, swaFullCache = false, - useMmap + useMmap, + kvCacheKeyType, + kvCacheValueType }: { llama: Llama, modelsDirectory?: string, @@ -69,7 +72,9 @@ export async function interactivelyAskForModel({ downloadIntent?: boolean, flashAttention?: boolean, swaFullCache?: boolean, - useMmap?: boolean + useMmap?: boolean, + kvCacheKeyType?: "currentQuant" | GgmlType, + kvCacheValueType?: "currentQuant" | GgmlType }): Promise { let localModelFileOptions: (ModelOption & {type: "localModel"})[] = []; const recommendedModelOptions: (ModelOption & {type: "recommendedModel"})[] = []; @@ -123,7 +128,13 @@ export async function interactivelyAskForModel({ const compatibilityScore = await ggufInsights?.configurationResolver.scoreModelConfigurationCompatibility({ flashAttention: flashAttention && ggufInsights?.flashAttentionSupported, swaFullCache, - useMmap + useMmap, + kvCacheKeyType: kvCacheKeyType === "currentQuant" + ? ggufInsights?.dominantTensorType + : kvCacheKeyType, + kvCacheValueType: kvCacheValueType === "currentQuant" + ? 
ggufInsights?.dominantTensorType + : kvCacheValueType }); return { @@ -296,7 +307,8 @@ export async function interactivelyAskForModel({ items: options, renderItem(item, focused, rerender) { return renderSelectionItem( - item, focused, rerender, activeInteractionController.signal, llama, flashAttention, swaFullCache, useMmap + item, focused, rerender, activeInteractionController.signal, llama, flashAttention, swaFullCache, useMmap, + kvCacheKeyType, kvCacheValueType ); }, canFocusItem(item) { @@ -413,7 +425,8 @@ async function askForModelUriOrPath(allowLocalModels: boolean): Promise void, abortSignal: AbortSignal, llama: Llama, flashAttention: boolean, - swaFullCache: boolean, useMmap?: boolean + swaFullCache: boolean, useMmap: boolean | undefined, + kvCacheKeyType?: "currentQuant" | GgmlType, kvCacheValueType?: "currentQuant" | GgmlType ) { if (item.type === "localModel") { let modelText = item.title instanceof Function @@ -441,7 +454,9 @@ function renderSelectionItem( llama, flashAttention, swaFullCache, - useMmap + useMmap, + kvCacheKeyType, + kvCacheValueType }); } @@ -563,7 +578,7 @@ function renderRecommendedModelTechnicalInfo( } async function selectFileForModelRecommendation({ - recommendedModelOption, llama, abortSignal, rerenderOption, flashAttention, swaFullCache, useMmap + recommendedModelOption, llama, abortSignal, rerenderOption, flashAttention, swaFullCache, useMmap, kvCacheKeyType, kvCacheValueType }: { recommendedModelOption: ModelOption & {type: "recommendedModel"}, llama: Llama, @@ -571,7 +586,9 @@ async function selectFileForModelRecommendation({ rerenderOption(): void, flashAttention: boolean, swaFullCache: boolean, - useMmap?: boolean + useMmap?: boolean, + kvCacheKeyType?: "currentQuant" | GgmlType, + kvCacheValueType?: "currentQuant" | GgmlType }) { try { let bestScore: number | undefined = undefined; @@ -594,7 +611,13 @@ async function selectFileForModelRecommendation({ const compatibilityScore = await 
ggufInsights.configurationResolver.scoreModelConfigurationCompatibility({ flashAttention, swaFullCache, - useMmap + useMmap, + kvCacheKeyType: kvCacheKeyType === "currentQuant" + ? ggufInsights.dominantTensorType + : kvCacheKeyType, + kvCacheValueType: kvCacheValueType === "currentQuant" + ? ggufInsights.dominantTensorType + : kvCacheValueType }); if (bestScore == null || compatibilityScore.compatibilityScore > bestScore) { diff --git a/src/cli/utils/printCommonInfoLines.ts b/src/cli/utils/printCommonInfoLines.ts index 983a1056..47a81c42 100644 --- a/src/cli/utils/printCommonInfoLines.ts +++ b/src/cli/utils/printCommonInfoLines.ts @@ -2,6 +2,7 @@ import chalk from "chalk"; import {getPrettyBuildGpuName} from "../../bindings/consts.js"; import {LlamaContext} from "../../evaluator/LlamaContext/LlamaContext.js"; import {getPlatform} from "../../bindings/utils/getPlatform.js"; +import {GgmlType} from "../../gguf/types/GgufTensorInfoTypes.js"; import {printInfoLine} from "./printInfoLine.js"; import {toBytes} from "./toBytes.js"; @@ -125,6 +126,10 @@ export async function printCommonInfoLines({ show: tokenMeterEnabled, title: "Token meter", value: "enabled" + }, { + show: context.kvCacheKeyType !== GgmlType.F16 || context.kvCacheValueType !== GgmlType.F16, + title: "KV cache", + value: GgmlType[context.kvCacheKeyType] + " " + GgmlType[context.kvCacheValueType] }] }); @@ -180,6 +185,10 @@ export async function printCommonInfoLines({ show: tokenMeterEnabled, title: "Token meter", value: "enabled" + }, { + show: draftContext.kvCacheKeyType !== GgmlType.F16 || draftContext.kvCacheValueType !== GgmlType.F16, + title: "KV cache", + value: GgmlType[draftContext.kvCacheKeyType] + " " + GgmlType[draftContext.kvCacheValueType] }] }); } diff --git a/src/cli/utils/resolveCommandGgufPath.ts b/src/cli/utils/resolveCommandGgufPath.ts index 219d1808..a11cfa96 100644 --- a/src/cli/utils/resolveCommandGgufPath.ts +++ b/src/cli/utils/resolveCommandGgufPath.ts @@ -8,14 +8,18 @@ import 
{resolveModelDestination} from "../../utils/resolveModelDestination.js"; import {ggufQuantNames} from "../../gguf/utils/ggufQuantNames.js"; import {getConsoleLogPrefix} from "../../utils/getConsoleLogPrefix.js"; import {isModelUri} from "../../utils/parseModelUri.js"; +import {GgmlType, resolveGgmlTypeOption} from "../../gguf/types/GgufTensorInfoTypes.js"; import {ConsoleInteraction, ConsoleInteractionKey} from "./ConsoleInteraction.js"; import {getReadablePath} from "./getReadablePath.js"; import {interactivelyAskForModel} from "./interactivelyAskForModel.js"; export async function resolveCommandGgufPath(ggufPath: string | undefined, llama: Llama, fetchHeaders?: Record, { - targetDirectory = cliModelsDirectory, flashAttention = false, swaFullCache = false, useMmap, consoleTitle = "File" + targetDirectory = cliModelsDirectory, flashAttention = false, swaFullCache = false, useMmap, consoleTitle = "File", + kvCacheKeyType, kvCacheValueType }: { - targetDirectory?: string, flashAttention?: boolean, swaFullCache?: boolean, useMmap?: boolean, consoleTitle?: string + targetDirectory?: string, flashAttention?: boolean, swaFullCache?: boolean, useMmap?: boolean, consoleTitle?: string, + kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, + kvCacheValueType?: "currentQuant" | keyof typeof GgmlType } = {}) { if (ggufPath == null) ggufPath = await interactivelyAskForModel({ @@ -25,7 +29,13 @@ export async function resolveCommandGgufPath(ggufPath: string | undefined, llama downloadIntent: true, flashAttention, swaFullCache, - useMmap + useMmap, + kvCacheKeyType: kvCacheKeyType === "currentQuant" + ? "currentQuant" + : resolveGgmlTypeOption(kvCacheKeyType), + kvCacheValueType: kvCacheValueType === "currentQuant" + ? 
"currentQuant" + : resolveGgmlTypeOption(kvCacheValueType) }); const resolvedModelDestination = resolveModelDestination(ggufPath); diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index 43b205cc..7d013a51 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -15,6 +15,7 @@ import {pushAll} from "../../utils/pushAll.js"; import {safeEventCallback} from "../../utils/safeEventCallback.js"; import {GgufArchitectureType} from "../../gguf/types/GgufMetadataTypes.js"; import {LlamaLogLevel} from "../../bindings/types.js"; +import {GgmlType, resolveGgmlTypeOption} from "../../gguf/types/GgufTensorInfoTypes.js"; import { BatchingOptions, BatchItem, ContextShiftOptions, ContextTokensDeleteRange, ControlledEvaluateIndexOutput, ControlledEvaluateInputItem, EvaluationPriority, LlamaContextOptions, LlamaContextSequenceDryRepeatPenalty, LlamaContextSequenceRepeatPenalty, PrioritizedBatchItem, @@ -74,6 +75,8 @@ export class LlamaContext { /** @internal */ private readonly _idealThreads: number; /** @internal */ private readonly _minThreads: number; /** @internal */ private readonly _performanceTracking: boolean; + /** @internal */ private readonly _kvCacheKeyType: GgmlType; + /** @internal */ private readonly _kvCacheValueType: GgmlType; /** @internal */ private readonly _totalSequences: number; /** @internal */ private readonly _unusedSequenceIds: number[] = []; /** @internal */ private readonly _batchingOptions: Required; @@ -110,13 +113,17 @@ export class LlamaContext { } = {}, swaFullCache = _model.defaultContextSwaFullCache, performanceTracking = false, + kvCacheKeyType, + kvCacheValueType, _embeddings, _ranking }: LlamaContextOptions & { sequences: number, contextSize: number, batchSize: number, - flashAttention: boolean + flashAttention: boolean, + kvCacheKeyType: GgmlType, + kvCacheValueType: GgmlType }) { if (_model.disposed) throw new DisposedError(); @@ -145,6 
+152,8 @@ export class LlamaContext { : this._llama._threadsSplitter.normalizeThreadsValue(threads?.min ?? 1) ); this._performanceTracking = !!performanceTracking; + this._kvCacheKeyType = kvCacheKeyType; + this._kvCacheValueType = kvCacheValueType; this._swaFullCache = !!swaFullCache; this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({ contextSize: padSafeContextSize(this._contextSize * this._totalSequences, "up"), // each sequence needs its own of cells @@ -159,6 +168,8 @@ export class LlamaContext { embeddings: _embeddings, ranking: _ranking, performanceTracking: this._performanceTracking, + kvCacheKeyType: this._kvCacheKeyType, + kvCacheValueType: this._kvCacheValueType, swaFullCache: this._swaFullCache })); this._batchingOptions = { @@ -221,6 +232,14 @@ export class LlamaContext { return this._flashAttention; } + public get kvCacheKeyType() { + return this._kvCacheKeyType; + } + + public get kvCacheValueType() { + return this._kvCacheValueType; + } + /** * The actual size of the state in the memory in bytes. * This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context. @@ -872,6 +891,12 @@ export class LlamaContext { const flashAttention = _model.flashAttentionSupported ? Boolean(options.flashAttention ?? _model.defaultContextFlashAttention) : false; + const kvCacheKeyType = options.kvCacheKeyType === "currentQuant" + ? _model.fileInsights.dominantTensorType ?? _model.defaultContextKvCacheKeyType + : resolveGgmlTypeOption(options.kvCacheKeyType) ?? _model.defaultContextKvCacheKeyType; + const kvCacheValueType = options.kvCacheValueType === "currentQuant" + ? _model.fileInsights.dominantTensorType ?? _model.defaultContextKvCacheValueType + : resolveGgmlTypeOption(options.kvCacheValueType) ?? _model.defaultContextKvCacheValueType; const swaFullCache = options.swaFullCache ?? _model.defaultContextSwaFullCache; const loraOptions = typeof options.lora === "string" ? 
{adapters: [{filePath: options.lora}]} satisfies LlamaContextOptions["lora"] @@ -889,6 +914,8 @@ export class LlamaContext { modelGpuLayers: _model.gpuLayers, modelTrainContextSize: _model.trainContextSize, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache, getVramState: () => _model._llama._vramOrchestrator.getMemoryState(), llamaGpu: _model._llama.gpu, @@ -920,10 +947,14 @@ export class LlamaContext { modelGpuLayers: _model.gpuLayers, batchSize, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache }); - const context = new LlamaContext({_model}, {...options, contextSize, batchSize, sequences, flashAttention, swaFullCache}); + const context = new LlamaContext({_model}, { + ...options, contextSize, batchSize, sequences, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache + }); const contextCreationVramReservation = options.ignoreMemorySafetyChecks ? null : _model._llama._vramOrchestrator.reserveMemory(resourceRequirementsEstimation.gpuVram); diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts index fad4c435..7638f34c 100644 --- a/src/evaluator/LlamaContext/types.ts +++ b/src/evaluator/LlamaContext/types.ts @@ -1,4 +1,5 @@ -import {PickOptions} from "../../utils/utilTypes.js"; +import type {PickOptions} from "../../utils/utilTypes.js"; +import type {GgmlType} from "../../gguf/types/GgufTensorInfoTypes.js"; import type {LlamaGrammarEvaluationState} from "../LlamaGrammarEvaluationState.js"; import type {TokenBias} from "../TokenBias.js"; import type {Token} from "../../types.js"; @@ -104,6 +105,26 @@ export type LlamaContextOptions = { */ batching?: BatchingOptions, + /** + * The type of the key for the KV cache tensors used in this context. + * + * Set to `"currentQuant"` to use the same type as the current quantization of the model weights tensors. + * + * Defaults to `F16` (inherited from the model option `defaultContextKvCacheKeyType`). + * @experimental - this option is experimental. 
it may not work as intended, and may change in the future + */ + kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType | GgmlType, + + /** + * The type of the value for the KV cache tensors used in this context. + * + * Set to `"currentQuant"` to use the same type as the current quantization of the model weights tensors. + * + * Defaults to `F16` (inherited from the model option `defaultContextKvCacheValueType`). + * @experimental - this option is experimental. it may not work as intended, and may change in the future + */ + kvCacheValueType?: "currentQuant" | keyof typeof GgmlType | GgmlType, + /** * When using SWA (Sliding Window Attention) on a supported model, * extend the sliding window size to the current context size (meaning practically disabling SWA). diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts index 24c84375..480a9df9 100644 --- a/src/evaluator/LlamaModel/LlamaModel.ts +++ b/src/evaluator/LlamaModel/LlamaModel.ts @@ -19,6 +19,7 @@ import {GgufArchitectureType, GgufMetadata} from "../../gguf/types/GgufMetadataT import {OverridesObject} from "../../utils/OverridesObject.js"; import {maxRecentDetokenizerTokens} from "../../consts.js"; import {LlamaRankingContext, LlamaRankingContextOptions} from "../LlamaRankingContext.js"; +import {GgmlType, resolveGgmlTypeOption} from "../../gguf/types/GgufTensorInfoTypes.js"; import {TokenAttribute, TokenAttributes} from "./utils/TokenAttributes.js"; import type {Llama} from "../../bindings/Llama.js"; import type {BuiltinSpecialTokenValue} from "../../utils/LlamaText.js"; @@ -127,6 +128,26 @@ export type LlamaModelOptions = { */ defaultContextFlashAttention?: boolean, + /** + * The default type of the key for the KV cache tensors used for contexts created with this model. + * + * Set to `"currentQuant"` to use the same type as the current quantization of the model weights tensors. + * + * Defaults to `F16`. + * @experimental - this option is experimental. 
it may not work as intended, and may change in the future + */ + defaultContextKvCacheKeyType?: "currentQuant" | keyof typeof GgmlType | GgmlType, + + /** + * The default type of the value for the KV cache tensors used for contexts created with this model. + * + * Set to `"currentQuant"` to use the same type as the current quantization of the model weights tensors. + * + * Defaults to `F16`. + * @experimental - this option is experimental. it may not work as intended, and may change in the future + */ + defaultContextKvCacheValueType?: "currentQuant" | keyof typeof GgmlType | GgmlType, + /** * When using SWA (Sliding Window Attention) on a supported model, * extend the sliding window size to the current context size (meaning practically disabling SWA) @@ -187,6 +208,8 @@ export class LlamaModel { /** @internal */ private readonly _defaultContextFlashAttentionOptionEnabled: boolean; /** @internal */ private readonly _defaultContextFlashAttention: boolean; /** @internal */ private readonly _defaultContextSwaFullCache: boolean; + /** @internal */ private readonly _defaultContextKvCacheKeyType: GgmlType; + /** @internal */ private readonly _defaultContextKvCacheValueType: GgmlType; /** @internal */ private readonly _flashAttentionSupported: boolean; /** @internal */ private readonly _loraAdapters = new Map(); /** @internal */ private _typeDescription?: ModelTypeDescription; @@ -208,6 +231,8 @@ export class LlamaModel { _defaultContextFlashAttentionOptionEnabled, _defaultContextFlashAttention, _defaultContextSwaFullCache, + _defaultContextKvCacheKeyType, + _defaultContextKvCacheValueType, _flashAttentionSupported }: { _llama: Llama, @@ -216,6 +241,8 @@ export class LlamaModel { _defaultContextFlashAttentionOptionEnabled: boolean, _defaultContextFlashAttention: boolean, _defaultContextSwaFullCache: boolean, + _defaultContextKvCacheKeyType: GgmlType, + _defaultContextKvCacheValueType: GgmlType, _flashAttentionSupported: boolean }) { this._llama = _llama; @@ -229,6 +256,8 
@@ export class LlamaModel { this._defaultContextFlashAttentionOptionEnabled = _defaultContextFlashAttentionOptionEnabled; this._defaultContextFlashAttention = _defaultContextFlashAttention; this._defaultContextSwaFullCache = _defaultContextSwaFullCache; + this._defaultContextKvCacheKeyType = _defaultContextKvCacheKeyType; + this._defaultContextKvCacheValueType = _defaultContextKvCacheValueType; this._flashAttentionSupported = _flashAttentionSupported; const overridesList = ggufMetadataOverridesToList(metadataOverrides); this._model = new this._llama._bindings.AddonModel(this._modelPath, removeNullFields({ @@ -357,6 +386,14 @@ export class LlamaModel { return this._defaultContextSwaFullCache; } + public get defaultContextKvCacheKeyType() { + return this._defaultContextKvCacheKeyType; + } + + public get defaultContextKvCacheValueType() { + return this._defaultContextKvCacheValueType; + } + /** * Transform text into tokens that can be fed to the model * @param text - the text to tokenize @@ -707,7 +744,7 @@ export class LlamaModel { }: { _llama: Llama }) { - const {loadSignal, defaultContextFlashAttention} = modelOptions; + const {loadSignal, defaultContextFlashAttention, defaultContextKvCacheKeyType, defaultContextKvCacheValueType} = modelOptions; const useMmap = _llama.supportsMmap && (modelOptions.useMmap ?? defaultUseMmap); const useDirectIo = modelOptions.useDirectIo ?? defaultUseDirectIo; @@ -722,10 +759,18 @@ export class LlamaModel { ? (defaultContextFlashAttention ?? defaultContextFlashAttentionEnabled) : false; const resolvedDefaultContextSwaFullCache = modelOptions.defaultContextSwaFullCache ?? defaultContextSwaFullCache; + const resolvedDefaultContextKvCacheKeyType = defaultContextKvCacheKeyType === "currentQuant" + ? ggufInsights.dominantTensorType ?? GgmlType.F16 + : resolveGgmlTypeOption(defaultContextKvCacheKeyType) ?? GgmlType.F16; + const resolvedDefaultContextKvCacheValueType = defaultContextKvCacheValueType === "currentQuant" + ? 
ggufInsights.dominantTensorType ?? GgmlType.F16 + : resolveGgmlTypeOption(defaultContextKvCacheValueType) ?? GgmlType.F16; const gpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(modelOptions.gpuLayers, { ignoreMemorySafetyChecks: modelOptions.ignoreMemorySafetyChecks, defaultContextFlashAttention: resolvedDefaultContextFlashAttention, defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache, + defaultContextKvCacheKeyType: resolvedDefaultContextKvCacheKeyType, + defaultContextKvCacheValueType: resolvedDefaultContextKvCacheValueType, useMmap }); const resourceRequirementsEstimation = ggufInsights.estimateModelResourceRequirements({ @@ -740,7 +785,9 @@ export class LlamaModel { _defaultContextFlashAttentionOptionEnabled: defaultContextFlashAttention ?? false, _flashAttentionSupported: flashAttentionSupported, _defaultContextFlashAttention: resolvedDefaultContextFlashAttention, - _defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache + _defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache, + _defaultContextKvCacheKeyType: resolvedDefaultContextKvCacheKeyType, + _defaultContextKvCacheValueType: resolvedDefaultContextKvCacheValueType }); const modelCreationVramReservation = modelOptions.ignoreMemorySafetyChecks ? 
null diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts index cc594ace..ed364c35 100644 --- a/src/gguf/insights/GgufInsights.ts +++ b/src/gguf/insights/GgufInsights.ts @@ -2,7 +2,7 @@ import {Llama} from "../../bindings/Llama.js"; import {getLlamaWithoutBackend} from "../../bindings/utils/getLlamaWithoutBackend.js"; import {getDefaultContextBatchSize, getDefaultContextSequences} from "../../evaluator/LlamaContext/LlamaContext.js"; import {GgufFileInfo} from "../types/GgufFileInfoTypes.js"; -import {GgufTensorInfo} from "../types/GgufTensorInfoTypes.js"; +import {GgmlType, GgufTensorInfo} from "../types/GgufTensorInfoTypes.js"; import {GgufArchitectureType} from "../types/GgufMetadataTypes.js"; import {getReadablePath} from "../../cli/utils/getReadablePath.js"; import {padSafeContextSize} from "../../evaluator/LlamaContext/utils/padSafeContextSize.js"; @@ -19,6 +19,7 @@ export class GgufInsights { /** @internal */ private readonly _modelSize: number; /** @internal */ private _totalFileLayers: number | null = null; /** @internal */ private _supportsRanking?: boolean; + /** @internal */ private _dominantTensorType?: GgmlType; /** @internal */ public readonly _ggufFileInfo: GgufFileInfo; /** @internal */ private readonly _configurationResolver: GgufInsightsConfigurationResolver; /** @internal */ private readonly _tokens: GgufInsightsTokens; @@ -163,6 +164,16 @@ export class GgufInsights { return false; } + /** + * Get the dominant tensor type used in the model file + */ + public get dominantTensorType(): GgmlType | undefined { + if (this._dominantTensorType == null) + this._dominantTensorType = getDominantTensorType(this._ggufFileInfo.fullTensorInfo ?? 
[]); + + return this._dominantTensorType; + } + public get supportsRanking() { if (this._supportsRanking != null) return this._supportsRanking; @@ -223,10 +234,12 @@ export class GgufInsights { */ public estimateContextResourceRequirements({ contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, includeGraphOverhead = true, flashAttention = false, - swaFullCache = false + swaFullCache = false, + kvCacheKeyType = GgmlType.F16, kvCacheValueType = GgmlType.F16 }: { contextSize: number, modelGpuLayers: number, batchSize?: number, sequences?: number, isEmbeddingContext?: boolean, - flashAttention?: boolean, includeGraphOverhead?: boolean, swaFullCache?: boolean + flashAttention?: boolean, includeGraphOverhead?: boolean, swaFullCache?: boolean, + kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType }): GgufInsightsResourceRequirements { if (sequences == null) sequences = getDefaultContextSequences(); if (batchSize == null) batchSize = getDefaultContextBatchSize({contextSize, sequences}); @@ -277,7 +290,9 @@ export class GgufInsights { sequences, totalFileLayers, finalModelGpuLayers, - usingGpu + usingGpu, + kvCacheKeyType, + kvCacheValueType }); const vocabularySize = llmData.vocab_size ?? this._ggufFileInfo.metadata.tokenizer?.ggml?.tokens?.length ?? 0; @@ -569,13 +584,17 @@ export class GgufInsights { sequences, totalFileLayers, finalModelGpuLayers, - usingGpu + usingGpu, + kvCacheKeyType = GgmlType.F16, + kvCacheValueType = GgmlType.F16 }: { kvSize: number, sequences: number, totalFileLayers: number, finalModelGpuLayers: number, - usingGpu: boolean + usingGpu: boolean, + kvCacheKeyType?: GgmlType, + kvCacheValueType?: GgmlType }) { // source: `llama_kv_cache_init` in `llama.cpp` const architecture = this._ggufFileInfo.metadata.general?.architecture; @@ -584,16 +603,8 @@ export class GgufInsights { const nEmbdHeadK = this._ggufFileInfo.architectureMetadata.attention?.key_length ?? ((nHead == 0) ? 
0 : (nEmbd / nHead)); const nHeadKv: number | number[] = this._ggufFileInfo.architectureMetadata.attention?.head_count_kv ?? nHead; const nEmbdHeadV = this._ggufFileInfo.architectureMetadata.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead); - const keyTypeSize = architecture === GgufArchitectureType.mamba - // if `type_k` of `llama_context_params` changes to be configurable in `LlamaContext`, - // this would have to depend on that value - ? this._llama._consts.ggmlTypeF32Size - : this._llama._consts.ggmlTypeF16Size; - const valueTypeSize = architecture === GgufArchitectureType.mamba - // if `type_v` of `llama_context_params` changes to be configurable in `LlamaContext`, - // this would have to depend on that value - ? this._llama._consts.ggmlTypeF32Size - : this._llama._consts.ggmlTypeF16Size; + const keyTypeSize = this._llama._bindings.getTypeSizeForGgmlType(kvCacheKeyType) ?? this._llama._consts.ggmlTypeF16Size; + const valueTypeSize = this._llama._bindings.getTypeSizeForGgmlType(kvCacheValueType) ?? this._llama._consts.ggmlTypeF16Size; // source: `llama_model::load_tensors` in `llama-model.cpp` // repeating layers are assigned to GPU from `i_gpu_start = n_layer + 1 - n_gpu_layers` @@ -1088,3 +1099,24 @@ export function parseRankingTemplate(template: string | undefined | null): strin export function isRankingTemplateValid(template: string | undefined | null): boolean { return template != null && template.includes("{{query}}") && template.includes("{{document}}"); } + +export function getDominantTensorType(tensorInfo: GgufTensorInfo[]): GgmlType | undefined { + const tensorTypes: number[] = []; + for (const tensor of tensorInfo) + tensorTypes[tensor.ggmlType] = ( + (tensorTypes[tensor.ggmlType] ?? 
0) + + tensor.dimensions.map(((dim) => Number(dim))).reduce((a, b) => a * b, 1) + ); + + let dominantType: GgmlType | undefined = undefined; + let maxCount = 0; + + for (const [type, count] of tensorTypes.entries()) { + if (count > maxCount) { + maxCount = count; + dominantType = type; + } + } + + return dominantType; +} diff --git a/src/gguf/insights/GgufInsightsConfigurationResolver.ts b/src/gguf/insights/GgufInsightsConfigurationResolver.ts index b0179ae9..ea41dcfa 100644 --- a/src/gguf/insights/GgufInsightsConfigurationResolver.ts +++ b/src/gguf/insights/GgufInsightsConfigurationResolver.ts @@ -3,6 +3,7 @@ import {LlamaModelOptions} from "../../evaluator/LlamaModel/LlamaModel.js"; import {LlamaContextOptions} from "../../evaluator/LlamaContext/types.js"; import {getDefaultContextSequences} from "../../evaluator/LlamaContext/LlamaContext.js"; import {InsufficientMemoryError} from "../../utils/InsufficientMemoryError.js"; +import {GgmlType} from "../types/GgufTensorInfoTypes.js"; import {resolveModelGpuLayersOption} from "./utils/resolveModelGpuLayersOption.js"; import {resolveContextContextSizeOption} from "./utils/resolveContextContextSizeOption.js"; import {scoreLevels} from "./utils/scoreLevels.js"; @@ -39,6 +40,8 @@ export class GgufInsightsConfigurationResolver { targetContextSize, embeddingContext = false, flashAttention = false, + kvCacheKeyType, + kvCacheValueType, swaFullCache = false, useMmap = this._ggufInsights._llama.supportsMmap }: { @@ -46,6 +49,8 @@ export class GgufInsightsConfigurationResolver { targetContextSize?: number, embeddingContext?: boolean, flashAttention?: boolean, + kvCacheKeyType?: GgmlType, + kvCacheValueType?: GgmlType, swaFullCache?: boolean, useMmap?: boolean } = {}, { @@ -65,6 +70,8 @@ export class GgufInsightsConfigurationResolver { } = {}) { const compatibilityScore = await this.scoreModelConfigurationCompatibility({ flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache, contextSize: targetContextSize, 
embeddingContext, @@ -108,6 +115,8 @@ export class GgufInsightsConfigurationResolver { contextSize = Math.min(4096, this._ggufInsights.trainContextSize ?? 4096), embeddingContext = false, flashAttention = false, + kvCacheKeyType, + kvCacheValueType, swaFullCache = false, maximumFittedContextSizeMultiplier = 100, maximumUnfitConfigurationResourceMultiplier = 100, @@ -118,6 +127,8 @@ export class GgufInsightsConfigurationResolver { contextSize?: number, embeddingContext?: boolean, flashAttention?: boolean, + kvCacheKeyType?: GgmlType, + kvCacheValueType?: GgmlType, swaFullCache?: boolean, maximumFittedContextSizeMultiplier?: number, maximumUnfitConfigurationResourceMultiplier?: number, @@ -215,6 +226,8 @@ export class GgufInsightsConfigurationResolver { llamaSupportsGpuOffloading, defaultContextFlashAttention: flashAttention, defaultContextSwaFullCache: swaFullCache, + defaultContextKvCacheKeyType: kvCacheKeyType, + defaultContextKvCacheValueType: kvCacheValueType, ignoreMemorySafetyChecks: forceGpuLayers != null, useMmap } @@ -272,6 +285,8 @@ export class GgufInsightsConfigurationResolver { modelTrainContextSize: this._ggufInsights.trainContextSize ?? 
defaultTrainContextSizeForEstimationPurposes, ignoreMemorySafetyChecks: forceStrictContextSize, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache }); contextFitsMemory = true; @@ -292,7 +307,9 @@ export class GgufInsightsConfigurationResolver { isEmbeddingContext: embeddingContext, modelGpuLayers: resolvedGpuLayers, flashAttention, - swaFullCache + swaFullCache, + kvCacheKeyType, + kvCacheValueType }); const rankPoints = { @@ -388,12 +405,15 @@ export class GgufInsightsConfigurationResolver { llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu, llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading, defaultContextFlashAttention = false, + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, defaultContextSwaFullCache = false, useMmap = this._ggufInsights._llama.supportsMmap }: { ignoreMemorySafetyChecks?: boolean, getVramState?(): Promise<{total: number, free: number}>, llamaVramPaddingSize?: number, llamaGpu?: BuildGpu, llamaSupportsGpuOffloading?: boolean, defaultContextFlashAttention?: boolean, - defaultContextSwaFullCache?: boolean, useMmap?: boolean + defaultContextKvCacheKeyType?: GgmlType, defaultContextKvCacheValueType?: GgmlType, defaultContextSwaFullCache?: boolean, + useMmap?: boolean } = {}) { return resolveModelGpuLayersOption(gpuLayers, { ggufInsights: this._ggufInsights, @@ -403,6 +423,8 @@ export class GgufInsightsConfigurationResolver { llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, defaultContextSwaFullCache, useMmap }); @@ -418,6 +440,8 @@ export class GgufInsightsConfigurationResolver { batchSize, modelTrainContextSize, flashAttention = false, + kvCacheKeyType, + kvCacheValueType, swaFullCache = false, getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), getRamState = (async () => 
this._ggufInsights._llama._ramOrchestrator.getMemoryState()), @@ -430,6 +454,8 @@ export class GgufInsightsConfigurationResolver { modelGpuLayers: number, modelTrainContextSize: number, flashAttention?: boolean, + kvCacheKeyType?: GgmlType, + kvCacheValueType?: GgmlType, swaFullCache?: boolean, batchSize?: LlamaContextOptions["batchSize"], sequences?: number, @@ -448,6 +474,8 @@ export class GgufInsightsConfigurationResolver { modelGpuLayers, modelTrainContextSize, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache, getVramState, getRamState, diff --git a/src/gguf/insights/utils/resolveContextContextSizeOption.ts b/src/gguf/insights/utils/resolveContextContextSizeOption.ts index 49ace603..ba0e4ae7 100644 --- a/src/gguf/insights/utils/resolveContextContextSizeOption.ts +++ b/src/gguf/insights/utils/resolveContextContextSizeOption.ts @@ -5,11 +5,13 @@ import {minAllowedContextSizeInCalculations} from "../../../config.js"; import {getDefaultContextBatchSize, getDefaultModelContextSize} from "../../../evaluator/LlamaContext/LlamaContext.js"; import {InsufficientMemoryError} from "../../../utils/InsufficientMemoryError.js"; import {getRamUsageFromUnifiedVram} from "./getRamUsageFromUnifiedVram.js"; +import type {GgmlType} from "../../types/GgufTensorInfoTypes.js"; const defaultMaxContextSizeSwapUse = 2048; export async function resolveContextContextSizeOption({ - contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, flashAttention, swaFullCache, + contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, flashAttention, + kvCacheKeyType, kvCacheValueType, swaFullCache, getVramState, getRamState, getSwapState, ignoreMemorySafetyChecks = false, isEmbeddingContext = false, maxContextSizeSwapUse = defaultMaxContextSizeSwapUse }: { @@ -20,6 +22,8 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: number, modelTrainContextSize: number, flashAttention: boolean, + 
kvCacheKeyType?: GgmlType, + kvCacheValueType?: GgmlType, swaFullCache: boolean, getVramState(): Promise<{total: number, free: number, unifiedSize: number}>, getRamState(): Promise<{total: number, free: number}>, @@ -53,6 +57,8 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: modelGpuLayers, sequences, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache, isEmbeddingContext }); @@ -99,6 +105,8 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: modelGpuLayers, sequences, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache, isEmbeddingContext }); @@ -148,6 +156,8 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: modelGpuLayers, sequences, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache, isEmbeddingContext }); diff --git a/src/gguf/insights/utils/resolveModelGpuLayersOption.ts b/src/gguf/insights/utils/resolveModelGpuLayersOption.ts index 62d58141..5c544744 100644 --- a/src/gguf/insights/utils/resolveModelGpuLayersOption.ts +++ b/src/gguf/insights/utils/resolveModelGpuLayersOption.ts @@ -5,17 +5,21 @@ import {findBestOption} from "../../../utils/findBestOption.js"; import {getDefaultContextBatchSize, getDefaultModelContextSize} from "../../../evaluator/LlamaContext/LlamaContext.js"; import {minAllowedContextSizeInCalculations} from "../../../config.js"; import {scoreLevels} from "./scoreLevels.js"; +import type {GgmlType} from "../../types/GgufTensorInfoTypes.js"; import type {GgufInsights} from "../GgufInsights.js"; const fitContextExtraMemoryPaddingPercentage = 0.5; export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions["gpuLayers"], { ggufInsights, ignoreMemorySafetyChecks = false, getVramState, llamaVramPaddingSize, - llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, defaultContextSwaFullCache, useMmap + llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, + defaultContextKvCacheKeyType, 
defaultContextKvCacheValueType, defaultContextSwaFullCache, useMmap }: { ggufInsights: GgufInsights, ignoreMemorySafetyChecks?: boolean, getVramState(): Promise<{total: number, free: number}>, llamaVramPaddingSize: number, llamaGpu: BuildGpu, - llamaSupportsGpuOffloading: boolean, defaultContextFlashAttention: boolean, defaultContextSwaFullCache: boolean, useMmap?: boolean + llamaSupportsGpuOffloading: boolean, defaultContextFlashAttention: boolean, + defaultContextKvCacheKeyType?: GgmlType, defaultContextKvCacheValueType?: GgmlType, defaultContextSwaFullCache: boolean, + useMmap?: boolean }): Promise { if (gpuLayers == null) gpuLayers = "auto"; @@ -37,6 +41,8 @@ export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions[" ggufInsights, currentVram: vramState.free, defaultContextFlashAttention, + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, defaultContextSwaFullCache, useMmap }); @@ -74,6 +80,8 @@ export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions[" ? 
gpuLayers.max : undefined, defaultContextFlashAttention, + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, defaultContextSwaFullCache, useMmap }); @@ -97,6 +105,8 @@ function getBestGpuLayersForFreeVram({ minGpuLayers, maxGpuLayers, defaultContextFlashAttention, + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, defaultContextSwaFullCache, useMmap }: { @@ -106,6 +116,8 @@ function getBestGpuLayersForFreeVram({ minGpuLayers?: number, maxGpuLayers?: number, defaultContextFlashAttention: boolean, + defaultContextKvCacheKeyType?: GgmlType, + defaultContextKvCacheValueType?: GgmlType, defaultContextSwaFullCache: boolean, useMmap?: boolean }) { @@ -128,6 +140,8 @@ function getBestGpuLayersForFreeVram({ fitContext, defaultContextFlashAttention, defaultContextSwaFullCache, + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, useMmap }); @@ -187,10 +201,12 @@ function scoreGpuLayersAndContextCombination({gpuLayers, contextSize}: {gpuLayer } function getVramRequiredForGpuLayers({ - gpuLayers, ggufInsights, currentVram, fitContext, defaultContextFlashAttention = false, defaultContextSwaFullCache = false, useMmap + gpuLayers, ggufInsights, currentVram, fitContext, defaultContextFlashAttention = false, + defaultContextKvCacheKeyType, defaultContextKvCacheValueType, defaultContextSwaFullCache = false, useMmap }: { gpuLayers: number, ggufInsights: GgufInsights, currentVram: number, fitContext?: {contextSize?: number, embeddingContext?: boolean}, - defaultContextFlashAttention: boolean, defaultContextSwaFullCache: boolean, useMmap?: boolean + defaultContextFlashAttention: boolean, defaultContextKvCacheKeyType?: GgmlType, defaultContextKvCacheValueType?: GgmlType, + defaultContextSwaFullCache: boolean, useMmap?: boolean }) { const modelVram = ggufInsights.estimateModelResourceRequirements({ gpuLayers, @@ -208,6 +224,8 @@ function getVramRequiredForGpuLayers({ sequences: 1, isEmbeddingContext: fitContext.embeddingContext ?? 
false, flashAttention: defaultContextFlashAttention, + kvCacheKeyType: defaultContextKvCacheKeyType, + kvCacheValueType: defaultContextKvCacheValueType, swaFullCache: defaultContextSwaFullCache }).gpuVram; @@ -228,6 +246,8 @@ function getVramRequiredForGpuLayers({ vram: currentVram - modelVram, isEmbeddingContext: fitContext?.embeddingContext ?? false, flashAttention: defaultContextFlashAttention, + kvCacheKeyType: defaultContextKvCacheKeyType, + kvCacheValueType: defaultContextKvCacheValueType, swaFullCache: defaultContextSwaFullCache }); @@ -241,8 +261,11 @@ function getVramRequiredForGpuLayers({ }; } -function findMaxPossibleContextSizeForVram({gpuLayers, ggufInsights, vram, isEmbeddingContext, flashAttention, swaFullCache}: { - gpuLayers: number, ggufInsights: GgufInsights, vram: number, isEmbeddingContext: boolean, flashAttention: boolean, swaFullCache: boolean +function findMaxPossibleContextSizeForVram({ + gpuLayers, ggufInsights, vram, isEmbeddingContext, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache +}: { + gpuLayers: number, ggufInsights: GgufInsights, vram: number, isEmbeddingContext: boolean, flashAttention: boolean, + kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, swaFullCache: boolean }) { const maxContextSize = getDefaultModelContextSize({trainContextSize: ggufInsights.trainContextSize}); @@ -258,6 +281,8 @@ function findMaxPossibleContextSizeForVram({gpuLayers, ggufInsights, vram, isEmb sequences: 1, isEmbeddingContext, flashAttention, + kvCacheKeyType, + kvCacheValueType, swaFullCache }).gpuVram; diff --git a/src/gguf/types/GgufTensorInfoTypes.ts b/src/gguf/types/GgufTensorInfoTypes.ts index 1ada8204..39e2b984 100644 --- a/src/gguf/types/GgufTensorInfoTypes.ts +++ b/src/gguf/types/GgufTensorInfoTypes.ts @@ -20,7 +20,7 @@ export type GgufTensorInfo = { readonly filePart: number }; -export const enum GgmlType { +export enum GgmlType { F32 = 0, F16 = 1, Q4_0 = 2, @@ -63,3 +63,15 @@ export const enum GgmlType { MXFP4 = 39, // 
MXFP4 (1 block) NVFP4 = 40 // NVFP4 (4 blocks, E4M3 scale) } + +export function resolveGgmlTypeOption(option?: keyof typeof GgmlType | GgmlType) { + if (option == null) + return undefined; + + if (typeof option === "number" && Object.hasOwn(GgmlType, option)) + return option as GgmlType; + else if (typeof option === "string" && Object.hasOwn(GgmlType, option)) + return GgmlType[option as keyof typeof GgmlType]; + + return undefined; +} From 30d5e2ddb0ec8825609bbf5e5cc18c8f17240f0c Mon Sep 17 00:00:00 2001 From: Gilad S Date: Tue, 17 Mar 2026 06:54:08 +0200 Subject: [PATCH 08/10] fix: bug --- src/utils/prettyPrintObject.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/prettyPrintObject.ts b/src/utils/prettyPrintObject.ts index d02b3fca..56891e88 100644 --- a/src/utils/prettyPrintObject.ts +++ b/src/utils/prettyPrintObject.ts @@ -64,7 +64,7 @@ function prettyPrintArray(arr: any[], indent: number = 4, options: PrettyPrintOb const arrayItems = slicedArray.map((item) => prettyPrintObject(item, indent, options)) .concat( hiddenItems > 0 - ? [chalk.white("..." + hiddenItems + " more item" + (hiddenItems !== 1 ? "s" : ""))] + ? [chalk.white("..." + hiddenItems.toLocaleString("en-US") + " more item" + (hiddenItems !== 1 ? 
"s" : ""))] : [] ); const oneLineJoinedArrayItems = arrayItems.join(chalk.whiteBright(", ")); From a0a46d5a0a6cb63fd56672521ae0e12df16246f2 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Tue, 17 Mar 2026 07:50:49 +0200 Subject: [PATCH 09/10] fix: bugs --- llama/addon/AddonContext.cpp | 4 +-- src/cli/commands/ChatCommand.ts | 18 +++++----- src/cli/commands/CompleteCommand.ts | 18 +++++----- src/cli/commands/InfillCommand.ts | 18 +++++----- .../commands/InspectEstimateCommand.ts | 10 +++--- .../inspect/commands/InspectMeasureCommand.ts | 18 +++++----- src/evaluator/LlamaContext/LlamaContext.ts | 29 +++++++++------ src/evaluator/LlamaContext/types.ts | 20 ++++++++--- src/evaluator/LlamaModel/LlamaModel.ts | 35 ++++++++++++++----- 9 files changed, 108 insertions(+), 62 deletions(-) diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp index b8a22ef0..017c6967 100644 --- a/llama/addon/AddonContext.cpp +++ b/llama/addon/AddonContext.cpp @@ -446,14 +446,14 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap().Int32Value(); if (keyType >= 0 && keyType < GGML_TYPE_COUNT) { - context_params.type_k = keyType; + context_params.type_k = static_cast(keyType); } } if (options.Has("kvCacheValueType") && options.Get("kvCacheValueType").IsNumber()) { auto valueType = options.Get("kvCacheValueType").As().Int32Value(); if (valueType >= 0 && valueType < GGML_TYPE_COUNT) { - context_params.type_v = valueType; + context_params.type_v = static_cast(valueType); } } diff --git a/src/cli/commands/ChatCommand.ts b/src/cli/commands/ChatCommand.ts index d1dca381..88ec258e 100644 --- a/src/cli/commands/ChatCommand.ts +++ b/src/cli/commands/ChatCommand.ts @@ -180,18 +180,20 @@ export const ChatCommand: CommandModule = { type: "string", choices: [ "currentQuant", - ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] 
as const, - description: "The type of the key for the context KV cache tensors" + default: "F16" as const, + description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors" }) .option("kvCacheValueType", { alias: "kvcvt", type: "string", choices: [ "currentQuant", - ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, - description: "The type of the value for the context KV cache tensors" + default: "F16" as const, + description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors" }) .option("swaFullCache", { alias: "noSwa", @@ -520,8 +522,8 @@ async function RunChat({ ? {fitContext: {contextSize}} : undefined, defaultContextFlashAttention: flashAttention, - defaultContextKvCacheKeyType: kvCacheKeyType, - defaultContextKvCacheValueType: kvCacheValueType, + experimentalDefaultContextKvCacheKeyType: kvCacheKeyType, + experimentalDefaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, @@ -557,8 +559,8 @@ async function RunChat({ return await llama.loadModel({ modelPath: resolvedDraftModelPath, defaultContextFlashAttention: flashAttention, - defaultContextKvCacheKeyType: kvCacheKeyType, - defaultContextKvCacheValueType: kvCacheValueType, + experimentalDefaultContextKvCacheKeyType: kvCacheKeyType, + experimentalDefaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, diff --git a/src/cli/commands/CompleteCommand.ts b/src/cli/commands/CompleteCommand.ts index e5711c7e..f3fca2a9 100644 --- a/src/cli/commands/CompleteCommand.ts +++ b/src/cli/commands/CompleteCommand.ts @@ -137,18 +137,20 
@@ export const CompleteCommand: CommandModule = { type: "string", choices: [ "currentQuant", - ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, - description: "The type of the key for the context KV cache tensors" + default: "F16" as const, + description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors" }) .option("kvCacheValueType", { alias: "kvcvt", type: "string", choices: [ "currentQuant", - ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, - description: "The type of the value for the context KV cache tensors" + default: "F16" as const, + description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors" }) .option("swaFullCache", { alias: "noSwa", @@ -427,8 +429,8 @@ async function RunCompletion({ ? 
{fitContext: {contextSize}} : undefined, defaultContextFlashAttention: flashAttention, - defaultContextKvCacheKeyType: kvCacheKeyType, - defaultContextKvCacheValueType: kvCacheValueType, + experimentalDefaultContextKvCacheKeyType: kvCacheKeyType, + experimentalDefaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, @@ -464,8 +466,8 @@ async function RunCompletion({ return await llama.loadModel({ modelPath: resolvedDraftModelPath, defaultContextFlashAttention: flashAttention, - defaultContextKvCacheKeyType: kvCacheKeyType, - defaultContextKvCacheValueType: kvCacheValueType, + experimentalDefaultContextKvCacheKeyType: kvCacheKeyType, + experimentalDefaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, diff --git a/src/cli/commands/InfillCommand.ts b/src/cli/commands/InfillCommand.ts index 6b8eabea..7bdb7bd6 100644 --- a/src/cli/commands/InfillCommand.ts +++ b/src/cli/commands/InfillCommand.ts @@ -147,18 +147,20 @@ export const InfillCommand: CommandModule = { type: "string", choices: [ "currentQuant", - ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, - description: "The type of the key for the context KV cache tensors" + default: "F16" as const, + description: "Experimental. The type of the key for the context KV cache tensors. 
Use `currentQuant` to use the same type as the current quantization of the model weights tensors" }) .option("kvCacheValueType", { alias: "kvcvt", type: "string", choices: [ "currentQuant", - ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, - description: "The type of the value for the context KV cache tensors" + default: "F16" as const, + description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors" }) .option("swaFullCache", { alias: "noSwa", @@ -450,8 +452,8 @@ async function RunInfill({ ? {fitContext: {contextSize}} : undefined, defaultContextFlashAttention: flashAttention, - defaultContextKvCacheKeyType: kvCacheKeyType, - defaultContextKvCacheValueType: kvCacheValueType, + experimentalDefaultContextKvCacheKeyType: kvCacheKeyType, + experimentalDefaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, @@ -487,8 +489,8 @@ async function RunInfill({ return await llama.loadModel({ modelPath: resolvedDraftModelPath, defaultContextFlashAttention: flashAttention, - defaultContextKvCacheKeyType: kvCacheKeyType, - defaultContextKvCacheValueType: kvCacheValueType, + experimentalDefaultContextKvCacheKeyType: kvCacheKeyType, + experimentalDefaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, useMmap, useDirectIo, diff --git a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts index baaa2a5f..5915632f 100644 --- a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts +++ b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts @@ -125,18 +125,20 @@ export const InspectEstimateCommand: CommandModule typeof key === "string") as (keyof 
typeof GgmlType)[] + ...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, - description: "The type of the key for the context KV cache tensors" + default: "F16" as const, + description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors" }) .option("kvCacheValueType", { alias: "kvcvt", type: "string", choices: [ "currentQuant", - ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, - description: "The type of the value for the context KV cache tensors" + default: "F16" as const, + description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors" }) .option("swaFullCache", { alias: "noSwa", diff --git a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts index dd5fb29a..60dcace7 100644 --- a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts +++ b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts @@ -117,18 +117,20 @@ export const InspectMeasureCommand: CommandModule type: "string", choices: [ "currentQuant", - ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, - description: "The type of the key for the context KV cache tensors" + default: "F16" as const, + description: "Experimental. The type of the key for the context KV cache tensors. 
Use `currentQuant` to use the same type as the current quantization of the model weights tensors" }) .option("kvCacheValueType", { alias: "kvcvt", type: "string", choices: [ "currentQuant", - ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[] + ...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, - description: "The type of the value for the context KV cache tensors" + default: "F16" as const, + description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors" }) .option("swaFullCache", { alias: "noSwa", @@ -833,8 +835,8 @@ async function runTestWorkerLogic() { ), ignoreMemorySafetyChecks: currentContextSizeCheck != null, flashAttention, - kvCacheKeyType, - kvCacheValueType, + experimentalKvCacheKeyType: kvCacheKeyType, + experimentalKvCacheValueType: kvCacheValueType, swaFullCache, batchSize, failedCreationRemedy: false @@ -907,8 +909,8 @@ async function runTestWorkerLogic() { useDirectIo, gpuLayers, defaultContextFlashAttention: flashAttention, - defaultContextKvCacheKeyType: kvCacheKeyType, - defaultContextKvCacheValueType: kvCacheValueType, + experimentalDefaultContextKvCacheKeyType: kvCacheKeyType, + experimentalDefaultContextKvCacheValueType: kvCacheValueType, defaultContextSwaFullCache: swaFullCache, ignoreMemorySafetyChecks: true }); diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index 7d013a51..248c763e 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -113,8 +113,8 @@ export class LlamaContext { } = {}, swaFullCache = _model.defaultContextSwaFullCache, performanceTracking = false, - kvCacheKeyType, - kvCacheValueType, + experimentalKvCacheKeyType, + experimentalKvCacheValueType, _embeddings, _ranking }: LlamaContextOptions & { @@ -122,8 
+122,8 @@ export class LlamaContext { contextSize: number, batchSize: number, flashAttention: boolean, - kvCacheKeyType: GgmlType, - kvCacheValueType: GgmlType + experimentalKvCacheKeyType: GgmlType, + experimentalKvCacheValueType: GgmlType }) { if (_model.disposed) throw new DisposedError(); @@ -152,8 +152,8 @@ export class LlamaContext { : this._llama._threadsSplitter.normalizeThreadsValue(threads?.min ?? 1) ); this._performanceTracking = !!performanceTracking; - this._kvCacheKeyType = kvCacheKeyType; - this._kvCacheValueType = kvCacheValueType; + this._kvCacheKeyType = experimentalKvCacheKeyType; + this._kvCacheValueType = experimentalKvCacheValueType; this._swaFullCache = !!swaFullCache; this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({ contextSize: padSafeContextSize(this._contextSize * this._totalSequences, "up"), // each sequence needs its own of cells @@ -891,12 +891,12 @@ export class LlamaContext { const flashAttention = _model.flashAttentionSupported ? Boolean(options.flashAttention ?? _model.defaultContextFlashAttention) : false; - const kvCacheKeyType = options.kvCacheKeyType === "currentQuant" + const kvCacheKeyType = options.experimentalKvCacheKeyType === "currentQuant" ? _model.fileInsights.dominantTensorType ?? _model.defaultContextKvCacheKeyType - : resolveGgmlTypeOption(options.kvCacheKeyType) ?? _model.defaultContextKvCacheKeyType; - const kvCacheValueType = options.kvCacheValueType === "currentQuant" + : resolveGgmlTypeOption(options.experimentalKvCacheKeyType) ?? _model.defaultContextKvCacheKeyType; + const kvCacheValueType = options.experimentalKvCacheValueType === "currentQuant" ? _model.fileInsights.dominantTensorType ?? _model.defaultContextKvCacheValueType - : resolveGgmlTypeOption(options.kvCacheValueType) ?? _model.defaultContextKvCacheValueType; + : resolveGgmlTypeOption(options.experimentalKvCacheValueType) ?? _model.defaultContextKvCacheValueType; const swaFullCache = options.swaFullCache ?? 
_model.defaultContextSwaFullCache; const loraOptions = typeof options.lora === "string" ? {adapters: [{filePath: options.lora}]} satisfies LlamaContextOptions["lora"] @@ -953,7 +953,14 @@ export class LlamaContext { }); const context = new LlamaContext({_model}, { - ...options, contextSize, batchSize, sequences, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache + ...options, + contextSize, + batchSize, + sequences, + flashAttention, + experimentalKvCacheKeyType: kvCacheKeyType, + experimentalKvCacheValueType: kvCacheValueType, + swaFullCache }); const contextCreationVramReservation = options.ignoreMemorySafetyChecks ? null diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts index 7638f34c..a0a64f02 100644 --- a/src/evaluator/LlamaContext/types.ts +++ b/src/evaluator/LlamaContext/types.ts @@ -111,9 +111,15 @@ export type LlamaContextOptions = { * Set to `"currentQuant"` to use the same type as the current quantization of the model weights tensors. * * Defaults to `F16` (inherited from the model option `defaultContextKvCacheKeyType`). - * @experimental - this option is experimental. it may not work as intended, and may change in the future + * @deprecated - this option is experimental and highly unstable. + * Only use with a hard-coded model and on specific hardware that you verify where the type passed to this option works correctly. + * Avoid allowing end users to configure this option, as it's highly unstable. + * @experimental - this option is experimental and highly unstable. + * It may not work as intended or even crash the process. + * Use with caution. + * This option may change or get removed in the future without a breaking change version. */ - kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType | GgmlType, + experimentalKvCacheKeyType?: "currentQuant" | keyof typeof GgmlType | GgmlType, /** * The type of the value for the KV cache tensors used in this context. 
@@ -121,9 +127,15 @@ export type LlamaContextOptions = { * Set to `"currentQuant"` to use the same type as the current quantization of the model weights tensors. * * Defaults to `F16` (inherited from the model option `defaultContextKvCacheValueType`). - * @experimental - this option is experimental. it may not work as intended, and may change in the future + * @deprecated - this option is experimental and highly unstable. + * Only use with a hard-coded model and on specific hardware that you verify where the type passed to this option works correctly. + * Avoid allowing end users to configure this option, as it's highly unstable. + * @experimental - this option is experimental and highly unstable. + * It may not work as intended or even crash the process. + * Use with caution. + * This option may change or get removed in the future without a breaking change version. */ - kvCacheValueType?: "currentQuant" | keyof typeof GgmlType | GgmlType, + experimentalKvCacheValueType?: "currentQuant" | keyof typeof GgmlType | GgmlType, /** * When using SWA (Sliding Window Attention) on a supported model, diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts index 480a9df9..84dfa65e 100644 --- a/src/evaluator/LlamaModel/LlamaModel.ts +++ b/src/evaluator/LlamaModel/LlamaModel.ts @@ -134,9 +134,15 @@ export type LlamaModelOptions = { * Set to `"currentQuant"` to use the same type as the current quantization of the model weights tensors. * * Defaults to `F16`. - * @experimental - this option is experimental. it may not work as intended, and may change in the future + * @deprecated - this option is experimental and highly unstable. + * Only use with a hard-coded model and on specific hardware that you verify where the type passed to this option works correctly. + * Avoid allowing end users to configure this option, as it's highly unstable. + * @experimental - this option is experimental and highly unstable. 
+ * It may not work as intended or even crash the process. + * Use with caution. + * This option may change or get removed in the future without a breaking change version. */ - defaultContextKvCacheKeyType?: "currentQuant" | keyof typeof GgmlType | GgmlType, + experimentalDefaultContextKvCacheKeyType?: "currentQuant" | keyof typeof GgmlType | GgmlType, /** * The default type of the value for the KV cache tensors used for contexts created with this model. @@ -144,9 +150,15 @@ export type LlamaModelOptions = { * Set to `"currentQuant"` to use the same type as the current quantization of the model weights tensors. * * Defaults to `F16`. - * @experimental - this option is experimental. it may not work as intended, and may change in the future + * @deprecated - this option is experimental and highly unstable. + * Only use with a hard-coded model and on specific hardware that you verify where the type passed to this option works correctly. + * Avoid allowing end users to configure this option, as it's highly unstable. + * @experimental - this option is experimental and highly unstable. + * It may not work as intended or even crash the process. + * Use with caution. + * This option may change or get removed in the future without a breaking change version. */ - defaultContextKvCacheValueType?: "currentQuant" | keyof typeof GgmlType | GgmlType, + experimentalDefaultContextKvCacheValueType?: "currentQuant" | keyof typeof GgmlType | GgmlType, /** * When using SWA (Sliding Window Attention) on a supported model, @@ -744,7 +756,12 @@ export class LlamaModel { }: { _llama: Llama }) { - const {loadSignal, defaultContextFlashAttention, defaultContextKvCacheKeyType, defaultContextKvCacheValueType} = modelOptions; + const { + loadSignal, + defaultContextFlashAttention, + experimentalDefaultContextKvCacheKeyType, + experimentalDefaultContextKvCacheValueType + } = modelOptions; const useMmap = _llama.supportsMmap && (modelOptions.useMmap ?? 
defaultUseMmap); const useDirectIo = modelOptions.useDirectIo ?? defaultUseDirectIo; @@ -759,12 +776,12 @@ export class LlamaModel { ? (defaultContextFlashAttention ?? defaultContextFlashAttentionEnabled) : false; const resolvedDefaultContextSwaFullCache = modelOptions.defaultContextSwaFullCache ?? defaultContextSwaFullCache; - const resolvedDefaultContextKvCacheKeyType = defaultContextKvCacheKeyType === "currentQuant" + const resolvedDefaultContextKvCacheKeyType = experimentalDefaultContextKvCacheKeyType === "currentQuant" ? ggufInsights.dominantTensorType ?? GgmlType.F16 - : resolveGgmlTypeOption(defaultContextKvCacheKeyType) ?? GgmlType.F16; - const resolvedDefaultContextKvCacheValueType = defaultContextKvCacheValueType === "currentQuant" + : resolveGgmlTypeOption(experimentalDefaultContextKvCacheKeyType) ?? GgmlType.F16; + const resolvedDefaultContextKvCacheValueType = experimentalDefaultContextKvCacheValueType === "currentQuant" ? ggufInsights.dominantTensorType ?? GgmlType.F16 - : resolveGgmlTypeOption(defaultContextKvCacheValueType) ?? GgmlType.F16; + : resolveGgmlTypeOption(experimentalDefaultContextKvCacheValueType) ?? 
GgmlType.F16; const gpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(modelOptions.gpuLayers, { ignoreMemorySafetyChecks: modelOptions.ignoreMemorySafetyChecks, defaultContextFlashAttention: resolvedDefaultContextFlashAttention, From 28e6817128f51b8028df70bc3874b6e538a95eee Mon Sep 17 00:00:00 2001 From: Gilad S Date: Tue, 17 Mar 2026 08:34:59 +0200 Subject: [PATCH 10/10] chore: mistral 4 gguf arch type --- src/gguf/types/GgufMetadataTypes.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gguf/types/GgufMetadataTypes.ts b/src/gguf/types/GgufMetadataTypes.ts index a2cd41b9..249cad32 100644 --- a/src/gguf/types/GgufMetadataTypes.ts +++ b/src/gguf/types/GgufMetadataTypes.ts @@ -114,6 +114,7 @@ export const enum GgufArchitectureType { rnd1 = "rnd1", panguEmbedded = "pangu-embedded", mistral3 = "mistral3", + mistral4 = "mistral4", paddleocr = "paddleocr", mimo2 = "mimo2", step35 = "step35",