diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..2082f7b --- /dev/null +++ b/.gitattributes @@ -0,0 +1,79 @@ +# Pin line endings on text files so cross-platform contributors don't +# see phantom "modified" diffs from autocrlf-driven CRLF<->LF flips. +# +# Background: Windows users with `core.autocrlf=true` (the Git for +# Windows default) see Cargo.toml / tauri.conf.json / etc. as modified +# the moment they `git checkout` because the working-tree copy gets +# rewritten with CRLF while origin's blobs are LF. Without this file, +# every status check on Windows lights those up as dirty even though +# no real change was made. With this file, git normalizes them on the +# way in and out and the status stays clean. + +# Default: treat as text, normalize to LF in the index. The working +# tree gets the platform's native line ending on checkout (LF on +# macOS/Linux, LF on Windows-with-`core.eol=lf`, CRLF on +# Windows-with-default-config). +* text=auto + +# Repo-shape files MUST stay LF in the working tree everywhere -- the +# Tauri / Cargo / npm toolchains all read them with LF assumptions +# even on Windows, and a CRLF-shaped tauri.conf.json caused real +# parse failures earlier in the project history (see the patch- +# tauri-conf.mjs script's "self-heal an empty/corrupt JSON" branch). +*.toml text eol=lf +*.json text eol=lf +*.yml text eol=lf +*.yaml text eol=lf +*.md text eol=lf + +# Source files: LF everywhere. Vite + tsc handle either, but pinning +# avoids whitespace-only diffs in PRs. +*.ts text eol=lf +*.tsx text eol=lf +*.js text eol=lf +*.jsx text eol=lf +*.mjs text eol=lf +*.cjs text eol=lf +*.py text eol=lf +*.rs text eol=lf +*.css text eol=lf +*.html text eol=lf + +# Shell scripts: LF (would otherwise silently break on macOS / Linux +# with "bad interpreter" errors when bash sees \r in the shebang). +*.sh text eol=lf + +# PowerShell: CRLF. The PS 5.1 parser handles either but PowerShell +# scripts authored on Windows traditionally ship CRLF, and Windows +# editors would otherwise rewrite them on save and produce noise. +*.ps1 text eol=crlf +*.psm1 text eol=crlf +*.psd1 text eol=crlf + +# Binary blobs that Git would otherwise try to diff/normalize. Mark +# them explicitly so a `text=auto` heuristic mistake can't corrupt +# them on a cross-platform clone. 
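+#
+# Note for existing clones: these rules don't retro-apply on their own. A
+# one-time renormalize refreshes the index against them (only line endings
+# change):
+#
+#   git add --renormalize .
+#   git status    # anything listed was carrying the wrong endings
+#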
+*.png binary +*.jpg binary +*.jpeg binary +*.gif binary +*.webp binary +*.ico binary +*.icns binary +*.woff binary +*.woff2 binary +*.ttf binary +*.otf binary +*.zip binary +*.gz binary +*.tar binary +*.7z binary +*.exe binary +*.dll binary +*.so binary +*.dylib binary +*.pyd binary +*.safetensors binary +*.gguf binary +*.bin binary +*.onnx binary diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 04a3820..7db93cc 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -265,7 +265,7 @@ jobs: tagName: ${{ inputs.release_tag || github.ref_name }} tauriScript: npx tauri args: --bundles ${{ matrix.bundle_targets }} --ci - includeUpdaterJson: false + includeUpdaterJson: true updaterJsonPreferNsis: false publish-manifest: diff --git a/.gitignore b/.gitignore index d6d110b..92b50b7 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,5 @@ assets/ src-tauri/gen/ .env .env.local -.claude \ No newline at end of file +.claude +AGENTS.md diff --git a/CLAUDE.md b/CLAUDE.md index 6557c50..a4304f5 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -84,7 +84,7 @@ Check for updates to external repos we build from or depend on: | dflash-mlx | `bstnxbt/dflash-mlx` | `main` pinned to commit `f825ffb2` (upstream deleted all tags April 2026) | `git ls-remote https://github.com/bstnxbt/dflash-mlx.git refs/heads/main` | | turboquant | `back2matching/turboquant` | — | `.venv/bin/pip index versions turboquant 2>/dev/null` | | turboquant-mlx | `arozanov/turboquant-mlx` | — | `.venv/bin/pip index versions turboquant-mlx 2>/dev/null` | -| turboquant-mlx-full | `helgklaizar/turboquant_mlx` | — | `.venv/bin/pip index versions turboquant-mlx-full 2>/dev/null` | +| turboquant-mlx-full | `manjunathshiva/turboquant-mlx` | — | `.venv/bin/pip index versions turboquant-mlx-full 2>/dev/null` | | DDTree (ported algorithm) | `liranringel/ddtree` | `main` | `git ls-remote https://github.com/liranringel/ddtree.git HEAD` | ### 4. Cache Strategy Health @@ -108,20 +108,33 @@ no longer relevant. | ID | Item | Trigger / Condition | Notes | |----|------|---------------------|-------| -| FU-001 | Bump `turboquant` to 0.3.x | PyPI publishes `>=0.3.0` (source at 0.3.1 since 2026-04-16) | Adds asymmetric K/V bits, layer-adaptive precision, `--no-quant` eval flag, NumPy 2.0 + transformers 5.x compat. Backward compatible per upstream README. Bump extra in [pyproject.toml](pyproject.toml) once available. | -| FU-002 | Wire TriAttention MLX compressor into mlx_worker | When adding experimental KV compression path for mlx-lm generation | **Blocked on upstream API gap.** `TriAttentionStrategy.apply_mlx_compressor()` exists ([cache_compression/triattention.py](cache_compression/triattention.py)) and triattention 0.2.0 is installable via `pip install --no-deps` (skips triton which is CUDA-only). BUT: (1) `mlx_lm.stream_generate` exposes no per-step callback for invoking the compressor; (2) upstream's `triattention_generate_step` expects `List[Tuple[mx.array, mx.array]]` raw tensor tuples but mlx-lm passes `KVCache` wrapper objects. Fix path: custom generation loop (~100-200 lines) bridging KVCache ↔ tuples, plus calibration-stats UX + kv_budget setting. Do on a CUDA box or with a small test model — don't ship blind. | +| ~~FU-001~~ | ~~Bump `turboquant` to 0.3.x~~ | **Shipped 2026-05-03.** | `turboquant-mlx-full` 0.3.0 published to PyPI; `[turboquant]` extra pin bumped from `>=0.1.3` to `>=0.3.0` in [pyproject.toml](pyproject.toml). 
Adds asymmetric K/V bits, layer-adaptive precision, `--no-quant` eval flag, NumPy 2.0 + transformers 5.x compat. Verified backward compatible — full ``test_cache_strategies.py`` + ``test_image_runtime.py`` + ``test_video_runtime.py`` (190 tests) pass against 0.3.0. The `turboquant` (HuggingFace) and `turboquant-mlx` (arozanov fork) packages stay on their existing pins; only the active `turboquant-mlx-full` path advances. | +| ~~FU-002~~ | ~~Wire TriAttention MLX compressor into mlx_worker~~ | **Shipped 2026-05-03.** | Unblocked by triattention 0.2.0's MLX port (RavenX AI, 2026-04-09): `apply_triattention_mlx(model, kv_budget=N)` operates on the model directly, bypassing the `mlx_lm.stream_generate` callback gap. Spike at [scripts/spike_triattention_mlx.py](scripts/spike_triattention_mlx.py) confirmed 2.63× speedup with identical output on Qwen2.5-0.5B-Instruct-4bit (norm-only scoring works without calibration stats). Wired into `WorkerState._apply_cache_profile` ([backend_service/mlx_worker.py](backend_service/mlx_worker.py)) via a new `_apply_triattention_mlx_compressor` branch — when `cacheStrategy == "triattention"` the worker delegates to `cache_compression.registry.get("triattention").apply_mlx_compressor(model, kv_budget=self.kv_budget)`. `kvBudget` request param defaults to 2048; falls back to native cache on any failure (model None, registry missing, strategy unavailable, apply raises). | | FU-003 | LongLive integration for Wan 2.1 T2V 1.3B | CUDA platforms (Windows/Linux) only | Real-time causal long video gen ([triattention/longlive](https://github.com/WeianMao/triattention/tree/main/longlive)). We ship the target model already. Needs: new video backend branch in [backend_service/video_runtime.py](backend_service/video_runtime.py), LoRA weights download, torchrun orchestration, UI affordance for long-clip mode. Flash Attention dep. | | FU-004 | TriAttention SGLang backend | When/if we adopt SGLang as an inference backend | Added upstream 2026-04-22 as v0.2.0. No action unless SGLang lands in our runtime. | | ~~FU-005~~ | ~~arozanov v_only TurboQuant MLX mode~~ | **Dropped 2026-04-24** | Our current `turboquant-mlx-full` 0.1.3 path already runs without any mlx-lm fork — uses pip `TurboQuantKVCache` with `QuantizedKVCache` fallback ([turboquant_mlx/__init__.py:174-186](turboquant_mlx/__init__.py)). `VOnlyTurboQuantCache` is only in the arozanov fork (we track but don't consume). Value prop already satisfied; entry removed. | -| FU-006 | Re-verify dflash-mlx pin | Quarterly, or when Qwen/Llama drafts land | Currently `f825ffb` = v0.1.4.1 (latest). Upstream deleted tags April 2026 — pin by commit. | -| FU-007 | TeaCache diffusion cache strategy | **FLUX + HunyuanVideo + LTX-Video + CogVideoX + Mochi shipped 2026-04-26.** Wan2.1 still pending. | Five `teacache_forward` patches live under [cache_compression/_teacache_patches/](cache_compression/_teacache_patches/) — FLUX vendored from upstream, the four video DiTs authored as diffusers-shaped ports (upstream targets standalone repos with different forward signatures, so not directly vendorable). Per-model rescale coefficients pulled from upstream's calibration tables. **Wan2.1 still excluded** — ali-vilab `teacache_generate.py` targets Wan-Video/Wan2.1 (signature `(self, x, t, context, seq_len, clip_fea, y)`); diffusers `WanTransformer3DModel` block structure differs enough that a faithful port needs calibration access (deferred). Reference: [ali-vilab/TeaCache](https://github.com/ali-vilab/TeaCache) (Apache 2.0). 
Quality knob `rel_l1_thresh` default 0.4. | -| FU-008 | `stable-diffusion.cpp` engine (cross-platform diffusion) | **Scaffold shipped 2026-04-26.** Generate path (CLI subprocess + stdout progress parser) still pending. | Binary staging in [scripts/stage-runtime.mjs](scripts/stage-runtime.mjs) (mirrors `llama-server-turbo` pattern: `CHAOSENGINE_SDCPP_BIN_DIR` → `~/.chaosengine/bin/` → `../stable-diffusion.cpp/build/bin/`). Path resolution in [src-tauri/src/lib.rs](src-tauri/src/lib.rs) (`resolve_sd_cpp` + `CHAOSENGINE_SDCPP_BIN` env injection in both embedded and source-workspace branches). Engine class in [backend_service/sdcpp_video_runtime.py](backend_service/sdcpp_video_runtime.py) (`SdCppVideoEngine`) — `probe()` returns binary-presence status; `preload`/`unload` track loaded repo; `generate()` raises `NotImplementedError` until CLI arg builders + progress parser land. Manager exposes `sdcpp_video_capabilities()` so Setup/Studio can surface staging state. Models: SD 1.x/2.x/XL, FLUX.1/2, **Wan2.1/2.2 video**, Qwen Image, Z-Image — video subset wired only for Wan repos. Repo [leejet/stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp) (MIT). | -| FU-009 | mlx-video (Blaizzy) Apple Silicon video engine | **LTX-2 shipped 2026-04-26.** Wan still scaffold. | [Blaizzy/mlx-video](https://github.com/Blaizzy/mlx-video) (MIT, 198⭐). LTX-2 paths (`prince-canuma/LTX-2-{distilled,dev,2.3-distilled,2.3-dev}`) routed through subprocess engine in [backend_service/mlx_video_runtime.py](backend_service/mlx_video_runtime.py); manager dispatch lives at [backend_service/video_runtime.py](backend_service/video_runtime.py) `VideoRuntimeManager.generate`. **Wan stays diffusers MPS** — mlx-video Wan2.1/2.2 require an explicit `mlx_video.models.wan_2.convert` step on raw HF weights (no pre-converted MLX repo today). Bundling that conversion into a one-shot install action will promote Wan to mlx-video; until then, Wan paths use diffusers MPS, which is fine for Wan2.1 1.3B / Wan2.2 5B on a 64 GB Mac. | -| FU-010 | vllm-swift Apple Silicon backend (**watch-only**) | Re-evaluate after 1–2 releases or mid-2026; skip if stars/commits stagnate | [TheTom/vllm-swift](https://github.com/TheTom/vllm-swift) — Swift/Metal vLLM forward pass, Python orchestration only. 2.4× over mlx_lm on Qwen3-0.6B single-request; matches vLLM at concurrency 64. Fills the macOS vLLM gap. Low-activity single fork (76 commits, 1 open issue) — treat as experimental. Action: monitor. No code this cycle. | +| ~~FU-006~~ | ~~Re-verify dflash-mlx pin~~ | **Bumped to `8d8545d` = v0.1.5.1 on 2026-05-05 after the ddtree.py rewrite landed.** | Pin advanced from `f825ffb` (v0.1.4.1) to `8d8545d` (v0.1.5.1). 0.1.5+ moved every primitive that [backend_service/ddtree.py](backend_service/ddtree.py) consumed off the runtime top-level onto a per-family `target_ops` adapter — `target_forward_with_hidden_states` → `target_ops.forward_with_hidden_capture`, `extract_context_feature_from_dict` → `target_ops.extract_context_feature`, `make_target_cache` → `target_ops.make_cache`, `_target_embed_tokens` → `target_ops.embed_tokens`, `_target_text_model` → `target_ops.text_model`, `_lm_head_logits` → `target_ops.logits_from_hidden`. `ContextOnlyDraftKVCache` moved to `dflash_mlx.model`; `create_attention_mask` re-imported from `mlx_lm.models.base`; `trim_cache_to` was removed entirely and now lives as a thin local `_trim_cache_to` shim that calls each entry's own `.rollback()` / `.trim()` / `.crop()`. 
Adapter resolved once at the top of `generate_ddtree_mlx` via `resolve_target_ops(target_model)`. Live smoke 2026-05-05 against `mlx-community/Qwen2.5-0.5B-Instruct-4bit` confirmed adapter resolves (`backend=qwen_gdn`, `family=pure_attention`), forward+capture / embed_tokens / text_model / logits_from_hidden / extract_context_feature / `_trim_cache_to` all working. Gains over 0.1.4.1: draft model quantization with Metal MMA kernels, branchless Metal kernels + fused draft KV projections, long-context runtime diagnostics. Re-check cadence resets to quarterly. | +| ~~FU-007~~ | ~~TeaCache for Wan2.1/2.2~~ | **Obsoleted 2026-05-03 by FU-015.** | TeaCache patches for FLUX + HunyuanVideo + LTX-Video + CogVideoX + Mochi remain under [cache_compression/_teacache_patches/](cache_compression/_teacache_patches/). The Wan-specific port that was deferred here is no longer needed: diffusers 0.36 ships a model-agnostic `apply_first_block_cache` hook (FU-015) that operates on `pipeline.transformer` regardless of model, so Wan caches via the same generic strategy without a vendored forward. Pick FBCache for Wan; TeaCache stays available as the alternative for FLUX-family pipelines. | +| ~~FU-008~~ | ~~`stable-diffusion.cpp` engine (cross-platform diffusion)~~ | **Shipped 2026-05-03 (video) + 2026-05-04 (image).** | Binary build via [scripts/build-sdcpp.sh](scripts/build-sdcpp.sh) + [scripts/update-sdcpp.sh](scripts/update-sdcpp.sh) (clones to `/tmp/stable-diffusion.cpp`, cmake `-DSD_METAL=ON` on Darwin or `-DSD_CUBLAS=ON` on Linux+CUDA, installs to `~/.chaosengine/bin/sd`). Build target is `sd-cli` (renamed from `sd` upstream around master-590); installer copies it back to the legacy `sd` filename so downstream resolvers in [sdcpp_video_runtime.py](backend_service/sdcpp_video_runtime.py), [sdcpp_image_runtime.py](backend_service/sdcpp_image_runtime.py), and [stage-runtime.mjs](scripts/stage-runtime.mjs) keep working. Path resolution in [src-tauri/src/lib.rs](src-tauri/src/lib.rs). **Video lane** (`SdCppVideoEngine.generate`): subprocess spawn → maps `VideoGenerationConfig` → sd.cpp flags (`--diffusion-model`, `-p`, `-W/-H`, `--steps`, `--cfg-scale`, `--seed`, `-o`, `--video-frames`, `--fps`, `--negative-prompt`); regex-parses `step N/M` (or `[N/M]`) into `VIDEO_PROGRESS`; reads `.webm` bytes back (sd.cpp's video output is `.webm`/`.avi`/animated `.webp` — no native `.mp4`). Catalog requires `ggufRepo` + `ggufFile` pin (e.g. `QuantStack/Wan2.2-TI2V-5B-GGUF`). **Image lane** (`SdCppImageEngine.generate`, [sdcpp_image_runtime.py](backend_service/sdcpp_image_runtime.py)): mirrors video shape but emits PNG, drops `--video-frames`/`--fps`, batches by looping seeds (sd.cpp renders one image per invocation). Manager dispatch in [image_runtime.py](backend_service/image_runtime.py) `ImageRuntimeManager.generate` routes when `config.runtime == "sdcpp"`, falls through to diffusers on probe failure or runtime error. Catalog variants: `FLUX.1-schnell-sdcpp-q4km` + `FLUX.1-dev-sdcpp-q4km` ([catalog/image_models.py](backend_service/catalog/image_models.py)). Supported image repos: FLUX.1/2 family, SD3.5, SDXL, SD2.1, Qwen-Image (+ 2512), Z-Image (+ Turbo). | +| ~~FU-009~~ | ~~mlx-video (Blaizzy) Apple Silicon video engine~~ | **Fully shipped 2026-05-04. 
Live smoke validated end-to-end.** | LTX-2 paths (`prince-canuma/LTX-2-{distilled,dev,2.3-distilled,2.3-dev}`) routed through subprocess engine in [backend_service/mlx_video_runtime.py](backend_service/mlx_video_runtime.py); Wan-AI paths route via Phase 8 of FU-025 (`_is_wan_repo` + `_build_wan_cmd` + `_REPO_ENTRY_POINTS["Wan-AI/"] = "mlx_video.models.wan_2.generate"`). Live smoke 2026-05-04 against `Wan-AI/Wan2.1-T2V-1.3B` (480×272, 5 frames, 4 steps, unipc): T5 encode 14.1s + transformer load 0.2s (4-bit q) + denoise 2.9s @ 1.4 it/s + VAE decode 1.3s = 19.6s total, 383 KB .mp4 output. The smoke also surfaced + fixed a `status_for` filename gap — mlx-video upstream emits root-level `model.safetensors` + `t5_encoder.safetensors`, not the legacy `transformer*.safetensors` / `text_encoder*.safetensors` patterns the helper originally checked for. Both now match. | +| FU-010 | vllm-swift Apple Silicon backend (**watch-closely**) | Re-evaluate end of June 2026 | [TheTom/vllm-swift](https://github.com/TheTom/vllm-swift) — Swift/Metal vLLM forward pass, Python orchestration only. 2.4× over mlx_lm on Qwen3-0.6B single-request; matches vLLM at concurrency 64. Fills the macOS vLLM gap. **Posture upgraded 2026-05-03** from watch-only after 76 → 238 stars and 1 → 15 forks in ~10 days; v0.3.0 (2026-04-28) shipped Metal Invalid Resource race fix + ~10% TQ MoE perf, v0.2.2 (2026-04-26) added hybrid model batched decode + paged-attention. Single contributor still. Trip-wires for adoption: ≥3 contributors with merged commits OR public benchmark beating mlx_lm at concurrency >1 on Llama-3.x-8B-class (current 2.4× claim is Qwen3-0.6B single-request only). | | FU-011 | LTX-Video 2.3 diffusers variant | Lightricks publishes diffusers-compatible weights (`Lightricks/LTX-2.3` gains `model_index.json`) | LTX-2.3 currently routes via mlx-video on Apple Silicon (`prince-canuma/LTX-2.3-{distilled,dev}` already in catalog). Lightricks' own model card states "diffusers support coming soon". When the diffusers-shaped weights land, add a `Lightricks/LTX-Video-2.3` entry to [backend_service/catalog/video_models.py](backend_service/catalog/video_models.py) under the `ltx-video` family so RTX 4090 / Linux users get a non-MLX path. Until then, no LTX-2.3 path exists for CUDA. | | FU-012 | LTX Spatial Temporal Guidance (STG) | diffusers ships LTXPipeline with `perturbed_blocks` kwarg, or vendor a forward patch | Upstream reference workflows enable STG by default — perturbs final transformer blocks during sampling to reduce object breakup / chroma drift. Our pinned diffusers' LTXPipeline does not accept `perturbed_blocks`. Phase D landed `frame_rate` + `decode_timestep` + `decode_noise_scale` + `guidance_rescale` for reference parity on the basic kwargs; STG is the remaining gap. Track upstream; if quality remains short of the reference, vendor a forward patch under [cache_compression/_teacache_patches/ltx_video.py](cache_compression/_teacache_patches/ltx_video.py)-style. | | FU-013 | Vendored STG-enabled LTX pipeline | Phase F or when a user reports that Phase D + E1 + E2 quality remains short of the upstream reference | Subclass `LTXPipeline` and override `__call__` to add a third forward pass per step with selected transformer block(s) perturbed (skip self-attention or replace with identity). Combine: `pred = uncond + cfg*(text - uncond) + stg_scale*(text - perturbed)`. Reference: Lightricks' upstream LTX-Video repo's `STGSamplingHook`. Estimated ~250 lines of vendored code + tests. 
Sequence dependency: do this AFTER FU-007 (Wan TeaCache) ships so the cache vs guidance interactions are tested in isolation. | -| FU-014 | LLM-based prompt enhancer | When Phase E1 template-only enhancer underperforms in real use | Phase E1 ships a deterministic per-model template suffix; FU-014 replaces it with a small instruction model (Llama-3.2-1B-Instruct via mlx-lm on Apple Silicon, or a 1B GGUF via llama-server elsewhere) that auto-rewrites short prompts into the structured 50-100 word format each video DiT was trained on. Reuses existing inference infrastructure — no new model bundling beyond a 1-2 GB checkpoint. | +| ~~FU-014~~ | ~~LLM-based prompt enhancer~~ | **Closed 2026-05-04 by FU-022.** | Replaced by FU-022's MLX-native enhancer (see below). | +| FU-015 | First Block Cache (diffusers 0.36 generic hook) | **Shipped 2026-05-03.** | Cross-platform diffusion cache strategy backed by `diffusers.hooks.apply_first_block_cache`. Lives at [cache_compression/firstblockcache.py](cache_compression/firstblockcache.py), registered as id `fbcache` in the strategy registry ([cache_compression/__init__.py](cache_compression/__init__.py)). Applies to image + video DiTs (FLUX, SD3.5, Wan2.1/2.2, HunyuanVideo, LTX-Video, CogVideoX, Mochi). Default threshold 0.12 (≈1.8× speedup on FLUX.1-dev with imperceptible quality drift). Same `apply_diffusion_cache_strategy` hook as TeaCache; UNet pipelines (SD1.5/SDXL) raise NotImplementedError into a runtimeNote. Closes FU-007. | +| FU-016 | SageAttention CUDA backend wiring | **Shipped 2026-05-03 (CUDA-gated).** | Helper at [backend_service/helpers/attention_backend.py](backend_service/helpers/attention_backend.py) (`maybe_apply_sage_attention`). Called from both [image_runtime.py](backend_service/image_runtime.py) and [video_runtime.py](backend_service/video_runtime.py) `_ensure_pipeline` after pipeline build. CUDA + sageattention pip wheel + diffusers ≥0.36 + DiT pipeline. No-op on macOS / CPU / UNet / non-DiT pipelines. Stacks multiplicatively with FBCache (community Wan2.1 720P cumulative 54%). Setup-page install action (`pip install sageattention`) follows. | +| FU-017 | SDXL VAE fp16 fix on MPS / CUDA | **Shipped 2026-05-03.** | Probes `madebyollin/sdxl-vae-fp16-fix` snapshot via `local_files_only=True` (no surprise download) at pipeline load. When cached, swaps `pipeline.vae` and lets `_preferred_torch_dtype` stay on fp16 for SDXL on MPS — drops the previous fp32 fallback that doubled wall-time on Apple Silicon. Helpers `_is_sdxl_repo` + `_locate_sdxl_vae_fix_snapshot` in [image_runtime.py](backend_service/image_runtime.py). Falls back to stock VAE + fp32 on any failure. | +| ~~FU-018~~ | ~~TAEHV / TAESD preview decoder~~ | **Fully shipped 2026-05-04 (parts 1 + 2).** | Tiny VAE for cheap preview decode each step. **Part 1 — full-decode VAE swap** ([backend_service/helpers/preview_vae.py](backend_service/helpers/preview_vae.py)) maps repo → preview VAE id (FLUX.1/2 → taef1/taef2, SD3 → taesd3, SDXL incl. sdxl-turbo + SDXL-Lightning → taesdxl, SD1.x/2.x incl. sd-turbo → taesd, Wan2.x → taew2_2, LTX-Video / LTX-2 → taeltx2_3_wide, HunyuanVideo → taehv1_5, CogVideoX → taecogvideox, Mochi → taemochi, Qwen-Image → taeqwenimage). `maybe_apply_preview_vae(pipeline, repo, enabled)` swaps `pipeline.vae` for an `AutoencoderTiny`, mirrors the stock VAE's dtype + device (live-validated against SDXL-Turbo on MPS — without the device mirror the first decoder pass raises `MPSHalfType` vs `torch.HalfTensor`). 
**Part 2 — live per-step thumbnails** ([backend_service/helpers/preview_thumbnails.py](backend_service/helpers/preview_thumbnails.py)) decodes `callback_kwargs["latents"]` through the swapped tiny VAE inside `callback_on_step_end`, scales to ≤192 px, base64-encodes a PNG, publishes to `IMAGE_PROGRESS.set_thumbnail` / `VIDEO_PROGRESS.set_thumbnail`. Stride caps emit count at ~8 (image) / ~6 (video) per gen so the polled `/api/{images,video}/progress` endpoint stays cheap. Handles both standard 4D `(B, C, H, W)` latents (SD1.5 / SDXL / SD3) and FLUX's packed 3D `(B, seq_len, 64)` shape via `pipeline._unpack_latents` (live-validated against FLUX.1-schnell on MPS — 4 thumbnails captured per 4-step gen, all valid base64 PNGs at 192x192). Frontend reads `snapshot.thumbnail` from `useGenerationProgress`, renders inside `LiveProgress` between the bar and the phase list when present. Errors are best-effort: a decode crash never aborts the actual generation — caller catches and falls back to no-thumbnail. **LTX refiner private-kwarg fix:** the FU-018 part 2 wiring also caught + fixed a pre-existing leak where `_invoke_pipeline_with_ltx_refiner` was passing `__cfg_decay` directly into `LTXPipeline.__call__` (would have started leaking `__preview_vae` too). Both private kwargs now stripped in the refiner path. | +| FU-019 | Distill LoRA support (Hyper-SD, FLUX.1-Turbo, lightx2v Wan CausVid) | **Shipped 2026-05-03; extended Phase 3 with Wan2.2-Distill.** | LoRA load + fuse path in both [image_runtime.py](backend_service/image_runtime.py) and [video_runtime.py](backend_service/video_runtime.py) `_ensure_pipeline`. Catalog variants in [catalog/image_models.py](backend_service/catalog/image_models.py) (FLUX.1-dev × Hyper-SD-8step + Turbo-Alpha) and [catalog/video_models.py](backend_service/catalog/video_models.py) (Wan2.1 1.3B/14B × CausVid). **Phase 3 extension: Wan 2.2 A14B I2V × lightx2v 4-step distill.** lightx2v ships full distilled transformers (not LoRAs) for both Wan2.2 MoE experts. New `distillTransformer*` fields on `VideoGenerationConfig` carry repo + high/low-noise filenames + precision (`bf16` / `fp8_e4m3` / `int8`). `_swap_distill_transformers` helper downloads both safetensors via `huggingface_hub.hf_hub_download`, loads via `WanTransformer3DModel.from_single_file`, and reassigns `pipeline.transformer` + `pipeline.transformer_2`. Variant key includes the distill identity so switching variants triggers clean rebuilds. Distill takes precedence over LoRA when both are pinned. Catalog adds: `Wan-AI/Wan2.2-I2V-A14B-Diffusers-distill-bf16` + `-distill-fp8`. Schema-default substitution sets `defaultSteps=4` + `cfgOverride=1.0`. | +| FU-020 | AYS (Align Your Steps) schedule for SD/SDXL | **Shipped 2026-05-03.** | New samplers `ays_dpmpp_2m_sd15` / `ays_dpmpp_2m_sdxl` in `_SAMPLER_REGISTRY` ([image_runtime.py](backend_service/image_runtime.py)). Private `_ays_family` token stripped from `from_config` kwargs and stashed on `pipeline._chaosengine_ays_timesteps`; `_build_pipeline_kwargs` passes it via `timesteps=` and pops `num_inference_steps`. Hardcoded NVIDIA timestep arrays for SD1.5/SDXL/SVD. Flow-match models continue to be gated out by `_is_flow_matching_repo`. | +| FU-021 | Image-runtime CFG decay parity | **Shipped 2026-05-03.** | `cfgDecay` field on `ImageGenerationConfig` + `ImageGenerationRequest`. Linear ramp from initial guidance to 1.5 floor inside the existing `callback_on_step_end` in `generate()`. Gated to flow-match repos (`_is_flow_matching_repo`); SD1.5/SDXL ignore the flag. 
Default off — opt-in vs. video runtime's default-on. | ~~FU-022~~ | ~~LLM-based prompt enhancer~~ | **Shipped 2026-05-04 (Apple Silicon path).** | Replaces the deterministic per-family template-suffix enhancer in `_enhance_prompt`. Helper [backend_service/helpers/prompt_enhancer.py](backend_service/helpers/prompt_enhancer.py) wraps `mlx_lm.load` + `mlx_lm.generate` against a small instruct model (default `mlx-community/Qwen2.5-0.5B-Instruct-4bit`, ~700 MB on disk, ~3s cold load + sub-second per call) — cached in a process-level `_EnhancerSingleton` so the second call onward hits the warm model. Per-family system prompts (`wan` / `ltx` / `hunyuan` / `flux` / `sdxl` / `sd3` / `default`) anchor the rewrite to the DiT's training distribution. `family_for(repo)` matches longest-prefix-wins. Endpoint `POST /api/prompt/enhance` ([routes/prompts.py](backend_service/routes/prompts.py)) returns `{enhanced, note, modelUsed, family}`. Frontend exposes an "Enhance" pill button next to the Prompt label in both Studio tabs ([components/PromptEnhanceButton.tsx](src/components/PromptEnhanceButton.tsx)) — click triggers the rewrite + replaces the textarea on success or surfaces a tooltip note when the enhancer fell back. Failure modes (non-Apple platform, mlx_lm missing, model not cached, generation crash, shorter-than-input rewrite) all return the original prompt + a runtimeNote so the user sees why. Live smoke 2026-05-04: 6-word "a fluffy cat on a windowsill" → 16-word FLUX rewrite (3.2s cold), 13-word Wan rewrite (0.12s warm), 8-word LTX rewrite (0.11s warm). 16 unit tests covering family-mapping + happy path + load-failure + generation crash + shorter-rewrite reject + quote stripping. CUDA / Linux still get the legacy template suffix; the helper returns the original + a "requires Apple Silicon" runtimeNote on those platforms. | FU-023 | SVDQuant / Nunchaku CUDA engine | **Foundation shipped 2026-05-05; awaiting live Windows / Linux CUDA validation.** | Apple Silicon dev box can't exercise the CUDA path live — wiring is in place so a Windows/Linux CUDA pull validates end-to-end. Backend: `_try_load_nunchaku_transformer` helper in [image_runtime.py](backend_service/image_runtime.py) loads via `NunchakuFluxTransformer2dModel` / `NunchakuQwenImageTransformer2DModel` / `NunchakuSD3Transformer2DModel` / `NunchakuSanaTransformer2DModel` / `NunchakuPixArtSigmaTransformer2DModel` — class registry at `_nunchaku_transformer_class_for_repo`. Preferred over NF4/int8wo on CUDA when `nunchakuRepo` pinned + nunchaku importable; falls back cleanly on Apple Silicon / CPU / missing package. Variant key extends with `nunchaku=...` so toggling rebuilds the pipeline. ImageGenerationConfig + ImageGenerationRequest fields: `nunchakuRepo`, `nunchakuFile`. Catalog rows: FLUX.1 Dev × svdq-int4-flux.1-dev, FLUX.1 Schnell × svdq-int4-flux.1-schnell. Setup install: `nunchaku>=1.2.1` via `_INSTALLABLE_PIP_PACKAGES`. Wan / HunyuanVideo / LTX wrappers don't exist in upstream Nunchaku v1.2.1 — adding a future video variant is a catalog-row change. | FU-024 | FP8 layerwise casting for non-FLUX DiTs | **Foundation shipped 2026-05-05; awaiting live CUDA SM 8.9+ validation.** | Apple Silicon can't exercise — Windows/Linux CUDA pull validates. Backend: `_maybe_enable_fp8_layerwise` helper in [image_runtime.py](backend_service/image_runtime.py) calls `transformer.enable_layerwise_casting(storage_dtype=…, compute_dtype=torch.bfloat16)` post-load. 
Family-correct fp8 dtype: E5M2 for HunyuanVideo (per upstream model card recommendation), E4M3 elsewhere (FLUX / Wan / Qwen-Image / SD3 / LTX). Compute capability gate refuses pre-Ada GPUs (SM <8.9) since hardware fp8 isn't there + the cast slows wall-time vs bf16. Helper degrades gracefully when `pipeline.transformer.enable_layerwise_casting` is missing (UNet pipelines / old diffusers) — runtimeNote surfaced into the load notes. Wired through both ImageGenerationConfig + VideoGenerationConfig + Request models + frontend hooks (`imageFp8LayerwiseCasting` / `videoFp8LayerwiseCasting`) + types. Default off; opt-in. | +| ~~FU-025~~ | ~~mlx-video Wan one-shot convert action~~ | **Fully shipped 2026-05-04 (Phase 7 + Phase 8 + Phase 9).** | Closes FU-009 Wan branch. **Phase 7 (foundation):** `[mlx-video]` extra in [pyproject.toml](pyproject.toml) flipped to ``git+https://github.com/Blaizzy/mlx-video.git``. Helper [backend_service/mlx_video_wan_convert.py](backend_service/mlx_video_wan_convert.py) wraps the upstream `python -m mlx_video.models.wan_2.convert` subprocess: `slug_for(repo)` / `output_dir_for(repo)` / `status_for(repo)` / `list_converted()` / `run_convert(checkpoint_dir, repo, dtype, quantize, bits, group_size, timeout)`. Output under ``~/.chaosengine/mlx-video-wan//`` (override via ``CHAOSENGINE_MLX_VIDEO_WAN_DIR``). **Phase 8 (routing):** [mlx_video_runtime.py](backend_service/mlx_video_runtime.py) `supported_repos()` returns dynamic union of LTX-2 + converted-on-disk Wan repos. `_REPO_ENTRY_POINTS` adds `"Wan-AI/": "mlx_video.models.wan_2.generate"`. `_build_wan_cmd` produces the Wan-shaped CLI (`--model-dir`, `--guide-scale` string, `--scheduler`, optional `--seed`/`--steps`/`--negative-prompt`; no LTX-2 flags). `generate()` picks `_wan_runtime_note` (flags MoE experts) and skips LTX-2 effective-step / effective-guidance overrides. **Phase 9 (GUI):** Orchestrator [backend_service/mlx_video_wan_installer.py](backend_service/mlx_video_wan_installer.py) drives preflight → download-raw → convert → verify with structured progress events. Setup endpoints in [routes/setup.py](backend_service/routes/setup.py): `POST /api/setup/install-mlx-video-wan` (background-job pattern mirroring `/api/setup/install-longlive`), `GET /api/setup/install-mlx-video-wan/status`, `GET /api/setup/mlx-video-wan/inventory`. Frontend client in [src/api.ts](src/api.ts) (`startWanInstall`, `getWanInstallStatus`, `getWanInventory`). UI panel [src/components/WanInstallPanel.tsx](src/components/WanInstallPanel.tsx) lists every supported Wan repo with raw-size hint + converted badge / install button + live `InstallLogPanel` underneath; rendered in [VideoDiscoverTab.tsx](src/features/video/VideoDiscoverTab.tsx) above the variant grid. Supported raw repos: `Wan-AI/Wan2.{1-T2V-1.3B,1-T2V-14B,2-TI2V-5B,2-T2V-A14B,2-I2V-A14B}`. End-to-end UX: user clicks Install → backend downloads + converts in background → runtime auto-detects + routes Wan generate calls through mlx-video. Tests: 21 in [test_mlx_video_wan_convert.py](tests/test_mlx_video_wan_convert.py), 9 Wan-routing in [test_mlx_video.py](tests/test_mlx_video.py), 15 in [test_mlx_video_wan_installer.py](tests/test_mlx_video_wan_installer.py). | +| ~~FU-026~~ | ~~TaylorSeer + DBCache aggressive cache preset~~ | **Obsoleted 2026-05-03 by diffusers 0.38 core.** | Diffusers 0.38.0 (2026-05-01) ships ``TaylorSeerCacheConfig``, ``MagCacheConfig``, ``PyramidAttentionBroadcastConfig``, ``FasterCacheConfig`` natively — no ``cache-dit`` dependency required. 
Wired as registry strategies (ids ``taylorseer``, ``magcache``, ``pab``, ``fastercache``) in [cache_compression/__init__.py](cache_compression/__init__.py). Each adapter calls ``pipeline.transformer.enable_cache()``. UNet pipelines (SD1.5/SDXL) raise ``NotImplementedError`` into a runtimeNote, matching the FBCache contract. MagCache is FLUX-only without calibration UX (uses ``FLUX_MAG_RATIOS`` from ``diffusers.hooks.mag_cache``); other DiTs raise a "calibration required" message until that UX lands. | +| FU-027 | NVIDIA/kvpress KV cache toolkit (CUDA-side) | **Setup install action pre-staged 2026-05-05; integration code pending.** | [NVIDIA/kvpress](https://github.com/NVIDIA/kvpress) — Apache 2.0, 1.1k stars, `kvpress>=0.5.3` registered in `_INSTALLABLE_PIP_PACKAGES` so the Setup tab can pre-stage the wheel. Integration hooks land separately under `cache_compression/kvpress.py` once the helper picks an adapter shape (the upstream library exposes `presses` per technique — e.g. SnapKV / TOVA / KIVI / pyramid — and a `Pipeline` wrapper that takes a HF transformers model). Apple Silicon stays on TurboQuant-MLX; this is the CUDA-side complement. | --- diff --git a/backend_service/agent.py b/backend_service/agent.py index 9b9431a..7600f5a 100644 --- a/backend_service/agent.py +++ b/backend_service/agent.py @@ -32,6 +32,13 @@ class ToolCallResult: arguments: dict[str, Any] result: str elapsed_seconds: float + # Phase 2.8: optional structured output the frontend can render + # natively (table / code / markdown / image / chart). When None, + # the legacy collapsible-JSON renderer fires. The `result` text + # field is always populated so the language model sees something + # readable on the next turn regardless of UI rendering. + render_as: str | None = None + data: dict[str, Any] | None = None @dataclass @@ -108,8 +115,19 @@ def _execute_tool_call( ) start = time.perf_counter() + render_as: str | None = None + structured_data: dict[str, Any] | None = None try: - result_text = tool.execute(**arguments) + # Phase 2.8: try the structured entry first. Tools that + # haven't migrated return None and we fall back to the + # plain-text path below. + structured = tool.execute_structured(**arguments) + if structured is not None: + result_text = structured.text + render_as = structured.render_as + structured_data = structured.data + else: + result_text = tool.execute(**arguments) except Exception as exc: result_text = f"Error executing {tool_name}: {exc}" elapsed = round(time.perf_counter() - start, 3) @@ -122,6 +140,8 @@ def _execute_tool_call( arguments=arguments, result=result_text, elapsed_seconds=elapsed, + render_as=render_as, + data=structured_data, ) @@ -384,6 +404,11 @@ def run_agent_loop_streaming( "name": tc_result.tool_name, "result": tc_result.result[:2000], # Cap for streaming "elapsed": tc_result.elapsed_seconds, + # Phase 2.8: stream the structured shape so the + # frontend can render it as the tool finishes + # rather than waiting for the final done payload. 
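+                        # Illustrative only (hypothetical tool, not a fixed
+                        # schema): a migrated lookup tool might emit
+                        # render_as="table" with
+                        # data={"columns": ["name", "score"], "rows": [...]};
+                        # unmigrated tools leave both fields None and the
+                        # legacy collapsible-JSON renderer fires instead.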
+ "renderAs": tc_result.render_as, + "data": tc_result.data, }, } diff --git a/backend_service/app.py b/backend_service/app.py index 86977d7..0d4ea77 100644 --- a/backend_service/app.py +++ b/backend_service/app.py @@ -84,6 +84,8 @@ CHAT_SESSIONS_PATH = DATA_LOCATION.chat_sessions_path LIBRARY_CACHE_PATH = DATA_LOCATION.data_dir / "library_cache.json" DOCUMENTS_DIR = DATA_LOCATION.documents_dir +WORKSPACES_PATH = DATA_LOCATION.workspaces_path +WORKSPACES_DIR = DATA_LOCATION.workspaces_dir IMAGE_OUTPUTS_DIR = DATA_LOCATION.image_outputs_dir VIDEO_OUTPUTS_DIR = DATA_LOCATION.video_outputs_dir MAX_DOC_SIZE_BYTES = 50 * 1024 * 1024 # 50 MB per file @@ -351,6 +353,20 @@ def _generate_image_artifacts( logger.info("Generating image: model=%s repo=%s size=%dx%d steps=%d draft=%s", variant.get("name"), variant.get("repo"), effective_width, effective_height, request.steps, request.draftMode) runtime_manager = runtime_manager or ImageRuntimeManager() + # FU-019: variant-declared defaults override schema defaults only + # when the user hasn't moved the slider. Schema defaults (24 steps, + # CFG 5.5) come from ImageGenerationRequest in models/__init__.py. + SCHEMA_DEFAULT_STEPS = 24 + SCHEMA_DEFAULT_GUIDANCE = 5.5 + effective_steps = request.steps + effective_guidance = request.guidance + variant_default_steps = variant.get("defaultSteps") + variant_cfg_override = variant.get("cfgOverride") + if variant_default_steps is not None and request.steps == SCHEMA_DEFAULT_STEPS: + effective_steps = int(variant_default_steps) + if variant_cfg_override is not None and abs(request.guidance - SCHEMA_DEFAULT_GUIDANCE) < 1e-3: + effective_guidance = float(variant_cfg_override) + rendered_images, runtime_status = runtime_manager.generate( ImageGenerationConfig( modelId=request.modelId, @@ -360,8 +376,8 @@ def _generate_image_artifacts( negativePrompt=request.negativePrompt or "", width=effective_width, height=effective_height, - steps=request.steps, - guidance=request.guidance, + steps=effective_steps, + guidance=effective_guidance, batchSize=request.batchSize, seed=request.seed, qualityPreset=request.qualityPreset, @@ -369,6 +385,30 @@ def _generate_image_artifacts( ggufRepo=(variant.get("ggufRepo") or None), ggufFile=(variant.get("ggufFile") or None), runtime=(variant.get("engine") or None), + cacheStrategy=request.cacheStrategy, + cacheRelL1Thresh=request.cacheRelL1Thresh, + cfgDecay=request.cfgDecay, + previewVae=request.previewVae, + # FU-019: variant-declared LoRA + step / guidance overrides. + # When the catalog variant pins a Hyper-SD / FLUX-Turbo / + # lightx2v LoRA, the engine fuses it into the pipeline at + # load time. ``defaultSteps`` / ``cfgOverride`` substitute + # only when the user kept the schema defaults — explicit + # slider tweaks survive untouched. + loraRepo=(variant.get("loraRepo") or None), + loraFile=(variant.get("loraFile") or None), + loraScale=(variant.get("loraScale") if variant.get("loraScale") is not None else None), + defaultSteps=(variant.get("defaultSteps") if variant.get("defaultSteps") is not None else None), + cfgOverride=(variant.get("cfgOverride") if variant.get("cfgOverride") is not None else None), + # FU-023: variant-pinned Nunchaku SVDQuant snapshot. Threads + # through to ``_ensure_pipeline`` which prefers it over + # NF4 / int8wo on CUDA when nunchaku is installed. + nunchakuRepo=(variant.get("nunchakuRepo") or None), + nunchakuFile=(variant.get("nunchakuFile") or None), + # FU-024: opt-in FP8 layerwise casting. 
Threaded from the + # request rather than the catalog so users can experiment + # without the catalog committing to fp8 readiness per repo. + fp8LayerwiseCasting=request.fp8LayerwiseCasting, ) ) created_at = datetime.utcnow().replace(microsecond=0).isoformat() + "Z" @@ -425,6 +465,21 @@ def _generate_video_artifact( request.steps, ) + # FU-019: variant-declared step / CFG defaults override schema + # defaults only when the user kept the schema defaults — explicit + # slider movement on the frontend is preserved untouched. The + # video schema default is steps=50 (see VideoGenerationRequest). + SCHEMA_DEFAULT_STEPS = 50 + SCHEMA_DEFAULT_GUIDANCE = 3.0 + effective_steps = request.steps + effective_guidance = request.guidance + variant_default_steps = variant.get("defaultSteps") + variant_cfg_override = variant.get("cfgOverride") + if variant_default_steps is not None and request.steps == SCHEMA_DEFAULT_STEPS: + effective_steps = int(variant_default_steps) + if variant_cfg_override is not None and abs(request.guidance - SCHEMA_DEFAULT_GUIDANCE) < 1e-3: + effective_guidance = float(variant_cfg_override) + video, runtime_status = runtime_manager.generate( VideoGenerationConfig( modelId=request.modelId, @@ -436,8 +491,8 @@ def _generate_video_artifact( height=request.height, numFrames=request.numFrames, fps=request.fps, - steps=request.steps, - guidance=request.guidance, + steps=effective_steps, + guidance=effective_guidance, seed=request.seed, ggufRepo=(variant.get("ggufRepo") or None), ggufFile=(variant.get("ggufFile") or None), @@ -447,6 +502,27 @@ def _generate_video_artifact( enableLtxRefiner=request.enableLtxRefiner, enhancePrompt=request.enhancePrompt, cfgDecay=request.cfgDecay, + stgScale=request.stgScale, + previewVae=request.previewVae, + # FU-019: variant-declared LoRA + override metadata. + loraRepo=(variant.get("loraRepo") or None), + loraFile=(variant.get("loraFile") or None), + loraScale=(variant.get("loraScale") if variant.get("loraScale") is not None else None), + defaultSteps=(variant.get("defaultSteps") if variant.get("defaultSteps") is not None else None), + cfgOverride=(variant.get("cfgOverride") if variant.get("cfgOverride") is not None else None), + # Phase 3 / Wan2.2-Distill 4-step: catalog-pinned distilled + # transformers replace both Wan A14B experts at pipeline load. + distillTransformerRepo=(variant.get("distillTransformerRepo") or None), + distillTransformerHighNoiseFile=(variant.get("distillTransformerHighNoiseFile") or None), + distillTransformerLowNoiseFile=(variant.get("distillTransformerLowNoiseFile") or None), + distillTransformerPrecision=(variant.get("distillTransformerPrecision") or None), + # FU-023 / FU-024: catalog-pinned Nunchaku snapshot + opt-in + # FP8 layerwise casting (CUDA-only). Same shape as the image + # side so a future video-Nunchaku release lands without app + # plumbing churn. + nunchakuRepo=(variant.get("nunchakuRepo") or None), + nunchakuFile=(variant.get("nunchakuFile") or None), + fp8LayerwiseCasting=request.fp8LayerwiseCasting, ) ) diff --git a/backend_service/catalog/capabilities.py b/backend_service/catalog/capabilities.py new file mode 100644 index 0000000..420d7ea --- /dev/null +++ b/backend_service/catalog/capabilities.py @@ -0,0 +1,201 @@ +"""Model capability resolver — Phase 2.11. + +Maps a loaded model's ref/canonical-repo to a typed capability blob the +UI can use to gate composer features (image attach hidden for text-only +models, tools toggle hidden for non-tool models, etc.) 
and to render +capability badges next to the model picker. + +The resolver consults the curated text-model catalog first (each +variant carries a `capabilities: [...]` string list); when no catalog +entry matches it falls back to ref-name heuristics so freshly downloaded +HF models without a catalog entry still get sensible defaults. + +Capabilities are intentionally conservative — when in doubt the +resolver omits the flag rather than promising support that may not +materialise. The frontend treats unknown capabilities as "hide the UI +affordance" so incorrectly omitting a flag degrades gracefully. +""" + +from __future__ import annotations + +from dataclasses import asdict, dataclass +from typing import Any + +from backend_service.catalog.text_models import MODEL_FAMILIES + + +@dataclass +class ModelCapabilities: + supportsVision: bool = False + supportsTools: bool = False + supportsReasoning: bool = False + supportsCoding: bool = False + supportsAgents: bool = False + supportsAudio: bool = False + supportsVideo: bool = False + supportsMultilingual: bool = False + # Free-form tags from the catalog (or heuristic fallback) preserved + # so the UI can render badges without re-deriving them. + tags: tuple[str, ...] = () + + def to_dict(self) -> dict[str, Any]: + out = asdict(self) + out["tags"] = list(self.tags) + return out + + +# Maps catalog capability strings to fields on ModelCapabilities. Strings +# the catalog uses freely ("multilingual", "thinking", etc.) get folded +# into the closest typed flag. +_CAPABILITY_TO_FLAG: dict[str, str] = { + "vision": "supportsVision", + "multimodal": "supportsVision", + "tool-use": "supportsTools", + "tools": "supportsTools", + "function-calling": "supportsTools", + "reasoning": "supportsReasoning", + "thinking": "supportsReasoning", + "coding": "supportsCoding", + "code": "supportsCoding", + "agents": "supportsAgents", + "agent": "supportsAgents", + "audio": "supportsAudio", + "video": "supportsVideo", + "multilingual": "supportsMultilingual", +} + + +def _normalise_ref(value: str | None) -> str: + return (value or "").strip().lower() + + +def _catalog_lookup(model_ref: str | None, canonical_repo: str | None) -> list[str] | None: + """Find the variant whose `id` or `repo` matches the loaded model. + + Falls back to family-level capabilities when no variant matches but + the family-level repo is a prefix of the loaded ref. This catches + community quantised forks (e.g. `mlx-community/Qwen3-Coder-Next-MLX-4bit`) + whose ref doesn't appear verbatim in the catalog. + """ + ref = _normalise_ref(model_ref) + canonical = _normalise_ref(canonical_repo) + if not ref and not canonical: + return None + + for family in MODEL_FAMILIES: + for variant in family.get("variants", []): + variant_id = _normalise_ref(variant.get("id")) + variant_repo = _normalise_ref(variant.get("repo")) + if ref and (ref == variant_id or ref == variant_repo): + caps = variant.get("capabilities") + if isinstance(caps, list): + return [str(c) for c in caps] + if canonical and (canonical == variant_id or canonical == variant_repo): + caps = variant.get("capabilities") + if isinstance(caps, list): + return [str(c) for c in caps] + + # Family-level fallback: match by ref or canonical containing the + # family id or any of its variant repos as a substring. 
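+    # Worked example of the fallback (ref and family id are illustrative):
+    # a loaded ref like "mlx-community/qwen3-coder-next-mlx-4bit" matches no
+    # variant id/repo verbatim, but if MODEL_FAMILIES carries a family with
+    # id "qwen3-coder", that id is a substring of the ref and the family's
+    # capability list is returned.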
+ for family in MODEL_FAMILIES: + family_caps = family.get("capabilities") + if not isinstance(family_caps, list): + continue + family_id = _normalise_ref(family.get("id")) + if not family_id: + continue + for needle in (ref, canonical): + if not needle: + continue + if family_id in needle: + return [str(c) for c in family_caps] + for variant in family.get("variants", []): + variant_repo = _normalise_ref(variant.get("repo")) + if variant_repo and variant_repo in needle: + return [str(c) for c in family_caps] + return None + + +def _heuristic_capabilities(model_ref: str | None) -> list[str]: + """Fallback when the catalog has no entry for the loaded model. + + Pure substring sniff against common repo conventions: vision models + typically include "vl" / "vision" / "llava" in the ref; coder models + include "coder" / "code"; reasoning models often advertise "r1" / + "reasoning" / "think". Conservative — only emit flags backed by a + well-established naming convention. + """ + if not model_ref: + return [] + lower = model_ref.lower() + out: list[str] = [] + if any(needle in lower for needle in ("-vl-", " vl ", "/vl-", "vision", "llava", "qwen-vl", "moondream")): + out.append("vision") + if any(needle in lower for needle in ("coder", "/code-", "starcoder", "deepseek-coder", "code-llama")): + out.append("coding") + if any(needle in lower for needle in ("r1", "reasoning", "think", "qwen3", "deepseek-r")): + out.append("reasoning") + if "tool" in lower or "function" in lower: + out.append("tool-use") + if "instruct" in lower or "-it" in lower or "chat" in lower: + # Instruction-tuned models almost always support chat-style tool + # prompts even when the catalog hasn't been updated. + if "tool-use" not in out: + out.append("tool-use") + return out + + +def resolve_capabilities( + model_ref: str | None, + canonical_repo: str | None = None, + engine: str | None = None, + vision_enabled: bool = False, +) -> ModelCapabilities: + """Public entry point — returns a typed capability blob for a model. + + Catalog match wins; heuristic fallback applies only when nothing in + the catalog matched. Always returns a valid `ModelCapabilities` (no + None) so callers don't need to null-check. + + `engine` (optional) gates capability flags by what the loaded + runtime can actually serve. The MLX worker subprocess never wired + vision input through — so even though Gemma-4 / Qwen-VL etc. + advertise vision in the catalog, the user gets silent base64-drop + if the route is MLX. Demote vision to False for those engines. + + `vision_enabled` is the runtime-side ground truth: True only when + the loaded model actually has an mmproj projector wired up. Until + that wiring lands the flag stays False on every load, so even the + llama.cpp path (which accepts image_url parts natively if mmproj + is configured) demotes vision until proven otherwise. Catalog + tags keep "vision" so the UI can still surface "supported once + mmproj loads" once the path is live. + """ + raw = _catalog_lookup(model_ref, canonical_repo) + if raw is None: + raw = _heuristic_capabilities(model_ref) + + caps = ModelCapabilities() + seen: set[str] = set() + for tag in raw: + normalised = tag.strip().lower() + if not normalised: + continue + seen.add(normalised) + flag = _CAPABILITY_TO_FLAG.get(normalised) + if flag is not None: + setattr(caps, flag, True) + caps.tags = tuple(sorted(seen)) + + # Engine-side reality check + runtime-side proof: strip vision + # unless the runtime explicitly says mmproj is loaded. 
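+    # ("mmproj" here = llama.cpp's multimodal projector weights; image input
+    # only works once that projector is loaded next to the text model.)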
Today no + # path sets this True so the typed flag is always False — exactly + # the right behaviour to prevent silent image drop. The MLX-engine + # demotion is kept as belt-and-braces for any caller that forgets + # to thread `vision_enabled` through. + engine_normalised = (engine or "").strip().lower() + if engine_normalised in {"mlx", "mlx_worker", "turboquant"}: + caps.supportsVision = False + if not vision_enabled: + caps.supportsVision = False + return caps diff --git a/backend_service/catalog/image_models.py b/backend_service/catalog/image_models.py index fce458b..7c56573 100644 --- a/backend_service/catalog/image_models.py +++ b/backend_service/catalog/image_models.py @@ -83,6 +83,34 @@ "estimatedGenerationSeconds": 2.4, "releaseDate": "2024-10", }, + { + # FU-008 image subset: sd.cpp engine routes via the + # ``sd`` binary built by ``./scripts/build-sdcpp.sh``. + # Cross-platform — Metal on Apple Silicon, CUDA on + # Linux/Windows. Pairs the city96 GGUF transformer with + # the binary's text-encoder + VAE handling so the user + # avoids the diffusers Python overhead entirely. + "id": "black-forest-labs/FLUX.1-schnell-sdcpp-q4km", + "familyId": "flux-fast", + "name": "FLUX.1 Schnell · sd.cpp Q4_K_M", + "provider": "Black Forest Labs · sd.cpp", + "repo": "black-forest-labs/FLUX.1-schnell", + "engine": "sdcpp", + "ggufRepo": "city96/FLUX.1-schnell-gguf", + "ggufFile": "flux1-schnell-Q4_K_M.gguf", + "link": "https://github.com/leejet/stable-diffusion.cpp", + "runtime": "stable-diffusion.cpp (subprocess)", + "styleTags": ["photoreal", "general", "fast", "gguf", "cross-platform"], + "taskSupport": ["txt2img"], + "sizeGb": 6.8, + "recommendedResolution": "1024x1024", + "note": ( + "Cross-platform GGUF runtime via sd.cpp subprocess. " + "Build the binary with ./scripts/build-sdcpp.sh first." + ), + "estimatedGenerationSeconds": 4.5, + "releaseDate": "2026-05", + }, ], }, { @@ -165,6 +193,28 @@ "estimatedGenerationSeconds": 7.8, "releaseDate": "2024-09", }, + { + "id": "black-forest-labs/FLUX.1-dev-sdcpp-q4km", + "familyId": "flux-dev", + "name": "FLUX.1 Dev · sd.cpp Q4_K_M", + "provider": "Black Forest Labs · sd.cpp", + "repo": "black-forest-labs/FLUX.1-dev", + "engine": "sdcpp", + "ggufRepo": "city96/FLUX.1-dev-gguf", + "ggufFile": "flux1-dev-Q4_K_M.gguf", + "link": "https://github.com/leejet/stable-diffusion.cpp", + "runtime": "stable-diffusion.cpp (subprocess)", + "styleTags": ["general", "detailed", "gguf", "cross-platform"], + "taskSupport": ["txt2img"], + "sizeGb": 7.2, + "recommendedResolution": "1024x1024", + "note": ( + "Cross-platform GGUF runtime via sd.cpp subprocess. " + "Build the binary with ./scripts/build-sdcpp.sh first." + ), + "estimatedGenerationSeconds": 6.0, + "releaseDate": "2026-05", + }, { "id": "black-forest-labs/FLUX.1-dev-mflux", "familyId": "flux-dev", @@ -182,6 +232,112 @@ "estimatedGenerationSeconds": 4.5, "releaseDate": "2024-10", }, + # FU-019 distill LoRAs. Drop FLUX.1-dev from 25-step base + # quality to 8-step quality. Stacks cleanly with NF4 + # (CUDA) / int8wo (MPS) / GGUF — the LoRA is loaded onto + # the already-quantized transformer at fuse time. CFG and + # step counts come from the LoRA author's recommended + # workflow. 
+ { + "id": "black-forest-labs/FLUX.1-dev-hyper-sd-8step", + "familyId": "flux-dev", + "name": "FLUX.1 Dev · Hyper-SD 8-step", + "provider": "Black Forest Labs · ByteDance", + "repo": "black-forest-labs/FLUX.1-dev", + "loraRepo": "ByteDance/Hyper-SD", + "loraFile": "Hyper-FLUX.1-dev-8steps-lora.safetensors", + "loraScale": 0.125, + "defaultSteps": 8, + "cfgOverride": 3.5, + "link": "https://huggingface.co/ByteDance/Hyper-SD", + "runtime": "diffusers + Hyper-SD LoRA", + "styleTags": ["general", "detailed", "fast", "lora"], + "taskSupport": ["txt2img"], + "sizeGb": 23.8, + "recommendedResolution": "1024x1024", + "note": ( + "8-step Hyper-SD distillation LoRA fused into FLUX.1 Dev. " + "Matches base FLUX.1 Dev 25-step quality at ≈3× speed. " + "Stacks with NF4/int8wo/GGUF." + ), + "estimatedGenerationSeconds": 2.4, + "releaseDate": "2024-10", + }, + { + "id": "black-forest-labs/FLUX.1-dev-turbo-alpha", + "familyId": "flux-dev", + "name": "FLUX.1 Dev · Turbo Alpha", + "provider": "Black Forest Labs · alimama-creative", + "repo": "black-forest-labs/FLUX.1-dev", + "loraRepo": "alimama-creative/FLUX.1-Turbo-Alpha", + "loraFile": "diffusion_pytorch_model.safetensors", + "loraScale": 1.0, + "defaultSteps": 8, + "cfgOverride": 3.5, + "link": "https://huggingface.co/alimama-creative/FLUX.1-Turbo-Alpha", + "runtime": "diffusers + FLUX.1-Turbo-Alpha LoRA", + "styleTags": ["general", "detailed", "fast", "lora"], + "taskSupport": ["txt2img"], + "sizeGb": 23.8, + "recommendedResolution": "1024x1024", + "note": ( + "alimama's 8-step Turbo Alpha LoRA fused into FLUX.1 Dev. " + "Same wall-time win as Hyper-SD with slightly different " + "stylistic bias — try both and pick by output." + ), + "estimatedGenerationSeconds": 2.4, + "releaseDate": "2025-02", + }, + # FU-023 Nunchaku SVDQuant — 4-bit precompiled INT4 weights. + # CUDA only (Ada/Hopper/Blackwell). ~3× over NF4 on FLUX.1-dev, + # quality near bf16. Variant pins the upstream MIT-Han-Lab + # snapshot; runtime falls back to the standard FLUX.1 Dev + # path when nunchaku is unavailable so MPS / CPU users see + # the same final image (just slower). + { + "id": "black-forest-labs/FLUX.1-dev-nunchaku-int4", + "familyId": "flux-dev", + "name": "FLUX.1 Dev · Nunchaku INT4 (CUDA)", + "provider": "Black Forest Labs · MIT-Han-Lab", + "repo": "black-forest-labs/FLUX.1-dev", + "nunchakuRepo": "mit-han-lab/svdq-int4-flux.1-dev", + "link": "https://huggingface.co/mit-han-lab/svdq-int4-flux.1-dev", + "runtime": "diffusers + nunchaku SVDQuant (CUDA)", + "styleTags": ["general", "detailed", "fast", "cuda", "int4"], + "taskSupport": ["txt2img"], + "sizeGb": 6.7, + "recommendedResolution": "1024x1024", + "note": ( + "Nunchaku SVDQuant INT4 — ~3× over NF4 on FLUX.1-dev, " + "quality near bf16. CUDA only (RTX 4070+ / 4090 / " + "Hopper / Blackwell). Falls back to bf16 / NF4 / int8wo " + "automatically on Apple Silicon and CPU." 
+ ), + "estimatedGenerationSeconds": 1.4, + "releaseDate": "2026-01", + }, + { + "id": "black-forest-labs/FLUX.1-schnell-nunchaku-int4", + "familyId": "flux-schnell", + "name": "FLUX.1 Schnell · Nunchaku INT4 (CUDA)", + "provider": "Black Forest Labs · MIT-Han-Lab", + "repo": "black-forest-labs/FLUX.1-schnell", + "nunchakuRepo": "mit-han-lab/svdq-int4-flux.1-schnell", + "defaultSteps": 4, + "cfgOverride": 0.0, + "link": "https://huggingface.co/mit-han-lab/svdq-int4-flux.1-schnell", + "runtime": "diffusers + nunchaku SVDQuant (CUDA)", + "styleTags": ["general", "fast", "cuda", "int4"], + "taskSupport": ["txt2img"], + "sizeGb": 6.7, + "recommendedResolution": "1024x1024", + "note": ( + "Nunchaku SVDQuant INT4 — sub-second 4-step gen on a " + "4090 with quality near the bf16 baseline. CUDA only." + ), + "estimatedGenerationSeconds": 0.7, + "releaseDate": "2026-01", + }, ], }, { @@ -364,6 +520,57 @@ "updatedLabel": "Tracked latest", "releaseDate": "2026-02", }, + { + # Apache 2.0 4B FLUX.2 — fixed 4-step inference, ~13 GB VRAM. + # Smallest FLUX.2 lane; first one suitable for catalog ship without + # gating. Pipeline class is ``Flux2KleinPipeline`` (new in diffusers + # 0.38+); existing PIPELINE_REGISTRY routing for FLUX.2 family + # covers the dispatch. + "repo": "black-forest-labs/FLUX.2-klein-4B", + "name": "FLUX.2 Klein 4B", + "provider": "Black Forest Labs", + "styleTags": ["general", "flux", "fast", "small"], + "taskSupport": ["txt2img", "img2img"], + "sizeGb": 14.5, + "runtimeFootprintGb": 13.0, + "runtimeFootprintMpsGb": 16.0, + "runtimeFootprintCpuGb": 22.0, + "coreWeightsGb": 14.5, + "repoSizeGb": 14.6, + "recommendedResolution": "1024x1024", + "note": ( + "Apache 2.0 4B FLUX.2 — fixed 4-step inference, sub-second " + "images on RTX 3090/4070+. Smaller and shippable cousin of " + "the 9B Klein variant." + ), + "gated": False, + "pipelineTag": "text-to-image", + "updatedLabel": "Tracked latest", + "releaseDate": "2026-01", + }, + { + "repo": "fal/FLUX.2-dev-Turbo", + "name": "FLUX.2 Dev · Turbo", + "provider": "Black Forest Labs · fal", + "styleTags": ["general", "fast", "flux"], + "taskSupport": ["txt2img", "img2img"], + "sizeGb": 49.5, + "runtimeFootprintGb": 50.0, + "runtimeFootprintMpsGb": 60.0, + "runtimeFootprintCpuGb": 70.0, + "coreWeightsGb": 49.5, + "repoSizeGb": 49.6, + "recommendedResolution": "1024x1024", + "note": ( + "fal's Turbo distillation of FLUX.2 Dev — 8-step Turbo Alpha " + "matches the base 25-step quality. Tracked for catalog refresh " + "(FU-019 catalog round)." + ), + "gated": False, + "pipelineTag": "text-to-image", + "updatedLabel": "Tracked latest", + "releaseDate": "2025-12", + }, { "repo": "Tongyi-MAI/Z-Image-Turbo", "name": "Z-Image-Turbo", @@ -436,6 +643,33 @@ "updatedLabel": "Tracked latest", "releaseDate": "2025-08", }, + { + # Dec 2025 refresh of Qwen-Image. Same QwenImagePipeline architecture + # (9-shard transformer, Qwen2.5-VL text encoder) and Apache 2.0 + # license as the base Qwen-Image entry above; weights tuned for + # stronger prompt adherence on multi-element scenes and CJK glyph + # rendering. Uses Qwen's YYMM dated-release convention (cf. + # Qwen-Image-Edit-2511 / -2509). 
+ "repo": "Qwen/Qwen-Image-2512", + "name": "Qwen-Image (Dec 2025)", + "provider": "Qwen", + "styleTags": ["general", "detailed", "qwenimage", "refreshed"], + "taskSupport": ["txt2img"], + "sizeGb": 57.7, + "runtimeFootprintGb": 58.0, + "runtimeFootprintMpsGb": 72.0, + "runtimeFootprintCpuGb": 72.0, + "recommendedResolution": "1024x1024", + "note": ( + "December 2025 Qwen-Image refresh with stronger prompt " + "adherence and improved CJK rendering. Apache 2.0; same " + "QwenImagePipeline as base Qwen-Image." + ), + "gated": False, + "pipelineTag": "text-to-image", + "updatedLabel": "Tracked latest", + "releaseDate": "2025-12", + }, { "repo": "Qwen/Qwen-Image-Edit", "name": "Qwen-Image-Edit", diff --git a/backend_service/catalog/video_models.py b/backend_service/catalog/video_models.py index 9fd6773..a4c510b 100644 --- a/backend_service/catalog/video_models.py +++ b/backend_service/catalog/video_models.py @@ -137,7 +137,10 @@ "recommendedResolution": "768x512", "defaultDurationSeconds": 4.0, "note": "Distilled LTX-2 — fastest MLX path for previews. Use the dev variant for final fidelity.", - "estimatedGenerationSeconds": 60.0, + # Distilled is 8 + 3 fixed sampler passes with CFG off; STG is + # ignored. Real-world wall time on M4 Max at 768×512 / 4 s + # lands around 90 s including model load. + "estimatedGenerationSeconds": 90.0, "availableLocally": False, "releaseDate": "2026-01", }, @@ -156,7 +159,14 @@ "recommendedResolution": "768x512", "defaultDurationSeconds": 4.0, "note": "Full LTX-2 dev weights — higher fidelity, longer sampling than distilled.", - "estimatedGenerationSeconds": 180.0, + # Dev runs single-stage CFG sampling; with STG=1.0 (default) + # that's 3 forward passes per step. ~600 s for a 4-s clip at + # 30 steps on M4 Max. Drops to ~400 s with STG=0.0. + "estimatedGenerationSeconds": 600.0, + # Fast-preview swap target — Studio toggle renders the + # distilled sibling instead so the user gets a quick draft + # of the same prompt + seed in ~1/6 of the time. + "fastPreviewSiblingId": "prince-canuma/LTX-2-distilled", "availableLocally": False, "releaseDate": "2026-01", }, @@ -176,7 +186,10 @@ "recommendedResolution": "768x512", "defaultDurationSeconds": 4.0, "note": "LTX-2.3 distilled — refreshed fast preview path with sharper texture detail vs LTX-2. Use the dev variant for final fidelity.", - "estimatedGenerationSeconds": 60.0, + # Same fixed 8 + 3 sampler shape as LTX-2 distilled with the + # 2.3 weight refresh; wall time tracks the LTX-2 distilled + # entry within measurement noise. + "estimatedGenerationSeconds": 100.0, "availableLocally": False, "releaseDate": "2026-03", }, @@ -196,7 +209,12 @@ "recommendedResolution": "768x512", "defaultDurationSeconds": 4.0, "note": "LTX-2.3 dev — quality tier; full sampler steps for best output. Apple Silicon native via MLX. Install mlx-video from Setup → GPU runtime bundle to enable.", - "estimatedGenerationSeconds": 180.0, + # Dev pipeline + CFG + STG=1.0 = 3 forward passes per step; + # observed wall time on M4 Max for a 4-s / 30-step / 768×512 + # render is ~600 s. Drops to ~400 s with STG=0.0. Old 180 s + # estimate predated STG and the dev pipeline-mode change. + "estimatedGenerationSeconds": 600.0, + "fastPreviewSiblingId": "prince-canuma/LTX-2.3-distilled", "availableLocally": False, "releaseDate": "2026-03", }, @@ -398,6 +416,68 @@ "availableLocally": False, "releaseDate": "2025-03", }, + # FU-019 distill LoRAs. lightx2v's CausVid LoRAs collapse + # the 30-step base schedule to 4 steps, CFG-free. 
Wall-time + # win is ~7-8× before any caching strategy stacks on top. + # Keep the full-fat Wan 2.1 1.3B / 14B variants above for + # users who want the un-distilled quality ceiling. + { + "id": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers-causvid", + "familyId": "wan-2-1", + "name": "Wan 2.1 T2V 1.3B · CausVid (4-step)", + "provider": "Alibaba · lightx2v", + "repo": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", + "loraRepo": "lightx2v/Wan2.1-T2V-1.3B-CausVid-LoRA", + "loraFile": "wan21_t2v_1.3b_causvid_lora.safetensors", + "loraScale": 1.0, + "defaultSteps": 4, + "cfgOverride": 1.0, + "link": "https://huggingface.co/lightx2v/Wan2.1-T2V-1.3B-CausVid-LoRA", + "runtime": "diffusers WanPipeline + CausVid LoRA", + "styleTags": ["general", "fast", "small", "lora"], + "taskSupport": ["txt2video"], + "sizeGb": 16.4, + "runtimeFootprintGb": 14.0, + "runtimeFootprintMpsGb": 23.0, + "recommendedResolution": "832x480", + "defaultDurationSeconds": 4.0, + "note": ( + "lightx2v CausVid distillation LoRA fused into Wan 2.1 1.3B. " + "Runs at 4 steps, CFG-free — roughly 7-8× faster than the " + "base 30-step schedule on the same hardware." + ), + "estimatedGenerationSeconds": 9.0, + "availableLocally": False, + "releaseDate": "2025-04", + }, + { + "id": "Wan-AI/Wan2.1-T2V-14B-Diffusers-causvid", + "familyId": "wan-2-1", + "name": "Wan 2.1 T2V 14B · CausVid (4-step)", + "provider": "Alibaba · lightx2v", + "repo": "Wan-AI/Wan2.1-T2V-14B-Diffusers", + "loraRepo": "lightx2v/Wan2.1-T2V-14B-CausVid-LoRA", + "loraFile": "wan21_t2v_14b_causvid_lora.safetensors", + "loraScale": 1.0, + "defaultSteps": 4, + "cfgOverride": 1.0, + "link": "https://huggingface.co/lightx2v/Wan2.1-T2V-14B-CausVid-LoRA", + "runtime": "diffusers WanPipeline + CausVid LoRA", + "styleTags": ["general", "quality", "motion", "lora"], + "taskSupport": ["txt2video"], + "sizeGb": 45.0, + "runtimeFootprintGb": 39.0, + "recommendedResolution": "832x480", + "defaultDurationSeconds": 5.0, + "note": ( + "lightx2v CausVid distillation LoRA fused into Wan 2.1 14B. " + "Runs at 4 steps, CFG-free — quality holds close to the base " + "30-step Wan 2.1 14B at a fraction of the wall time." + ), + "estimatedGenerationSeconds": 24.0, + "availableLocally": False, + "releaseDate": "2025-04", + }, ], }, { @@ -557,6 +637,83 @@ "availableLocally": False, "releaseDate": "2025-07", }, + # Phase 3 / Wan2.2-Distill 4-step (lightx2v): drops the A14B + # I2V schedule from ~30 to 4 steps with CFG-free sampling. The + # base repo is ``Wan-AI/Wan2.2-I2V-A14B-Diffusers`` (text + # encoder + VAE come from there); the runtime swaps both + # transformer experts (``transformer`` high-noise + + # ``transformer_2`` low-noise) for the lightx2v distilled + # safetensors. ``defaultSteps=4`` + ``cfgOverride=1.0`` + # substitute the schema defaults so users running the + # default sliders pick up the distill schedule automatically. 
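The distilled entries above lean on two pieces of catalog plumbing: per-variant defaults that substitute the schema defaults (defaultSteps, cfgOverride) and the fastPreviewSiblingId pointer on the LTX-2 dev entries. A minimal sketch of that resolution follows, using hypothetical helper names that do not exist in this codebase; the Wan 2.2 distill entry introduced by the comment above continues immediately after this aside.

```python
# Hypothetical helpers for illustration; the shipped resolution logic lives
# elsewhere in the runtime and is not part of this diff.
from typing import Any


def resolve_generation_kwargs(
    variant: dict[str, Any],
    requested_steps: int | None = None,
    requested_cfg: float | None = None,
) -> dict[str, Any]:
    # Explicit request values win; otherwise defaultSteps / cfgOverride
    # substitute the schema defaults (assumed here to be 30 steps / CFG 5.0),
    # so a user on untouched sliders picks up the 4-step CFG-free schedule.
    steps = requested_steps if requested_steps is not None else variant.get("defaultSteps", 30)
    cfg = requested_cfg if requested_cfg is not None else variant.get("cfgOverride", 5.0)
    return {"num_inference_steps": steps, "guidance_scale": cfg}


def resolve_fast_preview(variant: dict[str, Any], catalog: list[dict[str, Any]]) -> dict[str, Any]:
    # Follow fastPreviewSiblingId to the distilled sibling (same prompt and
    # seed at a fraction of the wall time); fall back to the variant itself.
    sibling_id = variant.get("fastPreviewSiblingId")
    if not sibling_id:
        return variant
    return next((v for v in catalog if v.get("id") == sibling_id), variant)
```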
+ { + "id": "Wan-AI/Wan2.2-I2V-A14B-Diffusers-distill-bf16", + "familyId": "wan-2-2", + "name": "Wan 2.2 I2V A14B · Distill 4-step (BF16)", + "provider": "Alibaba · lightx2v", + "repo": "Wan-AI/Wan2.2-I2V-A14B-Diffusers", + "distillTransformerRepo": "lightx2v/Wan2.2-Distill-Models", + "distillTransformerHighNoiseFile": "wan2.2_i2v_A14b_high_noise_lightx2v_4step.safetensors", + "distillTransformerLowNoiseFile": "wan2.2_i2v_A14b_low_noise_lightx2v_4step.safetensors", + "distillTransformerPrecision": "bf16", + "defaultSteps": 4, + "cfgOverride": 1.0, + "link": "https://huggingface.co/lightx2v/Wan2.2-Distill-Models", + "runtime": "diffusers WanPipeline + lightx2v distill (bf16)", + "styleTags": ["i2v", "general", "fast", "motion", "distill"], + "taskSupport": ["img2video"], + "sizeGb": 56.0, + # Both BF16 distilled experts (~28 GB each) plus UMT5-XXL + # text encoder + VAE from base repo. MoE offload required + # on hosts under ~60 GB unified memory. + "runtimeFootprintGb": 30.0, + "runtimeFootprintMpsGb": 36.0, + "recommendedResolution": "832x480", + "defaultDurationSeconds": 5.0, + "note": ( + "lightx2v 4-step distillation of Wan 2.2 A14B I2V " + "(BF16). Replaces both MoE transformer experts; runs " + "at 4 steps, CFG-free. Quality holds close to the " + "30-step base at ~7-8x faster wall-time." + ), + "estimatedGenerationSeconds": 40.0, + "availableLocally": False, + "releaseDate": "2026-04", + }, + { + "id": "Wan-AI/Wan2.2-I2V-A14B-Diffusers-distill-fp8", + "familyId": "wan-2-2", + "name": "Wan 2.2 I2V A14B · Distill 4-step (FP8)", + "provider": "Alibaba · lightx2v", + "repo": "Wan-AI/Wan2.2-I2V-A14B-Diffusers", + "distillTransformerRepo": "lightx2v/Wan2.2-Distill-Models", + "distillTransformerHighNoiseFile": "wan2.2_i2v_A14b_high_noise_scaled_fp8_e4m3_lightx2v_4step.safetensors", + "distillTransformerLowNoiseFile": "wan2.2_i2v_A14b_low_noise_scaled_fp8_e4m3_lightx2v_4step.safetensors", + "distillTransformerPrecision": "fp8_e4m3", + "defaultSteps": 4, + "cfgOverride": 1.0, + "link": "https://huggingface.co/lightx2v/Wan2.2-Distill-Models", + "runtime": "diffusers WanPipeline + lightx2v distill (FP8 E4M3)", + "styleTags": ["i2v", "general", "fast", "motion", "distill", "fp8"], + "taskSupport": ["img2video"], + "sizeGb": 28.0, + # FP8 distilled experts (~14 GB each) plus UMT5-XXL. + # CUDA SM 8.9+ (Hopper / Ada) loads natively; older + # CUDA + MPS dequant to bf16 at load (~28 GB resident). + "runtimeFootprintGb": 18.0, + "runtimeFootprintMpsGb": 30.0, + "recommendedResolution": "832x480", + "defaultDurationSeconds": 5.0, + "note": ( + "lightx2v 4-step Wan 2.2 A14B I2V distill in FP8 E4M3. " + "Best on CUDA SM 8.9+ (RTX 4090 / Hopper) for native " + "FP8 ops; older hardware dequants to bf16 at load and " + "loses the memory saving but keeps the 4-step speedup." + ), + "estimatedGenerationSeconds": 32.0, + "availableLocally": False, + "releaseDate": "2026-04", + }, ], }, { @@ -652,15 +809,23 @@ "runtime": "diffusers CogVideoXPipeline", "styleTags": ["general", "fast", "small"], "taskSupport": ["txt2video"], - # 2B transformer in fp16 (~4 GB) + T5 text encoder (~5 GB) + - # VAE. Fits comfortably on a 12 GB card; 8 GB works with - # CPU-offload tricks. Smaller than Wan 2.1 1.3B because there's - # no UMT5-XXL — just the standard T5. + # 2B transformer in bf16 (~4 GB) + T5-XXL text encoder + # (~5 GB bf16) + VAE (~250 MB). Real-world bf16 + standard + # placement: ~13 GB resident peak on CUDA, ~15 GB on MPS + # because of allocator overhead. 
The runtime auto-engages + # enable_sequential_cpu_offload() if .to(device) OOMs, so + # 8-12 GB cards still work via the offload path -- the + # peak just shifts to ~5-7 GB at the cost of slower steps. + # Earlier 19 GB number was the worst-case fp32 figure and + # was tripping "would crash" on 24 GB 4090s, blocking a + # config that runs comfortably. "sizeGb": 9.0, - "runtimeFootprintGb": 19.0, + "runtimeFootprintGb": 13.0, + "runtimeFootprintCudaGb": 13.0, + "runtimeFootprintMpsGb": 15.0, "recommendedResolution": "720x480", "defaultDurationSeconds": 6.0, - "note": "Smallest CogVideoX. Apache 2.0 weights, ~9 GB on disk; runtime peak is closer to 19 GB without the most aggressive offload/tiling.", + "note": "Smallest CogVideoX. Apache 2.0 weights, ~9 GB on disk; bf16 peak is ~13 GB on CUDA / ~15 GB on MPS. Runtime auto-engages sequential CPU offload on smaller GPUs (~5-7 GB peak, slower).", "estimatedGenerationSeconds": 90.0, "availableLocally": False, "releaseDate": "2024-08", @@ -675,18 +840,58 @@ "runtime": "diffusers CogVideoXPipeline", "styleTags": ["general", "quality", "balanced"], "taskSupport": ["txt2video"], - # 5B transformer (~10 GB) + T5 (~5 GB) + VAE. Lands in the - # same envelope as Wan 2.2 — needs 24 GB VRAM or 32 GB+ - # unified memory. + # 5B transformer bf16 (~10 GB) + T5-XXL bf16 (~5 GB) + + # VAE (~250 MB). Real-world bf16 + standard placement on + # CUDA: ~18 GB resident peak; on MPS allocator overhead + # pushes it closer to ~22 GB. Earlier 33 GB number was the + # fp32 + duplicate-text-encoder worst case and was blocking + # 24 GB CUDA cards from a config that fits. "sizeGb": 18.0, - "runtimeFootprintGb": 33.0, + "runtimeFootprintGb": 18.0, + "runtimeFootprintCudaGb": 18.0, + "runtimeFootprintMpsGb": 22.0, "recommendedResolution": "720x480", "defaultDurationSeconds": 6.0, - "note": "Quality tier. ~18 GB on disk; budget for a 32 GB-class runtime envelope unless aggressive offload is enabled.", + "note": "Quality tier. ~18 GB on disk; bf16 peak is ~18 GB on CUDA / ~22 GB on MPS. Sequential CPU offload kicks in automatically on smaller GPUs.", "estimatedGenerationSeconds": 200.0, "availableLocally": False, "releaseDate": "2024-08", }, + # FU-019 catalog refresh: CogVideoX 1.5 5B. Same architecture + # as 5B, refreshed weights with stronger prompt adherence and + # higher-resolution training (1360×768). Routed via the same + # CogVideoXPipeline class, so PIPELINE_REGISTRY only needs the + # repo id added. + { + "id": "THUDM/CogVideoX-1.5-5b", + "familyId": "cogvideox", + "name": "CogVideoX 1.5 · 5B", + "provider": "THUDM", + "repo": "THUDM/CogVideoX-1.5-5b", + "link": "https://huggingface.co/THUDM/CogVideoX-1.5-5b", + "runtime": "diffusers CogVideoXPipeline", + "styleTags": ["general", "quality", "balanced", "refreshed"], + "taskSupport": ["txt2video"], + # Same architecture as CogVideoX-5b at higher training + # resolution. bf16 peak ~19 GB on CUDA / ~23 GB on MPS; + # the extra GB over 5B is the larger latent at 1360x768. + # Earlier 34 GB number was the worst case and tripped a + # spurious "would crash" on 24 GB CUDA cards. + "sizeGb": 18.5, + "runtimeFootprintGb": 19.0, + "runtimeFootprintCudaGb": 19.0, + "runtimeFootprintMpsGb": 23.0, + "recommendedResolution": "1360x768", + "defaultDurationSeconds": 5.0, + "note": ( + "Refreshed CogVideoX 1.5 5B with stronger prompt " + "adherence at 1360×768. bf16 peak ~19 GB on CUDA / " + "~23 GB on MPS; same CogVideoXPipeline as 5B." 
+ ), + "estimatedGenerationSeconds": 220.0, + "availableLocally": False, + "releaseDate": "2024-11", + }, ], }, { diff --git a/backend_service/ddtree.py b/backend_service/ddtree.py index 9e0507a..bfbcae5 100644 --- a/backend_service/ddtree.py +++ b/backend_service/ddtree.py @@ -273,22 +273,53 @@ def generate_ddtree_mlx( Falls back to linear DFlash when tree_budget <= 0. """ import mlx.core as mx + # dflash-mlx 0.1.5+ moved every primitive consumed below off the + # ``runtime`` module top-level onto a per-family ``target_ops`` + # adapter (Qwen3.5/3.6 / Llama-4 / Phi-4 / DeepSeek-V3). One adapter + # instance carries every per-architecture entry point we need — + # forward+capture, embed, text_model, lm_head, make_cache, + # extract_context_feature. ``ContextOnlyDraftKVCache`` moved to + # ``dflash_mlx.model``; ``create_attention_mask`` is upstream + # mlx-lm. ``trim_cache_to`` was removed entirely — the replacement + # is a thin local helper that calls each entry's own ``.trim()`` / + # ``.rollback()`` / ``.crop()`` based on what the cache type + # exposes. from dflash_mlx.runtime import ( - target_forward_with_hidden_states, - extract_context_feature_from_dict, - make_target_cache, - ContextOnlyDraftKVCache, greedy_tokens_with_mask, build_suppress_token_mask, - trim_cache_to, + resolve_target_ops, ) + from dflash_mlx.model import ContextOnlyDraftKVCache + from mlx_lm.models.base import create_attention_mask - # Private helpers from dflash_mlx - from dflash_mlx.runtime import ( - _target_embed_tokens, - _lm_head_logits, - _target_text_model, - ) + target_ops = resolve_target_ops(target_model) + + def _trim_cache_to(cache_entries: list[Any], target_len: int) -> None: + """Local replacement for the dropped ``dflash_mlx.runtime.trim_cache_to``. + + Mirrors the trim half of ``target_ops.restore_after_acceptance`` + — for every entry that exposes ``trim`` / ``crop`` / ``offset``, + roll the entry's effective length back to ``target_len``. 
+ """ + for entry in cache_entries: + if entry is None: + continue + if hasattr(entry, "rollback"): + offset = int(getattr(entry, "offset", 0) or 0) + if offset > target_len: + entry.rollback(offset - target_len) + elif hasattr(entry, "trim"): + offset = int(getattr(entry, "offset", 0) or 0) + if offset > target_len: + entry.trim(offset - target_len) + elif hasattr(entry, "offset"): + offset = int(getattr(entry, "offset", 0) or 0) + if offset > target_len: + entry.offset = target_len + elif hasattr(entry, "crop"): + entry.crop(target_len) + + trim_cache_to = _trim_cache_to prompt_len = len(prompt_tokens) prompt_array = mx.array(prompt_tokens, dtype=mx.uint32)[None] @@ -300,7 +331,7 @@ def generate_ddtree_mlx( effective_budget = max(0, min(tree_budget, 64)) # Caches - target_cache = make_target_cache(target_model, enable_speculative_linear_cache=False) + target_cache = target_ops.make_cache(target_model, enable_speculative_linear_cache=False) draft_cache = [ ContextOnlyDraftKVCache(sink_size=0, window_size=0) for _ in range(len(draft_model.layers)) @@ -314,7 +345,7 @@ def generate_ddtree_mlx( # ── Prefill ────────────────────────────────────────────── t_start = time.perf_counter() - prefill_logits, prefill_hidden = target_forward_with_hidden_states( + prefill_logits, prefill_hidden = target_ops.forward_with_hidden_capture( target_model, input_ids=prompt_array, cache=target_cache, capture_layer_ids=capture_layer_ids, ) @@ -325,19 +356,24 @@ def generate_ddtree_mlx( mx.eval(*prefill_hidden) first_token = greedy_tokens_with_mask(prefill_logits[:, -1, :], suppress_mask).reshape(-1) - target_hidden = extract_context_feature_from_dict( + target_hidden = target_ops.extract_context_feature( prefill_hidden, list(draft_model.target_layer_ids), ) mx.eval(first_token, target_hidden) generated_tokens: list[int] = [int(first_token.item())] + # Phase 3.1 follow-up: track per-token accepted-from-draft bools so + # the AcceptedTokenOverlay can tint draft-accepted spans for the + # DDTree path the same way it does for linear DFLASH. The first + # token is the prefill posterior (verifier-decoded), so it's False. 
+ per_token_accepted: list[bool] = [False] start = prompt_len cycles = 0 accepted_from_draft = 0 acceptance_history: list[int] = [] - embed_fn = _target_embed_tokens(target_model) - inner = _target_text_model(target_model) + embed_fn = target_ops.embed_tokens(target_model) + inner = target_ops.text_model(target_model) # ── Decode loop ────────────────────────────────────────── while len(generated_tokens) < max_new_tokens: @@ -357,7 +393,7 @@ def generate_ddtree_mlx( target_hidden=target_hidden, cache=draft_cache, ) - draft_logits = _lm_head_logits(target_model, draft_hidden[:, 1:, :]) + draft_logits = target_ops.logits_from_hidden(target_model, draft_hidden[:, 1:, :]) mx.eval(draft_logits) else: draft_logits = None @@ -372,7 +408,7 @@ def generate_ddtree_mlx( block_ids_np[1:block_len] = np.array(drafted.tolist(), dtype=np.int32)[:block_len - 1] block_ids = mx.array(block_ids_np, dtype=mx.uint32)[None] - verify_logits, verify_hidden = target_forward_with_hidden_states( + verify_logits, verify_hidden = target_ops.forward_with_hidden_capture( target_model, input_ids=block_ids[:, :block_len], cache=target_cache, capture_layer_ids=capture_layer_ids, ) @@ -395,11 +431,19 @@ def generate_ddtree_mlx( committed.append(next_tok) generated_tokens.extend(committed) + # Per-token accepted bools: first `acceptance_len` are + # draft-accepted; final one is the verifier's posterior + # decode for the position the draft got wrong (or the + # natural next token when the whole draft block was + # accepted). + for _ in range(acceptance_len): + per_token_accepted.append(True) + per_token_accepted.append(False) accepted_from_draft += acceptance_len acceptance_history.append(acceptance_len) start += commit_count - committed_hidden = extract_context_feature_from_dict( + committed_hidden = target_ops.extract_context_feature( verify_hidden, list(draft_model.target_layer_ids), )[:, :commit_count, :] mx.eval(committed_hidden) @@ -439,8 +483,9 @@ def generate_ddtree_mlx( if 0 in capture_layer_ids: captured_hidden[0] = h - # Get the cache's current prefix length for mask construction - from dflash_mlx.runtime import create_attention_mask + # Get the cache's current prefix length for mask construction. + # ``create_attention_mask`` lives in mlx_lm upstream (dflash-mlx + # 0.1.5 dropped the runtime re-export). causal_mask = create_attention_mask(h, target_cache[0] if target_cache else None) # Replace the tree portion of the causal mask with our tree mask @@ -490,6 +535,12 @@ def generate_ddtree_mlx( committed = [tree_ids_list[idx] for idx in accepted_indices[1:]] # skip root committed.append(next_tok) generated_tokens.extend(committed) + # Per-token accepted bools — same shape as the linear path: + # `acceptance_len` tokens came from the draft tree (True), + # the final next_tok is verifier-decoded (False). 
+ for _ in range(acceptance_len): + per_token_accepted.append(True) + per_token_accepted.append(False) start += len(accepted_indices) # Compact cache: keep only accepted nodes @@ -497,7 +548,7 @@ def generate_ddtree_mlx( # Extract hidden states for accepted nodes accepted_mx = mx.array(accepted_indices, dtype=mx.int32) - committed_hidden = extract_context_feature_from_dict( + committed_hidden = target_ops.extract_context_feature( captured_hidden, list(draft_model.target_layer_ids), ) committed_hidden = mx.take(committed_hidden, accepted_mx, axis=1) @@ -514,6 +565,10 @@ def generate_ddtree_mlx( for si, st in enumerate(generated_tokens): if st in stop_set: generated_tokens = generated_tokens[:si + 1] + # Phase 3.1 follow-up: keep per_token_accepted + # length aligned with generated_tokens after + # stop-token truncation. + per_token_accepted = per_token_accepted[:si + 1] break break @@ -524,6 +579,51 @@ def generate_ddtree_mlx( output_tokens = len(generated_tokens) avg_acceptance = float(np.mean(acceptance_history)) if acceptance_history else 0.0 + # Phase 3.1 follow-up: per-token text decode + run-length encode + # the accepted bools into character spans so the frontend overlay + # can tint draft-accepted ranges. Defensive try/except — token + # decoders sometimes fail on rare ids; we fall through to no + # overlay rather than crashing the turn. + accepted_spans: list[dict[str, Any]] = [] + accepted_token_text: str | None = None + try: + if generated_tokens and per_token_accepted: + # Defensive align — slice both to the same length in case + # truncation paths drift. + limit = min(len(generated_tokens), len(per_token_accepted)) + tokens = generated_tokens[:limit] + accepted = per_token_accepted[:limit] + per_token_text: list[str] = [] + for tok_id in tokens: + try: + per_token_text.append(tokenizer.decode([int(tok_id)])) + except Exception: + per_token_text.append("") + accepted_token_text = "".join(per_token_text) + offset = 0 + run_start = 0 + run_kind = accepted[0] if accepted else False + for idx, is_accepted in enumerate(accepted): + tok_text = per_token_text[idx] + if is_accepted != run_kind: + accepted_spans.append({ + "start": run_start, + "length": offset - run_start, + "accepted": run_kind, + }) + run_start = offset + run_kind = is_accepted + offset += len(tok_text) + if accepted: + accepted_spans.append({ + "start": run_start, + "length": offset - run_start, + "accepted": run_kind, + }) + except Exception: + accepted_spans = [] + accepted_token_text = None + return { "generated_tokens": generated_tokens, "output_tokens": output_tokens, @@ -532,4 +632,6 @@ def generate_ddtree_mlx( "accepted_from_draft": accepted_from_draft, "avg_acceptance_length": avg_acceptance, "tree_budget": effective_budget, + "accepted_spans": accepted_spans, + "accepted_token_text": accepted_token_text, } diff --git a/backend_service/helpers/attention_backend.py b/backend_service/helpers/attention_backend.py new file mode 100644 index 0000000..0059ded --- /dev/null +++ b/backend_service/helpers/attention_backend.py @@ -0,0 +1,75 @@ +"""Attention-backend selection for diffusers DiT pipelines. + +FU-016. Diffusers 0.36+ exposes ``transformer.set_attention_backend(...)`` +for picking between PyTorch SDPA, FlashAttention 2/3, xformers and +SageAttention. SageAttention 2/2++ (thu-ml) is an INT8 (Ampere+) / +FP8 (Hopper) attention kernel that drops attention wall time 2-3× and +end-to-end DiT latency 1.3-1.6× on FLUX/Wan/Hunyuan/CogVideoX with no +documented quality regression. 
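A hypothetical call-site sketch for the helper this module defines; the real image/video runtime wiring is not part of this diff, so the function and variable names here are placeholders. The module docstring's platform-gate details continue right after this aside.

```python
from backend_service.helpers.attention_backend import maybe_apply_sage_attention


def apply_optional_backends(pipeline) -> list[str]:
    # Placeholder call site: collect runtime notes for the per-image /
    # per-video runtimeNote slot mentioned in the helper's docstring.
    runtime_notes: list[str] = []
    note = maybe_apply_sage_attention(pipeline)  # None on MPS / CPU or when the wheel is absent
    if note:
        runtime_notes.append(note)  # e.g. "Attention: SageAttention"
    return runtime_notes
```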
+ +Platform gate: +- CUDA only (no MPS / Metal port as of May 2026). +- Requires the ``sageattention`` pip wheel (``pip install sageattention``) + AND a diffusers ≥0.36 build that exposes ``set_attention_backend``. +- Skipped silently on macOS / CPU / unsupported pipelines so the call + site can stay platform-neutral. + +Stacks multiplicatively with First Block Cache (FU-015) — community +benchmarks (Wan2.1 720P I2V) report cumulative ~54% wall-time reduction +when SageAttention + FBCache are combined. + +Reference: https://github.com/thu-ml/SageAttention +""" + +from __future__ import annotations + +import importlib.util +from typing import Any + + +def maybe_apply_sage_attention(pipeline: Any) -> str | None: + """Switch ``pipeline.transformer`` to the SageAttention backend if available. + + Returns a short note for the per-image / per-video runtimeNote slot + (e.g. ``"Attention: SageAttention"``) when the swap succeeded, or + ``None`` when the backend isn't available, the device isn't CUDA, + or the pipeline shape doesn't expose ``set_attention_backend``. + + Failure modes (import error, kernel mismatch on a non-SM80+ GPU, + incompatible diffusers version) all return ``None`` so the caller + can keep the stock SDPA path. The only thing that propagates is a + bug in this helper itself. + """ + # 1. CUDA gate. SageAttention has no MPS / Metal port; calling + # ``set_attention_backend("sage")`` on a non-CUDA pipeline raises. + try: + import torch # type: ignore + except Exception: + return None + try: + cuda_available = bool(torch.cuda.is_available()) + except Exception: + cuda_available = False + if not cuda_available: + return None + + # 2. SageAttention package gate. Importable means the pip wheel + # matched the user's CUDA + Python combo at install time. + if importlib.util.find_spec("sageattention") is None: + return None + + # 3. Pipeline shape gate. Must be a DiT pipeline with a transformer + # that exposes the diffusers ≥0.36 attention-backend selector. + transformer = getattr(pipeline, "transformer", None) + if transformer is None: + return None + set_backend = getattr(transformer, "set_attention_backend", None) + if not callable(set_backend): + return None + + try: + set_backend("sage") + except Exception as exc: # noqa: BLE001 — keep stock SDPA on any failure + return f"SageAttention unavailable ({type(exc).__name__})" + + return "Attention: SageAttention" diff --git a/backend_service/helpers/chat_template.py b/backend_service/helpers/chat_template.py new file mode 100644 index 0000000..75fc462 --- /dev/null +++ b/backend_service/helpers/chat_template.py @@ -0,0 +1,199 @@ +"""Phase 3.8: chat-template inspection + auto-fix detection. + +Reasoning models and their tokenisers ship a `chat_template` Jinja +fragment that the runtime calls via `apply_chat_template` to format +multi-turn history. The template encodes: + +- Where role markers go (`<|im_start|>`, ``, etc.) +- Whether system messages are supported +- Whether the tokeniser accepts `add_generation_prompt` so the + rendered prompt ends with an assistant-side prefix the model + treats as "your turn now" + +Gemma-family models (Gemma-1 through Gemma-4) reject system role +entirely; ChatML-derived templates sometimes ship without +`add_generation_prompt` handling and produce truncated last-user +turns; a handful of GGUF community quants pin a stale chat template +that doesn't match the model's actual training format. 
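A hypothetical load-time wiring sketch using only the functions this module defines; the actual mlx_worker / inference.py call sites are not shown in this diff. The module docstring continues right after this aside.

```python
from backend_service.helpers.chat_template import (
    fold_system_into_first_user,
    inspect_chat_template,
)


def prepare_messages(tokenizer, model_ref: str, messages: list[dict]) -> tuple[list[dict], str | None]:
    # Placeholder call site: inspect once at load time, apply the Gemma
    # system-folding fix when the report asks for it, and hand the one-line
    # note (or None) to whatever surfaces runtime_note in the UI.
    report = inspect_chat_template(getattr(tokenizer, "chat_template", None), model_ref)
    if not report.accepts_system_role:
        messages = fold_system_into_first_user(messages)
    return messages, report.to_runtime_note()
```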
+ +This helper inspects a tokeniser at load time, returns a structured +report of detected issues and fixes the runtime can apply, and gives +the rest of the codebase a single place to encode "we know about +this template quirk". +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + + +@dataclass +class ChatTemplateReport: + """Outcome of inspecting a tokeniser's chat-template support. + + `issues` lists detected problems; `fixes_applied` lists the + workarounds the runtime can transparently apply (no user action + needed). When both are empty, the template is healthy. + """ + issues: list[str] = field(default_factory=list) + fixes_applied: list[str] = field(default_factory=list) + template_present: bool = True + accepts_system_role: bool = True + accepts_generation_prompt: bool = True + + @property + def needs_attention(self) -> bool: + return bool(self.issues) or bool(self.fixes_applied) + + def to_runtime_note(self) -> str | None: + """Render a single-line note suitable for `runtime_note` on + a generation result. Returns None when the template is healthy. + """ + if not self.needs_attention: + return None + parts: list[str] = [] + if self.fixes_applied: + parts.append("auto-fixed: " + ", ".join(self.fixes_applied)) + if self.issues: + parts.append("issues: " + ", ".join(self.issues)) + return "Chat template " + "; ".join(parts) + + +# --------------------------------------------------------------------------- +# Heuristics +# --------------------------------------------------------------------------- + +# Gemma family lowercased markers — used to identify models whose chat +# template rejects the system role. +_GEMMA_PREFIXES: tuple[str, ...] = ( + "google/gemma-", + "gemma-", + "mlx-community/gemma-", + "lmstudio-community/gemma-", +) + +# Multimodal (vision-capable) repo prefixes. Lowercased prefix match. +# Models in this set get loaded via ``mlx_vlm.load`` instead of +# ``mlx_lm.load`` and route through the multimodal generate path +# (which decodes the chat ``images`` field into per-image paths and +# passes them to ``mlx_vlm.generate`` / ``stream_generate``). +# +# Add new prefixes here when adopting a vision-capable family. Text-only +# Gemma variants (e.g. older Gemma 1/2 text-only quants on mlx-community +# would go here NEGATIVELY — but Gemma 4 is multimodal across the entire +# family per Google's release, so all gemma-4 variants qualify). +_MULTIMODAL_PREFIXES: tuple[str, ...] = ( + # Gemma 4 family: every variant is multimodal. + "google/gemma-4", + "mlx-community/gemma-4", + "lmstudio-community/gemma-4", + # Qwen2.5-VL family: vision-language model, every variant is multimodal. + "qwen/qwen2.5-vl", + "mlx-community/qwen2.5-vl", + # Qwen3-VL family: future-proofing — same naming convention. + "qwen/qwen3-vl", + "mlx-community/qwen3-vl", + # LLaVA-style models running through mlx-vlm. + "mlx-community/llava-", + "llava-hf/llava-", +) + +# ChatML / Qwen2/3 templates ship `<|im_start|>` markers. When a quant +# ships without `add_generation_prompt` support, the rendered prompt +# stops mid-turn and the model continues the user turn instead of +# replying. Detection: template string contains `<|im_start|>` but +# does NOT reference `add_generation_prompt`. 
+_CHATML_OPEN = "<|im_start|>" +_GENERATION_PROMPT_MARKER = "add_generation_prompt" + + +def _model_ref_lower(model_ref: str | None) -> str: + return (model_ref or "").lower() + + +def is_gemma_family(model_ref: str | None) -> bool: + lowered = _model_ref_lower(model_ref) + return any(lowered.startswith(prefix) for prefix in _GEMMA_PREFIXES) + + +def is_multimodal_family(model_ref: str | None) -> bool: + """Return ``True`` when the repo id matches a vision-capable family + that should be loaded via ``mlx_vlm`` rather than ``mlx_lm``. + + Match is a lowercased prefix scan against ``_MULTIMODAL_PREFIXES``. + Returns ``False`` for text-only models, including Gemma 1/2 quants + that share the ``gemma-`` prefix but are not multimodal. + """ + lowered = _model_ref_lower(model_ref) + return any(lowered.startswith(prefix) for prefix in _MULTIMODAL_PREFIXES) + + +def fold_system_into_first_user(messages: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Gemma fix — fold the system message (if any) into the first user + message so the chat template's system-role rejection doesn't kick in. + + Idempotent on inputs without a system message; preserves order + otherwise. + """ + out: list[dict[str, Any]] = [] + pending_system: str | None = None + for message in messages: + role = message.get("role") + content = message.get("content") or message.get("text") or "" + if role == "system" and not out and not pending_system: + pending_system = str(content) + continue + if role == "user" and pending_system is not None: + merged = f"{pending_system}\n\n{content}" if content else pending_system + out.append({**message, "role": "user", "content": merged}) + pending_system = None + continue + out.append({**message}) + if pending_system is not None and not out: + # System with no following user — preserve as-is rather than dropping. + out.append({"role": "user", "content": pending_system}) + return out + + +def inspect_chat_template( + template: str | None, + model_ref: str | None = None, +) -> ChatTemplateReport: + """Inspect a tokeniser's `chat_template` source and the model ref. + + Returns a structured report. Callers (mlx_worker, inference.py) + apply the fix the report recommends and then surface the + `runtime_note` so the UI can show a banner. + """ + report = ChatTemplateReport() + + if template is None or not template.strip(): + report.template_present = False + report.issues.append("no chat_template found on tokeniser") + return report + + # Gemma family always rejects system role — surface this as an + # auto-fix ("we'll fold system into first user") rather than an + # issue the user has to act on. + if is_gemma_family(model_ref): + report.accepts_system_role = False + report.fixes_applied.append("Gemma family — fold system into first user message") + + # ChatML without add_generation_prompt handling. + if _CHATML_OPEN in template and _GENERATION_PROMPT_MARKER not in template: + report.accepts_generation_prompt = False + report.issues.append( + "ChatML template missing add_generation_prompt handling — " + "responses may truncate mid-turn" + ) + + # Detect templates that hard-code an assistant prefix in the system + # branch, which double-prefixes when the runtime adds its own. 
+ if template.count("<|im_start|>assistant") > 1 and "add_generation_prompt" in template: + report.issues.append( + "Template hard-codes assistant prefix even when " + "add_generation_prompt is True — may emit a doubled marker" + ) + + return report diff --git a/backend_service/helpers/documents.py b/backend_service/helpers/documents.py index f629bf3..c61e982 100644 --- a/backend_service/helpers/documents.py +++ b/backend_service/helpers/documents.py @@ -327,6 +327,13 @@ def __init__(self, persist_path: Path | None = None) -> None: self._bm25 = BM25Scorer() self._fitted = False self._persist_path = persist_path + # Phase 2.6: optional dense-embedding store. Lazily created when + # `add_document` is called with an `embedding_client`. Stays + # None when no semantic path is wired so the legacy TF-IDF + + # BM25 hybrid runs unchanged. + from backend_service.rag import VectorStore # local import: avoid cycle + + self._embeddings: VectorStore | None = None if persist_path and persist_path.exists(): self._load(persist_path) @@ -340,8 +347,16 @@ def add_document( text: str, doc_id: str | None = None, doc_name: str = "document", + embedding_client: Any = None, ) -> int: - """Add a document to the index. Returns number of chunks created.""" + """Add a document to the index. Returns number of chunks created. + + Phase 2.6: when `embedding_client` is provided, also computes + per-chunk embeddings and appends them to the dense store. Embed + failures fall through silently — the lexical (TF-IDF + BM25) + path always succeeds, so document retrieval never breaks + because the embedding subprocess is misconfigured. + """ if not text.strip(): return 0 @@ -362,6 +377,23 @@ def add_document( self._bm25.fit(self._chunks) self._fitted = True + # Phase 2.6: dense embeddings (best-effort). + if embedding_client is not None and chunks: + from backend_service.rag import VectorStore + + if self._embeddings is None: + self._embeddings = VectorStore() + try: + vectors = embedding_client.embed_batch(chunks) + if len(vectors) == len(chunks): + self._embeddings.add_batch(vectors) + else: + # Embedding output mismatch — drop the partial state + # so the search fallback path runs cleanly. + self._embeddings = None + except Exception: + self._embeddings = None + if self._persist_path: self._save() @@ -378,6 +410,12 @@ def remove_document(self, doc_id: str) -> int: self._chunks = [c for i, c in enumerate(self._chunks) if i not in indices_to_remove] self._citations = [c for i, c in enumerate(self._citations) if i not in indices_to_remove] + # Phase 2.6: keep the dense store in lockstep with chunks/citations. + if self._embeddings is not None: + self._embeddings.remove_indices(indices_to_remove) + if self._embeddings.size == 0: + self._embeddings = None + if self._chunks: self._vectoriser.fit(self._chunks) self._bm25.fit(self._chunks) @@ -398,40 +436,82 @@ def search( top_k: int = 5, vector_weight: float = 0.6, bm25_weight: float = 0.4, + embedding_client: Any = None, ) -> list[dict[str, Any]]: """Hybrid search combining vector similarity and BM25 keyword matching. + Phase 2.6: when an `embedding_client` is provided AND the index + has a populated `_embeddings` store with the same chunk count + as `_chunks`, the search rotates to a semantic primary + + keyword/BM25 secondary blend (semantic 70%, BM25 30%). When the + embedding client is missing or returns empty, the function + falls back to the legacy TF-IDF + BM25 blend so no document + retrieval ever fails because semantic was unavailable. 
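The embedding client is duck-typed: only the two methods called in this file are required. A sketch of that implied interface plus a call shape follows; the concrete transport (subprocess, HTTP, in-process model) is an assumption, and the search docstring continues right after this aside.

```python
from typing import Protocol


class EmbeddingClient(Protocol):
    # Implied by the calls in add_document / search: one query at a time for
    # retrieval, a batch per document at index time.
    def embed(self, text: str) -> list[float]: ...
    def embed_batch(self, texts: list[str]) -> list[list[float]]: ...


# Assumed usage shape (the index instance name is illustrative):
# index.add_document(text, doc_name="spec.md", embedding_client=client)
# hits = index.search("how does retry backoff work?", top_k=5, embedding_client=client)
```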
+ Returns list of ``{"text": str, "citation": dict, "score": float}`` dicts. """ if not self._fitted or not self._chunks: return [] - # Get scores from both methods - vec_results = self._vectoriser.query(query, top_k=top_k * 2) bm25_results = self._bm25.query(query, top_k=top_k * 2) - # Normalise scores to [0, 1] - vec_scores: dict[int, float] = {} - if vec_results: - max_vec = max(s for _, s in vec_results) or 1 - vec_scores = {idx: s / max_vec for idx, s in vec_results} + # Try the semantic path first when an embedding client + a fully + # populated vector store are both present. Any error during query + # embedding falls through to the legacy TF-IDF blend below so a + # transient subprocess hang doesn't break document retrieval. + semantic_scores: dict[int, float] = {} + if ( + embedding_client is not None + and getattr(self, "_embeddings", None) is not None + and self._embeddings.size == len(self._chunks) + ): + try: + query_vector = embedding_client.embed(query) + except Exception: + query_vector = None + if query_vector: + semantic_results = self._embeddings.search(query_vector, top_k=top_k * 2) + if semantic_results: + max_sem = max(s for _, s in semantic_results) or 1 + semantic_scores = {idx: s / max_sem for idx, s in semantic_results} bm25_scores: dict[int, float] = {} if bm25_results: max_bm25 = max(s for _, s in bm25_results) or 1 bm25_scores = {idx: s / max_bm25 for idx, s in bm25_results} - # Merge with weighted combination - all_indices = set(vec_scores.keys()) | set(bm25_scores.keys()) - combined: list[tuple[int, float]] = [] - for idx in all_indices: - score = ( - vector_weight * vec_scores.get(idx, 0) - + bm25_weight * bm25_scores.get(idx, 0) - ) - combined.append((idx, score)) - - combined.sort(key=lambda x: x[1], reverse=True) + if semantic_scores: + # Semantic primary + BM25 secondary. Heavier semantic weight + # because the embedding model captures synonyms / paraphrase + # which BM25 cannot. + sem_weight = 0.7 + bm_weight = 0.3 + all_indices = set(semantic_scores.keys()) | set(bm25_scores.keys()) + combined: list[tuple[int, float]] = [] + for idx in all_indices: + score = ( + sem_weight * semantic_scores.get(idx, 0) + + bm_weight * bm25_scores.get(idx, 0) + ) + combined.append((idx, score)) + combined.sort(key=lambda x: x[1], reverse=True) + else: + # Legacy TF-IDF + BM25 fallback. 
+ vec_results = self._vectoriser.query(query, top_k=top_k * 2) + vec_scores: dict[int, float] = {} + if vec_results: + max_vec = max(s for _, s in vec_results) or 1 + vec_scores = {idx: s / max_vec for idx, s in vec_results} + + all_indices = set(vec_scores.keys()) | set(bm25_scores.keys()) + combined = [] + for idx in all_indices: + score = ( + vector_weight * vec_scores.get(idx, 0) + + bm25_weight * bm25_scores.get(idx, 0) + ) + combined.append((idx, score)) + combined.sort(key=lambda x: x[1], reverse=True) results: list[dict[str, Any]] = [] for idx, score in combined[:top_k]: diff --git a/backend_service/helpers/gpu.py b/backend_service/helpers/gpu.py index 2c4e84a..9f3b33a 100644 --- a/backend_service/helpers/gpu.py +++ b/backend_service/helpers/gpu.py @@ -106,6 +106,14 @@ def _snapshot_macos(self) -> dict[str, Any]: # ------------------------------------------------------------------ def _snapshot_nvidia(self) -> dict[str, Any]: + # Try torch.cuda first — when the GPU bundle is installed it reads + # the right total VRAM via the CUDA driver without shelling out, + # and works even if ``nvidia-smi`` isn't on PATH (common on Windows + # when the user installs the driver but not the CUDA toolkit). + torch_snapshot = self._snapshot_torch_cuda() + if torch_snapshot is not None: + return torch_snapshot + try: out = subprocess.check_output( [ @@ -130,8 +138,60 @@ def _snapshot_nvidia(self) -> dict[str, Any]: except (FileNotFoundError, subprocess.SubprocessError, ValueError): pass - # Fallback: system RAM via psutil - return self._fallback_psutil() + # No GPU detected — return a None-VRAM dict rather than reporting + # system RAM as if it were VRAM. The image / video safety + # estimators downstream treat ``vram_total_gb is None`` as + # "unknown" and skip the crash warning, which is the correct + # behaviour when we genuinely don't know the card's capacity. + return self._no_gpu_detected() + + def _snapshot_torch_cuda(self) -> dict[str, Any] | None: + """Read total + used VRAM from torch.cuda when available. + + Returns ``None`` if torch isn't importable, has no CUDA build, or + no CUDA device is currently visible (driver missing, GPU + passthrough disabled, etc.). The caller then falls through to + ``nvidia-smi``. + + Importing torch is heavy (~200ms first time) but the result is + cached one level up by ``get_device_vram_total_gb``, so the cost + is paid at most once per backend session. 
+ """ + try: + import torch # type: ignore + except Exception: + return None + try: + if not torch.cuda.is_available(): + return None + device = torch.cuda.current_device() + props = torch.cuda.get_device_properties(device) + total_bytes = int(props.total_memory) + try: + free_bytes, _ = torch.cuda.mem_get_info(device) + used_bytes = max(0, total_bytes - int(free_bytes)) + except Exception: + used_bytes = 0 + return { + "gpu_name": props.name, + "vram_total_gb": round(total_bytes / (1024 ** 3), 2), + "vram_used_gb": round(used_bytes / (1024 ** 3), 2), + "utilization_pct": None, + "temperature_c": None, + "power_w": None, + } + except Exception: + return None + + def _no_gpu_detected(self) -> dict[str, Any]: + return { + "gpu_name": "No GPU detected", + "vram_total_gb": None, + "vram_used_gb": None, + "utilization_pct": None, + "temperature_c": None, + "power_w": None, + } # ------------------------------------------------------------------ # Fallback @@ -221,6 +281,97 @@ def nvidia_gpu_present() -> bool: return shutil.which("nvidia-smi") is not None +def torch_install_warning() -> str | None: + """Detect a torch wheel/host mismatch WITHOUT importing torch. + + Three failure modes that all silently sandbag generation onto CPU: + + 1. NVIDIA GPU present but torch isn't installed at all -- the GPU + bundle never ran, so even the "Real engine ready" badge would + be misleading. + 2. NVIDIA GPU present but the installed torch wheel is the +cpu + build -- the bundle ran but pip resolved the CPU wheel instead + of a CUDA one. This is the case the user keeps hitting on a + 4090: Studio shows "Device: cuda (expected)" because nvidia-smi + is on PATH, but generation runs on CPU because torch is + literally CPU-only. + 3. Apple Silicon host but no torch installed -- mirrors case 1. + + Returns a one-line warning string when a mismatch is detected, + ``None`` when everything looks fine. Importing torch would lock + torch DLLs in the backend process and break the GPU-bundle install + flow on Windows, so we read the wheel's dist-info METADATA from + sys.path / extras instead. + """ + import importlib.util + import sys + from pathlib import Path + + spec = importlib.util.find_spec("torch") + torch_installed = spec is not None + torch_local_version: str | None = None # "+cpu", "+cu124", "+cu128", ... + torch_version_str: str | None = None # "2.6.0+cpu" etc. + + # Read torch/version.py directly. That file is what Python executes at + # ``import torch`` time, so it's the only ground truth for the actual + # local-version tag. Don't trust dist-info names: pip can leave a stale + # ``torch-X.Y.Z+cu124.dist-info`` dir next to the +cpu wheel that was + # installed afterwards (each install of a different local-version + # creates its own dist-info but only ONE set of package files survives). + # The user we're chasing has exactly that state -- both dist-info dirs + # present, but ``torch/version.py`` reports ``2.6.0+cpu``. 
+ if spec is not None and spec.origin: + try: + version_path = Path(spec.origin).with_name("version.py") + if version_path.is_file(): + text = version_path.read_text(errors="ignore") + for line in text.splitlines(): + stripped = line.strip() + if stripped.startswith("__version__"): + # Lines look like: __version__ = '2.6.0+cpu' + for quote in ("'", '"'): + if quote in stripped: + _, _, rest = stripped.partition(quote) + value, _, _ = rest.partition(quote) + if value: + torch_version_str = value + break + break + if torch_version_str and "+" in torch_version_str: + torch_local_version = "+" + torch_version_str.split("+", 1)[1] + except OSError: + pass + + nvidia_present = nvidia_gpu_present() + on_apple_silicon = ( + platform.system() == "Darwin" + and platform.machine() in ("arm64", "aarch64") + ) + + # Case 2 first: bundle ran, picked the wrong wheel. Most actionable. + if nvidia_present and torch_installed and torch_local_version: + if torch_local_version.lower().startswith("+cpu"): + return ( + f"torch is installed as a CPU-only wheel ({torch_version_str}) " + "even though an NVIDIA GPU is present. Generation will run " + "on CPU at a fraction of GPU speed. Open Settings > Setup " + "and click Install CUDA torch, then Restart Backend." + ) + # Case 1: NVIDIA host but no torch at all. + if nvidia_present and not torch_installed: + return ( + "torch is not installed but an NVIDIA GPU is present. Open " + "Settings > Setup and click Install GPU runtime." + ) + # Case 3: Apple Silicon but no torch. + if on_apple_silicon and not torch_installed: + return ( + "torch is not installed. Open Settings > Setup and click " + "Install GPU runtime to enable Apple Silicon (MPS) generation." + ) + return None + + _CUDA_WHEEL_HINT = ( "Click \"Install CUDA torch\" in this banner, or run: " "pip install --upgrade --force-reinstall torch " diff --git a/backend_service/helpers/images.py b/backend_service/helpers/images.py index 51fcd7d..290fe33 100644 --- a/backend_service/helpers/images.py +++ b/backend_service/helpers/images.py @@ -26,6 +26,10 @@ _parse_iso_datetime, ) from backend_service.helpers.discovery import _candidate_model_dirs, _path_size_bytes +from backend_service.helpers.platform_filter import ( + filter_mlx_only_families, + is_apple_silicon, +) from backend_service.image_runtime import validate_local_diffusers_snapshot @@ -196,7 +200,7 @@ def _image_model_payloads(library: list[dict[str, Any]]) -> list[dict[str, Any]] "variants": variants, } ) - return families + return filter_mlx_only_families(families, on_apple_silicon=is_apple_silicon()) def _find_image_variant(model_id: str) -> dict[str, Any] | None: diff --git a/backend_service/helpers/memory_gate.py b/backend_service/helpers/memory_gate.py new file mode 100644 index 0000000..44b4612 --- /dev/null +++ b/backend_service/helpers/memory_gate.py @@ -0,0 +1,202 @@ +"""Pre-flight memory gates for chat / image / video generation. + +Phase 2.0.5-B: refuses generation requests when free system memory is below +a safety floor, before the runtime gets a chance to OOM and wedge the host. +The gate is intentionally conservative — it doesn't try to predict exact +working-set size (the model is already loaded, KV pressure varies with +context length) — it just bails when the system is already memory-starved. + +Decision factors: + * `available_gb` — `psutil.virtual_memory().available`, the kernel's own + estimate of memory that can be allocated without forcing major GC or + swap, which is the right measure on every supported OS. 
+ * `pressure_percent` — same formula the system snapshot exposes + (used + compressed + swap), which captures real pressure on macOS where + `available` underreports compressed pages. + +If both signals trip the floor, refuse with a structured message the UI can +render verbatim. Callers receive `None` on success or a dict with `code` +and `message`. +""" + +from __future__ import annotations + +from typing import Any + + +# Minimum free memory required to start a chat generation. Smaller than the +# image/video gates because chat KV growth per turn is typically <1 GB; the +# model itself is already resident. +CHAT_MIN_AVAILABLE_GB = 1.0 +# Combined-pressure ceiling. macOS unified memory routinely sits at 90-97% +# pressure during normal use because the kernel aggressively compresses +# pages — the original 92% threshold turned out to be too strict and +# refused generations that would have completed comfortably. We now treat +# `available_gb` as the primary signal and only fall back to the pressure +# ceiling at near-OOM levels (98%+). Raise this only if the available-GB +# floor proves insufficient. +CHAT_MAX_PRESSURE_PERCENT = 98.0 + +# Phase 2.0.5-H: image generation typically needs 4-12 GB working set on +# top of the already-resident pipeline (latents, attention buffers, VAE +# decode). The gate is a backstop — refuses when the host is already +# strained enough that an OOM during inference would wedge the laptop. +IMAGE_MIN_AVAILABLE_GB = 4.0 +IMAGE_MAX_PRESSURE_PERCENT = 95.0 + +# Video gen working set scales with frame count + resolution. Strictest +# of the three gates — a hung video gen on Apple Silicon will typically +# swap-thrash for minutes before recovering. +VIDEO_MIN_AVAILABLE_GB = 6.0 +VIDEO_MAX_PRESSURE_PERCENT = 92.0 + + +def gate_chat_generation( + available_gb: float, + pressure_percent: float, + *, + min_available_gb: float = CHAT_MIN_AVAILABLE_GB, + max_pressure_percent: float = CHAT_MAX_PRESSURE_PERCENT, +) -> dict[str, Any] | None: + """Decide whether a chat generation may proceed. + + Returns `None` when the system has enough headroom. Returns a refusal + dict with `code` and `message` when memory is too tight. The message is + user-facing — the UI surfaces it directly via the standard chat error + path. + """ + if available_gb < min_available_gb: + return { + "code": "memory_gate_low_available", + "message": ( + f"Only {available_gb:.1f} GB of RAM available — at least " + f"{min_available_gb:.1f} GB free is required to start a " + "generation safely. Try unloading any warm models or " + "closing other applications, then retry." + ), + } + if pressure_percent > max_pressure_percent: + return { + "code": "memory_gate_high_pressure", + "message": ( + f"System memory pressure is {pressure_percent:.0f}% — generation " + "would risk swap thrashing or an OOM kill. Free some memory " + "(unload warm models, close apps) and retry." + ), + } + return None + + +def gate_image_generation( + available_gb: float, + pressure_percent: float, + *, + min_available_gb: float = IMAGE_MIN_AVAILABLE_GB, + max_pressure_percent: float = IMAGE_MAX_PRESSURE_PERCENT, +) -> dict[str, Any] | None: + """Pre-flight check for image generation. Returns refusal or None. + + Image inference can OOM swap-thrash for minutes before recovering, so + we require materially more headroom than chat. Same shape as + `gate_chat_generation` so call sites can render the message uniformly. 
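A hypothetical request-handler sketch showing the intended call pattern (read the two signals once, gate, surface the refusal verbatim); the handler shape is an assumption, not part of this module. The gate_image_generation body continues right after this aside.

```python
from backend_service.helpers.memory_gate import (
    gate_image_generation,
    snapshot_memory_signals,
)


def preflight_image_request() -> dict | None:
    # Placeholder handler: None means "proceed"; otherwise the dict carries
    # a machine-readable code plus a user-facing message the UI renders as-is.
    available_gb, pressure_percent = snapshot_memory_signals()
    return gate_image_generation(available_gb, pressure_percent)
```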
+ """ + if available_gb < min_available_gb: + return { + "code": "memory_gate_image_low_available", + "message": ( + f"Only {available_gb:.1f} GB of RAM available — image " + f"generation needs at least {min_available_gb:.1f} GB free " + "to run safely. Unload warm models or close other apps " + "before retrying." + ), + } + if pressure_percent > max_pressure_percent: + return { + "code": "memory_gate_image_high_pressure", + "message": ( + f"Memory pressure is {pressure_percent:.0f}% — image " + "generation would risk swap thrashing. Free some memory " + "before retrying." + ), + } + return None + + +def gate_video_generation( + available_gb: float, + pressure_percent: float, + *, + min_available_gb: float = VIDEO_MIN_AVAILABLE_GB, + max_pressure_percent: float = VIDEO_MAX_PRESSURE_PERCENT, +) -> dict[str, Any] | None: + """Pre-flight check for video generation. Returns refusal or None. + + Video working sets scale with frame count + resolution, so the floor + is the strictest of the three gates. A hung diffusion loop on a memory + -starved Apple Silicon machine has historically taken the whole host + down — this gate is the cheapest possible defence. + """ + if available_gb < min_available_gb: + return { + "code": "memory_gate_video_low_available", + "message": ( + f"Only {available_gb:.1f} GB of RAM available — video " + f"generation needs at least {min_available_gb:.1f} GB free " + "to avoid swap thrashing. Unload warm models or close " + "other apps before retrying." + ), + } + if pressure_percent > max_pressure_percent: + return { + "code": "memory_gate_video_high_pressure", + "message": ( + f"Memory pressure is {pressure_percent:.0f}% — video " + "generation would likely OOM. Free some memory before " + "retrying." + ), + } + return None + + +def snapshot_memory_signals() -> tuple[float, float]: + """Read current available-RAM + pressure-percent signals. + + Mirrors the formulas in `helpers/system.system_snapshot` but is cheaper + to call repeatedly — no model catalog refresh, no GPU probing. Suitable + for the per-request gate. + """ + import psutil + + memory = psutil.virtual_memory() + try: + swap = psutil.swap_memory() + swap_used = swap.used + except OSError: + swap_used = 0 + total = memory.total + used = memory.used + available = memory.available + available_gb = available / (1024 ** 3) + + # Compressed pages are macOS-specific and not always available; fall + # back to plain used+swap when the read fails so non-Apple platforms + # still get a sensible pressure number. + compressed_used = 0 + try: + from backend_service.helpers.system import _get_compressed_memory_gb + + compressed_used = _get_compressed_memory_gb() * (1024 ** 3) + except Exception: + compressed_used = 0 + + swap_used_gb = swap_used / (1024 ** 3) + used_gb = used / (1024 ** 3) + compressed_used_gb = compressed_used / (1024 ** 3) + pressure_numerator = used_gb + compressed_used_gb + swap_used_gb + total_gb = total / (1024 ** 3) + pressure_percent = ( + min(100.0, (pressure_numerator / total_gb) * 100) + if total_gb > 0 + else 0.0 + ) + return round(available_gb, 1), round(pressure_percent, 1) diff --git a/backend_service/helpers/perf.py b/backend_service/helpers/perf.py new file mode 100644 index 0000000..3a4db09 --- /dev/null +++ b/backend_service/helpers/perf.py @@ -0,0 +1,91 @@ +"""Phase 3.5: cross-platform per-turn perf telemetry snapshot. 
+ +Captures a small bundle of system-side metrics (CPU %, GPU %, +thermal state, available memory) at chat-turn finalisation time so +the frontend can render a compact perf strip below each assistant +response without making a separate round-trip. + +Backed by: +- macOS: psutil + pmset thermal probe (already used by the watchdog + stack — Phase 2.0.5-I) +- Linux: psutil + best-effort GPU sampler. Thermal stays None + because there's no portable read; future iteration could surface + /sys/class/thermal_zone* readings. +- Windows: psutil + best-effort NVML / pdh.dll counter (deferred — + returns None for now). + +Best-effort everywhere: any sampler error falls through to None +fields so the UI degrades gracefully. +""" + +from __future__ import annotations + +from dataclasses import asdict, dataclass +from typing import Any + + +@dataclass +class PerfTelemetry: + cpuPercent: float | None = None + gpuPercent: float | None = None + thermalState: str | None = None + availableMemoryGb: float | None = None + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + @property + def is_empty(self) -> bool: + return all( + v is None for v in ( + self.cpuPercent, + self.gpuPercent, + self.thermalState, + self.availableMemoryGb, + ) + ) + + +def snapshot_perf_telemetry() -> PerfTelemetry: + """Sample current host telemetry. Always returns a PerfTelemetry — + fields default to None when the underlying probe fails. Cheap to + call: no subprocess fork unless thermal is read on Darwin (which + re-uses the watchdog's pmset call). + """ + telemetry = PerfTelemetry() + + # CPU + memory via psutil — universally available. + try: + import psutil # noqa: WPS433 — local import keeps boot lean + + # interval=None = non-blocking sample using the rolling baseline + # psutil maintains since import. First call returns 0; subsequent + # calls reflect the delta since the last sample. The chat path + # has been running long enough that the baseline is warm. + telemetry.cpuPercent = round(psutil.cpu_percent(interval=None), 1) + vm = psutil.virtual_memory() + telemetry.availableMemoryGb = round(vm.available / (1024 ** 3), 2) + except Exception: + # Any psutil failure → leave as None. Telemetry strip will + # render only the fields that are present. + pass + + # Thermal — Darwin only today, re-uses Phase 2.0.5-I sampler. + try: + from backend_service.helpers.thermal import read_thermal_state + + telemetry.thermalState = read_thermal_state() + except Exception: + pass + + # GPU utilisation — best-effort, falls back to None on platforms + # without a known sampler. The dashboard's _detect_gpu_utilization + # already covers macOS Metal + NVML, so re-use it. + try: + from backend_service.helpers.system import _detect_gpu_utilization + + telemetry.gpuPercent = _detect_gpu_utilization() + except Exception: + pass + + return telemetry diff --git a/backend_service/helpers/platform_filter.py b/backend_service/helpers/platform_filter.py new file mode 100644 index 0000000..8c2f2ec --- /dev/null +++ b/backend_service/helpers/platform_filter.py @@ -0,0 +1,84 @@ +"""Platform-aware filtering for the image + video model catalogs. + +Some catalog variants only run on Apple Silicon: ``mflux`` (image) routes +through ``mflux``/``mlx-lm`` and ``prince-canuma/LTX-2-*`` (video) routes +through ``mlx-video``. Both of those Python packages depend on ``mlx``, +which has no Linux or Windows wheels. 
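A worked example of the stripping behaviour this module implements (the filter functions are defined further down in the file); the sample family payload is made up for illustration, and the module docstring continues right after this aside.

```python
from backend_service.helpers.platform_filter import filter_mlx_only_families

families = [{
    "familyId": "ltx-2",
    "variants": [
        {"id": "prince-canuma/LTX-2-distilled", "engine": "mlx-video"},      # Apple-only
        {"id": "Lightricks/LTX-Video", "runtime": "diffusers LTXPipeline"},  # cross-platform
    ],
}]

# On Linux / Windows the mlx-video variant is stripped; the family survives
# because one cross-platform variant remains. On Apple Silicon the payload
# is returned untouched.
filtered = filter_mlx_only_families(families, on_apple_silicon=False)
assert [v["id"] for v in filtered[0]["variants"]] == ["Lightricks/LTX-Video"]
```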
Surfacing those variants in the +Image Studio / Video Studio dropdowns on the wrong OS lets users pick +something that cannot run, so this module strips them server-side +before the payload reaches the frontend. + +The detection is conservative: a variant is treated as MLX-only iff it +declares so explicitly via ``mlxOnly`` or it carries one of the runtime +labels we know is Apple-only. New runtime labels need to be added here +when they ship — falsely keeping an entry visible is a regression we'd +catch at smoke test, falsely hiding one isn't. +""" + +from __future__ import annotations + +import platform +from typing import Any + + +_MLX_ONLY_RUNTIME_MARKERS: tuple[str, ...] = ( + "mflux (MLX native)", + "mlx-video (MLX native)", +) + +_MLX_ONLY_ENGINES: frozenset[str] = frozenset({"mflux", "mlx-video"}) + + +def is_apple_silicon(system: str | None = None, machine: str | None = None) -> bool: + """True iff the host is Darwin running on arm64. + + Both arguments are exposed for tests so the platform check can be + pinned without monkeypatching ``platform`` itself. They default to + the live host values. + """ + sys_name = system if system is not None else platform.system() + arch = machine if machine is not None else platform.machine() + return sys_name == "Darwin" and arch == "arm64" + + +def is_mlx_only_variant(variant: dict[str, Any]) -> bool: + """True iff the variant cannot run outside Apple Silicon.""" + if variant.get("mlxOnly") is True: + return True + engine = str(variant.get("engine") or "").strip().lower() + if engine in _MLX_ONLY_ENGINES: + return True + runtime = str(variant.get("runtime") or "") + return any(marker in runtime for marker in _MLX_ONLY_RUNTIME_MARKERS) + + +def filter_mlx_only_families( + families: list[dict[str, Any]], + *, + on_apple_silicon: bool, +) -> list[dict[str, Any]]: + """Strip MLX-only variants from a catalog payload on non-Apple hosts. + + On Apple Silicon every variant is preserved untouched. On every other + OS the MLX-only variants are dropped from each family's ``variants`` + list, and any family whose entire variant set is MLX-only is dropped + from the result so the UI doesn't render an empty card. + + Returns a new list — the input is not mutated. + """ + if on_apple_silicon: + return families + + filtered: list[dict[str, Any]] = [] + for family in families: + variants = [ + variant + for variant in family.get("variants", []) + if not is_mlx_only_variant(variant) + ] + if not variants: + continue + new_family = dict(family) + new_family["variants"] = variants + filtered.append(new_family) + return filtered diff --git a/backend_service/helpers/preview_thumbnails.py b/backend_service/helpers/preview_thumbnails.py new file mode 100644 index 0000000..d51b15b --- /dev/null +++ b/backend_service/helpers/preview_thumbnails.py @@ -0,0 +1,236 @@ +"""Live denoise thumbnail emit (FU-018 part 2). + +Decodes the current ``callback_kwargs["latents"]`` tensor through the +TAESD / TAEHV preview VAE that ``maybe_apply_preview_vae`` swapped onto +``pipeline.vae``, scales the result down, base64-encodes a PNG, and +returns the string for ``ProgressTracker.set_thumbnail`` to publish. + +Two helpers — one for image pipelines (latents shape ``(B, C, H, W)``) +and one for video pipelines (latents shape ``(B, C, F, H, W)`` — +TAEHV/TAEW reduce on the frame axis already, but for thumbnails we +just pick the middle frame). 
Both clamp to a max output size (default +192 px on the long edge) to keep base64 payloads cheap on the polled +``/api/{images,video}/progress`` endpoint. + +Errors are intentionally swallowed and turned into a ``None`` return — +a thumbnail decode crash should never abort the actual generation. The +caller (``callback_on_step_end``) just clears the slot and the UI +shows the previous frame until the next successful decode. +""" + +from __future__ import annotations + +import base64 +import io +from typing import Any + +# Cap thumbnail size so a 1024px gen doesn't push 1.5 MB of PNG through +# the polling endpoint each step. 192 px on the long edge keeps PNGs +# under ~30 KB after compression on typical content. +_MAX_THUMB_SIDE = 192 + + +def _to_pil_from_tensor(image_tensor: Any): + """Map a torch / mlx tensor (single image, 3xHxW or HxWx3, [-1,1] or + [0,1]) to a ``PIL.Image``. Returns ``None`` on shape mismatch.""" + try: + from PIL import Image + import numpy as np + except ImportError: + return None + + if image_tensor is None: + return None + + # Accept torch.Tensor or numpy.ndarray. Detach + cpu + numpy. + array = image_tensor + if hasattr(array, "detach"): + array = array.detach() + if hasattr(array, "to"): + try: + array = array.to("cpu") + except Exception: + pass + if hasattr(array, "float"): + try: + array = array.float() + except Exception: + pass + if hasattr(array, "numpy"): + try: + array = array.numpy() + except Exception: + return None + if not hasattr(array, "shape"): + return None + + # Squeeze to a single image. Common shapes: + # (1, 3, H, W) -> (3, H, W) + # (3, H, W) + # (H, W, 3) + if array.ndim == 4 and array.shape[0] == 1: + array = array[0] + if array.ndim != 3: + return None + if array.shape[0] in (1, 3) and array.shape[-1] not in (1, 3): + # CHW -> HWC + array = np.transpose(array, (1, 2, 0)) + if array.shape[-1] == 1: + array = np.repeat(array, 3, axis=-1) + if array.shape[-1] != 3: + return None + + # Normalise into [0, 255] uint8. Detect [-1, 1] vs [0, 1] from the + # observed range — taking the min lets us cover both VAE-output + # conventions without an explicit flag. + arr_min = float(array.min()) + if arr_min < -0.05: + array = (array + 1.0) * 0.5 + array = np.clip(array, 0.0, 1.0) + array = (array * 255.0).round().astype("uint8") + + return Image.fromarray(array, mode="RGB") + + +def _scale_to_max_side(image, max_side: int): + if image is None: + return None + w, h = image.size + long_side = max(w, h) + if long_side <= max_side: + return image + ratio = max_side / float(long_side) + target_w = max(1, int(round(w * ratio))) + target_h = max(1, int(round(h * ratio))) + return image.resize((target_w, target_h)) + + +def _pil_to_b64_png(image) -> str | None: + if image is None: + return None + try: + buf = io.BytesIO() + image.save(buf, format="PNG", optimize=True) + return base64.b64encode(buf.getvalue()).decode("ascii") + except Exception: + return None + + +def _unpack_flux_latents(pipeline: Any, latents: Any) -> Any: + """Convert FLUX's packed 3D latent ``(B, seq_len, 64)`` back to the + 4D ``(B, 16, H/8, W/8)`` shape ``vae.decode`` expects. + + FLUX packs 2x2 patches of 16-channel latents into a single sequence + token, so ``seq_len = (H/16) * (W/16)``. We assume square latents + when reading dimensions — that covers every FLUX preset we ship and + keeps the helper from poking at private pipeline state for size info. 
+ """ + try: + import math + except Exception: + return None + if latents is None or not hasattr(latents, "shape") or len(latents.shape) != 3: + return None + seq_len = latents.shape[1] + side = int(round(math.sqrt(seq_len))) + if side * side != seq_len: + return None + # Pixel dimensions: each token covers a 16x16 pixel patch (FLUX + # patch_size=2 over a 8x VAE downsample → 16 pixel stride). + pixel_side = side * 16 + unpack = getattr(pipeline, "_unpack_latents", None) + if not callable(unpack): + return None + try: + # Most FLUX pipelines expose ``vae_scale_factor`` directly; fall + # back to 8 (the published default for AutoencoderKL on FLUX). + vae_scale = int(getattr(pipeline, "vae_scale_factor", 8) or 8) + return unpack(latents, pixel_side, pixel_side, vae_scale) + except Exception: + return None + + +def decode_image_latent_to_b64( + pipeline: Any, + latents: Any, + *, + max_side: int = _MAX_THUMB_SIDE, +) -> str | None: + """Decode an image latent via ``pipeline.vae``, scale down, return + base64 PNG. Handles both standard 4D ``(B, C, H, W)`` latents + (SD1.5 / SDXL / SD3) and FLUX's packed 3D ``(B, seq_len, 64)`` + latents — we unpack via ``pipeline._unpack_latents`` before decode. + Returns ``None`` on any failure.""" + vae = getattr(pipeline, "vae", None) + if vae is None or latents is None: + return None + try: + import torch + except ImportError: + return None + + try: + # FLUX packed latents need an unpack pass before VAE decode. + if hasattr(latents, "shape") and len(latents.shape) == 3: + unpacked = _unpack_flux_latents(pipeline, latents) + if unpacked is None: + return None + latents = unpacked + + with torch.no_grad(): + vae_config = getattr(vae, "config", None) + scaling = float(getattr(vae_config, "scaling_factor", 1.0) or 1.0) + shift = float(getattr(vae_config, "shift_factor", 0.0) or 0.0) + latents_in = latents + # Most diffusers image pipelines store ``latents * scaling_factor + shift`` + # in the noise space — invert that before VAE decode. + if scaling != 1.0 or shift != 0.0: + latents_in = (latents / scaling) + shift if shift else latents / scaling + decoded = vae.decode(latents_in.to(vae.dtype)).sample + # Pick first batch element only — single-image preview is enough. + first = decoded[0:1] if decoded.ndim == 4 else decoded + image = _to_pil_from_tensor(first) + image = _scale_to_max_side(image, max_side) + return _pil_to_b64_png(image) + except Exception: + return None + + +def decode_video_latent_to_b64( + pipeline: Any, + latents: Any, + *, + max_side: int = _MAX_THUMB_SIDE, +) -> str | None: + """Decode a 5D video latent ``(B, C, F, H, W)`` via ``pipeline.vae``, + pick the middle frame, scale down, return base64 PNG. Returns ``None`` + on any failure.""" + vae = getattr(pipeline, "vae", None) + if vae is None or latents is None: + return None + try: + import torch + except ImportError: + return None + + try: + with torch.no_grad(): + scaling = float(getattr(getattr(vae, "config", None), "scaling_factor", 1.0) or 1.0) + latents_in = latents + if scaling != 1.0: + latents_in = latents / scaling + decoded = vae.decode(latents_in.to(vae.dtype)).sample + # Video VAE returns ``(B, C, F, H, W)``. Pick the middle frame. 
+ if decoded.ndim == 5: + frame_count = decoded.shape[2] + mid = frame_count // 2 + frame = decoded[0, :, mid, :, :] + elif decoded.ndim == 4: + frame = decoded[0] + else: + return None + image = _to_pil_from_tensor(frame) + image = _scale_to_max_side(image, max_side) + return _pil_to_b64_png(image) + except Exception: + return None diff --git a/backend_service/helpers/preview_vae.py b/backend_service/helpers/preview_vae.py new file mode 100644 index 0000000..aee90e2 --- /dev/null +++ b/backend_service/helpers/preview_vae.py @@ -0,0 +1,143 @@ +"""TAESD / TAEHV preview-decode VAE swap (FU-018). + +Tiny VAE for cheap decode each step. Preview-only by default — caller +toggles via the ``previewVae`` knob on the generation request. The full +generate path uses the swapped-in VAE so the user trades final fidelity +for wall-time. Real-time UI thumbnails would use this same swap with the +per-step callback hook (planned). + +Per-family mapping (longest prefix wins): + +- FLUX.1 family → ``madebyollin/taef1`` +- FLUX.2 family → ``madebyollin/taef2`` +- SD3 / SD3.5 → ``madebyollin/taesd3`` +- SDXL → ``madebyollin/taesdxl`` +- SD 1.x / 2.x → ``madebyollin/taesd`` +- Wan2.1 / Wan2.2 (any) → ``madebyollin/taew2_2`` +- LTX-Video / LTX-2 family → ``madebyollin/taeltx2_3_wide`` +- HunyuanVideo → ``madebyollin/taehv1_5`` +- Qwen-Image family → ``madebyollin/taeqwenimage`` +- CogVideoX → ``madebyollin/taecogvideox`` +- Mochi → ``madebyollin/taemochi`` + +The helper tries ``AutoencoderTiny.from_pretrained(..., local_files_only=True)`` +first, then falls back to a remote fetch. Anything that isn't cached and +isn't reachable is treated as a no-op with a runtimeNote so the caller +can show the user why the swap didn't apply. +""" + +from __future__ import annotations + +import importlib.util +from typing import Any + + +# Repo-prefix → preview VAE HF id. Order matters: longer / more-specific +# prefixes first so FLUX.2 doesn't trigger the FLUX.1 default. +_PREVIEW_VAE_MAP: list[tuple[str, str]] = [ + ("black-forest-labs/FLUX.2", "madebyollin/taef2"), + ("black-forest-labs/FLUX.1", "madebyollin/taef1"), + ("fal/FLUX.2", "madebyollin/taef2"), + ("stabilityai/stable-diffusion-3", "madebyollin/taesd3"), + ("stabilityai/stable-diffusion-xl", "madebyollin/taesdxl"), + # Turbo / Lightning variants ship under shorter repo ids + # (no ``stable-diffusion-xl`` prefix) so they need explicit entries. 
+ ("stabilityai/sdxl-turbo", "madebyollin/taesdxl"), + ("stabilityai/sd-turbo", "madebyollin/taesd"), + ("ByteDance/SDXL-Lightning", "madebyollin/taesdxl"), + ("stabilityai/stable-diffusion-2", "madebyollin/taesd"), + ("stabilityai/stable-diffusion-v1", "madebyollin/taesd"), + ("runwayml/stable-diffusion-v1", "madebyollin/taesd"), + ("Wan-AI/Wan2", "madebyollin/taew2_2"), + ("QuantStack/Wan2", "madebyollin/taew2_2"), + ("Lightricks/LTX-Video", "madebyollin/taeltx2_3_wide"), + ("prince-canuma/LTX-2", "madebyollin/taeltx2_3_wide"), + ("hunyuanvideo-community/HunyuanVideo", "madebyollin/taehv1_5"), + ("tencent/HunyuanVideo", "madebyollin/taehv1_5"), + ("THUDM/CogVideoX", "madebyollin/taecogvideox"), + ("genmo/mochi", "madebyollin/taemochi"), + ("Qwen/Qwen-Image", "madebyollin/taeqwenimage"), +] + + +def resolve_preview_vae_id(repo: str) -> str | None: + """Map a base repo id to a preview VAE HF id, or ``None`` if unmapped.""" + for prefix, vae_id in _PREVIEW_VAE_MAP: + if repo.startswith(prefix): + return vae_id + return None + + +def maybe_apply_preview_vae( + pipeline: Any, + *, + repo: str, + enabled: bool, +) -> str | None: + """Swap ``pipeline.vae`` for the matching TAESD / TAEHV preview decoder. + + Returns a runtimeNote string when the swap applied (or attempted-but-failed + visibly), or ``None`` when the toggle is off, no preview VAE is mapped + for the repo, or diffusers itself is missing. Failures are non-fatal — + caller continues with the stock VAE. + """ + if not enabled: + return None + if importlib.util.find_spec("diffusers") is None: + return None + + preview_id = resolve_preview_vae_id(repo) + if preview_id is None: + return None + + target_vae = getattr(pipeline, "vae", None) + if target_vae is None: + return "Preview VAE skipped: pipeline has no .vae attribute." + + target_dtype = getattr(target_vae, "dtype", None) + target_device = getattr(target_vae, "device", None) + + try: + from diffusers import AutoencoderTiny + except ImportError as exc: + return f"Preview VAE skipped: AutoencoderTiny unavailable ({exc})." + + kwargs: dict[str, Any] = {} + if target_dtype is not None: + kwargs["torch_dtype"] = target_dtype + + # Try the local cache first so offline use keeps working when the + # preview VAE hasn't been downloaded yet. If it's not cached, fall + # through to a remote attempt — preview VAEs are small (~5-30 MB) + # so the download cost is negligible. + preview_vae = None + try: + preview_vae = AutoencoderTiny.from_pretrained( + preview_id, local_files_only=True, **kwargs + ) + except Exception: + try: + preview_vae = AutoencoderTiny.from_pretrained(preview_id, **kwargs) + except Exception as exc: + return ( + f"Preview VAE {preview_id} not cached and download failed " + f"({type(exc).__name__}: {exc}). Using stock VAE." + ) + + # ``from_pretrained`` defaults to CPU. Match the stock VAE's device + # so the swap doesn't trigger a device-type mismatch on the first + # decoder call (e.g. SDXL on MPS would otherwise raise + # ``Input type (MPSHalfType) and weight type (torch.HalfTensor) + # should be the same``). + if target_device is not None: + try: + preview_vae = preview_vae.to(target_device) + except Exception as exc: + return ( + f"Preview VAE {preview_id} loaded but device move to " + f"{target_device} failed ({type(exc).__name__}: {exc}). " + "Using stock VAE." + ) + + pipeline.vae = preview_vae + return f"Preview VAE: {preview_id} (fast decode)." 
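How preview_vae.py and preview_thumbnails.py compose is implied but not shown in this patch: the VAE swap happens once at pipeline-load time and the thumbnail decode runs inside the per-step callback. A minimal sketch of that wiring, assuming the standard diffusers ``callback_on_step_end`` contract and a ``tracker`` object exposing the ``set_thumbnail`` publish hook described above (``attach_live_preview`` and ``tracker`` are stand-in names, not part of this patch; the shipped callback in the runtime may differ):

from backend_service.helpers.preview_thumbnails import decode_image_latent_to_b64
from backend_service.helpers.preview_vae import maybe_apply_preview_vae


def attach_live_preview(pipeline, tracker, *, repo: str, enabled: bool):
    # Swap in the tiny VAE first so every per-step decode below is cheap.
    note = maybe_apply_preview_vae(pipeline, repo=repo, enabled=enabled)

    def on_step_end(pipe, step, timestep, callback_kwargs):
        # Decode failures return None and must never abort the generation.
        thumb = decode_image_latent_to_b64(pipe, callback_kwargs.get("latents"))
        if thumb is not None:
            tracker.set_thumbnail(thumb)
        return callback_kwargs  # diffusers expects the kwargs dict back

    return note, on_step_end


# pipeline(prompt, ..., callback_on_step_end=on_step_end,
#          callback_on_step_end_tensor_inputs=["latents"])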
diff --git a/backend_service/helpers/prompt_enhancer.py b/backend_service/helpers/prompt_enhancer.py new file mode 100644 index 0000000..e95264c --- /dev/null +++ b/backend_service/helpers/prompt_enhancer.py @@ -0,0 +1,378 @@ +"""LLM-based prompt enhancer (FU-022). + +Replaces the deterministic per-family suffix template that ``_enhance_prompt`` +appends in ``video_runtime.py`` with a small instruction model that +auto-rewrites short prompts into the structured 50-100 word format each +video DiT was trained on. Apple Silicon path uses ``mlx_lm`` directly; +CUDA / Linux fall back to the legacy template suffix until a llama.cpp +GGUF path lands. + +Default model: ``mlx-community/Qwen2.5-0.5B-Instruct-4bit`` (~700 MB on +disk, ~2-3s cold load on M-series, sub-second per generation). Picked +over the 1B Llama variant the original FU-022 plan named because: + * smaller memory footprint when the enhancer shares the FastAPI + sidecar's process (vs spawning a dedicated worker) + * already cached on most dev boxes (FU-002 spike used it) + * 0.5B Qwen2.5-Instruct still produces the structured 50-100 word + rewrites we need; the enhancer task is constrained enough that the + extra reasoning headroom of 1B isn't load-bearing. + +The helper caches the loaded model in a process-level singleton — +first call pays the load cost, subsequent calls reuse it. Failure +modes (model not cached, mlx_lm missing, generation crash) all return +the deterministic template fallback + a runtimeNote when enabled, so +non-Apple hosts still get useful short-prompt enhancement. +""" + +from __future__ import annotations + +import logging +import platform +import threading +from dataclasses import dataclass + +LOG = logging.getLogger(__name__) + + +# Per-family system prompt that anchors the model to the DiT's training +# distribution. Keeps the rewrite short (under 100 words) so we don't +# produce verbose paragraphs that overflow the text encoder context +# window. Each suffix mirrors the upstream model card's recommended +# prompt structure. +_FAMILY_SYSTEM_PROMPTS: dict[str, str] = { + "wan": ( + "You rewrite short user prompts into Wan-AI video model format. " + "Stay under 80 words. Always include: subject + action + setting + " + "camera angle + lighting + mood. Do not add cinematic jargon the " + "user did not ask for. Output only the rewritten prompt — no " + "preamble, no quotation marks." + ), + "ltx": ( + "You rewrite short user prompts into LTX-Video format. Stay under " + "70 words. Always include: subject + action + setting + camera " + "movement (e.g. 'tracking shot', 'static wide angle') + lighting " + "(e.g. 'golden hour', 'overcast'). Output only the rewritten " + "prompt — no preamble, no quotation marks." + ), + "hunyuan": ( + "You rewrite short user prompts into HunyuanVideo format. Stay " + "under 75 words. Always include: subject + action + setting + " + "camera shot (close-up / medium / wide) + atmosphere. Avoid " + "redundant adjectives. Output only the rewritten prompt — no " + "preamble, no quotation marks." + ), + "flux": ( + "You rewrite short user prompts into FLUX image format. Stay " + "under 60 words. Always include: subject + composition + " + "lighting + style (e.g. 'photorealistic', 'oil painting', " + "'cinematic'). Output only the rewritten prompt — no preamble, " + "no quotation marks." + ), + "sdxl": ( + "You rewrite short user prompts into SDXL image format. Stay " + "under 50 words. Always include: subject + composition + " + "lighting + comma-separated style tags. 
Output only the " + "rewritten prompt — no preamble, no quotation marks." + ), + "sd3": ( + "You rewrite short user prompts into Stable Diffusion 3 format. " + "Stay under 60 words. Always include: subject + setting + " + "composition + lighting + medium / style. Output only the " + "rewritten prompt — no preamble, no quotation marks." + ), + "default": ( + "You rewrite short user prompts into a richer 50-80 word " + "description while preserving the user's intent. Always include: " + "subject + action + setting + lighting + style. Output only the " + "rewritten prompt — no preamble, no quotation marks." + ), +} + + +# Repo-prefix → family id (longest match wins). ``family_for`` walks +# this in declared order, so put more-specific prefixes first. +_FAMILY_MAP: list[tuple[str, str]] = [ + ("Wan-AI/", "wan"), + ("QuantStack/Wan", "wan"), + ("Lightricks/LTX", "ltx"), + ("prince-canuma/LTX", "ltx"), + ("hunyuanvideo-community/", "hunyuan"), + ("tencent/HunyuanVideo", "hunyuan"), + ("THUDM/CogVideoX", "cogvideox"), + ("genmo/mochi", "mochi"), + ("black-forest-labs/FLUX", "flux"), + ("fal/FLUX", "flux"), + ("stabilityai/stable-diffusion-3", "sd3"), + ("stabilityai/stable-diffusion-xl", "sdxl"), + ("stabilityai/sdxl-turbo", "sdxl"), + ("ByteDance/SDXL-Lightning", "sdxl"), +] + + +# Default enhancer model. Override via ``CHAOSENGINE_ENHANCER_MODEL`` +# env var when a different small instruct model is preferred. +_DEFAULT_ENHANCER_MODEL = "mlx-community/Qwen2.5-0.5B-Instruct-4bit" +_PROMPT_ENHANCE_MIN_WORDS = 25 + +_IMAGE_TEMPLATE_SUFFIXES: dict[str, str] = { + "flux": ( + ", detailed composition, balanced lighting, crisp subject focus, " + "high-quality visual detail." + ), + "sdxl": ( + ", detailed composition, balanced lighting, sharp focus, high quality." + ), + "sd3": ( + ", detailed scene description, balanced lighting, strong composition, " + "high-quality visual detail." + ), + "default": ( + ", detailed setting, balanced lighting, clear composition, high-quality " + "visual detail." + ), +} + + +def family_for(repo: str) -> str: + """Map a base repo id to a family id used by the system prompt + table. Falls back to ``"default"`` for unknown repos.""" + for prefix, family in _FAMILY_MAP: + if repo.startswith(prefix): + return family + return "default" + + +@dataclass(frozen=True) +class EnhancementResult: + """Output of ``enhance_prompt``. ``enhanced == prompt`` when the + enhancer was unavailable / errored — the caller still gets a + runtimeNote so the user sees why.""" + + enhanced: str + note: str | None + modelUsed: str | None + family: str + + +class _EnhancerSingleton: + """Process-level cache for the loaded mlx_lm model + tokenizer. + First call into ``ensure_loaded`` pays the ~2-3s load cost; + subsequent calls reuse the in-memory state under a lock so two + concurrent enhancement requests don't both try to load.""" + + def __init__(self) -> None: + self._lock = threading.RLock() + self._model = None + self._tokenizer = None + self._model_id: str | None = None + self._unavailable_reason: str | None = None + + def reset(self) -> None: + """Drop the cached model — caller invokes this when a memory + pressure event tells us to free up RAM, or in test setUp.""" + with self._lock: + self._model = None + self._tokenizer = None + self._model_id = None + self._unavailable_reason = None + + def ensure_loaded(self, model_id: str) -> tuple[bool, str | None]: + """Idempotent load. 
Returns ``(loaded, error_reason)``.""" + with self._lock: + if self._model is not None and self._model_id == model_id: + return True, None + # Different model requested — drop the old one before loading + # the new. Prevents two ~700 MB models stacking in memory. + self._model = None + self._tokenizer = None + self._model_id = None + + if platform.system() != "Darwin": + self._unavailable_reason = ( + "Prompt enhancer requires Apple Silicon (mlx_lm). " + "Falling back to the deterministic template suffix." + ) + return False, self._unavailable_reason + + try: + from mlx_lm import load as mlx_lm_load + except ImportError as exc: + self._unavailable_reason = ( + f"Prompt enhancer requires mlx_lm ({exc}). " + "Falling back to the deterministic template suffix." + ) + return False, self._unavailable_reason + + try: + model, tokenizer = mlx_lm_load(model_id) + except Exception as exc: + self._unavailable_reason = ( + f"Prompt enhancer failed to load {model_id} " + f"({type(exc).__name__}: {exc}). Falling back to the " + "deterministic template suffix." + ) + return False, self._unavailable_reason + + self._model = model + self._tokenizer = tokenizer + self._model_id = model_id + self._unavailable_reason = None + return True, None + + def generate(self, system_prompt: str, user_prompt: str, max_tokens: int = 256) -> str: + """Render the chat-template messages + run a single generation. + Caller has already confirmed ``ensure_loaded`` succeeded.""" + with self._lock: + if self._model is None or self._tokenizer is None: + raise RuntimeError("Prompt enhancer model not loaded.") + from mlx_lm import generate as mlx_lm_generate + + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt}, + ] + try: + rendered = self._tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False, + ) + except Exception: + # Tokenizers without a chat template — concatenate manually. + rendered = ( + f"<|system|>\n{system_prompt}\n<|user|>\n{user_prompt}\n<|assistant|>\n" + ) + + return mlx_lm_generate( + self._model, + self._tokenizer, + prompt=rendered, + max_tokens=max_tokens, + verbose=False, + ) + + +def _template_fallback(prompt: str, *, repo: str, family: str, reason: str | None) -> EnhancementResult: + cleaned = prompt.strip() + if not cleaned: + return EnhancementResult( + enhanced=cleaned, note=None, modelUsed=None, family=family, + ) + + enhanced = cleaned + applied = False + try: + from backend_service.video_runtime import _enhance_prompt as _enhance_video_prompt + + enhanced, video_note = _enhance_video_prompt(repo, cleaned) + applied = bool(video_note and enhanced != cleaned) + except Exception: + enhanced = cleaned + applied = False + + if not applied: + suffix = _IMAGE_TEMPLATE_SUFFIXES.get(family) + if suffix and len(cleaned.split()) < _PROMPT_ENHANCE_MIN_WORDS and suffix.strip() not in cleaned: + enhanced = cleaned.rstrip(",.!? ") + suffix + applied = True + + if applied: + reason_text = reason or "local LLM enhancer unavailable" + return EnhancementResult( + enhanced=enhanced, + note=f"Applied template prompt enhancement because {reason_text}", + modelUsed=None, + family=family, + ) + + return EnhancementResult( + enhanced=cleaned, + note=reason or "Prompt enhancer unavailable.", + modelUsed=None, + family=family, + ) + + +_SINGLETON = _EnhancerSingleton() + + +def reset_singleton_for_test() -> None: + """Test-only hook: forces the next ``enhance_prompt`` call to + re-load. 
Production code never calls this.""" + _SINGLETON.reset() + + +def enhance_prompt( + prompt: str, + *, + repo: str, + enabled: bool = True, + model_id: str = _DEFAULT_ENHANCER_MODEL, + max_tokens: int = 256, + template_fallback: bool = True, +) -> EnhancementResult: + """Synchronous entry point for the FastAPI route + the runtime + callbacks. + + Returns a template-enhanced prompt + a note when the LLM path can't + run (non-Apple, mlx_lm missing, model not cached, generation + crashes). ``template_fallback=False`` preserves the older no-op + fallback for tests and callers that need exact input retention. + """ + cleaned = (prompt or "").strip() + family = family_for(repo) + + if not enabled or not cleaned: + return EnhancementResult( + enhanced=cleaned, note=None, modelUsed=None, family=family, + ) + + loaded, reason = _SINGLETON.ensure_loaded(model_id) + if not loaded: + if template_fallback: + return _template_fallback(cleaned, repo=repo, family=family, reason=reason) + return EnhancementResult( + enhanced=cleaned, + note=reason or "Prompt enhancer unavailable.", + modelUsed=None, + family=family, + ) + + system_prompt = _FAMILY_SYSTEM_PROMPTS.get(family, _FAMILY_SYSTEM_PROMPTS["default"]) + try: + raw = _SINGLETON.generate(system_prompt, cleaned, max_tokens=max_tokens) + except Exception as exc: + LOG.exception("Prompt enhancer generation failed") + if template_fallback: + return _template_fallback( + cleaned, + repo=repo, + family=family, + reason=f"local LLM enhancer crashed ({type(exc).__name__}: {exc})", + ) + return EnhancementResult( + enhanced=cleaned, + note=( + f"Prompt enhancer crashed ({type(exc).__name__}: {exc}). " + "Using your original prompt verbatim." + ), + modelUsed=model_id, + family=family, + ) + + enhanced = raw.strip().strip('"').strip("'") + if not enhanced or len(enhanced.split()) < len(cleaned.split()): + # Model produced something shorter than input — likely a refusal + # or empty completion. Fall back to the original. + return EnhancementResult( + enhanced=cleaned, + note="Prompt enhancer returned an empty / shorter rewrite — using the original.", + modelUsed=model_id, + family=family, + ) + + note = ( + f"Prompt enhanced via {model_id} (family={family}, " + f"{len(cleaned.split())} → {len(enhanced.split())} words)." + ) + return EnhancementResult( + enhanced=enhanced, note=note, modelUsed=model_id, family=family, + ) diff --git a/backend_service/helpers/prompts.py b/backend_service/helpers/prompts.py index 0e5e265..023ec95 100644 --- a/backend_service/helpers/prompts.py +++ b/backend_service/helpers/prompts.py @@ -2,6 +2,7 @@ from __future__ import annotations import json +import re import time import uuid from pathlib import Path @@ -139,6 +140,11 @@ def create(self, data: dict[str, Any]) -> dict[str, Any]: "tags": data.get("tags", []), "category": data.get("category", "General"), "fewShotExamples": data.get("fewShotExamples", []), + # Phase 2.7: variable declarations + preset samplers + preset model + # default to empty / None so existing templates keep their shape. + "variables": _normalise_variables(data.get("variables", [])), + "presetSamplers": data.get("presetSamplers"), + "presetModelRef": data.get("presetModelRef"), "createdAt": now, "updatedAt": now, } @@ -155,6 +161,13 @@ def update(self, template_id: str, data: dict[str, Any]) -> dict[str, Any] | Non for key in ("name", "systemPrompt", "tags", "category", "fewShotExamples"): if key in data: existing[key] = data[key] + # Phase 2.7: optional fields — set when present, leave alone otherwise. 
+ if "variables" in data: + existing["variables"] = _normalise_variables(data["variables"]) + if "presetSamplers" in data: + existing["presetSamplers"] = data["presetSamplers"] + if "presetModelRef" in data: + existing["presetModelRef"] = data["presetModelRef"] existing["updatedAt"] = time.time() self.save() return existing @@ -198,3 +211,91 @@ def search( ] return results + + +# --------------------------------------------------------------------------- +# Phase 2.7: variable substitution helpers +# --------------------------------------------------------------------------- + +# Match `{{name}}` placeholders. Names are alphanumeric + underscore + dash; +# whitespace inside the braces is tolerated so users can write `{{ topic }}` +# in templates and still have it match the declared variable name `topic`. +_PLACEHOLDER_PATTERN = re.compile(r"\{\{\s*([A-Za-z0-9_\-]+)\s*\}\}") + +_VALID_VARIABLE_TYPES: tuple[str, ...] = ("string", "number", "boolean") + + +def _normalise_variables(raw: Any) -> list[dict[str, Any]]: + """Coerce a user-supplied variable list into the canonical schema. + + Each entry is `{name: str, type: "string"|"number"|"boolean", default: Any}`. + Invalid entries are dropped silently rather than raising — the UI + does the validation work; this layer just keeps storage clean. + """ + if not isinstance(raw, list): + return [] + cleaned: list[dict[str, Any]] = [] + seen_names: set[str] = set() + for entry in raw: + if not isinstance(entry, dict): + continue + name = entry.get("name") + if not isinstance(name, str) or not name.strip(): + continue + name = name.strip() + if name in seen_names: + continue + seen_names.add(name) + var_type = entry.get("type", "string") + if var_type not in _VALID_VARIABLE_TYPES: + var_type = "string" + cleaned.append({ + "name": name, + "type": var_type, + "default": entry.get("default"), + "description": str(entry.get("description") or "")[:200], + }) + return cleaned + + +def extract_placeholders(text: str) -> list[str]: + """Return the unique placeholder names present in `text`. + + Order is the order of first appearance — the form renderer uses this + to match declared-variable order with text-occurrence order so + declarations not present in the text fall to the bottom. + """ + if not text: + return [] + seen: list[str] = [] + seen_set: set[str] = set() + for match in _PLACEHOLDER_PATTERN.finditer(text): + name = match.group(1) + if name not in seen_set: + seen_set.add(name) + seen.append(name) + return seen + + +def apply_variables(text: str, values: dict[str, Any]) -> str: + """Replace `{{name}}` placeholders with stringified values. + + Missing names stay as the literal placeholder so the user notices + the gap in the assembled prompt rather than getting a silently + truncated message. Boolean / numeric values are coerced via str(). 
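+
+    A quick illustration (hypothetical template text, not a stored
+    template):
+
+        >>> apply_variables("Write about {{ topic }} using {{tone}}",
+        ...                 {"topic": "RAG"})
+        'Write about RAG using {{tone}}'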
+ """ + if not text: + return text + + def _sub(match: re.Match[str]) -> str: + name = match.group(1) + if name not in values: + return match.group(0) + value = values[name] + if value is None: + return "" + if isinstance(value, bool): + return "true" if value else "false" + return str(value) + + return _PLACEHOLDER_PATTERN.sub(_sub, text) diff --git a/backend_service/helpers/settings.py b/backend_service/helpers/settings.py index 226ab66..9d46751 100644 --- a/backend_service/helpers/settings.py +++ b/backend_service/helpers/settings.py @@ -169,6 +169,20 @@ def benchmarks_path(self) -> Path: def chat_sessions_path(self) -> Path: return self.data_dir / "chat-sessions.json" + @property + def workspaces_path(self) -> Path: + """Phase 3.7: workspace registry. JSON list of workspaces with + title + descriptions; documents live under workspaces_dir.""" + return self.data_dir / "workspaces.json" + + @property + def workspaces_dir(self) -> Path: + """Phase 3.7: per-workspace document directory. Each workspace + gets a subdirectory containing its uploaded files; the RAG + retriever reads from both this dir and the active session's + own documents dir.""" + return self.data_dir / "workspaces" + @property def documents_dir(self) -> Path: return self.data_dir / "documents" @@ -223,6 +237,8 @@ def _default_settings(default_port: int, data_dir: Path) -> dict[str, Any]: # drive. Moving existing models between locations is handled by # the ``/api/settings/storage/move`` endpoint. "hfCachePath": "", + # Phase 3.3: advanced-mode logprobs flag. Off by default. + "advancedLogprobs": False, } @@ -330,6 +346,8 @@ def _load_settings(path: Path, default_port: int, data_dir: Path) -> dict[str, A # preserve the secure default rather than silently opening the API. settings["requireApiAuth"] = bool(payload.get("requireApiAuth", True)) settings["autoStartServer"] = bool(payload.get("autoStartServer", False)) + # Phase 3.3: advanced-mode logprobs toggle. + settings["advancedLogprobs"] = bool(payload.get("advancedLogprobs", False)) settings["launchPreferences"] = _normalize_launch_preferences(payload.get("launchPreferences")) diff --git a/backend_service/helpers/system.py b/backend_service/helpers/system.py index fad84ce..7f33463 100644 --- a/backend_service/helpers/system.py +++ b/backend_service/helpers/system.py @@ -413,6 +413,32 @@ def _build_system_snapshot( compressed_memory_gb = _get_compressed_memory_gb() battery = _get_battery_info() + + # Discrete GPU VRAM (CUDA cards on Windows/Linux). Apple Silicon shares + # unified memory with the CPU so this stays None there -- the chat / + # video safety estimators already treat unified memory as a single pool. + # The chat-side cache-fit warning needs this number because llama.cpp + # places the KV cache on the GPU when ngl=999, so a 60 GB cache on a + # 24 GB 4090 fails far worse than the system-RAM check would suggest. + try: + from backend_service.helpers.gpu import get_device_vram_total_gb + gpu_vram_total_gb_raw = get_device_vram_total_gb() + except Exception: + gpu_vram_total_gb_raw = None + if ( + platform.system() == "Darwin" + and platform.machine() in ("arm64", "aarch64") + ): + # On Apple Silicon get_device_vram_total_gb returns the unified + # memory total (== totalMemoryGb). Reporting it as a separate + # "GPU VRAM" field would double-count and confuse the cache-fit + # message ("60 GB > 24 GB VRAM" on a 64 GB Mac). Leave it None + # so the consumer falls back to the unified totalMemoryGb. 
+ gpu_vram_total_gb: float | None = None + else: + gpu_vram_total_gb = gpu_vram_total_gb_raw + + # Memory pressure: used + compressed + swap as a fraction of total pressure_numerator = used_memory_gb + compressed_memory_gb + swap_used_gb memory_pressure_percent = ( @@ -467,6 +493,7 @@ def _get_dflash_info(): "llamaCliPath": native["llamaCliPath"], "nativeRuntimeMessage": native["mlxMessage"], "totalMemoryGb": total_memory_gb, + "gpuVramTotalGb": gpu_vram_total_gb, "availableMemoryGb": available_memory_gb, "usedMemoryGb": used_memory_gb, "swapUsedGb": swap_used_gb, diff --git a/backend_service/helpers/thermal.py b/backend_service/helpers/thermal.py new file mode 100644 index 0000000..4e9acef --- /dev/null +++ b/backend_service/helpers/thermal.py @@ -0,0 +1,96 @@ +"""Thermal-pressure read helpers for the runaway-watchdog stack. + +Phase 2.0.5-I: surface OS-level thermal warnings so the chat stream loop +can pause / warn when the host is throttling. On macOS we shell out to +`pmset -g therm` (works without sudo, returns a thermal warning level +string when one is recorded). Linux and Windows return None today — +both expose thermal data via vendor-specific paths that can be wired in +later when there's a per-OS UX story (NVML on NVIDIA, ACPI on Intel / +AMD, etc.). + +The function is best-effort. Any subprocess error or unparseable output +returns None so the caller can decide how to handle missing data +(usually: continue uninterrupted). +""" + +from __future__ import annotations + +import platform +import subprocess +from typing import Literal + + +ThermalState = Literal["nominal", "moderate", "critical"] + + +def read_thermal_state() -> ThermalState | None: + """Return the current thermal state, or None when unknown. + + macOS: parses `pmset -g therm`. The command emits one or more lines + in the form ` = `; specifically `CPU_Scheduler_Limit` + and `CPU_Available_CPUs` reflect throttling. We classify based on + the warning levels reported in the same output: + - "Thermal warning level set to 0" → nominal + - 1-2 → moderate + - 3+ → critical + + Other platforms: returns None (cross-platform thermal probes are + intentionally out of scope for Phase 2.0.5-I; revisit when we wire + the substrate-telemetry strip in Phase 3.5). + """ + if platform.system() != "Darwin": + return None + try: + result = subprocess.run( + ["pmset", "-g", "therm"], + capture_output=True, + text=True, + timeout=2.0, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + return None + if result.returncode != 0: + return None + return _classify_pmset_output(result.stdout) + + +def _classify_pmset_output(output: str) -> ThermalState | None: + """Pure helper for tests — classifies a pmset stdout string. + + `pmset -g therm` reports the highest-severity thermal warning the + kernel has recorded since boot, plus CPU scheduler / available-CPU + limits when active throttling is in effect. We map the reported + warning level to our three-state space. + """ + if not output: + return None + lower = output.lower() + # Explicit "no thermal warning level" — the host is fine. + if "no thermal warning level has been recorded" in lower: + return "nominal" + # "Thermal warning level set to N" lines. 
+ for line in lower.splitlines(): + if "thermal warning level set to" in line: + tail = line.rsplit("set to", 1)[-1].strip().rstrip(".") + try: + level = int(tail.split()[0]) + except (ValueError, IndexError): + continue + if level <= 0: + return "nominal" + if level <= 2: + return "moderate" + return "critical" + # CPU_Scheduler_Limit lower than 100 means active throttling — call + # that "moderate" so the watchdog at least surfaces a hint. + for line in lower.splitlines(): + if "cpu_scheduler_limit" in line: + tail = line.split("=", 1)[-1].strip().rstrip(".") + try: + limit = int(tail.split()[0]) + except (ValueError, IndexError): + continue + if limit < 100: + return "moderate" + return "nominal" + return None diff --git a/backend_service/helpers/video.py b/backend_service/helpers/video.py index d5b6684..63fea88 100644 --- a/backend_service/helpers/video.py +++ b/backend_service/helpers/video.py @@ -18,6 +18,10 @@ from backend_service.helpers.formatting import _bytes_to_gb from backend_service.helpers.huggingface import _format_release_label, _hf_repo_snapshot_dir from backend_service.helpers.images import _image_repo_live_metadata, _snapshot_on_disk_bytes +from backend_service.helpers.platform_filter import ( + filter_mlx_only_families, + is_apple_silicon, +) from backend_service.image_runtime import validate_local_diffusers_snapshot @@ -113,7 +117,7 @@ def _video_model_payloads(library: list[dict[str, Any]]) -> list[dict[str, Any]] payload = dict(family) payload["variants"] = variants families.append(payload) - return families + return filter_mlx_only_families(families, on_apple_silicon=is_apple_silicon()) def _find_video_variant(model_id: str) -> dict[str, Any] | None: diff --git a/backend_service/helpers/video_runtime_diagnostics.py b/backend_service/helpers/video_runtime_diagnostics.py new file mode 100644 index 0000000..7c8d503 --- /dev/null +++ b/backend_service/helpers/video_runtime_diagnostics.py @@ -0,0 +1,205 @@ +"""Translate opaque diffusers / transformers lazy-import errors into actionable +guidance for the Video Studio UI. + +Diffusers raises ``RuntimeError("Failed to import diffusers.pipelines.X.Y +because of the following error (look up to see its traceback): Could not +import module 'Z'. Are this object's requirements defined correctly?")`` +whenever any pipeline submodule import chain fails. The wrapped message +hides the real cause -- the user just sees a vague "module 'T5EncoderModel'" +hint with no path forward. + +This helper: + * recognises the wrapper text so we know to dig + * runs targeted in-process probes on the actual chain (transformers, + torchao, torch, sentencepiece, protobuf) to surface the underlying + error message + * formats a one-paragraph reason the UI can show in the row banner + +All probes are wrapped in try/except so we never raise from the diagnostics +helper itself -- if probing also fails we fall back to the original wrapped +text rather than masking it. +""" +from __future__ import annotations + +import importlib +import importlib.util +import re +from typing import Any + + +_DIFFUSERS_LAZY_IMPORT_PATTERN = re.compile( + r"Failed to import (?Pdiffusers[\w\.]+) because of the following error", + re.IGNORECASE, +) + + +def _probe_module_import_error(module_name: str) -> str | None: + """Return the underlying ImportError message when *module_name* won't load. + + Returns ``None`` when the module imports cleanly. 
Catches every exception + type because import-time errors aren't always ImportError -- a partial + install can raise AttributeError, RuntimeError, OSError, etc. + """ + try: + importlib.import_module(module_name) + except Exception as exc: + return f"{type(exc).__name__}: {exc}" + return None + + +def _probe_torch_device() -> dict[str, Any]: + """Inspect the installed torch wheel: version + CUDA availability. + + Returns ``{"installed": False}`` when torch isn't on the path. Otherwise + returns version + cuda_available + cuda_built_with so the caller can + flag the "CPU torch on a CUDA host" case explicitly. + """ + if importlib.util.find_spec("torch") is None: + return {"installed": False} + try: + import torch # type: ignore + return { + "installed": True, + "version": str(getattr(torch, "__version__", "unknown")), + "cuda_available": bool(getattr(torch, "cuda", None) and torch.cuda.is_available()), + "cuda_built_with": str(getattr(torch.version, "cuda", None) or ""), + } + except Exception as exc: + return {"installed": True, "import_error": f"{type(exc).__name__}: {exc}"} + + +def _format_torchao_torch_mismatch(torch_info: dict[str, Any]) -> str | None: + """Return a hint when torchao requires a newer torch than what's installed. + + The specific failure that triggered this helper: + ``AttributeError: module 'torch.utils._pytree' has no attribute + 'register_constant'`` + Newer torchao (>=0.10) uses ``register_constant`` which only exists from + torch 2.11. Older torch + newer torchao breaks the entire transformers + quantizer import chain, which then breaks T5EncoderModel. + """ + if not torch_info.get("installed"): + return None + if importlib.util.find_spec("torchao") is None: + return None + torchao_error = _probe_module_import_error("torchao.utils") + if torchao_error and "register_constant" in torchao_error: + torch_version = torch_info.get("version", "unknown") + return ( + "torchao is incompatible with the installed torch wheel " + f"({torch_version}). torchao >= 0.10 needs torch >= 2.11 -- " + "the missing torch.utils._pytree.register_constant attribute " + "breaks the transformers quantizer import chain, which is what " + "stops the T5 text encoder from loading. Open Settings > Setup " + "and re-run Install GPU runtime (torch will upgrade) or " + "uninstall torchao until torch is updated." + ) + return None + + +def _format_cpu_torch_on_cuda_host_warning(torch_info: dict[str, Any]) -> str | None: + """Detect the "you have a 4090 but the GPU bundle installed CPU torch" case. + + The +cpu local-version tag is the canonical marker. If the user has a + CUDA-capable host (we delegate that probe to nvidia_gpu_present) but + their torch is CPU-only, video models can technically load but they'll + run on CPU only -- effectively useless for any modern DiT. + """ + if not torch_info.get("installed"): + return None + version = str(torch_info.get("version") or "") + if "+cpu" not in version.lower(): + return None + try: + from backend_service.helpers.gpu import nvidia_gpu_present + nvidia_present = nvidia_gpu_present() + except Exception: + nvidia_present = False + if not nvidia_present: + return None + return ( + f"The installed torch wheel is CPU-only ({version}) even though an " + "NVIDIA GPU is present. Video generation will run on CPU, which is " + "unusable for modern video DiTs. Open Settings > Setup and click " + "Install CUDA torch (or re-run Install GPU runtime) so the CUDA " + "wheel replaces the CPU one. After it lands, click Restart Backend." 
+ ) + + +def diagnose_diffusers_lazy_import_error(error_text: str) -> str | None: + """Translate a diffusers lazy-import RuntimeError into a friendlier reason. + + Returns ``None`` when the error doesn't match the lazy-import wrapper + pattern (caller should fall back to the raw text). Otherwise returns a + paragraph that names the real broken dep and points the user at the + Setup page action that fixes it. + """ + if not error_text: + return None + + # ``module 'torch' has no attribute 'cuda'`` shows up when the install + # left torch importable but partially gutted -- typically a CPU wheel + # whose torch.cuda submodule failed to lazy-import because the C + # extension never finished loading. Or the user clicked Install CUDA + # torch, the request reached the backend, _purge_stale_torch_from_extras + # ran, the pip swap then failed, and torch on disk is now half a wheel. + # Either way the recovery is the same: re-run Install CUDA torch and + # restart the backend so the cached torch module is replaced. + lowered = error_text.lower() + if "module 'torch' has no attribute" in lowered or "torch has no attribute 'cuda'" in lowered: + return ( + "The backend Python's torch is partially broken -- torch imports " + "but its CUDA submodule is missing or failed to load (often a " + "half-installed wheel left over from an interrupted Install CUDA " + "torch run). Re-run Install CUDA torch from this banner, then " + "click Restart Backend so the cached broken torch is replaced." + ) + + if not _DIFFUSERS_LAZY_IMPORT_PATTERN.search(error_text): + return None + + torch_info = _probe_torch_device() + + # Highest-priority signals first: a fundamentally broken torch install + # invalidates every downstream "missing X" theory, so report it before + # checking sentencepiece / protobuf. + cpu_torch_hint = _format_cpu_torch_on_cuda_host_warning(torch_info) + if cpu_torch_hint: + return cpu_torch_hint + + torchao_hint = _format_torchao_torch_mismatch(torch_info) + if torchao_hint: + return torchao_hint + + # Walk the typical T5EncoderModel dependency chain in import order and + # report the first concrete failure. We check transformers itself last + # because its error often comes from a deeper module (quantizers, etc). + chain = [ + ("torch", "torch"), + ("sentencepiece", "sentencepiece"), + ("google.protobuf", "protobuf"), + ("transformers.quantizers", "transformers (quantizers submodule)"), + ("transformers", "transformers"), + ] + for module_name, friendly_name in chain: + if importlib.util.find_spec(module_name.split(".")[0]) is None: + return ( + f"The backend Python is missing {friendly_name}, which " + "diffusers needs to load the T5 text encoder. Open Settings " + f"> Setup and click Install {friendly_name.split(' ')[0]} " + "(or re-run Install GPU runtime to repair the whole stack), " + "then click Restart Backend." + ) + probe_error = _probe_module_import_error(module_name) + if probe_error: + return ( + f"The backend Python could not import {friendly_name}: " + f"{probe_error}. This is what's blocking the T5 text encoder " + "(and therefore CogVideoX, Wan, LTX, and HunyuanVideo). " + "Open Settings > Setup and re-run Install GPU runtime to " + "rebuild the dependency chain, then click Restart Backend." + ) + + # Probes all passed but diffusers still failed -- surface the original + # wrapped error rather than pretending we know what's wrong. 
+ return None diff --git a/backend_service/helpers/workspaces.py b/backend_service/helpers/workspaces.py new file mode 100644 index 0000000..5c27744 --- /dev/null +++ b/backend_service/helpers/workspaces.py @@ -0,0 +1,150 @@ +"""Phase 3.7: workspace knowledge stack registry. + +A workspace is a named bundle of documents that multiple chat +sessions can share. Each session can be assigned to at most one +workspace via `ChatSession.workspaceId`; when the RAG retriever +runs it sees both the session's own docs and the workspace's docs +under one merged corpus. + +Persistence: a JSON list at `/workspaces.json`, plus a +per-workspace subdirectory at `/workspaces//` for +uploaded files. + +This is a slim CRUD surface — Workspace metadata only (id, title, +description, doc list, timestamps). Document content stays in the +filesystem under the workspace's directory; the index entries on +the workspace point at filenames. +""" + +from __future__ import annotations + +import json +import time +import uuid +from pathlib import Path +from threading import RLock +from typing import Any + + +class WorkspaceRegistry: + """JSON-backed CRUD manager for workspace metadata.""" + + def __init__(self, registry_path: Path, workspaces_dir: Path) -> None: + self._lock = RLock() + self._path = Path(registry_path) + self._dir = Path(workspaces_dir) + self._workspaces: dict[str, dict[str, Any]] = {} + self.load() + + # -- Persistence -------------------------------------------------- + + def load(self) -> None: + with self._lock: + if not self._path.is_file(): + self._workspaces = {} + return + try: + raw = json.loads(self._path.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError): + self._workspaces = {} + return + if isinstance(raw, list): + self._workspaces = { + str(entry.get("id")): entry + for entry in raw + if isinstance(entry, dict) and entry.get("id") + } + elif isinstance(raw, dict): + self._workspaces = { + str(k): v for k, v in raw.items() + if isinstance(v, dict) + } + else: + self._workspaces = {} + + def save(self) -> None: + with self._lock: + self._path.parent.mkdir(parents=True, exist_ok=True) + payload = list(self._workspaces.values()) + self._path.write_text( + json.dumps(payload, indent=2, ensure_ascii=False), + encoding="utf-8", + ) + + # -- CRUD --------------------------------------------------------- + + def list_all(self) -> list[dict[str, Any]]: + with self._lock: + return [dict(entry) for entry in self._workspaces.values()] + + def get(self, workspace_id: str) -> dict[str, Any] | None: + with self._lock: + entry = self._workspaces.get(workspace_id) + return dict(entry) if entry else None + + def create(self, title: str, description: str = "") -> dict[str, Any]: + now = self._now_label() + workspace_id = uuid.uuid4().hex + entry: dict[str, Any] = { + "id": workspace_id, + "title": title or "Untitled workspace", + "description": description or "", + "documents": [], + "createdAt": now, + "updatedAt": now, + } + with self._lock: + self._workspaces[workspace_id] = entry + self.save() + (self._dir / workspace_id).mkdir(parents=True, exist_ok=True) + return dict(entry) + + def update( + self, + workspace_id: str, + *, + title: str | None = None, + description: str | None = None, + ) -> dict[str, Any] | None: + with self._lock: + existing = self._workspaces.get(workspace_id) + if existing is None: + return None + if title is not None: + existing["title"] = title + if description is not None: + existing["description"] = description + existing["updatedAt"] = self._now_label() + 
self.save() + return dict(existing) + + def delete(self, workspace_id: str) -> bool: + with self._lock: + if workspace_id not in self._workspaces: + return False + del self._workspaces[workspace_id] + self.save() + workspace_dir = self._dir / workspace_id + if workspace_dir.is_dir(): + # Remove the workspace's document directory + contents. + # We do this last so a save() failure above doesn't lose + # files from an undeleted workspace. + for child in workspace_dir.glob("**/*"): + if child.is_file(): + try: + child.unlink() + except OSError: + pass + try: + workspace_dir.rmdir() + except OSError: + # Non-empty (residual subdirs) — leave alone. + pass + return True + + def workspace_dir(self, workspace_id: str) -> Path: + return self._dir / workspace_id + + @staticmethod + def _now_label() -> str: + return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) diff --git a/backend_service/image_runtime.py b/backend_service/image_runtime.py index 5fd46ea..917a74c 100644 --- a/backend_service/image_runtime.py +++ b/backend_service/image_runtime.py @@ -10,7 +10,10 @@ import gc import secrets -from backend_service.helpers.gpu import nvidia_gpu_present as _nvidia_gpu_present +from backend_service.helpers.gpu import ( + nvidia_gpu_present as _nvidia_gpu_present, + torch_install_warning as _torch_install_warning, +) from colorsys import hsv_to_rgb from dataclasses import asdict, dataclass, field from pathlib import Path @@ -207,6 +210,90 @@ def _guess_expected_device() -> str | None: return "cpu" +def _windows_cuda_unavailable_message(torch: Any) -> str | None: + if platform.system() != "Windows" or not _nvidia_gpu_present(): + return None + cuda_module = getattr(torch, "cuda", None) + if cuda_module is None: + return ( + "CUDA torch is unavailable on this Windows NVIDIA host: torch imports " + "but has no torch.cuda module. Open Settings > Setup and click " + "Install CUDA torch, then Restart Backend." + ) + try: + cuda_available = bool(getattr(cuda_module, "is_available", lambda: False)()) + except Exception as exc: + return ( + "CUDA torch is unavailable on this Windows NVIDIA host: " + f"torch.cuda.is_available failed ({type(exc).__name__}: {exc}). " + "Open Settings > Setup and click Install CUDA torch, then Restart Backend." + ) + if not cuda_available: + return ( + "CUDA torch is unavailable on this Windows NVIDIA host. Open Settings > " + "Setup and click Install CUDA torch, then Restart Backend." + ) + return None + + +def _is_cuda_torch_unavailable_error(exc: Exception) -> bool: + return "CUDA torch is unavailable on this Windows NVIDIA host" in str(exc) + + +# FU-017: madebyollin's SDXL VAE fp16 fix. The stock SDXL VAE silently +# decodes to NaN at fp16 on MPS and on consumer CUDA fp16 paths — the +# image_runtime currently sidesteps the bug by forcing fp32 on MPS for +# SDXL repos, which doubles wall time. The fp16-fix VAE is a drop-in +# replacement (same architecture, weights re-quantised to avoid NaN +# overflow on fp16 sigmoid) so swapping it in lets MPS / CUDA stay on +# fp16 without producing black images. +# +# We only attempt the swap when the snapshot is already in the user's +# HF cache (``local_files_only=True``) — the runtime never triggers a +# surprise download. Users who haven't fetched the fix repo see the +# original fp32 fallback path. +_SDXL_VAE_FIX_REPO = "madebyollin/sdxl-vae-fp16-fix" + + +def _is_sdxl_repo(repo: str) -> bool: + """Match SDXL family repos (Stability XL base, refiner, community fine-tunes). 
+ + Matches loosely on substring — a false positive would attempt the + VAE swap on a non-SDXL repo, but the fp16-fix VAE only loads + successfully against an SDXL pipeline because the encoder/decoder + shape has to match. ``AutoencoderKL.from_pretrained`` raises on + mismatch and the swap silently no-ops, so an over-broad match is + self-correcting. + """ + lower = repo.lower() + return "stable-diffusion-xl" in lower or "sdxl" in lower or "sd_xl" in lower + + +def _locate_sdxl_vae_fix_snapshot() -> str | None: + """Return the local path to ``madebyollin/sdxl-vae-fp16-fix`` if cached. + + Uses ``snapshot_download(local_files_only=True)`` so a missing snapshot + returns ``None`` rather than triggering a download mid-generate. Users + who want the fp16-fix path opt in by downloading the repo from the + Setup page (or via ``huggingface-cli download``); until then the + runtime stays on the existing fp32-on-MPS fallback for SDXL. + """ + if importlib.util.find_spec("huggingface_hub") is None: + return None + try: + from huggingface_hub import snapshot_download # type: ignore + except Exception: + return None + try: + return snapshot_download( + repo_id=_SDXL_VAE_FIX_REPO, + local_files_only=True, + resume_download=True, + ) + except Exception: + return None + + def _is_flux_repo(repo: str) -> bool: """Does this HF repo look like a FLUX.1 family model? @@ -259,11 +346,68 @@ def _gguf_transformer_class_for_repo(repo: str) -> str | None: return None +def _nunchaku_transformer_class_for_repo(repo: str) -> str | None: + """FU-023: map a base repo to the Nunchaku transformer subclass. + + Nunchaku exports per-architecture wrappers for SVDQuant 4-bit weights: + FLUX family -> NunchakuFluxTransformer2dModel + Qwen-Image family -> NunchakuQwenImageTransformer2DModel + SD3 / SD3.5 -> NunchakuSD3Transformer2DModel + SANA -> NunchakuSanaTransformer2DModel + PixArt-Σ -> NunchakuPixArtSigmaTransformer2DModel + + Returns ``None`` for families Nunchaku hasn't shipped yet (Wan, + HunyuanVideo, LTX, Z-Image, ERNIE-Image) so the caller falls back + cleanly. v1.2.1 (2026-01-25) is the pin we ship; new families land + here when nunchaku adds matching subclasses. + """ + lowered = repo.lower() + if _is_flux_repo(repo): + return "NunchakuFluxTransformer2dModel" + if "qwen-image" in lowered or "qwen/qwen-image" in lowered: + return "NunchakuQwenImageTransformer2DModel" + if "stable-diffusion-3" in lowered or "sd3" in lowered: + return "NunchakuSD3Transformer2DModel" + if "sana" in lowered: + return "NunchakuSanaTransformer2DModel" + if "pixart-sigma" in lowered: + return "NunchakuPixArtSigmaTransformer2DModel" + return None + + +# FU-020: Align Your Steps (AYS) — NVIDIA's hand-optimised 10-step +# timestep schedules for SD1.5, SDXL and SVD. At 7-10 steps the AYS +# arrays preserve substantially more detail than DPM++ 2M Karras — +# the user study cited in the paper shows a 2× preference at low step +# counts. Numbers are the *timesteps* (not sigmas) the scheduler +# should sample at, not the count itself; passing them via +# ``pipeline(timesteps=...)`` overrides the standard +# ``num_inference_steps`` path. 
+# +# Reference: NVIDIA AYS project page, +# https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/ +_AYS_TIMESTEPS: dict[str, list[int]] = { + "sd15": [999, 850, 736, 645, 545, 455, 343, 233, 124, 24], + "sdxl": [999, 845, 730, 587, 443, 310, 193, 116, 53, 13], + # SVD reserved for the video runtime; not exposed in the image + # sampler dropdown today but registered here so the same + # ``_ays_family`` token works if/when we surface it on a video + # path. + "svd": [999, 963, 911, 833, 720, 562, 387, 219, 90, 8], +} + + # Maps a stable UI-facing sampler id to (diffusers scheduler class name, # optional from_config kwargs). The class is imported lazily from # ``diffusers`` so the runtime doesn't pay the import cost unless a user # actually picks a non-default sampler. Kwargs let us configure the # Karras/SDE variants without adding separate classes. +# +# The ``_ays_family`` key is a private marker consumed by +# ``_apply_scheduler`` — when present it pops out of the kwargs (so it +# never reaches diffusers' ``from_config``) and stashes the matching +# AYS timestep array on the pipeline for ``_build_pipeline_kwargs`` to +# pass via the ``timesteps=`` arg. _SAMPLER_REGISTRY: dict[str, tuple[str, dict[str, Any]]] = { "dpmpp_2m": ("DPMSolverMultistepScheduler", {}), "dpmpp_2m_karras": ("DPMSolverMultistepScheduler", {"use_karras_sigmas": True}), @@ -272,6 +416,8 @@ def _gguf_transformer_class_for_repo(repo: str) -> str | None: "euler_a": ("EulerAncestralDiscreteScheduler", {}), "ddim": ("DDIMScheduler", {}), "unipc": ("UniPCMultistepScheduler", {}), + "ays_dpmpp_2m_sd15": ("DPMSolverMultistepScheduler", {"_ays_family": "sd15"}), + "ays_dpmpp_2m_sdxl": ("DPMSolverMultistepScheduler", {"_ays_family": "sdxl"}), } @@ -282,6 +428,12 @@ def _apply_scheduler(pipeline: Any, sampler_id: str | None) -> str | None: nothing was), to surface in ``GeneratedImage.runtimeNote``. Silent failure modes (missing scheduler class on old diffusers, pipeline with no ``scheduler`` attribute) fall back to the model default. + + FU-020: when the registry entry includes the ``_ays_family`` private + marker, the matching AYS timestep array is stashed on + ``pipeline._chaosengine_ays_timesteps`` so + ``_build_pipeline_kwargs`` can pass it via the ``timesteps=`` arg + instead of the usual ``num_inference_steps``. """ if not sampler_id: return None @@ -290,7 +442,7 @@ def _apply_scheduler(pipeline: Any, sampler_id: str | None) -> str | None: return f"Unknown sampler '{sampler_id}' — using model default." if not hasattr(pipeline, "scheduler") or pipeline.scheduler is None: return None - class_name, extra_kwargs = entry + class_name, registry_kwargs = entry try: import diffusers # type: ignore except Exception: @@ -298,12 +450,35 @@ def _apply_scheduler(pipeline: Any, sampler_id: str | None) -> str | None: scheduler_cls = getattr(diffusers, class_name, None) if scheduler_cls is None: return f"Sampler '{sampler_id}' not available in installed diffusers." + # Pop private markers (e.g. ``_ays_family``) before passing to + # ``from_config`` — diffusers rejects unknown kwargs. + extra_kwargs = dict(registry_kwargs) + ays_family = extra_kwargs.pop("_ays_family", None) try: pipeline.scheduler = scheduler_cls.from_config( pipeline.scheduler.config, **extra_kwargs, ) except Exception as exc: return f"Sampler swap to '{sampler_id}' failed: {type(exc).__name__}. Using model default." 
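As a point of reference before the marker handling below, this is roughly what the AYS override amounts to when driven by hand in plain diffusers. It is a sketch only: it assumes a recent diffusers release whose SDXL pipeline accepts an explicit `timesteps=` list and a checkpoint already in the local HF cache, and the repo id and prompt are illustrative rather than anything the runtime actually calls.

```python
# Illustrative sketch, not the runtime's code path. Assumes a diffusers
# version whose SDXL __call__ accepts an explicit timesteps= override.
import torch
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler

AYS_SDXL = [999, 845, 730, 587, 443, 310, 193, 116, 53, 13]  # _AYS_TIMESTEPS["sdxl"]

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",  # example repo
    torch_dtype=torch.float16,
).to("cuda")
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)

# Passing the hand-tuned timestep array instead of num_inference_steps:
# the scheduler samples exactly these ten points, which is what the
# stashed _chaosengine_ays_timesteps array feeds into the pipeline call
# via _build_pipeline_kwargs further down.
image = pipe("a lighthouse at dusk", timesteps=AYS_SDXL).images[0]
```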
+ if ays_family: + timesteps = _AYS_TIMESTEPS.get(ays_family) + if timesteps: + try: + pipeline._chaosengine_ays_timesteps = list(timesteps) # type: ignore[attr-defined] + except Exception: + # Pipeline objects are usually attribute-friendly, but + # if a future diffusers version locks slots we swallow + # and keep the swap-only behaviour rather than failing + # the run. + pass + return f"Sampler: {sampler_id} ({len(timesteps or [])}-step AYS)" + # Clear any stale stash from a previous AYS-using generate so a + # later non-AYS run doesn't reuse the timestep array. + if hasattr(pipeline, "_chaosengine_ays_timesteps"): + try: + delattr(pipeline, "_chaosengine_ays_timesteps") + except Exception: + pass return f"Sampler: {sampler_id}" @@ -354,6 +529,11 @@ class ImageRuntimeStatus: # base M2. ``None`` means detection failed; the frontend falls back # to MPS-strict defaults. deviceMemoryGb: float | None = None + # ``torchInstallWarning`` -- mirrors VideoRuntimeStatus. Surfaces + # the "torch is +cpu but you have a CUDA card" / "torch missing" + # mismatch that otherwise hides behind a misleadingly green + # "Real engine ready" + "Device: cuda (expected)" badge pair. + torchInstallWarning: str | None = None def to_dict(self) -> dict[str, Any]: return asdict(self) @@ -396,6 +576,57 @@ class ImageGenerationConfig: # strategy's default (0.4 for TeaCache → ~1.8× speedup). See # ``TeaCacheStrategy.recommended_thresholds()`` for presets. cacheRelL1Thresh: float | None = None + # FU-021: CFG decay schedule, mirroring the video runtime knob. When + # True and the model is flow-match (FLUX/SD3/Qwen-Image/Sana/HiDream), + # the engine ramps ``guidance_scale`` linearly from the user's + # setting at step 0 toward 1.5 (the floor that keeps + # ``do_classifier_free_guidance`` True end-to-end). Default off: + # image users typically want consistent CFG; turning on the knob is + # opt-in. Non-flow-match repos (SD1.5/SDXL) ignore the flag because + # CFG decay on UNet-based ε-prediction pipelines doesn't carry the + # same oversaturation benefit. + cfgDecay: bool = False + # FU-018: TAESD / TAEHV preview-decode VAE swap. Preview-only quality + # knob — when True the engine swaps ``pipeline.vae`` for the matching + # tiny VAE before the first denoise so each step decodes in a fraction + # of the wall-time. Final output goes through the same fast VAE; users + # trade fidelity for iteration speed. Default off. + previewVae: bool = False + # FU-019 distill LoRAs: when the catalog variant pins a LoRA + # (Hyper-SD FLUX, alimama FLUX.1-Turbo-Alpha, lightx2v Wan + # CausVid), the engine fuses it into the pipeline at load time so + # subsequent generates run at the LoRA's lower step count without + # re-loading. ``loraRepo`` is the HF repo id, ``loraFile`` is the + # specific weight name within that repo (LoRAs commonly ship + # multiple step variants), ``loraScale`` is the fuse strength + # (Hyper-SD recommends 0.125, alimama Turbo wants 1.0, lightx2v + # CausVid wants 1.0). + loraRepo: str | None = None + loraFile: str | None = None + loraScale: float | None = None + # Variant-declared step / CFG defaults. Used by + # ``_generate_image_artifacts`` in app.py to substitute the schema + # defaults when the user hasn't moved the sliders — distill LoRAs + # have very different optimal points (4-8 steps, CFG 1.0-3.5) + # than the schema defaults (24 steps, CFG 5.5). 
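The substitution itself happens in app.py and is not reproduced in this file; the sketch below only illustrates the rule the comment describes. `SCHEMA_DEFAULT_STEPS`, `SCHEMA_DEFAULT_CFG`, and the equality-based comparison are hypothetical stand-ins, not the real constants or logic in `_generate_image_artifacts`.

```python
# Hypothetical sketch of the default-substitution rule described above.
# Constant names and the comparison strategy are illustrative only.
SCHEMA_DEFAULT_STEPS = 24
SCHEMA_DEFAULT_CFG = 5.5

def effective_steps_and_cfg(
    user_steps: int,
    user_cfg: float,
    default_steps: int | None,
    cfg_override: float | None,
) -> tuple[int, float]:
    # Substitute only when the user left the slider on the schema
    # default; an explicit user choice always wins.
    steps = default_steps if (user_steps == SCHEMA_DEFAULT_STEPS and default_steps) else user_steps
    cfg = cfg_override if (user_cfg == SCHEMA_DEFAULT_CFG and cfg_override is not None) else user_cfg
    return steps, cfg

# e.g. a distill-LoRA variant pinning defaultSteps=8, cfgOverride=1.0:
assert effective_steps_and_cfg(24, 5.5, 8, 1.0) == (8, 1.0)   # untouched sliders
assert effective_steps_and_cfg(30, 5.5, 8, 1.0) == (30, 1.0)  # user moved steps
```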
+ defaultSteps: int | None = None + cfgOverride: float | None = None + # FU-023 Nunchaku / SVDQuant: 4-bit weight quantization for FLUX, + # Qwen-Image, SD3.5, SANA, PixArt-Σ on CUDA. ~3× over NF4 on FLUX.1-dev. + # ``nunchakuRepo`` pins the precompiled SVDQuant snapshot (e.g. + # ``mit-han-lab/svdq-int4-flux.1-dev``); ``nunchakuFile`` is optional + # for repos that ship multiple precision tiers. CUDA only — the helper + # falls back to the standard transformer when the import fails or the + # device isn't ``cuda``. + nunchakuRepo: str | None = None + nunchakuFile: str | None = None + # FU-024 FP8 layerwise casting (CUDA SM 8.9+, e.g. RTX 4090 / H100). + # When True the engine calls ``transformer.enable_layerwise_casting`` + # post-load with the family-correct fp8 dtype (E4M3 for FLUX / Wan, + # E5M2 for HunyuanVideo). No-op on Apple Silicon, CPU, and pre-Ada + # GPUs — the helper guards before invoking. Defaults off so users + # opt-in once their hardware is confirmed. + fp8LayerwiseCasting: bool = False @dataclass(frozen=True) @@ -528,6 +759,12 @@ def __init__(self) -> None: self._loaded_path: str | None = None self._loaded_variant_key: str | None = None self._device: str | None = None + # FU-017 / FU-019 / FU-016: notes accumulated during pipeline load + # (VAE swap, LoRA fuse, attention backend). Surfaced as part of + # ``runtimeNote`` on every GeneratedImage produced by the loaded + # pipeline so the user sees what was applied without polling + # capabilities mid-batch. Reset on each pipeline load. + self._load_notes: list[str] = [] def probe(self) -> ImageRuntimeStatus: # Deliberately does NOT ``import torch`` — that would load @@ -537,6 +774,17 @@ def probe(self) -> ImageRuntimeStatus: # find_spec answers "is it installable?" without triggering the # import side effects. Device detection (cuda vs cpu) is deferred # to preload/generate where we're about to import torch anyway. + # + # ``invalidate_caches`` matters when the GPU bundle install has + # finished mid-process: pip writes the new packages into the + # extras dir (already on ``sys.path`` from process start), but + # ``importlib`` keeps a per-finder cache of negative lookups, so + # the find_spec calls below would still report None even though + # the .dist-info folders are sitting on disk. Calling + # ``invalidate_caches`` first re-walks the path entries so the + # newly installed packages are picked up without a process + # restart. 
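A stdlib-only illustration of the staleness being worked around here; nothing in it is project specific.

```python
# If a package is pip-installed into a directory that was already on
# sys.path after this process has once failed to find it, find_spec
# keeps returning None until the per-finder caches are invalidated.
import importlib
import importlib.util

def probe_package(name: str) -> bool:
    importlib.invalidate_caches()          # re-walk the path entries first
    return importlib.util.find_spec(name) is not None

# Right after the GPU-bundle install finishes mid-process,
# probe_package("torch") reflects the on-disk state without a restart.
```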
+ importlib.invalidate_caches() missing = [ package for package, module_name in ( @@ -560,6 +808,7 @@ def probe(self) -> ImageRuntimeStatus: pythonExecutable=_resolve_image_python(), message=message, loadedModelRepo=self._loaded_repo, + torchInstallWarning=_torch_install_warning(), ) message = ( @@ -585,6 +834,7 @@ def probe(self) -> ImageRuntimeStatus: message=message, loadedModelRepo=self._loaded_repo, deviceMemoryGb=device_memory_gb, + torchInstallWarning=_torch_install_warning(), ) def generate(self, config: ImageGenerationConfig) -> list[GeneratedImage]: @@ -603,6 +853,13 @@ def generate(self, config: ImageGenerationConfig) -> list[GeneratedImage]: config.repo, gguf_repo=config.ggufRepo, gguf_file=config.ggufFile, + lora_repo=config.loraRepo, + lora_file=config.loraFile, + lora_scale=config.loraScale, + preview_vae=config.previewVae, + nunchaku_repo=config.nunchakuRepo, + nunchaku_file=config.nunchakuFile, + fp8_layerwise_casting=config.fp8LayerwiseCasting, ) # Early-cancel check: the load phase is blocking (from_pretrained # is a C-extension call we can't interrupt), so if the user hit @@ -643,7 +900,14 @@ def generate(self, config: ImageGenerationConfig) -> list[GeneratedImage]: # most models. ``callback_on_step_end`` is the non-deprecated name # in modern diffusers (>=0.27); some pipelines also accept the # legacy ``callback`` arg, but we prefer the new one. - total_steps = int(kwargs.get("num_inference_steps", config.steps) or config.steps) + # AYS path passes ``timesteps=[...]`` instead of + # ``num_inference_steps`` — derive the step count from the + # array length so the progress bar / decay schedule still + # report the right total. + if isinstance(kwargs.get("timesteps"), list): + total_steps = len(kwargs["timesteps"]) + else: + total_steps = int(kwargs.get("num_inference_steps", config.steps) or config.steps) IMAGE_PROGRESS.set_phase( PHASE_DIFFUSING, message=self._diffuse_message(config), @@ -674,6 +938,33 @@ def generate(self, config: ImageGenerationConfig) -> list[GeneratedImage]: # to every image's metadata would flood the gallery UI. pass + # FU-021: CFG decay schedule for flow-match image pipelines. + # Same shape as the video-runtime ramp — linear from initial + # guidance to a 1.5 floor that keeps + # ``do_classifier_free_guidance`` True for the entire schedule + # (dropping below 1.0 mid-loop swaps the pipeline from + # 2-batch to 1-batch shape and produces shape-mismatch + # crashes; 1.5 is the documented floor we use on video). + # Gated to flow-match so SD1.5 / SDXL stay on constant CFG. + decay_floor = 1.5 + initial_guidance = float(kwargs.get("guidance_scale", config.guidance) or config.guidance) + decay_active = ( + config.cfgDecay + and _is_flow_matching_repo(config.repo) + and total_steps > 1 + and initial_guidance > decay_floor + ) + + # FU-018 part 2: live denoise thumbnails. Emit a base64 PNG + # of the current latent every Nth step when previewVae is on + # (the swap to TAESD makes per-step decode cheap enough to do + # without dragging total wall time). Stride keeps the polled + # endpoint payload manageable on long schedules — 50 steps at + # one decode each would push 1.5 MB of base64 through the + # poller per gen. Always emit on the final step. + thumb_active = bool(config.previewVae) + thumb_stride = max(1, total_steps // 8) if thumb_active else 1 + def _on_step_end(_pipeline: Any, step: int, _timestep: Any, callback_kwargs: dict[str, Any]): # Diffusers calls this *after* step ``step`` finishes, so step # 0 means "one step done". 
Convert to the 1-indexed value the @@ -692,6 +983,33 @@ def _on_step_end(_pipeline: Any, step: int, _timestep: Any, callback_kwargs: dic except Exception: pass raise GenerationCancelled("Image generation cancelled by user") + if decay_active: + next_step = step + 1 + progress = min(1.0, next_step / max(1, total_steps - 1)) + next_scale = ( + initial_guidance * (1.0 - progress) + + decay_floor * progress + ) + try: + _pipeline.guidance_scale = float(next_scale) + except Exception: + pass + if thumb_active: + is_final = (step + 1) >= total_steps + if is_final or (step % thumb_stride == 0): + latents = callback_kwargs.get("latents") if callback_kwargs else None + try: + from backend_service.helpers.preview_thumbnails import ( + decode_image_latent_to_b64, + ) + b64 = decode_image_latent_to_b64(_pipeline, latents) + if b64 is not None: + IMAGE_PROGRESS.set_thumbnail(b64) + except Exception: + # Thumbnail decode is best-effort — never fail + # the actual generation because of a preview + # decode error. + pass return callback_kwargs kwargs.setdefault("callback_on_step_end", _on_step_end) @@ -729,6 +1047,15 @@ def _on_step_end(_pipeline: Any, step: int, _timestep: Any, callback_kwargs: dic ) buffer = io.BytesIO() image.save(buffer, format="PNG", optimize=True) + # Combine all per-load notes (VAE swap, LoRA fuse, + # attention backend) with the per-generate sampler note. + # Joined with " · " so the UI can show a single line. + note_parts: list[str] = list(self._load_notes) + if sampler_note: + note_parts.append(sampler_note) + if cache_note: + note_parts.append(cache_note) + runtime_note = " · ".join(note_parts) if note_parts else None artifacts.append( GeneratedImage( seed=base_seed + index, @@ -737,7 +1064,7 @@ def _on_step_end(_pipeline: Any, step: int, _timestep: Any, callback_kwargs: dic mimeType="image/png", durationSeconds=round(elapsed / max(1, config.batchSize), 1), runtimeLabel=f"{self.runtime_label} ({self._device or 'cpu'})", - runtimeNote=sampler_note, + runtimeNote=runtime_note, ) ) if not artifacts: @@ -771,9 +1098,34 @@ def _ensure_pipeline( repo: str, gguf_repo: str | None = None, gguf_file: str | None = None, + lora_repo: str | None = None, + lora_file: str | None = None, + lora_scale: float | None = None, + preview_vae: bool = False, + nunchaku_repo: str | None = None, + nunchaku_file: str | None = None, + fp8_layerwise_casting: bool = False, ) -> Any: with self._lock: - variant_key = f"{repo}::{gguf_file}" if gguf_file else repo + # Variant key folds LoRA identity in too — switching LoRAs + # on the same base repo must rebuild the pipeline because + # ``fuse_lora`` mutates the transformer weights in place. + # ``preview_vae`` joins the same key set so toggling the + # FU-018 preview-decode knob triggers a clean rebuild. 
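Numerically, the ramp applied in `_on_step_end` above works out as below. The sketch re-derives the per-step guidance values for one example run (initial CFG 5.5, 8 steps) using the same interpolation and 1.5 floor; the value set after the final step never influences output, so it is omitted.

```python
# Re-derivation of the FU-021 decay ramp for an example run.
def cfg_schedule(initial: float, total_steps: int, floor: float = 1.5) -> list[float]:
    values = []
    for next_step in range(1, total_steps):  # scale set after each completed step
        progress = min(1.0, next_step / max(1, total_steps - 1))
        values.append(initial * (1.0 - progress) + floor * progress)
    return values

print([round(v, 2) for v in cfg_schedule(5.5, 8)])
# -> [4.93, 4.36, 3.79, 3.21, 2.64, 2.07, 1.5]
```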
+ variant_parts = [repo] + if gguf_file: + variant_parts.append(f"gguf={gguf_file}") + if lora_repo and lora_file: + variant_parts.append(f"lora={lora_repo}/{lora_file}@{lora_scale or 1.0}") + if preview_vae: + variant_parts.append("preview_vae") + if nunchaku_repo: + variant_parts.append( + f"nunchaku={nunchaku_repo}{'/' + nunchaku_file if nunchaku_file else ''}" + ) + if fp8_layerwise_casting: + variant_parts.append("fp8_layerwise") + variant_key = "::".join(variant_parts) if self._pipeline is not None and self._loaded_variant_key == variant_key: return self._pipeline @@ -800,8 +1152,21 @@ def _ensure_pipeline( raise RuntimeError(validation_error) detected_device = self._detect_device(torch) device = self._preferred_execution_device(repo, detected_device) - dtype = self._preferred_torch_dtype(torch, repo, device) + # FU-017: probe the SDXL fp16-fix VAE before deciding dtype so + # SDXL on MPS can stay on fp16 when the fix snapshot is cached. + # Probe only fires for SDXL repos on devices that actually + # benefit (MPS / CUDA) — CPU stays on fp32 regardless. + sdxl_vae_fix_path: str | None = None + if _is_sdxl_repo(repo) and device in ("mps", "cuda"): + sdxl_vae_fix_path = _locate_sdxl_vae_fix_snapshot() + dtype = self._preferred_torch_dtype( + torch, repo, device, + sdxl_vae_fix_available=sdxl_vae_fix_path is not None, + ) use_cpu_offload = self._should_use_model_cpu_offload(repo, device) + # Clear load notes on each pipeline (re)load so stale entries + # from a previously-loaded model don't bleed into new outputs. + self._load_notes = [] # Three transformer-loading strategies, in preference order: # 1. GGUF (cross-platform, any quant level the user picked) @@ -812,6 +1177,7 @@ def _ensure_pipeline( # on CUDA when no GGUF file was specified. pipeline_kwargs: dict[str, Any] = {} gguf_note: str | None = None + nunchaku_note: str | None = None if gguf_file: IMAGE_PROGRESS.set_phase( PHASE_LOADING, @@ -827,6 +1193,30 @@ def _ensure_pipeline( pipeline_kwargs["transformer"] = quantized_transformer if gguf_note: IMAGE_PROGRESS.set_phase(PHASE_LOADING, message=gguf_note) + # FU-023 Nunchaku / SVDQuant — preferred path on CUDA when the + # variant pins a Nunchaku snapshot. Wins over NF4 / int8wo by + # roughly 3× on FLUX.1-dev. CUDA only; the helper falls back to + # the standard transformer when nunchaku isn't installed or the + # device is mps/cpu so the rest of the runtime keeps working. + if ( + "transformer" not in pipeline_kwargs + and nunchaku_repo + and device == "cuda" + ): + IMAGE_PROGRESS.set_phase( + PHASE_LOADING, + message=f"Loading Nunchaku SVDQuant transformer {nunchaku_repo}", + ) + quantized_transformer, nunchaku_note = self._try_load_nunchaku_transformer( + repo=repo, + nunchaku_repo=nunchaku_repo, + nunchaku_file=nunchaku_file, + torch=torch, + ) + if quantized_transformer is not None: + pipeline_kwargs["transformer"] = quantized_transformer + if nunchaku_note: + IMAGE_PROGRESS.set_phase(PHASE_LOADING, message=nunchaku_note) if ( "transformer" not in pipeline_kwargs and device == "mps" @@ -875,6 +1265,118 @@ def _ensure_pipeline( pipeline.requires_safety_checker = False if hasattr(pipeline, "set_progress_bar_config"): pipeline.set_progress_bar_config(disable=True) + + # FU-017: swap in madebyollin's SDXL VAE fp16-fix when the + # snapshot is cached. 
The pipeline already loaded with fp16 + # weights (decided above) so the VAE swap is the load-bearing + # piece — without it the stock SDXL VAE silently NaN-overflows + # on the fp16 sigmoid and outputs black images on MPS / consumer + # CUDA. Failure modes (corrupt snapshot, dtype mismatch) fall + # back to the original VAE so the user still gets *some* image. + if sdxl_vae_fix_path and getattr(pipeline, "vae", None) is not None: + try: + from diffusers import AutoencoderKL # type: ignore + fix_vae = AutoencoderKL.from_pretrained( + sdxl_vae_fix_path, + torch_dtype=torch.float16, + local_files_only=True, + ) + pipeline.vae = fix_vae + self._load_notes.append("VAE: SDXL fp16-fix") + except Exception as exc: # noqa: BLE001 — fall back to stock VAE + self._load_notes.append( + f"SDXL VAE fp16-fix swap failed ({type(exc).__name__}); using stock VAE." + ) + + # FU-016: SageAttention CUDA backend. No-op on MPS / CPU and + # when the pipeline lacks ``transformer.set_attention_backend``. + # Stacks multiplicatively with FBCache. Must run *before* + # placement so the kernel selection is locked in before the + # first forward pass. + try: + from backend_service.helpers.attention_backend import ( + maybe_apply_sage_attention, + ) + sage_note = maybe_apply_sage_attention(pipeline) + if sage_note: + self._load_notes.append(sage_note) + except Exception: + # Helper is wrapped in its own try/except; any leakage + # here is a bug in the helper, not a runtime concern. + pass + + # FU-018: TAESD preview-decode VAE swap. No-op when toggle + # is off or no preview VAE is mapped for this repo. Runs + # before LoRA fuse so the LoRA's adapter modules don't trip + # the VAE swap (they target the transformer, not the VAE, + # but ordering keeps the swap close to other VAE-touching + # code like the SDXL fp16-fix above). + try: + from backend_service.helpers.preview_vae import ( + maybe_apply_preview_vae, + ) + preview_note = maybe_apply_preview_vae( + pipeline, repo=repo, enabled=preview_vae + ) + if preview_note: + self._load_notes.append(preview_note) + except Exception: + pass + + # FU-024 FP8 layerwise casting (CUDA SM 8.9+ / Ada+ / Hopper+). + # Halves transformer VRAM by storing weights in fp8 and + # promoting to bf16 only inside the matmul. Diffusers exposes + # ``enable_layerwise_casting`` on every flow-match DiT we ship. + # Family-correct fp8 dtype: E4M3 for FLUX / Wan / Qwen-Image, + # E5M2 for HunyuanVideo (hunyuan team's recommendation in + # their model card). No-op outside CUDA. + if fp8_layerwise_casting and device == "cuda": + try: + fp8_note = self._maybe_enable_fp8_layerwise( + pipeline, repo=repo, torch=torch, + ) + if fp8_note: + self._load_notes.append(fp8_note) + except Exception as exc: # noqa: BLE001 — any failure → bf16 + self._load_notes.append( + f"FP8 layerwise casting failed ({type(exc).__name__}: " + f"{exc}) — running bf16." + ) + + # FU-019: distill LoRAs (Hyper-SD FLUX, alimama FLUX.1-Turbo, + # lightx2v Wan CausVid). Load + fuse at pipeline build time + # so subsequent ``pipeline(...)`` calls run with the LoRA + # baked into the transformer — no per-generate fuse cost. + # ``unload_lora_weights`` after fuse drops the un-fused + # state dict from RAM (the fused weights live in the + # transformer itself). 
+ if lora_repo and lora_file: + try: + pipeline.load_lora_weights( + lora_repo, + weight_name=lora_file, + local_files_only=True, + ) + effective_scale = ( + float(lora_scale) if lora_scale is not None else 1.0 + ) + pipeline.fuse_lora(lora_scale=effective_scale) + try: + pipeline.unload_lora_weights() + except Exception: + # Best-effort cleanup — older diffusers don't + # always succeed at unloading after fuse, and + # the fused transformer is correct either way. + pass + self._load_notes.append( + f"LoRA: {lora_repo}/{lora_file} @ scale {effective_scale:.3f}" + ) + except Exception as exc: # noqa: BLE001 — non-fatal + self._load_notes.append( + f"LoRA load failed ({type(exc).__name__}: {exc}). " + "Pipeline continuing without LoRA." + ) + if use_cpu_offload: # Diffusers' stock recipe for FLUX on <32 GB VRAM: keep only # the active component (T5, then transformer, then VAE) on @@ -937,7 +1439,13 @@ def _release_pipeline(self) -> None: except Exception: pass - def _preferred_torch_dtype(self, torch: Any, repo: str, device: str) -> Any: + def _preferred_torch_dtype( + self, + torch: Any, + repo: str, + device: str, + sdxl_vae_fix_available: bool = False, + ) -> Any: if device == "cuda": # FLUX was trained and validated in bfloat16. Loading it as # float16 produces slightly off saturations and occasional @@ -950,8 +1458,14 @@ def _preferred_torch_dtype(self, torch: Any, repo: str, device: str) -> Any: if device == "mps": lowered_repo = repo.lower() # SDXL / Stable Diffusion on MPS can silently decode to black - # images in fp16. Favor correctness over speed for those repos. + # images in fp16 due to the stock SDXL VAE overflowing the + # fp16 sigmoid. FU-017: when madebyollin/sdxl-vae-fp16-fix is + # cached locally we swap that VAE in and stay on fp16 (≈2× + # faster than fp32). Without the fix snapshot we keep the + # safe fp32 fallback so users still get correct images. if any(token in lowered_repo for token in ("stable-diffusion", "sdxl", "sd_xl")): + if sdxl_vae_fix_available and _is_sdxl_repo(repo): + return torch.float16 return torch.float32 return torch.float16 return torch.float32 @@ -1126,12 +1640,23 @@ def _try_load_gguf_transformer( filename=gguf_file, local_files_only=True, ) + # Pin the architecture config to the base repo's + # ``transformer/config.json`` — without this hint + # ``from_single_file`` falls back to the transformer class's + # default layout, which is fine for the largest variant in a + # family but breaks smaller variants (different cross-attn + # dim, hidden size, layer count). Mirrors the video-side + # loader. See ``backend_service/video_runtime.py``'s + # ``_try_load_gguf_transformer`` for the Wan 2.2 5B repro + # that motivated the fix. transformer = transformer_cls.from_single_file( gguf_local_path, quantization_config=GGUFQuantizationConfig( compute_dtype=torch.bfloat16, ), torch_dtype=torch.bfloat16, + config=repo, + subfolder="transformer", ) return transformer, ( f"Transformer loaded from GGUF ({gguf_file})" @@ -1171,6 +1696,18 @@ def _build_pipeline_kwargs(self, config: ImageGenerationConfig, generator: Any) "num_images_per_prompt": config.batchSize, "generator": generator, } + # FU-020: when the user picked an AYS sampler, + # ``_apply_scheduler`` stashed the precomputed timestep array on + # the pipeline. Diffusers accepts ``timesteps=`` as an explicit + # override; when present it takes precedence over + # ``num_inference_steps`` so we drop the latter to avoid the + # "got both" warning. 
+ pipeline = self._pipeline + if pipeline is not None: + ays_timesteps = getattr(pipeline, "_chaosengine_ays_timesteps", None) + if ays_timesteps: + kwargs["timesteps"] = list(ays_timesteps) + kwargs.pop("num_inference_steps", None) lowered_repo = config.repo.lower() if "qwen-image" in lowered_repo: kwargs.pop("guidance_scale", None) @@ -1183,13 +1720,144 @@ def _build_pipeline_kwargs(self, config: ImageGenerationConfig, generator: Any) return kwargs def _detect_device(self, torch: Any) -> str: - if getattr(torch.cuda, "is_available", lambda: False)(): - return "cuda" + cuda_module = getattr(torch, "cuda", None) + if cuda_module is not None: + try: + if getattr(cuda_module, "is_available", lambda: False)(): + return "cuda" + except Exception: + pass + cuda_error = _windows_cuda_unavailable_message(torch) + if cuda_error: + raise RuntimeError(cuda_error) mps_backend = getattr(getattr(torch, "backends", None), "mps", None) if mps_backend is not None and getattr(mps_backend, "is_available", lambda: False)(): return "mps" return "cpu" + def _try_load_nunchaku_transformer( + self, + repo: str, + nunchaku_repo: str, + nunchaku_file: str | None, + torch: Any, + ) -> tuple[Any, str | None]: + """FU-023: load a Nunchaku SVDQuant transformer for FLUX / Qwen-Image + / SD3.5 / SANA / PixArt-Σ. CUDA only. + + Nunchaku ships dedicated transformer subclasses + (``NunchakuFluxTransformer2dModel``, ``NunchakuQwenImageTransformer2DModel``, + etc.) that load precompiled INT4 SVDQuant weights and expose the + same forward signature as the stock diffusers transformer, so the + rest of ``_ensure_pipeline`` keeps working without further + plumbing. ~3× perf over NF4 on FLUX.1-dev. + + Returns ``(transformer, note)`` matching the NF4 / GGUF helper + contract — ``None`` transformer means the caller should fall back. + """ + if importlib.util.find_spec("nunchaku") is None: + return None, ( + "Nunchaku package not installed — install it from the Setup " + "page to enable SVDQuant 4-bit on CUDA. Falling back to " + "the standard transformer." + ) + cls_name = _nunchaku_transformer_class_for_repo(repo) + if cls_name is None: + return None, ( + f"No Nunchaku transformer class registered for {repo}. " + "Add a mapping in image_runtime._nunchaku_transformer_class_for_repo." + ) + try: + import nunchaku # type: ignore + except ImportError as exc: + return None, ( + f"Nunchaku import failed ({exc}). Install nunchaku>=1.2.1 " + "from the Setup page." + ) + cls = getattr(nunchaku, cls_name, None) + if cls is None: + return None, ( + f"{cls_name} not in installed nunchaku — upgrade via the " + "Setup page to use this Nunchaku variant." + ) + + try: + from huggingface_hub import snapshot_download # type: ignore + local_dir = snapshot_download( + repo_id=nunchaku_repo, + local_files_only=True, + ) + kwargs: dict[str, Any] = {"torch_dtype": torch.bfloat16} + if nunchaku_file: + # Some Nunchaku snapshots ship multiple precision tiers + # under one repo (e.g. svdq-int4 vs svdq-fp4). When the + # variant pins a specific filename, pass it through. + kwargs["filename"] = nunchaku_file + transformer = cls.from_pretrained(local_dir, **kwargs) + note = ( + f"Nunchaku SVDQuant transformer loaded from {nunchaku_repo}" + + (f"/{nunchaku_file}" if nunchaku_file else "") + + " (CUDA INT4 — ~3× over NF4)." + ) + return transformer, note + except Exception as exc: # noqa: BLE001 — fall through to NF4 + return None, ( + f"Nunchaku load failed ({type(exc).__name__}: {exc}) — " + "falling back to NF4 / int8wo / bf16." 
+ ) + + def _maybe_enable_fp8_layerwise( + self, + pipeline: Any, + repo: str, + torch: Any, + ) -> str | None: + """FU-024: call ``transformer.enable_layerwise_casting`` with the + family-correct fp8 dtype. Caller has already gated to CUDA. Pre-Ada + GPUs lack hardware fp8 support — the cast still runs but generation + is slower than bf16, so we additionally check the compute capability + (SM 8.9 = Ada Lovelace, SM 9.0 = Hopper, SM 10.0 = Blackwell). + Returns a runtimeNote string, or ``None`` when the path no-ops + cleanly. + """ + try: + major, minor = torch.cuda.get_device_capability() + except Exception: + return "FP8 layerwise skipped: torch.cuda.get_device_capability failed." + if (major, minor) < (8, 9): + return ( + f"FP8 layerwise skipped: SM {major}.{minor} pre-dates Ada — " + "hardware fp8 unavailable. Use bf16 / NF4 / Nunchaku instead." + ) + transformer = getattr(pipeline, "transformer", None) + if transformer is None or not hasattr(transformer, "enable_layerwise_casting"): + return ( + "FP8 layerwise skipped: pipeline.transformer.enable_layerwise_casting " + "missing — pipeline is UNet-based or the diffusers version is old." + ) + # E5M2 has wider exponent range (good for activations + outliers), + # E4M3 has more mantissa bits (better for weights). HunyuanVideo's + # team published their FP8 weights as E5M2; FLUX / Wan / Qwen-Image + # / SD3 use E4M3. + repo_lower = repo.lower() + if "hunyuan" in repo_lower: + storage_dtype = torch.float8_e5m2 + storage_label = "E5M2" + else: + storage_dtype = torch.float8_e4m3fn + storage_label = "E4M3" + try: + transformer.enable_layerwise_casting( + storage_dtype=storage_dtype, + compute_dtype=torch.bfloat16, + ) + except Exception as exc: + return ( + f"FP8 layerwise enable failed ({type(exc).__name__}: {exc}) — " + "running bf16." + ) + return f"FP8 layerwise casting enabled ({storage_label}, compute=bf16)." + class MfluxImageEngine: """Native Apple Silicon FLUX runtime via the ``mflux`` package. @@ -1294,6 +1962,12 @@ def __init__(self) -> None: self._placeholder = PlaceholderImageEngine() self._diffusers = DiffusersTextToImageEngine() self._mflux = MfluxImageEngine() + # FU-008 image subset: sd.cpp engine. Wired lazily so the import + # cost (small) is paid only when the manager is actually + # constructed. Engine probe is cheap; full binary check happens + # at generate time. + from backend_service.sdcpp_image_runtime import SdCppImageEngine + self._sdcpp = SdCppImageEngine() def capabilities(self) -> dict[str, Any]: return self._diffusers.probe().to_dict() @@ -1339,6 +2013,41 @@ def generate(self, config: ImageGenerationConfig) -> tuple[list[GeneratedImage], else: _mflux_fallback_note = None + # FU-008 image subset: sd.cpp path. Routed when the catalog + # variant declares ``engine="sdcpp"`` (which app.py threads onto + # ``config.runtime``). Failure modes (missing binary, unsupported + # repo, missing GGUF, subprocess error) fall through to the + # diffusers path below and surface a runtimeNote so the user + # still gets an image rendered. + if (config.runtime or "").lower() == "sdcpp": + probe = self._sdcpp.probe() + if probe.get("available"): + try: + images = self._sdcpp.generate(config) + status = self._diffusers.probe().to_dict() + status["activeEngine"] = "sd.cpp" + status["message"] = "Generated via stable-diffusion.cpp subprocess." + return images, status + except Exception as exc: + _sdcpp_fallback_note = ( + f"sd.cpp failed ({type(exc).__name__}: {exc}) — " + "falling back to diffusers." 
+ ) + else: + _sdcpp_fallback_note = None + else: + _sdcpp_fallback_note = probe.get("reason") or "sd.cpp unavailable" + # Combine mflux + sdcpp fallback notes if both fired (rare but + # possible if a variant lists ``engine="sdcpp"`` AND the user + # has overridden the runtime selector to ``"mflux"`` somehow). + if _sdcpp_fallback_note: + if _mflux_fallback_note: + _mflux_fallback_note = ( + f"{_mflux_fallback_note} {_sdcpp_fallback_note}" + ) + else: + _mflux_fallback_note = _sdcpp_fallback_note + status = self._diffusers.probe() if status.realGenerationAvailable: try: @@ -1350,6 +2059,8 @@ def generate(self, config: ImageGenerationConfig) -> tuple[list[GeneratedImage], ) return images, result_status except Exception as exc: + if _is_cuda_torch_unavailable_error(exc): + raise fallback_note = ( "The diffusers runtime failed, so ChaosEngineAI fell back to the placeholder engine for this run. " f"Details: {exc}" @@ -1362,6 +2073,16 @@ def generate(self, config: ImageGenerationConfig) -> tuple[list[GeneratedImage], missingDependencies=[], loadedModelRepo=status.loadedModelRepo, message=fallback_note, + # Preserve the +cpu / missing-torch warning across + # the demotion. Without this the Studio's "GPU + # acceleration not active" banner disappears the + # moment generation fails, leaving only "Install + # GPU runtime" -- which is the wrong remedy when + # torch IS installed (just CPU-only). Recompute + # rather than copying ``status.torchInstallWarning`` + # so the message reflects current disk state, not + # what the probe saw at preload time. + torchInstallWarning=_torch_install_warning(), ) return self._placeholder.generate(config, runtime_note=fallback_note), fallback_status.to_dict() diff --git a/backend_service/inference.py b/backend_service/inference.py index ef8c321..f3b3070 100644 --- a/backend_service/inference.py +++ b/backend_service/inference.py @@ -32,6 +32,106 @@ MLX_LOAD_TIMEOUT_SECONDS = 1800.0 DEFAULT_LLAMA_TIMEOUT_SECONDS = 120.0 CAPABILITY_CACHE_TTL_SECONDS = 10.0 + + +# Phase 2.2: keys forwarded as-is from `samplers` into the llama-server +# /v1/chat/completions payload. Anything not in this set is silently +# ignored so the frontend can blindly send the union of supported knobs +# without breaking older llama-server builds that don't recognise some. +_LLAMA_SAMPLER_KEYS: tuple[str, ...] = ( + "top_p", + "top_k", + "min_p", + "repeat_penalty", + "seed", + "mirostat", + "mirostat_tau", + "mirostat_eta", + # Phase 2.13: OpenAI-spec penalty fields. llama-server accepts these + # natively under the same names. mlx-lm doesn't pass them through + # but `_apply_sampler_kwargs` only adds them to the llama path + # payload, so the worker subprocess is unaffected. + "frequency_penalty", + "presence_penalty", + "stop", + # Phase 3.3: per-token confidence info. llama-server returns + # top-k alternatives with their logprobs in each delta when + # `logprobs: true` + `top_logprobs: N` are set. + "logprobs", + "top_logprobs", +) + + +def _apply_llama_chat_template_fixes( + messages: list[dict[str, Any]], + loaded_model: Any, +) -> tuple[list[dict[str, Any]], str | None]: + """Phase 3.8 follow-up: apply known chat-template auto-fixes before + sending the message list to llama-server. + + The llama.cpp server applies the chat template internally based on + GGUF metadata, so we can't observe template Jinja directly. But we + know certain families (Gemma) reject the system role entirely; + folding the system message into the first user message client-side + avoids the template error. 
+ + Returns ``(new_messages, runtime_note)``. The note is None when no + fix was applied; when set it's a single line suitable for the + GenerationResult.runtimeNote channel so the substrate badge can + show "auto-fixed: Gemma family — fold system into first user". + """ + if not loaded_model or not messages: + return messages, None + + from backend_service.helpers.chat_template import ( + fold_system_into_first_user, + is_gemma_family, + ) + + model_ref = getattr(loaded_model, "ref", None) + canonical = getattr(loaded_model, "canonicalRepo", None) + target = canonical or model_ref + + if is_gemma_family(target): + new_messages = fold_system_into_first_user(messages) + if len(new_messages) != len(messages): + return new_messages, "Chat template auto-fixed: Gemma family — fold system into first user message" + return new_messages, None + + return messages, None + + +def _apply_sampler_kwargs( + payload: dict[str, Any], + *, + samplers: dict[str, Any] | None, + reasoning_effort: str | None, + json_schema: dict[str, Any] | None, +) -> None: + """Merge Phase 2.2 sampler overrides into a chat-completions payload. + + Mutates `payload` in place. Skips keys whose value is None so an + explicit "use the default" from a UI that always sends every field + doesn't override server-side defaults. Json-schema is wrapped in + the OpenAI structured-outputs `response_format` envelope. + """ + if samplers: + for key in _LLAMA_SAMPLER_KEYS: + value = samplers.get(key) + if value is None: + continue + payload[key] = value + if reasoning_effort: + payload["reasoning_effort"] = reasoning_effort + if json_schema: + payload["response_format"] = { + "type": "json_schema", + "json_schema": { + "name": "response", + "schema": json_schema, + "strict": True, + }, + } _LLAMA_HELP_CACHE: dict[str, str] = {} _LLAMA_HELP_LOCK = RLock() @@ -369,6 +469,75 @@ def _resolve_gguf_path(path: str | None, runtime_target: str | None) -> str | No return None +def _resolve_mmproj_path(model_gguf_path: str | None) -> str | None: + """Locate the mmproj projector sibling for a vision-capable GGUF. + + Vision support in llama.cpp is gated by the `--mmproj` flag; the + projector lives as a separate `*mmproj*.gguf` file alongside the + main weights. HF repos for vision-capable models usually ship both + in the same snapshot (e.g. `gemma-3-27b-it-qat-4bit/` contains + `model.gguf` and `mmproj.gguf`). This helper scans the same + directory tree the main GGUF was found in and returns the largest + matching projector file, or None when no projector is present (the + model is text-only, or the user only downloaded the main weights). + """ + if not model_gguf_path: + return None + main_path = Path(model_gguf_path) + if not main_path.exists(): + return None + + # Search the parent directory + its immediate sibling directories + # (covers the HF snapshot layout where projectors might live in a + # `projectors/` peer to the `weights/` folder). We deliberately do + # NOT recurse via `rglob` past one level — on macOS test rigs the + # parent's parent is sometimes a system-cache root that raises + # `OSError: Result too large` mid-scandir. Bounded depth keeps the + # resolver predictable across hosts. 
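For orientation, these are the two snapshot shapes the bounded scan is meant to cover, exercised against a throwaway temp directory. Directory and file names are invented, and the import assumes `backend_service.inference` loads cleanly outside the running app.

```python
# Throwaway demonstration of the layouts the one-level scan covers.
import tempfile
from pathlib import Path

from backend_service.inference import _resolve_mmproj_path

with tempfile.TemporaryDirectory() as root:
    snap = Path(root) / "snapshot"
    (snap / "weights").mkdir(parents=True)
    (snap / "projectors").mkdir()
    main = snap / "weights" / "model-Q4_K_M.gguf"
    main.write_bytes(b"\x00" * 16)
    (snap / "weights" / "mmproj-F16.gguf").write_bytes(b"\x00" * 8)       # same directory
    (snap / "projectors" / "mmproj-BF16.gguf").write_bytes(b"\x00" * 32)  # sibling, one level deep

    # The largest matching projector within one level of the main GGUF wins.
    assert _resolve_mmproj_path(str(main)).endswith("mmproj-BF16.gguf")
```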
+ candidates: list[Path] = [] + parent = main_path.parent + if parent.is_dir(): + for entry in parent.iterdir(): + if entry.is_file() and entry.suffix.lower() == ".gguf" and "mmproj" in entry.name.lower(): + candidates.append(entry) + elif entry.is_dir(): + try: + for child in entry.iterdir(): + if ( + child.is_file() + and child.suffix.lower() == ".gguf" + and "mmproj" in child.name.lower() + ): + candidates.append(child) + except OSError: + continue + grandparent = parent.parent + if grandparent.is_dir() and grandparent != parent: + try: + for entry in grandparent.iterdir(): + if not entry.is_dir() or entry == parent: + continue + try: + for child in entry.iterdir(): + if ( + child.is_file() + and child.suffix.lower() == ".gguf" + and "mmproj" in child.name.lower() + and child not in candidates + ): + candidates.append(child) + except OSError: + continue + except OSError: + pass + + valid = [p for p in candidates if p.is_file() and p != main_path] + if not valid: + return None + valid.sort(key=lambda f: f.stat().st_size, reverse=True) + return str(valid[0]) + + def _is_local_target(candidate: str | None) -> bool: if not candidate: return False @@ -724,8 +893,30 @@ class LoadedModelInfo: speculativeDecoding: bool = False dflashDraftModel: str | None = None treeBudget: int = 0 + # Hotfix (2026-05-01 v2): the runtime currently has no mmproj path + # wired for either backend — `_resolve_gguf_path` strips mmproj + # files, and the MLX worker has never carried images. Until those + # paths land (Phase 2.6+ work), `visionEnabled` stays False on every + # load and the capability resolver demotes the typed `supportsVision` + # flag accordingly. The catalog `tags` keep "vision" so the UI can + # still surface "this model supports vision once mmproj loads". + visionEnabled: bool = False def to_dict(self) -> dict[str, Any]: + # Phase 2.11: include resolved capabilities so the frontend can + # gate composer affordances (vision, tools, reasoning, etc.) + # without a separate fetch. Resolved lazily — adding a field on + # the dataclass would force a migration in every load path. + # The active engine is passed so capability flags get demoted + # for runtime gaps (e.g. MLX worker doesn't carry images). + from backend_service.catalog.capabilities import resolve_capabilities + + capabilities = resolve_capabilities( + self.ref, + self.canonicalRepo, + engine=self.engine, + vision_enabled=self.visionEnabled, + ).to_dict() return { "ref": self.ref, "name": self.name, @@ -746,6 +937,8 @@ def to_dict(self) -> dict[str, Any]: "speculativeDecoding": self.speculativeDecoding, "dflashDraftModel": self.dflashDraftModel, "treeBudget": self.treeBudget, + "visionEnabled": self.visionEnabled, + "capabilities": capabilities, } @@ -799,6 +992,16 @@ class StreamChunk: speculative_decoding: bool | None = None tree_budget: int | None = None done: bool = False + # Phase 3.3: per-token logprobs. When set, contains the chosen + # token's logprob plus the top-k alternatives. Only populated + # when the request had `logprobs: N` set. + token_logprobs: list[dict[str, Any]] | None = None + # Phase 3.1: DDTree accepted-span overlay data. `accepted_spans` + # is a run-length-encoded list of {start, length, accepted} over + # the per-token rendered text in `accepted_token_text`. Only + # populated when DFLASH speculative decoding ran. 
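To make the two overlay payloads concrete: the values below are invented, but the field names match the StreamChunk fields around this comment and the per-entry keys match what the extraction code later in this file emits. The reading of the `accepted` flag is inferred from the speculative-decoding context rather than spelled out by the worker protocol.

```python
# Example token_logprobs entry: chosen token plus top-k alternatives.
chunk_token_logprobs = [
    {
        "token": " fox",
        "logprob": -0.12,
        "alternatives": [
            {"token": " dog", "logprob": -2.31},
            {"token": " cat", "logprob": -3.05},
        ],
    },
]

# Example accepted-span overlay: run-length spans over the rendered
# token text, with `accepted` presumably marking draft spans the
# verifier kept versus spans that were re-sampled.
chunk_accepted_token_text = "The quick brown fox jumps"
chunk_accepted_spans = [
    {"start": 0, "length": 16, "accepted": True},
    {"start": 16, "length": 9, "accepted": False},
]
```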
+ accepted_spans: list[dict[str, Any]] | None = None + accepted_token_text: str | None = None class BaseInferenceEngine: @@ -854,6 +1057,9 @@ def generate( temperature: float, images: list[str] | None = None, tools: list[dict[str, Any]] | None = None, + samplers: dict[str, Any] | None = None, + reasoning_effort: str | None = None, + json_schema: dict[str, Any] | None = None, ) -> GenerationResult: raise NotImplementedError @@ -889,6 +1095,9 @@ def stream_generate( images: list[str] | None = None, tools: list[dict[str, Any]] | None = None, thinking_mode: str | None = None, + samplers: dict[str, Any] | None = None, + reasoning_effort: str | None = None, + json_schema: dict[str, Any] | None = None, ) -> Iterator[StreamChunk]: result = self.generate( prompt=prompt, @@ -896,6 +1105,11 @@ def stream_generate( system_prompt=system_prompt, max_tokens=max_tokens, temperature=temperature, + images=images, + tools=tools, + samplers=samplers, + reasoning_effort=reasoning_effort, + json_schema=json_schema, ) yield StreamChunk(text=result.text) yield StreamChunk( @@ -992,7 +1206,8 @@ def _request(self, *, prompt, history, system_prompt, max_tokens, temperature, s return urllib.request.urlopen(req, timeout=120.0) def generate(self, *, prompt, history, system_prompt, max_tokens, temperature, - images=None, tools=None) -> GenerationResult: + images=None, tools=None, + samplers=None, reasoning_effort=None, json_schema=None) -> GenerationResult: if self.loaded_model is None: raise RuntimeError("Remote model not configured.") started = time.perf_counter() @@ -1475,6 +1690,9 @@ def generate( temperature: float, images: list[str] | None = None, tools: list[dict[str, Any]] | None = None, + samplers: dict[str, Any] | None = None, + reasoning_effort: str | None = None, + json_schema: dict[str, Any] | None = None, ) -> GenerationResult: if self.loaded_model is None: raise RuntimeError("No model is loaded.") @@ -1499,6 +1717,15 @@ def generate( payload["images"] = images if tools: payload["tools"] = tools + # Phase 2.2: forward whatever sampler subset mlx-lm supports. + # Worker side reads these out of the payload and ignores keys it + # doesn't recognise, so this is forward-compatible. + if samplers: + payload["samplers"] = samplers + if reasoning_effort: + payload["reasoningEffort"] = reasoning_effort + if json_schema: + payload["jsonSchema"] = json_schema result = self.worker.request(payload) elapsed = max(time.perf_counter() - started_at, 1e-6) return GenerationResult( @@ -1533,6 +1760,9 @@ def stream_generate( images: list[str] | None = None, tools: list[dict[str, Any]] | None = None, thinking_mode: str | None = None, + samplers: dict[str, Any] | None = None, + reasoning_effort: str | None = None, + json_schema: dict[str, Any] | None = None, ) -> Iterator[StreamChunk]: if self.loaded_model is None: raise RuntimeError("No model is loaded.") @@ -1557,6 +1787,17 @@ def stream_generate( payload["images"] = images if tools: payload["tools"] = tools + # Phase 2.2: forward sampler / reasoning / schema overrides. The + # MLX worker reads these from the payload and applies what it + # supports (top_p, top_k, min_p, repeat_penalty, seed via + # mlx-lm); reasoning_effort + json_schema are accepted for + # forward-compat with future mlx-lm releases. 
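For comparison with the MLX payload keys forwarded just below, the llama path merges the same overrides straight into the OpenAI-shaped request through `_apply_sampler_kwargs` defined near the top of this file. A usage sketch with invented values, assuming the module imports cleanly on its own:

```python
from backend_service.inference import _apply_sampler_kwargs

payload = {"model": "local", "messages": [], "stream": True}
_apply_sampler_kwargs(
    payload,
    samplers={"top_p": 0.9, "min_p": 0.05, "seed": 7, "typical_p": 0.5},
    reasoning_effort="high",
    json_schema={"type": "object", "properties": {"answer": {"type": "string"}}},
)
# top_p / min_p / seed pass through; typical_p is dropped because it is
# not in _LLAMA_SAMPLER_KEYS; reasoning_effort is forwarded verbatim;
# the schema is wrapped in the structured-outputs envelope:
#   payload["response_format"] == {
#       "type": "json_schema",
#       "json_schema": {"name": "response", "schema": {...}, "strict": True},
#   }
```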
+ if samplers: + payload["samplers"] = samplers + if reasoning_effort: + payload["reasoningEffort"] = reasoning_effort + if json_schema: + payload["jsonSchema"] = json_schema try: request_iter = self.worker.stream_request(payload) except RuntimeError as exc: @@ -1576,7 +1817,17 @@ def stream_generate( if chunk.get("reasoningDone"): yield StreamChunk(reasoning_done=True) if chunk.get("text"): - yield StreamChunk(text=chunk["text"]) + token_logprobs = chunk.get("tokenLogprobs") + yield StreamChunk( + text=chunk["text"], + token_logprobs=token_logprobs if token_logprobs else None, + ) + elif chunk.get("tokenLogprobs"): + # Phase 3.3 follow-up: forward logprobs even when + # the chunk has no text (e.g. emitted alongside + # reasoning) so the frontend overlay still gets + # a complete trace. + yield StreamChunk(token_logprobs=chunk["tokenLogprobs"]) if response.get("done"): result = response.get("result") or {} yield StreamChunk( @@ -1597,6 +1848,10 @@ def stream_generate( else None ), tree_budget=int(result.get("treeBudget")) if result.get("treeBudget") is not None else None, + # Phase 3.1: forward accepted-span data when DDTree + # populated it. Llama path leaves these as None. + accepted_spans=result.get("acceptedSpans"), + accepted_token_text=result.get("acceptedTokenText"), ) except RuntimeError as exc: if "No MLX model is loaded" in str(exc): @@ -1819,7 +2074,20 @@ def _build_command( else: raise RuntimeError("GGUF loading requires a local model path or a Hugging Face GGUF repository.") - return command, runtime_note, fell_back_to_native + # Vision wiring: if a sibling mmproj file is present, pass it + # via `--mmproj` so llama-server enables image input. Capture + # the path so the caller can flip `LoadedModelInfo.visionEnabled` + # to True; the capability resolver reads that flag to enable + # the composer's image-attach button. Older llama-server builds + # without `--mmproj` skip the flag silently — verify support + # via the help-text gate to avoid startup failure on those. 
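Concretely, when the help-text gate passes and a projector sits next to the weights, the assembled command grows by one flag pair. The paths below are invented and only a subset of the real argument list is shown:

```python
# Illustrative final command for a vision-capable GGUF (paths invented).
command = [
    "/path/to/llama-server",
    "--model", "/models/gemma-3-27b-it-qat-4bit/model-Q4_K_M.gguf",
    "--port", "8080",
    # Appended only when the binary advertises --mmproj in its help
    # text AND a sibling projector file was resolved:
    "--mmproj", "/models/gemma-3-27b-it-qat-4bit/mmproj-F16.gguf",
]
```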
+ mmproj_path: str | None = None + if resolved_gguf and _llama_server_supports(binary, "--mmproj"): + mmproj_path = _resolve_mmproj_path(resolved_gguf) + if mmproj_path: + command.extend(["--mmproj", mmproj_path]) + + return command, runtime_note, fell_back_to_native, mmproj_path def _wait_for_server(self) -> None: deadline = time.time() + DEFAULT_LLAMA_TIMEOUT_SECONDS @@ -1900,9 +2168,10 @@ def load_model( attempts.append(("native", False, True)) last_error: str | None = None + attempt_mmproj_path: str | None = None for strategy_id, fit_enabled, is_fallback in attempts: strategy = _strategy_registry.get(strategy_id) or _strategy_registry.default() - command, attempt_note, prevalidation_fallback = self._build_command( + command, attempt_note, prevalidation_fallback, attempt_mmproj_path = self._build_command( path=path, runtime_target=runtime_target, cache_strategy=strategy_id, @@ -1982,6 +2251,7 @@ def load_model( path=path, runtimeTarget=runtime_target or path, runtimeNote=runtime_note, + visionEnabled=attempt_mmproj_path is not None, ) return self.loaded_model @@ -1999,6 +2269,9 @@ def generate( temperature: float, images: list[str] | None = None, tools: list[dict[str, Any]] | None = None, + samplers: dict[str, Any] | None = None, + reasoning_effort: str | None = None, + json_schema: dict[str, Any] | None = None, ) -> GenerationResult: if self.loaded_model is None: raise RuntimeError("No model is loaded.") @@ -2023,6 +2296,11 @@ def generate( else: messages.append({"role": "user", "content": prompt}) + # Phase 3.8 follow-up: apply known chat-template auto-fixes + # before the messages reach llama-server (e.g. Gemma family + # rejects the system role outright). + messages, template_fix_note = _apply_llama_chat_template_fixes(messages, self.loaded_model) + started_at = time.perf_counter() payload: dict[str, Any] = { "model": self.loaded_model.ref, @@ -2033,6 +2311,12 @@ def generate( } if tools: payload["tools"] = tools + _apply_sampler_kwargs( + payload, + samplers=samplers, + reasoning_effort=reasoning_effort, + json_schema=json_schema, + ) try: response = _http_json( self._server_url("/v1/chat/completions"), @@ -2062,7 +2346,11 @@ def generate( totalTokens=total_tokens, tokS=round(completion_tokens / elapsed, 1) if completion_tokens else 0.0, responseSeconds=round(elapsed, 2), - runtimeNote=self.loaded_model.runtimeNote, + runtimeNote=( + _append_runtime_note(self.loaded_model.runtimeNote, template_fix_note) + if template_fix_note + else self.loaded_model.runtimeNote + ), ) def stream_generate( @@ -2076,6 +2364,9 @@ def stream_generate( images: list[str] | None = None, tools: list[dict[str, Any]] | None = None, thinking_mode: str | None = None, + samplers: dict[str, Any] | None = None, + reasoning_effort: str | None = None, + json_schema: dict[str, Any] | None = None, ) -> Iterator[StreamChunk]: if self.loaded_model is None: raise RuntimeError("No model is loaded.") @@ -2099,6 +2390,11 @@ def stream_generate( else: messages.append({"role": "user", "content": prompt}) + # Phase 3.8 follow-up: chat-template auto-fix on the streaming + # path matches the non-stream behaviour. The note is forwarded + # via the final StreamChunk's runtime_note. 
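The fold itself lives in `backend_service.helpers.chat_template`; the sketch below only illustrates the transformation described above, and the `"\n\n"` join is an assumption about the helper's formatting rather than its verbatim output.

```python
# Minimal sketch of the Gemma-family fold, assuming the system text is
# simply prepended to the first user turn. The real helper is
# fold_system_into_first_user in backend_service.helpers.chat_template.
def fold_system_into_first_user_sketch(messages: list[dict]) -> list[dict]:
    if not messages or messages[0].get("role") != "system":
        return messages
    system_text = messages[0].get("content") or ""
    folded = [dict(m) for m in messages[1:]]
    for message in folded:
        if message.get("role") == "user":
            message["content"] = f"{system_text}\n\n{message.get('content') or ''}"
            break
    return folded

before = [
    {"role": "system", "content": "You are terse."},
    {"role": "user", "content": "Summarise this file."},
]
after = fold_system_into_first_user_sketch(before)
# -> [{"role": "user", "content": "You are terse.\n\nSummarise this file."}]
```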
+ messages, template_fix_note = _apply_llama_chat_template_fixes(messages, self.loaded_model) + payload: dict[str, Any] = { "model": self.loaded_model.ref, "messages": messages, @@ -2108,6 +2404,12 @@ def stream_generate( } if tools: payload["tools"] = tools + _apply_sampler_kwargs( + payload, + samplers=samplers, + reasoning_effort=reasoning_effort, + json_schema=json_schema, + ) url = self._server_url("/v1/chat/completions") data = json.dumps(payload).encode("utf-8") headers = {"Content-Type": "application/json", "Accept": "text/event-stream"} @@ -2126,6 +2428,8 @@ def stream_generate( stream_start = time.perf_counter() first_token_time: float | None = None runtime_note = self.loaded_model.runtimeNote + if template_fix_note: + runtime_note = _append_runtime_note(runtime_note, template_fix_note) think_filter = ThinkingTokenFilter(detect_raw_reasoning=(thinking_mode or "off") != "off") runaway_guard = RepeatedLineGuard() try: @@ -2143,6 +2447,28 @@ def stream_generate( choice = (chunk.get("choices") or [{}])[0] delta = choice.get("delta") or {} content = delta.get("content") + # Phase 3.3: extract per-token logprobs when llama-server + # returns them. The `logprobs.content` field is a list of + # token entries with top_logprobs alternatives. + logprob_entries: list[dict[str, Any]] | None = None + logprobs_payload = choice.get("logprobs") or {} + if isinstance(logprobs_payload, dict): + raw_entries = logprobs_payload.get("content") + if isinstance(raw_entries, list) and raw_entries: + logprob_entries = [] + for entry in raw_entries: + if not isinstance(entry, dict): + continue + top = entry.get("top_logprobs") or [] + logprob_entries.append({ + "token": entry.get("token"), + "logprob": entry.get("logprob"), + "alternatives": [ + {"token": alt.get("token"), "logprob": alt.get("logprob")} + for alt in top + if isinstance(alt, dict) + ], + }) if content: split = think_filter.feed(str(content)) if split.reasoning: @@ -2154,7 +2480,7 @@ def stream_generate( if first_token_time is None: first_token_time = time.perf_counter() completion_tokens += 1 - yield StreamChunk(text=split.text) + yield StreamChunk(text=split.text, token_logprobs=logprob_entries) fr = choice.get("finish_reason") if fr: finish_reason = fr @@ -2910,6 +3236,9 @@ def generate( images: list[str] | None = None, tools: list[dict[str, Any]] | None = None, engine: BaseInferenceEngine | None = None, + samplers: dict[str, Any] | None = None, + reasoning_effort: str | None = None, + json_schema: dict[str, Any] | None = None, ) -> GenerationResult: if self.loaded_model is None: raise RuntimeError("Load a model before sending prompts.") @@ -2923,6 +3252,9 @@ def generate( temperature=temperature, images=images, tools=tools, + samplers=samplers, + reasoning_effort=reasoning_effort, + json_schema=json_schema, ) if result.runtimeNote is None: result.runtimeNote = self.runtime_note @@ -2940,6 +3272,9 @@ def stream_generate( tools: list[dict[str, Any]] | None = None, engine: BaseInferenceEngine | None = None, thinking_mode: str | None = None, + samplers: dict[str, Any] | None = None, + reasoning_effort: str | None = None, + json_schema: dict[str, Any] | None = None, ) -> Iterator[StreamChunk]: if self.loaded_model is None: raise RuntimeError("Load a model before sending prompts.") @@ -2954,6 +3289,9 @@ def stream_generate( images=images, tools=tools, thinking_mode=thinking_mode, + samplers=samplers, + reasoning_effort=reasoning_effort, + json_schema=json_schema, ) def extract_gguf_metadata(self, path: str) -> dict[str, Any]: diff --git 
a/backend_service/mcp/__init__.py b/backend_service/mcp/__init__.py new file mode 100644 index 0000000..ca7423d --- /dev/null +++ b/backend_service/mcp/__init__.py @@ -0,0 +1,40 @@ +"""MCP (Model Context Protocol) client — Phase 2.10. + +ChaosEngineAI's chat agent loop dispatches built-in tools (web search, +calculator, file reader, code executor) through `backend_service.tools`. +This package extends that surface with externally-provided MCP tools: +the user configures one or more MCP servers in settings, and at startup +each server's exported tools are discovered and registered alongside +the built-ins. From the agent loop's perspective the new tools look +identical — same `BaseTool` interface, same OpenAI-shaped function +schema, same `execute(...)` calling convention. + +Transport +--------- +First ship supports stdio only. The user gives us a command line; we +spawn the process, talk JSON-RPC 2.0 over its stdin/stdout, and tear +the subprocess down at app shutdown. SSE / WebSocket transports are +future work. + +Provenance +---------- +Every adapted MCP tool tags its `provenance` so the API surface and +the eventual UI can show which server a tool came from. Built-in +tools tag as `"builtin"`; MCP tools tag as `"mcp:"`. +""" + +from backend_service.mcp.client import ( + McpClient, + McpClientError, + McpServerConfig, + McpToolDescriptor, +) +from backend_service.mcp.tool_adapter import McpTool + +__all__ = [ + "McpClient", + "McpClientError", + "McpServerConfig", + "McpToolDescriptor", + "McpTool", +] diff --git a/backend_service/mcp/client.py b/backend_service/mcp/client.py new file mode 100644 index 0000000..9fc1228 --- /dev/null +++ b/backend_service/mcp/client.py @@ -0,0 +1,394 @@ +"""Minimal stdio MCP client — JSON-RPC 2.0 over a subprocess pipe. + +The client speaks the bare-minimum slice of the Model Context Protocol +needed for tool discovery + invocation: + + - `initialize` / `initialized` handshake (protocolVersion + capabilities) + - `tools/list` to enumerate available tools + - `tools/call` to run a tool + +Everything else (resources, prompts, sampling, roots) is ignored. +Servers that depend on these features will still load — we just don't +surface them. Adding support is a forward-compatible extension. + +Errors are wrapped in `McpClientError`. Servers that crash, hang, or +return malformed JSON are isolated: the client raises, the registry +falls back to whatever it had before, and the chat agent loop still +runs with the built-in tools intact. +""" + +from __future__ import annotations + +import json +import os +import subprocess +import threading +from dataclasses import dataclass, field +from queue import Empty, Queue +from typing import Any + + +# Conservative defaults. Stdio MCP servers are local subprocesses, so a +# multi-second ceiling is plenty — anything slower is a hung server we +# want to abort rather than wait on. +DEFAULT_REQUEST_TIMEOUT_S = 30.0 +DEFAULT_INITIALIZE_TIMEOUT_S = 15.0 + + +class McpClientError(RuntimeError): + """Raised on any client-side failure — protocol, timeout, or process.""" + + +@dataclass(frozen=True) +class McpServerConfig: + """User-supplied configuration for one MCP server. + + `id` is a short opaque key (e.g. "filesystem", "search-perplexity") + used in tool provenance and the settings UI. `command` + `args` is + the subprocess to spawn; `env` overlays the parent environment. + """ + + id: str + command: str + args: tuple[str, ...] 
= () + env: dict[str, str] = field(default_factory=dict) + enabled: bool = True + + def to_dict(self) -> dict[str, Any]: + return { + "id": self.id, + "command": self.command, + "args": list(self.args), + "env": dict(self.env), + "enabled": self.enabled, + } + + @classmethod + def from_dict(cls, payload: dict[str, Any]) -> "McpServerConfig": + if not isinstance(payload, dict): + raise McpClientError(f"MCP server config must be a dict, got {type(payload).__name__}") + server_id = str(payload.get("id") or "").strip() + command = str(payload.get("command") or "").strip() + if not server_id or not command: + raise McpClientError("MCP server config requires non-empty `id` and `command`") + raw_args = payload.get("args") or [] + if not isinstance(raw_args, list): + raise McpClientError("MCP server config `args` must be a list") + env_payload = payload.get("env") or {} + if not isinstance(env_payload, dict): + raise McpClientError("MCP server config `env` must be an object") + return cls( + id=server_id, + command=command, + args=tuple(str(a) for a in raw_args), + env={str(k): str(v) for k, v in env_payload.items()}, + enabled=bool(payload.get("enabled", True)), + ) + + +@dataclass(frozen=True) +class McpToolDescriptor: + """Metadata for one tool exported by an MCP server.""" + + server_id: str + name: str + description: str + input_schema: dict[str, Any] + + +class McpClient: + """One open client per MCP server. Thread-safe for sequential RPCs. + + Construct via `McpClient(config)` then call `initialize()` exactly + once before `list_tools()` / `call_tool()`. Always close via + `close()` (or use as a context manager) so the subprocess pipes are + drained — leaking pipes wedges the parent app on exit. + """ + + def __init__(self, config: McpServerConfig, *, request_timeout: float = DEFAULT_REQUEST_TIMEOUT_S) -> None: + self.config = config + self._timeout = request_timeout + self._proc: subprocess.Popen | None = None + self._stdout_queue: Queue[str | None] = Queue() + self._stdout_thread: threading.Thread | None = None + self._lock = threading.Lock() + self._next_id = 1 + self._initialized = False + + def __enter__(self) -> "McpClient": + return self + + def __exit__(self, *_exc: Any) -> None: + self.close() + + def start(self) -> None: + """Spawn the subprocess. Idempotent.""" + if self._proc is not None and self._proc.poll() is None: + return + env = os.environ.copy() + env.update(self.config.env) + try: + self._proc = subprocess.Popen( + [self.config.command, *self.config.args], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env, + text=True, + bufsize=1, # line-buffered + ) + except FileNotFoundError as exc: + raise McpClientError( + f"MCP server '{self.config.id}' command not found: {self.config.command}" + ) from exc + + # Drain stdout in a worker thread so reads don't block on the + # main thread when the server is busy producing output. + def _drain() -> None: + assert self._proc is not None and self._proc.stdout is not None + for line in self._proc.stdout: + self._stdout_queue.put(line.rstrip("\n")) + self._stdout_queue.put(None) + + self._stdout_thread = threading.Thread(target=_drain, daemon=True) + self._stdout_thread.start() + + def initialize(self, *, timeout: float = DEFAULT_INITIALIZE_TIMEOUT_S) -> dict[str, Any]: + """Run the initialize handshake. 
Must complete before any RPCs.""" + self.start() + result = self._request( + "initialize", + { + "protocolVersion": "2025-03-26", + "capabilities": {}, + "clientInfo": { + "name": "ChaosEngineAI", + "version": "0.7.x", + }, + }, + timeout=timeout, + ) + # Per spec, send the `initialized` notification after the + # response. Notifications have no `id` and expect no response. + self._notify("notifications/initialized", {}) + self._initialized = True + return result + + def list_tools(self, *, timeout: float | None = None) -> list[McpToolDescriptor]: + """Enumerate the server's tools. Requires `initialize()` first.""" + if not self._initialized: + raise McpClientError( + f"MCP server '{self.config.id}' not initialised — call initialize() first" + ) + result = self._request("tools/list", {}, timeout=timeout) + raw_tools = result.get("tools") if isinstance(result, dict) else None + if not isinstance(raw_tools, list): + return [] + descriptors: list[McpToolDescriptor] = [] + for entry in raw_tools: + if not isinstance(entry, dict): + continue + name = str(entry.get("name") or "").strip() + if not name: + continue + schema = entry.get("inputSchema") or {"type": "object", "properties": {}} + if not isinstance(schema, dict): + schema = {"type": "object", "properties": {}} + descriptors.append(McpToolDescriptor( + server_id=self.config.id, + name=name, + description=str(entry.get("description") or ""), + input_schema=schema, + )) + return descriptors + + def call_tool( + self, + name: str, + arguments: dict[str, Any], + *, + timeout: float | None = None, + ) -> str: + """Invoke a tool. Returns the text representation of the result. + + MCP tool results are a structured list of content parts (text, + image, embedded resources, etc.). For chat-agent integration we + flatten the parts into a single string by concatenating text + parts and stringifying anything else, matching the contract + every existing built-in tool already follows. + """ + return _flatten_tool_result(self.call_tool_raw(name, arguments, timeout=timeout)) + + def call_tool_raw( + self, + name: str, + arguments: dict[str, Any], + *, + timeout: float | None = None, + ) -> Any: + """Phase 2.8: invoke and return the raw `tools/call` result. + + Adapter callers that want to render MCP content parts natively + (images, embedded resources) read the raw envelope so they can + inspect each part's `type` / `mimeType` / `data` / `text` + before falling back to flattened text. 
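+
+        For orientation, a text-only envelope returned here looks
+        roughly like (illustrative)::
+
+            {"content": [{"type": "text", "text": "42"}], "isError": False}
+
+        which is the same shape ``_flatten_tool_result`` consumes.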
+ """ + if not self._initialized: + raise McpClientError( + f"MCP server '{self.config.id}' not initialised — call initialize() first" + ) + return self._request( + "tools/call", + {"name": name, "arguments": arguments}, + timeout=timeout, + ) + + def close(self) -> None: + if self._proc is None: + return + proc = self._proc + self._proc = None + try: + if proc.stdin and not proc.stdin.closed: + proc.stdin.close() + except OSError: + pass + try: + proc.terminate() + proc.wait(timeout=5) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait(timeout=5) + except OSError: + pass + + # ------------------------------------------------------------------ + # JSON-RPC plumbing + # ------------------------------------------------------------------ + + def _request( + self, + method: str, + params: dict[str, Any], + *, + timeout: float | None = None, + ) -> Any: + with self._lock: + assert self._proc is not None and self._proc.stdin is not None, "client not started" + request_id = self._next_id + self._next_id += 1 + payload = { + "jsonrpc": "2.0", + "id": request_id, + "method": method, + "params": params, + } + try: + self._proc.stdin.write(json.dumps(payload) + "\n") + self._proc.stdin.flush() + except OSError as exc: + raise McpClientError( + f"MCP server '{self.config.id}' stdin failed: {exc}" + ) from exc + + deadline_seconds = timeout if timeout is not None else self._timeout + while True: + try: + line = self._stdout_queue.get(timeout=deadline_seconds) + except Empty as exc: + raise McpClientError( + f"MCP server '{self.config.id}' timed out waiting for {method}" + ) from exc + if line is None: + stderr_tail = self._read_stderr_tail() + raise McpClientError( + f"MCP server '{self.config.id}' exited mid-request: {stderr_tail}" + ) + parsed = _parse_json_rpc_line(line) + if parsed is None: + continue # progress / log line — keep reading + # Skip notifications + responses for other request ids + if parsed.get("id") != request_id: + continue + if "error" in parsed and parsed["error"]: + err = parsed["error"] + msg = err.get("message") if isinstance(err, dict) else str(err) + raise McpClientError( + f"MCP server '{self.config.id}' returned error for {method}: {msg}" + ) + return parsed.get("result") + + def _notify(self, method: str, params: dict[str, Any]) -> None: + with self._lock: + if self._proc is None or self._proc.stdin is None: + return + payload = {"jsonrpc": "2.0", "method": method, "params": params} + try: + self._proc.stdin.write(json.dumps(payload) + "\n") + self._proc.stdin.flush() + except OSError: + pass + + def _read_stderr_tail(self) -> str: + if self._proc is None or self._proc.stderr is None: + return "" + try: + return self._proc.stderr.read()[-500:] + except OSError: + return "" + + +# ---------------------------------------------------------------------- +# Pure helpers (testable without a subprocess) +# ---------------------------------------------------------------------- + + +def _parse_json_rpc_line(line: str) -> dict[str, Any] | None: + """Parse a single line of JSON-RPC. Returns None for unparseable / empty. + + Some servers print log lines to stdout alongside JSON-RPC frames; + the client tolerates them by returning None and continuing the + read loop. A frame must be a JSON object with `jsonrpc: "2.0"`. 
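+
+    Illustrative behaviour, following directly from the rules above::
+
+        >>> _parse_json_rpc_line('{"jsonrpc": "2.0", "id": 1, "result": {}}')
+        {'jsonrpc': '2.0', 'id': 1, 'result': {}}
+        >>> _parse_json_rpc_line("INFO server ready") is None
+        True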
+ """ + stripped = line.strip() + if not stripped: + return None + if not stripped.startswith("{"): + return None + try: + payload = json.loads(stripped) + except json.JSONDecodeError: + return None + if not isinstance(payload, dict): + return None + if payload.get("jsonrpc") != "2.0": + return None + return payload + + +def _flatten_tool_result(result: Any) -> str: + """Convert an MCP `tools/call` result into a single string. + + The MCP spec returns ``{"content": [{"type": "text", "text": "..."}, ...]}`` + plus optional `isError`. We concatenate text parts; anything else + is JSON-stringified so the caller still sees the data. + """ + if not isinstance(result, dict): + return str(result) if result is not None else "" + if result.get("isError"): + prefix = "[MCP error] " + else: + prefix = "" + content = result.get("content") + if not isinstance(content, list): + return prefix + (str(result) if result else "") + parts: list[str] = [] + for entry in content: + if not isinstance(entry, dict): + parts.append(str(entry)) + continue + if entry.get("type") == "text": + parts.append(str(entry.get("text") or "")) + else: + parts.append(json.dumps(entry, sort_keys=True)) + return prefix + "\n".join(parts).strip() diff --git a/backend_service/mcp/loader.py b/backend_service/mcp/loader.py new file mode 100644 index 0000000..8fe86be --- /dev/null +++ b/backend_service/mcp/loader.py @@ -0,0 +1,96 @@ +"""High-level MCP loader — spawn servers, discover tools, build adapters. + +The single entry point `load_mcp_tools` is what the app should call +at startup (and after the user updates `mcpServers` in settings). It +takes a list of server configs and returns: + + * a flat list of `McpTool` adapters ready to feed into + `ToolRegistry.replace_mcp_tools`; + * a list of live `McpClient` instances the caller must close on + shutdown (or when reloading). + +A misbehaving server (bad command, init timeout, malformed +`tools/list` response) is isolated: its client is closed and skipped, +the loader logs via the supplied callback, and other servers proceed +normally. The chat path always sees the union of healthy servers' +tools — never an all-or-nothing failure. +""" + +from __future__ import annotations + +from typing import Callable, Iterable + +from backend_service.mcp.client import ( + McpClient, + McpClientError, + McpServerConfig, +) +from backend_service.mcp.tool_adapter import McpTool + + +LogFn = Callable[[str, str], None] + + +def load_mcp_tools( + configs: Iterable[McpServerConfig], + *, + log: LogFn | None = None, +) -> tuple[list[McpTool], list[McpClient]]: + """Spawn each enabled server and collect its tools. + + `log(level, message)` is the optional logging callback. When + omitted, failures are silent (callers like tests can pass + ``log=None``); production callers should plumb in `state.add_log` + so users see a settings → log entry per misbehaving server. 
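+
+    A minimal calling sketch (the server command is a placeholder, not
+    a bundled binary)::
+
+        configs = [McpServerConfig(id="fs", command="mcp-server-filesystem")]
+        tools, clients = load_mcp_tools(configs, log=lambda lvl, msg: print(lvl, msg))
+        try:
+            ...  # e.g. hand `tools` to ToolRegistry.replace_mcp_tools
+        finally:
+            close_all(clients)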
+ """ + tools: list[McpTool] = [] + clients: list[McpClient] = [] + + for config in configs: + if not config.enabled: + continue + client = McpClient(config) + try: + client.initialize() + descriptors = client.list_tools() + except McpClientError as exc: + if log is not None: + log("warning", f"MCP server '{config.id}' failed to start: {exc}") + client.close() + continue + except Exception as exc: # noqa: BLE001 — protect chat path from any subprocess weirdness + if log is not None: + log("warning", f"MCP server '{config.id}' raised unexpected error: {exc}") + client.close() + continue + + if not descriptors: + if log is not None: + log("info", f"MCP server '{config.id}' is up but exports zero tools.") + # Keep the client around — the server may export tools + # later, and the user might still rely on resources/prompts + # in a future release. + clients.append(client) + continue + + clients.append(client) + for descriptor in descriptors: + tools.append(McpTool(client, descriptor)) + if log is not None: + log("info", f"MCP server '{config.id}' loaded ({len(descriptors)} tool(s)).") + + return tools, clients + + +def close_all(clients: Iterable[McpClient]) -> None: + """Tear down every client — call on app shutdown / reload. + + Errors during close are swallowed: a hung subprocess shouldn't + block the parent app from exiting. Each client's `close()` method + sends terminate + falls back to kill after 5 s. + """ + for client in clients: + try: + client.close() + except Exception: + continue diff --git a/backend_service/mcp/tool_adapter.py b/backend_service/mcp/tool_adapter.py new file mode 100644 index 0000000..343c2aa --- /dev/null +++ b/backend_service/mcp/tool_adapter.py @@ -0,0 +1,145 @@ +"""Adapter that exposes an MCP server tool as a `BaseTool`. + +Phase 2.10: lets the existing agent loop dispatch MCP tools using the +same interface it already uses for built-ins. The adapter holds a +reference to the live `McpClient` and routes each `execute(...)` call +through `client.call_tool`. Errors from the remote tool are converted +to a string return so the agent loop's existing tool-call result path +handles them — no exception surface change. + +Provenance +---------- +Each adapter exposes a `provenance` property tagged +``"mcp:"``. The /api/tools route reads this so the UI can +render a source badge next to each tool ("Built-in" vs "MCP: filesystem"). +""" + +from __future__ import annotations + +import re +from typing import Any + +from backend_service.mcp.client import McpClient, McpClientError, McpToolDescriptor +from backend_service.tools import BaseTool, StructuredToolOutput + + +# MCP tool names can include slashes / colons that aren't legal in +# OpenAI function-calling identifiers. Sanitise to a safe identifier +# while keeping a deterministic mapping back to the original. +_NAME_SAFE_RE = re.compile(r"[^A-Za-z0-9_-]+") + + +def _safe_name(server_id: str, tool_name: str) -> str: + """Build a registry-safe name. 
Format: `mcp____`.""" + safe_server = _NAME_SAFE_RE.sub("_", server_id).strip("_") or "server" + safe_tool = _NAME_SAFE_RE.sub("_", tool_name).strip("_") or "tool" + return f"mcp__{safe_server}__{safe_tool}" + + +class McpTool(BaseTool): + """One MCP tool wrapped as a backend-native `BaseTool`.""" + + def __init__(self, client: McpClient, descriptor: McpToolDescriptor) -> None: + self._client = client + self._descriptor = descriptor + self._safe_name = _safe_name(descriptor.server_id, descriptor.name) + + @property + def name(self) -> str: + return self._safe_name + + @property + def description(self) -> str: + # Prefix the description with the server id so the UI can + # surface provenance even when the schema list is rendered + # without per-tool styling. + base = self._descriptor.description.strip() + suffix = f" (via MCP: {self._descriptor.server_id})" + if base: + return base + suffix + return f"Tool from MCP server '{self._descriptor.server_id}'" + + @property + def provenance(self) -> str: + """Phase 2.10: tag for the API surface + UI badging.""" + return f"mcp:{self._descriptor.server_id}" + + @property + def remote_name(self) -> str: + """The tool name on the remote server (before _safe_name munging).""" + return self._descriptor.name + + def parameters_schema(self) -> dict[str, Any]: + # MCP exposes JSON Schema directly under `inputSchema`. Pass + # through verbatim so the model sees the upstream-published + # shape. Default to a permissive object schema if the server + # left it empty. + return self._descriptor.input_schema or {"type": "object", "properties": {}} + + def execute(self, **kwargs: Any) -> str: + try: + return self._client.call_tool(self._descriptor.name, kwargs) + except McpClientError as exc: + # Surface the failure as text so the agent loop still has + # something to feed back to the model. Raising would + # require a more invasive change to the loop's error path. + return f"[MCP server '{self._descriptor.server_id}' error] {exc}" + + def execute_structured(self, **kwargs: Any) -> StructuredToolOutput | None: + """Phase 2.8: surface MCP content parts as structured output. + + MCP servers return a list of content parts under + ``result.content`` (text, image, embedded resources). When the + first part is an image we render it inline; when there's a + single text part we leave it for the legacy fallback so the UI + can still pick markdown / table renderers added later by tool + introspection. Multiple-part results render as markdown with + each part stringified. + """ + try: + raw = self._client.call_tool_raw(self._descriptor.name, kwargs) + except AttributeError: + # Older clients without the raw helper — just fall through + # to the plain text path. + return None + except McpClientError as exc: + return StructuredToolOutput( + text=f"[MCP server '{self._descriptor.server_id}' error] {exc}", + render_as="markdown", + ) + if not isinstance(raw, dict): + return None + content = raw.get("content") + if not isinstance(content, list) or not content: + return None + + # Single image part: render inline. + if len(content) == 1 and isinstance(content[0], dict) and content[0].get("type") == "image": + img = content[0] + data_uri = _image_part_to_data_uri(img) + if data_uri: + return StructuredToolOutput( + text=f"[image: {img.get('mimeType', 'image/png')}]", + render_as="image", + data={"src": data_uri, "alt": img.get("alt", "")}, + ) + + # Multiple parts or non-image: stringify into markdown so the + # UI shows each part with its own framing. 
+ from backend_service.mcp.client import _flatten_tool_result + + text = _flatten_tool_result(raw) + return StructuredToolOutput( + text=text, + render_as="markdown", + data={"markdown": text}, + ) + + +def _image_part_to_data_uri(part: dict[str, Any]) -> str | None: + """Convert an MCP image content part to a `data:` URI for inline render.""" + data = part.get("data") + if not isinstance(data, str) or not data: + return None + mime = part.get("mimeType") or "image/png" + return f"data:{mime};base64,{data}" diff --git a/backend_service/mlx_video_runtime.py b/backend_service/mlx_video_runtime.py index 346d170..5891ee1 100644 --- a/backend_service/mlx_video_runtime.py +++ b/backend_service/mlx_video_runtime.py @@ -49,20 +49,24 @@ ) -# Repos that route to mlx-video on Apple Silicon. Kept as a frozenset so -# the Setup page and tests can introspect the supported surface without -# importing the engine class. -# -# Only LTX-2 ships pre-converted MLX weights today — Wan paths go through -# diffusers MPS until we automate the ``mlx_video.models.wan_2.convert`` -# step. See module docstring for the staged plan. -_SUPPORTED_REPOS: frozenset[str] = frozenset({ +# Statically-supported repos. LTX-2 ships pre-converted on +# prince-canuma/LTX-2-* and routes through this set unconditionally. +# Wan-AI raw checkpoints become routable only when their converted MLX +# artifacts exist on disk (FU-025) — see ``supported_repos()`` for the +# dynamic union. +_LTX2_SUPPORTED_REPOS: frozenset[str] = frozenset({ "prince-canuma/LTX-2-distilled", "prince-canuma/LTX-2-dev", "prince-canuma/LTX-2.3-distilled", "prince-canuma/LTX-2.3-dev", }) +# Backwards-compatible alias. Tests + the Setup page used to import +# ``_SUPPORTED_REPOS`` directly; keep it pointing at the LTX-2 set so +# their assertions don't break. Callers that want the full dynamic +# (LTX-2 + converted-Wan) view should use ``supported_repos()``. +_SUPPORTED_REPOS: frozenset[str] = _LTX2_SUPPORTED_REPOS + # Maps repo prefix → mlx-video MODULE path (NOT the console-script alias). # Blaizzy/mlx-video declares ``mlx_video.ltx_2.generate`` and @@ -75,6 +79,11 @@ # this dict points at the real module path. _REPO_ENTRY_POINTS: dict[str, str] = { "prince-canuma/LTX-2": "mlx_video.models.ltx_2.generate", + # FU-025: Wan2.1/2.2 routes through the converted MLX dir. + # The CLI takes ``--model-dir `` rather than + # ``--model-repo ``; ``_build_wan_cmd`` resolves the + # converted dir from ``mlx_video_wan_convert.output_dir_for(repo)``. + "Wan-AI/": "mlx_video.models.wan_2.generate", } @@ -97,26 +106,59 @@ _LTX2_DISTILLED_STAGE_2_STEPS = 3 +def _converted_wan_repos() -> frozenset[str]: + """FU-025: Wan-AI repos whose converted MLX artifacts exist on disk. + + Defers the import of ``mlx_video_wan_convert`` so a missing helper + module (very unlikely; same package) doesn't bomb the whole + runtime. Each call rescans ``CONVERT_ROOT`` so newly-converted + weights show up without a process restart — the lookup is cheap + (one ``Path.iterdir`` plus per-entry stat checks). + """ + try: + from backend_service import mlx_video_wan_convert + except Exception: # noqa: BLE001 — defensive + return frozenset() + try: + return frozenset(s.repo for s in mlx_video_wan_convert.list_converted()) + except Exception: # noqa: BLE001 + return frozenset() + + def supported_repos() -> frozenset[str]: - """Repo ids the MLX video engine accepts. + """Repo ids the MLX video engine accepts (dynamic). 
+ + Returns the union of: + - LTX-2 pre-converted repos (always available when mlx-video is + installed) + - Wan-AI raw checkpoints whose ``mlx_video_wan_convert`` artifacts + exist on disk (FU-025). Exposed so the Setup page and tests can enumerate the supported set without importing the engine class (which would pull in the heavy ``video_runtime`` module and its torch-warmup side effects). """ - return _SUPPORTED_REPOS + return _LTX2_SUPPORTED_REPOS | _converted_wan_repos() def _is_mlx_video_repo(repo: str | None) -> bool: """Routing helper for the video manager. - Returns ``True`` only for repos mlx-video supports natively. The - manager still consults ``MlxVideoEngine.probe()`` before dispatching - — a supported repo on an Intel Mac must fall through to diffusers. + Returns ``True`` only for repos mlx-video supports natively at this + moment. The manager still consults ``MlxVideoEngine.probe()`` before + dispatching — a supported repo on an Intel Mac must fall through to + diffusers. """ if not repo: return False - return repo in _SUPPORTED_REPOS + return repo in supported_repos() + + +def _is_wan_repo(repo: str) -> bool: + """FU-025 dispatch helper. ``True`` for any Wan-AI repo whose + converted artifact exists on disk; the engine then routes through + ``_build_wan_cmd`` instead of the LTX-2 builder.""" + return repo.startswith("Wan-AI/") and repo in _converted_wan_repos() def _resolve_entry_point(repo: str) -> str: @@ -455,6 +497,20 @@ def generate( f"{output_path}. Check the subprocess log above." ) data = output_path.read_bytes() + is_wan = _is_wan_repo(config.repo) + runtime_note = ( + self._wan_runtime_note(config.repo) + if is_wan + else _ltx2_runtime_note(config.repo) + ) + effective_steps = ( + config.steps if is_wan + else _ltx2_effective_steps(config.repo, config.steps) + ) + effective_guidance = ( + config.guidance if is_wan + else _ltx2_effective_guidance(config.repo, config.guidance) + ) return GeneratedVideo( seed=resolved_seed, bytes=data, @@ -466,9 +522,9 @@ def generate( width=config.width, height=config.height, runtimeLabel=self.runtime_label, - runtimeNote=_ltx2_runtime_note(config.repo), - effectiveSteps=_ltx2_effective_steps(config.repo, config.steps), - effectiveGuidance=_ltx2_effective_guidance(config.repo, config.guidance), + runtimeNote=runtime_note, + effectiveSteps=effective_steps, + effectiveGuidance=effective_guidance, ) finally: shutil.rmtree(workspace, ignore_errors=True) @@ -485,12 +541,13 @@ def _build_cmd( """Compose the ``python -m mlx_video. --...`` invocation. Split out so tests can assert the CLI shape without spawning a - real subprocess. Flags mirror Blaizzy/mlx-video's - ``mlx_video.models.ltx_2.generate`` argparse surface — note the - names differ from diffusers conventions: ``--model-repo`` (not - ``--model``), ``--cfg-scale`` (not ``--guidance``), - ``--output-path`` (not ``--output``). + real subprocess. Wan-AI repos route to ``_build_wan_cmd`` + because the Wan generate CLI takes ``--model-dir `` and a different flag set than LTX-2's + ``--model-repo``/``--pipeline``/``--cfg-scale``. 
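+
+        For comparison, a Wan dispatch produces a command shaped
+        roughly like this (wrapped for readability; sizes, paths and
+        the prompt are illustrative)::
+
+            python -m mlx_video.models.wan_2.generate
+                --model-dir ~/.chaosengine/mlx-video-wan/Wan-AI__Wan2.1-T2V-1.3B
+                --prompt "..." --num-frames 49 --height 480 --width 832
+                --output-path out.mp4 --guide-scale 5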
""" + if _is_wan_repo(config.repo): + return self._build_wan_cmd(config, output_path) entry = _resolve_entry_point(config.repo) python = _resolve_video_python() pipeline_flag = _resolve_pipeline_flag(config.repo) @@ -535,13 +592,68 @@ def _build_cmd( cmd.extend(["--spatial-upscaler", str(spatial_upscaler)]) # STG (Spatial-Temporal Guidance) is mlx-video's built-in quality # lever — perturbs final transformer blocks during sampling to - # reduce object breakup / chroma drift. Default 1.0 mirrors the - # upstream README's quality recommendation. This closes the FU-013 - # gap for the mlx-video path (still pending for the diffusers - # LTX path on CUDA / non-Apple-Silicon hosts). - cmd.extend(["--stg-scale", "1.0"]) + # reduce object breakup / chroma drift. Value comes from + # ``VideoGenerationConfig.stgScale``: 1.0 matches Blaizzy's + # upstream README recommendation, 0.0 disables the perturbed + # forward pass and frees ~33 % wall time per step. Distilled + # pipelines ignore the flag (fixed sampler). + cmd.extend(["--stg-scale", str(config.stgScale)]) + return cmd + + def _build_wan_cmd( + self, + config: VideoGenerationConfig, + output_path: Path, + ) -> list[str]: + """FU-025: Wan2.1/2.2 generate CLI is shaped differently than + LTX-2 (``--model-dir`` instead of ``--model-repo``, no + ``--pipeline``, no ``--cfg-scale`` / ``--fps``, single + ``--guide-scale`` string that can carry a low,high pair). + + The converted MLX dir comes from + ``mlx_video_wan_convert.output_dir_for(repo)`` — runtime + resolution is centralised so a future change to the convert + layout doesn't fragment across builders. + """ + from backend_service import mlx_video_wan_convert + + entry = _resolve_entry_point(config.repo) + python = _resolve_video_python() + model_dir = mlx_video_wan_convert.output_dir_for(config.repo) + cmd = [ + python, + "-m", entry, + "--model-dir", str(model_dir), + "--prompt", config.prompt, + "--num-frames", str(config.numFrames), + "--height", str(config.height), + "--width", str(config.width), + "--output-path", str(output_path), + # Wan generate accepts a string ``low,high`` pair; pass the + # configured guidance as a single float and let upstream + # default to balanced when it's the canonical 5.0/3.0 pair. + "--guide-scale", f"{config.guidance:g}", + ] + if config.steps and config.steps > 0: + cmd.extend(["--steps", str(config.steps)]) + if config.negativePrompt: + cmd.extend(["--negative-prompt", config.negativePrompt]) + if config.seed is not None: + cmd.extend(["--seed", str(config.seed)]) + if config.scheduler and config.scheduler in {"unipc", "euler", "dpm++"}: + cmd.extend(["--scheduler", config.scheduler]) return cmd + def _wan_runtime_note(self, repo: str) -> str: + from backend_service.mlx_video_wan_convert import output_dir_for, status_for + + status = status_for(repo) + suffix = " (MoE high+low noise experts)" if status.hasMoeExperts else "" + return ( + f"mlx-video subprocess (MLX native, Wan2.x{suffix}, " + f"converted at {output_dir_for(repo).name})" + ) + def _launch( self, cmd: list[str], diff --git a/backend_service/mlx_video_wan_convert.py b/backend_service/mlx_video_wan_convert.py new file mode 100644 index 0000000..dfacd73 --- /dev/null +++ b/backend_service/mlx_video_wan_convert.py @@ -0,0 +1,307 @@ +"""mlx-video Wan2.1/2.2 weight conversion (FU-025). + +Wraps ``mlx_video.models.wan_2.convert.convert_wan_checkpoint`` (and its +``python -m`` CLI entrypoint) so ChaosEngineAI can promote raw HF Wan +repos to mlx-video's native MLX format. Closes FU-009 Wan branch. 
+ +UPSTREAM +-------- +Blaizzy/mlx-video ships ``mlx_video/models/wan_2/convert.py`` with both +a ``convert_wan_checkpoint(checkpoint_dir, output_dir, ...)`` function +and a CLI module entry. This wrapper invokes the CLI as a subprocess so +the long-running conversion (5-30 min depending on model size) doesn't +block the FastAPI worker thread. The CLI flags we forward: + +* ``--checkpoint-dir`` — raw HF Wan repo path +* ``--output-dir`` — converted MLX dir +* ``--dtype {float16, bfloat16, float32}`` +* ``--model-version {2.1, 2.2, auto}`` +* ``--quantize --bits {4,8} --group-size {32,64,128}`` (optional) + +LAYOUT +------ +Converted weights land under +``~/.chaosengine/mlx-video-wan/<slug>/`` where ``<slug>`` is +the HF repo id with ``/`` replaced by ``__`` so the directory is a +single path component. Each output directory contains: + +* ``models_t5_umt5-xxl-enc-bf16.safetensors`` (text encoder) +* ``Wan2.1_VAE.safetensors`` (VAE) +* ``transformer*.safetensors`` (Wan2.1 single transformer) OR + ``high_noise_model/`` + ``low_noise_model/`` subdirs (Wan2.2 MoE) +* ``config.json`` (model metadata) + +SCOPE +----- +This module ships the CONVERSION foundation: install detection, +supported-repo set, output-path convention, status inspection, and the +subprocess invocation. Runtime routing (so generate calls dispatch to +mlx-video for converted Wan repos) is deferred to a follow-up. +""" + +from __future__ import annotations + +import importlib.util +import logging +import os +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path + +LOG = logging.getLogger("chaosengine.mlx-video-wan") + + +def _resolve_convert_root() -> Path: + override = os.environ.get("CHAOSENGINE_MLX_VIDEO_WAN_DIR") + if override: + return Path(override).expanduser() + return Path.home() / ".chaosengine" / "mlx-video-wan" + + +# Public so callers (tests, setup endpoints) can introspect the path +# without importing private state. +CONVERT_ROOT: Path = _resolve_convert_root() + + +# Raw Wan-AI checkpoints the upstream convert script supports. These +# are NOT the ``-Diffusers`` mirrors used by the diffusers MPS path — +# the convert script expects raw Wan format +# (``models_t5_umt5-xxl-enc-bf16.pth`` + ``Wan2.1_VAE.pth`` + transformer +# safetensors at the directory root). Mirror repos go through the +# diffusers code path regardless of conversion state.
+SUPPORTED_RAW_REPOS: frozenset[str] = frozenset({ + "Wan-AI/Wan2.1-T2V-1.3B", + "Wan-AI/Wan2.1-T2V-14B", + "Wan-AI/Wan2.2-TI2V-5B", + "Wan-AI/Wan2.2-T2V-A14B", + "Wan-AI/Wan2.2-I2V-A14B", +}) + + +@dataclass(frozen=True) +class WanConvertStatus: + """Snapshot of a converted Wan checkpoint on disk.""" + repo: str + converted: bool + outputDir: str + hasTransformer: bool + hasMoeExperts: bool + hasVae: bool + hasTextEncoder: bool + note: str | None = None + + def to_dict(self) -> dict[str, object]: + return { + "repo": self.repo, + "converted": self.converted, + "outputDir": self.outputDir, + "hasTransformer": self.hasTransformer, + "hasMoeExperts": self.hasMoeExperts, + "hasVae": self.hasVae, + "hasTextEncoder": self.hasTextEncoder, + "note": self.note, + } + + +def slug_for(repo: str) -> str: + """Filesystem-safe slug from an HF repo id (``/`` → ``__``).""" + return repo.replace("/", "__") + + +def output_dir_for(repo: str) -> Path: + """Convention path where the converted MLX weights for ``repo`` land.""" + return CONVERT_ROOT / slug_for(repo) + + +def is_supported_raw_repo(repo: str | None) -> bool: + """Return ``True`` when the upstream convert script can handle ``repo``.""" + if not repo: + return False + return repo in SUPPORTED_RAW_REPOS + + +def is_mlx_video_available() -> bool: + """Cheap check for the upstream package without importing it.""" + return importlib.util.find_spec("mlx_video") is not None + + +def status_for(repo: str) -> WanConvertStatus: + """Inspect ``output_dir_for(repo)`` and report what's on disk. + + A repo is considered ``converted`` when the output dir exists AND + the VAE is present AND either: + - a single transformer file/dir exists (Wan2.1), or + - both MoE expert subdirs exist (Wan2.2 high_noise + low_noise). + Text encoder presence is reported separately because some users + convert transformer-only and reuse a shared text encoder. + """ + out = output_dir_for(repo) + if not out.exists(): + return WanConvertStatus( + repo=repo, + converted=False, + outputDir=str(out), + hasTransformer=False, + hasMoeExperts=False, + hasVae=False, + hasTextEncoder=False, + note="Output directory does not exist; conversion not run yet.", + ) + + # mlx-video upstream layout (verified 2026-05-04 against Wan2.1-T2V-1.3B): + # - Single-DiT (Wan2.1, Wan2.2 5B): model.safetensors at the root + # - MoE (Wan2.2 A14B): high_noise_model/ + low_noise_model/ subdirs + # - Text encoder: t5_encoder.safetensors at the root + # - VAE: vae.safetensors at the root + # The legacy `transformer*.safetensors` / `text_encoder*.safetensors` + # patterns stay as fallbacks in case upstream renames in a future cut. 
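+    # Illustrative healthy single-DiT layout (names per the notes above):
+    #   model.safetensors
+    #   t5_encoder.safetensors
+    #   vae.safetensors
+    #   config.json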
+ has_single_transformer = ( + (out / "model.safetensors").exists() + or any(out.glob("transformer*.safetensors")) + or (out / "transformer").is_dir() + ) + has_high = (out / "high_noise_model").is_dir() + has_low = (out / "low_noise_model").is_dir() + has_moe = has_high and has_low + + has_vae = ( + (out / "vae.safetensors").exists() + or (out / "Wan2.1_VAE.safetensors").exists() + or any(out.glob("vae*.safetensors")) + ) + has_text_encoder = ( + (out / "t5_encoder.safetensors").exists() + or any(out.glob("text_encoder*.safetensors")) + or any(out.glob("models_t5*.safetensors")) + or any(out.glob("umt5*.safetensors")) + ) + + converted = (has_single_transformer or has_moe) and has_vae + + note = None + if not converted: + missing = [] + if not (has_single_transformer or has_moe): + missing.append("transformer (single .safetensors or high_noise/low_noise dirs)") + if not has_vae: + missing.append("VAE") + note = f"Output dir exists but conversion incomplete; missing: {', '.join(missing)}." + + return WanConvertStatus( + repo=repo, + converted=converted, + outputDir=str(out), + hasTransformer=has_single_transformer or has_moe, + hasMoeExperts=has_moe, + hasVae=has_vae, + hasTextEncoder=has_text_encoder, + note=note, + ) + + +def list_converted() -> list[WanConvertStatus]: + """Return ``WanConvertStatus`` for every converted dir under + ``CONVERT_ROOT`` that maps back to a known supported repo. Useful + for the Setup page's "Available Wan MLX runtimes" listing.""" + if not CONVERT_ROOT.exists(): + return [] + out: list[WanConvertStatus] = [] + for entry in sorted(CONVERT_ROOT.iterdir()): + if not entry.is_dir(): + continue + repo = entry.name.replace("__", "/", 1) + if not is_supported_raw_repo(repo): + continue + status = status_for(repo) + if status.converted: + out.append(status) + return out + + +def run_convert( + checkpoint_dir: Path | str, + repo: str, + *, + dtype: str = "bfloat16", + model_version: str = "auto", + quantize: bool = False, + bits: int = 4, + group_size: int = 64, + timeout_seconds: int = 3600, + python_executable: str | None = None, +) -> WanConvertStatus: + """Run ``python -m mlx_video.models.wan_2.convert`` on a checkpoint. + + Output lands at ``output_dir_for(repo)`` (under ``CONVERT_ROOT``). + Returns the post-convert ``WanConvertStatus`` so the caller can + decide whether to surface a runtimeNote about partial conversion. + + Subprocess timeout defaults to 1 hour — large models (Wan2.2 A14B + at ~67 GB raw) can take 20-30 minutes to convert on M-series Macs; + 1 hour gives plenty of headroom without leaving the worker hung + indefinitely if the script wedges. + """ + if not is_supported_raw_repo(repo): + raise ValueError( + f"Unsupported Wan repo {repo!r}. " + f"Supported: {sorted(SUPPORTED_RAW_REPOS)}" + ) + + if not is_mlx_video_available(): + raise RuntimeError( + "mlx-video is not installed. Run " + "``pip install -e \".[mlx-video]\"`` (installs from git) first." + ) + + checkpoint_path = Path(checkpoint_dir).expanduser() + if not checkpoint_path.is_dir(): + raise FileNotFoundError( + f"Checkpoint dir not found: {checkpoint_path}. " + "Download the raw Wan repo first via " + "``huggingface-cli download ``." 
+ ) + + out = output_dir_for(repo) + out.parent.mkdir(parents=True, exist_ok=True) + + python_bin = python_executable or sys.executable + args = [ + python_bin, + "-m", "mlx_video.models.wan_2.convert", + "--checkpoint-dir", str(checkpoint_path), + "--output-dir", str(out), + "--dtype", dtype, + "--model-version", model_version, + ] + if quantize: + args.extend([ + "--quantize", + "--bits", str(bits), + "--group-size", str(group_size), + ]) + + LOG.info("Starting Wan convert: repo=%s args=%s", repo, " ".join(args)) + try: + result = subprocess.run( + args, + capture_output=True, + text=True, + timeout=timeout_seconds, + check=False, + ) + except subprocess.TimeoutExpired as exc: + tail = (exc.stderr or exc.stdout or "") + raise RuntimeError( + f"Wan convert timed out after {timeout_seconds}s for {repo}. " + f"Last output: {str(tail)[-500:]}" + ) from exc + + if result.returncode != 0: + tail = (result.stderr or result.stdout or "")[-800:] + raise RuntimeError( + f"Wan convert exited with code {result.returncode} for {repo}. " + f"Last output:\n{tail}" + ) + + return status_for(repo) diff --git a/backend_service/mlx_video_wan_installer.py b/backend_service/mlx_video_wan_installer.py new file mode 100644 index 0000000..920224d --- /dev/null +++ b/backend_service/mlx_video_wan_installer.py @@ -0,0 +1,351 @@ +"""mlx-video Wan installer (FU-025). + +End-to-end orchestration that downloads a raw Wan-AI checkpoint from +Hugging Face and runs ``mlx_video.models.wan_2.convert`` so the +``mlx_video_runtime`` engine can route the repo through the native MLX +subprocess. This is the bridge between the helper module +(``mlx_video_wan_convert``) and the Setup-page UX — same pattern as +``longlive_installer`` but Apple-Silicon-only and considerably smaller +in scope. + +Invocable two ways: + * In-process: ``from backend_service.mlx_video_wan_installer import install`` + * As a module: ``python -m backend_service.mlx_video_wan_installer + --repo Wan-AI/Wan2.1-T2V-1.3B`` (used by the FastAPI install + endpoint so the long-running convert stays out of the sidecar). +""" + +from __future__ import annotations + +import argparse +import os +import platform +import shutil +import subprocess +import sys +from pathlib import Path +from typing import Callable + +from backend_service.mlx_video_wan_convert import ( + SUPPORTED_RAW_REPOS, + is_mlx_video_available, + is_supported_raw_repo, + output_dir_for, + slug_for, + status_for, +) + + +# Where raw HF Wan checkpoints land before conversion. Kept under +# ``~/.chaosengine/mlx-video-wan-raw/`` so the converted artifacts and +# their source weights live under the same parent (easier for users to +# audit / clean up). Override with ``CHAOSENGINE_MLX_VIDEO_WAN_RAW_DIR``. +def _resolve_raw_root() -> Path: + override = os.environ.get("CHAOSENGINE_MLX_VIDEO_WAN_RAW_DIR") + if override: + return Path(override).expanduser() + return Path.home() / ".chaosengine" / "mlx-video-wan-raw" + + +RAW_ROOT: Path = _resolve_raw_root() + + +# Ordered phases. The async job worker walks this list to drive a +# percent counter; the in-process / CLI path uses it for log labels. +INSTALL_PHASES: tuple[str, ...] = ( + "preflight", # check Apple Silicon + mlx-video installed + repo supported + "download-raw", # snapshot raw Wan repo from HF (largest phase) + "convert", # python -m mlx_video.models.wan_2.convert + "verify", # status_for() must report converted=True +) + + +# Per-repo approximate size in GB (raw weights + headroom). 
Used by the +# preflight to surface a "free disk needed" hint, not enforced. +_APPROX_RAW_SIZE_GB: dict[str, float] = { + "Wan-AI/Wan2.1-T2V-1.3B": 3.5, + "Wan-AI/Wan2.1-T2V-14B": 28.0, + "Wan-AI/Wan2.2-TI2V-5B": 24.0, + "Wan-AI/Wan2.2-T2V-A14B": 67.0, + "Wan-AI/Wan2.2-I2V-A14B": 67.0, +} + + +class WanInstallError(RuntimeError): + """Raised when the installer cannot proceed (wrong platform, missing + package, unknown repo, download/convert failure).""" + + +def raw_dir_for(repo: str) -> Path: + """Local path where raw HF weights are downloaded for ``repo``.""" + return RAW_ROOT / slug_for(repo) + + +def approx_raw_size_gb(repo: str) -> float | None: + return _APPROX_RAW_SIZE_GB.get(repo) + + +def _noop_progress(_event: dict[str, object]) -> None: + """Default progress sink. The async job worker overrides with one + that updates ``_WAN_INSTALL_JOB`` shared state.""" + + +def _emit( + progress: Callable[[dict[str, object]], None], + *, + phase: str, + message: str, + ok: bool = True, + output: str | None = None, +) -> None: + payload: dict[str, object] = {"phase": phase, "ok": ok, "message": message} + if output is not None: + payload["output"] = output + progress(payload) + + +def _preflight(repo: str) -> None: + """Validate platform + package + repo before starting the heavy + download. Raises ``WanInstallError`` with an actionable message + otherwise.""" + system = platform.system() + if system != "Darwin": + raise WanInstallError( + "mlx-video Wan runtime is Apple Silicon only. " + f"Detected platform: {system}." + ) + if platform.machine() not in {"arm64", "aarch64"}: + raise WanInstallError( + "mlx-video Wan runtime requires an arm64 / aarch64 Mac. " + f"Detected machine: {platform.machine()}." + ) + if not is_mlx_video_available(): + raise WanInstallError( + "mlx-video is not installed. From the project root, run " + '``pip install -e ".[mlx-video]"`` and retry.' + ) + if not is_supported_raw_repo(repo): + raise WanInstallError( + f"Unsupported Wan repo {repo!r}. " + f"Supported: {sorted(SUPPORTED_RAW_REPOS)}" + ) + + +def _download_raw( + repo: str, + raw_dir: Path, + logger: Callable[[str], None], +) -> None: + """Snapshot the raw Wan repo to ``raw_dir`` via huggingface_hub.""" + raw_dir.parent.mkdir(parents=True, exist_ok=True) + logger(f"Downloading {repo} → {raw_dir}") + try: + from huggingface_hub import snapshot_download # type: ignore[import-untyped] + except ImportError as exc: + raise WanInstallError( + f"huggingface_hub is required to download raw Wan weights: {exc}. " + "Install it via ``pip install huggingface-hub``." + ) from exc + try: + snapshot_download( + repo_id=repo, + local_dir=str(raw_dir), + local_dir_use_symlinks=False, + ) + except Exception as exc: # noqa: BLE001 — surface any HF error as install error + raise WanInstallError( + f"Failed to download {repo}: {type(exc).__name__}: {exc}" + ) from exc + + +def _run_convert( + raw_dir: Path, + repo: str, + *, + dtype: str, + quantize: bool, + bits: int, + group_size: int, + timeout_seconds: int, + python_executable: str, + logger: Callable[[str], None], +) -> None: + """Spawn ``python -m mlx_video.models.wan_2.convert`` and stream its + stdout into ``logger``. 
Bypasses ``mlx_video_wan_convert.run_convert`` + so we can stream output line-by-line for the progress UI rather than + capturing the whole thing at the end of the run.""" + out = output_dir_for(repo) + out.parent.mkdir(parents=True, exist_ok=True) + + args = [ + python_executable, + "-m", "mlx_video.models.wan_2.convert", + "--checkpoint-dir", str(raw_dir), + "--output-dir", str(out), + "--dtype", dtype, + "--model-version", "auto", + ] + if quantize: + args.extend([ + "--quantize", + "--bits", str(bits), + "--group-size", str(group_size), + ]) + + logger(f"$ {' '.join(args)}") + try: + process = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + ) + except FileNotFoundError as exc: + raise WanInstallError( + f"Failed to spawn convert subprocess: {exc}. " + "Verify the Python interpreter path is correct." + ) from exc + + assert process.stdout is not None + for line in process.stdout: + stripped = line.rstrip() + if stripped: + logger(stripped) + + rc = process.wait(timeout=timeout_seconds) + if rc != 0: + raise WanInstallError( + f"Convert subprocess exited with code {rc}. " + "Last lines of output appear in the install log above." + ) + + +def install( + repo: str, + *, + dtype: str = "bfloat16", + quantize: bool = False, + bits: int = 4, + group_size: int = 64, + timeout_seconds: int = 3600, + keep_raw: bool = True, + logger: Callable[[str], None] = print, + progress: Callable[[dict[str, object]], None] = _noop_progress, + python_executable: str | None = None, +) -> None: + """Run the full Wan install: preflight → download raw → convert → verify. + + Raises ``WanInstallError`` on any failure. ``progress`` receives a + structured event per phase so the FastAPI job worker can surface + progress to the UI; the CLI path uses the no-op sink. + + ``keep_raw=False`` deletes the raw HF download after successful + conversion to free disk space (Wan2.2 A14B raw is ~67 GB; after + convert the raw weights aren't referenced again until a future + re-conversion). + """ + py = python_executable or sys.executable + + _emit(progress, phase="preflight", message=f"Checking platform + package for {repo}") + _preflight(repo) + + raw_dir = raw_dir_for(repo) + _emit( + progress, + phase="download-raw", + message=( + f"Downloading raw {repo} (~{approx_raw_size_gb(repo) or '?'} GB) → {raw_dir}" + ), + ) + _download_raw(repo, raw_dir, logger) + + _emit( + progress, + phase="convert", + message=f"Converting to MLX format → {output_dir_for(repo)}", + ) + _run_convert( + raw_dir, + repo, + dtype=dtype, + quantize=quantize, + bits=bits, + group_size=group_size, + timeout_seconds=timeout_seconds, + python_executable=py, + logger=logger, + ) + + _emit(progress, phase="verify", message="Verifying converted output") + status = status_for(repo) + if not status.converted: + raise WanInstallError( + f"Convert finished but output dir is incomplete: " + f"{status.note or 'unknown reason'}" + ) + + if not keep_raw: + logger(f"Cleaning raw download at {raw_dir}") + shutil.rmtree(raw_dir, ignore_errors=True) + + logger( + f"Wan install complete: {repo} converted at {status.outputDir}" + ) + + +# ---------------------------------------------------------------------- +# CLI entrypoint — used by the FastAPI install endpoint to spawn this +# module as a subprocess so a long-running convert stays out of the +# sidecar process. Mirror longlive_installer's pattern. 
+# ---------------------------------------------------------------------- + + +def _build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description=( + "Install an mlx-video Wan model: download raw HF weights " + "and convert to MLX format." + ) + ) + parser.add_argument( + "--repo", + required=True, + help=f"Raw Wan-AI repo id. Supported: {sorted(SUPPORTED_RAW_REPOS)}", + ) + parser.add_argument("--dtype", default="bfloat16", choices=["float16", "float32", "bfloat16"]) + parser.add_argument("--quantize", action="store_true", help="Quantize transformer weights") + parser.add_argument("--bits", type=int, default=4, choices=[4, 8]) + parser.add_argument("--group-size", type=int, default=64, choices=[32, 64, 128]) + parser.add_argument( + "--timeout-seconds", type=int, default=3600, + help="Max wall-clock for the convert subprocess (default 1 hour).", + ) + parser.add_argument( + "--cleanup-raw", action="store_true", + help="Delete raw HF download after successful convert.", + ) + return parser + + +def main(argv: list[str] | None = None) -> int: + parser = _build_arg_parser() + args = parser.parse_args(argv) + try: + install( + args.repo, + dtype=args.dtype, + quantize=args.quantize, + bits=args.bits, + group_size=args.group_size, + timeout_seconds=args.timeout_seconds, + keep_raw=not args.cleanup_raw, + ) + except WanInstallError as exc: + print(f"ERROR: {exc}", file=sys.stderr) + return 1 + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/backend_service/mlx_worker.py b/backend_service/mlx_worker.py index e57e3a7..e9f45a1 100644 --- a/backend_service/mlx_worker.py +++ b/backend_service/mlx_worker.py @@ -1,11 +1,14 @@ from __future__ import annotations +import base64 +import binascii import importlib.util import io import json import os import re import sys +import tempfile import time import traceback from pathlib import Path @@ -15,6 +18,8 @@ RAW_REASONING_HEADING_RE, ThinkingTokenFilter, ThinkingStreamResult, + reasoning_delimiters_for, + strip_harmony_boilerplate, strip_thinking_tokens as _strip_thinking_tokens, ) @@ -81,104 +86,105 @@ def _sanitize_messages(messages: list[dict[str, str]]) -> list[dict[str, str]]: return sanitized _TRANSCRIPT_ROLE_LINE_RE = re.compile(r"^\s*(SYSTEM|USER|ASSISTANT):\s*(.*)$", re.IGNORECASE) -_RAW_THINKING_HEADING_RE = RAW_REASONING_HEADING_RE +# Phase 2.0.5-F: RunawayGuard now lives in `backend_service.runaway_guard` +# so the llama.cpp stream loop in `state.py` can use the same detector. Re- +# export the symbol here so existing callers / tests keep working without +# import-path churn. +from backend_service.runaway_guard import RunawayGuard # noqa: E402,F401 -_REASONING_LINE_RE = re.compile( - r"^\s*(?:" - r"wait,|okay[,.]|actually[,.]|let me|i (?:need to|should|will|must|can)" - r"|so (?:i |the )|hmm|looking|check(?:ing)?|(?:re)?evaluat" - r"|draft(?:ing)?|refin(?:ing|e)|final (?:check|answer|decision|polish)" - r")", - re.IGNORECASE, -) - - -class RunawayGuard: - """Detect and abort runaway generation loops in streamed output. - - Catches three failure modes: - 1. Repeated identical lines (e.g. "Wait, I will write 'Qwen3.5'." x100) - 2. Near-duplicate reasoning loops (lines starting with "Wait," / "Okay," etc.) - 3. Raw thinking-heading dumps (e.g. "Thinking Process:" at generation start) - Raises ``RuntimeError`` when a runaway is detected. 
+def _extract_top_logprobs( + response: Any, + tokenizer: Any, + top_k: int, +) -> list[dict[str, Any]] | None: + """Phase 3.3 follow-up: extract top-k logprob entries from an + mlx-lm GenerationResponse for the just-emitted token. + + Returns a list with a single entry shaped like the OpenAI + `logprobs.content[]` payload — token + logprob + alternatives — + so the frontend overlay treats MLX and llama-server output + identically. Returns None on any failure (missing logprobs, + unsupported tensor shape, etc.) — logprobs are diagnostic, not + correctness-critical. """ + if top_k <= 0: + return None + logprobs = getattr(response, "logprobs", None) + chosen_token_id = getattr(response, "token", None) + if logprobs is None or chosen_token_id is None: + return None + try: + import numpy as np # noqa: WPS433 — keep import lazy - def __init__( - self, - *, - min_line_length: int = 30, - max_repeats: int = 4, - max_reasoning_lines: int = 20, - ) -> None: - self._min_line_length = min_line_length - self._max_repeats = max_repeats - self._max_reasoning_lines = max_reasoning_lines - self._buffer = "" - self._last_line: str | None = None - self._repeat_count = 0 - self._reasoning_streak = 0 - self._total_chars = 0 - self._thinking_heading_seen = False - - def feed(self, text: str) -> None: - """Feed a chunk of streamed text. Raises on detected runaway.""" - self._total_chars += len(text) - self._buffer += text - - # Check for raw thinking heading at the start of generation - if not self._thinking_heading_seen and self._total_chars < 200: - if _RAW_THINKING_HEADING_RE.search(self._buffer): - self._thinking_heading_seen = True - - # Check for repeated / reasoning lines - while "\n" in self._buffer: - line, self._buffer = self._buffer.split("\n", 1) - self._check_line(line) + arr = np.array(logprobs, dtype=np.float32) + if arr.ndim != 1 or arr.size == 0: + return None + # argpartition gets top-k unsorted; sort just the slice. + k = min(int(top_k), int(arr.size)) + if k >= int(arr.size): + top_idx = np.argsort(-arr) + else: + partial = np.argpartition(-arr, k - 1)[:k] + top_idx = partial[np.argsort(-arr[partial])] + alternatives: list[dict[str, Any]] = [] + for token_id in top_idx[:k].tolist(): + try: + token_text = tokenizer.decode([int(token_id)]) + except Exception: + token_text = "" + alternatives.append({ + "token": token_text, + "logprob": float(arr[token_id]), + }) + try: + chosen_text = tokenizer.decode([int(chosen_token_id)]) + except Exception: + chosen_text = "" + chosen_logprob: float | None + try: + chosen_logprob = float(arr[int(chosen_token_id)]) + except Exception: + chosen_logprob = None + return [{ + "token": chosen_text, + "logprob": chosen_logprob, + "alternatives": alternatives, + }] + except Exception: + return None - def flush(self) -> None: - if self._buffer: - self._check_line(self._buffer) - self._buffer = "" - @property - def saw_thinking_heading(self) -> bool: - return self._thinking_heading_seen - - def _check_line(self, line: str) -> None: - normalized = " ".join(line.strip().lower().split()) - if len(normalized) < self._min_line_length: - # Short lines still decay the reasoning streak so alternating - # "Wait, ..." / "31536000 seconds." patterns get caught. - self._reasoning_streak = max(0, self._reasoning_streak - 1) - return +def _build_mlx_sampler(request: dict[str, Any]) -> Any: + """Phase 2.2: build an mlx-lm sampler with whichever Phase 2.2 sampler + overrides the installed `make_sampler` actually supports. 
- # Exact-match repetition - if normalized == self._last_line: - self._repeat_count += 1 - else: - self._last_line = normalized - self._repeat_count = 1 + `mlx_lm.sample_utils.make_sampler` has gained kwargs across versions + (top_p, top_k, min_p, ...). Call sites used to pass `temp` only — we + now collect the request's `samplers` block and forward whatever + survives a signature filter, so newer mlx-lm builds get the full + sampler chain while older builds fall back gracefully. + """ + import inspect - if self._repeat_count >= self._max_repeats: - raise RuntimeError( - "Stopped runaway generation: model is repeating itself." - ) + from mlx_lm.sample_utils import make_sampler - # Near-duplicate reasoning loop detection - # Lines like "Wait, I should...", "Okay, I'll...", "Actually, looking..." - # Non-reasoning lines decay the streak by 1 instead of resetting, - # so alternating "Wait, ..." / "31536000 seconds." still trips the guard. - if _REASONING_LINE_RE.match(normalized): - self._reasoning_streak += 2 - else: - self._reasoning_streak = max(0, self._reasoning_streak - 1) + kwargs: dict[str, Any] = {"temp": float(request.get("temperature") or 0.0)} + samplers = request.get("samplers") or {} + if isinstance(samplers, dict): + for src in ("top_p", "top_k", "min_p"): + value = samplers.get(src) + if value is not None: + kwargs[src] = value - if self._reasoning_streak >= self._max_reasoning_lines: - raise RuntimeError( - "Stopped runaway generation: model is stuck in a reasoning loop." - ) + try: + sig = inspect.signature(make_sampler) + allowed = set(sig.parameters.keys()) + filtered = {k: v for k, v in kwargs.items() if k in allowed} + except (TypeError, ValueError): + filtered = {"temp": kwargs["temp"]} + return make_sampler(**filtered) def _format_tools_for_prompt(tools: list[dict[str, Any]] | None) -> str | None: @@ -317,7 +323,19 @@ def _build_prompt_text( history: list[dict[str, Any]], prompt: str, system_prompt: str | None, + model_ref: str | None = None, ) -> tuple[str, str | None]: + # Phase 3.8: detect chat-template quirks at render time and apply + # the matching auto-fix. Today: Gemma family rejects the system role + # entirely, so we fold the system prompt into the first user message + # before handing off to apply_chat_template. The report's + # `to_runtime_note()` surfaces the fix to the UI's substrate badge. 
+ from backend_service.helpers.chat_template import ( + fold_system_into_first_user, + inspect_chat_template, + is_gemma_family, + ) + messages: list[dict[str, str]] = [] if system_prompt: messages.append({"role": "system", "content": system_prompt}) @@ -329,19 +347,25 @@ def _build_prompt_text( messages.append({"role": "user", "content": prompt}) messages = _sanitize_messages(messages) + template_note: str | None = None + if is_gemma_family(model_ref): + messages = fold_system_into_first_user(messages) + report = inspect_chat_template(getattr(tokenizer, "chat_template", None), model_ref) + template_note = report.to_runtime_note() + apply_template = getattr(tokenizer, "apply_chat_template", None) if callable(apply_template): try: rendered = apply_template(messages, tokenize=False, add_generation_prompt=True) if isinstance(rendered, str): - return rendered, None + return rendered, template_note except TypeError: try: rendered = apply_template(messages, add_generation_prompt=True) if isinstance(rendered, str): - return rendered, None + return rendered, template_note if isinstance(rendered, list): - return tokenizer.decode(rendered), None + return tokenizer.decode(rendered), template_note except Exception as exc: # pragma: no cover - exercised via fallback path below reason = str(exc).strip() or exc.__class__.__name__ return ( @@ -496,6 +520,15 @@ class WorkerState: def __init__(self) -> None: self.model = None self.tokenizer = None + # Multimodal (vision-language) state. ``processor`` is the HF + # AutoProcessor returned by mlx_vlm.load (image preprocessor + + # tokenizer). ``is_multimodal`` flips the generate path to + # ``_generate_multimodal`` / ``_stream_generate_multimodal`` + # which decode the chat ``images`` field into temp files and + # call ``mlx_vlm.generate`` / ``stream_generate``. Stays + # ``None`` / ``False`` for plain text-only mlx-lm models. + self.processor = None + self.is_multimodal = False self.config: dict[str, Any] | None = None self.cache_strategy = "native" self.cache_bits = 0 @@ -508,6 +541,17 @@ def __init__(self) -> None: self.tree_budget = 0 self._ddtree_draft = None # DFlashDraftModel for DDTree self._ddtree_target = None # target model loaded via dflash_mlx for DDTree + # FU-002: TriAttention MLX kv_budget. Number of KV positions kept + # per layer; older positions get scored + evicted by the + # apply_triattention_mlx compressor. ~2048 is the upstream default + # and matches the spike result on Qwen2.5-0.5B (2.6x speedup, + # identical output). + self.kv_budget = 2048 + # Bug 2 / Gemma 4 channel-token leak: track the currently loaded + # model ref so the reasoning split layer can pick model-specific + # delimiters via ``reasoning_delimiters_for``. Default + # (``...``) still applies when ``None``. + self._loaded_model_ref: str | None = None def handle(self, request: dict[str, Any]) -> dict[str, Any] | None: op = request.get("op") @@ -536,6 +580,10 @@ def load_model(self, request: dict[str, Any]) -> dict[str, Any]: requested_cache_bits = int(request.get("cacheBits", 0)) requested_fp16_layers = int(request.get("fp16Layers", 0)) requested_fused_attention = bool(request.get("fusedAttention", False)) + # FU-002: kv_budget for the TriAttention MLX compressor. Ignored + # when cache_strategy != "triattention". Falls back to 2048 (the + # upstream default validated by scripts/spike_triattention_mlx.py). 
+ self.kv_budget = max(64, int(request.get("kvBudget", 2048))) self.context_tokens = int(request.get("contextTokens", 8192)) self.speculative_decoding = bool(request.get("speculativeDecoding", False)) dflash_draft_model = request.get("dflashDraftModel") @@ -656,10 +704,51 @@ def _heartbeat() -> None: heartbeat_thread = threading.Thread(target=_heartbeat, daemon=True) heartbeat_thread.start() + + # Multimodal branch: vision-capable repos (Gemma 4, Qwen2.5-VL, + # LLaVA family) load via mlx_vlm.load → ``(model, processor)``. + # The processor wraps the HF tokenizer so downstream code that + # reads ``self.tokenizer`` keeps working. When the multimodal + # extra isn't installed, fall back to mlx_lm.load with a + # runtimeNote so the user gets a clear "install mlx-vlm" hint. + from backend_service.helpers.chat_template import is_multimodal_family + multimodal_note: str | None = None + use_multimodal = is_multimodal_family(target) try: # Reject quantisation formats that MLX cannot dequantize. _reject_unsupported_quant(local_path) - self.model, self.tokenizer, self.config = load(local_path, return_config=True) + if use_multimodal: + try: + from mlx_vlm import load as mlx_vlm_load # type: ignore[import-untyped] + except ImportError as exc: + multimodal_note = ( + f"Vision model {target!r} requires mlx-vlm but the " + f"package isn't installed ({exc}). Falling back to " + "mlx_lm text-only load — image inputs will be ignored." + ) + use_multimodal = False + + if use_multimodal: + self.model, self.processor = mlx_vlm_load(local_path) + self.tokenizer = getattr(self.processor, "tokenizer", None) + # mlx_vlm.load doesn't return a config dict — read it from + # the snapshot directly so prompt-formatter + chat-template + # paths can still introspect (e.g. ``num_attention_heads`` + # for cache estimation). + config_path = Path(local_path) / "config.json" + if config_path.exists(): + try: + self.config = json.loads(config_path.read_text()) + except Exception: + self.config = {} + else: + self.config = {} + self.is_multimodal = True + else: + self.model, self.tokenizer, self.config = load(local_path, return_config=True) + self.processor = None + self.is_multimodal = False + self._loaded_model_ref = target finally: load_done.set() heartbeat_thread.join(timeout=0.5) @@ -731,6 +820,9 @@ def _heartbeat() -> None: def unload_model(self) -> dict[str, Any]: self.model = None self.tokenizer = None + self.processor = None + self.is_multimodal = False + self._loaded_model_ref = None self._dflash_generator = None self._dflash_target = None self._ddtree_draft = None @@ -782,6 +874,14 @@ def _apply_cache_profile( self.fp16_layers = 0 return None + # FU-002: TriAttention MLX path. Doesn't make a prompt_cache + # object — instead applies the compressor in-place to the loaded + # model so subsequent ``mlx_lm.generate`` calls run against the + # wrapped attention. Falls back to native on any failure (model + # missing, triattention unavailable, apply raises). + if self.cache_strategy == "triattention": + return self._apply_triattention_mlx_compressor() + preview_cache, note = self._make_cache() if preview_cache is not None: preview_cache = None @@ -795,6 +895,43 @@ def _apply_cache_profile( return note + def _apply_triattention_mlx_compressor(self) -> str | None: + """Apply ``apply_triattention_mlx`` to the loaded model in-place. + + Returns a runtimeNote describing what happened. On any failure + the worker falls back to the native cache so generation keeps + working without TriAttention. 
+ """ + if self.model is None: + self.cache_strategy = "native" + self.cache_bits = 0 + self.fp16_layers = 0 + return "TriAttention requested but no model is loaded; using native cache." + try: + from cache_compression import registry + except Exception as exc: + self.cache_strategy = "native" + return f"TriAttention failed to import strategy registry ({exc}); using native cache." + strategy = registry.get("triattention") + if strategy is None or not strategy.is_available(): + self.cache_strategy = "native" + return ( + "TriAttention is not available in this runtime " + "(install ``triattention`` + ``mlx_lm``); using native cache." + ) + try: + apply_compressor = getattr(strategy, "apply_mlx_compressor", None) + if apply_compressor is None: + raise AttributeError("strategy.apply_mlx_compressor missing") + apply_compressor(self.model, kv_budget=self.kv_budget) + except Exception as exc: + self.cache_strategy = "native" + return ( + f"TriAttention apply_mlx_compressor raised " + f"({type(exc).__name__}: {exc}); using native cache." + ) + return f"TriAttention MLX compressor applied (kv_budget={self.kv_budget})." + def _runtime_fields( self, *, @@ -866,6 +1003,15 @@ def _generate_dflash(self, request: dict[str, Any]) -> dict[str, Any]: # followed by a final ``{"event": "summary", ...}`` payload whose shape # matches what the old ``generate_dflash_once`` helper returned. summary: dict[str, Any] = {} + # Phase 3.1: per-token accepted-from-draft tracking. Tokens that + # share `cycles_completed` with the previous token are commits + # from the same DDTree cycle — the first is verifier-decoded, + # the rest are draft-accepted. Build a parallel list of + # (token_text, accepted: bool) so the UI can tint accepted runs. + per_token_accepted: list[bool] = [] + per_token_text: list[str] = [] + prev_cycle: int = -1 + prev_gen_count: int = 0 for event in stream_dflash_generate( target_model=self._dflash_target or self.model, tokenizer=self.tokenizer, @@ -878,6 +1024,29 @@ def _generate_dflash(self, request: dict[str, Any]) -> dict[str, Any]: ): if event.get("event") == "summary": summary = dict(event) + continue + if event.get("event") != "token": + continue + cycle = int(event.get("cycles_completed") or 0) + gen_count = int(event.get("generated_tokens") or 0) + token_id = event.get("token_id") + if token_id is None: + continue + # First token of a new cycle (cycle increments) is + # verifier-decoded; subsequent tokens within the same + # cycle are draft-accepted. Cycle 0 (the initial seed + # token) is also verifier-decoded. + if gen_count <= prev_gen_count: + # Defensive — skip duplicates / out-of-order events. + continue + accepted = cycle == prev_cycle and prev_cycle > 0 + per_token_accepted.append(accepted) + try: + per_token_text.append(self.tokenizer.decode([int(token_id)])) + except Exception: + per_token_text.append("") + prev_cycle = cycle + prev_gen_count = gen_count gen_tokens = [int(token_id) for token_id in summary.get("generated_token_ids", [])] text = self.tokenizer.decode(gen_tokens).strip() if gen_tokens else "" @@ -885,10 +1054,15 @@ def _generate_dflash(self, request: dict[str, Any]) -> dict[str, Any]: # is enabled. XML tags are always processed regardless. 
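Worked through on toy events, the cycle-comparison rule in the loop above classifies tokens like this (values invented for illustration; real events carry more fields):

events = [
    {"cycles_completed": 1, "generated_tokens": 1},  # verifier (new cycle)
    {"cycles_completed": 1, "generated_tokens": 2},  # draft-accepted
    {"cycles_completed": 1, "generated_tokens": 3},  # draft-accepted
    {"cycles_completed": 2, "generated_tokens": 4},  # verifier (new cycle)
    {"cycles_completed": 2, "generated_tokens": 5},  # draft-accepted
]

flags, prev_cycle, prev_count = [], -1, 0
for ev in events:
    cycle, count = ev["cycles_completed"], ev["generated_tokens"]
    if count <= prev_count:
        continue  # duplicate / out-of-order event: skip defensively
    flags.append(cycle == prev_cycle and prev_cycle > 0)
    prev_cycle, prev_count = cycle, count

print(flags)  # [False, True, True, False, True]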
thinking_mode = request.get("thinkingMode") or "off" if text: - think_filter = ThinkingTokenFilter(detect_raw_reasoning=(thinking_mode != "off")) + _open_tag, _close_tag = reasoning_delimiters_for(self._loaded_model_ref) + think_filter = ThinkingTokenFilter( + detect_raw_reasoning=(thinking_mode != "off"), + open_tag=_open_tag, + close_tag=_close_tag, + ) result = think_filter.feed(text) flushed = think_filter.flush() - text = f"{result.text}{flushed.text}".strip() + text = strip_harmony_boilerplate(f"{result.text}{flushed.text}".strip()) if not text: text = "Generation completed without decoded text." @@ -916,6 +1090,31 @@ def _generate_dflash(self, request: dict[str, Any]) -> dict[str, Any]: ), ) + # Phase 3.1: build run-length-encoded accepted spans from the + # per-token accepted bools. Each span has start (char offset + # into the rendered text), length (chars), and accepted (bool). + accepted_spans: list[dict[str, Any]] = [] + if per_token_accepted and per_token_text: + offset = 0 + run_start = 0 + run_kind = per_token_accepted[0] + for idx, accepted in enumerate(per_token_accepted): + tok_text = per_token_text[idx] if idx < len(per_token_text) else "" + if accepted != run_kind: + accepted_spans.append({ + "start": run_start, + "length": offset - run_start, + "accepted": run_kind, + }) + run_start = offset + run_kind = accepted + offset += len(tok_text) + accepted_spans.append({ + "start": run_start, + "length": offset - run_start, + "accepted": run_kind, + }) + return { "text": text, "finishReason": "stop", @@ -927,6 +1126,8 @@ def _generate_dflash(self, request: dict[str, Any]) -> dict[str, Any]: "peakMemoryGb": round(float(summary.get("peak_memory_gb") or 0.0), 3), "runtimeNote": runtime_note, "dflashAcceptanceRate": round(float(acceptance_rate), 2) if acceptance_rate is not None else None, + "acceptedSpans": accepted_spans, + "acceptedTokenText": "".join(per_token_text) if per_token_text else None, **self._runtime_fields(prompt_cache=None, speculative_decoding=True, tree_budget=0), } @@ -968,10 +1169,15 @@ def _generate_ddtree(self, request: dict[str, Any]) -> dict[str, Any]: # is enabled. XML tags are always processed regardless. thinking_mode = request.get("thinkingMode") or "off" if text: - think_filter = ThinkingTokenFilter(detect_raw_reasoning=(thinking_mode != "off")) + _open_tag, _close_tag = reasoning_delimiters_for(self._loaded_model_ref) + think_filter = ThinkingTokenFilter( + detect_raw_reasoning=(thinking_mode != "off"), + open_tag=_open_tag, + close_tag=_close_tag, + ) filter_result = think_filter.feed(text) flushed = think_filter.flush() - text = f"{filter_result.text}{flushed.text}".strip() + text = strip_harmony_boilerplate(f"{filter_result.text}{flushed.text}".strip()) if not text: text = "Generation completed without decoded text." @@ -997,6 +1203,12 @@ def _generate_ddtree(self, request: dict[str, Any]) -> dict[str, Any]: "peakMemoryGb": 0.0, "runtimeNote": runtime_note, "dflashAcceptanceRate": round(float(acceptance_rate), 2) if acceptance_rate else None, + # Phase 3.1 follow-up: DDTree path now ships accepted-span + # data alongside the linear DFLASH path so the frontend + # AcceptedTokenOverlay tints draft-accepted ranges for + # both speculative-decode strategies. 
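For reference, the span builder above reduces those per-token flags to character ranges like this (toy values again):

flags = [False, True, True, False]
texts = ["The", " quick", " fox", "."]

spans, offset, run_start, run_kind = [], 0, 0, flags[0]
for accepted, tok in zip(flags, texts):
    if accepted != run_kind:
        spans.append({"start": run_start, "length": offset - run_start, "accepted": run_kind})
        run_start, run_kind = offset, accepted
    offset += len(tok)
spans.append({"start": run_start, "length": offset - run_start, "accepted": run_kind})

print(spans)
# [{'start': 0, 'length': 3, 'accepted': False},
#  {'start': 3, 'length': 10, 'accepted': True},
#  {'start': 13, 'length': 1, 'accepted': False}]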
+ "acceptedSpans": result.get("accepted_spans") or [], + "acceptedTokenText": result.get("accepted_token_text"), **self._runtime_fields( prompt_cache=None, speculative_decoding=True, @@ -1008,6 +1220,15 @@ def generate(self, request: dict[str, Any]) -> dict[str, Any]: if self.model is None or self.tokenizer is None: raise RuntimeError("No MLX model is loaded.") + # Multimodal short-circuit: vision-capable models loaded via + # mlx_vlm always route through the multimodal generate path, + # whether or not the request carries an ``images`` field + # (mlx_vlm.generate accepts ``image=None`` for text-only turns). + # DFlash speculative decoding doesn't apply on the VLM branch + # because the draft-model registry doesn't ship multimodal drafts. + if self.is_multimodal: + return self._generate_multimodal(request) + # Use DDTree if tree budget is set and components are loaded if self.speculative_decoding and self.tree_budget > 0 and self._ddtree_draft is not None: try: @@ -1045,7 +1266,7 @@ def _generate_standard(self, request: dict[str, Any]) -> dict[str, Any]: prompt=str(request.get("prompt") or ""), system_prompt=system_prompt, ) - sampler = make_sampler(temp=float(request.get("temperature") or 0.0)) + sampler = _build_mlx_sampler(request) prompt_cache, runtime_note = self._make_cache() runtime_note = _merge_runtime_notes(runtime_note, prompt_note) runtime_fields = self._runtime_fields(prompt_cache=prompt_cache) @@ -1117,10 +1338,15 @@ def _generate_standard(self, request: dict[str, Any]) -> dict[str, Any]: raw_text = "".join(text_parts).strip() # Respect thinkingMode: only strip raw reasoning when thinking is on. thinking_mode = request.get("thinkingMode") or "off" - think_filter = ThinkingTokenFilter(detect_raw_reasoning=(thinking_mode != "off")) + _open_tag, _close_tag = reasoning_delimiters_for(self._loaded_model_ref) + think_filter = ThinkingTokenFilter( + detect_raw_reasoning=(thinking_mode != "off"), + open_tag=_open_tag, + close_tag=_close_tag, + ) filter_result = think_filter.feed(raw_text) flushed = think_filter.flush() - text = f"{filter_result.text}{flushed.text}".strip() + text = strip_harmony_boilerplate(f"{filter_result.text}{flushed.text}".strip()) if transcript_fallback: text, transcript_trimmed = _trim_transcript_continuation(text) if transcript_trimmed: @@ -1144,11 +1370,284 @@ def _generate_standard(self, request: dict[str, Any]) -> dict[str, Any]: **runtime_fields, } + # ------------------------------------------------------------------ + # Multimodal (vision-language) generation via mlx-vlm + # ------------------------------------------------------------------ + + @staticmethod + def _decode_images_to_paths( + images_b64: list[str], temp_dir: str + ) -> list[str]: + """Decode base64-encoded images into ``temp_dir`` and return paths. + + The chat payload sends each image as a raw base64 string (no + data-URL prefix — that's stripped client-side in + ``ChatComposer.tsx``). mlx-vlm's ``image=`` kwarg accepts a list + of file paths, so we materialise each blob to a temp file with + a deterministic suffix. + """ + paths: list[str] = [] + for index, blob in enumerate(images_b64 or []): + if not blob: + continue + try: + raw = base64.b64decode(blob, validate=False) + except (binascii.Error, ValueError): + # Skip malformed entries rather than aborting the whole + # generation — the model will still answer using text. 
+ continue + path = Path(temp_dir) / f"img_{index:03d}.png" + path.write_bytes(raw) + paths.append(str(path)) + return paths + + def _format_multimodal_prompt( + self, + request: dict[str, Any], + num_images: int, + ) -> str: + """Render the chat history into a single prompt string the + VLM tokenizer expects, accounting for ``num_images`` image + placeholders. Falls back to the plain-text prompt builder when + the processor doesn't expose ``apply_chat_template`` or the + helper raises (some VLMs ship templates that reject our + history shape). + """ + history = list(request.get("history") or []) + prompt = str(request.get("prompt") or "") + system_prompt = request.get("systemPrompt") + messages: list[dict[str, str]] = [] + if system_prompt: + messages.append({"role": "system", "content": str(system_prompt)}) + for message in history: + role = message.get("role") + if role not in {"system", "user", "assistant"}: + continue + messages.append( + {"role": role, "content": _normalize_message_content(message.get("text", ""))} + ) + messages.append({"role": "user", "content": prompt}) + messages = _sanitize_messages(messages) + + try: + from mlx_vlm.prompt_utils import apply_chat_template # type: ignore[import-untyped] + except ImportError: + return _fallback_chat_prompt(messages) + + try: + rendered = apply_chat_template( + self.processor, + self.config or {}, + messages, + add_generation_prompt=True, + num_images=num_images, + ) + except Exception: + return _fallback_chat_prompt(messages) + + if isinstance(rendered, str): + return rendered + if isinstance(rendered, list): + tokenizer = self.tokenizer + decoder = getattr(tokenizer, "decode", None) if tokenizer is not None else None + if callable(decoder): + try: + return decoder(rendered) + except Exception: + pass + return _fallback_chat_prompt(messages) + + def _vlm_generate_kwargs(self, request: dict[str, Any]) -> dict[str, Any]: + """Sampling kwargs accepted by ``mlx_vlm.generate`` / + ``stream_generate``. The VLM API takes ``temperature`` and + ``top_p`` directly (no separate sampler factory like mlx-lm), + so we forward only the knobs that map cleanly. Missing fields + fall back to the underlying mlx-vlm defaults. + """ + kwargs: dict[str, Any] = { + "max_tokens": int(request.get("maxTokens") or 256), + } + temperature = request.get("temperature") + if temperature is not None: + try: + kwargs["temperature"] = float(temperature) + except (TypeError, ValueError): + pass + top_p = request.get("topP") + if top_p is not None: + try: + kwargs["top_p"] = float(top_p) + except (TypeError, ValueError): + pass + return kwargs + + def _generate_multimodal(self, request: dict[str, Any]) -> dict[str, Any]: + """Synchronous mlx-vlm generation. Decodes any attached images, + runs ``mlx_vlm.generate``, applies the thinking-token filter, + and returns the same response shape as ``_generate_standard``. + """ + try: + from mlx_vlm import generate as vlm_generate # type: ignore[import-untyped] + except ImportError as exc: + raise RuntimeError( + f"mlx-vlm is not installed but a multimodal model is loaded: {exc}. " + "Install via ``pip install mlx-vlm``." 
+ ) from exc + + images_b64 = list(request.get("images") or []) + kwargs = self._vlm_generate_kwargs(request) + + with tempfile.TemporaryDirectory(prefix="chaosengine-mm-") as tmpdir: + image_paths = self._decode_images_to_paths(images_b64, tmpdir) + prompt_text = self._format_multimodal_prompt(request, num_images=len(image_paths)) + if image_paths: + result = vlm_generate( + self.model, self.processor, prompt_text, + image=image_paths, **kwargs, + ) + else: + result = vlm_generate( + self.model, self.processor, prompt_text, **kwargs, + ) + + raw_text = getattr(result, "text", None) or str(result) + thinking_mode = request.get("thinkingMode") or "off" + _open_tag, _close_tag = reasoning_delimiters_for(self._loaded_model_ref) + think_filter = ThinkingTokenFilter( + detect_raw_reasoning=(thinking_mode != "off"), + open_tag=_open_tag, + close_tag=_close_tag, + ) + filter_result = think_filter.feed(raw_text) + flushed = think_filter.flush() + text = strip_harmony_boilerplate(f"{filter_result.text}{flushed.text}".strip()) + if not text: + text = "Generation completed without decoded text." + + runtime_note = ( + f"Multimodal generation via mlx-vlm " + f"({len(image_paths)} image{'s' if len(image_paths) != 1 else ''})." + ) + + return { + "text": text, + "finishReason": getattr(result, "finish_reason", None) or "stop", + "promptTokens": int(getattr(result, "prompt_tokens", 0) or 0), + "completionTokens": int(getattr(result, "generation_tokens", 0) or 0), + "totalTokens": int( + (getattr(result, "prompt_tokens", 0) or 0) + + (getattr(result, "generation_tokens", 0) or 0) + ), + "tokS": round(float(getattr(result, "generation_tps", 0.0) or 0.0), 1), + "promptTokS": round(float(getattr(result, "prompt_tps", 0.0) or 0.0), 1), + "peakMemoryGb": round(float(getattr(result, "peak_memory", 0.0) or 0.0), 3), + "runtimeNote": runtime_note, + "cacheStrategy": "native", + "cacheBits": 0, + "fp16Layers": 0, + "fusedAttention": False, + "speculativeDecoding": False, + } + + def _stream_generate_multimodal(self, request: dict[str, Any]) -> None: + """Streaming mlx-vlm generation. Emits chunks via the standard + ``_emit`` protocol used by the text-only path so the caller + sees the same shape regardless of which engine produced the run. + """ + try: + from mlx_vlm import stream_generate as vlm_stream # type: ignore[import-untyped] + except ImportError as exc: + _emit({"error": ( + f"mlx-vlm is not installed but a multimodal model is loaded: {exc}. " + "Install via ``pip install mlx-vlm``." 
+ )}) + return + + images_b64 = list(request.get("images") or []) + kwargs = self._vlm_generate_kwargs(request) + thinking_mode = request.get("thinkingMode") or "off" + _open_tag, _close_tag = reasoning_delimiters_for(self._loaded_model_ref) + think_filter = ThinkingTokenFilter( + detect_raw_reasoning=(thinking_mode != "off"), + open_tag=_open_tag, + close_tag=_close_tag, + ) + + text_parts: list[str] = [] + completion_tokens = 0 + last_chunk: Any = None + + with tempfile.TemporaryDirectory(prefix="chaosengine-mm-") as tmpdir: + image_paths = self._decode_images_to_paths(images_b64, tmpdir) + prompt_text = self._format_multimodal_prompt(request, num_images=len(image_paths)) + if image_paths: + stream = vlm_stream( + self.model, self.processor, prompt_text, + image=image_paths, **kwargs, + ) + else: + stream = vlm_stream( + self.model, self.processor, prompt_text, **kwargs, + ) + + for chunk in stream: + last_chunk = chunk + chunk_text = chunk if isinstance(chunk, str) else ( + getattr(chunk, "text", None) or "" + ) + if not chunk_text: + continue + text_parts.append(chunk_text) + completion_tokens += 1 + filtered = think_filter.feed(chunk_text) + if filtered.text: + _emit({"ok": True, "chunk": {"text": filtered.text}}) + + flushed = think_filter.flush() + if flushed.text: + _emit({"ok": True, "chunk": {"text": flushed.text}}) + + runtime_note = ( + f"Multimodal stream via mlx-vlm " + f"({len(image_paths)} image{'s' if len(image_paths) != 1 else ''})." + ) + _emit({ + "ok": True, + "done": True, + "result": { + "finishReason": getattr(last_chunk, "finish_reason", None) or "stop", + "promptTokens": int(getattr(last_chunk, "prompt_tokens", 0) or 0), + "completionTokens": int( + getattr(last_chunk, "generation_tokens", 0) or completion_tokens + ), + "totalTokens": int( + (getattr(last_chunk, "prompt_tokens", 0) or 0) + + (getattr(last_chunk, "generation_tokens", 0) or completion_tokens) + ), + "tokS": round(float(getattr(last_chunk, "generation_tps", 0.0) or 0.0), 1), + "promptTokS": round(float(getattr(last_chunk, "prompt_tps", 0.0) or 0.0), 1), + "peakMemoryGb": round(float(getattr(last_chunk, "peak_memory", 0.0) or 0.0), 3), + "runtimeNote": runtime_note, + "cacheStrategy": "native", + "cacheBits": 0, + "fp16Layers": 0, + "fusedAttention": False, + "speculativeDecoding": False, + }, + }) + def stream_generate(self, request: dict[str, Any]) -> None: if self.model is None or self.tokenizer is None: raise RuntimeError("No MLX model is loaded.") + # Multimodal short-circuit (see ``generate`` for context). The + # streaming variant emits chunks via ``_emit`` so the caller + # protocol matches the text-only path exactly. + if self.is_multimodal: + self._stream_generate_multimodal(request) + return + speculative_stream_fallback_note = None # DFLASH/DDTree don't support token-level streaming natively, so # emit the full result as a single chunk in the streaming protocol. 
@@ -1233,7 +1732,7 @@ def stream_generate(self, request: dict[str, Any]) -> None: prompt=str(request.get("prompt") or ""), system_prompt=system_prompt, ) - sampler = make_sampler(temp=float(request.get("temperature") or 0.0)) + sampler = _build_mlx_sampler(request) prompt_cache, runtime_note = self._make_cache() runtime_note = _merge_runtime_notes(runtime_note, prompt_note) runtime_note = _merge_runtime_notes(runtime_note, speculative_stream_fallback_note) @@ -1241,11 +1740,20 @@ def stream_generate(self, request: dict[str, Any]) -> None: transcript_fallback = _plain_chat_fallback_active(prompt_note) thinking_mode = request.get("thinkingMode") or "off" - think_filter = ThinkingTokenFilter(detect_raw_reasoning=(thinking_mode != "off")) + _open_tag, _close_tag = reasoning_delimiters_for(self._loaded_model_ref) + think_filter = ThinkingTokenFilter( + detect_raw_reasoning=(thinking_mode != "off"), + open_tag=_open_tag, + close_tag=_close_tag, + ) transcript_filter = TranscriptLoopFilter() if transcript_fallback else None transcript_trimmed = False runaway_guard = RunawayGuard() runaway_stopped = False + # Phase 3.3 follow-up: when the request opted into logprobs, + # extract top-k per token via the helper and forward inline + # with each text chunk. + logprobs_top_k = int(request.get("logprobs") or 0) try: last_response = None @@ -1276,7 +1784,12 @@ def stream_generate(self, request: dict[str, Any]) -> None: if transcript_filter.stopped: transcript_trimmed = True if visible_text: - _emit({"ok": True, "chunk": {"text": visible_text}}) + chunk_payload: dict[str, Any] = {"text": visible_text} + if logprobs_top_k > 0: + entries = _extract_top_logprobs(response, self.tokenizer, logprobs_top_k) + if entries: + chunk_payload["tokenLogprobs"] = entries + _emit({"ok": True, "chunk": chunk_payload}) if transcript_filter is not None and transcript_filter.stopped: last_response = response break @@ -1306,7 +1819,12 @@ def stream_generate(self, request: dict[str, Any]) -> None: ) ) runtime_fields = self._runtime_fields(prompt_cache=None) - think_filter = ThinkingTokenFilter(detect_raw_reasoning=(thinking_mode != "off")) + _open_tag, _close_tag = reasoning_delimiters_for(self._loaded_model_ref) + think_filter = ThinkingTokenFilter( + detect_raw_reasoning=(thinking_mode != "off"), + open_tag=_open_tag, + close_tag=_close_tag, + ) transcript_filter = TranscriptLoopFilter() if transcript_fallback else None transcript_trimmed = False runaway_guard = RunawayGuard() diff --git a/backend_service/models/__init__.py b/backend_service/models/__init__.py index a47fe80..deb19df 100644 --- a/backend_service/models/__init__.py +++ b/backend_service/models/__init__.py @@ -48,6 +48,42 @@ class CreateSessionRequest(BaseModel): title: str | None = None +class AddVariantRequest(BaseModel): + """Phase 2.5: generate a sibling variant of an assistant message. + + The frontend calls this after the user picks an alternate model + from the assistant-message hover action. The chosen model must + already be the loaded runtime (call /api/models/load first if + needed). Backend runs a non-streaming generation using messages + truncated to the prior user prompt, then attaches the result as + a new entry on `messages[messageIndex].variants`. 
+ """ + + messageIndex: int = Field(ge=0) + modelRef: str = Field(min_length=1) + modelName: str = Field(min_length=1) + canonicalRepo: str | None = None + source: str = "catalog" + path: str | None = None + backend: str = "auto" + maxTokens: int = Field(default=2048, ge=1, le=32768) + temperature: float = Field(default=0.7, ge=0.0, le=2.0) + + +class ForkSessionRequest(BaseModel): + """Phase 2.4: fork a thread at a specific assistant message. + + `forkAtMessageIndex` is the 0-based index of the last message to + include in the fork — typically the assistant turn the user + wants to branch from. The fork keeps every message up to and + including this index, then becomes a fresh thread for divergent + continuation. + """ + + forkAtMessageIndex: int = Field(ge=0) + title: str | None = Field(default=None, max_length=200) + + class UpdateSessionRequest(BaseModel): title: str | None = None model: str | None = None @@ -57,6 +93,7 @@ class UpdateSessionRequest(BaseModel): modelPath: str | None = None modelBackend: str | None = None thinkingMode: Literal["off", "auto"] | None = None + reasoningEffort: Literal["low", "medium", "high"] | None = None pinned: bool | None = None cacheStrategy: str | None = None cacheBits: int | None = None @@ -68,6 +105,9 @@ class UpdateSessionRequest(BaseModel): treeBudget: int | None = None dflashDraftModel: str | None = None messages: list[dict[str, Any]] | None = None + # Phase 3.7: assign / unassign a session to a workspace. + # Pass empty string to clear; None leaves the value untouched. + workspaceId: str | None = None class GenerateRequest(BaseModel): @@ -82,9 +122,45 @@ class GenerateRequest(BaseModel): path: str | None = None backend: str = "auto" thinkingMode: Literal["off", "auto"] | None = None + # Phase 1.12: reasoning effort hint forwarded to OpenAI-compat + # `reasoning_effort` chat-completion parameter on backends that respect it + # (recent llama-server builds + several reasoning models). Backends that + # ignore it remain unaffected. Null means no override. + reasoningEffort: Literal["low", "medium", "high"] | None = None systemPrompt: str | None = None temperature: float = Field(default=0.7, ge=0.0, le=2.0) maxTokens: int = Field(default=4096, ge=1, le=32768) + # Optional per-message sampler overrides. None means "let backend default + # apply" (llama.cpp / mlx-lm defaults). Phase 2.2 closes the Phase 1.10 + # deferral and exposes the full sampler chain end-to-end. Each backend + # forwards what it supports and silently ignores the rest: + # - llama-server: all of these (native /v1/chat/completions params) + # - mlx-lm: temperature, topP, topK, minP, repeatPenalty, seed + # DRY / XTC are intentionally deferred — DRY ships in llama-server but + # is sensitive to context-length growth; XTC is too new to expose + # broadly. Free-form GBNF grammars are skipped in favour of the safer + # JSON-schema response format which covers most practical use cases. + topP: float | None = Field(default=None, ge=0.0, le=1.0) + topK: int | None = Field(default=None, ge=0, le=200) + minP: float | None = Field(default=None, ge=0.0, le=1.0) + repeatPenalty: float | None = Field(default=None, ge=0.0, le=2.0) + # Mirostat: mode 0 = off, 1 = mirostat v1, 2 = mirostat v2. tau is the + # target entropy; eta the learning rate. Pass None to use llama-server + # defaults; pass mode=0 to explicitly disable on a model whose template + # leaves it on. 
+ mirostatMode: Literal[0, 1, 2] | None = None + mirostatTau: float | None = Field(default=None, ge=0.0, le=10.0) + mirostatEta: float | None = Field(default=None, ge=0.0, le=1.0) + seed: int | None = Field(default=None, ge=0, le=2**31 - 1) + # Constrained decoding: when set, llama-server enforces a JSON schema + # via its `response_format: {type: "json_schema", json_schema: {...}}` + # parameter. The shape mirrors the OpenAI structured-outputs spec. + jsonSchema: dict[str, Any] | None = None + # Phase 3.3: when set, ask llama-server to return top-k logprobs per + # token. Gated behind an advanced-mode setting on the frontend so the + # bandwidth + render cost is only paid when explicitly requested. + # Pass None to omit (default — no logprobs returned). + logprobs: int | None = Field(default=None, ge=1, le=20) cacheStrategy: str | None = None cacheBits: int | None = Field(default=None, ge=0, le=8) fp16Layers: int | None = Field(default=None, ge=0, le=16) @@ -96,6 +172,16 @@ class GenerateRequest(BaseModel): # Agent tool-use enableTools: bool = False availableTools: list[str] | None = None # None = all registered tools + # Phase 2.12: when True, the modelRef / canonicalRepo / source / etc. + # in this request are treated as a one-turn override — the model + # loads (or stays) for this turn, but the session's stored + # `modelRef` / `model` / `canonicalRepo` / `modelSource` / + # `modelPath` / `modelBackend` fields are NOT updated. The session + # default sticks so the next plain message goes back to the + # original model. Default False preserves the existing behaviour + # where sending with a different model permanently switches the + # thread. + oneTurnOverride: bool = False class RemoteProviderRequest(BaseModel): @@ -107,6 +193,23 @@ class RemoteProviderRequest(BaseModel): providerType: str = "openai" +class McpServerConfigRequest(BaseModel): + """Phase 2.10: one MCP server entry for the settings payload. + + Maps onto `backend_service.mcp.McpServerConfig`. The shape mirrors + the standard mcp-clients config blob (`command`, `args`, `env`) so + config files copied from other MCP-aware tools work with minimal + edits. `id` is a short opaque key surfaced on tool provenance + badges. + """ + + id: str = Field(min_length=1, max_length=64) + command: str = Field(min_length=1, max_length=512) + args: list[str] | None = None + env: dict[str, str] | None = None + enabled: bool = True + + class UpdateSettingsRequest(BaseModel): modelDirectories: list[ModelDirectoryRequest] | None = None preferredServerPort: int | None = Field(default=None, ge=1024, le=65535) @@ -115,6 +218,11 @@ class UpdateSettingsRequest(BaseModel): autoStartServer: bool | None = None launchPreferences: LaunchPreferencesRequest | None = None remoteProviders: list[RemoteProviderRequest] | None = None + # Phase 2.10: list of MCP servers to spawn at startup. Each entry's + # `tools/list` output is merged into the agent tool registry with + # `provenance: mcp:` tags. None = leave existing list alone; + # empty list = remove all configured servers. + mcpServers: list[McpServerConfigRequest] | None = None huggingFaceToken: str | None = Field(default=None, max_length=512) dataDirectory: str | None = Field(default=None, max_length=4096) # Per-modality output overrides. Empty string clears the override and @@ -125,6 +233,10 @@ class UpdateSettingsRequest(BaseModel): # drive. Applied by the Tauri shell at backend spawn; requires restart # to take effect. Empty string clears the override. 
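The per-message sampler overrides declared on GenerateRequest above forward to llama-server with an omit-when-None rule: a None field is simply left out of the outgoing body so the server default applies. A small sketch of that rule (the wire parameter names are the standard llama-server sampling keys; this is not the actual backend forwarding code):

def sampler_params(req: dict[str, object]) -> dict[str, object]:
    # None means "leave the server default alone", so it is omitted entirely.
    mapping = {
        "topP": "top_p",
        "topK": "top_k",
        "minP": "min_p",
        "repeatPenalty": "repeat_penalty",
        "seed": "seed",
    }
    return {wire: req[field] for field, wire in mapping.items() if req.get(field) is not None}


print(sampler_params({"topP": 0.9, "topK": None, "minP": 0.05, "seed": 42}))
# {'top_p': 0.9, 'min_p': 0.05, 'seed': 42} -- topK stays on the server default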
hfCachePath: str | None = Field(default=None, max_length=4096) + # Phase 3.3: when true, the chat composer adds `logprobs: 5` to + # every send so llama-server returns top-k per-token confidence + # info. Off by default. + advancedLogprobs: bool | None = None class OpenAIMessage(BaseModel): @@ -143,6 +255,30 @@ class OpenAIChatCompletionRequest(BaseModel): stream: bool = False tools: list[dict[str, Any]] | None = None tool_choice: Any = None + # Phase 2.13: standard OpenAI sampler parameters. llama-server + # supports them natively; mlx-lm consumes top_p / top_k / seed and + # silently ignores the rest. Pass None to use the runtime default. + top_p: float | None = Field(default=None, ge=0.0, le=1.0) + top_k: int | None = Field(default=None, ge=0, le=200) + frequency_penalty: float | None = Field(default=None, ge=-2.0, le=2.0) + presence_penalty: float | None = Field(default=None, ge=-2.0, le=2.0) + seed: int | None = Field(default=None, ge=0, le=2**31 - 1) + stop: list[str] | str | None = None + response_format: dict[str, Any] | None = None + + +class OpenAIEmbeddingsRequest(BaseModel): + """Phase 2.13: OpenAI-shaped embeddings input. + + `input` accepts a single string or a list of strings, mirroring + the OpenAI spec. The `model` field is informational — we use the + bundled embedding GGUF regardless. + """ + model: str | None = None + input: str | list[str] + encoding_format: Literal["float"] | None = "float" + dimensions: int | None = Field(default=None, ge=8, le=8192) + user: str | None = None class ConvertModelRequest(BaseModel): @@ -211,6 +347,35 @@ class ImageGenerationRequest(BaseModel): qualityPreset: str | None = Field(default=None, max_length=32) draftMode: bool = Field(default=False) sampler: str | None = Field(default=None, max_length=32) + # FU-015 / FBCache: optional diffusion cache strategy id + # ("fbcache" | "teacache" | "native"). Default ``None`` keeps the + # stock pipeline. See ``cache_compression`` registry for available + # ids; the runtime ignores ids that don't apply to image pipelines. + cacheStrategy: str | None = Field(default=None, max_length=32) + # Threshold for caching strategies. ``None`` uses the strategy + # default (FBCache: 0.12, TeaCache: 0.4). Lower = stricter (more + # blocks recomputed, less cached, less speedup, less quality drift). + cacheRelL1Thresh: float | None = Field(default=None, ge=0.0, le=1.0) + # FU-021: CFG decay schedule for flow-match image models. Mirrors + # the video runtime knob. Default off; opt-in. + cfgDecay: bool = Field(default=False) + # FU-018: TAESD preview-decode VAE swap. Preview-only quality knob — + # toggling on swaps ``pipeline.vae`` for the matching tiny VAE for + # the duration of the run. Final output goes through the fast VAE + # so the user trades fidelity for wall-time. Default off; opt-in. + previewVae: bool = Field(default=False) + # FU-023 Nunchaku / SVDQuant: 4-bit weight quantization on CUDA. + # Catalog variants pin ``nunchakuRepo`` (e.g. + # ``mit-han-lab/svdq-int4-flux.1-dev``) and optionally + # ``nunchakuFile``. CUDA only — runtime falls back to NF4 / int8wo / + # bf16 when nunchaku isn't installed or the device isn't CUDA. + nunchakuRepo: str | None = Field(default=None, min_length=1, max_length=200) + nunchakuFile: str | None = Field(default=None, min_length=1, max_length=200) + # FU-024 FP8 layerwise casting. Halves transformer VRAM by storing + # weights in fp8 + promoting to bf16 inside the matmul. CUDA SM 8.9+ + # only (Ada / Hopper / Blackwell). 
Family-correct fp8 dtype picked + # by the runtime: E5M2 for HunyuanVideo, E4M3 elsewhere. + fp8LayerwiseCasting: bool = Field(default=False) class ImageRuntimePreloadRequest(BaseModel): @@ -278,3 +443,25 @@ class VideoGenerationRequest(BaseModel): # ``guidance_scale`` linearly from the user's setting at step 0 # to 1.0 at the final step. Default-on for flow-match pipelines. cfgDecay: bool = Field(default=True) + # Spatial-Temporal Guidance scale for the mlx-video LTX-2 path. + # mlx-video implements STG by running an extra "perturbed" forward + # pass per sampler step alongside the cond/uncond CFG passes — the + # perturbed branch skips final transformer blocks to reduce object + # breakup and chroma drift on long motion. ``1.0`` matches Blaizzy's + # upstream README quality recommendation; ``0.0`` disables STG and + # frees ~33 % wall time per step at a mild quality cost. Distilled + # pipelines ignore the value (they run a fixed sampler), and other + # video runtimes (diffusers MPS, LongLive) do not consume it. + stgScale: float = Field(default=1.0, ge=0.0, le=3.0) + # FU-018: TAESD / TAEHV preview-decode VAE swap. Preview-only quality + # knob — when True the engine swaps ``pipeline.vae`` for the matching + # tiny VAE for the duration of the run. Default off — video users + # typically want full fidelity. + previewVae: bool = Field(default=False) + # FU-023 Nunchaku / SVDQuant — same shape as the image-side knob. + # When the catalog variant pins a Nunchaku snapshot, the runtime + # loads via the matching Nunchaku transformer subclass on CUDA. + nunchakuRepo: str | None = Field(default=None, min_length=1, max_length=200) + nunchakuFile: str | None = Field(default=None, min_length=1, max_length=200) + # FU-024 FP8 layerwise casting (CUDA SM 8.9+ Ada/Hopper/Blackwell). + fp8LayerwiseCasting: bool = Field(default=False) diff --git a/backend_service/progress.py b/backend_service/progress.py index 2d30573..b968953 100644 --- a/backend_service/progress.py +++ b/backend_service/progress.py @@ -71,6 +71,13 @@ def __init__(self, *, kind: str) -> None: # Optional run-shape metadata so the UI can render labels like # "Diffusing 3 images" without a separate request. self._run_label: str | None = None + # FU-018 part 2: live denoise thumbnail. Base64-encoded PNG bytes + # the runtime publishes from inside ``callback_on_step_end`` after + # decoding the current latents via TAESD/TAEHV. ``None`` when + # previewVae is off or the swap didn't apply. Cleared at + # ``begin()`` / ``finish()`` so a stale thumbnail from the previous + # run never leaks into the next one's first poll. + self._thumbnail: str | None = None # Cooperative cancel signal — the UI's Cancel button sets this via # /api/{images,video}/cancel; the pipeline's step-end callback reads # it and raises to abort the run. ``Event`` (not a plain bool) @@ -97,6 +104,7 @@ def begin( self._started_at = now self._updated_at = now self._run_label = run_label + self._thumbnail = None # Clear any cancel flag from a previous run — otherwise a user # who cancelled yesterday's gen would have today's first click # abort before it started. @@ -131,6 +139,18 @@ def set_step(self, step: int, total: int | None = None) -> None: self._total_steps = max(0, int(total)) self._updated_at = time.time() + def set_thumbnail(self, thumbnail_b64: str | None) -> None: + """Publish a base64-encoded PNG of the current denoised state for + the UI to render. 
Called from ``callback_on_step_end`` after the + runtime decodes ``callback_kwargs["latents"]`` via the swapped-in + TAESD/TAEHV preview VAE. Pass ``None`` to clear the slot mid-run + (e.g. after a decode failure).""" + with self._lock: + if not self._active: + return + self._thumbnail = thumbnail_b64 + self._updated_at = time.time() + def finish(self, *, message: str = "") -> None: with self._lock: self._active = False @@ -140,6 +160,7 @@ def finish(self, *, message: str = "") -> None: self._total_steps = 0 self._updated_at = time.time() self._run_label = None + self._thumbnail = None # Leave ``_cancel_event`` alone — the route handler needs to be # able to check whether the just-finished run was cancelled so # it can return the right status. ``begin()`` clears it for the @@ -182,6 +203,7 @@ def snapshot(self) -> dict[str, Any]: "elapsedSeconds": round(elapsed, 3), "runLabel": self._run_label, "cancelRequested": self._cancel_event.is_set(), + "thumbnail": self._thumbnail, } diff --git a/backend_service/rag/__init__.py b/backend_service/rag/__init__.py new file mode 100644 index 0000000..7a3c373 --- /dev/null +++ b/backend_service/rag/__init__.py @@ -0,0 +1,36 @@ +"""Cross-platform RAG primitives — Phase 2.6. + +Two collaborators replace (or augment) the existing TF-IDF + BM25 +retrieval that lives in `helpers/documents.py`: + + * `embedding_client` — subprocess wrapper around the llama.cpp + `llama-embedding` CLI. Returns dense vectors for arbitrary text. + Cross-platform because llama.cpp ships binaries for macOS, Linux, + and Windows; same wire format on every host. + + * `vector_store` — numpy cosine-similarity index. No new dep + (numpy is already part of the chat runtime). Persistable as a + JSON blob alongside session documents. + +The integration in `helpers/documents.DocumentIndex` is opt-in: when +the embedding client reports availability (model + binary present), +search ranks chunks by cosine similarity over embeddings, falls +back to the existing TF-IDF + BM25 hybrid when the embedding path +errors out at runtime. Either way the public `search()` shape stays +identical so call sites (state.py `_retrieve_session_context`) +don't change. +""" + +from backend_service.rag.embedding_client import ( + EmbeddingClient, + EmbeddingClientUnavailable, + resolve_embedding_client, +) +from backend_service.rag.vector_store import VectorStore + +__all__ = [ + "EmbeddingClient", + "EmbeddingClientUnavailable", + "VectorStore", + "resolve_embedding_client", +] diff --git a/backend_service/rag/embedding_client.py b/backend_service/rag/embedding_client.py new file mode 100644 index 0000000..6cbd310 --- /dev/null +++ b/backend_service/rag/embedding_client.py @@ -0,0 +1,215 @@ +"""Subprocess wrapper around `llama-embedding` for cross-platform RAG. + +Phase 2.6: takes a string, returns a normalised dense vector. Detects +the binary via env var override or PATH. Detects the model via env var +or a per-data-dir convention (`/embeddings/*.gguf`). When +either is missing, every method raises `EmbeddingClientUnavailable` +and the caller falls back to the existing TF-IDF + BM25 path — +behaviour preserves a graceful degradation rather than refusing +generations when no embedding model is shipped. + +The CLI is invoked with `--embd-output-format json` so we don't have +to parse the human-readable text dump. 
JSON output looks like: + + {"object": "list", "data": [{"index": 0, "embedding": [...]}], ...} + +Embeddings are L2-normalised (`--embd-normalize 2`) so cosine +similarity is the same as dot product downstream. +""" + +from __future__ import annotations + +import json +import os +import shutil +import subprocess +import time +from dataclasses import dataclass +from pathlib import Path + + +CHAOSENGINE_LLAMA_EMBEDDING_BIN = "CHAOSENGINE_LLAMA_EMBEDDING" +CHAOSENGINE_EMBEDDING_MODEL = "CHAOSENGINE_EMBEDDING_MODEL" + +# Default subprocess deadline. Embedding a single chunk on CPU should +# return within a couple of seconds; the ceiling exists to prevent a +# wedged binary from hanging the chat send path. +DEFAULT_TIMEOUT_S = 30.0 + + +class EmbeddingClientUnavailable(RuntimeError): + """Raised when the binary or model is missing. + + Callers treat this as "use the keyword fallback" — it must not + surface as a chat error. + """ + + +@dataclass(frozen=True) +class EmbeddingClient: + """Concrete client. Constructed via `resolve_embedding_client`.""" + + binary: str + model_path: str + timeout: float = DEFAULT_TIMEOUT_S + + def is_available(self) -> bool: + return Path(self.binary).is_file() and Path(self.model_path).is_file() + + def embed(self, text: str) -> list[float]: + """Embed a single string. Returns a normalised float vector.""" + vectors = self.embed_batch([text]) + return vectors[0] + + def embed_batch(self, texts: list[str]) -> list[list[float]]: + """Embed multiple strings via repeated CLI calls. + + The llama-embedding CLI accepts a single `--prompt` per + invocation (`--prompt-file` for batch is also supported but the + format is awkward to thread through). For chunk counts the + chat path actually sees (typically <50 per session), the + per-call overhead is acceptable. Switch to `--prompt-file` + if profiling shows this is hot. + """ + if not texts: + return [] + if not self.is_available(): + raise EmbeddingClientUnavailable( + f"Embedding binary or model missing (binary={self.binary}, model={self.model_path})" + ) + vectors: list[list[float]] = [] + for text in texts: + vectors.append(self._embed_one(text)) + return vectors + + def _embed_one(self, text: str) -> list[float]: + # `llama-embedding` only accepts text via stdin or file; passing + # via `--prompt` works for short strings but trips on shell + # quoting + newlines. Use stdin. + cmd = [ + self.binary, + "-m", self.model_path, + "--embd-output-format", "json", + "--embd-normalize", "2", + "-f", "/dev/stdin", + "--no-warmup", + "--log-disable", + ] + try: + result = subprocess.run( + cmd, + input=text, + capture_output=True, + text=True, + timeout=self.timeout, + ) + except subprocess.TimeoutExpired as exc: + raise EmbeddingClientUnavailable( + f"llama-embedding timed out after {self.timeout:.0f}s" + ) from exc + except FileNotFoundError as exc: + raise EmbeddingClientUnavailable( + f"llama-embedding binary not found: {self.binary}" + ) from exc + + if result.returncode != 0: + stderr_tail = (result.stderr or "").strip()[-500:] + raise EmbeddingClientUnavailable( + f"llama-embedding failed (rc={result.returncode}): {stderr_tail}" + ) + + return parse_embedding_output(result.stdout) + + +def parse_embedding_output(stdout: str) -> list[float]: + """Pure helper for tests — extracts the first vector from the JSON. + + The JSON envelope has shape ``{"data": [{"embedding": [...]}, ...]}`` + when ``--embd-output-format json`` is used. We always submit a + single prompt so we always want the first entry's vector. 
+ """ + if not stdout.strip(): + raise EmbeddingClientUnavailable("llama-embedding returned empty stdout") + # Some llama.cpp builds prefix the JSON with metadata lines on + # stderr-merged stdout; find the first '{' and parse from there. + start = stdout.find("{") + if start < 0: + raise EmbeddingClientUnavailable("llama-embedding output had no JSON object") + try: + payload = json.loads(stdout[start:]) + except json.JSONDecodeError as exc: + raise EmbeddingClientUnavailable( + f"llama-embedding output unparseable: {exc}" + ) from exc + + data = payload.get("data") if isinstance(payload, dict) else None + if not isinstance(data, list) or not data: + raise EmbeddingClientUnavailable("llama-embedding output had no 'data' list") + first = data[0] + if not isinstance(first, dict): + raise EmbeddingClientUnavailable("llama-embedding output 'data[0]' was not an object") + embedding = first.get("embedding") + if not isinstance(embedding, list) or not embedding: + raise EmbeddingClientUnavailable("llama-embedding output had no 'embedding' vector") + if not all(isinstance(v, (int, float)) for v in embedding): + raise EmbeddingClientUnavailable("llama-embedding output embedding had non-numeric values") + return [float(v) for v in embedding] + + +def _resolve_binary() -> str | None: + override = os.environ.get(CHAOSENGINE_LLAMA_EMBEDDING_BIN) + if override and Path(override).is_file(): + return override + found = shutil.which("llama-embedding") + return found + + +def _resolve_model(data_dir: Path | None) -> str | None: + override = os.environ.get(CHAOSENGINE_EMBEDDING_MODEL) + if override and Path(override).is_file(): + return override + if data_dir is not None: + candidate_dir = data_dir / "embeddings" + if candidate_dir.is_dir(): + ggufs = sorted(candidate_dir.glob("*.gguf")) + if ggufs: + return str(ggufs[0]) + return None + + +def resolve_embedding_client( + data_dir: Path | None = None, + *, + timeout: float = DEFAULT_TIMEOUT_S, +) -> EmbeddingClient | None: + """Best-effort discovery — returns an EmbeddingClient or None. + + None means "no embedding path is available right now"; callers + should fall back to the keyword/TF-IDF retrieval. Callers that + cache the result MUST tolerate the result flipping to non-None + after the user drops a model into `/embeddings/`. + """ + binary = _resolve_binary() + if binary is None: + return None + model = _resolve_model(data_dir) + if model is None: + return None + return EmbeddingClient(binary=binary, model_path=model, timeout=timeout) + + +def warm_test(client: EmbeddingClient) -> tuple[bool, str | None]: + """Best-effort embedding round-trip — used in diagnostics. + + Returns (ok, error_message). Never raises; callers can render the + result on a Setup tab without try/except. + """ + started = time.perf_counter() + try: + vec = client.embed("ping") + except EmbeddingClientUnavailable as exc: + return False, str(exc) + if not vec: + return False, "embedding returned empty vector" + elapsed = time.perf_counter() - started + return True, f"OK ({len(vec)}-dim, {elapsed:.2f}s)" diff --git a/backend_service/rag/vector_store.py b/backend_service/rag/vector_store.py new file mode 100644 index 0000000..d32cd4a --- /dev/null +++ b/backend_service/rag/vector_store.py @@ -0,0 +1,116 @@ +"""In-memory cosine-similarity vector store for Phase 2.6 RAG. + +Tiny by design — no external dep beyond numpy (already in the chat +runtime). Stores per-chunk embeddings + a parallel list of citation +metadata. 
Persists as a JSON blob the existing DocumentIndex storage +can hold alongside its TF-IDF state. + +Embeddings are assumed to be L2-normalised at insert time (the +`llama-embedding --embd-normalize 2` flag the EmbeddingClient sets +guarantees this). With normalised vectors, cosine similarity = +dot product = a single matmul — fast enough for thousands of chunks +without an ANN index. +""" + +from __future__ import annotations + +import math +from typing import Any + + +class VectorStore: + """Append + search over normalised dense vectors. + + The store keeps embeddings in a 2-D list of floats rather than a + numpy array on disk; numpy comes back into play only at query + time so the JSON serialisation stays portable across Python + versions / numpy upgrades. + """ + + def __init__(self) -> None: + self._vectors: list[list[float]] = [] + self._dim: int | None = None + + @property + def size(self) -> int: + return len(self._vectors) + + @property + def dim(self) -> int | None: + return self._dim + + def add(self, vector: list[float]) -> None: + if not vector: + raise ValueError("VectorStore.add received an empty vector") + if self._dim is None: + self._dim = len(vector) + elif len(vector) != self._dim: + raise ValueError( + f"VectorStore vector length mismatch: got {len(vector)}, store dim is {self._dim}" + ) + self._vectors.append(list(vector)) + + def add_batch(self, vectors: list[list[float]]) -> None: + for vector in vectors: + self.add(vector) + + def reset(self) -> None: + self._vectors = [] + self._dim = None + + def remove_indices(self, indices: set[int]) -> None: + """Drop vectors at the given positions. Renumbers the rest. + + Used when DocumentIndex.remove_document needs to drop a + document's chunks — both the chunk list and the vector list + must stay in lockstep. + """ + if not indices: + return + self._vectors = [v for i, v in enumerate(self._vectors) if i not in indices] + if not self._vectors: + self._dim = None + + def search(self, query: list[float], top_k: int = 5) -> list[tuple[int, float]]: + """Return (index, similarity) pairs for the top-k matches. + + Both the stored vectors and the query are assumed normalised + (L2 = 1). When that holds, dot product equals cosine + similarity. The function still falls back to the explicit + normalisation form if the assumption is violated, so it + works even on hand-built test fixtures. 
+        """
+        if not self._vectors or not query:
+            return []
+        if self._dim is not None and len(query) != self._dim:
+            raise ValueError(
+                f"VectorStore.search query dim {len(query)} does not match store dim {self._dim}"
+            )
+
+        query_norm = math.sqrt(sum(q * q for q in query))
+        if query_norm == 0:
+            return []
+
+        scores: list[tuple[int, float]] = []
+        for idx, vec in enumerate(self._vectors):
+            dot = sum(q * v for q, v in zip(query, vec))
+            vec_norm = math.sqrt(sum(v * v for v in vec))
+            if vec_norm == 0:
+                continue
+            similarity = dot / (query_norm * vec_norm)
+            scores.append((idx, similarity))
+        scores.sort(key=lambda pair: pair[1], reverse=True)
+        return scores[:top_k]
+
+    def to_dict(self) -> dict[str, Any]:
+        return {"vectors": self._vectors, "dim": self._dim}
+
+    @classmethod
+    def from_dict(cls, payload: dict[str, Any]) -> "VectorStore":
+        store = cls()
+        vectors = payload.get("vectors") if isinstance(payload, dict) else None
+        if isinstance(vectors, list):
+            for vector in vectors:
+                if isinstance(vector, list) and vector and all(isinstance(v, (int, float)) for v in vector):
+                    store.add([float(v) for v in vector])
+        return store
diff --git a/backend_service/reasoning_split.py b/backend_service/reasoning_split.py
index 97fe002..151d3bf 100644
--- a/backend_service/reasoning_split.py
+++ b/backend_service/reasoning_split.py
@@ -9,6 +9,100 @@
 _THINK_TAIL_GUARD = len(_THINK_OPEN) - 1
 _STARTUP_BUFFER_LIMIT = 500
 
+# Per-model-family overrides for reasoning delimiters. Keyed by canonical
+# repo or family prefix (case-insensitive prefix match). Models that do not
+# match any entry use the default `<think>...</think>` tags. Add new entries
+# here when adopting models that emit a non-standard reasoning marker.
+# Values are (open_tag, close_tag) pairs.
+_REASONING_DELIMITER_REGISTRY: dict[str, tuple[str, str]] = {
+    # Gemma 4 emits ASYMMETRIC channel markers (verified against the
+    # mlx-community/gemma-4-26b-a4b-it-5bit tokenizer):
+    #     <|channel>thought ...reasoning...<channel|>
+    #     ...final answer text...
+    # Note: open tag is ``<|channel>`` (open + pipe + name + close,
+    # NO second pipe before the close angle), close tag is ``<channel|>``
+    # (mirror — pipe goes BEFORE the closing angle).
+    # This is NOT the OpenAI Harmony ``<|channel|>...<|message|>``
+    # symmetric format despite looking similar at a glance.
+    "google/gemma-4": ("<|channel>thought", "<channel|>"),
+    "mlx-community/gemma-4": ("<|channel>thought", "<channel|>"),
+    "lmstudio-community/gemma-4": ("<|channel>thought", "<channel|>"),
+    # gpt-oss + OpenAI Harmony format ships SYMMETRIC delimiters
+    # (<|channel|>thought ... <|message|>...content...<|end|>). Stays
+    # at the original tags so swaps between gpt-oss and Gemma 4 work.
+    "openai/gpt-oss": ("<|channel|>thought", "<|end|>"),
+    "mlx-community/gpt-oss": ("<|channel|>thought", "<|end|>"),
+}
+
+
+# Channel-format boilerplate. Stripped as a final pass after the
+# ThinkingTokenFilter to remove leftover channel/turn/message markers.
+# Covers BOTH formats:
+#
+# * **Gemma 4 asymmetric** — ``<|NAME>`` opens, ``<NAME|>`` closes.
+#   Open variants: ``<|channel>``, ``<|turn>``, ``<|tool>``,
+#   ``<|tool_call>``, ``<|tool_response>``, ``<|image>``, ``<|audio>``.
+#   Close variants: same set with the pipe migrated before the angle.
+#   Open tags optionally carry a sub-name suffix (``thought`` /
+#   ``final`` / ``analysis`` / ``commentary``).
+#
+# * **OpenAI Harmony symmetric** (gpt-oss) — ``<|NAME|>`` for both
+#   open and close, plus ``<|start|>``/``<|message|>``/``<|end|>``/
+#   ``<|return|>`` boilerplate around the channel content.
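For the RAG modules added above, here is a usage sketch showing how EmbeddingClient and VectorStore compose. It assumes a llama-embedding binary and an embeddings GGUF are actually discoverable on the host; when they are not, resolve_embedding_client returns None and the keyword fallback stays in effect, as described in the module docstrings.

from backend_service.rag import VectorStore, resolve_embedding_client

chunks = ["KV cache quantisation", "Gemma chat template quirks", "DFLASH acceptance rate"]
client = resolve_embedding_client()

if client is not None:
    store = VectorStore()
    store.add_batch(client.embed_batch(chunks))
    hits = store.search(client.embed("how are chat templates handled?"), top_k=2)
    for idx, score in hits:
        print(f"{score:.3f}  {chunks[idx]}")
else:
    print("embedding path unavailable — keyword retrieval stays in effect")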
+_HARMONY_BOILERPLATE_RE = re.compile(
+    r"(?:"
+    # Gemma 4 open: <|channel>, <|turn>, etc. + optional sub-name suffix.
+    r"<\|(?:channel|turn|tool_call|tool_response|tool|image|audio|message|start|end|return)>"
+    r"(?:[a-z]+)?"
+    r"|"
+    # Gemma 4 close: <channel|>, <turn|>, etc.
+    r"<(?:channel|turn|tool_call|tool_response|tool|image|audio|message|start|end|return)\|>"
+    r"|"
+    # OpenAI Harmony symmetric: <|start|>, <|channel|>, <|message|>, <|end|>, <|return|>
+    r"<\|(?:start|channel|message|end|return)\|>"
+    r"(?:assistant|final|analysis|commentary|thought)?"
+    r")",
+    re.IGNORECASE,
+)
+
+
+def strip_harmony_boilerplate(text: str) -> str:
+    """Remove OpenAI Harmony channel-format markers from a model's output.
+
+    The Harmony format wraps multi-channel responses with
+    ``<|start|>``, ``<|channel|>NAME``, ``<|message|>``, ``<|end|>``
+    delimiters. After ``ThinkingTokenFilter`` extracts the ``thought``
+    channel into the reasoning sidecar, this helper sweeps the residual
+    boilerplate out of the user-visible text. Idempotent on text that
+    contains no Harmony markers (e.g. plain ``<think>`` output from
+    Qwen3 / DeepSeek R1).
+    """
+    if not text:
+        return text
+    cleaned = _HARMONY_BOILERPLATE_RE.sub("", text)
+    # Collapse runs of blank lines that the boilerplate removal can leave
+    # behind — keeps the rendered chat tidy without blowing away
+    # intentional paragraph breaks.
+    cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
+    return cleaned.strip()
+
+
+def reasoning_delimiters_for(model_ref: str | None) -> tuple[str, str]:
+    """Resolve the reasoning open/close tag pair for a given model reference.
+
+    Looks up `model_ref` against `_REASONING_DELIMITER_REGISTRY` using a
+    case-insensitive prefix match (so `Qwen/Qwen3-8B-Instruct` would match a
+    registry key of `qwen/qwen3`). Returns the default ``<think>``/``</think>``
+    pair when no match is found.
+    """
+    if not model_ref:
+        return (_THINK_OPEN, _THINK_CLOSE)
+    lower = model_ref.lower()
+    for key, tags in _REASONING_DELIMITER_REGISTRY.items():
+        if lower.startswith(key.lower()):
+            return tags
+    return (_THINK_OPEN, _THINK_CLOSE)
+
 
 _RAW_REASONING_LABELS = (
     "thinking process",
     "chain of thought",
@@ -196,7 +290,29 @@ class ThinkingTokenFilter:
     XML ``<think>`` tags are always processed regardless.
     """
 
-    def __init__(self, *, detect_raw_reasoning: bool = True) -> None:
+    def __init__(
+        self,
+        *,
+        detect_raw_reasoning: bool = True,
+        open_tag: str = _THINK_OPEN,
+        close_tag: str = _THINK_CLOSE,
+        max_reasoning_chars: int | None = 32_000,
+    ) -> None:
+        # `open_tag` / `close_tag` let downstream callers override the XML
+        # delimiters per model family — see `reasoning_delimiters_for()`.
+        # Defaults match the `<think>...</think>` convention used by Qwen3,
+        # DeepSeek R1, GPT-OSS, and most other reasoning models.
+        #
+        # Phase 2.0.5-E: `max_reasoning_chars` caps the size of a single
+        # reasoning block. When the cap is hit while still inside the open
+        # tag, the filter force-closes the block, emits `reasoning_done`,
+        # and routes any further bytes to `text` so the assistant turn
+        # finalises instead of streaming reasoning forever. Defaults to
+        # 32,000 chars (~8000 tokens). Pass `None` to disable.
+ if not open_tag or not close_tag: + raise ValueError("ThinkingTokenFilter requires non-empty open/close tags.") + if max_reasoning_chars is not None and max_reasoning_chars <= 0: + raise ValueError("max_reasoning_chars must be positive or None.") self._inside_xml_think = False self._inside_raw_think = False self._startup_done = False @@ -204,6 +320,12 @@ def __init__(self, *, detect_raw_reasoning: bool = True) -> None: self._pending_raw_final = "" self._total_fed = 0 self._detect_raw = detect_raw_reasoning + self._open_tag = open_tag + self._close_tag = close_tag + self._tail_guard = max(0, len(open_tag) - 1) + self._max_reasoning_chars = max_reasoning_chars + self._reasoning_emitted = 0 + self._reasoning_capped = False def feed(self, text: str) -> ThinkingStreamResult: self._buffer += text @@ -212,10 +334,10 @@ def feed(self, text: str) -> ThinkingStreamResult: while True: if not self._startup_done and not self._inside_xml_think and not self._inside_raw_think: - think_idx = _find_tag(self._buffer, _THINK_OPEN) + think_idx = _find_tag(self._buffer, self._open_tag) if think_idx != -1: output.text += self._buffer[:think_idx] - self._buffer = self._buffer[think_idx + len(_THINK_OPEN):] + self._buffer = self._buffer[think_idx + len(self._open_tag):] self._inside_xml_think = True self._startup_done = True continue @@ -256,27 +378,53 @@ def feed(self, text: str) -> ThinkingStreamResult: break if self._inside_xml_think: - end_idx = _find_tag(self._buffer, _THINK_CLOSE) + end_idx = _find_tag(self._buffer, self._close_tag) if end_idx == -1: + # Phase 2.0.5-E: reasoning budget cap. If the model is + # rambling past `max_reasoning_chars` without ever + # emitting a close tag, force the close so the + # assistant turn can finalise. Surplus bytes route to + # text from this point on. + if ( + self._max_reasoning_chars is not None + and self._reasoning_emitted + len(self._buffer) >= self._max_reasoning_chars + ): + slice_end = max(0, self._max_reasoning_chars - self._reasoning_emitted) + output.reasoning += self._buffer[:slice_end] + self._reasoning_emitted += slice_end + leftover = self._buffer[slice_end:] + self._buffer = leftover + self._inside_xml_think = False + self._reasoning_capped = True + output.reasoning_done = True + # Continue the loop so the leftover bytes get + # routed through the post-think text/tail logic. 
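+                    # Worked example of the cap arithmetic: with
+                    # max_reasoning_chars=10 and _reasoning_emitted already
+                    # at 7, slice_end is max(0, 10 - 7) = 3, so only the
+                    # next 3 buffered chars count as reasoning and the rest
+                    # re-enters the loop as visible text.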
+ continue output.reasoning += self._buffer + self._reasoning_emitted += len(self._buffer) self._buffer = "" break output.reasoning += self._buffer[:end_idx] - self._buffer = self._buffer[end_idx + len(_THINK_CLOSE):] + self._reasoning_emitted += end_idx + self._buffer = self._buffer[end_idx + len(self._close_tag):] self._inside_xml_think = False output.reasoning_done = True continue - start_idx = _find_tag(self._buffer, _THINK_OPEN) + start_idx = _find_tag(self._buffer, self._open_tag) if start_idx != -1: output.text += self._buffer[:start_idx] - self._buffer = self._buffer[start_idx + len(_THINK_OPEN):] + self._buffer = self._buffer[start_idx + len(self._open_tag):] self._inside_xml_think = True continue - if len(self._buffer) > _THINK_TAIL_GUARD: - output.text += self._buffer[:-_THINK_TAIL_GUARD] - self._buffer = self._buffer[-_THINK_TAIL_GUARD:] + if len(self._buffer) > self._tail_guard: + if self._tail_guard == 0: + output.text += self._buffer + self._buffer = "" + else: + output.text += self._buffer[:-self._tail_guard] + self._buffer = self._buffer[-self._tail_guard:] break return output diff --git a/backend_service/routes/__init__.py b/backend_service/routes/__init__.py index 091d439..46c3437 100644 --- a/backend_service/routes/__init__.py +++ b/backend_service/routes/__init__.py @@ -25,6 +25,7 @@ def register_routes(app: FastAPI) -> None: from .prompts import router as prompts_router from .diagnostics import router as diagnostics_router from .storage import router as storage_router + from .workspaces import router as workspaces_router app.include_router(auth_router) app.include_router(health_router) @@ -45,3 +46,4 @@ def register_routes(app: FastAPI) -> None: app.include_router(prompts_router) app.include_router(diagnostics_router) app.include_router(storage_router) + app.include_router(workspaces_router) diff --git a/backend_service/routes/chat.py b/backend_service/routes/chat.py index 6c99be5..5af7a53 100644 --- a/backend_service/routes/chat.py +++ b/backend_service/routes/chat.py @@ -2,10 +2,12 @@ from typing import Any -from fastapi import APIRouter, Request, UploadFile, File +from fastapi import APIRouter, HTTPException, Request, UploadFile, File from backend_service.models import ( + AddVariantRequest, CreateSessionRequest, + ForkSessionRequest, UpdateSessionRequest, GenerateRequest, ) @@ -21,6 +23,72 @@ def create_session(request: Request, body: CreateSessionRequest) -> dict[str, An return {"session": session} +@router.post("/api/chat/sessions/{session_id}/delve/{message_index}") +def delve_message(request: Request, session_id: str, message_index: int) -> dict[str, Any]: + """Phase 3.6: re-process an assistant message with a critique pass. + + The currently-loaded model re-reads the answer with a reviewer's + framing and produces a Critique / Revised answer pair. The result + attaches as a ``Delve critique`` variant on the message so the + frontend's existing variant card surfaces it without bespoke UI. + """ + state = request.app.state.chaosengine + try: + session = state.delve_message( + session_id=session_id, + message_index=message_index, + ) + except ValueError as exc: + raise HTTPException(status_code=400, detail=str(exc)) from exc + return {"session": session} + + +@router.post("/api/chat/sessions/{session_id}/variants") +def add_message_variant(request: Request, session_id: str, body: AddVariantRequest) -> dict[str, Any]: + """Phase 2.5: generate a sibling variant of an assistant message + using a different model. 
Returns the updated session payload so + the frontend can swap its local copy in one round-trip.""" + state = request.app.state.chaosengine + try: + session = state.add_message_variant( + session_id=session_id, + message_index=body.messageIndex, + model_ref=body.modelRef, + model_name=body.modelName, + canonical_repo=body.canonicalRepo, + source=body.source, + path=body.path, + backend=body.backend, + max_tokens=body.maxTokens, + temperature=body.temperature, + ) + except ValueError as exc: + raise HTTPException(status_code=400, detail=str(exc)) from exc + return {"session": session} + + +@router.post("/api/chat/sessions/{session_id}/fork") +def fork_session(request: Request, session_id: str, body: ForkSessionRequest) -> dict[str, Any]: + """Phase 2.4: fork an existing thread at a chosen message. + + Returns the freshly-created session payload (same shape as + create_session) plus the parent linkage on its + `parentSessionId` / `forkedAtMessageIndex` fields. Frontend + swaps the active chat to the new fork and lets the user + continue divergently. + """ + state = request.app.state.chaosengine + try: + session = state.fork_session( + source_session_id=session_id, + fork_at_message_index=body.forkAtMessageIndex, + title=body.title, + ) + except ValueError as exc: + raise HTTPException(status_code=400, detail=str(exc)) from exc + return {"session": session} + + @router.patch("/api/chat/sessions/{session_id}") def update_session(request: Request, session_id: str, body: UpdateSessionRequest) -> dict[str, Any]: state = request.app.state.chaosengine @@ -46,6 +114,18 @@ def generate_stream(request: Request, body: GenerateRequest): return state.generate_stream(body) +@router.post("/api/chat/generate/{session_id}/cancel") +def cancel_generate(request: Request, session_id: str) -> dict[str, Any]: + """Mark an in-flight chat generation for cancellation. + + The streaming loop checks this flag between events and stops gracefully, + persisting whatever output has accumulated. Returning is fast — the + actual stream termination happens on the client's open SSE connection. + """ + state = request.app.state.chaosengine + return state.request_cancel_chat(session_id) + + @router.get("/api/chat/sessions/{session_id}/documents") def list_session_documents(request: Request, session_id: str) -> dict[str, Any]: state = request.app.state.chaosengine @@ -67,7 +147,14 @@ def delete_session_document(request: Request, session_id: str, doc_id: str) -> d @router.get("/api/tools") def list_tools() -> dict[str, Any]: - """List all available agent tools with their schemas.""" + """List all available agent tools with their schemas. + + Phase 2.10: each entry now carries a `provenance` field — either + ``"builtin"`` for the in-tree tools (web search, calculator, + file reader, code executor) or ``"mcp:"`` for tools + sourced from a configured MCP server. The frontend renders a + badge per source so users can tell which tools came from where. 
+ """ tools = tool_registry.list_tools() return { "tools": [ @@ -75,6 +162,7 @@ def list_tools() -> dict[str, Any]: "name": t.name, "description": t.description, "schema": t.openai_schema(), + "provenance": getattr(t, "provenance", "builtin"), } for t in tools ], diff --git a/backend_service/routes/images.py b/backend_service/routes/images.py index 7f81689..2e3692c 100644 --- a/backend_service/routes/images.py +++ b/backend_service/routes/images.py @@ -228,6 +228,30 @@ def generate_image(request: Request, body: ImageGenerationRequest) -> dict[str, state.add_log("images", "error", f"Image model not found in catalog or tracked seeds: '{body.modelId}'") raise HTTPException(status_code=404, detail=f"Unknown image model '{body.modelId}'. The model isn't in the curated catalog or tracked seeds.") state.add_log("images", "info", f"Resolved variant: {variant.get('name')} (repo={variant.get('repo')})") + # Phase 2.0.5-H: pre-flight memory gate. Refuse before invoking the + # diffusion pipeline if the host is already memory-starved — image + # gen on a swap-thrashing laptop typically takes minutes to recover + # and can wedge the desktop entirely. Gate failure (psutil error) + # never blocks legitimate work; logged + skipped. + try: + from backend_service.helpers.memory_gate import ( + gate_image_generation, + snapshot_memory_signals, + ) + + available_gb, pressure_percent = snapshot_memory_signals() + refusal = gate_image_generation(available_gb, pressure_percent) + if refusal is not None: + state.add_log( + "images", "warning", + f"Memory gate refused image gen: {refusal['code']} " + f"(avail={available_gb:.1f} GB, pressure={pressure_percent:.0f}%).", + ) + raise HTTPException(status_code=503, detail=refusal["message"]) + except HTTPException: + raise + except Exception as gate_exc: + state.add_log("images", "warning", f"Memory gate skipped: {gate_exc}") _unload_idle_video_runtime_for_image(request, "image generation") try: artifacts, runtime = _generate_image_artifacts(body, variant, state.image_runtime) @@ -240,10 +264,20 @@ def generate_image(request: Request, body: ImageGenerationRequest) -> dict[str, state.add_log("images", "info", f"Image generation cancelled for {variant.get('name')} by user.") raise HTTPException(status_code=409, detail="cancelled") from None except Exception as exc: + from backend_service.helpers.video_runtime_diagnostics import ( + diagnose_diffusers_lazy_import_error, + ) tb_str = _tb.format_exc() state.add_log("images", "error", f"Image generation FAILED for {variant.get('name')}: {type(exc).__name__}: {exc}") - state.add_log("images", "error", f"Traceback:\n{tb_str[-500:]}") - raise HTTPException(status_code=500, detail=f"Image generation failed for {variant.get('name')}: {type(exc).__name__}: {exc}") from exc + state.add_log("images", "error", f"Traceback:\n{tb_str[-2000:]}") + # Diffusers' lazy-import wrapper hides the real cause when + # transformers / torchao / torch versions don't agree -- same + # T5EncoderModel symptom that bites video generation. Run the + # diagnostic so the user sees the actual missing/broken module + # instead of "Could not import module 'T5EncoderModel'". 
+ friendly = diagnose_diffusers_lazy_import_error(str(exc)) + detail = friendly or f"Image generation failed for {variant.get('name')}: {type(exc).__name__}: {exc}" + raise HTTPException(status_code=500, detail=detail) from exc state.add_log( "images", "info", diff --git a/backend_service/routes/openai_compat.py b/backend_service/routes/openai_compat.py index ef2e3f8..28f2948 100644 --- a/backend_service/routes/openai_compat.py +++ b/backend_service/routes/openai_compat.py @@ -4,7 +4,10 @@ from fastapi import APIRouter, Request -from backend_service.models import OpenAIChatCompletionRequest +from backend_service.models import ( + OpenAIChatCompletionRequest, + OpenAIEmbeddingsRequest, +) router = APIRouter() @@ -19,3 +22,16 @@ def list_openai_models(request: Request) -> dict[str, Any]: def openai_chat_completion(request: Request, body: OpenAIChatCompletionRequest): state = request.app.state.chaosengine return state.openai_chat_completion(body) + + +@router.post("/v1/embeddings") +def openai_embeddings(request: Request, body: OpenAIEmbeddingsRequest) -> dict[str, Any]: + """Phase 2.13: OpenAI-compatible embeddings via the bundled GGUF. + + Lets external scripts / IDE plugins / Jupyter hit local models + without re-implementing inference. Falls back to a 503 when no + embedding binary or model is configured — the caller should + decide whether to keyword-search or surface the gap. + """ + state = request.app.state.chaosengine + return state.openai_embeddings(body) diff --git a/backend_service/routes/prompts.py b/backend_service/routes/prompts.py index b827312..fee8ffd 100644 --- a/backend_service/routes/prompts.py +++ b/backend_service/routes/prompts.py @@ -45,6 +45,10 @@ class PromptTemplateRequest(BaseModel): tags: list[str] = Field(default_factory=list) category: str = Field(default="General", max_length=80) fewShotExamples: list[dict[str, Any]] = Field(default_factory=list) + # Phase 2.7: optional variable declarations + preset samplers + preset model + variables: list[dict[str, Any]] = Field(default_factory=list) + presetSamplers: dict[str, Any] | None = None + presetModelRef: str | None = Field(default=None, max_length=200) # --------------------------------------------------------------------------- @@ -94,3 +98,57 @@ async def delete_prompt(template_id: str, request: Request) -> dict[str, Any]: if not lib.delete(template_id): raise HTTPException(status_code=404, detail="Template not found") return {"deleted": True, "id": template_id} + + +# --------------------------------------------------------------------------- +# FU-022: LLM-based prompt enhancer +# --------------------------------------------------------------------------- + + +class PromptEnhanceRequest(BaseModel): + """Body for ``POST /api/prompt/enhance``. ``repo`` selects the + family-specific system prompt; ``modelId`` overrides the default + enhancer model (Apple Silicon dev machines all default to + ``mlx-community/Qwen2.5-0.5B-Instruct-4bit``).""" + + prompt: str = Field(min_length=1, max_length=4000) + repo: str = Field(min_length=1, max_length=200) + modelId: str | None = None + maxTokens: int = Field(default=256, ge=32, le=1024) + + +class PromptEnhanceResponse(BaseModel): + enhanced: str + note: str | None + modelUsed: str | None + family: str + + +@router.post("/prompt/enhance") +async def enhance_prompt(payload: PromptEnhanceRequest) -> PromptEnhanceResponse: + """Rewrite a short prompt into the structured format the requested + image / video model expects. 
Apple Silicon path uses ``mlx_lm`` — + other platforms get a graceful no-op + runtimeNote in the response. + + Synchronous because the model is small (~700 MB / 0.5B params, + sub-second after a warm cache); first call pays the load cost. + """ + from backend_service.helpers.prompt_enhancer import ( + enhance_prompt as _enhance, + _DEFAULT_ENHANCER_MODEL, + ) + + model_id = payload.modelId or _DEFAULT_ENHANCER_MODEL + result = _enhance( + payload.prompt, + repo=payload.repo, + enabled=True, + model_id=model_id, + max_tokens=payload.maxTokens, + ) + return PromptEnhanceResponse( + enhanced=result.enhanced, + note=result.note, + modelUsed=result.modelUsed, + family=result.family, + ) diff --git a/backend_service/routes/setup.py b/backend_service/routes/setup.py index dcdfd92..a5b76f3 100644 --- a/backend_service/routes/setup.py +++ b/backend_service/routes/setup.py @@ -13,7 +13,7 @@ from typing import Any from fastapi import APIRouter, HTTPException, Request -from pydantic import BaseModel +from pydantic import BaseModel, Field router = APIRouter() @@ -82,6 +82,30 @@ # ~12 GB on M-series Macs. Roughly half the memory saving of NF4 # but twice the platform reach. "torchao": "torchao", + # SageAttention CUDA fast-attention kernels. Wired through + # ``backend_service/helpers/attention_backend.py`` (FU-016). Pin to 2.2.0 + # (SageAttention2++) — PyPI's default resolves to the stale 1.0.6 + # (2024-11) which lacks the SA2++ kernels. SageAttention3 lives on the + # ``sageattention3_blackwell`` branch (Blackwell SM10.0 only) and is + # not yet on PyPI; install path here always pulls the released SA2++ + # kernels regardless of GPU generation. No-op on macOS / CPU / non-DiT + # pipelines — the helper guards before invoking. + "sageattention": "sageattention==2.2.0", + # FU-023 Nunchaku / SVDQuant — 4-bit weight quantization for FLUX + # family + Qwen-Image + SD3.5 on CUDA. ~3× over NF4 on FLUX.1-dev. + # CUDA only; Apple Silicon / Linux-CPU installs no-op at runtime + # because the Nunchaku transformer subclasses fall back to the + # stock diffusers transformer when the import fails. v1.2.1 is the + # current pin (2026-01-25) — covers FLUX dev/Schnell/Tools/Kontext/ + # Krea, Qwen-Image + Qwen-Image-Edit, Z-Image-Turbo, SANA, PixArt-Σ. + "nunchaku": "nunchaku>=1.2.1", + # FU-027 NVIDIA/kvpress — KV cache compression toolkit (Apache 2.0, + # 26 releases as of v0.5.3 / 2026-04-09). HF transformers + multi-GPU + # Accelerate hookups. CUDA-side complement to TurboQuant on Apple + # Silicon. Hooks land separately under cache_compression/kvpress.py + # — installable here so the Setup tab can pre-stage the wheel before + # the integration code goes live. + "kvpress": "kvpress>=0.5.3", # Native Apple Silicon FLUX runtime. mflux uses MLX directly instead # of diffusers+MPS, which is noticeably faster and doesn't hit the # MPS fp16-black-image edge cases. Apple Silicon only — installer @@ -1067,6 +1091,23 @@ def _gpu_bundle_job_worker(python: str, extras_dir: Path) -> None: state.cuda_verified = cuda_ok state.attempts.append({"phase": "verify", "ok": cuda_ok, "output": detail[-2000:]}) + # Tell the import system to re-scan ``sys.path`` so packages + # written into the extras dir during this run are visible to the + # next ``importlib.util.find_spec`` call (the image-runtime probe + # uses one). Without this, the runtime continues reporting + # "placeholder" until a backend restart even though the bundle + # is on disk. 
Also reset the cached VRAM total so the post-install + # capabilities snapshot reflects the freshly importable torch. + try: + importlib.invalidate_caches() + except Exception: + pass + try: + from backend_service.helpers.gpu import reset_vram_total_cache + reset_vram_total_cache() + except Exception: + pass + state.phase = "done" state.percent = 100.0 state.done = True @@ -1441,6 +1482,280 @@ def install_longlive_status() -> dict[str, Any]: return _LONGLIVE_JOB.to_dict() +# ------------------------------------------------------------------ +# mlx-video Wan install (FU-025) +# ------------------------------------------------------------------ +# +# Mirror of the LongLive install pattern but for the Apple Silicon +# Wan2.x → MLX conversion path. Phases: preflight, download-raw, +# convert, verify. Same single-job semantics, same InstallLogPanel +# attempt-row shape, same status poll cadence. + + +@dataclass +class _WanInstallJobState: + id: str = "" + phase: str = "idle" # idle | preflight | downloading | converting | verifying | done | error + message: str = "" + repo: str | None = None + package_current: str | None = None + package_index: int = 0 + package_total: int = 0 + percent: float = 0.0 + output_dir: str | None = None + error: str | None = None + started_at: float = 0.0 + finished_at: float = 0.0 + attempts: list[dict[str, Any]] = field(default_factory=list) + done: bool = False + + def to_dict(self) -> dict[str, Any]: + return { + "id": self.id, + "phase": self.phase, + "message": self.message, + "repo": self.repo, + "packageCurrent": self.package_current, + "packageIndex": self.package_index, + "packageTotal": self.package_total, + "percent": round(self.percent, 1), + "outputDir": self.output_dir, + "error": self.error, + "startedAt": self.started_at, + "finishedAt": self.finished_at, + "attempts": self.attempts, + "done": self.done, + } + + +_WAN_INSTALL_JOB = _WanInstallJobState() +_WAN_INSTALL_LOCK = threading.Lock() + + +_WAN_PHASE_LABELS: dict[str, str] = { + "preflight": "Verify Apple Silicon + mlx-video", + "download-raw": "Download raw Wan checkpoint", + "convert": "Convert weights to MLX", + "verify": "Verify converted output", +} + + +class _WanInstallRequest(BaseModel): + repo: str = Field(min_length=1, max_length=128) + dtype: str = Field(default="bfloat16") + quantize: bool = Field(default=False) + bits: int = Field(default=4) + groupSize: int = Field(default=64) + cleanupRaw: bool = Field(default=False) + + +def _wan_install_job_worker( + repo: str, + *, + dtype: str, + quantize: bool, + bits: int, + group_size: int, + cleanup_raw: bool, +) -> None: + """Run the Wan installer + stream output into the shared job state. + + Same buffering pattern as ``_longlive_job_worker``: per-phase line + accumulation flushed to an attempt row on each progress event, + capped at 8000 chars to bound the response payload size. 
+ """ + from backend_service import mlx_video_wan_installer # noqa: PLC0415 + + job = _WAN_INSTALL_JOB + phase_buffer: list[str] = [] + current_phase: dict[str, object] = {"name": "preflight"} + total_phases = len(mlx_video_wan_installer.INSTALL_PHASES) + + def push_attempt(phase: str, ok: bool) -> None: + job.attempts.append({ + "phase": phase, + "package": _WAN_PHASE_LABELS.get(phase, phase), + "ok": ok, + "output": "\n".join(phase_buffer)[-8000:], + }) + phase_buffer.clear() + + def stream_log(line: str) -> None: + phase_buffer.append(line) + if len(phase_buffer) > 400: + del phase_buffer[: len(phase_buffer) - 400] + + def report_progress(event: dict[str, object]) -> None: + phase_name = str(event.get("phase") or "") + ok = bool(event.get("ok")) + # Phase event marks the START of that phase; flush prior buffer + # as a completed attempt only when transitioning from a real + # phase. The first event (preflight) has no prior buffer. + if current_phase.get("name") and current_phase.get("name") != phase_name: + push_attempt(str(current_phase["name"]), ok=True) + if not ok: + push_attempt(phase_name, ok=False) + job.phase = "error" + return + current_phase["name"] = phase_name + try: + idx = mlx_video_wan_installer.INSTALL_PHASES.index(phase_name) + except ValueError: + return + job.package_index = idx + job.percent = (idx / total_phases) * 100.0 + job.package_current = _WAN_PHASE_LABELS.get(phase_name, phase_name) + job.message = f"Running: {job.package_current}" + # Update job phase label for the UI status badge. + job.phase = { + "preflight": "preflight", + "download-raw": "downloading", + "convert": "converting", + "verify": "verifying", + }.get(phase_name, "preflight") + + job.message = f"Starting Wan install for {repo}" + job.package_current = _WAN_PHASE_LABELS["preflight"] + job.package_total = total_phases + + try: + mlx_video_wan_installer.install( + repo, + dtype=dtype, + quantize=quantize, + bits=bits, + group_size=group_size, + keep_raw=not cleanup_raw, + logger=stream_log, + progress=report_progress, + ) + except mlx_video_wan_installer.WanInstallError as exc: + if phase_buffer: + push_attempt(str(current_phase["name"]), ok=False) + job.phase = "error" + job.error = str(exc) + job.message = f"Wan install failed: {exc}" + except Exception as exc: # noqa: BLE001 + if phase_buffer: + push_attempt(str(current_phase["name"]), ok=False) + job.phase = "error" + job.error = f"Unexpected error: {exc}" + job.message = job.error + else: + if phase_buffer: + # Flush the verify-phase buffer that wasn't followed by a + # phase-transition event. + push_attempt(str(current_phase["name"]), ok=True) + job.phase = "done" + job.percent = 100.0 + job.package_index = total_phases + job.package_current = None + job.message = f"Wan install complete: {repo}" + finally: + job.finished_at = time.time() + job.done = True + + +@router.post("/api/setup/install-mlx-video-wan") +def start_install_mlx_video_wan( + body: _WanInstallRequest, request: Request +) -> dict[str, Any]: + """Kick off a background Wan install (download raw HF weights + + convert to MLX). + + Returns the current job state immediately. Poll + ``/api/setup/install-mlx-video-wan/status`` for progress. + Calling again while a job runs returns the running state without + starting a duplicate. 
+ """ + state_chaosengine = request.app.state.chaosengine + + from backend_service import mlx_video_wan_convert, mlx_video_wan_installer # noqa: PLC0415 + + if not mlx_video_wan_installer.is_supported_raw_repo(body.repo): + raise HTTPException( + status_code=400, + detail=( + f"Unsupported Wan repo {body.repo!r}. Supported: " + f"{sorted(mlx_video_wan_installer.SUPPORTED_RAW_REPOS)}" + ), + ) + + output_dir = mlx_video_wan_convert.output_dir_for(body.repo) + + with _WAN_INSTALL_LOCK: + if _WAN_INSTALL_JOB.phase in {"preflight", "downloading", "converting", "verifying"}: + return _WAN_INSTALL_JOB.to_dict() + + _WAN_INSTALL_JOB.id = f"wan-mlx-{int(time.time() * 1000)}" + _WAN_INSTALL_JOB.phase = "preflight" + _WAN_INSTALL_JOB.repo = body.repo + _WAN_INSTALL_JOB.message = "Starting install" + _WAN_INSTALL_JOB.package_current = _WAN_PHASE_LABELS["preflight"] + _WAN_INSTALL_JOB.package_index = 0 + _WAN_INSTALL_JOB.package_total = len(mlx_video_wan_installer.INSTALL_PHASES) + _WAN_INSTALL_JOB.percent = 0.0 + _WAN_INSTALL_JOB.output_dir = str(output_dir) + _WAN_INSTALL_JOB.error = None + _WAN_INSTALL_JOB.started_at = time.time() + _WAN_INSTALL_JOB.finished_at = 0.0 + _WAN_INSTALL_JOB.attempts = [] + _WAN_INSTALL_JOB.done = False + + thread = threading.Thread( + target=_wan_install_job_worker, + name="chaosengine-wan-install", + kwargs={ + "repo": body.repo, + "dtype": body.dtype, + "quantize": body.quantize, + "bits": body.bits, + "group_size": body.groupSize, + "cleanup_raw": body.cleanupRaw, + }, + daemon=True, + ) + thread.start() + + state_chaosengine.add_log( + "server", "info", + f"Wan install started (job={_WAN_INSTALL_JOB.id}, repo={body.repo}, " + f"target={output_dir})", + ) + return _WAN_INSTALL_JOB.to_dict() + + +@router.get("/api/setup/install-mlx-video-wan/status") +def install_mlx_video_wan_status() -> dict[str, Any]: + """Snapshot of the current Wan install job. Safe to poll at 1-2 Hz.""" + return _WAN_INSTALL_JOB.to_dict() + + +@router.get("/api/setup/mlx-video-wan/inventory") +def mlx_video_wan_inventory() -> dict[str, Any]: + """List every Wan repo: supported + converted-on-disk + approx size. 
+ + The Setup-page panel uses this to render a per-variant install + table without poking at every status endpoint individually.""" + from backend_service import mlx_video_wan_convert, mlx_video_wan_installer # noqa: PLC0415 + + converted_repos = {s.repo for s in mlx_video_wan_convert.list_converted()} + items: list[dict[str, Any]] = [] + for repo in sorted(mlx_video_wan_installer.SUPPORTED_RAW_REPOS): + status = mlx_video_wan_convert.status_for(repo) + items.append({ + "repo": repo, + "approxRawSizeGb": mlx_video_wan_installer.approx_raw_size_gb(repo), + "converted": repo in converted_repos, + "status": status.to_dict(), + }) + return { + "items": items, + "convertRoot": str(mlx_video_wan_convert.CONVERT_ROOT), + "rawRoot": str(mlx_video_wan_installer.RAW_ROOT), + } + + # ------------------------------------------------------------------ # llama-server-turbo update check # ------------------------------------------------------------------ diff --git a/backend_service/routes/video.py b/backend_service/routes/video.py index c11a977..29da938 100644 --- a/backend_service/routes/video.py +++ b/backend_service/routes/video.py @@ -161,18 +161,37 @@ def preload_video_model(request: Request, body: VideoRuntimePreloadRequest) -> d try: runtime = state.video_runtime.preload(variant["repo"]) except RuntimeError as exc: - state.add_log("video", "error", f"Failed to preload {variant['name']}: {exc}") - raise HTTPException(status_code=400, detail=f"Failed to load {variant['name']}: {exc}") from exc + # Diffusers' lazy-import wrapper hides the real underlying cause when + # transformers / torchao / torch versions don't agree -- the user + # sees "Could not import module 'T5EncoderModel'" with no actionable + # next step. Probe the suspected dep chain and rewrite the message + # with the actual missing/broken module + a Setup-page hint. + from backend_service.helpers.video_runtime_diagnostics import ( + diagnose_diffusers_lazy_import_error, + ) + import traceback as _tb + full_tb = _tb.format_exc() + state.add_log( + "video", "error", + f"Failed to preload {variant['name']}: {exc}\nTraceback:\n{full_tb[-2000:]}", + ) + friendly = diagnose_diffusers_lazy_import_error(str(exc)) + detail = friendly or f"Failed to load {variant['name']}: {exc}" + raise HTTPException(status_code=400, detail=detail) from exc except Exception as exc: + from backend_service.helpers.video_runtime_diagnostics import ( + diagnose_diffusers_lazy_import_error, + ) + import traceback as _tb + full_tb = _tb.format_exc() state.add_log( - "video", - "error", - f"Unexpected error preloading {variant['name']}: {type(exc).__name__}: {exc}", + "video", "error", + f"Unexpected error preloading {variant['name']}: " + f"{type(exc).__name__}: {exc}\nTraceback:\n{full_tb[-2000:]}", ) - raise HTTPException( - status_code=500, - detail=f"Failed to load {variant['name']}: {type(exc).__name__}: {exc}", - ) from exc + friendly = diagnose_diffusers_lazy_import_error(str(exc)) + detail = friendly or f"Failed to load {variant['name']}: {type(exc).__name__}: {exc}" + raise HTTPException(status_code=500, detail=detail) from exc state.add_log("video", "info", f"Preloaded video model {variant['name']}.") state.add_activity("Video model loaded", variant["name"]) @@ -295,6 +314,29 @@ def generate_video(request: Request, body: VideoGenerationRequest) -> dict[str, status_code=404, detail=f"Unknown video model '{body.modelId}'. The model isn't in the curated catalog.", ) + # Phase 2.0.5-H: pre-flight memory gate. 
Video gen has the highest + # working set of the three flows — a hung diffusion loop on a memory- + # starved Apple Silicon machine can swap-thrash the host for minutes. + # Refuse early when the floor is breached; gate exceptions never block. + try: + from backend_service.helpers.memory_gate import ( + gate_video_generation, + snapshot_memory_signals, + ) + + available_gb, pressure_percent = snapshot_memory_signals() + refusal = gate_video_generation(available_gb, pressure_percent) + if refusal is not None: + state.add_log( + "video", "warning", + f"Memory gate refused video gen: {refusal['code']} " + f"(avail={available_gb:.1f} GB, pressure={pressure_percent:.0f}%).", + ) + raise HTTPException(status_code=503, detail=refusal["message"]) + except HTTPException: + raise + except Exception as gate_exc: + state.add_log("video", "warning", f"Memory gate skipped: {gate_exc}") if not _video_variant_available_locally(variant): validation_error = _video_variant_validation_error(variant) @@ -310,19 +352,27 @@ def generate_video(request: Request, body: VideoGenerationRequest) -> dict[str, state.add_log("video", "info", f"Video generation cancelled for {variant['name']} by user.") raise HTTPException(status_code=409, detail="cancelled") from None except RuntimeError as exc: - state.add_log("video", "error", f"Video generation failed for {variant['name']}: {exc}") - raise HTTPException( - status_code=400, - detail=f"Video generation failed for {variant['name']}: {exc}", - ) from exc + from backend_service.helpers.video_runtime_diagnostics import ( + diagnose_diffusers_lazy_import_error, + ) + tb_str = _tb.format_exc() + state.add_log( + "video", "error", + f"Video generation failed for {variant['name']}: {exc}\nTraceback:\n{tb_str[-2000:]}", + ) + friendly = diagnose_diffusers_lazy_import_error(str(exc)) + detail = friendly or f"Video generation failed for {variant['name']}: {exc}" + raise HTTPException(status_code=400, detail=detail) from exc except Exception as exc: + from backend_service.helpers.video_runtime_diagnostics import ( + diagnose_diffusers_lazy_import_error, + ) tb_str = _tb.format_exc() state.add_log("video", "error", f"Video generation FAILED: {type(exc).__name__}: {exc}") - state.add_log("video", "error", f"Traceback:\n{tb_str[-500:]}") - raise HTTPException( - status_code=500, - detail=f"Video generation failed for {variant['name']}: {type(exc).__name__}: {exc}", - ) from exc + state.add_log("video", "error", f"Traceback:\n{tb_str[-2000:]}") + friendly = diagnose_diffusers_lazy_import_error(str(exc)) + detail = friendly or f"Video generation failed for {variant['name']}: {type(exc).__name__}: {exc}" + raise HTTPException(status_code=500, detail=detail) from exc state.add_log( "video", diff --git a/backend_service/routes/workspaces.py b/backend_service/routes/workspaces.py new file mode 100644 index 0000000..70af854 --- /dev/null +++ b/backend_service/routes/workspaces.py @@ -0,0 +1,106 @@ +"""Phase 3.7: workspace knowledge stack routes. + +CRUD over workspace metadata + per-workspace document listing. +Document upload / delete reuse the existing `state.upload_document` +path with a different target dir; ChatSession assignment is a +PATCH on the session. 
+""" + +from __future__ import annotations + +from typing import Any + +from fastapi import APIRouter, HTTPException, Request, UploadFile, File +from pydantic import BaseModel, Field + +from backend_service.helpers.workspaces import WorkspaceRegistry + +router = APIRouter(prefix="/api/workspaces", tags=["workspaces"]) + +_registry: WorkspaceRegistry | None = None + + +def _get_registry(_request: Request) -> WorkspaceRegistry: + global _registry + if _registry is not None: + return _registry + from backend_service.app import WORKSPACES_PATH, WORKSPACES_DIR + _registry = WorkspaceRegistry(WORKSPACES_PATH, WORKSPACES_DIR) + return _registry + + +class WorkspaceRequest(BaseModel): + title: str = Field(min_length=1, max_length=200) + description: str = Field(default="", max_length=2000) + + +class WorkspaceUpdateRequest(BaseModel): + title: str | None = Field(default=None, max_length=200) + description: str | None = Field(default=None, max_length=2000) + + +@router.get("") +def list_workspaces(request: Request) -> dict[str, Any]: + registry = _get_registry(request) + return {"workspaces": registry.list_all()} + + +@router.post("") +def create_workspace(request: Request, body: WorkspaceRequest) -> dict[str, Any]: + registry = _get_registry(request) + return {"workspace": registry.create(body.title, body.description)} + + +@router.patch("/{workspace_id}") +def update_workspace( + request: Request, + workspace_id: str, + body: WorkspaceUpdateRequest, +) -> dict[str, Any]: + registry = _get_registry(request) + updated = registry.update(workspace_id, title=body.title, description=body.description) + if updated is None: + raise HTTPException(status_code=404, detail="Workspace not found") + return {"workspace": updated} + + +@router.delete("/{workspace_id}") +def delete_workspace(request: Request, workspace_id: str) -> dict[str, Any]: + registry = _get_registry(request) + if not registry.delete(workspace_id): + raise HTTPException(status_code=404, detail="Workspace not found") + return {"deleted": True, "id": workspace_id} + + +@router.post("/{workspace_id}/documents") +async def upload_workspace_document( + request: Request, + workspace_id: str, + file: UploadFile = File(...), +) -> dict[str, Any]: + registry = _get_registry(request) + workspace = registry.get(workspace_id) + if workspace is None: + raise HTTPException(status_code=404, detail="Workspace not found") + state = request.app.state.chaosengine + raw = await file.read() + return { + "document": state.upload_workspace_document( + workspace_id=workspace_id, + filename=file.filename or "document", + data=raw, + ) + } + + +@router.delete("/{workspace_id}/documents/{doc_id}") +def delete_workspace_document( + request: Request, + workspace_id: str, + doc_id: str, +) -> dict[str, Any]: + registry = _get_registry(request) + if registry.get(workspace_id) is None: + raise HTTPException(status_code=404, detail="Workspace not found") + state = request.app.state.chaosengine + return state.delete_workspace_document(workspace_id, doc_id) diff --git a/backend_service/runaway_guard.py b/backend_service/runaway_guard.py new file mode 100644 index 0000000..758a820 --- /dev/null +++ b/backend_service/runaway_guard.py @@ -0,0 +1,117 @@ +"""Runaway-generation detection shared across MLX worker and llama.cpp paths. + +Phase 2.0.5-F: the MLX worker has had a `RunawayGuard` for a while that +catches three failure modes — repeated identical lines, near-duplicate +reasoning loops, and raw thinking-heading dumps. 
The llama.cpp streaming +path didn't have an equivalent, so a runaway on a GGUF model could fill the +context buffer and pin the host until the user noticed. + +Moved here so both backends can import the same implementation. The +`mlx_worker` module re-exports it for backward compatibility with existing +imports. +""" + +from __future__ import annotations + +import re + +from backend_service.reasoning_split import RAW_REASONING_HEADING_RE + + +_RAW_THINKING_HEADING_RE = RAW_REASONING_HEADING_RE + +_REASONING_LINE_RE = re.compile( + r"^\s*(?:" + r"wait,|okay[,.]|actually[,.]|let me|i (?:need to|should|will|must|can)" + r"|so (?:i |the )|hmm|looking|check(?:ing)?|(?:re)?evaluat" + r"|draft(?:ing)?|refin(?:ing|e)|final (?:check|answer|decision|polish)" + r")", + re.IGNORECASE, +) + + +class RunawayGuard: + """Detect and abort runaway generation loops in streamed output. + + Catches three failure modes: + 1. Repeated identical lines (e.g. "Wait, I will write 'Qwen3.5'." x100) + 2. Near-duplicate reasoning loops (lines starting with "Wait," / "Okay," etc.) + 3. Raw thinking-heading dumps (e.g. "Thinking Process:" at generation start) + + Raises ``RuntimeError`` when a runaway is detected. + """ + + def __init__( + self, + *, + min_line_length: int = 30, + max_repeats: int = 4, + max_reasoning_lines: int = 20, + ) -> None: + self._min_line_length = min_line_length + self._max_repeats = max_repeats + self._max_reasoning_lines = max_reasoning_lines + self._buffer = "" + self._last_line: str | None = None + self._repeat_count = 0 + self._reasoning_streak = 0 + self._total_chars = 0 + self._thinking_heading_seen = False + + def feed(self, text: str) -> None: + """Feed a chunk of streamed text. Raises on detected runaway.""" + self._total_chars += len(text) + self._buffer += text + + # Check for raw thinking heading at the start of generation + if not self._thinking_heading_seen and self._total_chars < 200: + if _RAW_THINKING_HEADING_RE.search(self._buffer): + self._thinking_heading_seen = True + + # Check for repeated / reasoning lines + while "\n" in self._buffer: + line, self._buffer = self._buffer.split("\n", 1) + self._check_line(line) + + def flush(self) -> None: + if self._buffer: + self._check_line(self._buffer) + self._buffer = "" + + @property + def saw_thinking_heading(self) -> bool: + return self._thinking_heading_seen + + def _check_line(self, line: str) -> None: + normalized = " ".join(line.strip().lower().split()) + if len(normalized) < self._min_line_length: + # Short lines still decay the reasoning streak so alternating + # "Wait, ..." / "31536000 seconds." patterns get caught. + self._reasoning_streak = max(0, self._reasoning_streak - 1) + return + + # Exact-match repetition + if normalized == self._last_line: + self._repeat_count += 1 + else: + self._last_line = normalized + self._repeat_count = 1 + + if self._repeat_count >= self._max_repeats: + raise RuntimeError( + "Stopped runaway generation: model is repeating itself." + ) + + # Near-duplicate reasoning loop detection + # Lines like "Wait, I should...", "Okay, I'll...", "Actually, looking..." + # Non-reasoning lines decay the streak by 1 instead of resetting, + # so alternating "Wait, ..." / "31536000 seconds." still trips the guard. + if _REASONING_LINE_RE.match(normalized): + self._reasoning_streak += 2 + else: + self._reasoning_streak = max(0, self._reasoning_streak - 1) + + if self._reasoning_streak >= self._max_reasoning_lines: + raise RuntimeError( + "Stopped runaway generation: model is stuck in a reasoning loop." 
+ ) diff --git a/backend_service/sdcpp_image_runtime.py b/backend_service/sdcpp_image_runtime.py new file mode 100644 index 0000000..259fcc1 --- /dev/null +++ b/backend_service/sdcpp_image_runtime.py @@ -0,0 +1,348 @@ +"""stable-diffusion.cpp image runtime (FU-008 image subset). + +Wraps the staged ``sd`` binary from ``leejet/stable-diffusion.cpp`` (MIT) +as a subprocess engine for cross-platform image generation, mirroring +``SdCppVideoEngine`` and ``MfluxImageEngine``. Targets SD 1.x/2.x/XL, +FLUX.1, FLUX.2, Qwen Image, and Z-Image — the binary supports all of +these via GGUF transformer files. + +Routing +------- +Apple Silicon: prefer mflux for FLUX (faster MLX-native), then sd.cpp +for non-FLUX GGUF, then diffusers MPS. + +Linux/Windows + CUDA: prefer diffusers + bnb NF4 for FLUX, sd.cpp for +GGUF lanes when the user explicitly opts in. + +The engine is selected when a catalog variant carries ``engine="sdcpp"``; +the manager's ``ImageRuntimeManager.generate`` checks ``config.runtime`` +and dispatches accordingly. +""" + +from __future__ import annotations + +import io +import os +import platform +import re +import subprocess +import tempfile +import time +from pathlib import Path +from typing import Any + +from backend_service.image_runtime import ( + GeneratedImage, + ImageGenerationConfig, + _resolve_base_seed, +) + + +# Same progress regex as the video engine — sd.cpp emits ``[INFO] step +# N/M`` lines on stdout regardless of which output type is active. +_STEP_RE = re.compile(r"(?:step\s+|\[)(\d+)\s*/\s*(\d+)") +_LAST_OUTPUT_LINES = 80 +_RUNTIME_LABEL = "stable-diffusion.cpp" + + +# Repos sd.cpp's image lane supports natively. The Wan 2.1/2.2 video +# repos live in ``sdcpp_video_runtime._SUPPORTED_REPOS``; this module +# stays narrow to image-side families. Catalog variants with +# ``engine="sdcpp"`` must reference one of these repos *and* pin a +# ``ggufRepo`` + ``ggufFile`` so the binary has a single transformer +# file to load. +_SUPPORTED_REPOS: frozenset[str] = frozenset({ + "black-forest-labs/FLUX.1-schnell", + "black-forest-labs/FLUX.1-dev", + "black-forest-labs/FLUX.2-klein-4B", + "black-forest-labs/FLUX.2-klein-9B", + "stabilityai/stable-diffusion-3.5-large", + "stabilityai/stable-diffusion-xl-base-1.0", + "stabilityai/stable-diffusion-2-1", + "Qwen/Qwen-Image", + "Qwen/Qwen-Image-2512", + "Tongyi-MAI/Z-Image", + "Tongyi-MAI/Z-Image-Turbo", +}) + + +def supported_repos() -> frozenset[str]: + """Repo ids the sd.cpp image engine accepts.""" + return _SUPPORTED_REPOS + + +def _is_sdcpp_image_repo(repo: str | None) -> bool: + if not repo: + return False + return repo in _SUPPORTED_REPOS + + +def _resolve_sd_binary() -> Path | None: + """Resolve the staged ``sd`` binary path. Same lookup order as + ``sdcpp_video_runtime._resolve_sd_binary`` — the image and video + lanes share the same binary. + """ + env_path = os.environ.get("CHAOSENGINE_SDCPP_BIN") + if env_path: + candidate = Path(env_path) + if candidate.exists(): + return candidate + + home = os.environ.get("HOME") + if home: + managed = Path(home) / ".chaosengine" / "bin" / "sd" + if managed.exists(): + return managed + + return None + + +class SdCppImageEngine: + """Subprocess wrapper around stable-diffusion.cpp for image GGUF. + + ``probe()`` reports binary presence + readiness. ``generate()`` + renders a single PNG via the staged binary, streaming ``step N/M`` + progress lines into ``IMAGE_PROGRESS`` so the desktop UI keeps a + live denoise count. 
Output is read back as PNG bytes for the + standard ``GeneratedImage`` contract. + """ + + runtime_label = _RUNTIME_LABEL + + def __init__(self) -> None: + self._loaded_repo: str | None = None + + # ------------------------------------------------------------------ + # Probe + lifecycle + # ------------------------------------------------------------------ + + def probe(self) -> dict[str, Any]: + binary = _resolve_sd_binary() + if binary is None: + return { + "available": False, + "reason": ( + "stable-diffusion.cpp binary not staged. Run " + "``./scripts/build-sdcpp.sh`` (or set " + "CHAOSENGINE_SDCPP_BIN) to build and install." + ), + } + return { + "available": True, + "reason": None, + "binary": str(binary), + "device": "mps" if platform.system() == "Darwin" else "cuda", + } + + def preload(self, repo: str) -> dict[str, Any]: + if not _is_sdcpp_image_repo(repo): + raise RuntimeError( + f"sd.cpp image lane does not support {repo}. " + f"Supported: {sorted(_SUPPORTED_REPOS)}" + ) + self._loaded_repo = repo + return self.probe() + + def unload(self, repo: str | None = None) -> dict[str, Any]: + if repo is None or repo == self._loaded_repo: + self._loaded_repo = None + return self.probe() + + # ------------------------------------------------------------------ + # Generation + # ------------------------------------------------------------------ + + def generate(self, config: ImageGenerationConfig) -> list[GeneratedImage]: + binary = _resolve_sd_binary() + if binary is None: + raise RuntimeError( + "stable-diffusion.cpp binary not staged. " + "Run ``./scripts/build-sdcpp.sh`` first." + ) + if not _is_sdcpp_image_repo(config.repo): + raise RuntimeError( + f"sd.cpp image lane does not support {config.repo}. " + f"Supported: {sorted(_SUPPORTED_REPOS)}" + ) + if not config.ggufFile: + raise RuntimeError( + "sd.cpp image generate requires a GGUF variant. Pick a " + "catalog entry that pins ``ggufRepo`` + ``ggufFile`` " + "(e.g. FLUX.1-dev · GGUF Q4_K_M)." + ) + + base_seed = _resolve_base_seed(config.seed) + batch = max(1, int(config.batchSize or 1)) + out_images: list[GeneratedImage] = [] + started = time.perf_counter() + + # sd.cpp renders one image per invocation. Loop the batch — same + # pattern the diffusers engine uses when it can't batch on a + # given pipeline. Each iteration gets its own seed so the user + # sees a real variation set rather than four copies. + for index in range(batch): + seed = base_seed + index + with tempfile.TemporaryDirectory(prefix="chaosengine-sdcpp-img-") as tmpdir: + output_path = Path(tmpdir) / f"sdcpp-{seed}.png" + model_path = self._resolve_gguf_path(config) + args = self._build_cli_args( + binary=binary, + config=config, + model_path=model_path, + output_path=output_path, + seed=seed, + ) + output_bytes = self._run_subprocess( + args=args, + config=config, + output_path=output_path, + ) + + elapsed = max(0.1, time.perf_counter() - started) + out_images.append( + GeneratedImage( + seed=seed, + bytes=output_bytes, + extension="png", + mimeType="image/png", + durationSeconds=round(elapsed, 1), + runtimeLabel=_RUNTIME_LABEL, + runtimeNote=( + f"Generated via sd.cpp subprocess " + f"({Path(model_path).name})." + ), + ) + ) + # Reset the timer so the next image's durationSeconds + # measures its own wall-time, not cumulative. 
+ started = time.perf_counter() + + return out_images + + # ------------------------------------------------------------------ + # CLI builders + subprocess plumbing + # ------------------------------------------------------------------ + + def _resolve_gguf_path(self, config: ImageGenerationConfig) -> str: + """Materialise the GGUF transformer file from HF cache (or + download on first use). The catalog variant pins + ``ggufRepo`` + ``ggufFile``. + """ + if not config.ggufFile or not config.ggufRepo: + raise RuntimeError( + "GGUF transformer required for sd.cpp image. Catalog variant " + "must pin ``ggufRepo`` + ``ggufFile``." + ) + try: + from huggingface_hub import hf_hub_download # type: ignore + except ImportError as exc: + raise RuntimeError( + f"huggingface_hub is required to resolve the GGUF path: {exc}" + ) from exc + return hf_hub_download( + repo_id=config.ggufRepo, + filename=config.ggufFile, + ) + + def _build_cli_args( + self, + *, + binary: Path, + config: ImageGenerationConfig, + model_path: str, + output_path: Path, + seed: int, + ) -> list[str]: + """Map an ``ImageGenerationConfig`` onto sd.cpp's CLI flags. + + Mirrors the video CLI builder shape but drops video-specific + flags (``--video-frames``, ``--fps``). Output is PNG; sd.cpp + infers the format from the ``-o`` file extension. + """ + args: list[str] = [ + str(binary), + "--diffusion-model", + model_path, + "-p", + config.prompt, + "-W", + str(config.width), + "-H", + str(config.height), + "--steps", + str(config.steps), + "--cfg-scale", + f"{config.guidance:g}", + "--seed", + str(seed), + "-o", + str(output_path), + ] + if config.negativePrompt: + args.extend(["--negative-prompt", config.negativePrompt]) + return args + + def _run_subprocess( + self, + *, + args: list[str], + config: ImageGenerationConfig, + output_path: Path, + ) -> bytes: + """Spawn ``sd``, stream stdout into ``IMAGE_PROGRESS``, read result.""" + from backend_service.progress import IMAGE_PROGRESS + + proc = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + ) + + last_lines: list[str] = [] + try: + stdout = proc.stdout + if stdout is None: + proc.wait() + raise RuntimeError("sd.cpp subprocess produced no stdout.") + for line in stdout: + stripped = line.rstrip() + last_lines.append(stripped) + if len(last_lines) > _LAST_OUTPUT_LINES: + last_lines.pop(0) + + match = _STEP_RE.search(stripped) + if match: + step = int(match.group(1)) + total = int(match.group(2)) + IMAGE_PROGRESS.set_step(step, total=total) + + if IMAGE_PROGRESS.is_cancelled(): + proc.terminate() + try: + proc.wait(timeout=5) + except subprocess.TimeoutExpired: + proc.kill() + raise RuntimeError("sd.cpp generation cancelled by user.") + + rc = proc.wait() + except KeyboardInterrupt: + proc.terminate() + raise + + if rc != 0: + tail = "\n".join(last_lines[-20:]) + raise RuntimeError( + f"sd.cpp exited with code {rc}.\n" + f"Last output:\n{tail}" + ) + + if not output_path.exists(): + tail = "\n".join(last_lines[-10:]) + raise RuntimeError( + f"sd.cpp completed but output file {output_path.name} is " + f"missing. 
Last output:\n{tail}" + ) + + return output_path.read_bytes() diff --git a/backend_service/sdcpp_video_runtime.py b/backend_service/sdcpp_video_runtime.py index 6f746c0..f593ce0 100644 --- a/backend_service/sdcpp_video_runtime.py +++ b/backend_service/sdcpp_video_runtime.py @@ -9,12 +9,10 @@ SCOPE ----- -Phase C scaffold: ``probe()`` reports availability based on the staged -``sd`` binary (path resolved by the Tauri shell into ``CHAOSENGINE_SDCPP_BIN``). -``generate()`` raises ``NotImplementedError`` until the per-model CLI -arg builders + stdout progress parser land. The hooks the manager calls -(``probe``/``preload``/``unload``) match the contract expected by -``VideoRuntimeManager`` so routing can be wired before the heavy lift. +Phase 3 lift (FU-008): ``generate()`` is wired. Builds the CLI invocation +from a ``VideoGenerationConfig``, spawns the staged ``sd`` binary, parses +``step N/M`` lines off stdout into ``VIDEO_PROGRESS``, then reads the +output mp4 back as bytes for the standard ``GeneratedVideo`` contract. ROUTING ------- @@ -29,6 +27,10 @@ import os import platform +import re +import subprocess +import tempfile +import time from pathlib import Path from typing import Any @@ -39,6 +41,15 @@ ) +# Progress regex — sd.cpp emits ``[INFO] step N/M (..)`` style lines on +# stdout during the denoise loop. Loose pattern catches both the older +# ``step N/M`` and the newer ``[N/M]`` formats; whichever matches gets +# fed into ``VIDEO_PROGRESS``. +_STEP_RE = re.compile(r"(?:step\s+|\[)(\d+)\s*/\s*(\d+)") +_LAST_OUTPUT_LINES = 80 +_RUNTIME_LABEL = "stable-diffusion.cpp" + + # Repos sd.cpp supports natively via GGUF. Kept narrow on the video side — # the binary supports image families too, but those route through # image_runtime (FU-008 image side, separate engine). @@ -110,22 +121,22 @@ def probe(self) -> VideoRuntimeStatus: expectedDevice=None, missingDependencies=["sd"], message=( - "stable-diffusion.cpp binary not staged. Build " - "leejet/stable-diffusion.cpp and either set " - "CHAOSENGINE_SDCPP_BIN or copy `sd` to " - "~/.chaosengine/bin/. See FU-008 in CLAUDE.md." + "stable-diffusion.cpp binary not staged. Run " + "``./scripts/build-sdcpp.sh`` (or set " + "CHAOSENGINE_SDCPP_BIN) to build and install. " + "See FU-008 in CLAUDE.md." ), ) device = "mps" if platform.system() == "Darwin" else "cuda" return VideoRuntimeStatus( activeEngine="sd.cpp", - realGenerationAvailable=False, # scaffold — generate() not wired yet + realGenerationAvailable=True, device=device, expectedDevice=device, message=( - f"sd.cpp binary detected at {binary}. Generation pipeline " - "still scaffold — Wan GGUF generate path lands in the " - "next iteration of FU-008." + f"sd.cpp binary detected at {binary}. Wan GGUF " + "generate path active — pass ``ggufRepo`` + " + "``ggufFile`` on the catalog variant to route here." ), loadedModelRepo=self._loaded_repo, ) @@ -145,11 +156,211 @@ def unload(self, repo: str | None = None) -> VideoRuntimeStatus: return self.probe() def generate(self, config: VideoGenerationConfig) -> GeneratedVideo: - raise NotImplementedError( - "sd.cpp video generate() is scaffold-only. Wan GGUF " - "subprocess wiring lands in the next FU-008 iteration: " - "build CLI args from VideoGenerationConfig (prompt, " - "num_frames, fps, steps, guidance, seed, output path), " - "spawn the staged `sd` binary, stream stdout into " - "VIDEO_PROGRESS, then return the rendered mp4." + binary = _resolve_sd_binary() + if binary is None: + raise RuntimeError( + "stable-diffusion.cpp binary not staged. 
" + "Run ``./scripts/build-sdcpp.sh`` first." + ) + if not _is_sdcpp_video_repo(config.repo): + raise RuntimeError( + f"sd.cpp does not support {config.repo}. " + f"Supported: {sorted(_SUPPORTED_REPOS)}" + ) + + # The Wan video path needs a GGUF transformer file — sd.cpp + # cannot consume a sharded diffusers safetensors snapshot + # directly. The catalog variant pins ``ggufRepo`` + ``ggufFile`` + # for the GGUF lanes (e.g. QuantStack/Wan2.2-TI2V-5B-GGUF). + if not config.ggufFile: + raise RuntimeError( + "sd.cpp video generate requires a GGUF variant. Pick a " + "catalog entry that pins ``ggufRepo`` + ``ggufFile`` " + "(e.g. Wan 2.2 TI2V 5B · GGUF Q4_K_M)." + ) + + seed = config.seed if config.seed is not None else int(time.time()) + + with tempfile.TemporaryDirectory(prefix="chaosengine-sdcpp-") as tmpdir: + # sd.cpp's single-file video outputs are .avi / .webm / + # animated .webp (no native .mp4). webm is the smallest + + # most broadly playable in the desktop's webview. + output_path = Path(tmpdir) / f"sdcpp-{seed}.webm" + model_path = self._resolve_gguf_path(config) + args = self._build_cli_args( + binary=binary, + config=config, + model_path=model_path, + output_path=output_path, + seed=seed, + ) + output_bytes = self._run_subprocess( + args=args, + config=config, + output_path=output_path, + ) + + duration = round(config.numFrames / max(1, config.fps), 3) + return GeneratedVideo( + seed=seed, + bytes=output_bytes, + extension="webm", + mimeType="video/webm", + durationSeconds=duration, + frameCount=config.numFrames, + fps=config.fps, + width=config.width, + height=config.height, + runtimeLabel=_RUNTIME_LABEL, + runtimeNote=( + f"Generated via sd.cpp subprocess " + f"({Path(model_path).name})." + ), + effectiveSteps=config.steps, + effectiveGuidance=config.guidance, ) + + # ------------------------------------------------------------------ + # CLI builders + subprocess plumbing + # ------------------------------------------------------------------ + + def _resolve_gguf_path(self, config: VideoGenerationConfig) -> str: + """Resolve the absolute on-disk path for the GGUF transformer. + + The catalog variant carries ``ggufRepo`` (HF repo) + ``ggufFile`` + (filename within the repo); the standard diffusers download + machinery pulls them into the HF cache. Reuse that — we just + re-resolve the file path so sd.cpp can read it directly. + """ + if not config.ggufFile or not config.ggufRepo: + raise RuntimeError( + "GGUF transformer required for sd.cpp video. Catalog variant " + "must pin ``ggufRepo`` + ``ggufFile``." + ) + try: + from huggingface_hub import hf_hub_download # type: ignore + except ImportError as exc: + raise RuntimeError( + f"huggingface_hub is required to resolve the GGUF path: {exc}" + ) from exc + return hf_hub_download( + repo_id=config.ggufRepo, + filename=config.ggufFile, + ) + + def _build_cli_args( + self, + *, + binary: Path, + config: VideoGenerationConfig, + model_path: str, + output_path: Path, + seed: int, + ) -> list[str]: + """Map a ``VideoGenerationConfig`` onto sd.cpp's CLI flags. + + The mapping mirrors the ``--help`` output of leejet's master tip + as of 2026-04-29 (master-593). If a future sd.cpp release renames + a flag (e.g. ``--video-frames`` → ``--frames``) update here. The + binary fails fast on unknown flags so a regression surfaces as a + clean stderr message rather than silently bad output. 
+ """ + args: list[str] = [ + str(binary), + "--diffusion-model", + model_path, + "-p", + config.prompt, + "-W", + str(config.width), + "-H", + str(config.height), + "--steps", + str(config.steps), + "--cfg-scale", + f"{config.guidance:g}", + "--seed", + str(seed), + "-o", + str(output_path), + "--video-frames", + str(config.numFrames), + "--fps", + str(config.fps), + ] + if config.negativePrompt: + args.extend(["--negative-prompt", config.negativePrompt]) + return args + + def _run_subprocess( + self, + *, + args: list[str], + config: VideoGenerationConfig, + output_path: Path, + ) -> bytes: + """Spawn ``sd``, stream stdout into ``VIDEO_PROGRESS``, read result. + + Uses ``stderr=STDOUT`` so the same parser sees both info-level + progress lines and any error chatter. Tail of the output is kept + in ``last_lines`` so a non-zero exit can include the last few + lines in the raised RuntimeError. Cancellation is cooperative: + we poll ``VIDEO_PROGRESS.is_cancelled()`` per stdout line and + terminate the child if a cancel comes in mid-run. + """ + from backend_service.progress import VIDEO_PROGRESS + + proc = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + ) + + last_lines: list[str] = [] + try: + stdout = proc.stdout + if stdout is None: + proc.wait() + raise RuntimeError("sd.cpp subprocess produced no stdout.") + for line in stdout: + stripped = line.rstrip() + last_lines.append(stripped) + if len(last_lines) > _LAST_OUTPUT_LINES: + last_lines.pop(0) + + match = _STEP_RE.search(stripped) + if match: + step = int(match.group(1)) + total = int(match.group(2)) + VIDEO_PROGRESS.set_step(step, total=total) + + if VIDEO_PROGRESS.is_cancelled(): + proc.terminate() + try: + proc.wait(timeout=5) + except subprocess.TimeoutExpired: + proc.kill() + raise RuntimeError("sd.cpp generation cancelled by user.") + + rc = proc.wait() + except KeyboardInterrupt: + proc.terminate() + raise + + if rc != 0: + tail = "\n".join(last_lines[-20:]) + raise RuntimeError( + f"sd.cpp exited with code {rc}.\n" + f"Last output:\n{tail}" + ) + + if not output_path.exists(): + tail = "\n".join(last_lines[-10:]) + raise RuntimeError( + f"sd.cpp completed but output file {output_path.name} is " + f"missing. Last output:\n{tail}" + ) + + return output_path.read_bytes() diff --git a/backend_service/state.py b/backend_service/state.py index 67fcfa9..8bea54f 100644 --- a/backend_service/state.py +++ b/backend_service/state.py @@ -30,6 +30,7 @@ UpdateSessionRequest, GenerateRequest, OpenAIChatCompletionRequest, + OpenAIEmbeddingsRequest, BenchmarkRunRequest, UpdateSettingsRequest, ) @@ -97,6 +98,74 @@ def _compose_chat_system_prompt(system_prompt: str | None, thinking_mode: str | return (system_prompt or "").strip() +def _build_sampler_overrides(request: Any) -> dict[str, Any]: + """Phase 2.2: collect the request's sampler overrides into a flat dict + keyed using the llama-server `/v1/chat/completions` field names. + + The dict contains only fields the user actually set — `None` defaults + are skipped so the backend's defaults stay in force when the UI sends + no override. Both engines treat unknown keys as no-ops, so the output + is forward-compatible across llama-server / mlx-lm versions. 
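+
+    For example, a request that sets only ``topP=0.9`` and ``seed=42``
+    (illustrative values) comes out as ``{"top_p": 0.9, "seed": 42}``;
+    unset fields are simply absent rather than sent as explicit nulls.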
+ """ + overrides: dict[str, Any] = {} + + def _put(dst: str, value: Any) -> None: + if value is not None: + overrides[dst] = value + + _put("top_p", getattr(request, "topP", None)) + _put("top_k", getattr(request, "topK", None)) + _put("min_p", getattr(request, "minP", None)) + _put("repeat_penalty", getattr(request, "repeatPenalty", None)) + _put("seed", getattr(request, "seed", None)) + mirostat_mode = getattr(request, "mirostatMode", None) + if mirostat_mode is not None: + overrides["mirostat"] = mirostat_mode + _put("mirostat_tau", getattr(request, "mirostatTau", None)) + _put("mirostat_eta", getattr(request, "mirostatEta", None)) + # Phase 3.3: when the user enables logprobs on a request the + # frontend sends a top-k count; map it onto llama-server's + # `logprobs` + `top_logprobs` parameters so the response delta + # carries the per-token info. + logprobs = getattr(request, "logprobs", None) + if logprobs is not None and logprobs > 0: + overrides["logprobs"] = True + overrides["top_logprobs"] = int(logprobs) + return overrides + + +def _build_history_with_reasoning( + messages: list[dict[str, Any]], + *, + preserve_reasoning: bool, +) -> list[dict[str, Any]]: + """Project a session's stored messages into the history list passed to the + inference layer. + + When `preserve_reasoning` is true and an assistant message has a + `reasoning` field captured by ThinkingTokenFilter on a previous turn, + the reasoning is re-emitted inside `...` tags ahead of + the visible answer. Reasoning-capable models (Qwen3, DeepSeek R1, etc.) + consume this naturally on follow-up turns; non-reasoning models will + treat it as inline text. Falsy / missing reasoning is skipped, so this + is safe to call unconditionally. + """ + history: list[dict[str, Any]] = [] + for message in messages: + role = message.get("role") + text = str(message.get("text") or "") + if ( + preserve_reasoning + and role == "assistant" + and message.get("reasoning") + ): + reasoning_str = str(message["reasoning"]).strip() + if reasoning_str: + text = f"\n{reasoning_str}\n\n\n{text}" + history.append({"role": role, "text": text}) + return history + + def _title_from_prompt(prompt: str | None) -> str: words = str(prompt or "").strip().split() return " ".join(words[:4]) or "New chat" @@ -227,6 +296,12 @@ def __init__( self._loading_state: dict[str, Any] | None = None self._downloads: dict[str, dict[str, Any]] = {} self._download_cancel: dict[str, bool] = {} + # Cancellation flags for in-flight chat generations, keyed by session id. + # Set to True via request_cancel_chat(); the streaming loop in + # generate_stream() checks this flag between events and breaks early. + # Cleared at the start of each new generation so a stale flag from a + # prior turn never aborts a fresh request. 
+ self._chat_cancel: dict[str, bool] = {} self._download_processes: dict[str, subprocess.Popen[str]] = {} self._download_tokens: dict[str, str] = {} self._bootstrap() @@ -604,6 +679,7 @@ def _stream_assistant_metrics_payload( tok_s: float, response_seconds: float, requested_runtime: dict[str, Any] | None = None, + ttft_seconds: float | None = None, ) -> dict[str, Any]: metrics: dict[str, Any] = { "finishReason": final_chunk.finish_reason if final_chunk else "stop", @@ -616,6 +692,29 @@ def _stream_assistant_metrics_payload( } if final_chunk and getattr(final_chunk, "dflash_acceptance_rate", None) is not None: metrics["dflashAcceptanceRate"] = final_chunk.dflash_acceptance_rate + if ttft_seconds is not None: + metrics["ttftSeconds"] = ttft_seconds + # Phase 3.1: forward DDTree accepted-span data when present. + accepted_spans = getattr(final_chunk, "accepted_spans", None) if final_chunk else None + if accepted_spans: + metrics["acceptedSpans"] = accepted_spans + accepted_token_text = getattr(final_chunk, "accepted_token_text", None) if final_chunk else None + if accepted_token_text: + metrics["acceptedTokenText"] = accepted_token_text + + # Phase 3.5: per-turn perf telemetry snapshot. Best-effort — + # samplers fail silently and the telemetry strip just omits the + # missing fields. Captured at finalisation so the values reflect + # the load the turn actually generated, not idle baseline. + try: + from backend_service.helpers.perf import snapshot_perf_telemetry + telemetry = snapshot_perf_telemetry() + if not telemetry.is_empty: + metrics["perfTelemetry"] = telemetry.to_dict() + except Exception: + # Telemetry must never block a turn from finalising. + pass + return { **self._loaded_model_metrics_fields(), **self._result_runtime_metrics_fields(final_chunk), @@ -1013,6 +1112,315 @@ def create_session(self, title: str | None = None) -> dict[str, Any]: session = self._ensure_session(title=title) return session + def add_message_variant( + self, + session_id: str, + message_index: int, + model_ref: str, + model_name: str, + canonical_repo: str | None, + source: str, + path: str | None, + backend: str, + max_tokens: int, + temperature: float, + ) -> dict[str, Any]: + """Phase 2.5: generate a sibling variant of an assistant message. + + Truncates the session's message list to the user message that + produced the target assistant turn (i.e. messages[0..index-1] + plus the user prompt at index-1), then runs a non-streaming + generation against the override model. The result is attached + to ``messages[message_index].variants`` so the frontend can + render it side-by-side with the original answer. + + The override model must already be loaded as the current + runtime — callers should preload via the existing My Models + flow before invoking compare. Raising on misalignment keeps + the contract simple: variant generation never reloads the + runtime under the user. + + Returns the updated session dict so the frontend can replace + its local copy in one round-trip. 
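+
+        Illustrative call (``state`` stands for this object; the session id
+        and model ref are hypothetical, a sketch rather than a fixed contract)::
+
+            state.add_message_variant(
+                "session-ab12cd34", message_index=5,
+                model_ref="qwen2.5-7b-instruct-q4_k_m",
+                model_name="Qwen 2.5 7B Instruct", canonical_repo=None,
+                source="local", path=None, backend="llama.cpp",
+                max_tokens=512, temperature=0.7,
+            )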
+ """ + with self._lock: + session = next( + (s for s in self.chat_sessions if s.get("id") == session_id), + None, + ) + if session is None: + raise ValueError(f"Session not found: {session_id}") + messages = session.get("messages") or [] + if message_index < 0 or message_index >= len(messages): + raise ValueError( + f"message_index {message_index} out of range " + f"(session has {len(messages)} messages)" + ) + target = messages[message_index] + if target.get("role") != "assistant": + raise ValueError( + f"Variants can only be added to assistant messages " + f"(message {message_index} role: {target.get('role')})" + ) + if message_index == 0: + raise ValueError("Cannot add a variant to the first message — no prompt available") + user_msg = messages[message_index - 1] + if user_msg.get("role") != "user": + raise ValueError( + f"Variant prompt must come from a user message at index " + f"{message_index - 1}, got role {user_msg.get('role')}" + ) + history = _build_history_with_reasoning( + messages[: message_index - 1], + preserve_reasoning=False, + ) + user_prompt = str(user_msg.get("text") or "") + + if self.runtime.loaded_model is None: + raise ValueError("Load the override model before requesting a variant") + loaded = self.runtime.loaded_model + # Sanity check the runtime is the requested model. We don't + # auto-reload because the user explicitly wants to compare + # against an already-warm choice. + if loaded.ref != model_ref and loaded.runtimeTarget != model_ref: + raise ValueError( + f"Loaded runtime is {loaded.ref}, but variant requested {model_ref}. " + "Load the desired model first via My Models, then retry." + ) + + started_at = time.perf_counter() + try: + result = self.runtime.generate( + prompt=user_prompt, + history=history, + system_prompt=_compose_chat_system_prompt(None), + max_tokens=max_tokens, + temperature=temperature, + ) + except RuntimeError as exc: + raise ValueError(f"Variant generation failed: {exc}") from exc + elapsed = round(time.perf_counter() - started_at, 2) + + metrics = self._stream_assistant_metrics_payload( + final_chunk=type("Chunk", (), { + "finish_reason": result.finishReason, + "prompt_tokens": result.promptTokens, + "completion_tokens": result.completionTokens, + "tok_s": result.tokS, + "runtime_note": result.runtimeNote, + "dflash_acceptance_rate": getattr(result, "dflashAcceptanceRate", None), + })(), + tok_s=result.tokS, + response_seconds=elapsed, + ) + metrics["model"] = model_name + metrics["modelRef"] = model_ref + metrics["canonicalRepo"] = canonical_repo + metrics["modelSource"] = source + metrics["modelPath"] = path + metrics["backend"] = backend + + variant = { + "modelRef": model_ref, + "modelName": model_name, + "text": result.text, + "metrics": metrics, + "generatedAt": self._time_label(), + } + target.setdefault("variants", []).append(variant) + session["updatedAt"] = self._time_label() + self._persist_sessions() + return session + + def delve_message( + self, + session_id: str, + message_index: int, + max_tokens: int = 1024, + temperature: float = 0.5, + ) -> dict[str, Any]: + """Phase 3.6: re-process an assistant message with a critique system + prompt and attach the result as a variant. + + The Delve pass asks the currently-loaded model to read the prior + answer with a critic's eye and surface anything wrong / missing + / misleading, then propose a corrected response. Attached as a + ``modelName: "Delve critique"`` variant so the frontend's + existing variant rendering surfaces it under the original turn. 
+ + Like add_message_variant, requires the model to already be + loaded (no auto-reload). + """ + with self._lock: + session = next( + (s for s in self.chat_sessions if s.get("id") == session_id), + None, + ) + if session is None: + raise ValueError(f"Session not found: {session_id}") + messages = session.get("messages") or [] + if message_index < 0 or message_index >= len(messages): + raise ValueError( + f"message_index {message_index} out of range " + f"(session has {len(messages)} messages)" + ) + target = messages[message_index] + if target.get("role") != "assistant": + raise ValueError( + f"Delve only works on assistant messages " + f"(message {message_index} role: {target.get('role')})" + ) + if message_index == 0: + raise ValueError("Cannot delve on the first message — no prompt available") + user_msg = messages[message_index - 1] + user_prompt = str(user_msg.get("text") or "") + original_answer = str(target.get("text") or "") + + if self.runtime.loaded_model is None: + raise ValueError("Load a model before requesting a Delve pass") + loaded = self.runtime.loaded_model + + # Build the critique-mode system prompt. We deliberately ask + # for both critique + improved answer in one pass so the + # variant card renders something the user can drop straight + # back into the thread if they like the result. + critique_system = ( + "You are a careful reviewer. Read the prior assistant answer with a " + "critic's eye. First, list any factual errors, missing context, or " + "misleading claims under a 'Critique:' heading. Then, under a 'Revised " + "answer:' heading, write a corrected response that fixes the issues " + "you identified. Be concise." + ) + + history = _build_history_with_reasoning( + messages[: message_index - 1], + preserve_reasoning=False, + ) + # Append the user prompt + original answer as context, then + # ask the model to delve into it. + history.append({"role": "user", "text": user_prompt}) + history.append({"role": "assistant", "text": original_answer}) + delve_prompt = ( + "Apply the Critique / Revised answer treatment to the assistant's " + "previous response." + ) + + started_at = time.perf_counter() + try: + result = self.runtime.generate( + prompt=delve_prompt, + history=history, + system_prompt=critique_system, + max_tokens=max_tokens, + temperature=temperature, + ) + except RuntimeError as exc: + raise ValueError(f"Delve generation failed: {exc}") from exc + elapsed = round(time.perf_counter() - started_at, 2) + + metrics = self._stream_assistant_metrics_payload( + final_chunk=type("Chunk", (), { + "finish_reason": result.finishReason, + "prompt_tokens": result.promptTokens, + "completion_tokens": result.completionTokens, + "tok_s": result.tokS, + "runtime_note": result.runtimeNote, + "dflash_acceptance_rate": getattr(result, "dflashAcceptanceRate", None), + })(), + tok_s=result.tokS, + response_seconds=elapsed, + ) + metrics["model"] = "Delve critique" + metrics["modelRef"] = loaded.ref + + variant = { + "modelRef": loaded.ref, + "modelName": "Delve critique", + "text": result.text, + "metrics": metrics, + "generatedAt": self._time_label(), + } + target.setdefault("variants", []).append(variant) + session["updatedAt"] = self._time_label() + self._persist_sessions() + return session + + def fork_session( + self, + source_session_id: str, + fork_at_message_index: int, + title: str | None = None, + ) -> dict[str, Any]: + """Phase 2.4: branch a thread at a specific message. 
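+        Illustrative call (hypothetical id; ``state`` is this object):
+        ``state.fork_session("session-ab12cd34", fork_at_message_index=6,
+        title="Alt take")`` copies ``messages[0..6]`` into the new session
+        and leaves the source thread untouched.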
+ + Creates a new session containing a deep copy of the source's + messages up to (and including) `fork_at_message_index`, plus + the source's runtime profile (model, cache, thinking mode) so + the fork resumes exactly where the user diverged. The new + session carries `parentSessionId` and `forkedAtMessageIndex` + metadata so the sidebar can render a relationship hint and + future features (compare-vs-parent, merge) have the linkage. + + Raises ``ValueError`` when the source session doesn't exist + or the fork index is out of range. + """ + import copy + + with self._lock: + source = next( + (s for s in self.chat_sessions if s.get("id") == source_session_id), + None, + ) + if source is None: + raise ValueError(f"Source session not found: {source_session_id}") + messages = source.get("messages") or [] + if fork_at_message_index < 0 or fork_at_message_index >= len(messages): + raise ValueError( + f"fork_at_message_index {fork_at_message_index} out of range " + f"(session has {len(messages)} messages)" + ) + + fork_title = title or f"{source.get('title', 'Chat')} (fork)" + new_id = f"session-{uuid.uuid4().hex[:8]}" + new_session: dict[str, Any] = { + "id": new_id, + "title": fork_title, + "updatedAt": self._time_label(), + "pinned": False, + # Carry the runtime profile so the fork resumes on the + # same model + cache config as the parent. + "model": source.get("model"), + "modelRef": source.get("modelRef"), + "canonicalRepo": source.get("canonicalRepo"), + "modelSource": source.get("modelSource"), + "modelPath": source.get("modelPath"), + "modelBackend": source.get("modelBackend"), + "thinkingMode": source.get("thinkingMode") or "off", + "cacheLabel": source.get("cacheLabel"), + "cacheStrategy": source.get("cacheStrategy"), + "cacheBits": source.get("cacheBits"), + "fp16Layers": source.get("fp16Layers"), + "fusedAttention": source.get("fusedAttention"), + "fitModelInMemory": source.get("fitModelInMemory"), + "contextTokens": source.get("contextTokens"), + "speculativeDecoding": source.get("speculativeDecoding"), + "dflashDraftModel": source.get("dflashDraftModel"), + "treeBudget": source.get("treeBudget"), + # Branching linkage so the UI can render the + # parent-child relationship and so future features + # (diff, merge) have the tie. + "parentSessionId": source_session_id, + "forkedAtMessageIndex": fork_at_message_index, + "messages": copy.deepcopy(messages[: fork_at_message_index + 1]), + } + self.chat_sessions.insert(0, new_session) + self.add_activity( + "Chat session forked", + f"{source.get('title', 'Chat')} → {fork_title}", + ) + self._persist_sessions() + return new_session + def update_session(self, session_id: str, request: UpdateSessionRequest) -> dict[str, Any]: with self._lock: session = self._ensure_session(session_id=session_id) @@ -1053,6 +1461,9 @@ def update_session(self, session_id: str, request: UpdateSessionRequest) -> dict session["treeBudget"] = request.treeBudget if "dflashDraftModel" in fields_set: session["dflashDraftModel"] = request.dflashDraftModel + if "workspaceId" in fields_set: + # Phase 3.7: empty string clears the assignment. 
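+            # e.g. an UpdateSessionRequest carrying workspaceId="" detaches the
+            # session from its workspace, while omitting the field entirely
+            # leaves the current assignment alone (the fields_set gate above).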
+ session["workspaceId"] = request.workspaceId or None if request.messages is not None: session["messages"] = request.messages session["updatedAt"] = self._time_label() @@ -1938,6 +2349,124 @@ def delete_document(self, session_id: str, doc_id: str) -> dict[str, Any]: self._persist_sessions() return {"deleted": doc_id} + # -- Phase 3.7: workspace knowledge stack helpers -------------------- + + def _workspace_dir(self, workspace_id: str) -> Path: + from backend_service.app import WORKSPACES_DIR + safe_id = "".join(ch for ch in workspace_id if ch.isalnum() or ch in "-_") + return WORKSPACES_DIR / safe_id + + def upload_workspace_document( + self, + workspace_id: str, + filename: str, + data: bytes, + ) -> dict[str, Any]: + """Phase 3.7: ingest a document into a workspace. + + Mirrors `upload_document` but writes under + `/workspaces//`. The chunked text JSON sits next + to the original file so the RAG retriever can read both + session and workspace docs through the same DocumentIndex + helpers without bespoke logic. + """ + from backend_service.app import MAX_DOC_SIZE_BYTES, DOC_ALLOWED_EXTENSIONS + from backend_service.helpers.workspaces import WorkspaceRegistry + from backend_service.app import WORKSPACES_PATH, WORKSPACES_DIR + + if len(data) > MAX_DOC_SIZE_BYTES: + raise HTTPException( + status_code=413, + detail=f"File exceeds {MAX_DOC_SIZE_BYTES // (1024*1024)}MB limit.", + ) + sanitized = _sanitize_filename(filename) + ext = Path(sanitized).suffix.lower() + if ext not in DOC_ALLOWED_EXTENSIONS: + raise HTTPException(status_code=400, detail=f"File type not supported: {ext}") + + registry = WorkspaceRegistry(WORKSPACES_PATH, WORKSPACES_DIR) + workspace = registry.get(workspace_id) + if workspace is None: + raise HTTPException(status_code=404, detail="Workspace not found") + + doc_id = f"doc-{uuid.uuid4().hex[:12]}" + workspace_dir = self._workspace_dir(workspace_id) + workspace_dir.mkdir(parents=True, exist_ok=True) + doc_path = workspace_dir / f"{doc_id}{ext}" + doc_path.write_bytes(data) + try: + doc_path.chmod(0o600) + except OSError: + pass + + try: + text = _extract_text_from_file(doc_path) + except RuntimeError as exc: + doc_path.unlink(missing_ok=True) + raise HTTPException(status_code=400, detail=str(exc)) from exc + + chunks = _chunk_text(text) + chunks_path = workspace_dir / f"{doc_id}.chunks.json" + chunks_path.write_text( + json.dumps([{"index": i, "text": c} for i, c in enumerate(chunks)], indent=2), + encoding="utf-8", + ) + + doc_meta = { + "id": doc_id, + "filename": doc_path.name, + "originalName": sanitized, + "sizeBytes": len(data), + "chunkCount": len(chunks), + "uploadedAt": self._time_label(), + } + + # Persist on the workspace registry too so the doc list comes + # back on subsequent /api/workspaces calls without reading the + # filesystem again. + existing_docs = list(workspace.get("documents") or []) + existing_docs.append(doc_meta) + registry.update(workspace_id, title=workspace["title"]) + # The update() call doesn't currently support documents — read + # the entry back, mutate, save by writing the full payload. + # Workaround: write directly via the registry's internal map. 
+ registry._workspaces[workspace_id]["documents"] = existing_docs + registry._workspaces[workspace_id]["updatedAt"] = self._time_label() + registry.save() + self.add_log( + "chat", "info", + f"Document uploaded to workspace {workspace_id}: {sanitized} ({len(chunks)} chunks)", + ) + return doc_meta + + def delete_workspace_document(self, workspace_id: str, doc_id: str) -> dict[str, Any]: + """Phase 3.7: remove a document from a workspace's stack.""" + from backend_service.helpers.workspaces import WorkspaceRegistry + from backend_service.app import WORKSPACES_PATH, WORKSPACES_DIR + + registry = WorkspaceRegistry(WORKSPACES_PATH, WORKSPACES_DIR) + workspace = registry.get(workspace_id) + if workspace is None: + raise HTTPException(status_code=404, detail="Workspace not found") + + docs = list(workspace.get("documents") or []) + target = next((d for d in docs if d.get("id") == doc_id), None) + if not target: + raise HTTPException(status_code=404, detail="Document not found.") + remaining = [d for d in docs if d.get("id") != doc_id] + registry._workspaces[workspace_id]["documents"] = remaining + registry._workspaces[workspace_id]["updatedAt"] = self._time_label() + registry.save() + + workspace_dir = self._workspace_dir(workspace_id) + for f in workspace_dir.glob(f"{doc_id}*"): + try: + f.unlink() + except OSError: + pass + self.add_log("chat", "info", f"Workspace document removed: {target.get('originalName')}") + return {"deleted": doc_id} + def delete_session(self, session_id: str) -> dict[str, Any]: with self._lock: target = next((s for s in self.chat_sessions if s.get("id") == session_id), None) @@ -1953,26 +2482,70 @@ def _retrieve_session_context(self, session_id: str, prompt: str, top_k: int = 5 Returns (context_text, citations) where citations is a list of dicts with docId, docName, chunkIndex, page, preview keys. + + Phase 2.6: when an llama-embedding binary + embedding GGUF are + both discoverable via env vars or `/embeddings/`, + retrieval uses semantic cosine similarity blended with BM25 + (70/30) instead of TF-IDF + BM25. The embedding client is + resolved per-call so newly-installed models pick up without a + restart, and the legacy lexical path remains the fallback when + anything goes wrong. """ from backend_service.helpers.documents import DocumentIndex - + from backend_service.rag import resolve_embedding_client + + # Phase 3.7: collect document directories from both the session + # and (when assigned) the session's workspace, so the RAG + # retriever sees the merged corpus. Workspace docs survive + # session deletion + are visible across every session in the + # workspace. + chunk_dirs: list[Path] = [] session_dir = self._session_docs_dir(session_id) - if not session_dir.exists(): + if session_dir.exists(): + chunk_dirs.append(session_dir) + + with self._lock: + session = next( + (s for s in self.chat_sessions if s.get("id") == session_id), + None, + ) + workspace_id = session.get("workspaceId") if session else None + if workspace_id: + workspace_dir = self._workspace_dir(workspace_id) + if workspace_dir.exists(): + chunk_dirs.append(workspace_dir) + + if not chunk_dirs: return "", [] - # Build a temporary index from all session documents + # Embedding client discovery: env vars override path; if no + # CHAOSENGINE_EMBEDDING_MODEL is set we look under + # `/embeddings/*.gguf`. Returns None when + # nothing is wired, in which case retrieval transparently + # falls back to TF-IDF + BM25. 
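+        # e.g. CHAOSENGINE_EMBEDDING_MODEL=~/models/embed/nomic-embed-text.Q8_0.gguf
+        # (hypothetical filename) pins a specific embedding GGUF explicitly.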
+ from backend_service.app import DOCUMENTS_DIR + + embedding_client = resolve_embedding_client(DOCUMENTS_DIR.parent) + + # Build a temporary index from all collected directories. index = DocumentIndex() - for chunk_file in session_dir.glob("*.chunks.json"): - try: - doc_chunks = json.loads(chunk_file.read_text(encoding="utf-8")) - doc_name = chunk_file.stem.replace(".chunks", "") - full_text = "\n\n".join(c.get("text", "") for c in doc_chunks) - if full_text.strip(): - index.add_document(full_text, doc_id=doc_name, doc_name=doc_name) - except (OSError, json.JSONDecodeError): - continue + for chunk_dir in chunk_dirs: + for chunk_file in chunk_dir.glob("*.chunks.json"): + try: + doc_chunks = json.loads(chunk_file.read_text(encoding="utf-8")) + doc_name = chunk_file.stem.replace(".chunks", "") + full_text = "\n\n".join(c.get("text", "") for c in doc_chunks) + if full_text.strip(): + index.add_document( + full_text, + doc_id=doc_name, + doc_name=doc_name, + embedding_client=embedding_client, + ) + except (OSError, json.JSONDecodeError): + continue - results = index.search(prompt, top_k=top_k) + results = index.search(prompt, top_k=top_k, embedding_client=embedding_client) if not results: return "", [] @@ -2080,15 +2653,25 @@ def generate(self, request: GenerateRequest) -> dict[str, Any]: if effective_canonical_repo and self.runtime.loaded_model.canonicalRepo != effective_canonical_repo: self.runtime.loaded_model.canonicalRepo = effective_canonical_repo - history = [{"role": message["role"], "text": message["text"]} for message in session["messages"]] + history = _build_history_with_reasoning( + session["messages"], + preserve_reasoning=(effective_thinking_mode == "auto"), + ) session["messages"].append({"role": "user", "text": request.prompt, "metrics": None}) session["updatedAt"] = self._time_label() - session["model"] = self.runtime.loaded_model.name - session["modelRef"] = self.runtime.loaded_model.ref - session["canonicalRepo"] = self.runtime.loaded_model.canonicalRepo - session["modelSource"] = self.runtime.loaded_model.source - session["modelPath"] = self.runtime.loaded_model.path - session["modelBackend"] = self.runtime.loaded_model.backend + # Phase 2.12: if `oneTurnOverride` is set, skip persisting the + # active runtime's model identity onto the session so the + # session default (the previously-loaded model) sticks for + # the next plain message. Other session metadata (cache + # strategy, context, thinking mode) still updates so the + # picked model's runtime profile is reflected on this turn. + if not getattr(request, "oneTurnOverride", False): + session["model"] = self.runtime.loaded_model.name + session["modelRef"] = self.runtime.loaded_model.ref + session["canonicalRepo"] = self.runtime.loaded_model.canonicalRepo + session["modelSource"] = self.runtime.loaded_model.source + session["modelPath"] = self.runtime.loaded_model.path + session["modelBackend"] = self.runtime.loaded_model.backend session["thinkingMode"] = effective_thinking_mode session["cacheLabel"] = self._cache_label( cache_strategy=str(self.runtime.loaded_model.cacheStrategy), @@ -2161,6 +2744,12 @@ class _AgentResultProxy: "arguments": tc.arguments, "result": tc.result, "elapsed": tc.elapsed_seconds, + # Phase 2.8: forward structured output hint + + # data through to the frontend `ToolCallInfo`. + # When `render_as` is None the frontend falls + # back to the legacy collapsible-JSON view. 
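+                        # e.g. render_as="table" pairs with data={"columns": [...],
+                        # "rows": [...]}, matching what the web_search tool emits.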
+ "renderAs": tc.render_as, + "data": tc.data, } for tc in agent_result.tool_calls ] @@ -2172,6 +2761,9 @@ class _AgentResultProxy: max_tokens=request.maxTokens, temperature=request.temperature, images=request.images, + samplers=_build_sampler_overrides(request), + reasoning_effort=request.reasoningEffort, + json_schema=request.jsonSchema, ) tool_call_payloads = [] except RuntimeError as exc: @@ -2309,15 +2901,25 @@ def generate_stream(self, request: GenerateRequest): if effective_canonical_repo and self.runtime.loaded_model.canonicalRepo != effective_canonical_repo: self.runtime.loaded_model.canonicalRepo = effective_canonical_repo - history = [{"role": m["role"], "text": m["text"]} for m in session["messages"]] + history = _build_history_with_reasoning( + session["messages"], + preserve_reasoning=(effective_thinking_mode == "auto"), + ) session["messages"].append({"role": "user", "text": request.prompt, "metrics": None}) session["updatedAt"] = self._time_label() - session["model"] = self.runtime.loaded_model.name - session["modelRef"] = self.runtime.loaded_model.ref - session["canonicalRepo"] = self.runtime.loaded_model.canonicalRepo - session["modelSource"] = self.runtime.loaded_model.source - session["modelPath"] = self.runtime.loaded_model.path - session["modelBackend"] = self.runtime.loaded_model.backend + # Phase 2.12: if `oneTurnOverride` is set, skip persisting the + # active runtime's model identity onto the session so the + # session default (the previously-loaded model) sticks for + # the next plain message. Other session metadata (cache + # strategy, context, thinking mode) still updates so the + # picked model's runtime profile is reflected on this turn. + if not getattr(request, "oneTurnOverride", False): + session["model"] = self.runtime.loaded_model.name + session["modelRef"] = self.runtime.loaded_model.ref + session["canonicalRepo"] = self.runtime.loaded_model.canonicalRepo + session["modelSource"] = self.runtime.loaded_model.source + session["modelPath"] = self.runtime.loaded_model.path + session["modelBackend"] = self.runtime.loaded_model.backend session["thinkingMode"] = effective_thinking_mode session["cacheLabel"] = self._cache_label( cache_strategy=str(self.runtime.loaded_model.cacheStrategy), @@ -2343,6 +2945,27 @@ def generate_stream(self, request: GenerateRequest): model_tag = self.runtime.loaded_model.name self.add_log("chat", "info", f"[{model_tag}] Streaming response...") self.active_requests += 1 + # Hotfix (2026-05-01 v2): vision input has no working path + # on either runtime today. The MLX worker subprocess never + # wired images, and `_resolve_gguf_path` strips mmproj + # projector files so llama-server never gets `--mmproj`. + # Until mmproj wiring lands (Phase 2.6+ work), the + # `visionEnabled` flag on LoadedModelInfo stays False on + # every load and we strip + warn loudly here. The capability + # resolver also demotes vision via this same flag so the + # composer hides the attach button — this branch is the + # belt-and-braces for legacy clients that bypass the gate. + if request.images and not self.runtime.loaded_model.visionEnabled: + engine_label = self.runtime.loaded_model.engine or "current" + self.add_log( + "chat", "warning", + f"[{model_tag}] Stripped {len(request.images)} attached " + f"image(s): the {engine_label} runtime has no mmproj " + "vision projector wired up, so images would be silently " + "dropped and the model would hallucinate. 
Vision support " + "lands with the mmproj loader.", + ) + request.images = None effective_system_prompt = _compose_chat_system_prompt(request.systemPrompt, effective_thinking_mode) doc_context, stream_rag_citations = self._retrieve_session_context(session["id"], request.prompt) if doc_context: @@ -2359,12 +2982,132 @@ def generate_stream(self, request: GenerateRequest): enable_tools = request.enableTools available_tools = request.availableTools gen_start = time.perf_counter() + # Reset any stale cancellation flag from a prior turn so this fresh + # generation isn't aborted before it starts. + chaosengine.clear_chat_cancel(session["id"]) + session_id_for_cancel = session["id"] def _sse_stream(): full_text = "" full_reasoning = "" final_chunk = None agent_tool_calls: list[dict[str, Any]] = [] + cancelled = False + # Phase 2.0: track prompt-eval → generating phase transition so the + # client can render an explicit "Processing prompt..." indicator + # instead of a blank flashing cursor while the model is still + # ingesting the prompt. The OpenAI-compat streaming endpoint + # exposes nothing until the first decoded token, so phase here is + # binary (prompt_eval | generating) plus a TTFT measurement on + # transition. + phase_first_output_seen = False + ttft_seconds: float | None = None + + # Phase 2.0.5-B: pre-flight memory gate. Refuse the generation + # before it starts when the host is already memory-starved, so + # the user gets an actionable error instead of a silent OOM / + # swap-thrash that wedges the laptop. The gate is conservative + # — it does not predict working-set size, just bails when the + # available-memory floor or pressure ceiling is breached. + try: + from backend_service.helpers.memory_gate import ( + gate_chat_generation, + snapshot_memory_signals, + ) + + available_gb, pressure_percent = snapshot_memory_signals() + refusal = gate_chat_generation(available_gb, pressure_percent) + if refusal is not None: + chaosengine.add_log( + "chat", "warning", + f"[{model_tag}] Memory gate refused generation: " + f"{refusal['code']} (avail={available_gb:.1f} GB, " + f"pressure={pressure_percent:.0f}%).", + ) + with chaosengine._lock: + # Roll back the optimistic user message we appended + # earlier so the refusal looks like the request never + # happened, matching the existing RuntimeError path. + if (session["messages"] + and session["messages"][-1].get("role") == "user" + and session["messages"][-1].get("text") == request.prompt): + session["messages"].pop() + session["updatedAt"] = chaosengine._time_label() + chaosengine._persist_sessions() + chaosengine.active_requests = max(0, chaosengine.active_requests - 1) + yield f"data: {json.dumps({'error': refusal['message']})}\n\n" + return + except Exception as exc: + # Gate failure must not block legitimate generations. Log and + # continue — better to risk a possible OOM than to refuse + # everything when psutil glitches. + chaosengine.add_log( + "chat", "warning", + f"[{model_tag}] Memory gate skipped due to error: {exc}", + ) + + yield f"data: {json.dumps({'phase': 'prompt_eval'})}\n\n" + + # Phase 2.0.5-D: output-length runaway guard. Abort the generation + # if accumulated visible text exceeds the user's max_tokens budget + # by 1.5×, which catches decoder loops that ignore the EOS token + # (a known failure mode on certain quantised models). Char count + # is a fast proxy — average ~4 chars per token across English + + # markdown code, so the threshold is `max_tokens * 6` chars. 
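+            # Worked example: maxTokens=512 gives max(2000, 512 * 6) = 3072
+            # characters of headroom; very small budgets are floored at 2000.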
+ runaway_char_budget = max(2000, int(request.maxTokens) * 6) + runaway_triggered = False + runaway_loop_reason: str | None = None + + # Phase 2.0.5-F: per-stream repetition / reasoning-loop guard for + # the llama.cpp path. The MLX worker has run this guard inside the + # subprocess for a while; the llama-server REST stream had no + # equivalent and a runaway model could decode tokens indefinitely + # against a paused UI. Same RunawayGuard module both paths use. + from backend_service.runaway_guard import RunawayGuard as _RunawayGuard + + llama_path_guard = _RunawayGuard() + + # Phase 2.0.5-C: tok/s floor monitor. After the model has + # produced output for a 30-second window, check the rolling + # decode rate. Falling below 0.3 tok/s for that long usually + # means thermal throttle, GPU stall, or a corrupted model + # state — none of which recovers on its own. Abort with a + # diagnostic so the user can switch model / cool down / + # restart the worker. + TOKS_FLOOR_WINDOW_S = 30.0 + TOKS_FLOOR_MIN = 0.3 + window_started_at: float | None = None + window_tokens = 0 + stall_triggered = False + + # Phase 2.0.5-G: in-stream panic monitor. While a generation + # is in flight, sample memory every PANIC_SAMPLE_INTERVAL_S + # and emit a `panic` SSE event when free RAM crosses the + # critical floor or pressure goes critical. The front-end + # renders a non-blocking banner offering Cancel / Unload + # warm / Continue. Generation is NOT auto-cancelled here — + # that's the user's call. The stricter pre-flight gate + # (Phase 2.0.5-B) blocks tight starts, this catches mid- + # flight degradation as KV cache or other activity grows. + PANIC_SAMPLE_INTERVAL_S = 5.0 + PANIC_AVAILABLE_FLOOR_GB = 0.5 + PANIC_PRESSURE_CEILING = 96.0 + last_panic_sample_at: float | None = None + panic_emitted = False + # Phase 2.0.5-I: thermal pressure watch. `pmset -g therm` on + # macOS reports warning levels when CPU/GPU is throttling. + # We surface the first transition to "critical" via a SSE + # event so the user sees why decode just slowed. Linux / + # Windows: read returns None and this watch is a no-op. 
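+            # Only a "critical" reading is acted on below; any other value
+            # (or a None read on non-macOS hosts) leaves the watch idle.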
+ thermal_warning_emitted = False + + def _maybe_emit_generating_phase() -> str: + nonlocal phase_first_output_seen, ttft_seconds + if phase_first_output_seen: + return "" + phase_first_output_seen = True + ttft_seconds = round(time.perf_counter() - gen_start, 3) + return f"data: {json.dumps({'phase': 'generating', 'ttftSeconds': ttft_seconds})}\n\n" try: if enable_tools: @@ -2378,10 +3121,23 @@ def _sse_stream(): images=request.images, available_tools=available_tools, ): + if chaosengine.is_chat_cancel_requested(session_id_for_cancel): + cancelled = True + break if "token" in event: + phase_event = _maybe_emit_generating_phase() + if phase_event: + yield phase_event full_text += event["token"] yield f"data: {json.dumps({'token': event['token']})}\n\n" + if len(full_text) > runaway_char_budget: + runaway_triggered = True + cancelled = True + break elif "tool_call_start" in event: + phase_event = _maybe_emit_generating_phase() + if phase_event: + yield phase_event yield f"data: {json.dumps({'toolCallStart': event['tool_call_start']})}\n\n" elif "tool_call_result" in event: agent_tool_calls.append(event["tool_call_result"]) @@ -2396,15 +3152,145 @@ def _sse_stream(): max_tokens=request.maxTokens, temperature=request.temperature, images=request.images, thinking_mode=effective_thinking_mode, + samplers=_build_sampler_overrides(request), + reasoning_effort=request.reasoningEffort, + json_schema=request.jsonSchema, ): + if chaosengine.is_chat_cancel_requested(session_id_for_cancel): + cancelled = True + break if chunk.reasoning: + phase_event = _maybe_emit_generating_phase() + if phase_event: + yield phase_event full_reasoning += chunk.reasoning yield f"data: {json.dumps({'reasoning': chunk.reasoning})}\n\n" if chunk.reasoning_done: yield f"data: {json.dumps({'reasoningDone': True})}\n\n" if chunk.text: + phase_event = _maybe_emit_generating_phase() + if phase_event: + yield phase_event full_text += chunk.text yield f"data: {json.dumps({'token': chunk.text})}\n\n" + # Phase 3.3: forward per-token logprobs when + # the inference layer captured them. + if chunk.token_logprobs: + yield f"data: {json.dumps({'tokenLogprobs': chunk.token_logprobs})}\n\n" + if len(full_text) > runaway_char_budget: + runaway_triggered = True + cancelled = True + break + # Phase 2.0.5-F: feed loop / repetition guard. + try: + llama_path_guard.feed(chunk.text) + except RuntimeError as guard_exc: + runaway_triggered = True + runaway_loop_reason = str(guard_exc) + cancelled = True + break + # Phase 2.0.5-C: tok/s floor sampling. Each + # chunk roughly maps to one token from the + # SSE stream; chunk count is a workable proxy. + now = time.perf_counter() + if window_started_at is None: + window_started_at = now + window_tokens = 0 + window_tokens += 1 + if now - window_started_at >= TOKS_FLOOR_WINDOW_S: + rate = window_tokens / max(1e-6, now - window_started_at) + if rate < TOKS_FLOOR_MIN: + stall_triggered = True + cancelled = True + runaway_loop_reason = ( + f"Decode stalled at {rate:.2f} tok/s " + f"for {TOKS_FLOOR_WINDOW_S:.0f}s — " + "likely thermal throttle, GPU stall, " + "or worker deadlock. Aborting." + ) + break + window_started_at = now + window_tokens = 0 + # Phase 2.0.5-G + I: panic + thermal monitors. + # Sampled at PANIC_SAMPLE_INTERVAL_S together to + # keep subprocess / psutil cost bounded. Each + # emits at most once per turn. 
+ if ( + (not panic_emitted or not thermal_warning_emitted) + and ( + last_panic_sample_at is None + or now - last_panic_sample_at >= PANIC_SAMPLE_INTERVAL_S + ) + ): + last_panic_sample_at = now + if not panic_emitted: + try: + from backend_service.helpers.memory_gate import ( + snapshot_memory_signals as _panic_snapshot, + ) + p_avail, p_pressure = _panic_snapshot() + if ( + p_avail < PANIC_AVAILABLE_FLOOR_GB + or p_pressure > PANIC_PRESSURE_CEILING + ): + panic_emitted = True + chaosengine.add_log( + "chat", "warning", + f"[{model_tag}] Panic: avail=" + f"{p_avail:.1f} GB, " + f"pressure={p_pressure:.0f}%.", + ) + yield ( + "data: " + + json.dumps({ + "panic": True, + "availableGb": p_avail, + "pressurePercent": p_pressure, + "message": ( + "System memory critical mid-" + "generation. Consider cancelling " + "this turn or unloading warm " + "models before retrying." + ), + }) + + "\n\n" + ) + except Exception as panic_exc: + chaosengine.add_log( + "chat", "warning", + f"[{model_tag}] Panic sample skipped: {panic_exc}", + ) + if not thermal_warning_emitted: + try: + from backend_service.helpers.thermal import ( + read_thermal_state, + ) + thermal_state = read_thermal_state() + if thermal_state == "critical": + thermal_warning_emitted = True + chaosengine.add_log( + "chat", "warning", + f"[{model_tag}] Thermal warning: critical.", + ) + yield ( + "data: " + + json.dumps({ + "thermalWarning": True, + "state": thermal_state, + "message": ( + "System is thermally throttling. " + "Decode speed will drop until the " + "machine cools. Consider pausing " + "and retrying after a cooldown." + ), + }) + + "\n\n" + ) + except Exception as thermal_exc: + chaosengine.add_log( + "chat", "warning", + f"[{model_tag}] Thermal sample skipped: {thermal_exc}", + ) if chunk.done: final_chunk = chunk except RuntimeError as exc: @@ -2417,8 +3303,29 @@ def _sse_stream(): chaosengine._persist_sessions() chaosengine.active_requests = max(0, chaosengine.active_requests - 1) chaosengine.add_log("chat", "error", f"[{model_tag}] Streaming failed: {exc}") + chaosengine.clear_chat_cancel(session_id_for_cancel) yield f"data: {json.dumps({'error': str(exc)})}\n\n" return + finally: + chaosengine.clear_chat_cancel(session_id_for_cancel) + + if cancelled: + yield f"data: {json.dumps({'cancelled': True})}\n\n" + if runaway_loop_reason is not None: + chaosengine.add_log( + "chat", "warning", + f"[{model_tag}] {runaway_loop_reason} " + f"(after {len(full_text)} chars).", + ) + elif runaway_triggered: + chaosengine.add_log( + "chat", "warning", + f"[{model_tag}] Output runaway guard tripped at " + f"{len(full_text)} chars (budget {runaway_char_budget}); " + "stream aborted to prevent decoder loop.", + ) + else: + chaosengine.add_log("chat", "info", f"[{model_tag}] Generation cancelled by user.") gen_elapsed = round(time.perf_counter() - gen_start, 2) with chaosengine._lock: @@ -2436,6 +3343,7 @@ def _sse_stream(): tok_s=tok_s, response_seconds=gen_elapsed, requested_runtime=requested_runtime, + ttft_seconds=ttft_seconds, ) if agent_tool_calls: metrics["toolCalls"] = agent_tool_calls @@ -2469,6 +3377,8 @@ def _sse_stream(): requests_served=chaosengine.requests_served, ), } + if cancelled: + done_payload["cancelled"] = True yield f"data: {json.dumps(done_payload)}\n\n" return StreamingResponse( @@ -2766,6 +3676,34 @@ def _unload_repo_from_runtimes(self, repo: str, repo_cache_dir: Path) -> None: except Exception: pass + def request_cancel_chat(self, session_id: str) -> dict[str, Any]: + """Mark a chat generation for cancellation. 
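+        Illustrative call (hypothetical id): ``state.request_cancel_chat("session-ab12cd34")``;
+        calling it repeatedly is harmless, it just re-sets the same flag.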
+ + The streaming loop in generate_stream() checks this flag between + events and breaks early, persisting whatever output has accumulated + so far. Returns metadata about whether the session is currently + generating so the UI can decide whether to show a "stop" toast. + """ + with self._lock: + self._chat_cancel[session_id] = True + session = next( + (s for s in self.chat_sessions if s.get("id") == session_id), + None, + ) + return { + "sessionId": session_id, + "cancelled": True, + "wasActive": session is not None, + } + + def is_chat_cancel_requested(self, session_id: str) -> bool: + with self._lock: + return bool(self._chat_cancel.get(session_id, False)) + + def clear_chat_cancel(self, session_id: str) -> None: + with self._lock: + self._chat_cancel.pop(session_id, None) + def cancel_download(self, repo: str) -> dict[str, Any]: from backend_service.helpers.huggingface import _hf_repo_downloaded_bytes @@ -3099,6 +4037,65 @@ def openai_models(self) -> dict[str, Any]: }) return {"object": "list", "data": data} + def openai_embeddings(self, request: OpenAIEmbeddingsRequest) -> dict[str, Any]: + """Phase 2.13: OpenAI-compatible embeddings endpoint. + + Routes through the bundled GGUF embedding model (Phase 2.6). + Returns a 503 when no embedding client is available; returns + the OpenAI-shaped response shape on success so external + scripts can drop us in for OpenAI without code changes. + """ + from backend_service.app import DOCUMENTS_DIR + from backend_service.rag import resolve_embedding_client + from backend_service.rag.embedding_client import EmbeddingClientUnavailable + + client = resolve_embedding_client(DOCUMENTS_DIR.parent) + if client is None: + raise HTTPException( + status_code=503, + detail=( + "No embedding model is configured. Set CHAOSENGINE_EMBEDDING_MODEL " + "or drop a *.gguf into /embeddings/." + ), + ) + + if isinstance(request.input, str): + inputs = [request.input] + else: + inputs = list(request.input) + + if not inputs: + raise HTTPException(status_code=400, detail="`input` must be a non-empty string or list of strings.") + + try: + vectors = client.embed_batch(inputs) + except EmbeddingClientUnavailable as exc: + raise HTTPException(status_code=503, detail=str(exc)) from exc + + # Truncate per OpenAI's `dimensions` parameter when set. We don't + # re-normalise after truncation; the bundled model is already + # L2-normalised end-to-end, so cosine similarity stays well-defined. + if request.dimensions is not None: + vectors = [vec[: request.dimensions] for vec in vectors] + + prompt_tokens = sum(max(1, len(text.split())) for text in inputs) + return { + "object": "list", + "data": [ + { + "object": "embedding", + "embedding": vec, + "index": idx, + } + for idx, vec in enumerate(vectors) + ], + "model": request.model or "chaosengine-embed", + "usage": { + "prompt_tokens": prompt_tokens, + "total_tokens": prompt_tokens, + }, + } + def openai_chat_completion(self, request: OpenAIChatCompletionRequest) -> dict[str, Any] | StreamingResponse: if not request.messages: raise HTTPException(status_code=400, detail="At least one message is required.") @@ -3178,6 +4175,39 @@ def openai_chat_completion(self, request: OpenAIChatCompletionRequest) -> dict[s created = int(time.time()) self.add_log("server", "info", f"[{model_tag}] Running chat completion on conversation with {msg_count} messages.") + # Phase 2.13: build a sampler dict from OpenAI-shaped fields. 
The + # runtime accepts the same llama-server key names so we map field + # → key here once and pass the dict to both stream + non-stream + # paths. None values drop out so they don't override server + # defaults. + oai_samplers: dict[str, Any] = {} + if request.top_p is not None: + oai_samplers["top_p"] = request.top_p + if request.top_k is not None: + oai_samplers["top_k"] = request.top_k + if request.frequency_penalty is not None: + oai_samplers["frequency_penalty"] = request.frequency_penalty + if request.presence_penalty is not None: + oai_samplers["presence_penalty"] = request.presence_penalty + if request.seed is not None: + oai_samplers["seed"] = request.seed + if request.stop is not None: + oai_samplers["stop"] = request.stop if isinstance(request.stop, list) else [request.stop] + + # Phase 2.13: pull a JSON schema out of OpenAI's response_format + # envelope so the constrained-decode path lights up. Anything + # other than `json_schema` → no constraint (json_object would + # require a different code path llama-server already handles + # via response_format= but we don't surface that here). + oai_json_schema: dict[str, Any] | None = None + if isinstance(request.response_format, dict): + rf_type = request.response_format.get("type") + if rf_type == "json_schema": + schema_envelope = request.response_format.get("json_schema") or {} + schema_obj = schema_envelope.get("schema") + if isinstance(schema_obj, dict): + oai_json_schema = schema_obj + if request.stream: chaosengine = self @@ -3198,6 +4228,8 @@ def _stream_chunks(): images=last_user_images or None, tools=request.tools, engine=target_engine, + samplers=oai_samplers or None, + json_schema=oai_json_schema, ): if chunk.text: token_count += 1 @@ -3273,6 +4305,8 @@ def _stream_chunks(): images=last_user_images or None, tools=request.tools, engine=target_engine, + samplers=oai_samplers or None, + json_schema=oai_json_schema, ) except RuntimeError as exc: with self._lock: diff --git a/backend_service/tools/__init__.py b/backend_service/tools/__init__.py index e48ec6c..6d2c667 100644 --- a/backend_service/tools/__init__.py +++ b/backend_service/tools/__init__.py @@ -8,9 +8,29 @@ from __future__ import annotations from abc import ABC, abstractmethod +from dataclasses import dataclass, field from typing import Any +# Phase 2.8: rich tool output payload. +# +# `text` is what the language model sees on the next turn (preserves +# the existing contract — the agent loop feeds tool results back as +# message content). `render_as` + `data` are an optional UI hint the +# frontend's `ToolCallCard` reads to render a table / code block / +# markdown / image / chart instead of dumping raw JSON. Tools that +# don't override `execute_structured` continue to return plain text +# and the UI falls back to the existing collapsible-JSON view. +RenderAsLiteral = str # "table" | "code" | "markdown" | "image" | "chart" | "json" + + +@dataclass +class StructuredToolOutput: + text: str + render_as: RenderAsLiteral = "json" + data: dict[str, Any] | None = None + + class BaseTool(ABC): """Interface every tool must implement.""" @@ -32,6 +52,26 @@ def parameters_schema(self) -> dict[str, Any]: def execute(self, **kwargs: Any) -> str: """Run the tool with the given arguments and return a text result.""" + def execute_structured(self, **kwargs: Any) -> StructuredToolOutput | None: + """Phase 2.8: optional rich-output entry point. 
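+        A minimal override might return, for instance (a sketch that mirrors
+        the calculator tool further down)::
+
+            StructuredToolOutput(
+                text="2 + 2 = 4",
+                render_as="code",
+                data={"code": "2 + 2 = 4", "language": "text"},
+            )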
+ + Tools that want the UI to render a table / code block / markdown + instead of a JSON dump override this to return a + `StructuredToolOutput`. The agent loop calls this first; when + it returns None (the default), the loop falls back to + `execute(...)` and treats the result as plain text. Built-in + tools that haven't been migrated yet keep working unchanged. + """ + return None + + @property + def provenance(self) -> str: + """Phase 2.10: where this tool came from. Built-ins return + ``"builtin"``; MCP-adapted tools override to ``"mcp:"``. + Surfaced via /api/tools so the UI can render a source badge. + """ + return "builtin" + def openai_schema(self) -> dict[str, Any]: """Return the OpenAI function-calling representation of this tool.""" return { @@ -49,10 +89,18 @@ class ToolRegistry: def __init__(self) -> None: self._tools: dict[str, BaseTool] = {} + # Phase 2.10: keep MCP-sourced tools in a parallel set so we + # can refresh them (re-spawn server, swap configs) without + # disturbing the built-in registrations. + self._mcp_tool_names: set[str] = set() def register(self, tool: BaseTool) -> None: self._tools[tool.name] = tool + def unregister(self, name: str) -> None: + self._tools.pop(name, None) + self._mcp_tool_names.discard(name) + def get(self, name: str) -> BaseTool | None: return self._tools.get(name) @@ -81,6 +129,20 @@ def discover(self) -> None: instance = cls() self.register(instance) + def replace_mcp_tools(self, tools: list[BaseTool]) -> None: + """Phase 2.10: swap the registry's MCP-sourced tools. + + Drops every previously-registered MCP tool and registers the + provided list. Built-in tools are untouched. Called whenever + the user updates `mcpServers` in settings or the app starts up. + """ + for stale in list(self._mcp_tool_names): + self._tools.pop(stale, None) + self._mcp_tool_names.clear() + for tool in tools: + self.register(tool) + self._mcp_tool_names.add(tool.name) + # Module-level singleton registry = ToolRegistry() diff --git a/backend_service/tools/calculator.py b/backend_service/tools/calculator.py index b5cec1f..3882b48 100644 --- a/backend_service/tools/calculator.py +++ b/backend_service/tools/calculator.py @@ -108,3 +108,18 @@ def execute(self, **kwargs: Any) -> str: return f"{expression} = {result}" except (ValueError, TypeError, ZeroDivisionError, SyntaxError, OverflowError) as exc: return f"Error evaluating '{expression}': {exc}" + + def execute_structured(self, **kwargs: Any) -> Any: + """Phase 2.8: render the calculation as a one-line code block + so the result reads like ``2 + 2 = 4`` in monospace rather + than getting collapsed into a JSON dump.""" + from backend_service.tools import StructuredToolOutput + + text = self.execute(**kwargs) + if text.startswith("Error"): + return StructuredToolOutput(text=text, render_as="markdown") + return StructuredToolOutput( + text=text, + render_as="code", + data={"code": text, "language": "text"}, + ) diff --git a/backend_service/tools/code_executor.py b/backend_service/tools/code_executor.py index 337ac9c..072d770 100644 --- a/backend_service/tools/code_executor.py +++ b/backend_service/tools/code_executor.py @@ -114,3 +114,24 @@ def execute(self, **kwargs: Any) -> str: except OSError as exc: return f"Error: failed to execute code: {exc}" + + def execute_structured(self, **kwargs: Any) -> Any: + """Phase 2.8: render the executed code + its captured output + in a syntax-highlighted Python block. 
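+        The data payload carries both the captured output (``code``) and the
+        source that produced it (``sourceCode``), e.g.
+        ``{"code": "4", "language": "text", "sourceCode": "print(2 + 2)",
+        "sourceLanguage": "python"}`` with illustrative values.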
Errors fall back to + markdown so the user sees the failure clearly.""" + from backend_service.tools import StructuredToolOutput + + text = self.execute(**kwargs) + if text.startswith("Error"): + return StructuredToolOutput(text=text, render_as="markdown") + code = str(kwargs.get("code", "")).strip() + return StructuredToolOutput( + text=text, + render_as="code", + data={ + "code": text, + "language": "text", + "sourceCode": code, + "sourceLanguage": "python", + }, + ) diff --git a/backend_service/tools/file_reader.py b/backend_service/tools/file_reader.py index 4164bc7..8048ef7 100644 --- a/backend_service/tools/file_reader.py +++ b/backend_service/tools/file_reader.py @@ -125,3 +125,44 @@ def execute(self, **kwargs: Any) -> str: text += f"\n\n... ({len(lines) - max_lines} more lines truncated)" return f"Contents of {file_path}:\n\n{text}" + + def execute_structured(self, **kwargs: Any) -> Any: + """Phase 2.8: render code files as syntax-highlighted blocks + and markdown / text files as rendered markdown. + + The text returned to the model still includes the same + ``"Contents of :"`` framing the legacy `execute` path + produces so the model's downstream reasoning is unchanged. + Errors fall back to a markdown render so messages like + ``Error: file not found: ...`` show with proper styling. + """ + from backend_service.tools import StructuredToolOutput + + text = self.execute(**kwargs) + if text.startswith("Error"): + return StructuredToolOutput(text=text, render_as="markdown") + + raw_path = str(kwargs.get("path", "")).strip() + try: + ext = Path(os.path.expanduser(raw_path)).suffix.lower().lstrip(".") + except OSError: + ext = "" + # Strip the "Contents of :" leader so the rendered code + # block holds only the file body. The leader stays in `text` + # for the model — it carries the citation context. + body = text.split("\n\n", 1)[1] if "\n\n" in text else text + if ext in {"md", "markdown", "rst"}: + return StructuredToolOutput( + text=text, + render_as="markdown", + data={"markdown": body, "path": raw_path}, + ) + return StructuredToolOutput( + text=text, + render_as="code", + data={ + "code": body, + "language": ext or "text", + "path": raw_path, + }, + ) diff --git a/backend_service/tools/web_search.py b/backend_service/tools/web_search.py index 6c59382..b142eb5 100644 --- a/backend_service/tools/web_search.py +++ b/backend_service/tools/web_search.py @@ -8,7 +8,7 @@ import urllib.request from typing import Any -from backend_service.tools import BaseTool +from backend_service.tools import BaseTool, StructuredToolOutput class WebSearchTool(BaseTool): @@ -33,23 +33,58 @@ def parameters_schema(self) -> dict[str, Any]: } def execute(self, **kwargs: Any) -> str: + # Legacy text path — kept for callers / tests that don't go + # through `execute_structured`. The model-facing return is the + # same human-readable summary structured produces below. query = str(kwargs.get("query", "")).strip() if not query: return "Error: no search query provided." - max_results = min(max(int(kwargs.get("max_results", 5)), 1), 10) - try: return self._search_ddg(query, max_results) except Exception as exc: return f"Search failed: {exc}" - def _search_ddg(self, query: str, max_results: int) -> str: - """Use DuckDuckGo HTML search as a lightweight fallback. + def execute_structured(self, **kwargs: Any) -> StructuredToolOutput | None: + """Phase 2.8: surface a `table` of {title, url, snippet} rows. 
- This avoids any external SDK dependency while still providing - real web search results via the DDG instant answer API. + The model still sees the human-readable summary text in + `text` so its next reasoning step has all the data; the UI + renders the rows as a clickable table via ToolCallCard. """ + query = str(kwargs.get("query", "")).strip() + if not query: + return StructuredToolOutput( + text="Error: no search query provided.", + render_as="markdown", + ) + max_results = min(max(int(kwargs.get("max_results", 5)), 1), 10) + try: + results = self._search_results(query, max_results) + except Exception as exc: + return StructuredToolOutput( + text=f"Search failed: {exc}", + render_as="markdown", + ) + if not results: + return StructuredToolOutput( + text=f"No results found for: {query}", + render_as="markdown", + ) + return StructuredToolOutput( + text=_format_results_text(query, results), + render_as="table", + data={ + "columns": ["#", "Title", "URL", "Snippet"], + "rows": [ + [str(i + 1), r["title"], r["url"], r["snippet"]] + for i, r in enumerate(results) + ], + "title": f"Web search results for \"{query}\"", + }, + ) + + def _search_results(self, query: str, max_results: int) -> list[dict[str, str]]: url = "https://api.duckduckgo.com/?" + urllib.parse.urlencode({ "q": query, "format": "json", @@ -60,13 +95,10 @@ def _search_ddg(self, query: str, max_results: int) -> str: req = urllib.request.Request(url, headers={ "User-Agent": "ChaosEngineAI/0.5 (desktop AI tool-use agent)", }) - with urllib.request.urlopen(req, timeout=10) as resp: data = json.loads(resp.read().decode("utf-8")) results: list[dict[str, str]] = [] - - # Abstract (instant answer) abstract = data.get("AbstractText", "").strip() abstract_url = data.get("AbstractURL", "").strip() if abstract: @@ -75,8 +107,6 @@ def _search_ddg(self, query: str, max_results: int) -> str: "url": abstract_url, "snippet": abstract, }) - - # Related topics for topic in data.get("RelatedTopics", []): if len(results) >= max_results: break @@ -89,16 +119,25 @@ def _search_ddg(self, query: str, max_results: int) -> str: "url": first_url, "snippet": text, }) + return results + def _search_ddg(self, query: str, max_results: int) -> str: + results = self._search_results(query, max_results) if not results: return f"No results found for: {query}" - - lines = [f"Web search results for: {query}\n"] - for i, r in enumerate(results, 1): - lines.append(f"{i}. {r['title']}") - if r.get("url"): - lines.append(f" URL: {r['url']}") - lines.append(f" {r['snippet']}") - lines.append("") - - return "\n".join(lines) + return _format_results_text(query, results) + + +def _format_results_text(query: str, results: list[dict[str, str]]) -> str: + """Plain-text summary of the result list — fed to the language + model on the next agent turn. Kept identical across the legacy + `execute` and Phase 2.8 `execute_structured` paths so the model's + reasoning is unchanged regardless of which entry point fired.""" + lines = [f"Web search results for: {query}\n"] + for i, r in enumerate(results, 1): + lines.append(f"{i}. 
{r['title']}") + if r.get("url"): + lines.append(f" URL: {r['url']}") + lines.append(f" {r['snippet']}") + lines.append("") + return "\n".join(lines) diff --git a/backend_service/video_runtime.py b/backend_service/video_runtime.py index f301294..40c9c31 100644 --- a/backend_service/video_runtime.py +++ b/backend_service/video_runtime.py @@ -30,7 +30,7 @@ from pathlib import Path from typing import Any -from backend_service.helpers.gpu import nvidia_gpu_present +from backend_service.helpers.gpu import nvidia_gpu_present, torch_install_warning from backend_service.image_runtime import validate_local_diffusers_snapshot from backend_service.progress import ( GenerationCancelled, @@ -201,6 +201,14 @@ class VideoRuntimeStatus: # via nvidia-smi. ``None`` means we couldn't detect it — the frontend # falls back to its MPS-strict defaults in that case. deviceMemoryGb: float | None = None + # ``torchInstallWarning`` carries a one-line warning when the installed + # torch wheel doesn't match the host accelerator (e.g. +cpu wheel on a + # CUDA host -- generation silently runs on CPU). Computed without + # importing torch (we read dist-info METADATA) so the probe stays free + # of Windows DLL-lock side effects. Frontend renders this as a loud + # warning chip in the Studio so users don't see "Real engine ready" + # next to "Device: cuda (expected)" while their NVIDIA GPU sits idle. + torchInstallWarning: str | None = None def to_dict(self) -> dict[str, Any]: return asdict(self) @@ -223,6 +231,32 @@ def _guess_video_expected_device() -> str | None: return "cpu" +def _windows_cuda_unavailable_message(torch: Any) -> str | None: + if platform.system() != "Windows" or not nvidia_gpu_present(): + return None + cuda_module = getattr(torch, "cuda", None) + if cuda_module is None: + return ( + "CUDA torch is unavailable on this Windows NVIDIA host: torch imports " + "but has no torch.cuda module. Open Settings > Setup and click " + "Install CUDA torch, then Restart Backend." + ) + try: + cuda_available = bool(getattr(cuda_module, "is_available", lambda: False)()) + except Exception as exc: + return ( + "CUDA torch is unavailable on this Windows NVIDIA host: " + f"torch.cuda.is_available failed ({type(exc).__name__}: {exc}). " + "Open Settings > Setup and click Install CUDA torch, then Restart Backend." + ) + if not cuda_available: + return ( + "CUDA torch is unavailable on this Windows NVIDIA host. Open Settings > " + "Setup and click Install CUDA torch, then Restart Backend." + ) + return None + + @dataclass(frozen=True) class VideoGenerationConfig: """Shape consumed by ``DiffusersVideoEngine.generate``.""" @@ -279,9 +313,65 @@ class VideoGenerationConfig: # Phase E1: opt-in template-based prompt enhancement for short prompts # (< 25 words). See ``_enhance_prompt`` for the per-model suffixes. enhancePrompt: bool = True + # FU-018: TAESD / TAEHV preview-decode VAE swap. Preview-only quality + # knob — when True the engine swaps ``pipeline.vae`` for the matching + # tiny VAE (taew2_2 for Wan, taeltx2_3_wide for LTX, taehv1_5 for + # HunyuanVideo, taecogvideox for CogVideoX, taemochi for Mochi) + # before the first denoise. Each step decodes in a fraction of the + # wall-time. Default off — video users typically want full fidelity. + previewVae: bool = False + # Phase 3 / Wan2.2-Distill 4-step: catalog-pinned distilled + # transformers. Wan 2.2 A14B is MoE with two transformer experts + # (``transformer`` = high-noise, ``transformer_2`` = low-noise). 
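The real `torch_install_warning` helper lives in `backend_service.helpers.gpu` and is not shown in this patch; the sketch below only illustrates the dist-info approach the comment above describes (read the wheel's version string via `importlib.metadata` instead of importing torch, then flag a `+cpu` build on an NVIDIA host). The function name and message wording are assumptions.

from importlib.metadata import PackageNotFoundError, version


def sketch_torch_install_warning(nvidia_gpu_present: bool) -> str | None:
    """Illustrative only -- not the shipped helper."""
    try:
        torch_version = version("torch")  # e.g. "2.5.1+cpu" or "2.5.1+cu124"
    except PackageNotFoundError:
        return None  # torch not installed: nothing to warn about yet
    if nvidia_gpu_present and "+cpu" in torch_version:
        return (
            f"torch {torch_version} is a CPU-only wheel but an NVIDIA GPU is present; "
            "generation will silently run on CPU. Install the CUDA wheel instead."
        )
    return None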
+ # lightx2v's 4-step distillation publishes both experts as standalone + # safetensors files; the runtime swaps both onto the pipeline at + # build time so subsequent ``pipeline(...)`` calls run the distilled + # 4-step schedule. Mutually exclusive with LoRA loading — when the + # distill files are pinned, the LoRA path is skipped. + distillTransformerRepo: str | None = None + distillTransformerHighNoiseFile: str | None = None + distillTransformerLowNoiseFile: str | None = None + # ``"bf16"`` | ``"fp8_e4m3"`` | ``"int8"`` — dictates the torch dtype + # used at load. FP8/INT8 distill weights ship pre-quantized and need + # the corresponding torch dtype + a CUDA backend that exposes the + # native kernel. On platforms without FP8/INT8 ops the runtime falls + # back to bf16 dequant. + distillTransformerPrecision: str | None = None # Phase E2: CFG decay schedule. Linear ramp from initial guidance_scale # at step 0 to 1.0 at the last step. Default-on for flow-match pipelines. cfgDecay: bool = True + # Spatial-Temporal Guidance scale, consumed only by the mlx-video LTX-2 + # path. 1.0 keeps the upstream-recommended perturbed forward pass per + # step; 0.0 disables it and saves ~33 % wall time at a mild quality + # cost. Other runtimes ignore the value. + stgScale: float = 1.0 + # FU-023 Nunchaku / SVDQuant: pinned by catalog variants that ship + # CUDA INT4 SVDQuant snapshots. CUDA only — falls back when the + # nunchaku package isn't installed or device != cuda. The video-side + # path stays parked until upstream Nunchaku ships Wan / HunyuanVideo + # / LTX wrappers (FLUX + Qwen-Image only as of v1.2.1) — wiring is + # in place so adding a video variant becomes a catalog-row change. + nunchakuRepo: str | None = None + nunchakuFile: str | None = None + # FU-024 FP8 layerwise casting on CUDA SM 8.9+ (Ada/Hopper/Blackwell). + # Halves transformer VRAM by storing fp8 weights + computing in bf16 + # inside the matmul. E5M2 for HunyuanVideo, E4M3 for Wan / LTX / FLUX + # / Qwen-Image. Default off; opt-in. + fp8LayerwiseCasting: bool = False + # FU-019 distill LoRAs: when the catalog variant pins a LoRA + # (lightx2v Wan2.1 CausVid, Wan2.2-Distill-Models, FastWan), the + # engine fuses it into the pipeline transformer at load time so + # subsequent ``pipeline(...)`` calls run with the LoRA baked in. + # 4-step Wan via lightx2v cuts wall-time 7-8× vs the 30-step base. + loraRepo: str | None = None + loraFile: str | None = None + loraScale: float | None = None + # Variant-declared step / CFG defaults. Used by app.py's + # ``_generate_video_artifact`` to substitute the schema defaults + # (50 steps, CFG 3.0) when the user hasn't moved the sliders — + # distill LoRAs run at 4 steps CFG 1.0. + defaultSteps: int | None = None + cfgOverride: float | None = None @dataclass(frozen=True) @@ -322,9 +412,12 @@ class GeneratedVideo: # Community-maintained diffusers port of tencent/HunyuanVideo. "hunyuanvideo-community/HunyuanVideo": {"class_name": "HunyuanVideoPipeline", "task": "txt2video"}, # CogVideoX 2B and 5B share the same diffusers pipeline class — the - # transformer scales but the loader is the same. + # transformer scales but the loader is the same. CogVideoX 1.5 5B + # (catalog refresh, FU-019 round) uses the same class with refreshed + # weights and a higher training resolution. 
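A minimal sketch of the `defaultSteps` / `cfgOverride` substitution described above. The real logic lives in app.py's `_generate_video_artifact`; the helper and constant names below are illustrative — only the schema defaults (50 steps, CFG 3.0) and the distill values (4 steps, CFG 1.0) come from the comments in this patch.

SCHEMA_DEFAULT_STEPS = 50
SCHEMA_DEFAULT_GUIDANCE = 3.0


def effective_steps_and_cfg(
    requested_steps: int,
    requested_guidance: float,
    variant_default_steps: int | None,
    variant_cfg_override: float | None,
) -> tuple[int, float]:
    """Swap in the variant's defaults only when the user never moved the sliders."""
    steps = requested_steps
    guidance = requested_guidance
    if variant_default_steps is not None and requested_steps == SCHEMA_DEFAULT_STEPS:
        steps = variant_default_steps          # e.g. 4 for a lightx2v distill variant
    if variant_cfg_override is not None and requested_guidance == SCHEMA_DEFAULT_GUIDANCE:
        guidance = variant_cfg_override        # e.g. 1.0 for the distilled schedule
    return steps, guidance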
"THUDM/CogVideoX-2b": {"class_name": "CogVideoXPipeline", "task": "txt2video"}, "THUDM/CogVideoX-5b": {"class_name": "CogVideoXPipeline", "task": "txt2video"}, + "THUDM/CogVideoX-1.5-5b": {"class_name": "CogVideoXPipeline", "task": "txt2video"}, } @@ -393,6 +486,9 @@ def _bnb_nf4_transformer_class_for_repo(repo: str) -> str | None: "genmo/mochi-1-preview": {"steps": 64, "guidance": 4.5, "scheduler": None}, "THUDM/CogVideoX-2b": {"steps": 50, "guidance": 6.0, "scheduler": None}, "THUDM/CogVideoX-5b": {"steps": 50, "guidance": 7.0, "scheduler": None}, + # CogVideoX 1.5 5B inherits the 5B defaults — refreshed weights but + # the same step / CFG sweet spot per upstream model card. + "THUDM/CogVideoX-1.5-5b": {"steps": 50, "guidance": 7.0, "scheduler": None}, } # Schema-level defaults — must mirror ``VideoGenerationRequest`` in @@ -805,6 +901,10 @@ def __init__(self) -> None: self._loaded_path: str | None = None self._loaded_variant_key: str | None = None self._device: str | None = None + # FU-019 / FU-016: notes accumulated during pipeline load (LoRA + # fuse, attention backend). Reset on each load; surfaced via + # GeneratedVideo.runtimeNote. + self._load_notes: list[str] = [] # ---------- public API ---------- @@ -840,6 +940,7 @@ def probe(self) -> VideoRuntimeStatus: missingDependencies=missing_all, pythonExecutable=_resolve_video_python(), expectedDevice=_guess_video_expected_device(), + torchInstallWarning=torch_install_warning(), message=( f"Video runtime needs these packages: {', '.join(missing_core)}. " "Click the 'Install GPU runtime' button above to install the full bundle." @@ -893,6 +994,15 @@ def probe(self) -> VideoRuntimeStatus: message=message, loadedModelRepo=self._loaded_repo, deviceMemoryGb=device_memory_gb, + # The earlier replace_all that wired this missed the + # success-path return because the indentation differs from + # the placeholder branch above. Without it, the Studio + # warning chip + banner only fired on the rare path where + # core deps were also missing -- if torch was importable but + # +cpu (the actual user case), realGenerationAvailable=True + # and the field was never set, so the UI silently dropped + # the warning while every other badge read green. + torchInstallWarning=torch_install_warning(), ) def preload(self, repo: str) -> VideoRuntimeStatus: @@ -946,6 +1056,14 @@ def generate(self, config: VideoGenerationConfig) -> GeneratedVideo: gguf_repo=config.ggufRepo, gguf_file=config.ggufFile, use_nf4=config.useNf4, + lora_repo=config.loraRepo, + lora_file=config.loraFile, + lora_scale=config.loraScale, + preview_vae=config.previewVae, + distill_repo=config.distillTransformerRepo, + distill_high_file=config.distillTransformerHighNoiseFile, + distill_low_file=config.distillTransformerLowNoiseFile, + distill_precision=config.distillTransformerPrecision, ) # Early-cancel check after model load — from_pretrained is a # blocking C-extension call we can't interrupt. If the user hit @@ -1039,6 +1157,13 @@ def generate(self, config: VideoGenerationConfig) -> GeneratedVideo: ) VIDEO_PROGRESS.set_phase(PHASE_SAVING, message="Saving to gallery") + # FU-019 / FU-016: surface per-pipeline load notes (LoRA + # fuse, attention backend) on every generated mp4 so the + # user sees what was applied. Joined with " · " for a + # single-line UI presentation. 
+ runtime_note = ( + " · ".join(self._load_notes) if self._load_notes else None + ) return GeneratedVideo( seed=base_seed, bytes=mp4_bytes, @@ -1050,6 +1175,9 @@ def generate(self, config: VideoGenerationConfig) -> GeneratedVideo: width=config.width, height=config.height, runtimeLabel=f"{self.runtime_label} ({self._device or 'cpu'})", + runtimeNote=runtime_note, + effectiveSteps=int(config.steps), + effectiveGuidance=float(config.guidance), ) finally: VIDEO_PROGRESS.finish() @@ -1229,6 +1357,11 @@ def _build_pipeline_kwargs( # underlying call. Lets the engine plumb decay through one # callback factory rather than threading state through self. kwargs["__cfg_decay"] = bool(config.cfgDecay) + # FU-018 part 2: same private-kwarg plumbing for the live + # denoise thumbnail emit. When on, the step callback decodes + # the current latent's middle frame via the TAEHV/TAEW preview + # VAE that ``_ensure_pipeline`` swapped onto ``pipeline.vae``. + kwargs["__preview_vae"] = bool(config.previewVae) return kwargs def _make_step_callback( @@ -1236,10 +1369,11 @@ def _make_step_callback( total_steps: int, initial_guidance: float, cfg_decay: bool, + preview_vae: bool = False, ) -> Any: """Build the per-step callback the pipeline calls during sampling. - Wires three concerns into one callback: + Wires four concerns into one callback: 1. Progress reporting via ``VIDEO_PROGRESS.set_step``. 2. Cooperative cancel — raise ``GenerationCancelled`` when the user hits Cancel on the modal. @@ -1249,6 +1383,10 @@ def _make_step_callback( to oversaturate when CFG is held high through the whole schedule; decaying lets the early steps lock semantics (high CFG) while late steps preserve fine detail (low CFG). + 4. FU-018 part 2 — when ``preview_vae`` is on, every Nth step + decode the current latent's middle frame via the swapped + TAEHV/TAEW preview VAE and publish a base64 PNG to + ``VIDEO_PROGRESS.set_thumbnail`` for the modal to render. """ # Floor MUST stay strictly above 1.0 so the pipeline's # ``do_classifier_free_guidance`` property (``_guidance_scale > 1.0``) @@ -1260,6 +1398,11 @@ def _make_step_callback( # dimension errors on LTX). decay_floor = 1.5 decay_active = cfg_decay and total_steps > 1 and initial_guidance > decay_floor + thumb_active = bool(preview_vae) + # Stride keeps the polled endpoint payload small. Video + # latent decode is more expensive than image (5D tensor), so + # we cap thumbnails at ~6 per gen. + thumb_stride = max(1, total_steps // 6) if thumb_active else 1 def _on_step_end(_pipeline: Any, step: int, _timestep: Any, callback_kwargs: dict[str, Any]): VIDEO_PROGRESS.set_step(step + 1, total=max(1, total_steps)) @@ -1280,6 +1423,21 @@ def _on_step_end(_pipeline: Any, step: int, _timestep: Any, callback_kwargs: dic _pipeline.guidance_scale = float(next_scale) except Exception: pass + if thumb_active: + is_final = (step + 1) >= total_steps + if is_final or (step % thumb_stride == 0): + latents = callback_kwargs.get("latents") if callback_kwargs else None + try: + from backend_service.helpers.preview_thumbnails import ( + decode_video_latent_to_b64, + ) + b64 = decode_video_latent_to_b64(_pipeline, latents) + if b64 is not None: + VIDEO_PROGRESS.set_thumbnail(b64) + except Exception: + # Best-effort — never fail the gen on a preview + # decode error. + pass return callback_kwargs return _on_step_end @@ -1303,7 +1461,13 @@ def _invoke_pipeline(self, pipeline: Any, kwargs: dict[str, Any]) -> list[Any]: # caller pops before passing to the pipeline. 
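A quick worked check of the thumbnail stride arithmetic above for a 24-step generation: `thumb_stride = max(1, 24 // 6) = 4`, so steps 0, 4, 8, 12, 16 and 20 plus the guaranteed final step publish a preview — in line with the ~6-per-generation budget.

total_steps = 24
thumb_stride = max(1, total_steps // 6)  # -> 4
emitting_steps = [
    step for step in range(total_steps)
    if (step + 1) >= total_steps or step % thumb_stride == 0
]
assert emitting_steps == [0, 4, 8, 12, 16, 20, 23]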
Default-on when # absent so existing call sites pick up the schedule. cfg_decay = bool(kwargs.pop("__cfg_decay", True)) - callback = self._make_step_callback(total_steps, initial_guidance, cfg_decay) + # FU-018 part 2: previewVae flag plumbs through the same + # private-kwarg pattern. When on, ``_make_step_callback`` emits + # a per-step base64 thumbnail decoded via the TAESD/TAEHV swap. + preview_vae = bool(kwargs.pop("__preview_vae", False)) + callback = self._make_step_callback( + total_steps, initial_guidance, cfg_decay, preview_vae=preview_vae, + ) kwargs.setdefault("callback_on_step_end", callback) try: @@ -1371,6 +1535,12 @@ def _invoke_pipeline_with_ltx_refiner( ) base_kwargs = dict(kwargs) + # Strip private kwargs the diffusers pipeline doesn't accept — + # ``_invoke_pipeline`` pops these before its own pipeline call, + # but the refiner path bypasses that and would otherwise leak + # ``__cfg_decay`` / ``__preview_vae`` into ``LTXPipeline.__call__``. + base_kwargs.pop("__cfg_decay", None) + base_kwargs.pop("__preview_vae", None) base_kwargs["output_type"] = "latent" base_result = pipeline(**base_kwargs) latents = getattr(base_result, "frames", None) @@ -1475,14 +1645,37 @@ def _ensure_pipeline( gguf_repo: str | None = None, gguf_file: str | None = None, use_nf4: bool = False, + lora_repo: str | None = None, + lora_file: str | None = None, + lora_scale: float | None = None, + preview_vae: bool = False, + distill_repo: str | None = None, + distill_high_file: str | None = None, + distill_low_file: str | None = None, + distill_precision: str | None = None, ) -> Any: with self._lock: - variant_suffix = "" + # Variant key folds in LoRA identity — switching LoRAs on the + # same base repo must rebuild the pipeline because fuse_lora + # mutates the transformer weights in place. ``preview_vae`` + # joins the same key set so toggling the FU-018 preview-decode + # knob triggers a clean rebuild. Distilled transformers replace + # both expert modules outright, so they also key on the variant. + variant_parts = [repo] if gguf_file: - variant_suffix = f"::{gguf_file}" + variant_parts.append(f"gguf={gguf_file}") elif use_nf4: - variant_suffix = "::nf4" - variant_key = f"{repo}{variant_suffix}" if variant_suffix else repo + variant_parts.append("nf4") + if lora_repo and lora_file: + variant_parts.append(f"lora={lora_repo}/{lora_file}@{lora_scale or 1.0}") + if preview_vae: + variant_parts.append("preview_vae") + if distill_repo and distill_high_file and distill_low_file: + variant_parts.append( + f"distill={distill_repo}/{distill_precision or 'bf16'}/" + f"{distill_high_file}/{distill_low_file}" + ) + variant_key = "::".join(variant_parts) if self._pipeline is not None and self._loaded_variant_key == variant_key: return self._pipeline @@ -1559,6 +1752,88 @@ def _ensure_pipeline( if hasattr(pipeline, "set_progress_bar_config"): pipeline.set_progress_bar_config(disable=True) + # FU-019: clear stale load notes from the previous pipeline + # and apply distill LoRAs (lightx2v Wan CausVid / + # Wan2.2-Distill-Models / FastWan) before placement so + # ``pipeline.to(device)`` moves the fused transformer weights + # in one pass. Failure is non-fatal — the user gets a note + # explaining why the LoRA didn't apply. + self._load_notes = [] + + # FU-016: SageAttention CUDA backend. No-op on MPS / CPU. + # Must run before LoRA fuse so the LoRA's adapter modules + # don't trip the backend swap (set_attention_backend + # mutates the attention class on existing modules). 
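A worked example of the variant-key folding above (the repo, LoRA repo/file and scale values are illustrative): two requests that differ only in a LoRA pin or the preview-VAE toggle fold into different keys, so `_ensure_pipeline` rebuilds rather than reusing the in-place-mutated pipeline.

repo = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"  # illustrative catalog repo
key_plain = "::".join([repo])
key_lora = "::".join([repo, "lora=lightx2v/wan_causvid_lora.safetensors@0.8"])
key_preview = "::".join([repo, "preview_vae"])
assert len({key_plain, key_lora, key_preview}) == 3  # three distinct pipeline builds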
+ try: + from backend_service.helpers.attention_backend import ( + maybe_apply_sage_attention, + ) + sage_note = maybe_apply_sage_attention(pipeline) + if sage_note: + self._load_notes.append(sage_note) + except Exception: + pass + + # FU-018: TAESD / TAEHV preview-decode VAE swap. No-op when + # toggle is off or no preview VAE is mapped for this repo. + # Runs before LoRA fuse so the swap settles before any + # transformer-side adapters touch the pipeline. + try: + from backend_service.helpers.preview_vae import ( + maybe_apply_preview_vae, + ) + preview_note = maybe_apply_preview_vae( + pipeline, repo=repo, enabled=preview_vae + ) + if preview_note: + self._load_notes.append(preview_note) + except Exception: + pass + + # Phase 3 / Wan2.2-Distill 4-step: replace transformer + + # transformer_2 with the lightx2v distilled experts. Skips + # LoRA below — distill weights already encode the 4-step + # schedule and are not LoRA-shaped. Failure is non-fatal: + # the stock Wan transformers stay in place and the user + # gets a runtimeNote explaining why. + distill_active = bool( + distill_repo and distill_high_file and distill_low_file + ) + if distill_active: + distill_note = self._swap_distill_transformers( + pipeline, + repo=distill_repo, + high_file=distill_high_file, + low_file=distill_low_file, + precision=distill_precision or "bf16", + torch=torch, + ) + self._load_notes.append(distill_note) + + if lora_repo and lora_file and not distill_active: + try: + pipeline.load_lora_weights( + lora_repo, + weight_name=lora_file, + local_files_only=True, + ) + effective_scale = ( + float(lora_scale) if lora_scale is not None else 1.0 + ) + pipeline.fuse_lora(lora_scale=effective_scale) + try: + pipeline.unload_lora_weights() + except Exception: + pass + self._load_notes.append( + f"LoRA: {lora_repo}/{lora_file} @ scale {effective_scale:.3f}" + ) + except Exception as exc: # noqa: BLE001 — non-fatal + self._load_notes.append( + f"LoRA load failed ({type(exc).__name__}: {exc}). " + "Pipeline continuing without LoRA." + ) + # Memory-saving knobs. Slicing + tiling are quality-lossy and # Reference workflows don't enable them by default — only flip them on # when there's real pressure. See ``_should_apply_memory_savers`` @@ -1682,12 +1957,26 @@ def _try_load_gguf_transformer( filename=gguf_file, local_files_only=True, ) + # ``from_single_file`` defaults the architecture config to the + # transformer class's largest known variant. For Wan that is the + # 14 B / A14B layout (cross-attn dim 5120). The TI2V 5B uses + # cross-attn dim 3072, so loading its GGUF without an explicit + # config raises: + # blocks.0.attn2.to_k.bias expected torch.Size([5120]), + # but got torch.Size([3072]) + # Pointing at the base diffusers repo's transformer subfolder + # makes diffusers build the model from the matching + # ``transformer/config.json`` before mapping in GGUF tensors, + # which fixes Wan 2.2 5B and stays correct for every other + # variant (the config dim happens to match the GGUF anyway). transformer = transformer_cls.from_single_file( gguf_local_path, quantization_config=GGUFQuantizationConfig( compute_dtype=torch.bfloat16, ), torch_dtype=torch.bfloat16, + config=repo, + subfolder="transformer", ) return transformer, f"Transformer loaded from GGUF ({gguf_file})" except Exception as exc: # noqa: BLE001 — any failure → fall back @@ -1771,6 +2060,100 @@ def _try_load_bnb_nf4_transformer( "falling back to the standard transformer." 
) + def _swap_distill_transformers( + self, + pipeline: Any, + *, + repo: str, + high_file: str, + low_file: str, + precision: str, + torch: Any, + ) -> str: + """Swap ``pipeline.transformer`` + ``pipeline.transformer_2`` for + the lightx2v 4-step distilled experts (Wan 2.2 A14B I2V). + + Wan 2.2 A14B is MoE: ``transformer`` is the high-noise expert and + ``transformer_2`` is the low-noise expert. Distillation publishes + both as standalone safetensors files; the swap is the load-bearing + substitution that takes the pipeline from 30-step base to 4-step + distilled. Returns a runtimeNote describing what happened. Failure + is non-fatal — the stock transformers stay in place and the user + sees the failure in the note. + """ + try: + from huggingface_hub import hf_hub_download + except ImportError as exc: + return ( + f"Distill swap skipped: huggingface_hub unavailable ({exc}). " + "Pipeline continuing with stock Wan transformers." + ) + + try: + from diffusers import WanTransformer3DModel + except ImportError as exc: + return ( + f"Distill swap skipped: WanTransformer3DModel unavailable " + f"({exc}). Pipeline continuing with stock Wan transformers." + ) + + # FP8/INT8 distill weights ship pre-quantized; they need a torch + # backend that exposes the matching kernels (CUDA SM 8.9+ for FP8, + # CUDA / Metal for INT8). On platforms without those kernels we + # load as bf16 and let diffusers do the dequant — quality holds + # but the memory savings disappear. ``bf16`` (no quantization) + # always loads at native precision. + torch_dtype = torch.bfloat16 + if precision == "fp8_e4m3": + torch_dtype = getattr(torch, "float8_e4m3fn", torch.bfloat16) + + try: + high_local = hf_hub_download( + repo_id=repo, filename=high_file, local_files_only=False + ) + low_local = hf_hub_download( + repo_id=repo, filename=low_file, local_files_only=False + ) + except Exception as exc: # noqa: BLE001 — non-fatal + return ( + f"Distill download failed ({type(exc).__name__}: {exc}). " + "Pipeline continuing with stock Wan transformers." + ) + + try: + high_transformer = WanTransformer3DModel.from_single_file( + high_local, torch_dtype=torch_dtype + ) + low_transformer = WanTransformer3DModel.from_single_file( + low_local, torch_dtype=torch_dtype + ) + except Exception as exc: # noqa: BLE001 — non-fatal + return ( + f"Distill load failed ({type(exc).__name__}: {exc}). " + "Pipeline continuing with stock Wan transformers." + ) + + if not hasattr(pipeline, "transformer"): + return ( + "Distill swap skipped: pipeline has no .transformer attribute. " + "This Wan distill path requires a WanPipeline-shaped object." + ) + + pipeline.transformer = high_transformer + if hasattr(pipeline, "transformer_2"): + pipeline.transformer_2 = low_transformer + else: + return ( + f"Distill: high-noise expert applied, but pipeline lacks " + f"transformer_2 (low-noise expert). Verify base repo {repo} " + "is the A14B MoE pipeline. Quality may be degraded." + ) + + return ( + f"Distill: swapped transformer + transformer_2 from {repo} " + f"(precision={precision}, 4-step schedule)." 
+ ) + def _release_pipeline(self) -> None: pipeline = self._pipeline torch = self._torch @@ -1798,8 +2181,16 @@ def _release_pipeline(self) -> None: pass def _detect_device(self, torch: Any) -> str: - if getattr(torch.cuda, "is_available", lambda: False)(): - return "cuda" + cuda_module = getattr(torch, "cuda", None) + if cuda_module is not None: + try: + if getattr(cuda_module, "is_available", lambda: False)(): + return "cuda" + except Exception: + pass + cuda_error = _windows_cuda_unavailable_message(torch) + if cuda_error: + raise RuntimeError(cuda_error) mps_backend = getattr(getattr(torch, "backends", None), "mps", None) if mps_backend is not None and getattr(mps_backend, "is_available", lambda: False)(): return "mps" diff --git a/build.ps1 b/build.ps1 index 2023f00..3f2616a 100644 --- a/build.ps1 +++ b/build.ps1 @@ -30,8 +30,17 @@ if (-not (Test-Path .venv)) { Assert-LastExit "python -m venv" } +# Use `python -m pip` rather than the bare `pip.exe` shim. On Windows, +# pip.exe refuses to upgrade itself ("To modify pip, please run the +# following command: -m pip install --upgrade pip") because +# it can't overwrite its own running .exe. Invoking pip as a python +# module lets python hold the file handle and replace pip cleanly. +# Same trick keeps subsequent pip calls consistent across pip +# versions. +$VenvPython = ".\.venv\Scripts\python.exe" + Write-Host "==> Installing Python dependencies..." -.\.venv\Scripts\pip install --upgrade pip -q +& $VenvPython -m pip install --upgrade pip -q Assert-LastExit "pip install --upgrade pip" # vendor/ChaosEngine declares `license = "Apache-2.0"` per PEP 639. Setuptools @@ -45,7 +54,7 @@ Assert-LastExit "pip install --upgrade pip" # dependency-warning heuristic surfaces that as a loud yellow warning on # every invocation after setuptools 82 is installed. 77..81 covers PEP 639 # while staying inside torch's supported range. -.\.venv\Scripts\pip install --upgrade "setuptools>=77,<82" wheel -q +& $VenvPython -m pip install --upgrade "setuptools>=77,<82" wheel -q Assert-LastExit "pip install --upgrade setuptools wheel" # Chat-only bundle: no torch, no diffusers, no CUDA DLLs. The installer @@ -57,12 +66,12 @@ Assert-LastExit "pip install --upgrade setuptools wheel" # # To include the GPU stack in the installer anyway (e.g. for air-gapped # deployments that can't download at runtime), set CHAOSENGINE_BUNDLE_GPU=1. -.\.venv\Scripts\pip install -q -e ".[desktop]" +& $VenvPython -m pip install -q -e ".[desktop]" Assert-LastExit "pip install -e .[desktop]" if ($env:CHAOSENGINE_BUNDLE_GPU -eq "1") { Write-Host "==> CHAOSENGINE_BUNDLE_GPU=1 -- also bundling [images] extras" - .\.venv\Scripts\pip install -q -e ".[desktop,images]" + & $VenvPython -m pip install -q -e ".[desktop,images]" Assert-LastExit "pip install -e .[desktop,images]" } diff --git a/cache_compression/__init__.py b/cache_compression/__init__.py index 1bcfa2c..2fc5355 100644 --- a/cache_compression/__init__.py +++ b/cache_compression/__init__.py @@ -266,6 +266,71 @@ def discover(self) -> list[CacheStrategy]: "supports_fp16_layers": False, "required_llama_binary": "standard", }, + { + # FU-015: First Block Cache via diffusers 0.36+ generic + # ``apply_first_block_cache`` hook. Same diffusion-cache + # contract as TeaCache (image+video only, threshold-based) + # but model-agnostic — covers Wan2.1/2.2 without a vendored + # forward, which closes FU-007. Same metadata shape as + # TeaCache; llama.cpp hook is N/A. 
+ "id": "fbcache", + "name": "First Block Cache", + "module": "cache_compression.firstblockcache", + "class_name": "FirstBlockCacheStrategy", + "bit_range": None, + "default_bits": None, + "supports_fp16_layers": False, + "required_llama_binary": "standard", + }, + { + # Post-FU-026: TaylorSeer / MagCache / PAB / FasterCache + # all ship in diffusers 0.38 core via + # ``pipeline.transformer.enable_cache()``. Same + # diffusion-cache contract as TeaCache / FBCache — image + # + video DiTs only, threshold-shaped slider repurposed as + # the per-strategy primary knob (cache_interval for + # TaylorSeer, skip_range for PAB / FasterCache). UNet + # pipelines (SD1.5/SDXL) raise NotImplementedError into + # a runtimeNote. + "id": "taylorseer", + "name": "TaylorSeer Cache", + "module": "cache_compression.taylorseer", + "class_name": "TaylorSeerCacheStrategy", + "bit_range": None, + "default_bits": None, + "supports_fp16_layers": False, + "required_llama_binary": "standard", + }, + { + "id": "magcache", + "name": "MagCache", + "module": "cache_compression.magcache", + "class_name": "MagCacheStrategy", + "bit_range": None, + "default_bits": None, + "supports_fp16_layers": False, + "required_llama_binary": "standard", + }, + { + "id": "pab", + "name": "Pyramid Attention Broadcast", + "module": "cache_compression.pab", + "class_name": "PyramidAttentionBroadcastStrategy", + "bit_range": None, + "default_bits": None, + "supports_fp16_layers": False, + "required_llama_binary": "standard", + }, + { + "id": "fastercache", + "name": "FasterCache", + "module": "cache_compression.fastercache", + "class_name": "FasterCacheStrategy", + "bit_range": None, + "default_bits": None, + "supports_fp16_layers": False, + "required_llama_binary": "standard", + }, ] for spec in strategy_specs: diff --git a/cache_compression/fastercache.py b/cache_compression/fastercache.py new file mode 100644 index 0000000..ddf1d17 --- /dev/null +++ b/cache_compression/fastercache.py @@ -0,0 +1,120 @@ +"""FasterCache — diffusers 0.38+ core cache hook. + +Post-FU-026. Caches and reuses attention features similar to PAB, plus +optionally skips the unconditional CFG branch when residuals between +successive timesteps are highly correlated. Best on video DiTs running +classifier-free guidance. + +Reuses the shared ``apply_diffusion_cache_strategy`` dispatcher's +``rel_l1_thresh`` field as the *spatial_attention_block_skip_range* knob +(rounded to int, clamped >= 2). Default 2. +""" + +from __future__ import annotations + +import importlib.util +from typing import Any + +from . 
import CacheStrategy + + +_DEFAULT_SKIP_RANGE = 2 +_DEFAULT_TIMESTEP_RANGE = (-1, 681) +_DEFAULT_UNCOND_SKIP_RANGE = 5 +_DEFAULT_UNCOND_TIMESTEP_RANGE = (-1, 781) +_DEFAULT_ATTENTION_WEIGHT = 0.3 + + +def _import_config(): + try: + from diffusers import FasterCacheConfig + return FasterCacheConfig + except ImportError: + from diffusers.hooks import FasterCacheConfig + return FasterCacheConfig + + +class FasterCacheStrategy(CacheStrategy): + """Attention + uncond-branch cache backed by diffusers 0.38 FasterCache hook.""" + + @property + def strategy_id(self) -> str: + return "fastercache" + + @property + def name(self) -> str: + return "FasterCache" + + def is_available(self) -> bool: + if importlib.util.find_spec("diffusers") is None: + return False + try: + _import_config() + except Exception: + return False + return True + + def availability_badge(self) -> str: + return "Ready" if self.is_available() else "Upgrade" + + def availability_reason(self) -> str | None: + if self.is_available(): + return None + return ( + "FasterCache needs diffusers >= 0.38. " + "Run the GPU runtime installer to upgrade diffusers." + ) + + def applies_to(self) -> frozenset[str]: + return frozenset({"image", "video"}) + + def recommended_thresholds(self) -> dict[str, float]: + return {"image": 2.0, "video": 2.0} + + def apply_diffusers_hook( + self, + pipeline: Any, + *, + num_inference_steps: int, + rel_l1_thresh: float | None, + ) -> None: + try: + FasterCacheConfig = _import_config() + except ImportError as exc: + raise NotImplementedError( + f"diffusers FasterCache hook unavailable: {exc}" + ) from exc + + transformer = getattr(pipeline, "transformer", None) + if transformer is None: + raise NotImplementedError( + "FasterCache requires a DiT pipeline (with .transformer); " + "this pipeline appears to be UNet-based." + ) + if not hasattr(transformer, "enable_cache"): + raise NotImplementedError( + "transformer.enable_cache is not available on this pipeline. " + "Diffusers >= 0.38 is required for the FasterCache registry path." + ) + + if rel_l1_thresh is not None and rel_l1_thresh >= 2: + skip_range = int(round(rel_l1_thresh)) + else: + skip_range = _DEFAULT_SKIP_RANGE + + del num_inference_steps # FasterCache derives schedule from timesteps. + + try: + config = FasterCacheConfig( + spatial_attention_block_skip_range=skip_range, + spatial_attention_timestep_skip_range=_DEFAULT_TIMESTEP_RANGE, + current_timestep_callback=lambda: getattr(pipeline, "current_timestep", 0), + attention_weight_callback=lambda _: _DEFAULT_ATTENTION_WEIGHT, + unconditional_batch_skip_range=_DEFAULT_UNCOND_SKIP_RANGE, + unconditional_batch_timestep_skip_range=_DEFAULT_UNCOND_TIMESTEP_RANGE, + tensor_format="BFCHW", + ) + except TypeError: + config = FasterCacheConfig() + + transformer.enable_cache(config) diff --git a/cache_compression/firstblockcache.py b/cache_compression/firstblockcache.py new file mode 100644 index 0000000..1ce2463 --- /dev/null +++ b/cache_compression/firstblockcache.py @@ -0,0 +1,129 @@ +"""First Block Cache (FBCache) — diffusers 0.36+ generic DiT cache hook. + +FU-015. Replaces the per-model vendored TeaCache forwards with a single +model-agnostic hook that diffusers ships in ``diffusers.hooks``. Closes +FU-007 (Wan TeaCache) — the Wan signature mismatch that motivated the +deferral disappears here because FBCache attaches to ``pipeline.transformer`` +without needing a custom forward. + +The hook compares each step's first-block residual against the previous +step's. 
When the L1-relative delta is below the threshold, all subsequent +blocks reuse cached residuals, skipping a full forward through the rest +of the DiT. Threshold 0.12 is the diffusers-blog recommendation for +FLUX.1-dev (≈1.8× speedup, no visible quality loss). + +Applies to image + video DiTs (FLUX, SD3.5, Wan2.1/2.2, HunyuanVideo, +LTX-Video, CogVideoX, Mochi). Does NOT apply to UNet pipelines +(SD1.5/SDXL); ``applies_to`` would still report ``{"image","video"}`` so +the strategy is *visible* to those Studios, but the runtime hook will +raise ``NotImplementedError`` for non-DiT pipelines and the engine +swallows that into a "not applied" runtimeNote. +""" + +from __future__ import annotations + +import importlib.util +from typing import Any + +from . import CacheStrategy + + +# Default threshold matching diffusers blog post on FBCache for FLUX: +# 0.12 yields ~1.8× speedup with imperceptible quality drift on a wide +# prompt set. Lower (0.08) is safer for video DiTs where temporal +# consistency is more sensitive; higher (0.20) is more aggressive. +_DEFAULT_THRESHOLD = 0.12 + + +class FirstBlockCacheStrategy(CacheStrategy): + """Generic block-cache strategy backed by ``diffusers.hooks.apply_first_block_cache``.""" + + @property + def strategy_id(self) -> str: + return "fbcache" + + @property + def name(self) -> str: + return "First Block Cache" + + def is_available(self) -> bool: + if importlib.util.find_spec("diffusers") is None: + return False + try: + from diffusers.hooks import apply_first_block_cache # noqa: F401 + from diffusers.hooks import FirstBlockCacheConfig # noqa: F401 + except Exception: + return False + return True + + def availability_badge(self) -> str: + if self.is_available(): + return "Ready" + return "Upgrade" + + def availability_reason(self) -> str | None: + if self.is_available(): + return None + return ( + "First Block Cache needs diffusers >= 0.36. " + "Run the GPU runtime installer to upgrade diffusers." + ) + + def applies_to(self) -> frozenset[str]: + return frozenset({"image", "video"}) + + def recommended_thresholds(self) -> dict[str, float]: + """UI hints for the threshold slider per domain.""" + return {"image": 0.12, "video": 0.08} + + def apply_diffusers_hook( + self, + pipeline: Any, + *, + num_inference_steps: int, + rel_l1_thresh: float | None, + ) -> None: + """Attach FBCache to ``pipeline.transformer``. + + Raises ``NotImplementedError`` for pipelines without a ``transformer`` + attribute (UNet-based SD1.5/SDXL) — caller swallows this into a + runtimeNote so the user sees "not applied" instead of a crash. + """ + try: + from diffusers.hooks import apply_first_block_cache, FirstBlockCacheConfig + except ImportError as exc: + raise NotImplementedError( + f"diffusers FBCache hook unavailable: {exc}" + ) from exc + + transformer = getattr(pipeline, "transformer", None) + if transformer is None: + raise NotImplementedError( + "First Block Cache requires a DiT pipeline (with .transformer); " + "this pipeline appears to be UNet-based. Use TeaCache or stay on stock." + ) + + threshold = ( + rel_l1_thresh + if rel_l1_thresh is not None and rel_l1_thresh > 0 + else _DEFAULT_THRESHOLD + ) + # ``num_inference_steps`` is accepted for API parity with TeaCache + # but FBCache derives its own warmup internally — diffusers' hook + # only takes a threshold + optional num_blocks_to_skip. 
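A hedged usage sketch of this strategy on a FLUX pipeline. The model id, dtype and step count are illustrative; the 0.12 threshold and the ~1.8x figure are the diffusers-blog numbers quoted above, and the call goes through the same `apply_diffusers_hook` entry point the engine's dispatcher uses.

import torch
from diffusers import FluxPipeline

from cache_compression.firstblockcache import FirstBlockCacheStrategy

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")

strategy = FirstBlockCacheStrategy()
if strategy.is_available():
    strategy.apply_diffusers_hook(pipe, num_inference_steps=28, rel_l1_thresh=0.12)

# Subsequent calls reuse cached residuals whenever the first-block delta stays
# under the threshold -- roughly 1.8x faster on FLUX.1-dev per the blog numbers.
image = pipe("a lighthouse at dusk", num_inference_steps=28).images[0]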
+ del num_inference_steps # noqa: F841 — intentionally unused + + try: + config = FirstBlockCacheConfig(threshold=float(threshold)) + except TypeError: + # Older 0.36 betas exposed positional-only construction. Fall + # back to the no-arg form and set threshold post-construction + # if available. + config = FirstBlockCacheConfig() + if hasattr(config, "threshold"): + try: + config.threshold = float(threshold) + except Exception: + pass + + apply_first_block_cache(transformer, config) diff --git a/cache_compression/magcache.py b/cache_compression/magcache.py new file mode 100644 index 0000000..f485f3b --- /dev/null +++ b/cache_compression/magcache.py @@ -0,0 +1,140 @@ +"""MagCache — diffusers 0.38+ core cache hook (FLUX-only without calibration). + +Post-FU-026. Skips transformer blocks based on residual-magnitude decay over +the diffusion process. Requires per-model "magnitude ratios" — diffusers +ships pre-calibrated ratios for FLUX (``FLUX_MAG_RATIOS`` in +``diffusers.hooks.mag_cache``); other model families need a calibration +pass before MagCache can run. + +This adapter: +- Detects FLUX pipelines via class name and uses the shipped ratios. +- Raises ``NotImplementedError`` with a helpful message for other DiTs, + pointing to the ``MagCacheConfig(calibrate=True, ...)`` flow. + +Calibration UX is a planned follow-up; for now MagCache is FLUX-only in the +registry path. ``applies_to()`` stays ``{"image", "video"}`` so the strategy +is visible in both Studios — non-FLUX video DiTs surface the calibration +message via ``runtimeNote`` rather than crashing. +""" + +from __future__ import annotations + +import importlib.util +from typing import Any + +from . import CacheStrategy + + +def _import_config(): + try: + from diffusers import MagCacheConfig + return MagCacheConfig + except ImportError: + from diffusers.hooks import MagCacheConfig + return MagCacheConfig + + +def _import_flux_ratios(): + from diffusers.hooks.mag_cache import FLUX_MAG_RATIOS + return FLUX_MAG_RATIOS + + +class MagCacheStrategy(CacheStrategy): + """Magnitude-based cache backed by diffusers 0.38 ``MagCacheConfig``.""" + + @property + def strategy_id(self) -> str: + return "magcache" + + @property + def name(self) -> str: + return "MagCache" + + def is_available(self) -> bool: + if importlib.util.find_spec("diffusers") is None: + return False + try: + _import_config() + except Exception: + return False + return True + + def availability_badge(self) -> str: + return "Ready" if self.is_available() else "Upgrade" + + def availability_reason(self) -> str | None: + if self.is_available(): + return None + return ( + "MagCache needs diffusers >= 0.38. " + "Run the GPU runtime installer to upgrade diffusers." + ) + + def applies_to(self) -> frozenset[str]: + return frozenset({"image", "video"}) + + def recommended_thresholds(self) -> dict[str, float]: + # MagCache's main knob is the calibration ratio array, not a + # single threshold. The slider value is ignored by this adapter + # and the dispatcher passes through whatever the UI sends. 
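A hedged sketch of the calibration flow this adapter's NotImplementedError points non-FLUX users to. It assumes the `MagCacheConfig(calibrate=True, num_inference_steps=...)` surface that the error message describes; the helper name and the printed-ratio handoff are illustrative.

def calibrate_magcache(pipe, prompt: str, steps: int = 30) -> None:
    """Illustrative only: one calibration pass, then reuse the printed ratios."""
    try:
        from diffusers import MagCacheConfig
    except ImportError:
        from diffusers.hooks import MagCacheConfig

    pipe.transformer.enable_cache(MagCacheConfig(calibrate=True, num_inference_steps=steps))
    pipe(prompt, num_inference_steps=steps)  # calibration pass prints the magnitude ratios
    # Copy the printed values into mag_ratios=[...] for real generations, e.g.:
    # pipe.transformer.enable_cache(MagCacheConfig(mag_ratios=[...], num_inference_steps=steps))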
+ return {"image": 0.0, "video": 0.0} + + @staticmethod + def _is_flux_pipeline(pipeline: Any) -> bool: + cls_name = pipeline.__class__.__name__.lower() + return "flux" in cls_name + + def apply_diffusers_hook( + self, + pipeline: Any, + *, + num_inference_steps: int, + rel_l1_thresh: float | None, + ) -> None: + try: + MagCacheConfig = _import_config() + except ImportError as exc: + raise NotImplementedError( + f"diffusers MagCache hook unavailable: {exc}" + ) from exc + + transformer = getattr(pipeline, "transformer", None) + if transformer is None: + raise NotImplementedError( + "MagCache requires a DiT pipeline (with .transformer); " + "this pipeline appears to be UNet-based." + ) + if not hasattr(transformer, "enable_cache"): + raise NotImplementedError( + "transformer.enable_cache is not available on this pipeline. " + "Diffusers >= 0.38 is required for the MagCache registry path." + ) + + del rel_l1_thresh # MagCache has no single-threshold knob. + + if not self._is_flux_pipeline(pipeline): + raise NotImplementedError( + "MagCache requires per-model calibration. Pre-calibrated ratios " + "ship only for FLUX (FLUX_MAG_RATIOS). For other DiTs, run a " + "calibration pass first via " + "MagCacheConfig(calibrate=True, num_inference_steps=...) and " + "pass the printed ratios via mag_ratios=[...]. Until " + "calibration UX lands, use FBCache or TaylorSeer." + ) + + try: + flux_ratios = _import_flux_ratios() + except ImportError as exc: + raise NotImplementedError( + f"FLUX_MAG_RATIOS missing from diffusers.hooks.mag_cache: {exc}" + ) from exc + + try: + config = MagCacheConfig( + mag_ratios=list(flux_ratios), + num_inference_steps=int(num_inference_steps), + ) + except TypeError: + config = MagCacheConfig(mag_ratios=list(flux_ratios)) + + transformer.enable_cache(config) diff --git a/cache_compression/pab.py b/cache_compression/pab.py new file mode 100644 index 0000000..6a5e6b2 --- /dev/null +++ b/cache_compression/pab.py @@ -0,0 +1,119 @@ +"""Pyramid Attention Broadcast — diffusers 0.38+ core cache hook. + +Post-FU-026. Skips spatial-attention computations on a fixed timestep +schedule, exploiting the small differences in attention outputs between +successive denoise steps. Most effective on video DiTs where timestep +schedules are long (CogVideoX, HunyuanVideo, Wan). + +Reuses the shared ``apply_diffusion_cache_strategy`` dispatcher's +``rel_l1_thresh`` field as the *spatial_attention_block_skip_range* knob +(rounded to int, clamped >= 2). Default 2 = skip every other step's +spatial attention. +""" + +from __future__ import annotations + +import importlib.util +from typing import Any + +from . import CacheStrategy + + +_DEFAULT_SKIP_RANGE = 2 +# Diffusers blog default for CogVideoX. Smaller intervals slow inference; +# larger intervals harm quality. Validated for video DiTs. 
+_DEFAULT_TIMESTEP_RANGE = (100, 800) + + +def _import_config(): + try: + from diffusers import PyramidAttentionBroadcastConfig + return PyramidAttentionBroadcastConfig + except ImportError: + from diffusers.hooks import PyramidAttentionBroadcastConfig + return PyramidAttentionBroadcastConfig + + +class PyramidAttentionBroadcastStrategy(CacheStrategy): + """Spatial-attention skip schedule backed by diffusers 0.38 PAB hook.""" + + @property + def strategy_id(self) -> str: + return "pab" + + @property + def name(self) -> str: + return "Pyramid Attention Broadcast" + + def is_available(self) -> bool: + if importlib.util.find_spec("diffusers") is None: + return False + try: + _import_config() + except Exception: + return False + return True + + def availability_badge(self) -> str: + return "Ready" if self.is_available() else "Upgrade" + + def availability_reason(self) -> str | None: + if self.is_available(): + return None + return ( + "Pyramid Attention Broadcast needs diffusers >= 0.38. " + "Run the GPU runtime installer to upgrade diffusers." + ) + + def applies_to(self) -> frozenset[str]: + return frozenset({"image", "video"}) + + def recommended_thresholds(self) -> dict[str, float]: + # Slider repurposed as skip_range. Image DiTs run shorter + # schedules where larger skips bite harder; video DiTs tolerate + # bigger intervals. + return {"image": 2.0, "video": 3.0} + + def apply_diffusers_hook( + self, + pipeline: Any, + *, + num_inference_steps: int, + rel_l1_thresh: float | None, + ) -> None: + try: + PyramidAttentionBroadcastConfig = _import_config() + except ImportError as exc: + raise NotImplementedError( + f"diffusers PAB hook unavailable: {exc}" + ) from exc + + transformer = getattr(pipeline, "transformer", None) + if transformer is None: + raise NotImplementedError( + "Pyramid Attention Broadcast requires a DiT pipeline " + "(with .transformer); this pipeline appears to be UNet-based." + ) + if not hasattr(transformer, "enable_cache"): + raise NotImplementedError( + "transformer.enable_cache is not available on this pipeline. " + "Diffusers >= 0.38 is required for the PAB registry path." + ) + + if rel_l1_thresh is not None and rel_l1_thresh >= 2: + skip_range = int(round(rel_l1_thresh)) + else: + skip_range = _DEFAULT_SKIP_RANGE + + del num_inference_steps # PAB derives its own schedule from timesteps. + + try: + config = PyramidAttentionBroadcastConfig( + spatial_attention_block_skip_range=skip_range, + spatial_attention_timestep_skip_range=_DEFAULT_TIMESTEP_RANGE, + current_timestep_callback=lambda: getattr(pipeline, "current_timestep", 0), + ) + except TypeError: + config = PyramidAttentionBroadcastConfig() + + transformer.enable_cache(config) diff --git a/cache_compression/taylorseer.py b/cache_compression/taylorseer.py new file mode 100644 index 0000000..a60aceb --- /dev/null +++ b/cache_compression/taylorseer.py @@ -0,0 +1,116 @@ +"""TaylorSeer Cache — diffusers 0.38+ core cache hook. + +Post-FU-026. Approximates intermediate transformer activations across denoise +steps via a Taylor series expansion, reusing them at fixed intervals to skip +full forwards. Strong wall-time wins on FLUX (~1.6× at cache_interval=5, +max_order=1, disable_cache_before_step=10). + +Unlike FBCache (threshold-based), TaylorSeer is interval-based. Reuses the +shared ``apply_diffusion_cache_strategy`` dispatcher's ``rel_l1_thresh`` +field as the *cache_interval* knob (rounded to nearest int, clamped >= 2). 
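A quick check of the slider-to-interval mapping described here, mirroring the clamping done in `apply_diffusers_hook` below:

def cache_interval_from_slider(rel_l1_thresh: float | None) -> int:
    if rel_l1_thresh is not None and rel_l1_thresh >= 2:
        return int(round(rel_l1_thresh))
    return 5  # _DEFAULT_CACHE_INTERVAL


assert cache_interval_from_slider(None) == 5
assert cache_interval_from_slider(0.12) == 5   # threshold-shaped leftover falls back
assert cache_interval_from_slider(4.0) == 4
assert cache_interval_from_slider(5.6) == 6    # rounded to the nearest int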
+When ``rel_l1_thresh`` is ``None`` or below 2, falls back to the +diffusers-blog default of 5. +""" + +from __future__ import annotations + +import importlib.util +from typing import Any + +from . import CacheStrategy + + +_DEFAULT_CACHE_INTERVAL = 5 +_DEFAULT_MAX_ORDER = 1 + + +def _import_config(): + try: + from diffusers import TaylorSeerCacheConfig + return TaylorSeerCacheConfig + except ImportError: + from diffusers.hooks import TaylorSeerCacheConfig + return TaylorSeerCacheConfig + + +class TaylorSeerCacheStrategy(CacheStrategy): + """Taylor-series interval cache backed by diffusers 0.38 ``TaylorSeerCacheConfig``.""" + + @property + def strategy_id(self) -> str: + return "taylorseer" + + @property + def name(self) -> str: + return "TaylorSeer Cache" + + def is_available(self) -> bool: + if importlib.util.find_spec("diffusers") is None: + return False + try: + _import_config() + except Exception: + return False + return True + + def availability_badge(self) -> str: + return "Ready" if self.is_available() else "Upgrade" + + def availability_reason(self) -> str | None: + if self.is_available(): + return None + return ( + "TaylorSeer Cache needs diffusers >= 0.38. " + "Run the GPU runtime installer to upgrade diffusers." + ) + + def applies_to(self) -> frozenset[str]: + return frozenset({"image", "video"}) + + def recommended_thresholds(self) -> dict[str, float]: + return {"image": 5.0, "video": 4.0} + + def apply_diffusers_hook( + self, + pipeline: Any, + *, + num_inference_steps: int, + rel_l1_thresh: float | None, + ) -> None: + try: + TaylorSeerCacheConfig = _import_config() + except ImportError as exc: + raise NotImplementedError( + f"diffusers TaylorSeer hook unavailable: {exc}" + ) from exc + + transformer = getattr(pipeline, "transformer", None) + if transformer is None: + raise NotImplementedError( + "TaylorSeer Cache requires a DiT pipeline (with .transformer); " + "this pipeline appears to be UNet-based. Use TeaCache or stay on stock." + ) + if not hasattr(transformer, "enable_cache"): + raise NotImplementedError( + "transformer.enable_cache is not available on this pipeline. " + "Diffusers >= 0.38 is required for the TaylorSeer registry path." 
+ ) + + if rel_l1_thresh is not None and rel_l1_thresh >= 2: + cache_interval = int(round(rel_l1_thresh)) + else: + cache_interval = _DEFAULT_CACHE_INTERVAL + + steps = max(1, int(num_inference_steps)) + warmup = max(0, min(steps // 2, max(2, steps // 4))) if steps >= 4 else 0 + + try: + config = TaylorSeerCacheConfig( + cache_interval=cache_interval, + max_order=_DEFAULT_MAX_ORDER, + disable_cache_before_step=warmup, + ) + except TypeError: + config = TaylorSeerCacheConfig() + + transformer.enable_cache(config) diff --git a/package-lock.json b/package-lock.json index 40fdbea..df061b8 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,26 +1,32 @@ { "name": "chaosengine-desktop", - "version": "0.7.0-rc.5", + "version": "0.7.2", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "chaosengine-desktop", - "version": "0.7.0-rc.5", + "version": "0.7.2", "dependencies": { "@tauri-apps/api": "^2.1.0", "@tauri-apps/plugin-dialog": "^2.7.0", "@tauri-apps/plugin-opener": "^2.5.3", "@tauri-apps/plugin-process": "^2.0.0", "@tauri-apps/plugin-updater": "^2.0.0", + "katex": "^0.16.45", "react": "^18.3.1", "react-dom": "^18.3.1", - "react-markdown": "^10.1.0" + "react-markdown": "^10.1.0", + "react-syntax-highlighter": "^15.6.6", + "rehype-katex": "^7.0.1", + "remark-gfm": "^4.0.1", + "remark-math": "^6.0.0" }, "devDependencies": { "@tauri-apps/cli": "^2.1.0", "@types/react": "^18.3.12", "@types/react-dom": "^18.3.1", + "@types/react-syntax-highlighter": "^15.5.13", "@vitejs/plugin-react": "^5.1.0", "typescript": "^5.6.3", "vite": "^7.3.2", @@ -261,6 +267,15 @@ "@babel/core": "^7.0.0-0" } }, + "node_modules/@babel/runtime": { + "version": "7.29.2", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.29.2.tgz", + "integrity": "sha512-JiDShH45zKHWyGe4ZNVRrCjBz8Nh9TMmZG1kh4QTK8hCBTWBi8Da+i7s1fJw7/lYpM4ccepSNfqzZ/QvABBi5g==", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, "node_modules/@babel/template": { "version": "7.28.6", "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.28.6.tgz", @@ -1521,6 +1536,12 @@ "@types/unist": "*" } }, + "node_modules/@types/katex": { + "version": "0.16.8", + "resolved": "https://registry.npmjs.org/@types/katex/-/katex-0.16.8.tgz", + "integrity": "sha512-trgaNyfU+Xh2Tc+ABIb44a5AYUpicB3uwirOioeOkNPPbmgRNtcWyDeeFRzjPZENO9Vq8gvVqfhaaXWLlevVwg==", + "license": "MIT" + }, "node_modules/@types/mdast": { "version": "4.0.4", "resolved": "https://registry.npmjs.org/@types/mdast/-/mdast-4.0.4.tgz", @@ -1562,6 +1583,16 @@ "@types/react": "^18.0.0" } }, + "node_modules/@types/react-syntax-highlighter": { + "version": "15.5.13", + "resolved": "https://registry.npmjs.org/@types/react-syntax-highlighter/-/react-syntax-highlighter-15.5.13.tgz", + "integrity": "sha512-uLGJ87j6Sz8UaBAooU0T6lWJ0dBmjZgN1PZTrj05TNql2/XpC6+4HhMT5syIdFUUt+FASfCeLLv4kBygNU+8qA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/react": "*" + } + }, "node_modules/@types/unist": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", @@ -1866,6 +1897,15 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/commander": { + "version": "8.3.0", + "resolved": "https://registry.npmjs.org/commander/-/commander-8.3.0.tgz", + "integrity": "sha512-OkTL9umf+He2DZkUq8f8J9of7yL6RJKI24dVITBmNfZBmri9zYZQrKkuXiKhyfPSu8tUhnVBB1iKXevvnlR4Ww==", + "license": "MIT", + "engines": { + "node": ">= 12" + } + }, "node_modules/convert-source-map": { "version": "2.0.0", "resolved": 
"https://registry.npmjs.org/convert-source-map/-/convert-source-map-2.0.0.tgz", @@ -1938,6 +1978,18 @@ "dev": true, "license": "ISC" }, + "node_modules/entities": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz", + "integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, "node_modules/es-module-lexer": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-2.0.0.tgz", @@ -1997,6 +2049,18 @@ "node": ">=6" } }, + "node_modules/escape-string-regexp": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-5.0.0.tgz", + "integrity": "sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw==", + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/estree-util-is-identifier-name": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/estree-util-is-identifier-name/-/estree-util-is-identifier-name-3.0.0.tgz", @@ -2033,6 +2097,19 @@ "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==", "license": "MIT" }, + "node_modules/fault": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/fault/-/fault-1.0.4.tgz", + "integrity": "sha512-CJ0HCB5tL5fYTEA7ToAq5+kTwd++Borf1/bifxd9iT70QcXr4MRrO3Llf8Ifs70q+SJcGHFtnIE/Nw6giCtECA==", + "license": "MIT", + "dependencies": { + "format": "^0.2.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/fdir": { "version": "6.5.0", "resolved": "https://registry.npmjs.org/fdir/-/fdir-6.5.0.tgz", @@ -2051,6 +2128,14 @@ } } }, + "node_modules/format": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/format/-/format-0.2.2.tgz", + "integrity": "sha512-wzsgA6WOq+09wrU1tsJ09udeR/YZRaeArL9e1wPbFg3GG2yDnC2ldKpxs4xunpFF9DgqCqOIra3bc1HWrJ37Ww==", + "engines": { + "node": ">=0.4.x" + } + }, "node_modules/fsevents": { "version": "2.3.3", "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", @@ -2076,6 +2161,158 @@ "node": ">=6.9.0" } }, + "node_modules/hast-util-from-dom": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/hast-util-from-dom/-/hast-util-from-dom-5.0.1.tgz", + "integrity": "sha512-N+LqofjR2zuzTjCPzyDUdSshy4Ma6li7p/c3pA78uTwzFgENbgbUrm2ugwsOdcjI1muO+o6Dgzp9p8WHtn/39Q==", + "license": "ISC", + "dependencies": { + "@types/hast": "^3.0.0", + "hastscript": "^9.0.0", + "web-namespaces": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-from-dom/node_modules/hast-util-parse-selector": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/hast-util-parse-selector/-/hast-util-parse-selector-4.0.0.tgz", + "integrity": "sha512-wkQCkSYoOGCRKERFWcxMVMOcYE2K1AaNLU8DXS9arxnLOUEWbOXKXiJUNzEpqZ3JOKpnha3jkFrumEjVliDe7A==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-from-dom/node_modules/hastscript": { + "version": "9.0.1", + "resolved": 
"https://registry.npmjs.org/hastscript/-/hastscript-9.0.1.tgz", + "integrity": "sha512-g7df9rMFX/SPi34tyGCyUBREQoKkapwdY/T04Qn9TDWfHhAYt4/I0gMVirzK5wEzeUqIjEB+LXC/ypb7Aqno5w==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0", + "comma-separated-tokens": "^2.0.0", + "hast-util-parse-selector": "^4.0.0", + "property-information": "^7.0.0", + "space-separated-tokens": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-from-html": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/hast-util-from-html/-/hast-util-from-html-2.0.3.tgz", + "integrity": "sha512-CUSRHXyKjzHov8yKsQjGOElXy/3EKpyX56ELnkHH34vDVw1N1XSQ1ZcAvTyAPtGqLTuKP/uxM+aLkSPqF/EtMw==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0", + "devlop": "^1.1.0", + "hast-util-from-parse5": "^8.0.0", + "parse5": "^7.0.0", + "vfile": "^6.0.0", + "vfile-message": "^4.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-from-html-isomorphic": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/hast-util-from-html-isomorphic/-/hast-util-from-html-isomorphic-2.0.0.tgz", + "integrity": "sha512-zJfpXq44yff2hmE0XmwEOzdWin5xwH+QIhMLOScpX91e/NSGPsAzNCvLQDIEPyO2TXi+lBmU6hjLIhV8MwP2kw==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0", + "hast-util-from-dom": "^5.0.0", + "hast-util-from-html": "^2.0.0", + "unist-util-remove-position": "^5.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-from-parse5": { + "version": "8.0.3", + "resolved": "https://registry.npmjs.org/hast-util-from-parse5/-/hast-util-from-parse5-8.0.3.tgz", + "integrity": "sha512-3kxEVkEKt0zvcZ3hCRYI8rqrgwtlIOFMWkbclACvjlDw8Li9S2hk/d51OI0nr/gIpdMHNepwgOKqZ/sy0Clpyg==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0", + "@types/unist": "^3.0.0", + "devlop": "^1.0.0", + "hastscript": "^9.0.0", + "property-information": "^7.0.0", + "vfile": "^6.0.0", + "vfile-location": "^5.0.0", + "web-namespaces": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-from-parse5/node_modules/hast-util-parse-selector": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/hast-util-parse-selector/-/hast-util-parse-selector-4.0.0.tgz", + "integrity": "sha512-wkQCkSYoOGCRKERFWcxMVMOcYE2K1AaNLU8DXS9arxnLOUEWbOXKXiJUNzEpqZ3JOKpnha3jkFrumEjVliDe7A==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-from-parse5/node_modules/hastscript": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/hastscript/-/hastscript-9.0.1.tgz", + "integrity": "sha512-g7df9rMFX/SPi34tyGCyUBREQoKkapwdY/T04Qn9TDWfHhAYt4/I0gMVirzK5wEzeUqIjEB+LXC/ypb7Aqno5w==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0", + "comma-separated-tokens": "^2.0.0", + "hast-util-parse-selector": "^4.0.0", + "property-information": "^7.0.0", + "space-separated-tokens": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-is-element": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/hast-util-is-element/-/hast-util-is-element-3.0.0.tgz", + 
"integrity": "sha512-Val9mnv2IWpLbNPqc/pUem+a7Ipj2aHacCwgNfTiK0vJKl0LF+4Ba4+v1oPHFpf3bLYmreq0/l3Gud9S5OH42g==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-parse-selector": { + "version": "2.2.5", + "resolved": "https://registry.npmjs.org/hast-util-parse-selector/-/hast-util-parse-selector-2.2.5.tgz", + "integrity": "sha512-7j6mrk/qqkSehsM92wQjdIgWM2/BW61u/53G6xmC8i1OmEdKLHbk419QKQUjz6LglWsfqoiHmyMRkP1BGjecNQ==", + "license": "MIT", + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/hast-util-to-jsx-runtime": { "version": "2.3.6", "resolved": "https://registry.npmjs.org/hast-util-to-jsx-runtime/-/hast-util-to-jsx-runtime-2.3.6.tgz", @@ -2103,6 +2340,22 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/hast-util-to-text": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/hast-util-to-text/-/hast-util-to-text-4.0.2.tgz", + "integrity": "sha512-KK6y/BN8lbaq654j7JgBydev7wuNMcID54lkRav1P0CaE1e47P72AWWPiGKXTJU271ooYzcvTAn/Zt0REnvc7A==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0", + "@types/unist": "^3.0.0", + "hast-util-is-element": "^3.0.0", + "unist-util-find-after": "^5.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/hast-util-whitespace": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/hast-util-whitespace/-/hast-util-whitespace-3.0.0.tgz", @@ -2116,6 +2369,86 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/hastscript": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/hastscript/-/hastscript-6.0.0.tgz", + "integrity": "sha512-nDM6bvd7lIqDUiYEiu5Sl/+6ReP0BMk/2f4U/Rooccxkj0P5nm+acM5PrGJ/t5I8qPGiqZSE6hVAwZEdZIvP4w==", + "license": "MIT", + "dependencies": { + "@types/hast": "^2.0.0", + "comma-separated-tokens": "^1.0.0", + "hast-util-parse-selector": "^2.0.0", + "property-information": "^5.0.0", + "space-separated-tokens": "^1.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hastscript/node_modules/@types/hast": { + "version": "2.3.10", + "resolved": "https://registry.npmjs.org/@types/hast/-/hast-2.3.10.tgz", + "integrity": "sha512-McWspRw8xx8J9HurkVBfYj0xKoE25tOFlHGdx4MJ5xORQrMGZNqJhVQWaIbm6Oyla5kYOXtDiopzKRJzEOkwJw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2" + } + }, + "node_modules/hastscript/node_modules/@types/unist": { + "version": "2.0.11", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", + "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==", + "license": "MIT" + }, + "node_modules/hastscript/node_modules/comma-separated-tokens": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/comma-separated-tokens/-/comma-separated-tokens-1.0.8.tgz", + "integrity": "sha512-GHuDRO12Sypu2cV70d1dkA2EUmXHgntrzbpvOB+Qy+49ypNfGgFQIC2fhhXbnyrJRynDCAARsT7Ou0M6hirpfw==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/hastscript/node_modules/property-information": { + "version": "5.6.0", + "resolved": "https://registry.npmjs.org/property-information/-/property-information-5.6.0.tgz", + "integrity": 
"sha512-YUHSPk+A30YPv+0Qf8i9Mbfe/C0hdPXk1s1jPVToV8pk8BQtpw10ct89Eo7OWkutrwqvT0eicAxlOg3dOAu8JA==", + "license": "MIT", + "dependencies": { + "xtend": "^4.0.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/hastscript/node_modules/space-separated-tokens": { + "version": "1.1.5", + "resolved": "https://registry.npmjs.org/space-separated-tokens/-/space-separated-tokens-1.1.5.tgz", + "integrity": "sha512-q/JSVd1Lptzhf5bkYm4ob4iWPjx0KiRe3sRFBNrVqbJkFaBm5vbbowy1mymoPNLRa52+oadOhJ+K49wsSeSjTA==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/highlight.js": { + "version": "10.7.3", + "resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-10.7.3.tgz", + "integrity": "sha512-tzcUFauisWKNHaRkN4Wjl/ZA07gENAjFl3J/c480dprkGTg5EQstgaNFqBfUqCq54kZRIEcreTsAgF/m2quD7A==", + "license": "BSD-3-Clause", + "engines": { + "node": "*" + } + }, + "node_modules/highlightjs-vue": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/highlightjs-vue/-/highlightjs-vue-1.0.0.tgz", + "integrity": "sha512-PDEfEF102G23vHmPhLyPboFCD+BkMGu+GuJe2d9/eH4FsCwvgBpnc9n0pGE+ffKdph38s6foEZiEjdgHdzp+IA==", + "license": "CC0-1.0" + }, "node_modules/html-url-attributes": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/html-url-attributes/-/html-url-attributes-3.0.1.tgz", @@ -2220,6 +2553,22 @@ "node": ">=6" } }, + "node_modules/katex": { + "version": "0.16.45", + "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.45.tgz", + "integrity": "sha512-pQpZbdBu7wCTmQUh7ufPmLr0pFoObnGUoL/yhtwJDgmmQpbkg/0HSVti25Fu4rmd1oCR6NGWe9vqTWuWv3GcNA==", + "funding": [ + "https://opencollective.com/katex", + "https://github.com/sponsors/katex" + ], + "license": "MIT", + "dependencies": { + "commander": "^8.3.0" + }, + "bin": { + "katex": "cli.js" + } + }, "node_modules/longest-streak": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/longest-streak/-/longest-streak-3.1.0.tgz", @@ -2242,6 +2591,20 @@ "loose-envify": "cli.js" } }, + "node_modules/lowlight": { + "version": "1.20.0", + "resolved": "https://registry.npmjs.org/lowlight/-/lowlight-1.20.0.tgz", + "integrity": "sha512-8Ktj+prEb1RoCPkEOrPMYUN/nCggB7qAWe3a7OpMjWQkh3l2RD5wKRQ+o8Q8YuI9RG/xs95waaI/E6ym/7NsTw==", + "license": "MIT", + "dependencies": { + "fault": "^1.0.0", + "highlight.js": "~10.7.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/lru-cache": { "version": "5.1.1", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", @@ -2262,6 +2625,32 @@ "@jridgewell/sourcemap-codec": "^1.5.5" } }, + "node_modules/markdown-table": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/markdown-table/-/markdown-table-3.0.4.tgz", + "integrity": "sha512-wiYz4+JrLyb/DqW2hkFJxP7Vd7JuTDm77fvbM8VfEQdmSMqcImWeeRbHwZjBjIFki/VaMK2BhFi7oUUZeM5bqw==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/mdast-util-find-and-replace": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/mdast-util-find-and-replace/-/mdast-util-find-and-replace-3.0.2.tgz", + "integrity": "sha512-Tmd1Vg/m3Xz43afeNxDIhWRtFZgM2VLyaf4vSTYwudTyeuTneoL3qtWMA5jeLyz/O1vDJmmV4QuScFCA2tBPwg==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "escape-string-regexp": "^5.0.0", + "unist-util-is": "^6.0.0", + "unist-util-visit-parents": "^6.0.0" + }, 
+ "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/mdast-util-from-markdown": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/mdast-util-from-markdown/-/mdast-util-from-markdown-2.0.3.tgz", @@ -2286,17 +2675,18 @@ "url": "https://opencollective.com/unified" } }, - "node_modules/mdast-util-mdx-expression": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/mdast-util-mdx-expression/-/mdast-util-mdx-expression-2.0.1.tgz", - "integrity": "sha512-J6f+9hUp+ldTZqKRSg7Vw5V6MqjATc+3E4gf3CFNcuZNWD8XdyI6zQ8GqH7f8169MM6P7hMBRDVGnn7oHB9kXQ==", + "node_modules/mdast-util-gfm": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm/-/mdast-util-gfm-3.1.0.tgz", + "integrity": "sha512-0ulfdQOM3ysHhCJ1p06l0b0VKlhU0wuQs3thxZQagjcjPrlFRqY215uZGHHJan9GEAXd9MbfPjFJz+qMkVR6zQ==", "license": "MIT", "dependencies": { - "@types/estree-jsx": "^1.0.0", - "@types/hast": "^3.0.0", - "@types/mdast": "^4.0.0", - "devlop": "^1.0.0", "mdast-util-from-markdown": "^2.0.0", + "mdast-util-gfm-autolink-literal": "^2.0.0", + "mdast-util-gfm-footnote": "^2.0.0", + "mdast-util-gfm-strikethrough": "^2.0.0", + "mdast-util-gfm-table": "^2.0.0", + "mdast-util-gfm-task-list-item": "^2.0.0", "mdast-util-to-markdown": "^2.0.0" }, "funding": { @@ -2304,23 +2694,142 @@ "url": "https://opencollective.com/unified" } }, - "node_modules/mdast-util-mdx-jsx": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/mdast-util-mdx-jsx/-/mdast-util-mdx-jsx-3.2.0.tgz", - "integrity": "sha512-lj/z8v0r6ZtsN/cGNNtemmmfoLAFZnjMbNyLzBafjzikOM+glrjNHPlf6lQDOTccj9n5b0PPihEBbhneMyGs1Q==", + "node_modules/mdast-util-gfm-autolink-literal": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-autolink-literal/-/mdast-util-gfm-autolink-literal-2.0.1.tgz", + "integrity": "sha512-5HVP2MKaP6L+G6YaxPNjuL0BPrq9orG3TsrZ9YXbA3vDw/ACI4MEsnoDpn6ZNm7GnZgtAcONJyPhOP8tNJQavQ==", "license": "MIT", "dependencies": { - "@types/estree-jsx": "^1.0.0", - "@types/hast": "^3.0.0", "@types/mdast": "^4.0.0", - "@types/unist": "^3.0.0", "ccount": "^2.0.0", - "devlop": "^1.1.0", - "mdast-util-from-markdown": "^2.0.0", - "mdast-util-to-markdown": "^2.0.0", - "parse-entities": "^4.0.0", - "stringify-entities": "^4.0.0", - "unist-util-stringify-position": "^4.0.0", + "devlop": "^1.0.0", + "mdast-util-find-and-replace": "^3.0.0", + "micromark-util-character": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-gfm-footnote": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-footnote/-/mdast-util-gfm-footnote-2.1.0.tgz", + "integrity": "sha512-sqpDWlsHn7Ac9GNZQMeUzPQSMzR6Wv0WKRNvQRg0KqHh02fpTz69Qc1QSseNX29bhz1ROIyNyxExfawVKTm1GQ==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "devlop": "^1.1.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0", + "micromark-util-normalize-identifier": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-gfm-strikethrough": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-strikethrough/-/mdast-util-gfm-strikethrough-2.0.0.tgz", + "integrity": "sha512-mKKb915TF+OC5ptj5bJ7WFRPdYtuHv0yTRxK2tJvi+BDqbkiG7h7u/9SI89nRAYcmap2xHQL9D+QG/6wSrTtXg==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + 
"mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-gfm-table": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-table/-/mdast-util-gfm-table-2.0.0.tgz", + "integrity": "sha512-78UEvebzz/rJIxLvE7ZtDd/vIQ0RHv+3Mh5DR96p7cS7HsBhYIICDBCu8csTNWNO6tBWfqXPWekRuj2FNOGOZg==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "devlop": "^1.0.0", + "markdown-table": "^3.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-gfm-task-list-item": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-task-list-item/-/mdast-util-gfm-task-list-item-2.0.0.tgz", + "integrity": "sha512-IrtvNvjxC1o06taBAVJznEnkiHxLFTzgonUdy8hzFVeDun0uTjxxrRGVaNFqkU1wJR3RBPEfsxmU6jDWPofrTQ==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "devlop": "^1.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-math": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/mdast-util-math/-/mdast-util-math-3.0.0.tgz", + "integrity": "sha512-Tl9GBNeG/AhJnQM221bJR2HPvLOSnLE/T9cJI9tlc6zwQk2nPk/4f0cHkOdEixQPC/j8UtKDdITswvLAy1OZ1w==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0", + "@types/mdast": "^4.0.0", + "devlop": "^1.0.0", + "longest-streak": "^3.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.1.0", + "unist-util-remove-position": "^5.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-mdx-expression": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/mdast-util-mdx-expression/-/mdast-util-mdx-expression-2.0.1.tgz", + "integrity": "sha512-J6f+9hUp+ldTZqKRSg7Vw5V6MqjATc+3E4gf3CFNcuZNWD8XdyI6zQ8GqH7f8169MM6P7hMBRDVGnn7oHB9kXQ==", + "license": "MIT", + "dependencies": { + "@types/estree-jsx": "^1.0.0", + "@types/hast": "^3.0.0", + "@types/mdast": "^4.0.0", + "devlop": "^1.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-mdx-jsx": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/mdast-util-mdx-jsx/-/mdast-util-mdx-jsx-3.2.0.tgz", + "integrity": "sha512-lj/z8v0r6ZtsN/cGNNtemmmfoLAFZnjMbNyLzBafjzikOM+glrjNHPlf6lQDOTccj9n5b0PPihEBbhneMyGs1Q==", + "license": "MIT", + "dependencies": { + "@types/estree-jsx": "^1.0.0", + "@types/hast": "^3.0.0", + "@types/mdast": "^4.0.0", + "@types/unist": "^3.0.0", + "ccount": "^2.0.0", + "devlop": "^1.1.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0", + "parse-entities": "^4.0.0", + "stringify-entities": "^4.0.0", + "unist-util-stringify-position": "^4.0.0", "vfile-message": "^4.0.0" }, "funding": { @@ -2484,6 +2993,146 @@ "micromark-util-types": "^2.0.0" } }, + "node_modules/micromark-extension-gfm": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm/-/micromark-extension-gfm-3.0.0.tgz", + "integrity": 
"sha512-vsKArQsicm7t0z2GugkCKtZehqUm31oeGBV/KVSorWSy8ZlNAv7ytjFhvaryUiCUJYqs+NoE6AFhpQvBTM6Q4w==", + "license": "MIT", + "dependencies": { + "micromark-extension-gfm-autolink-literal": "^2.0.0", + "micromark-extension-gfm-footnote": "^2.0.0", + "micromark-extension-gfm-strikethrough": "^2.0.0", + "micromark-extension-gfm-table": "^2.0.0", + "micromark-extension-gfm-tagfilter": "^2.0.0", + "micromark-extension-gfm-task-list-item": "^2.0.0", + "micromark-util-combine-extensions": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-autolink-literal": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-autolink-literal/-/micromark-extension-gfm-autolink-literal-2.1.0.tgz", + "integrity": "sha512-oOg7knzhicgQ3t4QCjCWgTmfNhvQbDDnJeVu9v81r7NltNCVmhPy1fJRX27pISafdjL+SVc4d3l48Gb6pbRypw==", + "license": "MIT", + "dependencies": { + "micromark-util-character": "^2.0.0", + "micromark-util-sanitize-uri": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-footnote": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-footnote/-/micromark-extension-gfm-footnote-2.1.0.tgz", + "integrity": "sha512-/yPhxI1ntnDNsiHtzLKYnE3vf9JZ6cAisqVDauhp4CEHxlb4uoOTxOCJ+9s51bIB8U1N1FJ1RXOKTIlD5B/gqw==", + "license": "MIT", + "dependencies": { + "devlop": "^1.0.0", + "micromark-core-commonmark": "^2.0.0", + "micromark-factory-space": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-normalize-identifier": "^2.0.0", + "micromark-util-sanitize-uri": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-strikethrough": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-strikethrough/-/micromark-extension-gfm-strikethrough-2.1.0.tgz", + "integrity": "sha512-ADVjpOOkjz1hhkZLlBiYA9cR2Anf8F4HqZUO6e5eDcPQd0Txw5fxLzzxnEkSkfnD0wziSGiv7sYhk/ktvbf1uw==", + "license": "MIT", + "dependencies": { + "devlop": "^1.0.0", + "micromark-util-chunked": "^2.0.0", + "micromark-util-classify-character": "^2.0.0", + "micromark-util-resolve-all": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-table": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-table/-/micromark-extension-gfm-table-2.1.1.tgz", + "integrity": "sha512-t2OU/dXXioARrC6yWfJ4hqB7rct14e8f7m0cbI5hUmDyyIlwv5vEtooptH8INkbLzOatzKuVbQmAYcbWoyz6Dg==", + "license": "MIT", + "dependencies": { + "devlop": "^1.0.0", + "micromark-factory-space": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-tagfilter": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-tagfilter/-/micromark-extension-gfm-tagfilter-2.0.0.tgz", + "integrity": 
"sha512-xHlTOmuCSotIA8TW1mDIM6X2O1SiX5P9IuDtqGonFhEK0qgRI4yeC6vMxEV2dgyr2TiD+2PQ10o+cOhdVAcwfg==", + "license": "MIT", + "dependencies": { + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-task-list-item": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-task-list-item/-/micromark-extension-gfm-task-list-item-2.1.0.tgz", + "integrity": "sha512-qIBZhqxqI6fjLDYFTBIa4eivDMnP+OZqsNwmQ3xNLE4Cxwc+zfQEfbs6tzAo2Hjq+bh6q5F+Z8/cksrLFYWQQw==", + "license": "MIT", + "dependencies": { + "devlop": "^1.0.0", + "micromark-factory-space": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-math": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-math/-/micromark-extension-math-3.1.0.tgz", + "integrity": "sha512-lvEqd+fHjATVs+2v/8kg9i5Q0AP2k85H0WUOwpIVvUML8BapsMvh1XAogmQjOCsLpoKRCVQqEkQBB3NhVBcsOg==", + "license": "MIT", + "dependencies": { + "@types/katex": "^0.16.0", + "devlop": "^1.0.0", + "katex": "^0.16.0", + "micromark-factory-space": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/micromark-factory-destination": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/micromark-factory-destination/-/micromark-factory-destination-2.0.1.tgz", @@ -2925,6 +3574,18 @@ "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==", "license": "MIT" }, + "node_modules/parse5": { + "version": "7.3.0", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.3.0.tgz", + "integrity": "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==", + "license": "MIT", + "dependencies": { + "entities": "^6.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, "node_modules/pathe": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/pathe/-/pathe-2.0.3.tgz", @@ -2981,6 +3642,15 @@ "node": "^10 || ^12 || >=14" } }, + "node_modules/prismjs": { + "version": "1.30.0", + "resolved": "https://registry.npmjs.org/prismjs/-/prismjs-1.30.0.tgz", + "integrity": "sha512-DEvV2ZF2r2/63V+tK8hQvrR2ZGn10srHbXviTlcv7Kpzw8jWiNTqbVgjO3IY8RxrrOUF8VPMQQFysYYYv0YZxw==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/property-information": { "version": "7.1.0", "resolved": "https://registry.npmjs.org/property-information/-/property-information-7.1.0.tgz", @@ -3053,6 +3723,192 @@ "node": ">=0.10.0" } }, + "node_modules/react-syntax-highlighter": { + "version": "15.6.6", + "resolved": "https://registry.npmjs.org/react-syntax-highlighter/-/react-syntax-highlighter-15.6.6.tgz", + "integrity": "sha512-DgXrc+AZF47+HvAPEmn7Ua/1p10jNoVZVI/LoPiYdtY+OM+/nG5yefLHKJwdKqY1adMuHFbeyBaG9j64ML7vTw==", + "license": "MIT", + "dependencies": { + "@babel/runtime": "^7.3.1", + "highlight.js": "^10.4.1", + "highlightjs-vue": "^1.0.0", + "lowlight": "^1.17.0", + "prismjs": "^1.30.0", + "refractor": "^3.6.0" + }, + "peerDependencies": { + "react": ">= 0.14.0" + } + }, + "node_modules/refractor": { + 
"version": "3.6.0", + "resolved": "https://registry.npmjs.org/refractor/-/refractor-3.6.0.tgz", + "integrity": "sha512-MY9W41IOWxxk31o+YvFCNyNzdkc9M20NoZK5vq6jkv4I/uh2zkWcfudj0Q1fovjUQJrNewS9NMzeTtqPf+n5EA==", + "license": "MIT", + "dependencies": { + "hastscript": "^6.0.0", + "parse-entities": "^2.0.0", + "prismjs": "~1.27.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/refractor/node_modules/character-entities": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/character-entities/-/character-entities-1.2.4.tgz", + "integrity": "sha512-iBMyeEHxfVnIakwOuDXpVkc54HijNgCyQB2w0VfGQThle6NXn50zU6V/u+LDhxHcDUPojn6Kpga3PTAD8W1bQw==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/refractor/node_modules/character-entities-legacy": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/character-entities-legacy/-/character-entities-legacy-1.1.4.tgz", + "integrity": "sha512-3Xnr+7ZFS1uxeiUDvV02wQ+QDbc55o97tIV5zHScSPJpcLm/r0DFPcoY3tYRp+VZukxuMeKgXYmsXQHO05zQeA==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/refractor/node_modules/character-reference-invalid": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/character-reference-invalid/-/character-reference-invalid-1.1.4.tgz", + "integrity": "sha512-mKKUkUbhPpQlCOfIuZkvSEgktjPFIsZKRRbC6KWVEMvlzblj3i3asQv5ODsrwt0N3pHAEvjP8KTQPHkp0+6jOg==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/refractor/node_modules/is-alphabetical": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/is-alphabetical/-/is-alphabetical-1.0.4.tgz", + "integrity": "sha512-DwzsA04LQ10FHTZuL0/grVDk4rFoVH1pjAToYwBrHSxcrBIGQuXrQMtD5U1b0U2XVgKZCTLLP8u2Qxqhy3l2Vg==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/refractor/node_modules/is-alphanumerical": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/is-alphanumerical/-/is-alphanumerical-1.0.4.tgz", + "integrity": "sha512-UzoZUr+XfVz3t3v4KyGEniVL9BDRoQtY7tOyrRybkVNjDFWyo1yhXNGrrBTQxp3ib9BLAWs7k2YKBQsFRkZG9A==", + "license": "MIT", + "dependencies": { + "is-alphabetical": "^1.0.0", + "is-decimal": "^1.0.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/refractor/node_modules/is-decimal": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/is-decimal/-/is-decimal-1.0.4.tgz", + "integrity": "sha512-RGdriMmQQvZ2aqaQq3awNA6dCGtKpiDFcOzrTWrDAT2MiWrKQVPmxLGHl7Y2nNu6led0kEyoX0enY0qXYsv9zw==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/refractor/node_modules/is-hexadecimal": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/is-hexadecimal/-/is-hexadecimal-1.0.4.tgz", + "integrity": "sha512-gyPJuv83bHMpocVYoqof5VDiZveEoGoFL8m3BXNb2VW8Xs+rz9kqO8LOQ5DH6EsuvilT1ApazU0pyl+ytbPtlw==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/refractor/node_modules/parse-entities": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/parse-entities/-/parse-entities-2.0.0.tgz", + "integrity": 
"sha512-kkywGpCcRYhqQIchaWqZ875wzpS/bMKhz5HnN3p7wveJTkTtyAB/AlnS0f8DFSqYW1T82t6yEAkEcB+A1I3MbQ==", + "license": "MIT", + "dependencies": { + "character-entities": "^1.0.0", + "character-entities-legacy": "^1.0.0", + "character-reference-invalid": "^1.0.0", + "is-alphanumerical": "^1.0.0", + "is-decimal": "^1.0.0", + "is-hexadecimal": "^1.0.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/refractor/node_modules/prismjs": { + "version": "1.27.0", + "resolved": "https://registry.npmjs.org/prismjs/-/prismjs-1.27.0.tgz", + "integrity": "sha512-t13BGPUlFDR7wRB5kQDG4jjl7XeuH6jbJGt11JHPL96qwsEHNX2+68tFXqc1/k+/jALsbSWJKUOT/hcYAZ5LkA==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/rehype-katex": { + "version": "7.0.1", + "resolved": "https://registry.npmjs.org/rehype-katex/-/rehype-katex-7.0.1.tgz", + "integrity": "sha512-OiM2wrZ/wuhKkigASodFoo8wimG3H12LWQaH8qSPVJn9apWKFSH3YOCtbKpBorTVw/eI7cuT21XBbvwEswbIOA==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0", + "@types/katex": "^0.16.0", + "hast-util-from-html-isomorphic": "^2.0.0", + "hast-util-to-text": "^4.0.0", + "katex": "^0.16.0", + "unist-util-visit-parents": "^6.0.0", + "vfile": "^6.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/remark-gfm": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/remark-gfm/-/remark-gfm-4.0.1.tgz", + "integrity": "sha512-1quofZ2RQ9EWdeN34S79+KExV1764+wCUGop5CPL1WGdD0ocPpu91lzPGbwWMECpEpd42kJGQwzRfyov9j4yNg==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "mdast-util-gfm": "^3.0.0", + "micromark-extension-gfm": "^3.0.0", + "remark-parse": "^11.0.0", + "remark-stringify": "^11.0.0", + "unified": "^11.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/remark-math": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/remark-math/-/remark-math-6.0.0.tgz", + "integrity": "sha512-MMqgnP74Igy+S3WwnhQ7kqGlEerTETXMvJhrUzDikVZ2/uogJCb+WHUg97hK9/jcfc0dkD73s3LN8zU49cTEtA==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "mdast-util-math": "^3.0.0", + "micromark-extension-math": "^3.0.0", + "unified": "^11.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/remark-parse": { "version": "11.0.0", "resolved": "https://registry.npmjs.org/remark-parse/-/remark-parse-11.0.0.tgz", @@ -3086,6 +3942,21 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/remark-stringify": { + "version": "11.0.0", + "resolved": "https://registry.npmjs.org/remark-stringify/-/remark-stringify-11.0.0.tgz", + "integrity": "sha512-1OSmLd3awB/t8qdoEOMazZkNsfVTeY4fTsgzcQFdXNq8ToTN4ZGwrMnlda4K6smTFKD+GRV6O48i6Z4iKgPPpw==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "mdast-util-to-markdown": "^2.0.0", + "unified": "^11.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/rollup": { "version": "4.60.1", "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.60.1.tgz", @@ -3320,6 +4191,20 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/unist-util-find-after": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/unist-util-find-after/-/unist-util-find-after-5.0.0.tgz", + "integrity": 
"sha512-amQa0Ep2m6hE2g72AugUItjbuM8X8cGQnFoHk0pGfrFeT9GZhzN5SW8nRsiGKK7Aif4CrACPENkA6P/Lw6fHGQ==", + "license": "MIT", + "dependencies": { + "@types/unist": "^3.0.0", + "unist-util-is": "^6.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/unist-util-is": { "version": "6.0.1", "resolved": "https://registry.npmjs.org/unist-util-is/-/unist-util-is-6.0.1.tgz", @@ -3346,6 +4231,20 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/unist-util-remove-position": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/unist-util-remove-position/-/unist-util-remove-position-5.0.0.tgz", + "integrity": "sha512-Hp5Kh3wLxv0PHj9m2yZhhLt58KzPtEYKQQ4yxfYFEO7EvHwzyDYnduhHnY1mDxoqr7VUwVuHXk9RXKIiYS1N8Q==", + "license": "MIT", + "dependencies": { + "@types/unist": "^3.0.0", + "unist-util-visit": "^5.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/unist-util-stringify-position": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-4.0.0.tgz", @@ -3433,6 +4332,20 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/vfile-location": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/vfile-location/-/vfile-location-5.0.3.tgz", + "integrity": "sha512-5yXvWDEgqeiYiBe1lbxYF7UMAIm/IcopxMHrMQDq3nvKcjPKIhZklUKL+AE7J7uApI4kwe2snsK+eI6UTj9EHg==", + "license": "MIT", + "dependencies": { + "@types/unist": "^3.0.0", + "vfile": "^6.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/vfile-message": { "version": "4.0.3", "resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-4.0.3.tgz", @@ -3612,6 +4525,16 @@ } } }, + "node_modules/web-namespaces": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/web-namespaces/-/web-namespaces-2.0.1.tgz", + "integrity": "sha512-bKr1DkiNa2krS7qxNtdrtHAmzuYGFQLiQ13TsorsdT6ULTkPLKuu5+GsFpDlg6JFjUTwX2DyhMPG2be8uPrqsQ==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/why-is-node-running": { "version": "2.3.0", "resolved": "https://registry.npmjs.org/why-is-node-running/-/why-is-node-running-2.3.0.tgz", @@ -3629,6 +4552,15 @@ "node": ">=8" } }, + "node_modules/xtend": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", + "integrity": "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==", + "license": "MIT", + "engines": { + "node": ">=0.4" + } + }, "node_modules/yallist": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz", @@ -3802,6 +4734,11 @@ "@babel/helper-plugin-utils": "^7.27.1" } }, + "@babel/runtime": { + "version": "7.29.2", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.29.2.tgz", + "integrity": "sha512-JiDShH45zKHWyGe4ZNVRrCjBz8Nh9TMmZG1kh4QTK8hCBTWBi8Da+i7s1fJw7/lYpM4ccepSNfqzZ/QvABBi5g==" + }, "@babel/template": { "version": "7.28.6", "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.28.6.tgz", @@ -4468,6 +5405,11 @@ "@types/unist": "*" } }, + "@types/katex": { + "version": "0.16.8", + "resolved": "https://registry.npmjs.org/@types/katex/-/katex-0.16.8.tgz", + "integrity": "sha512-trgaNyfU+Xh2Tc+ABIb44a5AYUpicB3uwirOioeOkNPPbmgRNtcWyDeeFRzjPZENO9Vq8gvVqfhaaXWLlevVwg==" + }, 
"@types/mdast": { "version": "4.0.4", "resolved": "https://registry.npmjs.org/@types/mdast/-/mdast-4.0.4.tgz", @@ -4502,6 +5444,15 @@ "dev": true, "requires": {} }, + "@types/react-syntax-highlighter": { + "version": "15.5.13", + "resolved": "https://registry.npmjs.org/@types/react-syntax-highlighter/-/react-syntax-highlighter-15.5.13.tgz", + "integrity": "sha512-uLGJ87j6Sz8UaBAooU0T6lWJ0dBmjZgN1PZTrj05TNql2/XpC6+4HhMT5syIdFUUt+FASfCeLLv4kBygNU+8qA==", + "dev": true, + "requires": { + "@types/react": "*" + } + }, "@types/unist": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", @@ -4671,6 +5622,11 @@ "resolved": "https://registry.npmjs.org/comma-separated-tokens/-/comma-separated-tokens-2.0.3.tgz", "integrity": "sha512-Fu4hJdvzeylCfQPp9SGWidpzrMs7tTrlu6Vb8XGaRGck8QSNZJJp538Wrb60Lax4fPwR64ViY468OIUTbRlGZg==" }, + "commander": { + "version": "8.3.0", + "resolved": "https://registry.npmjs.org/commander/-/commander-8.3.0.tgz", + "integrity": "sha512-OkTL9umf+He2DZkUq8f8J9of7yL6RJKI24dVITBmNfZBmri9zYZQrKkuXiKhyfPSu8tUhnVBB1iKXevvnlR4Ww==" + }, "convert-source-map": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-2.0.0.tgz", @@ -4717,6 +5673,11 @@ "integrity": "sha512-IbxXrsTlD3hRodkLnbxAPP4OuJYdWCeM3IOdT+CpcMoIwIoDfCmRpEtSPfwBXxVkg9xmBeY7Lz2Eo2TDn/HC3Q==", "dev": true }, + "entities": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz", + "integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==" + }, "es-module-lexer": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-2.0.0.tgz", @@ -4763,6 +5724,11 @@ "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==", "dev": true }, + "escape-string-regexp": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-5.0.0.tgz", + "integrity": "sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw==" + }, "estree-util-is-identifier-name": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/estree-util-is-identifier-name/-/estree-util-is-identifier-name-3.0.0.tgz", @@ -4788,25 +5754,144 @@ "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz", "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==" }, - "fdir": { - "version": "6.5.0", - "resolved": "https://registry.npmjs.org/fdir/-/fdir-6.5.0.tgz", - "integrity": "sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg==", - "dev": true, - "requires": {} + "fault": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/fault/-/fault-1.0.4.tgz", + "integrity": "sha512-CJ0HCB5tL5fYTEA7ToAq5+kTwd++Borf1/bifxd9iT70QcXr4MRrO3Llf8Ifs70q+SJcGHFtnIE/Nw6giCtECA==", + "requires": { + "format": "^0.2.0" + } + }, + "fdir": { + "version": "6.5.0", + "resolved": "https://registry.npmjs.org/fdir/-/fdir-6.5.0.tgz", + "integrity": "sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg==", + "dev": true, + "requires": {} + }, + "format": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/format/-/format-0.2.2.tgz", + "integrity": "sha512-wzsgA6WOq+09wrU1tsJ09udeR/YZRaeArL9e1wPbFg3GG2yDnC2ldKpxs4xunpFF9DgqCqOIra3bc1HWrJ37Ww==" + }, + "fsevents": { + "version": 
"2.3.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, + "optional": true + }, + "gensync": { + "version": "1.0.0-beta.2", + "resolved": "https://registry.npmjs.org/gensync/-/gensync-1.0.0-beta.2.tgz", + "integrity": "sha512-3hN7NaskYvMDLQY55gnW3NQ+mesEAepTqlg+VEbj7zzqEMBVNhzcGYYeqFo/TlYz6eQiFcp1HcsCZO+nGgS8zg==", + "dev": true + }, + "hast-util-from-dom": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/hast-util-from-dom/-/hast-util-from-dom-5.0.1.tgz", + "integrity": "sha512-N+LqofjR2zuzTjCPzyDUdSshy4Ma6li7p/c3pA78uTwzFgENbgbUrm2ugwsOdcjI1muO+o6Dgzp9p8WHtn/39Q==", + "requires": { + "@types/hast": "^3.0.0", + "hastscript": "^9.0.0", + "web-namespaces": "^2.0.0" + }, + "dependencies": { + "hast-util-parse-selector": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/hast-util-parse-selector/-/hast-util-parse-selector-4.0.0.tgz", + "integrity": "sha512-wkQCkSYoOGCRKERFWcxMVMOcYE2K1AaNLU8DXS9arxnLOUEWbOXKXiJUNzEpqZ3JOKpnha3jkFrumEjVliDe7A==", + "requires": { + "@types/hast": "^3.0.0" + } + }, + "hastscript": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/hastscript/-/hastscript-9.0.1.tgz", + "integrity": "sha512-g7df9rMFX/SPi34tyGCyUBREQoKkapwdY/T04Qn9TDWfHhAYt4/I0gMVirzK5wEzeUqIjEB+LXC/ypb7Aqno5w==", + "requires": { + "@types/hast": "^3.0.0", + "comma-separated-tokens": "^2.0.0", + "hast-util-parse-selector": "^4.0.0", + "property-information": "^7.0.0", + "space-separated-tokens": "^2.0.0" + } + } + } + }, + "hast-util-from-html": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/hast-util-from-html/-/hast-util-from-html-2.0.3.tgz", + "integrity": "sha512-CUSRHXyKjzHov8yKsQjGOElXy/3EKpyX56ELnkHH34vDVw1N1XSQ1ZcAvTyAPtGqLTuKP/uxM+aLkSPqF/EtMw==", + "requires": { + "@types/hast": "^3.0.0", + "devlop": "^1.1.0", + "hast-util-from-parse5": "^8.0.0", + "parse5": "^7.0.0", + "vfile": "^6.0.0", + "vfile-message": "^4.0.0" + } + }, + "hast-util-from-html-isomorphic": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/hast-util-from-html-isomorphic/-/hast-util-from-html-isomorphic-2.0.0.tgz", + "integrity": "sha512-zJfpXq44yff2hmE0XmwEOzdWin5xwH+QIhMLOScpX91e/NSGPsAzNCvLQDIEPyO2TXi+lBmU6hjLIhV8MwP2kw==", + "requires": { + "@types/hast": "^3.0.0", + "hast-util-from-dom": "^5.0.0", + "hast-util-from-html": "^2.0.0", + "unist-util-remove-position": "^5.0.0" + } }, - "fsevents": { - "version": "2.3.3", - "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", - "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", - "dev": true, - "optional": true + "hast-util-from-parse5": { + "version": "8.0.3", + "resolved": "https://registry.npmjs.org/hast-util-from-parse5/-/hast-util-from-parse5-8.0.3.tgz", + "integrity": "sha512-3kxEVkEKt0zvcZ3hCRYI8rqrgwtlIOFMWkbclACvjlDw8Li9S2hk/d51OI0nr/gIpdMHNepwgOKqZ/sy0Clpyg==", + "requires": { + "@types/hast": "^3.0.0", + "@types/unist": "^3.0.0", + "devlop": "^1.0.0", + "hastscript": "^9.0.0", + "property-information": "^7.0.0", + "vfile": "^6.0.0", + "vfile-location": "^5.0.0", + "web-namespaces": "^2.0.0" + }, + "dependencies": { + "hast-util-parse-selector": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/hast-util-parse-selector/-/hast-util-parse-selector-4.0.0.tgz", + "integrity": 
"sha512-wkQCkSYoOGCRKERFWcxMVMOcYE2K1AaNLU8DXS9arxnLOUEWbOXKXiJUNzEpqZ3JOKpnha3jkFrumEjVliDe7A==", + "requires": { + "@types/hast": "^3.0.0" + } + }, + "hastscript": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/hastscript/-/hastscript-9.0.1.tgz", + "integrity": "sha512-g7df9rMFX/SPi34tyGCyUBREQoKkapwdY/T04Qn9TDWfHhAYt4/I0gMVirzK5wEzeUqIjEB+LXC/ypb7Aqno5w==", + "requires": { + "@types/hast": "^3.0.0", + "comma-separated-tokens": "^2.0.0", + "hast-util-parse-selector": "^4.0.0", + "property-information": "^7.0.0", + "space-separated-tokens": "^2.0.0" + } + } + } }, - "gensync": { - "version": "1.0.0-beta.2", - "resolved": "https://registry.npmjs.org/gensync/-/gensync-1.0.0-beta.2.tgz", - "integrity": "sha512-3hN7NaskYvMDLQY55gnW3NQ+mesEAepTqlg+VEbj7zzqEMBVNhzcGYYeqFo/TlYz6eQiFcp1HcsCZO+nGgS8zg==", - "dev": true + "hast-util-is-element": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/hast-util-is-element/-/hast-util-is-element-3.0.0.tgz", + "integrity": "sha512-Val9mnv2IWpLbNPqc/pUem+a7Ipj2aHacCwgNfTiK0vJKl0LF+4Ba4+v1oPHFpf3bLYmreq0/l3Gud9S5OH42g==", + "requires": { + "@types/hast": "^3.0.0" + } + }, + "hast-util-parse-selector": { + "version": "2.2.5", + "resolved": "https://registry.npmjs.org/hast-util-parse-selector/-/hast-util-parse-selector-2.2.5.tgz", + "integrity": "sha512-7j6mrk/qqkSehsM92wQjdIgWM2/BW61u/53G6xmC8i1OmEdKLHbk419QKQUjz6LglWsfqoiHmyMRkP1BGjecNQ==" }, "hast-util-to-jsx-runtime": { "version": "2.3.6", @@ -4830,6 +5915,17 @@ "vfile-message": "^4.0.0" } }, + "hast-util-to-text": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/hast-util-to-text/-/hast-util-to-text-4.0.2.tgz", + "integrity": "sha512-KK6y/BN8lbaq654j7JgBydev7wuNMcID54lkRav1P0CaE1e47P72AWWPiGKXTJU271ooYzcvTAn/Zt0REnvc7A==", + "requires": { + "@types/hast": "^3.0.0", + "@types/unist": "^3.0.0", + "hast-util-is-element": "^3.0.0", + "unist-util-find-after": "^5.0.0" + } + }, "hast-util-whitespace": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/hast-util-whitespace/-/hast-util-whitespace-3.0.0.tgz", @@ -4838,6 +5934,61 @@ "@types/hast": "^3.0.0" } }, + "hastscript": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/hastscript/-/hastscript-6.0.0.tgz", + "integrity": "sha512-nDM6bvd7lIqDUiYEiu5Sl/+6ReP0BMk/2f4U/Rooccxkj0P5nm+acM5PrGJ/t5I8qPGiqZSE6hVAwZEdZIvP4w==", + "requires": { + "@types/hast": "^2.0.0", + "comma-separated-tokens": "^1.0.0", + "hast-util-parse-selector": "^2.0.0", + "property-information": "^5.0.0", + "space-separated-tokens": "^1.0.0" + }, + "dependencies": { + "@types/hast": { + "version": "2.3.10", + "resolved": "https://registry.npmjs.org/@types/hast/-/hast-2.3.10.tgz", + "integrity": "sha512-McWspRw8xx8J9HurkVBfYj0xKoE25tOFlHGdx4MJ5xORQrMGZNqJhVQWaIbm6Oyla5kYOXtDiopzKRJzEOkwJw==", + "requires": { + "@types/unist": "^2" + } + }, + "@types/unist": { + "version": "2.0.11", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", + "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==" + }, + "comma-separated-tokens": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/comma-separated-tokens/-/comma-separated-tokens-1.0.8.tgz", + "integrity": "sha512-GHuDRO12Sypu2cV70d1dkA2EUmXHgntrzbpvOB+Qy+49ypNfGgFQIC2fhhXbnyrJRynDCAARsT7Ou0M6hirpfw==" + }, + "property-information": { + "version": "5.6.0", + "resolved": "https://registry.npmjs.org/property-information/-/property-information-5.6.0.tgz", + "integrity": 
"sha512-YUHSPk+A30YPv+0Qf8i9Mbfe/C0hdPXk1s1jPVToV8pk8BQtpw10ct89Eo7OWkutrwqvT0eicAxlOg3dOAu8JA==", + "requires": { + "xtend": "^4.0.0" + } + }, + "space-separated-tokens": { + "version": "1.1.5", + "resolved": "https://registry.npmjs.org/space-separated-tokens/-/space-separated-tokens-1.1.5.tgz", + "integrity": "sha512-q/JSVd1Lptzhf5bkYm4ob4iWPjx0KiRe3sRFBNrVqbJkFaBm5vbbowy1mymoPNLRa52+oadOhJ+K49wsSeSjTA==" + } + } + }, + "highlight.js": { + "version": "10.7.3", + "resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-10.7.3.tgz", + "integrity": "sha512-tzcUFauisWKNHaRkN4Wjl/ZA07gENAjFl3J/c480dprkGTg5EQstgaNFqBfUqCq54kZRIEcreTsAgF/m2quD7A==" + }, + "highlightjs-vue": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/highlightjs-vue/-/highlightjs-vue-1.0.0.tgz", + "integrity": "sha512-PDEfEF102G23vHmPhLyPboFCD+BkMGu+GuJe2d9/eH4FsCwvgBpnc9n0pGE+ffKdph38s6foEZiEjdgHdzp+IA==" + }, "html-url-attributes": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/html-url-attributes/-/html-url-attributes-3.0.1.tgz", @@ -4894,6 +6045,14 @@ "integrity": "sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==", "dev": true }, + "katex": { + "version": "0.16.45", + "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.45.tgz", + "integrity": "sha512-pQpZbdBu7wCTmQUh7ufPmLr0pFoObnGUoL/yhtwJDgmmQpbkg/0HSVti25Fu4rmd1oCR6NGWe9vqTWuWv3GcNA==", + "requires": { + "commander": "^8.3.0" + } + }, "longest-streak": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/longest-streak/-/longest-streak-3.1.0.tgz", @@ -4907,6 +6066,15 @@ "js-tokens": "^3.0.0 || ^4.0.0" } }, + "lowlight": { + "version": "1.20.0", + "resolved": "https://registry.npmjs.org/lowlight/-/lowlight-1.20.0.tgz", + "integrity": "sha512-8Ktj+prEb1RoCPkEOrPMYUN/nCggB7qAWe3a7OpMjWQkh3l2RD5wKRQ+o8Q8YuI9RG/xs95waaI/E6ym/7NsTw==", + "requires": { + "fault": "^1.0.0", + "highlight.js": "~10.7.0" + } + }, "lru-cache": { "version": "5.1.1", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", @@ -4925,6 +6093,22 @@ "@jridgewell/sourcemap-codec": "^1.5.5" } }, + "markdown-table": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/markdown-table/-/markdown-table-3.0.4.tgz", + "integrity": "sha512-wiYz4+JrLyb/DqW2hkFJxP7Vd7JuTDm77fvbM8VfEQdmSMqcImWeeRbHwZjBjIFki/VaMK2BhFi7oUUZeM5bqw==" + }, + "mdast-util-find-and-replace": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/mdast-util-find-and-replace/-/mdast-util-find-and-replace-3.0.2.tgz", + "integrity": "sha512-Tmd1Vg/m3Xz43afeNxDIhWRtFZgM2VLyaf4vSTYwudTyeuTneoL3qtWMA5jeLyz/O1vDJmmV4QuScFCA2tBPwg==", + "requires": { + "@types/mdast": "^4.0.0", + "escape-string-regexp": "^5.0.0", + "unist-util-is": "^6.0.0", + "unist-util-visit-parents": "^6.0.0" + } + }, "mdast-util-from-markdown": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/mdast-util-from-markdown/-/mdast-util-from-markdown-2.0.3.tgz", @@ -4944,6 +6128,91 @@ "unist-util-stringify-position": "^4.0.0" } }, + "mdast-util-gfm": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm/-/mdast-util-gfm-3.1.0.tgz", + "integrity": "sha512-0ulfdQOM3ysHhCJ1p06l0b0VKlhU0wuQs3thxZQagjcjPrlFRqY215uZGHHJan9GEAXd9MbfPjFJz+qMkVR6zQ==", + "requires": { + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-gfm-autolink-literal": "^2.0.0", + "mdast-util-gfm-footnote": "^2.0.0", + "mdast-util-gfm-strikethrough": "^2.0.0", + "mdast-util-gfm-table": "^2.0.0", + "mdast-util-gfm-task-list-item": 
"^2.0.0", + "mdast-util-to-markdown": "^2.0.0" + } + }, + "mdast-util-gfm-autolink-literal": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-autolink-literal/-/mdast-util-gfm-autolink-literal-2.0.1.tgz", + "integrity": "sha512-5HVP2MKaP6L+G6YaxPNjuL0BPrq9orG3TsrZ9YXbA3vDw/ACI4MEsnoDpn6ZNm7GnZgtAcONJyPhOP8tNJQavQ==", + "requires": { + "@types/mdast": "^4.0.0", + "ccount": "^2.0.0", + "devlop": "^1.0.0", + "mdast-util-find-and-replace": "^3.0.0", + "micromark-util-character": "^2.0.0" + } + }, + "mdast-util-gfm-footnote": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-footnote/-/mdast-util-gfm-footnote-2.1.0.tgz", + "integrity": "sha512-sqpDWlsHn7Ac9GNZQMeUzPQSMzR6Wv0WKRNvQRg0KqHh02fpTz69Qc1QSseNX29bhz1ROIyNyxExfawVKTm1GQ==", + "requires": { + "@types/mdast": "^4.0.0", + "devlop": "^1.1.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0", + "micromark-util-normalize-identifier": "^2.0.0" + } + }, + "mdast-util-gfm-strikethrough": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-strikethrough/-/mdast-util-gfm-strikethrough-2.0.0.tgz", + "integrity": "sha512-mKKb915TF+OC5ptj5bJ7WFRPdYtuHv0yTRxK2tJvi+BDqbkiG7h7u/9SI89nRAYcmap2xHQL9D+QG/6wSrTtXg==", + "requires": { + "@types/mdast": "^4.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0" + } + }, + "mdast-util-gfm-table": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-table/-/mdast-util-gfm-table-2.0.0.tgz", + "integrity": "sha512-78UEvebzz/rJIxLvE7ZtDd/vIQ0RHv+3Mh5DR96p7cS7HsBhYIICDBCu8csTNWNO6tBWfqXPWekRuj2FNOGOZg==", + "requires": { + "@types/mdast": "^4.0.0", + "devlop": "^1.0.0", + "markdown-table": "^3.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0" + } + }, + "mdast-util-gfm-task-list-item": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-task-list-item/-/mdast-util-gfm-task-list-item-2.0.0.tgz", + "integrity": "sha512-IrtvNvjxC1o06taBAVJznEnkiHxLFTzgonUdy8hzFVeDun0uTjxxrRGVaNFqkU1wJR3RBPEfsxmU6jDWPofrTQ==", + "requires": { + "@types/mdast": "^4.0.0", + "devlop": "^1.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0" + } + }, + "mdast-util-math": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/mdast-util-math/-/mdast-util-math-3.0.0.tgz", + "integrity": "sha512-Tl9GBNeG/AhJnQM221bJR2HPvLOSnLE/T9cJI9tlc6zwQk2nPk/4f0cHkOdEixQPC/j8UtKDdITswvLAy1OZ1w==", + "requires": { + "@types/hast": "^3.0.0", + "@types/mdast": "^4.0.0", + "devlop": "^1.0.0", + "longest-streak": "^3.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.1.0", + "unist-util-remove-position": "^5.0.0" + } + }, "mdast-util-mdx-expression": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/mdast-util-mdx-expression/-/mdast-util-mdx-expression-2.0.1.tgz", @@ -5085,6 +6354,106 @@ "micromark-util-types": "^2.0.0" } }, + "micromark-extension-gfm": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm/-/micromark-extension-gfm-3.0.0.tgz", + "integrity": "sha512-vsKArQsicm7t0z2GugkCKtZehqUm31oeGBV/KVSorWSy8ZlNAv7ytjFhvaryUiCUJYqs+NoE6AFhpQvBTM6Q4w==", + "requires": { + "micromark-extension-gfm-autolink-literal": "^2.0.0", + "micromark-extension-gfm-footnote": "^2.0.0", + "micromark-extension-gfm-strikethrough": "^2.0.0", + "micromark-extension-gfm-table": "^2.0.0", + "micromark-extension-gfm-tagfilter": 
"^2.0.0", + "micromark-extension-gfm-task-list-item": "^2.0.0", + "micromark-util-combine-extensions": "^2.0.0", + "micromark-util-types": "^2.0.0" + } + }, + "micromark-extension-gfm-autolink-literal": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-autolink-literal/-/micromark-extension-gfm-autolink-literal-2.1.0.tgz", + "integrity": "sha512-oOg7knzhicgQ3t4QCjCWgTmfNhvQbDDnJeVu9v81r7NltNCVmhPy1fJRX27pISafdjL+SVc4d3l48Gb6pbRypw==", + "requires": { + "micromark-util-character": "^2.0.0", + "micromark-util-sanitize-uri": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + } + }, + "micromark-extension-gfm-footnote": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-footnote/-/micromark-extension-gfm-footnote-2.1.0.tgz", + "integrity": "sha512-/yPhxI1ntnDNsiHtzLKYnE3vf9JZ6cAisqVDauhp4CEHxlb4uoOTxOCJ+9s51bIB8U1N1FJ1RXOKTIlD5B/gqw==", + "requires": { + "devlop": "^1.0.0", + "micromark-core-commonmark": "^2.0.0", + "micromark-factory-space": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-normalize-identifier": "^2.0.0", + "micromark-util-sanitize-uri": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + } + }, + "micromark-extension-gfm-strikethrough": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-strikethrough/-/micromark-extension-gfm-strikethrough-2.1.0.tgz", + "integrity": "sha512-ADVjpOOkjz1hhkZLlBiYA9cR2Anf8F4HqZUO6e5eDcPQd0Txw5fxLzzxnEkSkfnD0wziSGiv7sYhk/ktvbf1uw==", + "requires": { + "devlop": "^1.0.0", + "micromark-util-chunked": "^2.0.0", + "micromark-util-classify-character": "^2.0.0", + "micromark-util-resolve-all": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + } + }, + "micromark-extension-gfm-table": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-table/-/micromark-extension-gfm-table-2.1.1.tgz", + "integrity": "sha512-t2OU/dXXioARrC6yWfJ4hqB7rct14e8f7m0cbI5hUmDyyIlwv5vEtooptH8INkbLzOatzKuVbQmAYcbWoyz6Dg==", + "requires": { + "devlop": "^1.0.0", + "micromark-factory-space": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + } + }, + "micromark-extension-gfm-tagfilter": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-tagfilter/-/micromark-extension-gfm-tagfilter-2.0.0.tgz", + "integrity": "sha512-xHlTOmuCSotIA8TW1mDIM6X2O1SiX5P9IuDtqGonFhEK0qgRI4yeC6vMxEV2dgyr2TiD+2PQ10o+cOhdVAcwfg==", + "requires": { + "micromark-util-types": "^2.0.0" + } + }, + "micromark-extension-gfm-task-list-item": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-task-list-item/-/micromark-extension-gfm-task-list-item-2.1.0.tgz", + "integrity": "sha512-qIBZhqxqI6fjLDYFTBIa4eivDMnP+OZqsNwmQ3xNLE4Cxwc+zfQEfbs6tzAo2Hjq+bh6q5F+Z8/cksrLFYWQQw==", + "requires": { + "devlop": "^1.0.0", + "micromark-factory-space": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + } + }, + "micromark-extension-math": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-math/-/micromark-extension-math-3.1.0.tgz", + "integrity": "sha512-lvEqd+fHjATVs+2v/8kg9i5Q0AP2k85H0WUOwpIVvUML8BapsMvh1XAogmQjOCsLpoKRCVQqEkQBB3NhVBcsOg==", + "requires": { + "@types/katex": "^0.16.0", + "devlop": "^1.0.0", 
+ "katex": "^0.16.0", + "micromark-factory-space": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + } + }, "micromark-factory-destination": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/micromark-factory-destination/-/micromark-factory-destination-2.0.1.tgz", @@ -5293,6 +6662,14 @@ } } }, + "parse5": { + "version": "7.3.0", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.3.0.tgz", + "integrity": "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==", + "requires": { + "entities": "^6.0.0" + } + }, "pathe": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/pathe/-/pathe-2.0.3.tgz", @@ -5322,6 +6699,11 @@ "source-map-js": "^1.2.1" } }, + "prismjs": { + "version": "1.30.0", + "resolved": "https://registry.npmjs.org/prismjs/-/prismjs-1.30.0.tgz", + "integrity": "sha512-DEvV2ZF2r2/63V+tK8hQvrR2ZGn10srHbXviTlcv7Kpzw8jWiNTqbVgjO3IY8RxrrOUF8VPMQQFysYYYv0YZxw==" + }, "property-information": { "version": "7.1.0", "resolved": "https://registry.npmjs.org/property-information/-/property-information-7.1.0.tgz", @@ -5368,6 +6750,126 @@ "integrity": "sha512-QgT5//D3jfjJb6Gsjxv0Slpj23ip+HtOpnNgnb2S5zU3CB26G/IDPGoy4RJB42wzFE46DRsstbW6tKHoKbhAxw==", "dev": true }, + "react-syntax-highlighter": { + "version": "15.6.6", + "resolved": "https://registry.npmjs.org/react-syntax-highlighter/-/react-syntax-highlighter-15.6.6.tgz", + "integrity": "sha512-DgXrc+AZF47+HvAPEmn7Ua/1p10jNoVZVI/LoPiYdtY+OM+/nG5yefLHKJwdKqY1adMuHFbeyBaG9j64ML7vTw==", + "requires": { + "@babel/runtime": "^7.3.1", + "highlight.js": "^10.4.1", + "highlightjs-vue": "^1.0.0", + "lowlight": "^1.17.0", + "prismjs": "^1.30.0", + "refractor": "^3.6.0" + } + }, + "refractor": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/refractor/-/refractor-3.6.0.tgz", + "integrity": "sha512-MY9W41IOWxxk31o+YvFCNyNzdkc9M20NoZK5vq6jkv4I/uh2zkWcfudj0Q1fovjUQJrNewS9NMzeTtqPf+n5EA==", + "requires": { + "hastscript": "^6.0.0", + "parse-entities": "^2.0.0", + "prismjs": "~1.27.0" + }, + "dependencies": { + "character-entities": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/character-entities/-/character-entities-1.2.4.tgz", + "integrity": "sha512-iBMyeEHxfVnIakwOuDXpVkc54HijNgCyQB2w0VfGQThle6NXn50zU6V/u+LDhxHcDUPojn6Kpga3PTAD8W1bQw==" + }, + "character-entities-legacy": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/character-entities-legacy/-/character-entities-legacy-1.1.4.tgz", + "integrity": "sha512-3Xnr+7ZFS1uxeiUDvV02wQ+QDbc55o97tIV5zHScSPJpcLm/r0DFPcoY3tYRp+VZukxuMeKgXYmsXQHO05zQeA==" + }, + "character-reference-invalid": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/character-reference-invalid/-/character-reference-invalid-1.1.4.tgz", + "integrity": "sha512-mKKUkUbhPpQlCOfIuZkvSEgktjPFIsZKRRbC6KWVEMvlzblj3i3asQv5ODsrwt0N3pHAEvjP8KTQPHkp0+6jOg==" + }, + "is-alphabetical": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/is-alphabetical/-/is-alphabetical-1.0.4.tgz", + "integrity": "sha512-DwzsA04LQ10FHTZuL0/grVDk4rFoVH1pjAToYwBrHSxcrBIGQuXrQMtD5U1b0U2XVgKZCTLLP8u2Qxqhy3l2Vg==" + }, + "is-alphanumerical": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/is-alphanumerical/-/is-alphanumerical-1.0.4.tgz", + "integrity": "sha512-UzoZUr+XfVz3t3v4KyGEniVL9BDRoQtY7tOyrRybkVNjDFWyo1yhXNGrrBTQxp3ib9BLAWs7k2YKBQsFRkZG9A==", + "requires": { + "is-alphabetical": "^1.0.0", + "is-decimal": "^1.0.0" + } + }, + 
"is-decimal": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/is-decimal/-/is-decimal-1.0.4.tgz", + "integrity": "sha512-RGdriMmQQvZ2aqaQq3awNA6dCGtKpiDFcOzrTWrDAT2MiWrKQVPmxLGHl7Y2nNu6led0kEyoX0enY0qXYsv9zw==" + }, + "is-hexadecimal": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/is-hexadecimal/-/is-hexadecimal-1.0.4.tgz", + "integrity": "sha512-gyPJuv83bHMpocVYoqof5VDiZveEoGoFL8m3BXNb2VW8Xs+rz9kqO8LOQ5DH6EsuvilT1ApazU0pyl+ytbPtlw==" + }, + "parse-entities": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/parse-entities/-/parse-entities-2.0.0.tgz", + "integrity": "sha512-kkywGpCcRYhqQIchaWqZ875wzpS/bMKhz5HnN3p7wveJTkTtyAB/AlnS0f8DFSqYW1T82t6yEAkEcB+A1I3MbQ==", + "requires": { + "character-entities": "^1.0.0", + "character-entities-legacy": "^1.0.0", + "character-reference-invalid": "^1.0.0", + "is-alphanumerical": "^1.0.0", + "is-decimal": "^1.0.0", + "is-hexadecimal": "^1.0.0" + } + }, + "prismjs": { + "version": "1.27.0", + "resolved": "https://registry.npmjs.org/prismjs/-/prismjs-1.27.0.tgz", + "integrity": "sha512-t13BGPUlFDR7wRB5kQDG4jjl7XeuH6jbJGt11JHPL96qwsEHNX2+68tFXqc1/k+/jALsbSWJKUOT/hcYAZ5LkA==" + } + } + }, + "rehype-katex": { + "version": "7.0.1", + "resolved": "https://registry.npmjs.org/rehype-katex/-/rehype-katex-7.0.1.tgz", + "integrity": "sha512-OiM2wrZ/wuhKkigASodFoo8wimG3H12LWQaH8qSPVJn9apWKFSH3YOCtbKpBorTVw/eI7cuT21XBbvwEswbIOA==", + "requires": { + "@types/hast": "^3.0.0", + "@types/katex": "^0.16.0", + "hast-util-from-html-isomorphic": "^2.0.0", + "hast-util-to-text": "^4.0.0", + "katex": "^0.16.0", + "unist-util-visit-parents": "^6.0.0", + "vfile": "^6.0.0" + } + }, + "remark-gfm": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/remark-gfm/-/remark-gfm-4.0.1.tgz", + "integrity": "sha512-1quofZ2RQ9EWdeN34S79+KExV1764+wCUGop5CPL1WGdD0ocPpu91lzPGbwWMECpEpd42kJGQwzRfyov9j4yNg==", + "requires": { + "@types/mdast": "^4.0.0", + "mdast-util-gfm": "^3.0.0", + "micromark-extension-gfm": "^3.0.0", + "remark-parse": "^11.0.0", + "remark-stringify": "^11.0.0", + "unified": "^11.0.0" + } + }, + "remark-math": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/remark-math/-/remark-math-6.0.0.tgz", + "integrity": "sha512-MMqgnP74Igy+S3WwnhQ7kqGlEerTETXMvJhrUzDikVZ2/uogJCb+WHUg97hK9/jcfc0dkD73s3LN8zU49cTEtA==", + "requires": { + "@types/mdast": "^4.0.0", + "mdast-util-math": "^3.0.0", + "micromark-extension-math": "^3.0.0", + "unified": "^11.0.0" + } + }, "remark-parse": { "version": "11.0.0", "resolved": "https://registry.npmjs.org/remark-parse/-/remark-parse-11.0.0.tgz", @@ -5391,6 +6893,16 @@ "vfile": "^6.0.0" } }, + "remark-stringify": { + "version": "11.0.0", + "resolved": "https://registry.npmjs.org/remark-stringify/-/remark-stringify-11.0.0.tgz", + "integrity": "sha512-1OSmLd3awB/t8qdoEOMazZkNsfVTeY4fTsgzcQFdXNq8ToTN4ZGwrMnlda4K6smTFKD+GRV6O48i6Z4iKgPPpw==", + "requires": { + "@types/mdast": "^4.0.0", + "mdast-util-to-markdown": "^2.0.0", + "unified": "^11.0.0" + } + }, "rollup": { "version": "4.60.1", "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.60.1.tgz", @@ -5552,6 +7064,15 @@ "vfile": "^6.0.0" } }, + "unist-util-find-after": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/unist-util-find-after/-/unist-util-find-after-5.0.0.tgz", + "integrity": "sha512-amQa0Ep2m6hE2g72AugUItjbuM8X8cGQnFoHk0pGfrFeT9GZhzN5SW8nRsiGKK7Aif4CrACPENkA6P/Lw6fHGQ==", + "requires": { + "@types/unist": "^3.0.0", + "unist-util-is": "^6.0.0" + } + }, "unist-util-is": { "version": 
"6.0.1", "resolved": "https://registry.npmjs.org/unist-util-is/-/unist-util-is-6.0.1.tgz", @@ -5568,6 +7089,15 @@ "@types/unist": "^3.0.0" } }, + "unist-util-remove-position": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/unist-util-remove-position/-/unist-util-remove-position-5.0.0.tgz", + "integrity": "sha512-Hp5Kh3wLxv0PHj9m2yZhhLt58KzPtEYKQQ4yxfYFEO7EvHwzyDYnduhHnY1mDxoqr7VUwVuHXk9RXKIiYS1N8Q==", + "requires": { + "@types/unist": "^3.0.0", + "unist-util-visit": "^5.0.0" + } + }, "unist-util-stringify-position": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-4.0.0.tgz", @@ -5614,6 +7144,15 @@ "vfile-message": "^4.0.0" } }, + "vfile-location": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/vfile-location/-/vfile-location-5.0.3.tgz", + "integrity": "sha512-5yXvWDEgqeiYiBe1lbxYF7UMAIm/IcopxMHrMQDq3nvKcjPKIhZklUKL+AE7J7uApI4kwe2snsK+eI6UTj9EHg==", + "requires": { + "@types/unist": "^3.0.0", + "vfile": "^6.0.0" + } + }, "vfile-message": { "version": "4.0.3", "resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-4.0.3.tgz", @@ -5666,6 +7205,11 @@ "why-is-node-running": "^2.3.0" } }, + "web-namespaces": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/web-namespaces/-/web-namespaces-2.0.1.tgz", + "integrity": "sha512-bKr1DkiNa2krS7qxNtdrtHAmzuYGFQLiQ13TsorsdT6ULTkPLKuu5+GsFpDlg6JFjUTwX2DyhMPG2be8uPrqsQ==" + }, "why-is-node-running": { "version": "2.3.0", "resolved": "https://registry.npmjs.org/why-is-node-running/-/why-is-node-running-2.3.0.tgz", @@ -5676,6 +7220,11 @@ "stackback": "0.0.2" } }, + "xtend": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", + "integrity": "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==" + }, "yallist": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz", diff --git a/package.json b/package.json index 07071e0..7432e8f 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "chaosengine-desktop", "private": true, - "version": "0.7.2", + "version": "0.7.4", "type": "module", "scripts": { "dev": "vite", @@ -20,14 +20,20 @@ "@tauri-apps/plugin-opener": "^2.5.3", "@tauri-apps/plugin-process": "^2.0.0", "@tauri-apps/plugin-updater": "^2.0.0", + "katex": "^0.16.45", "react": "^18.3.1", "react-dom": "^18.3.1", - "react-markdown": "^10.1.0" + "react-markdown": "^10.1.0", + "react-syntax-highlighter": "^15.6.6", + "rehype-katex": "^7.0.1", + "remark-gfm": "^4.0.1", + "remark-math": "^6.0.0" }, "devDependencies": { "@tauri-apps/cli": "^2.1.0", "@types/react": "^18.3.12", "@types/react-dom": "^18.3.1", + "@types/react-syntax-highlighter": "^15.5.13", "@vitejs/plugin-react": "^5.1.0", "typescript": "^5.6.3", "vite": "^7.3.2", diff --git a/pyproject.toml b/pyproject.toml index 6e93ee3..d0780f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta:__legacy__" [project] name = "chaosengine-ai" -version = "0.6.3" +version = "0.7.4" description = "Local AI model runner with pluggable cache/compression strategies" readme = "README.md" license = {text = "Apache-2.0"} @@ -23,12 +23,26 @@ mlx-lm = [ "gguf>=0.18.0", "mlx-lm>=0.22.0", ] +# Apple Silicon vision-language runtime (Blaizzy/mlx-vlm). Loads +# multimodal MLX models like Gemma 4, Qwen2.5-VL, LLaVA, etc. and +# routes images + audio through the matching processors. 
Wired in +# ``backend_service/mlx_worker.py`` via ``is_multimodal_family`` +# detection — the worker swaps from mlx_lm.load → mlx_vlm.load when +# a multimodal repo prefix is hit. Pulls mlx + transformers + Pillow +# transitively; ~150 MB extra in the venv. ``torchvision`` is needed +# by HF's Qwen2VLVideoProcessor (loaded transitively by Qwen2.5-VL +# AutoProcessor); without it ``mlx_vlm.load`` raises ImportError on +# the Qwen2.5-VL family during processor build. +mlx-vlm = [ + "mlx-vlm>=0.4.0", + "torchvision>=0.20", +] triattention = ["triattention @ git+https://github.com/WeianMao/triattention.git", "vllm>=0.8.0"] triattention-mlx = ["triattention @ git+https://github.com/WeianMao/triattention.git", "mlx-lm>=0.22.0"] rotorquant = ["turboquant>=0.2.0"] -turboquant = ["turboquant-mlx-full>=0.1.3"] +turboquant = ["turboquant-mlx-full>=0.3.0"] vllm = ["vllm>=0.8.0"] -dflash-mlx = ["dflash-mlx @ git+https://github.com/bstnxbt/dflash-mlx.git@f825ffb268e50d531e8b6524413b0847334a14dd"] +dflash-mlx = ["dflash-mlx @ git+https://github.com/bstnxbt/dflash-mlx.git@8d8545d791383008b5e2b1e738c38a7a73ba484e"] dflash = ["dflash>=0.1.0"] desktop = [ "fastapi>=0.115.0", @@ -40,29 +54,41 @@ desktop = [ ] images = [ "accelerate>=0.34.0", - "diffusers>=0.30.0", + "diffusers>=0.38.0", "huggingface-hub>=0.26.0", "pillow>=10.4.0", "safetensors>=0.4.5", "torch>=2.4.0", ] -# Diffusion cache acceleration. The TeaCache strategy scaffold ships in -# cache_compression/ without a runtime dependency; upstream ali-vilab/TeaCache -# is distributed as a repo of per-model patches, not a pip package, so we -# vendor the ``teacache_forward`` functions into cache_compression/_teacache_patches/ -# under Apache 2.0 as each model lands (FLUX, Wan2.1 first — see FU-007). -# This extra exists so the Setup page can pin the minimum diffusers version -# known to work with our vendored patches without bumping the core ``images`` -# extra that non-diffusion installs pull in. +# Diffusion cache acceleration. Multiple strategies live here: +# 1. TeaCache (vendored per-model forwards under cache_compression/ +# _teacache_patches/ — FLUX, HunyuanVideo, LTX-Video, CogVideoX, Mochi). +# 2. First Block Cache (FU-015) — diffusers 0.36+ ships +# ``apply_first_block_cache`` as a model-agnostic hook, so it covers +# every DiT (FLUX, SD3, Wan, HunyuanVideo, LTX, CogVideoX, Mochi) +# without per-model vendoring. Obsoletes the original FU-007 Wan +# TeaCache port. +# 3. TaylorSeer / MagCache / PyramidAttentionBroadcast / FasterCache +# (post-FU-026) — all four configs ship in diffusers 0.38 core and +# attach via ``pipeline.transformer.enable_cache(config)``. No extra +# pip dep beyond diffusers. +# Pin diffusers >=0.38 so the full cache-hooks set is available. diffusion-accel = [ - "diffusers>=0.30.0", + "diffusers>=0.38.0", ] # Apple Silicon MLX video runtime (Blaizzy/mlx-video) — MIT. Covers Wan2.1 # (1.3B/14B), Wan2.2 (T2V-14B, TI2V-5B, I2V-14B), LTX-2 (19B) with T2V, I2V, # and A2V. The engine is a subprocess wrapper (like mflux for image), so the # dependency is only pulled in when the user opts into the Mac-native video # path on Apple Silicon (FU-009). -mlx-video = ["mlx-video"] +# +# IMPORTANT: install from GIT, not PyPI. PyPI's ``mlx-video==0.1.0`` is an +# unrelated 0.1.0 utilities package (just ``load``/``normalize``/``resize``/ +# ``to_float``) — does NOT ship the LTX-2 / Wan / HunyuanVideo generation +# entrypoints we wrap. Blaizzy's repo lives only on GitHub; pin by branch so +# new model entries (Wan2.2-Distill, LTX-2.3, etc.) 
land without needing a +# PyPI release every time. +mlx-video = ["mlx-video @ git+https://github.com/Blaizzy/mlx-video.git"] [tool.pytest.ini_options] testpaths = ["tests"] diff --git a/scripts/build-llama-turbo.ps1 b/scripts/build-llama-turbo.ps1 new file mode 100644 index 0000000..af264f6 --- /dev/null +++ b/scripts/build-llama-turbo.ps1 @@ -0,0 +1,165 @@ +#!/usr/bin/env pwsh +# Windows PowerShell port of build-llama-turbo.sh. +# +# Build llama-server-turbo from the TheTom/llama-cpp-turboquant fork. +# This fork extends standard llama-server with extra KV cache quantization +# types (iso3/4, planar3/4, turbo2/3/4) required by the RotorQuant and +# TurboQuant cache strategies, while staying compatible with all standard +# cache types. +# +# The binary is installed as ``llama-server-turbo.exe`` into +# %USERPROFILE%\.chaosengine\bin\ alongside the standard ``llama-server.exe`` +# so ChaosEngineAI auto-detects it at runtime. +# +# Usage: +# .\scripts\build-llama-turbo.ps1 +# +# Prerequisites: +# * Visual Studio 2022 Build Tools (cmake + MSVC C++) +# * Git for Windows +# * Optional: CUDA Toolkit 12+ for the GGML_CUDA build path +# +# Environment variables: +# LLAMA_TURBO_DIR Source checkout dir (default: $env:TEMP\llama-cpp-turboquant) +# CHAOSENGINE_BIN_DIR Install destination (default: $HOME\.chaosengine\bin) +# LLAMA_TURBO_BRANCH Git branch to build (default: feature/turboquant-kv-cache) +# LLAMA_TURBO_JOBS Parallel build jobs (default: $env:NUMBER_OF_PROCESSORS) +# CHAOSENGINE_LLAMA_TURBO_NO_CUDA Set to 1 to force CPU-only build even when CUDA is present. + +$ErrorActionPreference = "Stop" + +# Shared MSVC/CUDA CMake helpers (Resolve-CmakeWindowsBuildContext, +# Sync-CudaVsIntegration, Get-CmakeWindowsConfigureArgs, +# Invoke-CmakeStaleCacheWipe). Same logic also drives build-sdcpp.ps1. +. (Join-Path $PSScriptRoot "lib\windows-msvc-cuda.ps1") + +function Assert-LastExit { + param([string]$Step) + if ($LASTEXITCODE -ne 0) { + throw "$Step failed (exit $LASTEXITCODE)" + } +} + +$TurboRepo = "https://github.com/TheTom/llama-cpp-turboquant.git" +$TurboBranch = if ($env:LLAMA_TURBO_BRANCH) { $env:LLAMA_TURBO_BRANCH } else { "feature/turboquant-kv-cache" } +$TurboDir = if ($env:LLAMA_TURBO_DIR) { $env:LLAMA_TURBO_DIR } else { Join-Path $env:TEMP "llama-cpp-turboquant" } +$InstallDir = if ($env:CHAOSENGINE_BIN_DIR) { $env:CHAOSENGINE_BIN_DIR } else { Join-Path $HOME ".chaosengine\bin" } +$Jobs = if ($env:LLAMA_TURBO_JOBS) { $env:LLAMA_TURBO_JOBS } else { $env:NUMBER_OF_PROCESSORS } +if (-not $Jobs) { $Jobs = "4" } + +Write-Host "==> llama-server-turbo builder (Windows)" +Write-Host " repo: $TurboRepo" +Write-Host " branch: $TurboBranch" +Write-Host " source: $TurboDir" +Write-Host " install: $InstallDir" +Write-Host " jobs: $Jobs" +Write-Host "" + +# Clone or update the source checkout +if (Test-Path (Join-Path $TurboDir ".git")) { + Write-Host "==> updating existing checkout" + Push-Location $TurboDir + git fetch --all --prune + Assert-LastExit "git fetch" + git checkout $TurboBranch + Assert-LastExit "git checkout" + git reset --hard "origin/$TurboBranch" + Assert-LastExit "git reset" +} else { + Write-Host "==> cloning $TurboRepo (branch: $TurboBranch)" + git clone --branch $TurboBranch $TurboRepo $TurboDir + Assert-LastExit "git clone" + Push-Location $TurboDir +} + +try { + # CMake flags. Static link mirrors the .sh shape so the installed + # binary doesn't drag a .dll trail. CUDA is opt-in: detected via + # ``nvcc`` on PATH unless CHAOSENGINE_LLAMA_TURBO_NO_CUDA is set. 
+ $cmakeFlags = @( + "-DCMAKE_BUILD_TYPE=Release", + "-DBUILD_SHARED_LIBS=OFF" + ) + $forceNoCuda = $env:CHAOSENGINE_LLAMA_TURBO_NO_CUDA -eq "1" + $hasCuda = -not $forceNoCuda -and (Get-Command nvcc -ErrorAction SilentlyContinue) + if ($hasCuda) { + Write-Host "==> CUDA detected (nvcc on PATH); enabling GGML_CUDA" + $cmakeFlags += "-DGGML_CUDA=ON" + } else { + Write-Host "==> CUDA not detected (or disabled); building CPU-only" + } + + # Resolve generator + VS install (handles isComplete=0 installs, + # builds CMAKE_GENERATOR_INSTANCE override, etc.). Throws with an + # install link if MSVC isn't present. + $buildCtx = Resolve-CmakeWindowsBuildContext ` + -ProductLabel "llama-server-turbo" ` + -GeneratorEnv "CHAOSENGINE_LLAMA_TURBO_GENERATOR" + Write-Host "==> cmake generator: $($buildCtx.Generator)" + + # CMake's CUDA detection needs the CUDA installer's MSBuild .props/ + # .targets files copied into VS. Sync them now if they're missing + # (UAC-elevated copy when Program Files isn't writable). + $cudaIntegrationJustCopied = $false + if ($hasCuda -and $buildCtx.VsInstance) { + $cudaIntegrationJustCopied = Sync-CudaVsIntegration -VsRoot $buildCtx.VsInstance + } + + Invoke-CmakeStaleCacheWipe -Generator $buildCtx.Generator ` + -CudaIntegrationJustCopied $cudaIntegrationJustCopied + + $configureArgs = Get-CmakeWindowsConfigureArgs -Context $buildCtx -ExtraFlags $cmakeFlags + + Write-Host "==> cmake configure" + cmake @configureArgs + Assert-LastExit "cmake configure" + + Write-Host "==> building llama-server + llama-cli" + cmake --build build --config Release -j $Jobs --target llama-server llama-cli + Assert-LastExit "cmake build" + + # MSVC drops .exe artefacts under build\bin\Release\ on multi-config + # generators (the default on Windows). Single-config Ninja drops + # them under build\bin\. Probe both. + $candidates = @( + "build\bin\Release\llama-server.exe", + "build\bin\llama-server.exe" + ) + $serverExe = $null + foreach ($candidate in $candidates) { + if (Test-Path $candidate) { $serverExe = $candidate; break } + } + if (-not $serverExe) { + throw "llama-server.exe not found under build\bin -- check build output." + } + $cliExe = $serverExe.Replace("llama-server.exe", "llama-cli.exe") + + if (-not (Test-Path $InstallDir)) { + New-Item -ItemType Directory -Force -Path $InstallDir | Out-Null + } + Write-Host "==> installing to $InstallDir" + Copy-Item $serverExe (Join-Path $InstallDir "llama-server-turbo.exe") -Force + if (Test-Path $cliExe) { + Copy-Item $cliExe (Join-Path $InstallDir "llama-cli-turbo.exe") -Force + } + + # Version tracking. Same shape as the .sh so the same Setup-page + # detector works on both platforms. + $commit = (git rev-parse HEAD).Trim() + $versionFile = Join-Path $InstallDir "llama-server-turbo.version" + @( + $commit, + $TurboBranch, + ((Get-Date).ToUniversalTime().ToString("yyyy-MM-ddTHH:mm:ssZ")) + ) | Set-Content -Path $versionFile -Encoding ascii + Write-Host "==> version tracked in $versionFile" +} +finally { + Pop-Location +} + +Write-Host "" +Write-Host "==> build complete" +Write-Host "llama-server-turbo installed to $InstallDir\llama-server-turbo.exe" +Write-Host "ChaosEngineAI will auto-detect it on next model load." +Write-Host "Restart the app if it is currently running." diff --git a/scripts/build-sdcpp.ps1 b/scripts/build-sdcpp.ps1 new file mode 100644 index 0000000..b7ddbe2 --- /dev/null +++ b/scripts/build-sdcpp.ps1 @@ -0,0 +1,153 @@ +#!/usr/bin/env pwsh +# Windows PowerShell port of build-sdcpp.sh. 
+# +# Build the ``sd`` CLI binary from leejet/stable-diffusion.cpp (FU-008). +# Cross-platform diffusion runtime: SD 1.x/2.x/XL, FLUX.1/2, Wan 2.1 / 2.2 +# video, Qwen Image, Z-Image. Wired into ChaosEngineAI as a subprocess +# engine via ``backend_service/sdcpp_video_runtime.py``. +# +# Usage: +# .\scripts\build-sdcpp.ps1 +# +# Prerequisites: +# * Visual Studio 2022 Build Tools (cmake + MSVC C++) +# * Git for Windows +# * Optional: CUDA Toolkit 12+ for the SD_CUBLAS build path +# +# Environment variables: +# SDCPP_DIR Source checkout dir (default: $env:TEMP\stable-diffusion.cpp) +# CHAOSENGINE_BIN_DIR Install destination (default: $HOME\.chaosengine\bin) +# SDCPP_BRANCH Git branch to build (default: master) +# SDCPP_JOBS Parallel build jobs (default: $env:NUMBER_OF_PROCESSORS) +# CHAOSENGINE_SDCPP_NO_CUDA Set to 1 to force CPU-only build even when CUDA is present. + +$ErrorActionPreference = "Stop" + +# Shared MSVC/CUDA CMake helpers (Resolve-CmakeWindowsBuildContext, +# Sync-CudaVsIntegration, Get-CmakeWindowsConfigureArgs, +# Invoke-CmakeStaleCacheWipe). Same logic also drives build-llama-turbo.ps1. +. (Join-Path $PSScriptRoot "lib\windows-msvc-cuda.ps1") + +function Assert-LastExit { + param([string]$Step) + if ($LASTEXITCODE -ne 0) { + throw "$Step failed (exit $LASTEXITCODE)" + } +} + +$SdcppRepo = "https://github.com/leejet/stable-diffusion.cpp.git" +$SdcppBranch = if ($env:SDCPP_BRANCH) { $env:SDCPP_BRANCH } else { "master" } +$SdcppDir = if ($env:SDCPP_DIR) { $env:SDCPP_DIR } else { Join-Path $env:TEMP "stable-diffusion.cpp" } +$InstallDir = if ($env:CHAOSENGINE_BIN_DIR) { $env:CHAOSENGINE_BIN_DIR } else { Join-Path $HOME ".chaosengine\bin" } +$Jobs = if ($env:SDCPP_JOBS) { $env:SDCPP_JOBS } else { $env:NUMBER_OF_PROCESSORS } +if (-not $Jobs) { $Jobs = "4" } + +Write-Host "==> stable-diffusion.cpp builder (Windows)" +Write-Host " repo: $SdcppRepo" +Write-Host " branch: $SdcppBranch" +Write-Host " source: $SdcppDir" +Write-Host " install: $InstallDir" +Write-Host " jobs: $Jobs" +Write-Host "" + +if (Test-Path (Join-Path $SdcppDir ".git")) { + Write-Host "==> updating existing checkout" + Push-Location $SdcppDir + git fetch --all --prune + Assert-LastExit "git fetch" + git checkout $SdcppBranch + Assert-LastExit "git checkout" + git reset --hard "origin/$SdcppBranch" + Assert-LastExit "git reset" + git submodule update --init --recursive + Assert-LastExit "git submodule update" +} else { + Write-Host "==> cloning $SdcppRepo (branch: $SdcppBranch)" + git clone --recursive --branch $SdcppBranch $SdcppRepo $SdcppDir + Assert-LastExit "git clone" + Push-Location $SdcppDir +} + +try { + # CMake flags. Static link so the installed sd.exe doesn't trail + # .dll dependencies. CUDA opt-in via nvcc detection. + $cmakeFlags = @( + "-DCMAKE_BUILD_TYPE=Release", + "-DBUILD_SHARED_LIBS=OFF" + ) + $forceNoCuda = $env:CHAOSENGINE_SDCPP_NO_CUDA -eq "1" + $hasCuda = -not $forceNoCuda -and (Get-Command nvcc -ErrorAction SilentlyContinue) + if ($hasCuda) { + Write-Host "==> CUDA detected (nvcc on PATH); enabling SD_CUBLAS" + $cmakeFlags += "-DSD_CUBLAS=ON" + } else { + Write-Host "==> CUDA not detected (or disabled); building CPU-only" + } + + # Resolve generator + VS install (same Windows toolchain plumbing as + # build-llama-turbo.ps1: handles isComplete=0 installs, builds the + # CMAKE_GENERATOR_INSTANCE override, etc.). Throws with an install + # link if MSVC isn't present. 
+ $buildCtx = Resolve-CmakeWindowsBuildContext ` + -ProductLabel "stable-diffusion.cpp (sd-cli)" ` + -GeneratorEnv "CHAOSENGINE_SDCPP_GENERATOR" + Write-Host "==> cmake generator: $($buildCtx.Generator)" + + $cudaIntegrationJustCopied = $false + if ($hasCuda -and $buildCtx.VsInstance) { + $cudaIntegrationJustCopied = Sync-CudaVsIntegration -VsRoot $buildCtx.VsInstance + } + + Invoke-CmakeStaleCacheWipe -Generator $buildCtx.Generator ` + -CudaIntegrationJustCopied $cudaIntegrationJustCopied + + $configureArgs = Get-CmakeWindowsConfigureArgs -Context $buildCtx -ExtraFlags $cmakeFlags + + Write-Host "==> cmake configure" + cmake @configureArgs + Assert-LastExit "cmake configure" + + Write-Host "==> building sd-cli binary" + # Upstream renamed the CLI target ``sd`` -> ``sd-cli`` around master-590 + # (2026-04). Build the new target; install with the legacy ``sd.exe`` + # name so the runtime resolver in sdcpp_video_runtime.py and + # stage-runtime.mjs keep working without a path rename. + cmake --build build --config Release -j $Jobs --target sd-cli + Assert-LastExit "cmake build" + + $candidates = @( + "build\bin\Release\sd-cli.exe", + "build\bin\sd-cli.exe" + ) + $sdExe = $null + foreach ($candidate in $candidates) { + if (Test-Path $candidate) { $sdExe = $candidate; break } + } + if (-not $sdExe) { + throw "sd-cli.exe not found under build\bin -- check build output." + } + + if (-not (Test-Path $InstallDir)) { + New-Item -ItemType Directory -Force -Path $InstallDir | Out-Null + } + Write-Host "==> installing to $InstallDir" + Copy-Item $sdExe (Join-Path $InstallDir "sd.exe") -Force + + $commit = (git rev-parse HEAD).Trim() + $versionFile = Join-Path $InstallDir "sd.version" + @( + $commit, + $SdcppBranch, + ((Get-Date).ToUniversalTime().ToString("yyyy-MM-ddTHH:mm:ssZ")) + ) | Set-Content -Path $versionFile -Encoding ascii + Write-Host "==> version tracked in $versionFile" +} +finally { + Pop-Location +} + +Write-Host "" +Write-Host "==> build complete" +Write-Host "sd installed to $InstallDir\sd.exe" +Write-Host "ChaosEngineAI will auto-detect it on next video / image generate request." +Write-Host "Restart the app if it is currently running." diff --git a/scripts/build-sdcpp.sh b/scripts/build-sdcpp.sh new file mode 100755 index 0000000..c35ad60 --- /dev/null +++ b/scripts/build-sdcpp.sh @@ -0,0 +1,103 @@ +#!/usr/bin/env bash +# Build the ``sd`` binary from leejet/stable-diffusion.cpp (FU-008). +# +# Cross-platform diffusion runtime: SD 1.x/2.x/XL, FLUX.1/2, Wan 2.1 / 2.2 +# video, Qwen Image, Z-Image. Wired into ChaosEngineAI as a subprocess +# engine via ``backend_service/sdcpp_video_runtime.py``. Mirrors the +# llama-server-turbo build script pattern so the desktop installer can +# trigger it the same way. 
+# +# Usage: +# ./scripts/build-sdcpp.sh +# +# Environment variables: +# SDCPP_DIR Source checkout dir (default: /tmp/stable-diffusion.cpp) +# CHAOSENGINE_BIN_DIR Install destination (default: ~/.chaosengine/bin) +# SDCPP_BRANCH Git branch to build (default: master) +# SDCPP_JOBS Parallel build jobs (default: $(nproc) or sysctl) + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +SDCPP_REPO="https://github.com/leejet/stable-diffusion.cpp.git" +SDCPP_BRANCH="${SDCPP_BRANCH:-master}" +SDCPP_DIR="${SDCPP_DIR:-/tmp/stable-diffusion.cpp}" +INSTALL_DIR="${CHAOSENGINE_BIN_DIR:-$HOME/.chaosengine/bin}" + +# Detect parallel jobs (matches build-llama-turbo.sh) +if command -v nproc &>/dev/null; then + JOBS="${SDCPP_JOBS:-$(nproc)}" +elif command -v sysctl &>/dev/null; then + JOBS="${SDCPP_JOBS:-$(sysctl -n hw.ncpu 2>/dev/null || echo 4)}" +else + JOBS="${SDCPP_JOBS:-4}" +fi + +echo "==> stable-diffusion.cpp builder" +echo " repo: $SDCPP_REPO" +echo " branch: $SDCPP_BRANCH" +echo " source: $SDCPP_DIR" +echo " install: $INSTALL_DIR" +echo " jobs: $JOBS" +echo + +# Clone or update the source checkout — sd.cpp uses git submodules for +# ggml, so always pass --recurse-submodules / --recursive. +if [[ -d "$SDCPP_DIR/.git" ]]; then + echo "==> updating existing checkout" + cd "$SDCPP_DIR" + git fetch --all --prune + git checkout "$SDCPP_BRANCH" + git reset --hard "origin/$SDCPP_BRANCH" + git submodule update --init --recursive +else + echo "==> cloning $SDCPP_REPO (branch: $SDCPP_BRANCH)" + git clone --recursive --branch "$SDCPP_BRANCH" "$SDCPP_REPO" "$SDCPP_DIR" + cd "$SDCPP_DIR" +fi + +# Platform-specific CMake flags +# -DBUILD_SHARED_LIBS=OFF — match build-llama-turbo.sh: produce a +# self-contained binary so dyld doesn't need rpath-resolved .dylibs. +CMAKE_FLAGS=(-DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF) +case "$(uname -s)" in + Darwin) + CMAKE_FLAGS+=(-DSD_METAL=ON) + ;; + Linux) + if command -v nvcc &>/dev/null; then + CMAKE_FLAGS+=(-DSD_CUBLAS=ON) + fi + ;; +esac + +echo "==> cmake configure" +cmake -B build "${CMAKE_FLAGS[@]}" + +echo "==> building sd-cli binary" +# Upstream renamed the CLI target ``sd`` → ``sd-cli`` around master-590 +# (2026-04). Build the new target; install with the legacy ``sd`` name +# so the runtime resolver in ``sdcpp_video_runtime.py`` and +# ``scripts/stage-runtime.mjs`` keep working without a path rename. +cmake --build build --config Release -j "$JOBS" --target sd-cli + +echo "==> installing to $INSTALL_DIR" +mkdir -p "$INSTALL_DIR" +cp build/bin/sd-cli "$INSTALL_DIR/sd" +chmod +x "$INSTALL_DIR/sd" + +# Version tracking — mirrors build-llama-turbo.sh shape so the same +# update detection logic applies. +VERSION_FILE="$INSTALL_DIR/sd.version" +{ + git rev-parse HEAD + echo "$SDCPP_BRANCH" + date -u +"%Y-%m-%dT%H:%M:%SZ" +} > "$VERSION_FILE" +echo "==> version tracked in $VERSION_FILE" + +echo +echo "==> build complete" +echo "sd installed to $INSTALL_DIR/sd" +echo "ChaosEngineAI will auto-detect it on next video generate request." +echo "Restart the app if it is currently running." diff --git a/scripts/inference-test-runner.py b/scripts/inference-test-runner.py index e0e5905..b9301bb 100755 --- a/scripts/inference-test-runner.py +++ b/scripts/inference-test-runner.py @@ -427,6 +427,9 @@ def run_inference( "contextTokens": config["contextTokens"], "speculativeDecoding": config["speculativeDecoding"], "treeBudget": config["treeBudget"], + # FU-002: forward kvBudget so TriAttention MLX strategy + # picks up the configured budget at apply time. 
+ "kvBudget": config.get("kvBudget", 2048), }, timeout=300) except RuntimeError as exc: return { @@ -484,6 +487,11 @@ def run_inference( "contextTokens": config["contextTokens"], "speculativeDecoding": config["speculativeDecoding"], "treeBudget": config["treeBudget"], + "kvBudget": config.get("kvBudget", 2048), + # Bug 1 / multimodal images: base64 blobs forwarded + # straight through; backend dispatches via + # is_multimodal_family + mlx_vlm.generate. + "images": config.get("images") or [], }, timeout=300, ) @@ -650,6 +658,14 @@ def run_batch(port: int, batch_file: Path) -> None: "speculativeDecoding": test.get("speculativeDecoding", False), "treeBudget": test.get("treeBudget", 0), "thinkingMode": test.get("thinkingMode", "off"), + # FU-002: TriAttention MLX kv_budget. Backend defaults + # to 2048 server-side; only consulted when + # cacheStrategy == "triattention". + "kvBudget": test.get("kvBudget", 2048), + # Bug 1 / multimodal images: base64-encoded image blobs + # forwarded to the chat /stream endpoint. Empty list → + # text-only request. + "images": test.get("images", []), } prompt = test.get("prompt", DEFAULT_PROMPT) result = run_inference(port, model, config, prompt, run_id) diff --git a/scripts/lib/windows-msvc-cuda.ps1 b/scripts/lib/windows-msvc-cuda.ps1 new file mode 100644 index 0000000..ed81413 --- /dev/null +++ b/scripts/lib/windows-msvc-cuda.ps1 @@ -0,0 +1,285 @@ +# Shared Windows toolchain helpers for CMake-based builders +# (build-llama-turbo.ps1, build-sdcpp.ps1, ...). +# +# Functions: +# Resolve-CmakeWindowsBuildContext -- pick a generator and probe VS +# Sync-CudaVsIntegration -- copy CUDA's MSBuild .props/.targets +# into the VS BuildCustomizations dir +# Get-CmakeWindowsConfigureArgs -- expand generator/instance into -G ... flags +# Invoke-CmakeStaleCacheWipe -- nuke build/ when its cache is stale +# +# All four are no-ops on non-Windows (the .sh scripts call native cmake +# directly without needing this layer), so dot-sourcing is safe to gate +# behind ``$IsWindows``. + +function Resolve-CmakeWindowsBuildContext { + <# + .SYNOPSIS + Pick a CMake generator and locate a working VS install. + + .DESCRIPTION + Without -G, cmake defaults to "NMake Makefiles" on Windows, which + fails outside a Developer Command Prompt. Probe in this order: + 1. -GeneratorEnv override (e.g. CHAOSENGINE_LLAMA_TURBO_GENERATOR) + 2. Ninja, when on PATH + 3. "Visual Studio 17 2022" + + For the Visual Studio path, locate cl.exe via vswhere with -all so + isComplete=0 installs (Microsoft's installer flagging optional + components as missing) are still accepted. Pass the install path + AND its version back so the caller can hand them to CMake via + CMAKE_GENERATOR_INSTANCE -- otherwise CMake re-runs its own -latest + probe and rejects the same install with "instance is not known to + the Visual Studio Installer". + + .PARAMETER ProductLabel + Short label for the binary being built (e.g. "llama-server-turbo") + used in the "install Visual Studio" error message. + + .PARAMETER GeneratorEnv + Name of an environment variable that overrides generator selection + (e.g. "CHAOSENGINE_LLAMA_TURBO_GENERATOR"). 
+ #> + param( + [Parameter(Mandatory)] [string] $ProductLabel, + [Parameter(Mandatory)] [string] $GeneratorEnv + ) + + $generator = $null + $envOverride = (Get-Item "env:$GeneratorEnv" -ErrorAction SilentlyContinue).Value + if ($envOverride) { + $generator = $envOverride + } elseif (Get-Command ninja -ErrorAction SilentlyContinue) { + $generator = "Ninja" + } else { + $generator = "Visual Studio 17 2022" + } + + $vsInstance = $null + $vsInstanceVersion = $null + if ($generator -like "Visual Studio*") { + $vswhere = Join-Path ${env:ProgramFiles(x86)} "Microsoft Visual Studio\Installer\vswhere.exe" + $clCandidates = @() + $vsInstalls = @() + if (Test-Path $vswhere) { + $clCandidates = & $vswhere -all -prerelease -products * ` + -find "VC\Tools\MSVC\**\bin\Hostx64\x64\cl.exe" 2>$null + $vsInstallsJson = & $vswhere -all -prerelease -products * -format json 2>$null + if ($vsInstallsJson) { + $vsInstalls = $vsInstallsJson | ConvertFrom-Json + } + } + if ($clCandidates) { + $clExe = $clCandidates | Sort-Object -Descending | Select-Object -First 1 + # Walk up from \VC\Tools\MSVC\\bin\Hostx64\x64\cl.exe + # to : 8 segments to strip (x64, Hostx64, bin, , + # MSVC, Tools, VC, cl.exe-the-leaf-itself). + $vsInstance = $clExe + for ($i = 0; $i -lt 8; $i++) { $vsInstance = Split-Path -Parent $vsInstance } + $matchedInstall = $vsInstalls | Where-Object { + $_.installationPath.TrimEnd('\') -eq $vsInstance.TrimEnd('\') + } | Select-Object -First 1 + if ($matchedInstall) { + $vsInstanceVersion = $matchedInstall.installationVersion + } + Write-Host "==> Visual Studio detected at: $vsInstance" + if ($vsInstanceVersion) { Write-Host " version: $vsInstanceVersion" } + Write-Host " cl.exe: $clExe" + } else { + $msg = @( + "", + "Visual Studio 2022 with the C++ workload is not installed.", + "$ProductLabel cannot build without an MSVC toolchain --", + "and on CUDA hosts, nvcc itself proxies to cl.exe, so even the", + "CUDA path requires MSVC. Install one of:", + "", + " * Visual Studio 2022 Community (free, full IDE):", + " https://visualstudio.microsoft.com/vs/community/", + " * Visual Studio Build Tools 2022 (compiler only, smaller):", + " https://visualstudio.microsoft.com/visual-cpp-build-tools/", + "", + "During install, tick 'Desktop development with C++'", + "(or, in Build Tools, the 'C++ build tools' workload).", + "Re-run this script afterwards.", + "" + ) -join [Environment]::NewLine + throw $msg + } + } + + return [pscustomobject]@{ + Generator = $generator + VsInstance = $vsInstance + VsInstanceVersion = $vsInstanceVersion + } +} + +function Sync-CudaVsIntegration { + <# + .SYNOPSIS + Copy CUDA's MSBuild integration files into the VS BuildCustomizations dir. + + .DESCRIPTION + CMake's CUDA detection bails with "No CUDA toolset found" when these + files are missing -- which happens whenever CUDA was installed + before Visual Studio, or when the CUDA installer's "Visual Studio + Integration" component was unticked. Auto-elevates via UAC if the + target dir isn't writable. + + Returns $true when files were actually copied (caller should wipe + build/CMakeCache.txt so CMake re-detects), $false when up to date + or skipped. 
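+    .EXAMPLE
+        # Usage sketch, mirroring the builder scripts ($ctx comes from
+        # Resolve-CmakeWindowsBuildContext): sync the files, then let the
+        # stale-cache helper wipe build/ so CMake re-runs CUDA detection
+        # whenever something was actually copied.
+        $copied = Sync-CudaVsIntegration -VsRoot $ctx.VsInstance
+        Invoke-CmakeStaleCacheWipe -Generator $ctx.Generator -CudaIntegrationJustCopied $copied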
+ #> + param( + [Parameter(Mandatory)] [string] $VsRoot + ) + $cudaPath = $env:CUDA_PATH + if (-not $cudaPath -or -not (Test-Path $cudaPath)) { + Write-Host "==> CUDA_PATH not set; skipping VS integration sync" + return $false + } + $cudaSrc = Join-Path $cudaPath "extras\visual_studio_integration\MSBuildExtensions" + $vsTarget = Join-Path $VsRoot "MSBuild\Microsoft\VC\v170\BuildCustomizations" + if (-not (Test-Path $cudaSrc)) { + Write-Host "==> CUDA integration source not found at $cudaSrc; skipping sync" + return $false + } + if (-not (Test-Path $vsTarget)) { + Write-Host "==> VS BuildCustomizations dir not found at $vsTarget; skipping sync" + return $false + } + $sourceFiles = Get-ChildItem -Path $cudaSrc -File -ErrorAction SilentlyContinue + $missing = @($sourceFiles | Where-Object { -not (Test-Path (Join-Path $vsTarget $_.Name)) }) + if (-not $missing -or $missing.Count -eq 0) { + Write-Host "==> CUDA VS integration already present in $vsTarget" + return $false + } + Write-Host "==> CUDA VS integration missing $($missing.Count) file(s) from $vsTarget" + $missing | ForEach-Object { Write-Host " - $($_.Name)" } + + $copied = $true + try { + foreach ($file in $missing) { + Copy-Item -LiteralPath $file.FullName -Destination $vsTarget -Force -ErrorAction Stop + } + Write-Host "==> CUDA VS integration files copied (direct)" + } catch { + $copied = $false + Write-Host "==> Direct copy denied; relaunching as admin via UAC..." + # Per-file Copy-Item: -LiteralPath does NOT support wildcards, so + # an "...\*" pattern silently copies nothing. Iterate by full path + # and verify each file lands. + $copyCommands = $missing | ForEach-Object { + $srcEsc = $_.FullName.Replace("'", "''") + $dstEsc = $vsTarget.Replace("'", "''") + "Copy-Item -LiteralPath '$srcEsc' -Destination '$dstEsc' -Force" + } + $verifyLine = ( + "if (@(Get-ChildItem -LiteralPath '" + $vsTarget.Replace("'", "''") + + "' -Filter 'CUDA *.props' -ErrorAction SilentlyContinue).Count -eq 0) { exit 1 }" + ) + $script = ($copyCommands + @($verifyLine)) -join "; " + $argList = @("-NoProfile", "-ExecutionPolicy", "Bypass", "-Command", $script) + try { + $proc = Start-Process -FilePath powershell -ArgumentList $argList -Verb RunAs -Wait -PassThru + if ($proc.ExitCode -eq 0) { + $stillMissing = @($sourceFiles | Where-Object { + -not (Test-Path (Join-Path $vsTarget $_.Name)) + }) + if ($stillMissing.Count -eq 0) { + $copied = $true + Write-Host "==> CUDA VS integration files copied (elevated)" + } else { + Write-Host "==> Elevated copy reported success but $($stillMissing.Count) file(s) still missing:" + $stillMissing | ForEach-Object { Write-Host " - $($_.Name)" } + } + } else { + Write-Host "==> Elevated copy exited with code $($proc.ExitCode)" + } + } catch { + Write-Host "==> UAC copy failed: $_" + } + } + if (-not $copied) { + $manualCopy = $missing | ForEach-Object { + " Copy-Item -LiteralPath '$($_.FullName)' -Destination '$vsTarget' -Force" + } + $msg = @( + "", + "Could not install CUDA's Visual Studio integration files.", + "Run the following in an Administrator PowerShell, then retry:", + "" + ) + $manualCopy + @("") + throw ($msg -join [Environment]::NewLine) + } + return $true +} + +function Get-CmakeWindowsConfigureArgs { + <# + .SYNOPSIS + Expand a build context into -G/-A/-DCMAKE_GENERATOR_INSTANCE flags. 
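+    .EXAMPLE
+        # Usage sketch, as in the builder scripts: splat the returned array
+        # straight into cmake ($ctx from Resolve-CmakeWindowsBuildContext;
+        # the extra flag shown is illustrative).
+        $configureArgs = Get-CmakeWindowsConfigureArgs -Context $ctx -ExtraFlags @("-DCMAKE_BUILD_TYPE=Release")
+        cmake @configureArgs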
+ #> + param( + [Parameter(Mandatory)] $Context, + [string[]] $ExtraFlags = @() + ) + $args = @("-B", "build", "-G", $Context.Generator) + if ($Context.Generator -like "Visual Studio*") { + $args += @("-A", "x64") + if ($Context.VsInstance) { + $instanceArg = if ($Context.VsInstanceVersion) { + "$($Context.VsInstance),version=$($Context.VsInstanceVersion)" + } else { + $Context.VsInstance + } + $args += @("-DCMAKE_GENERATOR_INSTANCE=$instanceArg") + } + } + return $args + $ExtraFlags +} + +function Invoke-CmakeStaleCacheWipe { + <# + .SYNOPSIS + Wipe build/ when the cached generator no longer matches, or when + CUDA integration was just installed. + + .DESCRIPTION + CMake refuses to switch generators in an existing build directory + ("Does not match the generator used previously"). And it caches + CUDA-language detection results, so installing the integration + files between runs doesn't get re-evaluated unless we wipe. + + Pattern detail: do NOT use -SimpleMatch on the regex -- it disables + regex parsing, making the leading ^ a literal character, and the + cache line never matches. + #> + param( + [Parameter(Mandatory)] [string] $Generator, + [bool] $CudaIntegrationJustCopied = $false + ) + $cachePath = "build\CMakeCache.txt" + if (-not (Test-Path $cachePath)) { return } + + $shouldWipe = $false + $wipeReason = $null + $cachedGeneratorLine = Select-String -Path $cachePath ` + -Pattern '^CMAKE_GENERATOR:INTERNAL=' -ErrorAction SilentlyContinue | + Select-Object -First 1 + if ($cachedGeneratorLine) { + $cachedGenerator = ($cachedGeneratorLine.Line -split "=", 2)[1].Trim() + if ($cachedGenerator -and ($cachedGenerator -ne $Generator)) { + $shouldWipe = $true + $wipeReason = "generator changed from '$cachedGenerator' to '$Generator'" + } + } + if (-not $shouldWipe -and $CudaIntegrationJustCopied) { + $shouldWipe = $true + $wipeReason = "CUDA VS integration was just installed" + } + if ($shouldWipe) { + Write-Host "==> wiping build\ ($wipeReason)" + Remove-Item -Recurse -Force "build" -ErrorAction SilentlyContinue + } +} diff --git a/scripts/spike_triattention_mlx.py b/scripts/spike_triattention_mlx.py new file mode 100644 index 0000000..baad7e3 --- /dev/null +++ b/scripts/spike_triattention_mlx.py @@ -0,0 +1,141 @@ +"""FU-002 spike: validate triattention.mlx on a small Qwen. + +Loads mlx-community/Qwen2.5-0.5B-Instruct-4bit via mlx_lm, applies +``apply_triattention_mlx(model, kv_budget=2048)``, runs a short generation, +and reports wall-time + first-256-char output. Compare to baseline (same +model without TriAttention) to gauge whether the integration is shippable. 
+ +Run: ``./.venv/bin/python scripts/spike_triattention_mlx.py`` +""" + +from __future__ import annotations + +import argparse +import sys +import time +import traceback + + +def _format_section(title: str) -> str: + return f"\n=== {title} ===\n" + + +def _run(model_id: str, *, with_triattention: bool, kv_budget: int, max_tokens: int, prompt: str) -> dict: + from mlx_lm import load, generate + + print(_format_section(f"loading {model_id} (with_triattention={with_triattention})")) + t0 = time.perf_counter() + model, tokenizer = load(model_id) + print(f"load wall-time: {time.perf_counter() - t0:.2f}s") + + if with_triattention: + from triattention.mlx import apply_triattention_mlx + print(f"applying apply_triattention_mlx(kv_budget={kv_budget})") + t1 = time.perf_counter() + try: + apply_triattention_mlx(model, kv_budget=kv_budget) + print(f"apply wall-time: {time.perf_counter() - t1:.2f}s") + except Exception as exc: + print(f"apply_triattention_mlx FAILED: {type(exc).__name__}: {exc}") + traceback.print_exc() + return {"failed": True, "stage": "apply", "error": str(exc)} + + print(_format_section(f"generate (max_tokens={max_tokens})")) + t2 = time.perf_counter() + try: + out = generate(model, tokenizer, prompt=prompt, max_tokens=max_tokens, verbose=False) + except Exception as exc: + print(f"generate FAILED: {type(exc).__name__}: {exc}") + traceback.print_exc() + return {"failed": True, "stage": "generate", "error": str(exc)} + elapsed = time.perf_counter() - t2 + + print(f"gen wall-time: {elapsed:.2f}s ({max_tokens / max(elapsed, 0.001):.1f} tok/s)") + print(f"output (first 256 chars):\n{out[:256]!r}") + + return { + "failed": False, + "elapsed": elapsed, + "output": out, + "tokens_per_sec": max_tokens / max(elapsed, 0.001), + } + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--model", + default="mlx-community/Qwen2.5-0.5B-Instruct-4bit", + help="HF model id loadable by mlx_lm.load", + ) + parser.add_argument("--kv-budget", type=int, default=2048) + parser.add_argument("--max-tokens", type=int, default=64) + parser.add_argument( + "--prompt", + default="Write one sentence about why caching helps inference:", + ) + parser.add_argument( + "--skip-baseline", + action="store_true", + help="Skip the no-TriAttention baseline run (saves time).", + ) + args = parser.parse_args(argv) + + print(_format_section("environment check")) + try: + import triattention # noqa: F401 + from triattention.mlx import apply_triattention_mlx # noqa: F401 + print("triattention.mlx import: OK") + except ImportError as exc: + print(f"triattention.mlx NOT importable: {exc}") + return 2 + + try: + import mlx_lm # noqa: F401 + print(f"mlx_lm import: OK (version {getattr(mlx_lm, '__version__', 'unknown')})") + except ImportError as exc: + print(f"mlx_lm NOT importable: {exc}") + return 2 + + if not args.skip_baseline: + print(_format_section("BASELINE (no triattention)")) + baseline = _run( + args.model, + with_triattention=False, + kv_budget=args.kv_budget, + max_tokens=args.max_tokens, + prompt=args.prompt, + ) + else: + baseline = None + + print(_format_section("WITH TRIATTENTION")) + triatt = _run( + args.model, + with_triattention=True, + kv_budget=args.kv_budget, + max_tokens=args.max_tokens, + prompt=args.prompt, + ) + + print(_format_section("verdict")) + if triatt.get("failed"): + print(f"FAIL — TriAttention {triatt.get('stage')} stage raised. 
FU-002 stays parked.") + return 1 + + if not triatt.get("output", "").strip(): + print("FAIL — generation returned empty string with TriAttention applied.") + return 1 + + if baseline and not baseline.get("failed"): + speedup = baseline["elapsed"] / max(triatt["elapsed"], 0.001) + print(f"baseline: {baseline['elapsed']:.2f}s") + print(f"triatt: {triatt['elapsed']:.2f}s") + print(f"speedup: {speedup:.2f}x ({'helpful' if speedup > 1.05 else 'neutral or slower'})") + + print("PASS — apply_triattention_mlx works on this model. FU-002 unblocked.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/update-sdcpp.sh b/scripts/update-sdcpp.sh new file mode 100755 index 0000000..280b4dd --- /dev/null +++ b/scripts/update-sdcpp.sh @@ -0,0 +1,96 @@ +#!/usr/bin/env bash +# Update the ``sd`` binary from leejet/stable-diffusion.cpp. +# +# Companion to ``build-sdcpp.sh`` — fetches the latest commit on the +# tracked branch and rebuilds in place. Mirrors update-llama-turbo.sh. +# +# Usage: ./scripts/update-sdcpp.sh +# +# Override the source dir with SDCPP_DIR if the checkout lives somewhere +# other than /tmp/stable-diffusion.cpp. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +SDCPP_BRANCH="${SDCPP_BRANCH:-master}" +SDCPP_DIR="${SDCPP_DIR:-/tmp/stable-diffusion.cpp}" +INSTALL_DIR="${CHAOSENGINE_BIN_DIR:-$HOME/.chaosengine/bin}" +VERSION_FILE="$INSTALL_DIR/sd.version" + +if command -v nproc &>/dev/null; then + JOBS="${SDCPP_JOBS:-$(nproc)}" +elif command -v sysctl &>/dev/null; then + JOBS="${SDCPP_JOBS:-$(sysctl -n hw.ncpu 2>/dev/null || echo 4)}" +else + JOBS="${SDCPP_JOBS:-4}" +fi + +if [[ ! -d "$SDCPP_DIR/.git" ]]; then + echo "No existing checkout at $SDCPP_DIR — running full build instead." + exec "$SCRIPT_DIR/build-sdcpp.sh" +fi + +cd "$SDCPP_DIR" + +if [[ -f "$VERSION_FILE" ]]; then + CURRENT_COMMIT=$(head -1 "$VERSION_FILE") + echo "Current installed commit: $CURRENT_COMMIT" +else + CURRENT_COMMIT="" + echo "No version file found — will rebuild regardless." +fi + +echo "==> fetching latest changes" +git fetch --all --prune + +echo "==> checking out $SDCPP_BRANCH" +git checkout "$SDCPP_BRANCH" + +REMOTE_COMMIT=$(git rev-parse "origin/$SDCPP_BRANCH") +echo "Remote HEAD: $REMOTE_COMMIT" + +if [[ "$CURRENT_COMMIT" == "$REMOTE_COMMIT" ]]; then + echo + echo "Already up to date. No rebuild needed." + exit 0 +fi + +echo "==> resetting to origin/$SDCPP_BRANCH" +git reset --hard "origin/$SDCPP_BRANCH" +git submodule update --init --recursive + +CMAKE_FLAGS=(-DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF) +case "$(uname -s)" in + Darwin) + CMAKE_FLAGS+=(-DSD_METAL=ON) + ;; + Linux) + if command -v nvcc &>/dev/null; then + CMAKE_FLAGS+=(-DSD_CUBLAS=ON) + fi + ;; +esac + +echo "==> cmake configure" +cmake -B build "${CMAKE_FLAGS[@]}" + +echo "==> rebuilding sd-cli binary" +# Target renamed upstream; install with legacy ``sd`` name so downstream +# resolvers don't need a rename. See build-sdcpp.sh for context. +cmake --build build --config Release -j "$JOBS" --target sd-cli + +echo "==> installing to $INSTALL_DIR" +mkdir -p "$INSTALL_DIR" +cp build/bin/sd-cli "$INSTALL_DIR/sd" +chmod +x "$INSTALL_DIR/sd" + +{ + git rev-parse HEAD + echo "$SDCPP_BRANCH" + date -u +"%Y-%m-%dT%H:%M:%SZ" +} > "$VERSION_FILE" + +echo +echo "==> update complete" +echo "Updated from ${CURRENT_COMMIT:0:12} to $(git rev-parse --short HEAD)" +echo "Restart ChaosEngineAI to pick up the new binary." 
diff --git a/src-tauri/Cargo.lock b/src-tauri/Cargo.lock index 720b12c..b4f170d 100644 --- a/src-tauri/Cargo.lock +++ b/src-tauri/Cargo.lock @@ -455,7 +455,7 @@ checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "chaosengineai" -version = "0.7.2" +version = "0.7.4" dependencies = [ "flate2", "libc", diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml index 9556adf..9b8844e 100644 --- a/src-tauri/Cargo.toml +++ b/src-tauri/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "chaosengineai" -version = "0.7.2" +version = "0.7.4" description = "ChaosEngineAI desktop shell for local AI model inference" authors = ["OpenAI Codex"] edition = "2021" diff --git a/src-tauri/installer.nsh b/src-tauri/installer.nsh new file mode 100644 index 0000000..b02ce27 --- /dev/null +++ b/src-tauri/installer.nsh @@ -0,0 +1,44 @@ +; Tauri 2 NSIS installer hooks for the Windows ChaosEngineAI bundle. +; +; Tauri's default NSIS template installs the app under +; %LOCALAPPDATA%\\ and the uninstaller removes that tree on +; uninstall. The GPU runtime bundle (torch + diffusers + transformers, +; ~2.5 GB) is intentionally written to a sibling directory: +; +; %LOCALAPPDATA%\ChaosEngineAI\extras\cp{major}{minor}\site-packages +; +; The path is namespaced by Python ABI tag (commit 24518af, v0.7.0-rc.5) +; so a runtime upgrade that changes Python minor versions cannot shadow +; the wheels from the previous tag. +; +; CRITICAL: this directory MUST survive an uninstall + reinstall cycle. +; Re-downloading 2.5 GB of CUDA wheels every time the user upgrades the +; desktop app is unacceptable, both for users on slow links and for the +; PyPI mirrors that serve the bundle. +; +; The hooks below are intentionally empty as a guardrail. If anyone +; later adds custom uninstall behaviour: +; +; 1. NEVER ``RMDir /r "$LOCALAPPDATA\ChaosEngineAI\extras"`` here. +; 2. Test that ``setup.py:_extras_site_packages()`` resolves the same +; path before AND after a clean uninstall + reinstall on Windows. +; 3. Mirror any change in ``src-tauri/src/lib.rs::chaosengine_extras_root``. + +!macro NSIS_HOOK_PREINSTALL + ; Reserved — currently a no-op. See contract above before adding code. +!macroend + +!macro NSIS_HOOK_POSTINSTALL + ; Reserved — currently a no-op. See contract above before adding code. +!macroend + +!macro NSIS_HOOK_PREUNINSTALL + ; Reserved — currently a no-op. See contract above before adding code. +!macroend + +!macro NSIS_HOOK_POSTUNINSTALL + ; Reserved — currently a no-op. The persistent GPU runtime tree at + ; %LOCALAPPDATA%\ChaosEngineAI\extras MUST be left intact so an + ; immediate reinstall can pick it up without re-downloading 2.5 GB. + ; See contract above before adding code. +!macroend diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index 4f29137..ddbe60b 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -697,6 +697,12 @@ fn apply_embedded_runtime_env(command: &mut Command, runtime: &EmbeddedRuntime) /// Returns ``None`` if we can't resolve a home directory at all (headless /// environments). Callers treat that as "no extras available". fn chaosengine_extras_root() -> Option { + // The extras tree lives OUTSIDE the Tauri install directory so it + // survives uninstall + reinstall cycles — re-downloading the 2.5 GB + // GPU bundle on every desktop upgrade is unacceptable. The Windows + // NSIS installer is told to leave this path alone via the empty + // hooks in ``src-tauri/installer.nsh``; if anyone changes either + // side the other MUST be kept in sync. 
let base = if cfg!(windows) { env::var_os("LOCALAPPDATA") .map(PathBuf::from) diff --git a/src-tauri/tauri.conf.json b/src-tauri/tauri.conf.json index 350c0e8..cea4d6b 100644 --- a/src-tauri/tauri.conf.json +++ b/src-tauri/tauri.conf.json @@ -2,7 +2,7 @@ "$schema": "https://schema.tauri.app/config/2", "productName": "ChaosEngineAI", "mainBinaryName": "ChaosEngineAI", - "version": "0.7.2", + "version": "0.7.4", "identifier": "com.chaosengineai.desktop", "build": { "beforeBuildCommand": "npm run build", @@ -52,6 +52,11 @@ "hardenedRuntime": true, "entitlements": "macos/ChaosEngineAI.entitlements" }, + "windows": { + "nsis": { + "installerHooks": "./installer.nsh" + } + }, "resources": { "resources/": "" } diff --git a/src/App.tsx b/src/App.tsx index 20c2555..4212354 100644 --- a/src/App.tsx +++ b/src/App.tsx @@ -68,6 +68,7 @@ import { libraryItemSourceKind, inferHfRepoFromLocalPath, isChatLibraryItem, + resolveCapabilities, downloadProgressLabel, syncRuntime, settingsDraftFromWorkspace, @@ -115,13 +116,23 @@ export default function App() { | { ok: false; message: string; pythonVersion: string | null; noWheelForPython: boolean } | null >(null); + // Raw install result, kept alongside the reduced ``cudaTorchResult`` + // shape above so the Studio's CudaTorchLogPanel can render the full + // per-attempt pip output (the reduced shape drops ``attempts`` to + // keep the in-line success/failure summary terse). One more state + // slot is cheaper than reshaping every existing call site. + const [cudaTorchRawResult, setCudaTorchRawResult] = useState< + import("./api").CudaTorchInstallResult | null + >(null); const handleInstallCudaTorch = async () => { if (installingCudaTorch) return; setInstallingCudaTorch(true); setCudaTorchResult(null); + setCudaTorchRawResult(null); try { const result = await installCudaTorch(); + setCudaTorchRawResult(result); if (result.ok) { setCudaTorchResult({ ok: true, @@ -139,15 +150,54 @@ export default function App() { }); } } catch (err) { + const message = err instanceof Error ? err.message : String(err); setCudaTorchResult({ ok: false, - message: err instanceof Error ? err.message : String(err), + message, + pythonVersion: null, + noWheelForPython: false, + }); + // Always synthesize a raw result on exception so the + // CudaTorchLogPanel renders the failure instead of silently + // hiding -- previously any network error / 5xx / timeout left + // the panel showing nothing and the user couldn't tell whether + // the install was running, finished, or never reached the + // backend at all. The synthesized "attempt" carries the + // exception text so the panel surfaces it as a [FAIL] entry. + setCudaTorchRawResult({ + ok: false, + output: message, + indexUrl: null, + attempts: [ + { indexUrl: "(request never returned)", ok: false, output: message }, + ], + requiresRestart: false, + pythonExecutable: "", pythonVersion: null, noWheelForPython: false, + capabilities: {}, }); } finally { setInstallingCudaTorch(false); } + // Refresh runtime status after install completes (success or + // failure). Without this, the warning banner keeps reading the + // pre-install torchInstallWarning value and the user thinks the + // button did nothing -- the cache is bound to whatever the + // probe last returned. Both Studios subscribe to their own + // runtime probes via useImageState / useVideoState; calling + // their refresh handlers re-runs the probe and the banner + // self-clears (or self-updates with a new failure mode). 
+ try { + await imgState.refreshImageData(); + } catch { + /* refresh is best-effort */ + } + try { + await videoState.refreshVideoData(); + } catch { + /* refresh is best-effort */ + } }; // ── Settings / Server / Preview ──────────────────────────── @@ -348,6 +398,7 @@ export default function App() { const matched = findCatalogVariantForLibraryItem(workspace.featuredModels, item); const displayFormat = libraryItemFormat(item, matched); const displayQuantization = libraryItemQuantization(item, matched); + const canonicalRepo = matched?.repo ?? inferHfRepoFromLocalPath(item.path); return { key: `library:${item.path}`, label: item.name, @@ -355,7 +406,7 @@ export default function App() { group: "Local library", model: item.name, modelRef: item.name, - canonicalRepo: matched?.repo ?? inferHfRepoFromLocalPath(item.path), + canonicalRepo, source: "library", path: item.path, backend: libraryItemBackend(item, matched), @@ -365,6 +416,9 @@ export default function App() { format: displayFormat, quantization: displayQuantization ?? undefined, maxContext: item.maxContext ?? matched?.maxContext ?? null, + // Phase 2.11: resolve typed capabilities so the picker can show + // capability badges per option without re-deriving in each view. + capabilities: resolveCapabilities(canonicalRepo ?? item.name, matched?.capabilities ?? null), }; }); @@ -418,6 +472,7 @@ export default function App() { contextTokens?: number; speculativeDecoding?: boolean; treeBudget?: number; + kvBudget?: number; }): Promise { setError(null); setBusyAction(payload.busyLabel ?? "Loading model..."); @@ -445,6 +500,7 @@ export default function App() { contextTokens: payload.contextTokens ?? launchSettings.contextTokens, speculativeDecoding: sanitizedSpeculative.speculativeDecoding, treeBudget: sanitizedSpeculative.treeBudget, + kvBudget: payload.kvBudget ?? launchSettings.kvBudget, }; let loadSucceeded = false; @@ -740,12 +796,29 @@ export default function App() { }); }, [activeTab, benchmarkDraft.cacheBits, benchmarkDraft.fp16Layers, benchmarkDraft.contextTokens, benchmarkDraft.cacheStrategy, setPreviewControls]); - // Sync previewVariant -> previewControls.paramsB + // Sync previewVariant -> previewControls.paramsB + architecture + // estimate. Bug surfaced 2026-05-05: this effect previously only + // pushed paramsB and left numLayers / numHeads / numKvHeads / + // hiddenSize at 0, which collapsed the Native f16 cache estimate + // to ~0 bytes (kv_elements = num_layers * num_kv_heads * head_dim * + // ctx — anything * 0 = 0) and made "Fits Easily" fire on models + // that absolutely don't fit. Also pushed paramsB=0 cases through. useEffect(() => { - if (!previewVariant) return; - setPreviewControls((current) => - current.paramsB === previewVariant.paramsB ? 
current : { ...current, paramsB: previewVariant.paramsB }, - ); + if (!previewVariant?.paramsB) return; + const paramsB = previewVariant.paramsB; + const arch = estimateArchFromParams(paramsB); + setPreviewControls((current) => { + if ( + current.paramsB === paramsB + && current.numLayers === arch.numLayers + && current.numHeads === arch.numHeads + && current.numKvHeads === arch.numKvHeads + && current.hiddenSize === arch.hiddenSize + ) { + return current; + } + return { ...current, paramsB, ...arch }; + }); }, [previewVariant?.paramsB, setPreviewControls]); // Sync serverModelKey when options change @@ -1275,6 +1348,7 @@ export default function App() { hubFileCache={hubFileCache} hubFileLoading={hubFileLoading} hubFileError={hubFileError} + availableMemoryGb={workspace.system.availableMemoryGb} /> ); } else if (activeTab === "my-models") { @@ -1385,6 +1459,16 @@ export default function App() { onImageDraftModeChange={imgState.setImageDraftMode} imageSampler={imgState.imageSampler} onImageSamplerChange={imgState.setImageSampler} + imageCacheStrategy={imgState.imageCacheStrategy} + onImageCacheStrategyChange={imgState.setImageCacheStrategy} + imageCacheRelL1Thresh={imgState.imageCacheRelL1Thresh} + onImageCacheRelL1ThreshChange={imgState.setImageCacheRelL1Thresh} + imageCfgDecay={imgState.imageCfgDecay} + onImageCfgDecayChange={imgState.setImageCfgDecay} + imagePreviewVae={imgState.imagePreviewVae} + onImagePreviewVaeChange={imgState.setImagePreviewVae} + imageFp8LayerwiseCasting={imgState.imageFp8LayerwiseCasting} + onImageFp8LayerwiseCastingChange={imgState.setImageFp8LayerwiseCasting} imageRatioId={imgState.imageRatioId} imageWidth={imgState.imageWidth} onImageWidthChange={imgState.setImageWidth} @@ -1411,6 +1495,9 @@ export default function App() { onPreloadImageModel={(variant) => void imgState.handlePreloadImageModel(variant)} onUnloadImageModel={(variant) => void imgState.handleUnloadImageModel(variant)} onInstallImageRuntime={() => imgState.handleInstallImageRuntime()} + onInstallCudaTorch={() => void handleInstallCudaTorch()} + installingCudaTorch={installingCudaTorch} + cudaTorchResult={cudaTorchRawResult} gpuBundleJob={imgState.gpuBundleJob} onImageDownload={(repo) => void imgState.handleImageDownload(repo)} onCancelImageDownload={(repo) => void imgState.handleCancelImageDownload(repo)} @@ -1555,6 +1642,18 @@ export default function App() { onVideoEnhancePromptChange={videoState.setVideoEnhancePrompt} videoCfgDecay={videoState.videoCfgDecay} onVideoCfgDecayChange={videoState.setVideoCfgDecay} + videoPreviewVae={videoState.videoPreviewVae} + onVideoPreviewVaeChange={videoState.setVideoPreviewVae} + videoFp8LayerwiseCasting={videoState.videoFp8LayerwiseCasting} + onVideoFp8LayerwiseCastingChange={videoState.setVideoFp8LayerwiseCasting} + videoCacheStrategy={videoState.videoCacheStrategy} + onVideoCacheStrategyChange={videoState.setVideoCacheStrategy} + videoCacheRelL1Thresh={videoState.videoCacheRelL1Thresh} + onVideoCacheRelL1ThreshChange={videoState.setVideoCacheRelL1Thresh} + videoStgScale={videoState.videoStgScale} + onVideoStgScaleChange={videoState.setVideoStgScale} + videoFastPreview={videoState.videoFastPreview} + onVideoFastPreviewChange={videoState.setVideoFastPreview} onActiveTabChange={setActiveTab} onPreloadVideoModel={(variant) => void videoState.handlePreloadVideoModel(variant)} onUnloadVideoModel={(variant) => void videoState.handleUnloadVideoModel(variant)} @@ -1564,6 +1663,9 @@ export default function App() { onRestartServer={() => void handleRestartServer()} 
onInstallVideoOutputDeps={(packages) => videoState.handleInstallVideoOutputDeps(packages)} onInstallVideoGpuRuntime={() => videoState.handleInstallVideoGpuRuntime()} + onInstallCudaTorch={() => void handleInstallCudaTorch()} + installingCudaTorch={installingCudaTorch} + cudaTorchResult={cudaTorchRawResult} longLiveStatus={videoState.longLiveStatus} installingLongLive={videoState.installingLongLive} onRefreshLongLiveStatus={() => void videoState.refreshLongLiveStatus()} @@ -1637,6 +1739,8 @@ export default function App() { chatScrollRef={chatScrollRef} serverLoading={workspace.server.loading} loadedModelRef={workspace.runtime.loadedModel?.ref} + loadedModelCapabilities={workspace.runtime.loadedModel?.capabilities ?? null} + loadedModelEngine={workspace.runtime.loadedModel?.engine ?? null} engineLabel={workspace.runtime.engineLabel} launchSettings={launchSettings} warmModels={workspace.runtime.warmModels ?? []} @@ -1660,6 +1764,9 @@ export default function App() { onCopyMessage={chat.handleCopyMessage} onRetryMessage={chat.handleRetryMessage} onDeleteMessage={chat.handleDeleteMessage} + onForkAtMessage={chat.handleForkAtMessage} + onAddVariant={chat.handleAddVariant} + onDelveMessage={chat.handleDelveMessage} onDetailsToggle={handleDetailsToggle} onSendMessage={sendMessage} onSetError={setError} @@ -1667,6 +1774,9 @@ export default function App() { onToggleTools={chat.setEnableTools} onCompareMode={() => setCompareMode(true)} onCancelGeneration={chat.cancelGeneration} + oneTurnOverride={chat.oneTurnOverride} + onOneTurnOverrideChange={chat.setOneTurnOverride} + availableCacheStrategies={workspace.system.availableCacheStrategies} /> ); } else if (activeTab === "server") { @@ -1810,6 +1920,7 @@ export default function App() { launchSettings={launchSettings} availableMemoryGb={workspace.system.availableMemoryGb} totalMemoryGb={workspace.system.totalMemoryGb} + gpuVramTotalGb={workspace.system.gpuVramTotalGb} availableCacheStrategies={workspace.system.availableCacheStrategies} dflashInfo={workspace.system.dflash} turboInstalled={Boolean(workspace.system.llamaServerTurboPath)} @@ -1966,6 +2077,7 @@ export default function App() { preview={preview} availableMemoryGb={workspace.system.availableMemoryGb} totalMemoryGb={workspace.system.totalMemoryGb} + gpuVramTotalGb={workspace.system.gpuVramTotalGb} availableCacheStrategies={workspace.system.availableCacheStrategies} dflashInfo={workspace.system.dflash} installingPackage={installingPackage} diff --git a/src/__tests__/streamPhase.test.ts b/src/__tests__/streamPhase.test.ts new file mode 100644 index 0000000..166ee2f --- /dev/null +++ b/src/__tests__/streamPhase.test.ts @@ -0,0 +1,166 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; + +vi.mock("@tauri-apps/api/core", () => ({ + invoke: vi.fn(), + isTauri: vi.fn(() => false), +})); + +import { generateChatStream } from "../api"; + +afterEach(() => { + vi.unstubAllGlobals(); + vi.restoreAllMocks(); +}); + +/** + * Build a fetch-like response whose body emits the given SSE chunks one at a + * time. Each chunk is encoded as `data: \n` so the api.ts parser sees + * realistic line boundaries. 
+ */ +function makeStreamResponse(events: object[]): Response { + const encoder = new TextEncoder(); + const stream = new ReadableStream({ + start(controller) { + for (const event of events) { + controller.enqueue(encoder.encode(`data: ${JSON.stringify(event)}\n`)); + } + controller.close(); + }, + }); + return new Response(stream, { status: 200, headers: { "Content-Type": "text/event-stream" } }); +} + +/** + * Build a fetch mock that routes auth/session requests to a benign token + * payload and chat-stream requests to the configured SSE response. Without + * this, the chat stream call is preceded by an auth fetch that would otherwise + * consume the same mocked response and break the test. + */ +function makeFetchMock(streamEvents: object[]): ReturnType { + return vi.fn().mockImplementation((url: string) => { + if (url.includes("/api/auth/session")) { + return Promise.resolve( + new Response(JSON.stringify({ apiToken: null }), { status: 200, headers: { "Content-Type": "application/json" } }), + ); + } + return Promise.resolve(makeStreamResponse(streamEvents)); + }); +} + +describe("generateChatStream phase events (Phase 2.0)", () => { + it("invokes onPhase('prompt_eval') as soon as the backend emits it", async () => { + const fetchMock = makeFetchMock( + [ + { phase: "prompt_eval" }, + { + done: true, + session: { id: "s1", title: "x", updatedAt: "now", model: "m", cacheLabel: "f16", messages: [] }, + assistant: { role: "assistant", text: "" }, + runtime: {}, + }, + ], + ); + vi.stubGlobal("fetch", fetchMock); + + const phaseCalls: Array<[string, number | undefined]> = []; + await generateChatStream( + { prompt: "hi" }, + { + onToken: () => {}, + onPhase: (phase, ttft) => phaseCalls.push([phase, ttft]), + onDone: () => {}, + onError: () => {}, + }, + ); + + expect(phaseCalls).toEqual([["prompt_eval", undefined]]); + }); + + it("invokes onPhase('generating', ttftSeconds) on phase transition", async () => { + const fetchMock = makeFetchMock( + [ + { phase: "prompt_eval" }, + { phase: "generating", ttftSeconds: 0.42 }, + { token: "hi" }, + { + done: true, + session: { id: "s1", title: "x", updatedAt: "now", model: "m", cacheLabel: "f16", messages: [] }, + assistant: { role: "assistant", text: "hi" }, + runtime: {}, + }, + ], + ); + vi.stubGlobal("fetch", fetchMock); + + const phaseCalls: Array<[string, number | undefined]> = []; + await generateChatStream( + { prompt: "hi" }, + { + onToken: () => {}, + onPhase: (phase, ttft) => phaseCalls.push([phase, ttft]), + onDone: () => {}, + onError: () => {}, + }, + ); + + expect(phaseCalls).toEqual([ + ["prompt_eval", undefined], + ["generating", 0.42], + ]); + }); + + it("does not invoke onPhase when callback omitted", async () => { + const fetchMock = makeFetchMock( + [ + { phase: "prompt_eval" }, + { + done: true, + session: { id: "s1", title: "x", updatedAt: "now", model: "m", cacheLabel: "f16", messages: [] }, + assistant: { role: "assistant", text: "" }, + runtime: {}, + }, + ], + ); + vi.stubGlobal("fetch", fetchMock); + + let errored = false; + await generateChatStream( + { prompt: "hi" }, + { + onToken: () => {}, + onDone: () => {}, + onError: () => { errored = true; }, + }, + ); + + expect(errored).toBe(false); + }); + + it("ignores unknown phase values", async () => { + const fetchMock = makeFetchMock( + [ + { phase: "weird_phase" }, + { + done: true, + session: { id: "s1", title: "x", updatedAt: "now", model: "m", cacheLabel: "f16", messages: [] }, + assistant: { role: "assistant", text: "" }, + runtime: {}, + }, + ], + ); + 
vi.stubGlobal("fetch", fetchMock); + + const phaseCalls: Array<[string, number | undefined]> = []; + await generateChatStream( + { prompt: "hi" }, + { + onToken: () => {}, + onPhase: (phase, ttft) => phaseCalls.push([phase, ttft]), + onDone: () => {}, + onError: () => {}, + }, + ); + + expect(phaseCalls).toEqual([]); + }); +}); diff --git a/src/api.test.ts b/src/api.test.ts index ef128a9..e8b6ab5 100644 --- a/src/api.test.ts +++ b/src/api.test.ts @@ -5,7 +5,7 @@ vi.mock("@tauri-apps/api/core", () => ({ isTauri: vi.fn(() => false), })); -import { convertModel, generateChat, getWorkspace, loadModel, searchHubModels } from "./api"; +import { checkBackend, convertModel, generateChat, getWorkspace, loadModel, searchHubModels } from "./api"; import { mockWorkspace } from "./mockData"; const stubSession = { @@ -36,6 +36,22 @@ describe("desktop api helpers", () => { await expect(getWorkspace()).rejects.toThrow("offline"); }); + it("treats the backend as online when the session endpoint responds after health fails", async () => { + const fetchMock = vi.fn() + .mockRejectedValueOnce(new Error("health failed")) + .mockResolvedValueOnce({ + ok: true, + json: async () => ({ apiToken: "token" }), + }); + vi.stubGlobal("fetch", fetchMock); + + await expect(checkBackend()).resolves.toBe(true); + expect(fetchMock).toHaveBeenLastCalledWith( + "http://127.0.0.1:8876/api/auth/session", + expect.any(Object), + ); + }); + it("posts model load payloads to the sidecar", async () => { const mockRuntime = { ...mockWorkspace.runtime, diff --git a/src/api.ts b/src/api.ts index 1881b06..9ea28b5 100644 --- a/src/api.ts +++ b/src/api.ts @@ -263,7 +263,12 @@ export async function checkBackend(): Promise { await fetchJson("/api/health", 15000, { includeAuth: false }); return true; } catch { - return false; + try { + await fetchJson("/api/auth/session", 5000, { includeAuth: false }); + return true; + } catch { + return false; + } } } @@ -455,6 +460,69 @@ export async function createSession(title?: string): Promise { return result.session; } +/** + * Phase 2.5: generate a sibling variant for an assistant message + * using a different (currently-loaded) model. Returns the updated + * session payload with `messages[messageIndex].variants` populated. + */ +export async function addMessageVariant( + sessionId: string, + payload: { + messageIndex: number; + modelRef: string; + modelName: string; + canonicalRepo?: string | null; + source?: string; + path?: string; + backend?: string; + maxTokens?: number; + temperature?: number; + }, +): Promise { + const result = await postJson( + `/api/chat/sessions/${encodeURIComponent(sessionId)}/variants`, + payload, + 300000, + ); + return result.session; +} + +/** + * Phase 3.6: ask the loaded model to re-read an assistant message + * with a critic's framing and produce a Critique / Revised answer + * pair. Result attaches as a "Delve critique" variant on the + * message so the frontend's existing variant card surfaces it. + */ +export async function delveMessage( + sessionId: string, + messageIndex: number, +): Promise { + const result = await postJson( + `/api/chat/sessions/${encodeURIComponent(sessionId)}/delve/${messageIndex}`, + {}, + 300000, + ); + return result.session; +} + +/** + * Phase 2.4: fork an existing thread at a specific message index. + * Returns the new session, which the caller swaps active to so the + * user can continue divergently. Parent linkage is preserved on + * `parentSessionId` + `forkedAtMessageIndex`. 
+ */ +export async function forkChatSession( + sourceSessionId: string, + forkAtMessageIndex: number, + title?: string, +): Promise { + const result = await postJson( + `/api/chat/sessions/${encodeURIComponent(sourceSessionId)}/fork`, + { forkAtMessageIndex, title }, + ); + return result.session; +} + export async function updateSession(sessionId: string, payload: UpdateSessionPayload): Promise { const result = await patchJson(`/api/chat/sessions/${encodeURIComponent(sessionId)}`, payload); return result.session; @@ -464,14 +532,60 @@ export async function generateChat(payload: GeneratePayload): Promise("/api/chat/generate", payload, 300000); } +export type ChatStreamPhase = "prompt_eval" | "generating"; + export interface StreamCallbacks { onToken: (token: string) => void; onReasoning?: (reasoning: string) => void; onReasoningDone?: () => void; + onCancelled?: () => void; + /** + * Phase transition signal (Phase 2.0). Backend emits `prompt_eval` + * immediately when generation begins, then `generating` (with a + * `ttftSeconds` measurement) the moment the model produces its first + * token or reasoning fragment. Use this to render an explicit + * "Processing prompt..." indicator instead of a blank flashing cursor. + */ + onPhase?: (phase: ChatStreamPhase, ttftSeconds?: number) => void; + /** + * Phase 2.0.5-G: mid-stream panic signal. Backend emits at most once + * per turn when memory crosses critical floors (free < 0.5 GB OR + * pressure > 96%). Stream continues; user decides whether to cancel. + */ + onPanic?: (signal: { message: string; availableGb?: number; pressurePercent?: number }) => void; + /** + * Phase 2.0.5-I: mid-stream thermal warning. Backend emits when host + * is actively thermally throttling. Stream continues. + */ + onThermalWarning?: (signal: { state: "moderate" | "critical"; message: string }) => void; + /** + * Phase 3.3: per-token logprob batches. The backend forwards + * llama-server's `logprobs.content` shape verbatim — each entry has + * the chosen token + top-k alternatives. Only fires when the request + * had `logprobs: N` set. + */ + onTokenLogprobs?: (entries: Array<{ + token: string | null; + logprob: number | null; + alternatives: Array<{ token: string | null; logprob: number | null }>; + }>) => void; onDone: (response: GenerateResponse) => void; onError: (error: string) => void; } +/** + * Ask the backend to cancel an in-flight chat generation. The streaming loop + * checks this flag between events and stops within ~one tick, persisting + * whatever output has accumulated. Safe to call when no generation is active. + */ +export async function cancelChatGeneration(sessionId: string): Promise<{ sessionId: string; cancelled: boolean; wasActive: boolean }> { + return await postJson<{ sessionId: string; cancelled: boolean; wasActive: boolean }>( + `/api/chat/generate/${encodeURIComponent(sessionId)}/cancel`, + {}, + 10000, + ); +} + export async function generateChatStream( payload: GeneratePayload, callbacks: StreamCallbacks, @@ -542,6 +656,30 @@ export async function generateChatStream( if (event.reasoningDone) { callbacks.onReasoningDone?.(); } + if (event.cancelled) { + callbacks.onCancelled?.(); + } + if (event.phase === "prompt_eval" || event.phase === "generating") { + const ttft = typeof event.ttftSeconds === "number" ? 
event.ttftSeconds : undefined; + callbacks.onPhase?.(event.phase, ttft); + } + if (event.panic === true && typeof event.message === "string") { + callbacks.onPanic?.({ + message: event.message, + availableGb: typeof event.availableGb === "number" ? event.availableGb : undefined, + pressurePercent: typeof event.pressurePercent === "number" ? event.pressurePercent : undefined, + }); + } + if (event.thermalWarning === true && typeof event.message === "string" + && (event.state === "moderate" || event.state === "critical")) { + callbacks.onThermalWarning?.({ + state: event.state, + message: event.message, + }); + } + if (Array.isArray(event.tokenLogprobs) && event.tokenLogprobs.length > 0) { + callbacks.onTokenLogprobs?.(event.tokenLogprobs); + } if (event.done) { callbacks.onDone({ session: event.session, @@ -949,6 +1087,102 @@ export async function getLongLiveInstallStatus(): Promise { return await fetchJson("/api/setup/install-longlive/status", 10000); } +// --- mlx-video Wan install (FU-025) ------------------------------- +// +// Apple-Silicon only. Same pattern as LongLive: kick off a background +// job (download raw HF weights → run mlx_video.models.wan_2.convert → +// verify), poll status, render attempts via InstallLogPanel. The +// shared LongLive panel variant works as-is — we just supply the +// matching state shape. + +export interface WanInstallAttempt { + phase?: string; + package?: string; + /** Always undefined for Wan; carried for the shared InstallLogPanel union. */ + indexUrl?: string; + ok: boolean; + output: string; +} + +export interface WanInstallJobState { + id: string; + phase: "idle" | "preflight" | "downloading" | "converting" | "verifying" | "done" | "error"; + message: string; + repo: string | null; + packageCurrent: string | null; + packageIndex: number; + packageTotal: number; + percent: number; + outputDir: string | null; + error: string | null; + startedAt: number; + finishedAt: number; + attempts: WanInstallAttempt[]; + done: boolean; +} + +export interface WanConvertStatusFields { + repo: string; + converted: boolean; + outputDir: string; + hasTransformer: boolean; + hasMoeExperts: boolean; + hasVae: boolean; + hasTextEncoder: boolean; + note: string | null; +} + +export interface WanInventoryItem { + repo: string; + approxRawSizeGb: number | null; + converted: boolean; + status: WanConvertStatusFields; +} + +export interface WanInventory { + items: WanInventoryItem[]; + convertRoot: string; + rawRoot: string; +} + +export async function startWanInstall( + repo: string, + options: { + dtype?: "bfloat16" | "float16" | "float32"; + quantize?: boolean; + bits?: 4 | 8; + groupSize?: 32 | 64 | 128; + cleanupRaw?: boolean; + } = {}, +): Promise { + return await postJson( + "/api/setup/install-mlx-video-wan", + { + repo, + dtype: options.dtype ?? "bfloat16", + quantize: options.quantize ?? false, + bits: options.bits ?? 4, + groupSize: options.groupSize ?? 64, + cleanupRaw: options.cleanupRaw ?? false, + }, + 15000, + ); +} + +export async function getWanInstallStatus(): Promise { + return await fetchJson( + "/api/setup/install-mlx-video-wan/status", + 10000, + ); +} + +export async function getWanInventory(): Promise { + return await fetchJson( + "/api/setup/mlx-video-wan/inventory", + 10000, + ); +} + // --- Diagnostics --------------------------------------------------- // // Surfaced in Settings → Diagnostics. 
The snapshot is a structured dump @@ -1134,6 +1368,39 @@ export async function refreshCapabilities(): Promise> { return result.capabilities; } +/** + * FU-022: LLM-based prompt enhancer. Rewrites a short user prompt into + * the structured format the requested image / video model was trained + * on. Apple Silicon path uses mlx_lm with a small instruct model + * (default mlx-community/Qwen2.5-0.5B-Instruct-4bit, ~700 MB). Other + * platforms use the backend's deterministic template fallback. + */ +export interface PromptEnhanceResult { + enhanced: string; + note: string | null; + modelUsed: string | null; + family: string; +} + +export async function enhancePromptViaLLM(payload: { + prompt: string; + repo: string; + modelId?: string; + maxTokens?: number; +}): Promise { + // Long timeout: the first call materialises the model (~2-3s on + // M-series cold cache), subsequent calls are sub-second. 30s is + // enough headroom for first-call without waiting forever if the + // model fails to load. + const body = { + prompt: payload.prompt, + repo: payload.repo, + modelId: payload.modelId ?? null, + maxTokens: payload.maxTokens ?? 256, + }; + return await postJson("/api/prompt/enhance", body, 30000); +} + export async function stopManagedBackend(): Promise { if (!isTauri()) { return null; diff --git a/src/components/AcceptedTokenOverlay.tsx b/src/components/AcceptedTokenOverlay.tsx new file mode 100644 index 0000000..031b0aa --- /dev/null +++ b/src/components/AcceptedTokenOverlay.tsx @@ -0,0 +1,90 @@ +import { useState } from "react"; +import type { GenerationMetrics } from "../types"; + +/** + * Phase 3.1: DDTree accepted-span overlay. + * + * Renders a collapsible block that shows the assistant's response + * with draft-accepted character ranges tinted (green) vs + * verifier-decoded ranges (default). Substrate truth view — + * doesn't replace the markdown body, sits alongside it so users + * can see how aggressively DDTree's draft acceptance kicked in. + * + * Visible only when the message metrics carry accepted-span data, + * which requires speculative decoding to have run on the turn. + * + * The text in `acceptedTokenText` is the per-token-decoded string + * which can differ slightly from the markdown body (no formatting, + * sometimes BPE artifacts) — that's OK; the overlay is for + * substrate diagnostics, not display. + */ +export interface AcceptedTokenOverlayProps { + metrics: GenerationMetrics; +} + +interface SpanStats { + totalChars: number; + acceptedChars: number; + acceptedRatio: number; + spanCount: number; +} + +export function computeSpanStats( + spans: AcceptedTokenOverlayProps["metrics"]["acceptedSpans"], +): SpanStats { + if (!spans || spans.length === 0) { + return { totalChars: 0, acceptedChars: 0, acceptedRatio: 0, spanCount: 0 }; + } + let total = 0; + let accepted = 0; + for (const span of spans) { + total += span.length; + if (span.accepted) accepted += span.length; + } + return { + totalChars: total, + acceptedChars: accepted, + acceptedRatio: total > 0 ? accepted / total : 0, + spanCount: spans.length, + }; +} + +export function AcceptedTokenOverlay({ metrics }: AcceptedTokenOverlayProps) { + const [open, setOpen] = useState(false); + const spans = metrics.acceptedSpans; + const text = metrics.acceptedTokenText; + if (!spans?.length || !text) return null; + const stats = computeSpanStats(spans); + + return ( +
setOpen((event.currentTarget as HTMLDetailsElement).open)} + > + + DDTree acceptance overlay + + {(stats.acceptedRatio * 100).toFixed(1)}% of {stats.totalChars} chars + accepted from draft · {stats.spanCount} runs + + +

+ Green ranges = tokens the verifier accepted from the draft model + without re-decoding. Plain ranges = tokens the verifier produced + directly. Higher acceptance means DDTree saved more compute. +

+
+        {spans.map((span, idx) => (
+          
+            {text.slice(span.start, span.start + span.length)}
+          
+        ))}
+      
+
+ ); +} diff --git a/src/components/ChatPerfStrip.tsx b/src/components/ChatPerfStrip.tsx new file mode 100644 index 0000000..72695ad --- /dev/null +++ b/src/components/ChatPerfStrip.tsx @@ -0,0 +1,104 @@ +import type { GenerationMetrics, PerfTelemetry } from "../types"; + +/** + * Phase 3.5: cross-platform per-turn perf telemetry strip. + * + * Renders a compact row of substrate-side host metrics sampled at + * the moment the turn finalised — CPU %, GPU %, available memory, + * thermal state. Sits below the substrate routing badge to give + * operators a thermal / load read alongside the runtime decision. + * + * All fields are optional: macOS today reads thermal via pmset, + * Windows / Linux fall through to None. The strip omits any field + * that's null so unsupported platforms still show a useful subset. + */ +export interface ChatPerfStripProps { + metrics: GenerationMetrics; +} + +interface PerfChip { + key: string; + label: string; + title: string; + tone: "default" | "warn" | "alert"; +} + +const THERMAL_TONE: Record = { + nominal: "default", + moderate: "warn", + critical: "alert", +}; + +function buildPerfChips(telemetry: PerfTelemetry, tokS: number | null): PerfChip[] { + const chips: PerfChip[] = []; + + if (tokS != null && tokS > 0) { + chips.push({ + key: "toks", + label: `${tokS.toFixed(1)} tok/s`, + title: `Decode throughput for this turn (${tokS.toFixed(2)} tokens/sec)`, + tone: tokS < 1 ? "alert" : tokS < 5 ? "warn" : "default", + }); + } + + if (telemetry.cpuPercent != null) { + chips.push({ + key: "cpu", + label: `CPU ${telemetry.cpuPercent.toFixed(0)}%`, + title: `CPU utilisation at turn finalisation (${telemetry.cpuPercent.toFixed(1)}%)`, + tone: telemetry.cpuPercent > 90 ? "warn" : "default", + }); + } + + if (telemetry.gpuPercent != null) { + chips.push({ + key: "gpu", + label: `GPU ${telemetry.gpuPercent.toFixed(0)}%`, + title: `GPU / accelerator utilisation at turn finalisation (${telemetry.gpuPercent.toFixed(1)}%)`, + tone: telemetry.gpuPercent > 90 ? "warn" : "default", + }); + } + + if (telemetry.availableMemoryGb != null) { + chips.push({ + key: "mem", + label: `${telemetry.availableMemoryGb.toFixed(1)} GB free`, + title: `Available RAM at turn finalisation (${telemetry.availableMemoryGb.toFixed(2)} GB)`, + tone: telemetry.availableMemoryGb < 2 ? "alert" : telemetry.availableMemoryGb < 4 ? "warn" : "default", + }); + } + + if (telemetry.thermalState) { + chips.push({ + key: "thermal", + label: `Thermal: ${telemetry.thermalState}`, + title: `Host thermal state (${telemetry.thermalState}). Critical means active throttling.`, + tone: THERMAL_TONE[telemetry.thermalState] ?? "default", + }); + } + + return chips; +} + +export function ChatPerfStrip({ metrics }: ChatPerfStripProps) { + const telemetry = metrics.perfTelemetry; + if (!telemetry) return null; + const chips = buildPerfChips(telemetry, metrics.tokS ?? null); + if (chips.length === 0) return null; + return ( +
+ {chips.map((chip) => ( + + {chip.label} + + ))} +
+ ); +} + +// Exported for unit testing. +export { buildPerfChips }; diff --git a/src/components/CodeBlock.tsx b/src/components/CodeBlock.tsx new file mode 100644 index 0000000..85c6c0f --- /dev/null +++ b/src/components/CodeBlock.tsx @@ -0,0 +1,80 @@ +import { useEffect, useState } from "react"; +import { Prism as SyntaxHighlighter } from "react-syntax-highlighter"; +import { oneDark } from "react-syntax-highlighter/dist/esm/styles/prism"; + +interface CodeBlockProps { + code: string; + language?: string; +} + +const COPY_RESET_MS = 1500; + +export function CodeBlock({ code, language }: CodeBlockProps) { + const [copied, setCopied] = useState(false); + const lang = (language ?? "").toLowerCase().trim(); + const displayLang = lang || "text"; + + useEffect(() => { + if (!copied) return; + const timer = window.setTimeout(() => setCopied(false), COPY_RESET_MS); + return () => window.clearTimeout(timer); + }, [copied]); + + const handleCopy = async () => { + try { + await navigator.clipboard.writeText(code); + setCopied(true); + } catch { + // Clipboard unavailable; silently no-op + } + }; + + return ( +
+
+ {displayLang} + +
+ + {code.replace(/\n$/, "")} + +
+ ); +} diff --git a/src/components/CudaTorchLogPanel.tsx b/src/components/CudaTorchLogPanel.tsx new file mode 100644 index 0000000..cf076c0 --- /dev/null +++ b/src/components/CudaTorchLogPanel.tsx @@ -0,0 +1,131 @@ +import { useEffect, useRef } from "react"; +import type { CudaTorchInstallResult } from "../api"; + +// Collapsible terminal-style log for the inline "Install CUDA torch" +// action in Image / Video Studio. Mirrors the visual shape of +// InstallLogPanel (single scrollable
<pre>, [ OK ]/[FAIL] markers per
+// attempt, target-dir / Python meta line) but keyed off the
+// CudaTorchInstallResult shape returned by /api/setup/install-cuda-torch
+// rather than the GpuBundleJobState progress lifecycle. The endpoint
+// is synchronous -- it walks cu124/cu126/cu128/cu121 in order and
+// returns the full attempts array on completion -- so there's no
+// streaming to drive an in-progress phase. We expose only the final
+// result, but we still want the per-index pip output visible for
+// debugging because users hitting "No CUDA wheel for this Python" or
+// resolver clashes need to see which index failed and why.
+//
+// Collapsed by default on success; auto-opens on failure so the user
+// doesn't have to click to find out what went wrong.
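+//
+// For reference, the slice of the CudaTorchInstallResult payload this
+// panel reads (field names taken from the accesses further down; the
+// optionality shown is an assumption -- the authoritative type is the
+// one exported from ../api):
+//
+//   {
+//     ok: boolean;
+//     output: string;
+//     indexUrl?: string;
+//     targetDir?: string;
+//     pythonVersion?: string;
+//     noWheelForPython?: boolean;
+//     requiresRestart?: boolean;
+//     attempts: Array<{ ok: boolean; indexUrl: string; output: string }>;
+//   }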
+
+interface CudaTorchLogPanelProps {
+  result: CudaTorchInstallResult | null;
+}
+
+export function CudaTorchLogPanel({ result }: CudaTorchLogPanelProps) {
+  const scrollRef = useRef(null);
+  const attemptCount = result?.attempts.length ?? 0;
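+  // Keep the log pinned to the newest output: whenever another attempt's
+  // result lands, jump the scroll position to the bottom of the panel.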
+  useEffect(() => {
+    const el = scrollRef.current;
+    if (!el) return;
+    el.scrollTop = el.scrollHeight;
+  }, [attemptCount]);
+
+  if (!result) return null;
+
+  const openByDefault = !result.ok;
+  const summary = result.ok
+    ? `Install complete — see log${result.indexUrl ? ` (${shortIndex(result.indexUrl)})` : ""}`
+    : `Install failed — see log${result.attempts.length > 0 ? ` (${result.attempts.length} attempt${result.attempts.length === 1 ? "" : "s"})` : ""}`;
+
+  return (
+    
+ {summary} +
+ {renderMeta(result)} +
+          {renderTerminal(result)}
+        
+
+
+ ); +} + +function renderMeta(result: CudaTorchInstallResult): React.ReactNode { + const fragments: string[] = []; + if (result.targetDir) fragments.push(`Target: ${result.targetDir}`); + if (result.pythonVersion) fragments.push(`Python ${result.pythonVersion}`); + if (result.indexUrl) fragments.push(`CUDA index: ${result.indexUrl}`); + if (result.noWheelForPython) fragments.push("No CUDA wheel for this Python"); + if (result.requiresRestart) fragments.push("Restart Backend to activate"); + if (fragments.length === 0) return null; + return
{fragments.join(" · ")}
; +} + +function renderTerminal(result: CudaTorchInstallResult): string { + const lines: string[] = []; + for (const attempt of result.attempts) { + const marker = attempt.ok ? "[ OK ]" : "[FAIL]"; + lines.push(`${marker} torch (from ${attempt.indexUrl})`); + if (attempt.output) { + const body = filterPipNoise(attempt.output); + if (body) { + for (const bodyLine of body.split(/\r?\n/)) { + lines.push(` ${bodyLine}`); + } + } + } + lines.push(""); + } + // Some failure modes (e.g. no extras dir resolvable) come back with + // empty attempts but a populated top-level output -- show that so + // users aren't staring at a blank panel. + if (result.attempts.length === 0 && result.output) { + const body = filterPipNoise(result.output); + if (body) { + for (const bodyLine of body.split(/\r?\n/)) { + lines.push(bodyLine); + } + } + } + return lines.join("\n").trimEnd() || "(no output captured)"; +} + +function shortIndex(url: string): string { + return url.replace("https://download.pytorch.org/whl/", ""); +} + +// Trim pip's noisy resolver complaints + cap the displayed log at the +// last 80 lines so the panel doesn't scroll to the bottom of the +// universe when torch downloads ~2.5 GB. Mirror of the helper in +// InstallLogPanel -- copied rather than shared so this panel has no +// runtime dependency on the GPU-bundle job shape. +const PIP_NOISE_PATTERNS = [ + /^ERROR: pip's dependency resolver does not currently take into account/i, + /^\w[\w-]+\s+[\d.]+\s+requires\s+[\w-]+(?:[<>=!~].+)?, which is not installed\.$/i, +]; + +function filterPipNoise(output: string): string { + const lines = output.split(/\r?\n/); + const filtered: string[] = []; + let inNoiseBlock = false; + for (const line of lines) { + const isNoiseHeader = PIP_NOISE_PATTERNS[0].test(line); + const isNoiseDetail = PIP_NOISE_PATTERNS[1].test(line.trim()); + if (isNoiseHeader) { + inNoiseBlock = true; + continue; + } + if (inNoiseBlock && (isNoiseDetail || line.trim() === "")) { + if (isNoiseDetail) continue; + inNoiseBlock = false; + continue; + } + inNoiseBlock = false; + filtered.push(line); + } + if (filtered.length > 80) { + const kept = filtered.slice(-80); + return `... (${filtered.length - 80} earlier lines omitted)\n${kept.join("\n")}`; + } + return filtered.join("\n"); +} diff --git a/src/components/KvStrategyChip.tsx b/src/components/KvStrategyChip.tsx new file mode 100644 index 0000000..bd3bfcb --- /dev/null +++ b/src/components/KvStrategyChip.tsx @@ -0,0 +1,186 @@ +import { useEffect, useMemo, useRef, useState } from "react"; +import type { SystemStats } from "../types"; +import type { KvStrategyOverride } from "../features/chat/kvStrategyOverride"; +import { filterTextStrategies } from "./kvStrategyFilter"; + +/** + * Phase 3.2: per-turn KV strategy chip for the composer. + * + * Lets the user change cache strategy (TurboQuant / ChaosEngine / + * Native f16, etc.) and bit width without touching launch settings. + * The chip shows the *effective* strategy — either the override or + * the session default — and clicking it opens a popover with the + * available strategies plus a clear-override action. + * + * The backend reloads the runtime transparently when the requested + * cacheStrategy / cacheBits don't match the currently-loaded profile. + * Strategies marked `available: false` are still rendered (greyed) + * with a tooltip explaining the gap so users know the option exists. 
+ */ +export interface KvStrategyChipProps { + override: KvStrategyOverride | null; + defaultStrategy: string; + defaultBits: number; + availableStrategies: SystemStats["availableCacheStrategies"]; + /** + * Phase 3.2 hotfix: the loaded model's engine. Used to filter + * strategies down to ones the substrate can actually run — e.g. + * MLX runtime can't use llama.cpp-only RotorQuant / ChaosEngine, + * and TeaCache is diffusion-only. Pass undefined / null when no + * model is loaded; the chip then shows all text-domain strategies. + */ + engine?: string | null; + onChange: (override: KvStrategyOverride | null) => void; + disabled?: boolean; +} + +function formatBits(bits: number): string { + if (bits <= 0) return "f16"; + return `${bits}-bit`; +} + +function formatLabel(strategy: string, bits: number): string { + return `${strategy} ${formatBits(bits)}`; +} + +export function KvStrategyChip({ + override, + defaultStrategy, + defaultBits, + availableStrategies, + engine, + onChange, + disabled, +}: KvStrategyChipProps) { + const [open, setOpen] = useState(false); + const wrapRef = useRef(null); + + useEffect(() => { + if (!open) return; + const handler = (event: MouseEvent) => { + if (wrapRef.current && !wrapRef.current.contains(event.target as Node)) { + setOpen(false); + } + }; + document.addEventListener("mousedown", handler); + return () => document.removeEventListener("mousedown", handler); + }, [open]); + + const effectiveStrategy = override?.strategy ?? defaultStrategy; + const effectiveBits = override?.bits ?? defaultBits; + const isOverridden = override != null; + + // Phase 3.2 hotfix: filter strategies to ones the loaded engine + // can actually run. Drops TeaCache (diffusion-only) and removes + // engine-incompatible options so picking them doesn't 500. + const filteredStrategies = useMemo( + () => filterTextStrategies(availableStrategies, engine), + [availableStrategies, engine], + ); + + // Trigger label uses the strategy's metadata regardless of whether + // it survived the filter — so a session whose default strategy got + // filtered out (e.g. session loaded under llama.cpp, current model + // is MLX) still shows the right label on the trigger. + void availableStrategies?.find((s) => s.id === effectiveStrategy); + + return ( +
+ + {open ? ( +
+
+ KV cache for next turn + Switching reloads the runtime if needed. +
+ {filteredStrategies.map((strategy) => { + const isActive = strategy.id === effectiveStrategy; + const range = strategy.bitRange?.length ? strategy.bitRange : [0]; + return ( +
+
+ + {strategy.name} + {!strategy.available ? ( + + unavailable + + ) : null} + +
+
+ {range.map((bits) => { + const label = formatBits(bits); + const isSelected = isActive && bits === effectiveBits; + return ( + + ); + })} +
+
+ ); + })} + {isOverridden ? ( + + ) : null} +
+ ) : null} +
+ ); +} diff --git a/src/components/LaunchModal.tsx b/src/components/LaunchModal.tsx index ba0d7a5..0c50a22 100644 --- a/src/components/LaunchModal.tsx +++ b/src/components/LaunchModal.tsx @@ -16,6 +16,7 @@ export interface LaunchModalProps { preview: PreviewMetrics; availableMemoryGb: number; totalMemoryGb: number; + gpuVramTotalGb?: number | null; availableCacheStrategies: SystemStats["availableCacheStrategies"] | undefined; dflashInfo?: SystemStats["dflash"]; installingPackage: string | null; @@ -37,6 +38,7 @@ export function LaunchModal({ preview, availableMemoryGb, totalMemoryGb, + gpuVramTotalGb, availableCacheStrategies, dflashInfo, installingPackage, @@ -75,6 +77,7 @@ export function LaunchModal({ preview={preview} availableMemoryGb={availableMemoryGb} totalMemoryGb={totalMemoryGb} + gpuVramTotalGb={gpuVramTotalGb} availableCacheStrategies={availableCacheStrategies} dflashInfo={dflashInfo} installingPackage={installingPackage} diff --git a/src/components/LiveProgress.tsx b/src/components/LiveProgress.tsx index 9f2f386..d96da63 100644 --- a/src/components/LiveProgress.tsx +++ b/src/components/LiveProgress.tsx @@ -152,6 +152,18 @@ export function LiveProgress({ /> + {realProgress?.active && realProgress.thumbnail ? ( +
+ Live denoise preview + + Live preview · TAESD decode + +
+ ) : null} +
{phases.map((phase, i) => { const state = i < activeIndex ? "done" : i === activeIndex ? "active" : "pending"; diff --git a/src/components/LogprobSummary.tsx b/src/components/LogprobSummary.tsx new file mode 100644 index 0000000..1f8a23a --- /dev/null +++ b/src/components/LogprobSummary.tsx @@ -0,0 +1,101 @@ +import { useState } from "react"; +import type { TokenLogprob } from "../types"; + +/** + * Phase 3.3: per-message logprob summary. + * + * Renders a collapsible block beneath the assistant bubble that + * shows confidence stats + a hover-revealed list of any low-confidence + * tokens with their top alternatives. We deliberately don't replace + * the markdown body with hoverable token spans — that breaks + * formatting + accessibility — instead we surface a compact summary + * the user can drill into when something looks off. + * + * Visible only when message.tokenLogprobs is populated, which + * requires `advancedLogprobs` to be enabled in settings. + */ +export interface LogprobSummaryProps { + entries: TokenLogprob[]; +} + +interface SummaryStats { + count: number; + avgLogprob: number; + lowConfidenceCount: number; +} + +function computeStats(entries: TokenLogprob[]): SummaryStats { + const valid = entries.filter((e) => typeof e.logprob === "number" && Number.isFinite(e.logprob)); + if (valid.length === 0) { + return { count: entries.length, avgLogprob: 0, lowConfidenceCount: 0 }; + } + const sum = valid.reduce((acc, e) => acc + (e.logprob as number), 0); + // logprob < -3.0 ≈ probability < 5%. Flag those as low-confidence + // so the user can see where the model was uncertain. + const lowConfidenceCount = valid.filter((e) => (e.logprob as number) < -3.0).length; + return { + count: entries.length, + avgLogprob: sum / valid.length, + lowConfidenceCount, + }; +} + +function lowConfidenceEntries(entries: TokenLogprob[]): TokenLogprob[] { + return entries + .filter((e) => typeof e.logprob === "number" && (e.logprob as number) < -3.0) + .slice(0, 12); +} + +export function LogprobSummary({ entries }: LogprobSummaryProps) { + const [open, setOpen] = useState(false); + if (!entries?.length) return null; + const stats = computeStats(entries); + const flagged = lowConfidenceEntries(entries); + + return ( +
setOpen((event.currentTarget as HTMLDetailsElement).open)} + > + + Token confidence + + {stats.count} tokens · avg logprob {stats.avgLogprob.toFixed(2)} + {stats.lowConfidenceCount > 0 ? ` · ${stats.lowConfidenceCount} low confidence` : ""} + + + {flagged.length === 0 ? ( +

No low-confidence tokens — model was steady throughout.

+ ) : ( +
+

+ Tokens emitted with probability under ~5%. Hover for the top + alternatives the model considered. +

+
    + {flagged.map((entry, idx) => ( +
  • `${JSON.stringify(alt.token ?? "")} (${(alt.logprob ?? 0).toFixed(2)})`) + .join("\n") + : "No alternatives recorded." + } + > + {JSON.stringify(entry.token ?? "")} + + logprob {(entry.logprob ?? 0).toFixed(2)} + +
  • + ))} +
+
+ )} +
+ ); +} + +export { computeStats, lowConfidenceEntries }; diff --git a/src/components/ModelLaunchModal.tsx b/src/components/ModelLaunchModal.tsx index 432ce6c..2688a8d 100644 --- a/src/components/ModelLaunchModal.tsx +++ b/src/components/ModelLaunchModal.tsx @@ -1,9 +1,43 @@ import { useEffect, useState } from "react"; import { RuntimeControls } from "./RuntimeControls"; import { number, sizeLabel } from "../utils"; -import type { LaunchPreferences, PreviewMetrics, StrategyInstallLog, SystemStats } from "../types"; +import type { LaunchPreferences, ModelCapabilities, PreviewMetrics, StrategyInstallLog, SystemStats } from "../types"; import type { ChatModelOption } from "../types/chat"; +/** + * Phase 2.11: typed capability badges for the picker. Mirrors the + * map in ChatHeader so the same flag surfaces with the same label + * across the loaded-model header and the picker. + */ +const CAPABILITY_BADGES: Array<{ + flag: keyof Omit; + label: string; + title: string; +}> = [ + { flag: "supportsVision", label: "Vision", title: "Model accepts image input" }, + { flag: "supportsTools", label: "Tools", title: "Model supports tool / function calling" }, + { flag: "supportsReasoning", label: "Reasoning", title: "Model emits a reasoning trace" }, + { flag: "supportsCoding", label: "Code", title: "Model is tuned for code generation" }, + { flag: "supportsAgents", label: "Agents", title: "Model is tuned for multi-step agentic flows" }, + { flag: "supportsAudio", label: "Audio", title: "Model accepts audio input" }, + { flag: "supportsVideo", label: "Video", title: "Model accepts video input" }, +]; + +function renderCapabilityBadges(capabilities: ModelCapabilities | null | undefined) { + if (!capabilities) return null; + const active = CAPABILITY_BADGES.filter((entry) => capabilities[entry.flag]); + if (active.length === 0) return null; + return ( + + {active.map((entry) => ( + + {entry.label} + + ))} + + ); +} + export interface ModelLaunchModalProps { open: boolean; title?: string; @@ -16,6 +50,7 @@ export interface ModelLaunchModalProps { preview: PreviewMetrics; availableMemoryGb: number; totalMemoryGb: number; + gpuVramTotalGb?: number | null; availableCacheStrategies: SystemStats["availableCacheStrategies"] | undefined; dflashInfo?: SystemStats["dflash"]; installingPackage: string | null; @@ -41,6 +76,7 @@ export function ModelLaunchModal({ preview, availableMemoryGb, totalMemoryGb, + gpuVramTotalGb, availableCacheStrategies, dflashInfo, installingPackage, @@ -95,6 +131,7 @@ export function ModelLaunchModal({ {selectedOption.contextWindow ? {selectedOption.contextWindow} : null} {selectedOption.group}
+ {renderCapabilityBadges(selectedOption.capabilities)} @@ -160,6 +198,7 @@ export function ModelLaunchModal({ preview={preview} availableMemoryGb={availableMemoryGb} totalMemoryGb={totalMemoryGb} + gpuVramTotalGb={gpuVramTotalGb} availableCacheStrategies={availableCacheStrategies} onInstallPackage={onInstallPackage} installingPackage={installingPackage} diff --git a/src/components/PerformancePreview.tsx b/src/components/PerformancePreview.tsx index 80e51a8..e8bff6a 100644 --- a/src/components/PerformancePreview.tsx +++ b/src/components/PerformancePreview.tsx @@ -6,6 +6,13 @@ interface PerformancePreviewProps { preview: PreviewMetrics; availableMemoryGb: number; totalMemoryGb: number; + /** Discrete GPU VRAM in GB (CUDA card on Windows / Linux). When set, + * the cache-fit check uses this as the binding constraint -- llama.cpp + * places the KV cache on GPU with full offload, so a 60 GB cache on a + * 24 GB 4090 fails on VRAM long before it would have failed on system + * RAM. Null on Apple Silicon (unified memory already in + * totalMemoryGb) or hosts with no detected discrete GPU. */ + gpuVramTotalGb?: number | null; compact?: boolean; actualDiskSizeGb?: number; } @@ -21,9 +28,9 @@ function getSpeedLabel(tokS: number): { label: string; className: string } | nul return { label: "Very fast", className: "perf-preview__speed-label--fast" }; } -export function PerformancePreview({ preview, availableMemoryGb, totalMemoryGb, compact, actualDiskSizeGb }: PerformancePreviewProps) { +export function PerformancePreview({ preview, availableMemoryGb, totalMemoryGb, gpuVramTotalGb, compact, actualDiskSizeGb }: PerformancePreviewProps) { const diskGb = actualDiskSizeGb ?? preview.diskSizeGb; - const fitStatus = getCacheFitStatus(preview.optimizedCacheGb, diskGb, totalMemoryGb, preview.bits); + const fitStatus = getCacheFitStatus(preview.optimizedCacheGb, diskGb, totalMemoryGb, preview.bits, gpuVramTotalGb); const cacheDelta = preview.baselineCacheGb - preview.optimizedCacheGb; const qualityDelta = preview.qualityPercent - 100; const cacheMax = Math.max(preview.baselineCacheGb, totalMemoryGb * 0.6, 1); diff --git a/src/components/PromptEnhanceButton.tsx b/src/components/PromptEnhanceButton.tsx new file mode 100644 index 0000000..2d390d3 --- /dev/null +++ b/src/components/PromptEnhanceButton.tsx @@ -0,0 +1,65 @@ +/** + * FU-022: Prompt enhancer button for the Image / Video Studio prompt + * fields. Click → POST /api/prompt/enhance with the current prompt + + * the selected variant's repo id; on success, replace the prompt + * textarea via the parent's setter and surface a 1-line note as a + * tooltip on the button (so the user knows which model rewrote it). + * + * Apple Silicon path uses the small LLM rewrite. Other platforms use + * the backend's deterministic template fallback so the button still + * changes short prompts without adding runtime cost. 
+ */ +import { useState } from "react"; +import { enhancePromptViaLLM } from "../api"; + +export interface PromptEnhanceButtonProps { + prompt: string; + repo: string; + onEnhanced: (next: string) => void; +} + +export function PromptEnhanceButton({ + prompt, + repo, + onEnhanced, +}: PromptEnhanceButtonProps) { + const [busy, setBusy] = useState(false); + const [note, setNote] = useState(null); + + const trimmed = prompt.trim(); + const disabled = busy || !trimmed || !repo; + + const handleClick = async () => { + if (disabled) return; + setBusy(true); + setNote(null); + try { + const result = await enhancePromptViaLLM({ prompt: trimmed, repo }); + // Only replace when the model actually changed the prompt — when + // the helper falls back (no Apple Silicon, mlx_lm missing, model + // not cached), enhanced === original and we just surface the + // note instead of clobbering the textarea. + if (result.enhanced && result.enhanced !== trimmed) { + onEnhanced(result.enhanced); + } + setNote(result.note); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + setNote(`Enhancer error: ${message}`); + } finally { + setBusy(false); + } + }; + + return ( + + ); +} diff --git a/src/components/PromptPhaseIndicator.tsx b/src/components/PromptPhaseIndicator.tsx new file mode 100644 index 0000000..d0269b7 --- /dev/null +++ b/src/components/PromptPhaseIndicator.tsx @@ -0,0 +1,49 @@ +import { useEffect, useState } from "react"; +import type { ChatStreamPhase } from "../types"; + +interface PromptPhaseIndicatorProps { + phase: ChatStreamPhase; +} + +const PROMPT_EVAL_LABEL = "Processing prompt"; +const GENERATING_LABEL = "Generating"; + +/** + * Live phase indicator shown below an assistant placeholder while a + * generation is in flight. Replaces the bare blinking cursor with an + * explicit "Processing prompt..." or "Generating..." label plus an elapsed + * counter, so the user knows the model is working through the prompt + * before the first token arrives. + * + * Updates internally on a 250ms tick — the parent doesn't need to drive + * re-renders for the timer. + */ +export function PromptPhaseIndicator({ phase }: PromptPhaseIndicatorProps) { + const [elapsedMs, setElapsedMs] = useState(0); + + // Reset the counter whenever the phase flips so "Generating" starts at 0s + // again rather than continuing from prompt-eval seconds. + useEffect(() => { + const startedAt = Date.now(); + setElapsedMs(0); + const timer = window.setInterval(() => { + setElapsedMs(Date.now() - startedAt); + }, 250); + return () => window.clearInterval(timer); + }, [phase]); + + const seconds = Math.floor(elapsedMs / 1000); + const tenths = Math.floor((elapsedMs % 1000) / 100); + const formatted = `${seconds}.${tenths}s`; + + const label = phase === "prompt_eval" ? PROMPT_EVAL_LABEL : GENERATING_LABEL; + const className = `prompt-phase-indicator prompt-phase-indicator--${phase}`; + + return ( +
+
+ ); +} diff --git a/src/components/ReasoningPanel.tsx b/src/components/ReasoningPanel.tsx index 0d5b25b..ef9bf9b 100644 --- a/src/components/ReasoningPanel.tsx +++ b/src/components/ReasoningPanel.tsx @@ -1,58 +1,88 @@ import { useEffect, useRef, useState } from "react"; -import Markdown from "react-markdown"; +import { RichMarkdown } from "./RichMarkdown"; interface ReasoningPanelProps { text?: string | null; streaming?: boolean; } -function lastLine(text: string): string { - const lines = text.split("\n").filter(Boolean); - return lines.length > 0 ? lines[lines.length - 1] : ""; +/** + * Phase 2.5+ post-fix: take the last N non-empty lines from the + * cumulative reasoning text. The streaming preview shows these so + * the user sees something meaningful even when collapsed mid-stream. + * Older revisions returned a single line, which made the preview + * jump abruptly when the model emitted short tokens. + */ +export function lastLines(text: string, count: number): string { + const lines = text.split("\n").map((l) => l.trim()).filter(Boolean); + if (lines.length === 0) return ""; + return lines.slice(-count).join(" · "); +} + +/** + * Models often emit a leading newline after `` and an extra + * blank line between the first thought and the rest, which renders + * as a tall visual gap inside the reasoning panel. Trim leading + * whitespace and collapse the very first paragraph break so the + * panel reads as one continuous thought stream. + */ +export function tidyReasoningForDisplay(text: string): string { + const trimmed = text.replace(/^[\s\n]+/, ""); + // Collapse the *first* `\n\n` (or longer) to a single newline so the + // first paragraph sits flush against subsequent content. Mid-stream + // paragraph breaks are preserved. + return trimmed.replace(/^([^\n]+)\n{2,}/, "$1\n"); } export function ReasoningPanel({ text, streaming = false }: ReasoningPanelProps) { - const content = text?.trim() ?? ""; - const [open, setOpen] = useState(Boolean(content && streaming)); + const rawContent = text?.trim() ?? ""; + const content = tidyReasoningForDisplay(rawContent); + // Default to *collapsed* during streaming so the user sees a compact + // running preview instead of a wall of streaming thought. The user + // can still expand explicitly; once expanded the choice sticks until + // streaming ends. Pre-fix this auto-opened, which clashed with the + // request for a 1-2 line streaming preview. + const [open, setOpen] = useState(false); const prevStreamingRef = useRef(streaming); - const userCollapsedRef = useRef(false); + const userExpandedRef = useRef(false); - // Auto-open when streaming starts (new reasoning content appears), - // but only if the user hasn't manually collapsed it. + // Reset auto-expand state whenever streaming starts again so the + // next message starts collapsed. useEffect(() => { - if (streaming && content && !userCollapsedRef.current) { - setOpen(true); + if (streaming && !prevStreamingRef.current) { + userExpandedRef.current = false; + setOpen(false); } - }, [streaming, content]); + prevStreamingRef.current = streaming; + }, [streaming]); - // Auto-collapse when streaming ends. Reset the user-collapsed - // flag so the next message auto-opens fresh. + // Auto-collapse when streaming ends if the user never expanded — + // matches the previous behaviour for the "thought trace landed" + // moment where the user typically wants the answer, not the full + // chain of thought, in front of them. 
useEffect(() => { - if (prevStreamingRef.current && !streaming && content) { + if (!streaming && !userExpandedRef.current) { setOpen(false); - userCollapsedRef.current = false; } - prevStreamingRef.current = streaming; - }, [streaming, content]); + }, [streaming]); if (!content) return null; const handleToggle = () => { setOpen((current) => { const next = !current; - // Track that the user explicitly collapsed so auto-open - // doesn't fight with them during streaming. - if (!next) { - userCollapsedRef.current = true; - } else { - userCollapsedRef.current = false; - } + if (next) userExpandedRef.current = true; return next; }); }; + // Two-line preview when collapsed during streaming — gives the user + // a real glimpse of the model's current train of thought without + // committing the whole panel to display. + const preview = !open && streaming ? lastLines(content, 2) : null; + return ( -
+
{open ? (
- {content} + {content}
) : null} diff --git a/src/components/RichMarkdown.tsx b/src/components/RichMarkdown.tsx new file mode 100644 index 0000000..8158160 --- /dev/null +++ b/src/components/RichMarkdown.tsx @@ -0,0 +1,61 @@ +import type { ReactNode } from "react"; +import Markdown from "react-markdown"; +import remarkGfm from "remark-gfm"; +import remarkMath from "remark-math"; +import rehypeKatex from "rehype-katex"; +import { CodeBlock } from "./CodeBlock"; + +interface RichMarkdownProps { + children: string; +} + +interface MarkdownCodeProps { + inline?: boolean; + className?: string; + children?: ReactNode; +} + +function extractLanguage(className?: string): string | undefined { + if (!className) return undefined; + const match = /language-([\w+-]+)/i.exec(className); + return match?.[1]; +} + +function flattenChildren(children: ReactNode): string { + if (children == null) return ""; + if (typeof children === "string") return children; + if (typeof children === "number") return String(children); + if (Array.isArray(children)) return children.map(flattenChildren).join(""); + if (typeof children === "object") { + const maybeElement = children as unknown as { props?: { children?: ReactNode } }; + if (maybeElement.props?.children !== undefined) { + return flattenChildren(maybeElement.props.children); + } + } + return ""; +} + +export function RichMarkdown({ children }: RichMarkdownProps) { + return ( + { + const language = extractLanguage(className); + const raw = flattenChildren(codeChildren); + // react-markdown reports `inline` for backtick spans; absence of newline is also a strong hint + const isInline = inline === true || (!language && !raw.includes("\n")); + if (isInline) { + return {codeChildren}; + } + return ; + }, + // Avoid wrapping the CodeBlock in a default
<pre> — CodeBlock owns its own container
+        pre: ({ children: preChildren }: { children?: ReactNode }) => <>{preChildren}</>,
+      }}
+    >
+      {children}
+    </Markdown>
+  );
+}
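+
+// Illustrative only (assumed typical inputs, not exercised by the
+// component itself): how the inline-vs-block heuristic above classifies
+// common react-markdown children.
+//   `npm run dev` inside a sentence -> inline === true         -> <code> span
+//   ```ts ... ``` fenced block      -> className "language-ts" -> <CodeBlock>
+//   fenced block with no language   -> raw contains "\n"       -> <CodeBlock>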
diff --git a/src/components/RuntimeControls.tsx b/src/components/RuntimeControls.tsx
index 9480fcb..ec9b5c1 100644
--- a/src/components/RuntimeControls.tsx
+++ b/src/components/RuntimeControls.tsx
@@ -115,6 +115,7 @@ interface RuntimeControlsProps {
   preview: PreviewMetrics;
   availableMemoryGb: number;
   totalMemoryGb: number;
+  gpuVramTotalGb?: number | null;
   compact?: boolean;
   showTemperature?: boolean;
   showPreview?: boolean;
@@ -213,6 +214,7 @@ export function RuntimeControls({
   preview,
   availableMemoryGb,
   totalMemoryGb,
+  gpuVramTotalGb,
   compact,
   showTemperature = true,
   showPreview = true,
@@ -639,6 +641,7 @@ export function RuntimeControls({
           preview={preview}
           availableMemoryGb={availableMemoryGb}
           totalMemoryGb={totalMemoryGb}
+          gpuVramTotalGb={gpuVramTotalGb}
           actualDiskSizeGb={diskSizeGb}
           compact={compact}
         />
diff --git a/src/components/SamplerPanel.tsx b/src/components/SamplerPanel.tsx
new file mode 100644
index 0000000..361e3a8
--- /dev/null
+++ b/src/components/SamplerPanel.tsx
@@ -0,0 +1,291 @@
+import { useEffect, useRef, useState } from "react";
+import type { SamplerOverrides } from "../types";
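+
+// Note: only part of the imported SamplerOverrides shape is exercised
+// here. The keys this panel edits (see the patch() calls below) are
+// topP, topK, minP, repeatPenalty, seed, mirostatMode, mirostatTau,
+// mirostatEta and jsonSchemaText; the authoritative definition lives in
+// ../types and may carry additional fields.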
+
+/**
+ * Phase 2.2: advanced sampler panel for per-thread overrides.
+ *
+ * Renders behind the "Samplers" composer button. Each control accepts
+ * `null` (= use backend default) and returns `null` again on Reset.
+ * The panel does NOT own state — it's a controlled component so the
+ * parent (ChatTab) can persist to localStorage on every change.
+ */
+export interface SamplerPanelProps {
+  overrides: SamplerOverrides;
+  onChange: (overrides: SamplerOverrides) => void;
+  disabled?: boolean;
+}
+
+interface NumericInputProps {
+  label: string;
+  hint: string;
+  value: number | null | undefined;
+  min: number;
+  max: number;
+  step: number;
+  defaultLabel: string;
+  onChange: (value: number | null) => void;
+  disabled?: boolean;
+}
+
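+// Shared numeric row: clearing the input (or hitting Reset) maps back to
+// null, i.e. "use the backend default" -- matching the panel contract
+// described in the doc comment above.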
+function NumericInput({ label, hint, value, min, max, step, defaultLabel, onChange, disabled }: NumericInputProps) {
+  const isOverridden = value != null;
+  return (
+    
+
+ {label} + {hint} +
+
+ { + const raw = event.target.value; + if (raw === "") { + onChange(null); + return; + } + const parsed = parseFloat(raw); + if (Number.isFinite(parsed)) onChange(parsed); + }} + /> + {isOverridden ? ( + + ) : null} +
+
+ ); +} + +export function SamplerPanel({ overrides, onChange, disabled }: SamplerPanelProps) { + const [open, setOpen] = useState(false); + const wrapRef = useRef(null); + + useEffect(() => { + if (!open) return; + const handler = (event: MouseEvent) => { + if (wrapRef.current && !wrapRef.current.contains(event.target as Node)) { + setOpen(false); + } + }; + document.addEventListener("mousedown", handler); + return () => document.removeEventListener("mousedown", handler); + }, [open]); + + // Treat empty-string jsonSchemaText as "no override" so an empty + // textarea doesn't bloat the badge count. + const overrideCount = Object.entries(overrides).filter(([key, value]) => { + if (value == null) return false; + if (key === "jsonSchemaText" && typeof value === "string" && value.trim() === "") { + return false; + } + return true; + }).length; + const hasOverrides = overrideCount > 0; + const schemaText = overrides.jsonSchemaText ?? ""; + const schemaError = (() => { + if (!schemaText.trim()) return null; + try { + const parsed = JSON.parse(schemaText); + if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) { + return "Schema must be a JSON object"; + } + return null; + } catch (err) { + return err instanceof Error ? err.message : "Invalid JSON"; + } + })(); + + function patch(key: K, value: SamplerOverrides[K]) { + const next = { ...overrides }; + if (value == null) { + delete next[key]; + } else { + next[key] = value; + } + onChange(next); + } + + return ( +
+ + {open ? ( +
+
+ Sampler overrides + +
+ patch("topP", v)} + /> + patch("topK", v == null ? null : Math.round(v))} + /> + patch("minP", v)} + /> + patch("repeatPenalty", v)} + /> + patch("seed", v == null ? null : Math.round(v))} + /> +
+
+ mirostat + Adaptive sampling target entropy +
+
+ +
+
+ {overrides.mirostatMode === 1 || overrides.mirostatMode === 2 ? ( + <> + patch("mirostatTau", v)} + /> + patch("mirostatEta", v)} + /> + + ) : null} +
+
+ JSON schema + Constrained output (llama.cpp only) +
+