vfeitoza · vfeitoza · Jun 26, 2026 · Jun 26, 2026 · Jun 27, 2026 · Jun 28, 2026
diff --git a/.env.template b/.env.template
@@ -266,6 +266,26 @@
 # (e.g. /v1/chat/completions and /v1/responses items).
 # ENABLE_GUARDRAILS_FOR_BATCH_PROCESSING=false
 
+# ----------------------------------------------------------------------------
+# Intelligent model routing (default: disabled)
+# Analyzes the request with a cheap analyzer model and selects the best catalog
+# model for execution. Only triggers for intelligent selectors (auto/smart/
+# auto-cost/auto-quality) or intelligent virtual models, unless mode is observe.
+# Configure analyzers/selectors in config.yaml under intelligent_routing.
+# See docs/dev/intelligent-model.md.
+# ----------------------------------------------------------------------------
+# INTELLIGENT_ROUTING_ENABLED=false
+# INTELLIGENT_ROUTING_MODE=off # off | observe | enforce
+# INTELLIGENT_ROUTING_DEFAULT_STRATEGY=balanced # cost | balanced | quality | latency
+# INTELLIGENT_ROUTING_MAX_ANALYSIS_TOKENS=256
+# INTELLIGENT_ROUTING_TIMEOUT=1500ms # Go duration string
+# INTELLIGENT_ROUTING_MIN_SAVINGS_RATIO=0.15
+# INTELLIGENT_ROUTING_MIN_CONFIDENCE=0.7
+# INTELLIGENT_ROUTING_FALLBACK_MODEL=
+# INTELLIGENT_ROUTING_ANALYSIS_USER_PATH=/intelligent-router
+# INTELLIGENT_ROUTING_CANDIDATES_ALLOW=
+# INTELLIGENT_ROUTING_CANDIDATES_DENY=
+
 # In-memory buffer size before flushing to storage (default: 1000)
 # USAGE_BUFFER_SIZE=1000
 

diff --git a/CLAUDE.md b/CLAUDE.md
@@ -128,6 +128,7 @@ Full reference: `.env.template` and `config/config.yaml`
 - **HTTP client:** `HTTP_TIMEOUT` (600s), `HTTP_RESPONSE_HEADER_TIMEOUT` (600s)
 - **Resilience:** Configured via `config/config.yaml` - global `resilience.retry.*` and `resilience.circuit_breaker.*` defaults with optional per-provider overrides under `providers.<name>.resilience.retry.*` and `providers.<name>.resilience.circuit_breaker.*`. Retry defaults: `max_retries` (3), `initial_backoff` (1s), `max_backoff` (30s), `backoff_factor` (2.0), `jitter_factor` (0.1). Circuit breaker defaults: `failure_threshold` (5), `success_threshold` (2), `timeout` (30s)
 - **Metrics:** `METRICS_ENABLED` (false), `METRICS_ENDPOINT` (/metrics)
+- **Intelligent routing:** Disabled by default. When enabled, the gateway classifies eligible requests with a cheap analyzer model and selects the best catalog model for execution. Configured via `config/config.yaml` under `intelligent_routing` (key env vars: `INTELLIGENT_ROUTING_ENABLED`, `INTELLIGENT_ROUTING_MODE` (`off`/`observe`/`enforce`), `INTELLIGENT_ROUTING_DEFAULT_STRATEGY` (`cost`/`balanced`/`quality`/`latency`), `INTELLIGENT_ROUTING_MAX_ANALYSIS_TOKENS`, `INTELLIGENT_ROUTING_TIMEOUT`, `INTELLIGENT_ROUTING_MIN_SAVINGS_RATIO`, `INTELLIGENT_ROUTING_FALLBACK_MODEL`). It only triggers for intelligent selectors (`auto`, `smart`, `auto-cost`, `auto-quality`) or intelligent virtual models, unless `mode` is `observe` (dry-run that records the recommendation without changing the executed model). The example config (`config/config.example.yaml`) ships with `openai/gpt-5.4-mini`, `zai/glm-5-turbo`, and `anthropic/claude-haiku-4-5` as ordered analyzers (tried in order with failover). Analysis cost is attributed to `analysis_user_path` (`/intelligent-router` by default) to keep it separate from the main execution in usage reports. See `docs/dev/intelligent-model.md`.
 - **Guardrails:** Configured via `config/config.yaml` only (except `GUARDRAILS_ENABLED` env var)
 - **Providers:** `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GEMINI_API_KEY`, `USE_GOOGLE_GEMINI_NATIVE_API` (true by default; false uses Gemini's OpenAI-compatible chat API), `XAI_API_KEY`, `GROQ_API_KEY`, `OPENROUTER_API_KEY`, `ZAI_API_KEY`, `ZAI_BASE_URL` (optional Z.ai endpoint override), `MINIMAX_API_KEY`, `MINIMAX_BASE_URL` (optional MiniMax endpoint override), `XIAOMI_API_KEY`, `XIAOMI_BASE_URL` (optional Xiaomi MiMo endpoint override), `OPENCODE_GO_API_KEY`, `OPENCODE_GO_BASE_URL` (optional OpenCode Go/Zen endpoint override; default `https://opencode.ai/zen/go/v1`), `OPENCODE_GO_MESSAGES_MODELS` (optional comma-separated model IDs routed to the Anthropic-native `/messages` endpoint instead of `/chat/completions`; default `qwen3.7-max`), `BAILIAN_API_KEY`, `BAILIAN_BASE_URL` (optional Bailian base URL for region switching; default `https://dashscope.aliyuncs.com/compatible-mode/v1`), `AZURE_API_KEY`, `AZURE_BASE_URL` (Azure OpenAI deployment base URL), `AZURE_API_VERSION` (optional Azure API version), `ORACLE_API_KEY` (Oracle API key), `ORACLE_BASE_URL` (Oracle OpenAI-compatible base URL), `<PROVIDER>[_SUFFIX]_MODELS` (comma-separated configured model list for any provider type), `OLLAMA_BASE_URL`, `VLLM_BASE_URL`, `VLLM_API_KEY` (optional upstream vLLM bearer token)
 - **Provider model metadata:** `providers.<name>.models` accepts either model IDs (strings) or `{id, metadata}` objects. When `metadata` is supplied (`display_name`, `context_window`, `max_output_tokens`, `modes`, `capabilities`, `pricing`, …) it is merged onto the remote ai-model-list entry during enrichment, with operator values winning per-field. Primary use case: advertising context windows, capabilities, and pricing for local models (Ollama) and other custom endpoints whose IDs are not in the upstream registry.
diff --git a/config/config.example.yaml b/config/config.example.yaml
@@ -176,6 +176,47 @@ guardrails:
     #     skip_content_prefix: "### safe"
     #     # prompt: "Custom rewrite instructions here."
 
+# Intelligent model routing (optional, disabled by default).
+# When enabled, the gateway classifies the request with a cheap analyzer model
+# and selects the best catalog model for execution. Only triggers for
+# intelligent selectors (auto/smart/auto-cost/auto-quality) or intelligent
+# virtual models, unless mode is observe. See docs/dev/intelligent-model.md.
+intelligent_routing:
+  enabled: false
+  mode: "off" # off | observe | enforce; observe classifies but keeps the requested model
+  analyzers:
+    # Ordered pool of cheap models used to classify the request. Tried in
+    # order; on failure or timeout the next analyzer is used.
+    - model: "gpt-5.4-mini"
+      provider: "openai"
+      max_tokens: 256
+    - model: "glm-5-turbo"
+      provider: "zai"
+      max_tokens: 256
+    - model: "claude-haiku-4-5"
+      provider: "anthropic"
+      max_tokens: 256
+  defaults:
+    strategy: "balanced" # cost | balanced | quality | latency
+    max_analysis_tokens: 256
+    timeout: "8000ms"
+    min_savings_ratio: 0.15 # minimum estimated savings required before enforce mode switches to a cheaper model
+    min_confidence: 0.7 # below this, a stronger model is chosen
+  selectors:
+    - name: "auto"
+      strategy: "balanced"
+    - name: "smart"
+      strategy: "balanced"
+    - name: "auto-cost"
+      strategy: "cost"
+    - name: "auto-quality"
+      strategy: "quality"
+  # candidates: # optional allow/deny over the catalog
+  #   allow: ["openai/gpt-4o-mini", "anthropic/claude-sonnet-*"]
+  #   deny: []
+  fallback_model: "openai/gpt-4o-mini" # used when all analyzers fail; if empty, the request fails with model_not_found
+  analysis_user_path: "/intelligent-router" # scopes analyzer usage/audit cost separately
+
 fallback:
   default_mode: "manual" # "off", "manual", or "auto"; default is "manual"
   manual_rules_path: "config/fallback.example.json" # optional JSON map: {"model": ["fallback-1", "provider/model"]}; when omitted, manual mode has no fallback candidates

diff --git a/config/config.go b/config/config.go
@@ -13,20 +13,21 @@ import (
 
 // Config holds the application configuration.
 type Config struct {
-	Server     ServerConfig     `yaml:"server"`
-	Models     ModelsConfig     `yaml:"models"`
-	Cache      CacheConfig      `yaml:"cache"`
-	Storage    StorageConfig    `yaml:"storage"`
-	Logging    LogConfig        `yaml:"logging"`
-	Usage      UsageConfig      `yaml:"usage"`
-	Budgets    BudgetsConfig    `yaml:"budgets"`
-	Metrics    MetricsConfig    `yaml:"metrics"`
-	HTTP       HTTPConfig       `yaml:"http"`
-	Admin      AdminConfig      `yaml:"admin"`
-	Guardrails GuardrailsConfig `yaml:"guardrails"`
-	Fallback   FallbackConfig   `yaml:"fallback"`
-	Workflows  WorkflowsConfig  `yaml:"workflows"`
-	Resilience ResilienceConfig `yaml:"resilience"`
+	Server             ServerConfig             `yaml:"server"`
+	Models             ModelsConfig             `yaml:"models"`
+	Cache              CacheConfig              `yaml:"cache"`
+	Storage            StorageConfig            `yaml:"storage"`
+	Logging            LogConfig                `yaml:"logging"`
+	Usage              UsageConfig              `yaml:"usage"`
+	Budgets            BudgetsConfig            `yaml:"budgets"`
+	Metrics            MetricsConfig            `yaml:"metrics"`
+	HTTP               HTTPConfig               `yaml:"http"`
+	Admin              AdminConfig              `yaml:"admin"`
+	Guardrails         GuardrailsConfig         `yaml:"guardrails"`
+	Fallback           FallbackConfig           `yaml:"fallback"`
+	Workflows          WorkflowsConfig          `yaml:"workflows"`
+	Resilience         ResilienceConfig         `yaml:"resilience"`
+	IntelligentRouting IntelligentRoutingConfig `yaml:"intelligent_routing"`
 }
 
 // LoadResult is returned by Load and bundles the application config with the raw
@@ -127,7 +128,8 @@ func buildDefaultConfig() *Config {
 			LiveLogsReplayLimit:      1000,
 			LiveLogsHeartbeatSeconds: 15,
 		},
-		Guardrails: GuardrailsConfig{},
+		Guardrails:         GuardrailsConfig{},
+		IntelligentRouting: DefaultIntelligentRoutingConfig(),
 	}
 }
 
@@ -193,6 +195,10 @@ func Load() (*LoadResult, error) {
 		return nil, err
 	}
 
+	if err := ValidateIntelligentRoutingConfig(&cfg.IntelligentRouting); err != nil {
+		return nil, err
+	}
+
 	return &LoadResult{
 		Config:       cfg,
 		RawProviders: rawProviders,