From 066e6c7299cb2e3bcb392a42f073e4e70be30410 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 26 Nov 2025 16:37:44 -0800
Subject: [PATCH 01/37] update docs/architecture/memory.md

---
 docs/architecture/memory.md | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/architecture/memory.md b/docs/architecture/memory.md
index cbff5d938..f99637ff3 100644
--- a/docs/architecture/memory.md
+++ b/docs/architecture/memory.md
@@ -259,13 +259,12 @@ Executed via `_postprocess_after_turn` (background task).
 *   **Output:** JSON list of strings. Failures fall back to `[]`.
 
 ### 4.3 Reconciliation (Memory Management)
-Resolves contradictions using a "Search-Decide-Update" loop.
+Resolves contradictions using a "Search-Decide-Update" loop with complete enumeration.
 1.  **Local Search:** For each new fact, retrieve a small neighborhood of existing `role="memory"` entries for the conversation.
-2.  **LLM Decision:** Uses `UPDATE_MEMORY_PROMPT` (examples + strict JSON schema) to compare `new_facts` vs `existing_memories`.
+2.  **LLM Decision:** Uses `UPDATE_MEMORY_PROMPT` to compare `new_facts` vs `existing_memories`. The model must return **all memories** (existing + new) with explicit events for each.
     *   **Decisions:** `ADD`, `UPDATE`, `DELETE`, `NONE`.
     *   If no existing memories are found, all new facts are added directly.
     *   On LLM/network failure, defaults to adding all new facts.
-    *   Safeguard: if the model returns only deletes/empties, the new facts are still added to avoid data loss.
 3.  **Execution:**
     *   **Adds:** Creates new fact files and upserts to Chroma.
     *   **Updates:** Implemented as delete + add with a fresh ID; tombstones record `replaced_by`.
@@ -295,13 +294,14 @@ To replicate the system behavior, the following prompt strategies are required.
 *   **Example:** "My wife is Anne" -> `["The user's wife is named Anne"]`.
 
 ### 5.2 Reconciliation (`UPDATE_MEMORY_PROMPT`)
-*   **Goal:** Compare `new_facts` against `existing_memories` (id + text) and output structured decisions.
+*   **Goal:** Compare `new_facts` against `existing_memories` and return **all memories** (existing + new) with explicit events.
+*   **Approach:** The model must enumerate every memory in its response, forcing deliberate decisions rather than implicit omissions.
 *   **Operations:**
-    *   **ADD:** New information (generates a new ID).
-    *   **UPDATE:** Refines existing information (uses the provided short ID).
-    *   **DELETE:** Contradicts existing information (e.g., "I hate pizza" vs "I love pizza"). **If deleting because of a replacement, the new fact must also be returned (ADD or UPDATE).**
-    *   **NONE:** Fact already exists or is irrelevant.
-*   **Output constraints:** JSON list only; no prose/code fences; IDs for UPDATE/DELETE/NONE must come from the provided list.
+    *   **ADD:** New information not present in existing memories (generates a new sequential ID).
+    *   **UPDATE:** Refines existing information about the **same topic** (keeps the existing ID).
+    *   **DELETE:** Explicitly contradicts existing information (e.g., "I hate pizza" vs "I love pizza").
+    *   **NONE:** Existing memory is unrelated to new facts, or new fact is an exact duplicate.
+*   **Output constraints:** JSON list containing all memories; each existing memory must have an event; new unrelated facts must be ADDed; no prose or code fences.
 
 ### 5.3 Summarization (`SUMMARY_PROMPT`)
 *   **Goal:** Maintain a concise running summary.

From d495e6ec9cbdf8b16b725aa7d38fe252c6d5a1a8 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 26 Nov 2025 16:38:04 -0800
Subject: [PATCH 02/37] Turn off ChromaDB telemetry

---
 agent_cli/core/chroma.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/agent_cli/core/chroma.py b/agent_cli/core/chroma.py
index 22455fa65..0cc639cb7 100644
--- a/agent_cli/core/chroma.py
+++ b/agent_cli/core/chroma.py
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 from typing import TYPE_CHECKING, Any
-
 from agent_cli.constants import DEFAULT_OPENAI_EMBEDDING_MODEL
 
 if TYPE_CHECKING:

From d7b2a3d7c6a942ab5727ef8513617551a85d2cd4 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 26 Nov 2025 16:44:37 -0800
Subject: [PATCH 03/37] feat(memory): add output validation with ModelRetry for
 reconciliation

- Add @agent.output_validator to validate LLM decisions
- Catch invalid UPDATE/DELETE/NONE with non-existent IDs
- Send helpful error messages via ModelRetry for retry
- Graceful fallback to add all facts when retries exhausted
- Add AI journal POC example for testing MemoryClient
- Improve reconciliation prompt with clearer examples
---
 examples/aijournal_poc.py | 151 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 151 insertions(+)
 create mode 100755 examples/aijournal_poc.py

diff --git a/examples/aijournal_poc.py b/examples/aijournal_poc.py
new file mode 100755
index 000000000..df5934e9b
--- /dev/null
+++ b/examples/aijournal_poc.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+"""Minimal AI Journal proof-of-concept using MemoryClient.
+
+This validates the core hypothesis: MemoryClient can serve as the
+foundation for a personal knowledge system (AI journal).
+
+Usage:
+    # Add a journal entry
+    python examples/aijournal_poc.py add "Today I learned about quantum computing at work"
+
+    # Search memories
+    python examples/aijournal_poc.py search "what did I learn?"
+
+    # Interactive chat with memory
+    python examples/aijournal_poc.py chat "What have I been working on lately?"
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import logging
+import os
+from pathlib import Path
+
+from agent_cli.memory.client import MemoryClient
+
+# Enable debug logging for memory module
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+    datefmt="%H:%M:%S",
+)
+# Enable DEBUG for memory ingest to see full prompts
+logging.getLogger("agent_cli.memory._ingest").setLevel(logging.DEBUG)
+
+
+# Defaults for local AI setup
+DEFAULT_BASE_URL = "http://192.168.1.143:9292/v1"
+DEFAULT_MODEL = "gpt-oss-high:20b"
+DEFAULT_EMBEDDING_MODEL = "embeddinggemma:300m"
+
+
+def get_client(model: str | None = None) -> tuple[MemoryClient, str]:
+    """Initialize the memory client with sensible defaults.
+
+    Returns:
+        Tuple of (client, model_name)
+
+    """
+    base_url = os.environ.get("OPENAI_BASE_URL", DEFAULT_BASE_URL)
+    model_name = model or os.environ.get("OPENAI_MODEL", DEFAULT_MODEL)
+    embedding_model = os.environ.get("EMBEDDING_MODEL", DEFAULT_EMBEDDING_MODEL)
+    api_key = os.environ.get("OPENAI_API_KEY", "not-needed-for-local")
+
+    print(f"Using: {base_url}")
+    print(f"  Chat model: {model_name}")
+    print(f"  Embedding model: {embedding_model}")
+
+    return MemoryClient(
+        memory_path=Path("~/.aijournal").expanduser(),
+        openai_base_url=base_url,
+        chat_api_key=api_key,
+        embedding_api_key=api_key,
+        embedding_model=embedding_model,
+        enable_summarization=True,
+        enable_git_versioning=False,  # Keep it simple for POC
+        score_threshold=0.1,  # Lower threshold for local models
+    ), model_name
+
+
+async def cmd_add(text: str) -> None:
+    """Add a journal entry."""
+    client, model = get_client()
+    print(f"Adding entry: {text[:50]}...")
+    await client.add(text, conversation_id="journal", model=model)
+    print("✓ Entry processed and facts extracted")
+
+
+async def cmd_search(query: str, top_k: int = 5) -> None:
+    """Search memories."""
+    client, model = get_client()
+    print(f"Searching for: {query}\n")
+
+    result = await client.search(query, conversation_id="journal", top_k=top_k, model=model)
+
+    if not result.entries:
+        print("No relevant memories found.")
+        return
+
+    for i, entry in enumerate(result.entries, 1):
+        print(f"{i}. [{entry.role}] {entry.content}")
+        print(f"   Score: {entry.score:.3f} | Created: {entry.created_at[:10]}")
+        print()
+
+
+async def cmd_chat(question: str) -> None:
+    """Chat with memory-augmented LLM."""
+    client, model = get_client()
+    print(f"Question: {question}\n")
+
+    response = await client.chat(
+        messages=[{"role": "user", "content": question}],
+        conversation_id="journal",
+        model=model,
+    )
+
+    # Extract assistant reply
+    choices = response.get("choices", [])
+    if choices:
+        reply = choices[0].get("message", {}).get("content", "")
+        print(f"Answer: {reply}")
+
+    # Show which memories were used
+    hits = response.get("memory_hits", [])
+    if hits:
+        print(f"\n--- Used {len(hits)} memories ---")
+        for hit in hits[:3]:
+            print(f"  • {hit['content'][:80]}...")
+
+
+def main() -> None:
+    """CLI entry point."""
+    parser = argparse.ArgumentParser(description="AI Journal POC")
+    subparsers = parser.add_subparsers(dest="command", required=True)
+
+    # Add command
+    add_parser = subparsers.add_parser("add", help="Add a journal entry")
+    add_parser.add_argument("text", help="The journal entry text")
+
+    # Search command
+    search_parser = subparsers.add_parser("search", help="Search memories")
+    search_parser.add_argument("query", help="Search query")
+    search_parser.add_argument("-k", "--top-k", type=int, default=5, help="Number of results")
+
+    # Chat command
+    chat_parser = subparsers.add_parser("chat", help="Chat with memory")
+    chat_parser.add_argument("question", help="Question to ask")
+
+    args = parser.parse_args()
+
+    if args.command == "add":
+        asyncio.run(cmd_add(args.text))
+    elif args.command == "search":
+        asyncio.run(cmd_search(args.query, args.top_k))
+    elif args.command == "chat":
+        asyncio.run(cmd_chat(args.question))
+
+
+if __name__ == "__main__":
+    main()

From 24e04843fb55e45d0aa51de61b547ac8d54777cb Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 26 Nov 2025 17:04:20 -0800
Subject: [PATCH 04/37] feat(memory): add self-model features to AI journal POC

- Add list_all() method to MemoryClient to retrieve all stored memories
- Add 'show' command to display all stored facts about the user
- Add 'profile' command to generate a structured profile summary using LLM
- Enhance 'chat' command to use profile context for personalized responses

The POC now demonstrates a "self-model" system that:
1. Extracts facts from user input
2. Stores and retrieves them semantically
3. Generates profile summaries on demand
4. Uses the profile to personalize conversations

This validates the core hypothesis: MemoryClient can serve as the
foundation for a personal knowledge system that knows who you are.
---
 examples/aijournal_poc.py | 146 ++++++++++++++++++++++++++++++++++----
 1 file changed, 132 insertions(+), 14 deletions(-)

diff --git a/examples/aijournal_poc.py b/examples/aijournal_poc.py
index df5934e9b..156c0b97a 100755
--- a/examples/aijournal_poc.py
+++ b/examples/aijournal_poc.py
@@ -23,6 +23,8 @@
 import os
 from pathlib import Path
 
+import httpx
+
 from agent_cli.memory.client import MemoryClient
 
 # Enable debug logging for memory module
@@ -94,29 +96,135 @@ async def cmd_search(query: str, top_k: int = 5) -> None:
         print()
 
 
-async def cmd_chat(question: str) -> None:
+def cmd_show() -> None:
+    """Show all stored memories (what the system knows about you)."""
+    client, _ = get_client()
+    print("=== What I know about you ===\n")
+
+    entries = client.list_all(conversation_id="journal")
+
+    if not entries:
+        print("No memories stored yet. Add some journal entries first!")
+        return
+
+    # Sort by created_at
+    entries.sort(key=lambda x: x["created_at"], reverse=True)
+
+    for i, entry in enumerate(entries, 1):
+        date = entry["created_at"][:10] if entry["created_at"] else "unknown"
+        print(f"{i}. [{date}] {entry['content']}")
+
+    print(f"\n--- Total: {len(entries)} memories ---")
+
+
+PROFILE_PROMPT = """Based on the following facts about a person, create a brief profile summary.
+Organize the information into categories like:
+- **Identity**: Name, relationships, occupation
+- **Interests & Activities**: Hobbies, regular activities
+- **Goals & Values**: What they care about, what they're working towards
+- **Recent Events**: Notable recent happenings
+
+Only include categories that have relevant information. Be concise.
+
+Facts:
+{facts}
+
+Profile Summary:"""
+
+
+async def cmd_profile() -> None:
+    """Generate a profile summary from stored memories."""
+    client, model = get_client()
+
+    entries = client.list_all(conversation_id="journal")
+
+    if not entries:
+        print("No memories stored yet. Add some journal entries first!")
+        return
+
+    # Format facts for the prompt
+    facts = "\n".join(f"- {e['content']}" for e in entries)
+    prompt = PROFILE_PROMPT.format(facts=facts)
+
+    print("=== Your Profile ===\n")
+    print("(Generating profile from stored memories...)\n")
+
+    # Direct LLM call (bypasses memory storage)
+    base_url = os.environ.get("OPENAI_BASE_URL", DEFAULT_BASE_URL)
+    api_key = os.environ.get("OPENAI_API_KEY", "not-needed-for-local")
+
+    async with httpx.AsyncClient(timeout=120.0) as http:
+        response = await http.post(
+            f"{base_url}/chat/completions",
+            headers={"Authorization": f"Bearer {api_key}"},
+            json={
+                "model": model,
+                "messages": [{"role": "user", "content": prompt}],
+                "temperature": 0.7,
+            },
+        )
+        data = response.json()
+
+    choices = data.get("choices", [])
+    if choices:
+        profile = choices[0].get("message", {}).get("content", "")
+        print(profile)
+
+    print(f"\n--- Based on {len(entries)} memories ---")
+
+
+CHAT_SYSTEM_PROMPT = """You are a helpful AI assistant with memory of the user.
+
+Here's what you know about the user:
+{profile}
+
+Use this knowledge naturally in your responses. Be helpful and personable."""
+
+
+async def cmd_chat(question: str, with_profile: bool = True) -> None:
     """Chat with memory-augmented LLM."""
     client, model = get_client()
+
+    # Build profile context
+    profile_text = ""
+    if with_profile:
+        entries = client.list_all(conversation_id="journal")
+        if entries:
+            profile_text = "\n".join(f"- {e['content']}" for e in entries)
+
     print(f"Question: {question}\n")
 
-    response = await client.chat(
-        messages=[{"role": "user", "content": question}],
-        conversation_id="journal",
-        model=model,
-    )
+    # Build messages with profile context
+    messages: list[dict[str, str]] = []
+    if profile_text:
+        system_prompt = CHAT_SYSTEM_PROMPT.format(profile=profile_text)
+        messages.append({"role": "system", "content": system_prompt})
+    messages.append({"role": "user", "content": question})
 
-    # Extract assistant reply
-    choices = response.get("choices", [])
+    # Direct LLM call with profile context
+    base_url = os.environ.get("OPENAI_BASE_URL", DEFAULT_BASE_URL)
+    api_key = os.environ.get("OPENAI_API_KEY", "not-needed-for-local")
+
+    async with httpx.AsyncClient(timeout=120.0) as http:
+        response = await http.post(
+            f"{base_url}/chat/completions",
+            headers={"Authorization": f"Bearer {api_key}"},
+            json={
+                "model": model,
+                "messages": messages,
+                "temperature": 0.7,
+            },
+        )
+        data = response.json()
+
+    choices = data.get("choices", [])
     if choices:
         reply = choices[0].get("message", {}).get("content", "")
         print(f"Answer: {reply}")
 
-    # Show which memories were used
-    hits = response.get("memory_hits", [])
-    if hits:
-        print(f"\n--- Used {len(hits)} memories ---")
-        for hit in hits[:3]:
-            print(f"  • {hit['content'][:80]}...")
+    if profile_text:
+        entry_count = len(client.list_all(conversation_id="journal"))
+        print(f"\n--- Using profile with {entry_count} memories ---")
 
 
 def main() -> None:
@@ -137,6 +245,12 @@ def main() -> None:
     chat_parser = subparsers.add_parser("chat", help="Chat with memory")
     chat_parser.add_argument("question", help="Question to ask")
 
+    # Show command - display what the system knows about you
+    subparsers.add_parser("show", help="Show all stored memories")
+
+    # Profile command - generate a profile summary
+    subparsers.add_parser("profile", help="Generate profile from memories")
+
     args = parser.parse_args()
 
     if args.command == "add":
@@ -145,6 +259,10 @@ def main() -> None:
         asyncio.run(cmd_search(args.query, args.top_k))
     elif args.command == "chat":
         asyncio.run(cmd_chat(args.question))
+    elif args.command == "show":
+        cmd_show()
+    elif args.command == "profile":
+        asyncio.run(cmd_profile())
 
 
 if __name__ == "__main__":

From f083b9ee94a9f240540820d88d46f4a5db44cc2a Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 26 Nov 2025 17:51:37 -0800
Subject: [PATCH 05/37] docs: add detailed comparison between AI journal POC
 and aijournal

Analyzes architecture, features, and test results comparing our
MemoryClient-based POC (~200 LOC) with the full aijournal project
(~15,000+ LOC).

Key findings:
- POC successfully extracts facts and generates accurate profiles
- Main gap is learning over time (strength tracking, decay, feedback)
- Recommends adding simple strength field to close 80% of functionality
  gap with 20% of aijournal's complexity

Includes concrete test results from ingesting 12+ blog posts.
---
 docs/aijournal-poc-comparison.md | 245 +++++++++++++++++++++++++++++++
 1 file changed, 245 insertions(+)
 create mode 100644 docs/aijournal-poc-comparison.md

diff --git a/docs/aijournal-poc-comparison.md b/docs/aijournal-poc-comparison.md
new file mode 100644
index 000000000..a6f928f0b
--- /dev/null
+++ b/docs/aijournal-poc-comparison.md
@@ -0,0 +1,245 @@
+# AI Journal POC vs aijournal: Detailed Comparison
+
+This document analyzes the differences between our MemoryClient-based AI Journal POC and the full-featured aijournal project, identifying strengths, gaps, and potential paths forward.
+
+## Executive Summary
+
+| Aspect | Our POC | aijournal |
+|--------|---------|-----------|
+| **Complexity** | ~200 LOC | ~15,000+ LOC |
+| **Setup Time** | Instant | `aijournal init` + config |
+| **Profile Storage** | Generated on-demand | Persisted YAML with versioning |
+| **Claim System** | Raw fact strings | Typed atoms with strength/decay |
+| **Context Layers** | Single flat layer | 4 hierarchical layers (L1-L4) |
+| **Learning** | Static extraction | Feedback loops + interview probing |
+
+## 1. Architecture Comparison
+
+### 1.1 Data Model
+
+**Our POC:**
+```
+~/.aijournal/
+  entries/
+    journal/
+      facts/           # Extracted facts as markdown
+      turns/           # Chat turns
+  chroma/              # Vector embeddings
+```
+
+**aijournal:**
+```
+workspace/
+  data/
+    journal/YYYY/MM/DD/*.md    # Raw entries
+    normalized/YYYY-MM-DD/     # Structured YAML
+  profile/
+    self_profile.yaml          # Facets (values, goals, traits)
+    claims.yaml                # Typed claim atoms
+  derived/
+    summaries/                 # Daily summaries
+    microfacts/                # Extracted facts
+    persona/persona_core.yaml  # L1 context (~1200 tokens)
+    index/                     # Vector store + metadata
+    chat_sessions/             # Conversation history
+    pending/profile_updates/   # Queued changes
+```
+
+**Analysis:** aijournal separates authoritative data (human-editable) from derived data (reproducible). Our POC conflates these, making it harder to inspect or manually correct the knowledge base.
+
+### 1.2 Claim Representation
+
+**Our POC - Raw facts:**
+```
+"Bas is a software engineer"
+"The user loves hiking"
+"The user's wife is named Anne"
+```
+
+**aijournal - Typed claim atoms:**
+```yaml
+- type: trait
+  subject: self
+  predicate: occupation
+  statement: "Works as a software engineer focused on AI systems"
+  scope: {domain: work, context: [professional]}
+  strength: 0.85
+  status: accepted
+  provenance:
+    sources: [entry:2025-01-15-morning]
+    first_seen: 2025-01-15
+    last_updated: 2025-01-20
+```
+
+**Analysis:** aijournal's typed claims enable:
+- Filtering by type (traits vs preferences vs goals)
+- Confidence tracking via `strength`
+- Time-decay for relevance
+- Conflict detection between claims
+- Source attribution for verification
+
+### 1.3 Context Layers
+
+**Our POC:** Single layer - all facts dumped into system prompt
+
+**aijournal - Hierarchical layers:**
+
+| Layer | Content | Tokens | Use Case |
+|-------|---------|--------|----------|
+| L1 | Persona core + top claims | ~1,200 | Quick chat, advice |
+| L2 | L1 + recent summaries/facts | ~2,000 | Daily check-ins |
+| L3 | L2 + full claims + facets | ~2,600 | Deep conversations |
+| L4 | L3 + prompts + config + history | ~3,200 | External AI export |
+
+**Analysis:** Layered context prevents token overflow and allows appropriate depth for different interactions.
+
+## 2. Feature Comparison
+
+### 2.1 Fact Extraction
+
+| Feature | Our POC | aijournal |
+|---------|---------|-----------|
+| Extraction method | PydanticAI agent | Ollama + custom prompts |
+| Output format | Raw strings | Typed MicroFact objects |
+| Reconciliation | ADD/UPDATE/DELETE/NONE | Consolidation with strength weighting |
+| Deduplication | Semantic similarity | Hash + semantic + scope matching |
+
+**Our POC advantage:** The reconciliation logic (PromptedOutput with JSON mode) prevents duplicate facts effectively.
+
+**aijournal advantage:** Consolidation weights existing evidence: `strength_new = clamp01((w_prev * strength_prev + w_obs * signal) / (w_prev + w_obs))`
+
+### 2.2 Profile Generation
+
+| Feature | Our POC | aijournal |
+|---------|---------|-----------|
+| Generation | On-demand via LLM | Pre-built `persona_core.yaml` |
+| Caching | None | Persisted with staleness tracking |
+| Categories | LLM-determined | Defined schema (values, goals, traits, etc.) |
+| Token budget | Unlimited (risk of overflow) | Configurable (~1,200 default) |
+
+**Our POC advantage:** Flexible - LLM determines categories dynamically based on content.
+
+**aijournal advantage:** Deterministic, auditable, and respects token limits.
+
+### 2.3 Chat Integration
+
+| Feature | Our POC | aijournal |
+|---------|---------|-----------|
+| Context injection | All facts in system prompt | Layer-appropriate context |
+| Citations | None | `[entry:id#p<idx>]` markers |
+| Feedback | None | Up/down adjustments to claim strength |
+| Memory storage | Bypassed (direct LLM call) | Persisted with telemetry |
+
+**Our POC advantage:** Simple, no side effects.
+
+**aijournal advantage:** Learning loop - feedback strengthens/weakens claims over time.
+
+### 2.4 Missing in Our POC
+
+1. **Interview/Probing Mode**
+   - aijournal generates questions to fill knowledge gaps
+   - Ranks facets by `staleness × impact_weight` to prioritize probing
+
+2. **Time Decay**
+   - aijournal: `effective_strength = strength × exp(-λ × staleness)`
+   - Our POC: All facts treated equally regardless of age
+
+3. **Conflict Resolution**
+   - aijournal: Detects contradictions, downgrades to `tentative`, queues questions
+   - Our POC: UPDATE replaces old fact entirely
+
+4. **Advisor Mode**
+   - aijournal: Separate `advise` command with coaching preferences
+   - Our POC: Generic chat only
+
+5. **Export/Packs**
+   - aijournal: Generate context bundles for external AIs
+   - Our POC: No export capability
+
+## 3. Test Results Analysis
+
+### 3.1 Blog Post Ingestion
+
+We fed 12+ blog posts into our POC:
+
+| Metric | Result |
+|--------|--------|
+| Posts processed | ~12 |
+| Facts extracted | 52 |
+| Extraction accuracy | High - captured key themes |
+| Profile quality | Excellent - identified all major interests |
+
+**Sample extracted facts:**
+- "Bas is a software engineer"
+- "Bas works on AI systems"
+- "The user loves hiking"
+- "You went for a 5km run this morning"
+- "You discovered that local vision models like Qwen3-VL-32B can identify niche books"
+
+### 3.2 Profile Generation Quality
+
+The generated profile correctly identified:
+- ✅ Professional identity (software engineer, AI focus)
+- ✅ Personal relationships (wife Anne)
+- ✅ Hobbies (hiking, running, learning Dutch)
+- ✅ Technical interests (local AI, terminal productivity, homelab)
+- ✅ Values (minimalism, security, reproducibility)
+
+### 3.3 Chat Intelligence
+
+The chat demonstrated:
+- **Specific recall:** "You use the Glove80 keyboard with programmable layers"
+- **Temporal understanding:** Tracked evolution of views on AI coding
+- **Theme synthesis:** Connected local AI + security + productivity interests
+- **Nuanced responses:** Acknowledged both benefits and limitations
+
+## 4. Recommendations
+
+### 4.1 Quick Wins (Keep POC Simple)
+
+1. **Persist profile summary** - Cache the LLM-generated profile to avoid regeneration
+2. **Add timestamps to facts** - Already have `created_at`, use it for recency weighting
+3. **Token budgeting** - Limit facts sent to chat based on relevance + recency
+
+### 4.2 Medium-Term Enhancements
+
+1. **Claim typing** - Categorize facts into types (trait, preference, goal, relationship)
+2. **Strength tracking** - Increment when same fact extracted multiple times
+3. **Simple decay** - Weight recent facts higher in context
+
+### 4.3 aijournal Features Worth Adopting
+
+1. **Interview mode** - Generate questions to learn more
+2. **Feedback loop** - Up/down on responses affects claim strength
+3. **Layered context** - L1 for quick chats, L4 for deep dives
+4. **Citations** - Link responses to source facts
+
+### 4.4 What NOT to Adopt
+
+1. **7-stage pipeline** - Overkill for our use case
+2. **Strict schema governance** - Adds friction without clear benefit for POC
+3. **Markdown file storage** - ChromaDB is sufficient for our needs
+
+## 5. Conclusion
+
+Our POC validates the core hypothesis: **MemoryClient can serve as the foundation for a personal knowledge system**. With ~200 lines of code, we achieved:
+
+- Accurate fact extraction from unstructured text
+- Coherent profile generation from diverse content
+- Personalized conversations using stored knowledge
+
+The main gap is **learning over time** - our system doesn't strengthen beliefs based on repetition or feedback. Adding simple strength tracking and decay would close 80% of the functionality gap with 20% of aijournal's complexity.
+
+### Recommended Next Step
+
+Add a `strength` field to stored facts and implement:
+```python
+# On duplicate fact detection
+existing.strength = min(1.0, existing.strength + 0.1)
+existing.last_seen = now()
+
+# On retrieval
+effective_strength = fact.strength * exp(-0.1 * days_since_last_seen)
+```
+
+This single change would transform our static knowledge base into a learning system.

From b9ad5ceedf94c0f5ebe961e08e1a20d21b86a664 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 26 Nov 2025 19:30:24 -0800
Subject: [PATCH 06/37] feat(memory): add adaptive summarization with
 hierarchical storage

Implement research-grounded summarization inspired by Letta and Mem0:
- AdaptiveSummarizer with 5 levels (NONE, BRIEF, STANDARD, DETAILED, HIERARCHICAL)
- Hierarchical summary storage (L1 chunks, L2 groups, L3 final) in ChromaDB
- File-based persistence with YAML front matter in markdown files
- Token counting via tiktoken with fallback to cl100k_base
- Level-specific compression ratios (20%, 12%, 7%, capped 2000 tokens)

Structure:
- agent_cli/summarizer/ - standalone reusable summarization module
- summaries/L1/chunk_*.md, L2/group_*.md, L3/final.md file hierarchy
- Soft-delete old summaries to deleted/ folder before replacing
---
 agent_cli/memory/_files.py           |  37 +-
 agent_cli/memory/_ingest.py          |  92 ++++-
 agent_cli/memory/_persistence.py     | 105 +++++-
 agent_cli/memory/_store.py           | 151 ++++++++
 agent_cli/memory/models.py           |  20 ++
 agent_cli/summarizer/__init__.py     |  31 ++
 agent_cli/summarizer/adaptive.py     | 502 +++++++++++++++++++++++++++
 agent_cli/summarizer/models.py       | 220 ++++++++++++
 agent_cli/summarizer/prompts.py      | 135 +++++++
 agent_cli/summarizer/utils.py        | 258 ++++++++++++++
 pyproject.toml                       |   8 +-
 tests/memory/test_store.py           | 226 ++++++++++++
 tests/summarizer/__init__.py         |   1 +
 tests/summarizer/test_adaptive.py    | 434 +++++++++++++++++++++++
 tests/summarizer/test_integration.py | 466 +++++++++++++++++++++++++
 tests/summarizer/test_models.py      | 332 ++++++++++++++++++
 tests/summarizer/test_prompts.py     | 180 ++++++++++
 tests/summarizer/test_utils.py       | 193 ++++++++++
 18 files changed, 3386 insertions(+), 5 deletions(-)
 create mode 100644 agent_cli/summarizer/__init__.py
 create mode 100644 agent_cli/summarizer/adaptive.py
 create mode 100644 agent_cli/summarizer/models.py
 create mode 100644 agent_cli/summarizer/prompts.py
 create mode 100644 agent_cli/summarizer/utils.py
 create mode 100644 tests/summarizer/__init__.py
 create mode 100644 tests/summarizer/test_adaptive.py
 create mode 100644 tests/summarizer/test_integration.py
 create mode 100644 tests/summarizer/test_models.py
 create mode 100644 tests/summarizer/test_prompts.py
 create mode 100644 tests/summarizer/test_utils.py

diff --git a/agent_cli/memory/_files.py b/agent_cli/memory/_files.py
index 536e49e80..65fbbc1b2 100644
--- a/agent_cli/memory/_files.py
+++ b/agent_cli/memory/_files.py
@@ -23,6 +23,11 @@
 _SNAPSHOT_FILENAME = "memory_index.json"
 _DELETED_DIRNAME = "deleted"
 
+# Summary level constants for hierarchical file structure
+_SUMMARY_LEVEL_L1 = 1
+_SUMMARY_LEVEL_L2 = 2
+_SUMMARY_LEVEL_L3 = 3
+
 
 @dataclass
 class MemoryFileRecord:
@@ -89,6 +94,16 @@ def write_memory_file(
     summary_kind: str | None = None,
     doc_id: str | None = None,
     source_id: str | None = None,
+    # Hierarchical summary fields
+    level: int | None = None,
+    is_final: bool | None = None,
+    chunk_index: int | None = None,
+    parent_group: int | None = None,
+    group_index: int | None = None,
+    input_tokens: int | None = None,
+    output_tokens: int | None = None,
+    compression_ratio: float | None = None,
+    summary_level_name: str | None = None,
 ) -> MemoryFileRecord:
     """Render and persist a memory document to disk."""
     entries_dir, _ = ensure_store_dirs(root)
@@ -97,7 +112,18 @@ def write_memory_file(
     safe_ts = _safe_timestamp(created_at)
 
     # Route by role/category for readability
-    if summary_kind:
+    if summary_kind and level is not None:
+        # Hierarchical summary file structure
+        if level == _SUMMARY_LEVEL_L1:
+            subdir = Path("summaries") / "L1"
+            filename = f"chunk_{chunk_index or 0}.md"
+        elif level == _SUMMARY_LEVEL_L2:
+            subdir = Path("summaries") / "L2"
+            filename = f"group_{group_index or 0}.md"
+        else:  # level == _SUMMARY_LEVEL_L3
+            subdir = Path("summaries") / "L3"
+            filename = "final.md"
+    elif summary_kind:
         subdir = Path("summaries")
         filename = "summary.md"
     elif role == "user":
@@ -119,6 +145,15 @@ def write_memory_file(
         created_at=created_at,
         summary_kind=summary_kind,
         source_id=source_id,
+        level=level,
+        is_final=is_final,
+        chunk_index=chunk_index,
+        parent_group=parent_group,
+        group_index=group_index,
+        input_tokens=input_tokens,
+        output_tokens=output_tokens,
+        compression_ratio=compression_ratio,
+        summary_level_name=summary_level_name,
     )
 
     front_matter = _render_front_matter(doc_id, metadata)
diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py
index 266b9f80f..6673000c1 100644
--- a/agent_cli/memory/_ingest.py
+++ b/agent_cli/memory/_ingest.py
@@ -10,7 +10,12 @@
 from uuid import uuid4
 
 from agent_cli.memory._git import commit_changes
-from agent_cli.memory._persistence import delete_memory_files, persist_entries, persist_summary
+from agent_cli.memory._persistence import (
+    delete_memory_files,
+    persist_entries,
+    persist_hierarchical_summary,
+    persist_summary,
+)
 from agent_cli.memory._prompt import (
     FACT_INSTRUCTIONS,
     FACT_SYSTEM_PROMPT,
@@ -34,6 +39,8 @@
 
     from chromadb import Collection
 
+    from agent_cli.summarizer import SummaryResult
+
 LOGGER = logging.getLogger(__name__)
 
 _SUMMARY_ROLE = "summary"
@@ -285,7 +292,12 @@ async def update_summary(
     model: str,
     max_tokens: int = 256,
 ) -> str | None:
-    """Update the conversation summary based on new facts."""
+    """Update the conversation summary based on new facts.
+
+    This is the simple Mem0-style rolling summary that incrementally
+    updates based on new facts. For full content adaptive summarization,
+    use `summarize_content` instead.
+    """
     if not new_facts:
         return prior_summary
 
@@ -311,6 +323,82 @@ async def update_summary(
     return result.output.summary or prior_summary
 
 
+async def summarize_content(
+    *,
+    content: str,
+    prior_summary: str | None = None,
+    content_type: str = "general",
+    openai_base_url: str,
+    api_key: str | None,
+    model: str,
+) -> SummaryResult:
+    """Adaptively summarize content based on its length.
+
+    Uses the AdaptiveSummarizer to automatically select the appropriate
+    summarization strategy (NONE, BRIEF, STANDARD, DETAILED, HIERARCHICAL)
+    based on input token count.
+
+    Args:
+        content: The content to summarize.
+        prior_summary: Optional prior summary for context continuity.
+        content_type: Type of content ("general", "conversation", "journal", "document").
+        openai_base_url: Base URL for OpenAI-compatible API.
+        api_key: API key for the LLM.
+        model: Model name to use for summarization.
+
+    Returns:
+        SummaryResult with the summary and metadata.
+
+    """
+    # Import here to avoid circular imports and allow optional dependency
+    from agent_cli.summarizer import AdaptiveSummarizer  # noqa: PLC0415
+
+    summarizer = AdaptiveSummarizer(
+        openai_base_url=openai_base_url,
+        model=model,
+        api_key=api_key,
+    )
+    return await summarizer.summarize(
+        content=content,
+        prior_summary=prior_summary,
+        content_type=content_type,
+    )
+
+
+async def store_adaptive_summary(
+    collection: Collection,
+    memory_root: Path,
+    conversation_id: str,
+    summary_result: SummaryResult,
+) -> list[str]:
+    """Store an adaptive summary result to files and ChromaDB.
+
+    This stores all levels of a hierarchical summary (L1, L2, L3) or
+    just the final summary for simpler levels. Old summaries are deleted first.
+
+    Files are stored as Markdown with YAML front matter in a hierarchical structure:
+    - summaries/L1/chunk_{n}.md - L1 chunk summaries
+    - summaries/L2/group_{n}.md - L2 group summaries
+    - summaries/L3/final.md - L3 final summary
+
+    Args:
+        collection: ChromaDB collection.
+        memory_root: Root path for memory files.
+        conversation_id: The conversation this summary belongs to.
+        summary_result: The result from AdaptiveSummarizer.summarize().
+
+    Returns:
+        List of IDs that were stored.
+
+    """
+    return persist_hierarchical_summary(
+        collection,
+        memory_root=memory_root,
+        conversation_id=conversation_id,
+        summary_result=summary_result,
+    )
+
+
 async def extract_and_store_facts_and_summaries(
     *,
     collection: Collection,
diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py
index bd8f4dfd4..9c38f7315 100644
--- a/agent_cli/memory/_persistence.py
+++ b/agent_cli/memory/_persistence.py
@@ -3,10 +3,13 @@
 from __future__ import annotations
 
 import logging
+import shutil
+from datetime import UTC, datetime
 from typing import TYPE_CHECKING
 
 from agent_cli.memory._files import (
     _DELETED_DIRNAME,
+    _slugify,
     ensure_store_dirs,
     load_snapshot,
     read_memory_file,
@@ -14,7 +17,13 @@
     write_memory_file,
     write_snapshot,
 )
-from agent_cli.memory._store import delete_entries, list_conversation_entries, upsert_memories
+from agent_cli.memory._store import (
+    delete_entries,
+    delete_summaries,
+    list_conversation_entries,
+    upsert_hierarchical_summary,
+    upsert_memories,
+)
 from agent_cli.memory.entities import Fact, Summary, Turn
 
 if TYPE_CHECKING:
@@ -23,6 +32,7 @@
     from chromadb import Collection
 
     from agent_cli.memory.models import MemoryMetadata
+    from agent_cli.summarizer import SummaryResult
 
 LOGGER = logging.getLogger(__name__)
 
@@ -180,3 +190,96 @@ def evict_if_needed(
     ids_to_remove = [e.id for e in overflow]
     delete_entries(collection, ids_to_remove)
     delete_memory_files(memory_root, conversation_id, ids_to_remove)
+
+
+def persist_hierarchical_summary(
+    collection: Collection,
+    *,
+    memory_root: Path,
+    conversation_id: str,
+    summary_result: SummaryResult,
+) -> list[str]:
+    """Persist a hierarchical summary to disk and ChromaDB.
+
+    This function:
+    1. Deletes existing summaries (files and ChromaDB entries)
+    2. Writes new summary files to disk in hierarchical structure
+    3. Stores entries in ChromaDB
+
+    Args:
+        collection: ChromaDB collection.
+        memory_root: Root path for memory files.
+        conversation_id: The conversation this summary belongs to.
+        summary_result: The result from AdaptiveSummarizer.summarize().
+
+    Returns:
+        List of IDs that were stored.
+
+    """
+    from agent_cli.summarizer import SummaryLevel  # noqa: PLC0415
+
+    # Skip if no summary needed
+    if summary_result.level == SummaryLevel.NONE:
+        return []
+
+    # Delete existing summary files
+    _delete_summary_files(memory_root, conversation_id)
+
+    # Delete existing ChromaDB entries
+    delete_summaries(collection, conversation_id)
+
+    # Get storage metadata from SummaryResult
+    entries = summary_result.to_storage_metadata(conversation_id)
+    if not entries:
+        return []
+
+    stored_ids: list[str] = []
+    created_at = datetime.now(UTC).isoformat()
+
+    for entry in entries:
+        meta = entry["metadata"]
+        record = write_memory_file(
+            memory_root,
+            conversation_id=meta["conversation_id"],
+            role=meta["role"],
+            created_at=meta.get("created_at", created_at),
+            content=entry["content"],
+            summary_kind="summary",
+            doc_id=entry["id"],
+            level=meta.get("level"),
+            is_final=meta.get("is_final"),
+            chunk_index=meta.get("chunk_index"),
+            parent_group=meta.get("parent_group"),
+            group_index=meta.get("group_index"),
+            input_tokens=meta.get("input_tokens"),
+            output_tokens=meta.get("output_tokens"),
+            compression_ratio=meta.get("compression_ratio"),
+            summary_level_name=meta.get("summary_level"),
+        )
+        LOGGER.info("Persisted summary file: %s (level=%s)", record.path, meta.get("level"))
+        stored_ids.append(record.id)
+
+    # Store in ChromaDB
+    upsert_hierarchical_summary(collection, conversation_id, summary_result)
+
+    return stored_ids
+
+
+def _delete_summary_files(memory_root: Path, conversation_id: str) -> None:
+    """Delete all summary files for a conversation."""
+    entries_dir, _ = ensure_store_dirs(memory_root)
+    safe_conversation = _slugify(conversation_id)
+    summaries_dir = entries_dir / safe_conversation / "summaries"
+
+    if summaries_dir.exists():
+        # Move to deleted folder instead of hard delete
+        deleted_dir = entries_dir / _DELETED_DIRNAME / safe_conversation / "summaries"
+        deleted_dir.parent.mkdir(parents=True, exist_ok=True)
+
+        # If deleted summaries already exist, remove them first
+        if deleted_dir.exists():
+            shutil.rmtree(deleted_dir)
+
+        # Move current summaries to deleted
+        shutil.move(str(summaries_dir), str(deleted_dir))
+        LOGGER.info("Moved old summaries to deleted: %s", deleted_dir)
diff --git a/agent_cli/memory/_store.py b/agent_cli/memory/_store.py
index 96e7c66af..4f3755b12 100644
--- a/agent_cli/memory/_store.py
+++ b/agent_cli/memory/_store.py
@@ -167,3 +167,154 @@ def list_conversation_entries(
 def delete_entries(collection: Collection, ids: list[str]) -> None:
     """Delete entries by ID."""
     delete_docs(collection, ids)
+
+
+def upsert_hierarchical_summary(
+    collection: Collection,
+    conversation_id: str,
+    summary_result: Any,
+) -> list[str]:
+    """Store all levels of a hierarchical summary.
+
+    Uses SummaryResult.to_storage_metadata() to generate ChromaDB entries
+    for L1 (chunk), L2 (group), and L3 (final) summaries.
+
+    Args:
+        collection: ChromaDB collection.
+        conversation_id: The conversation this summary belongs to.
+        summary_result: A SummaryResult from the adaptive summarizer.
+
+    Returns:
+        List of IDs that were upserted.
+
+    """
+    entries = summary_result.to_storage_metadata(conversation_id)
+    if not entries:
+        return []
+
+    ids: list[str] = []
+    contents: list[str] = []
+    metadatas: list[MemoryMetadata] = []
+
+    for entry in entries:
+        ids.append(entry["id"])
+        contents.append(entry["content"])
+        # Convert the raw metadata dict to MemoryMetadata
+        meta_dict = entry["metadata"]
+        metadatas.append(
+            MemoryMetadata(
+                conversation_id=meta_dict["conversation_id"],
+                role=meta_dict["role"],
+                created_at=meta_dict["created_at"],
+                level=meta_dict.get("level"),
+                is_final=meta_dict.get("is_final"),
+                chunk_index=meta_dict.get("chunk_index"),
+                parent_group=meta_dict.get("parent_group"),
+                group_index=meta_dict.get("group_index"),
+                input_tokens=meta_dict.get("input_tokens"),
+                output_tokens=meta_dict.get("output_tokens"),
+                compression_ratio=meta_dict.get("compression_ratio"),
+                summary_level_name=meta_dict.get("summary_level"),
+            ),
+        )
+
+    upsert_memories(collection, ids=ids, contents=contents, metadatas=metadatas)
+    return ids
+
+
+def get_summary_at_level(
+    collection: Collection,
+    conversation_id: str,
+    level: int,
+) -> list[StoredMemory]:
+    """Retrieve summaries at a specific level for a conversation.
+
+    Args:
+        collection: ChromaDB collection.
+        conversation_id: The conversation to retrieve summaries for.
+        level: Summary level (1=chunk, 2=group, 3=final).
+
+    Returns:
+        List of StoredMemory entries at the requested level.
+
+    """
+    filters: list[dict[str, Any]] = [
+        {"conversation_id": conversation_id},
+        {"role": "summary"},
+        {"level": level},
+    ]
+    result = collection.get(where={"$and": filters})
+    docs = result.get("documents") or []
+    metas = result.get("metadatas") or []
+    ids = result.get("ids") or []
+
+    records: list[StoredMemory] = []
+    for doc, meta, entry_id in zip(docs, metas, ids, strict=False):
+        records.append(
+            StoredMemory(
+                id=entry_id,
+                content=doc,
+                metadata=MemoryMetadata(**dict(meta)),
+                distance=None,
+            ),
+        )
+    return records
+
+
+def get_final_summary(
+    collection: Collection,
+    conversation_id: str,
+) -> StoredMemory | None:
+    """Get the L3 (final) summary for a conversation.
+
+    This is a convenience wrapper around get_summary_at_level for the
+    most common use case of retrieving the top-level summary.
+
+    Args:
+        collection: ChromaDB collection.
+        conversation_id: The conversation to retrieve the summary for.
+
+    Returns:
+        The final summary entry, or None if not found.
+
+    """
+    summaries = get_summary_at_level(collection, conversation_id, level=3)
+    # Return the one marked as final, or the first if none marked
+    for summary in summaries:
+        if summary.metadata.is_final:
+            return summary
+    return summaries[0] if summaries else None
+
+
+def delete_summaries(
+    collection: Collection,
+    conversation_id: str,
+    *,
+    levels: list[int] | None = None,
+) -> int:
+    """Delete summary entries for a conversation.
+
+    Args:
+        collection: ChromaDB collection.
+        conversation_id: The conversation to delete summaries from.
+        levels: Optional list of levels to delete. If None, deletes all levels.
+
+    Returns:
+        Number of entries deleted.
+
+    """
+    filters: list[dict[str, Any]] = [
+        {"conversation_id": conversation_id},
+        {"role": "summary"},
+    ]
+    if levels:
+        filters.append({"level": {"$in": levels}})
+
+    # First get the IDs to count them
+    result = collection.get(where={"$and": filters})
+    ids = result.get("ids") or []
+
+    if ids:
+        delete_docs(collection, list(ids))
+
+    return len(ids)
diff --git a/agent_cli/memory/models.py b/agent_cli/memory/models.py
index 9ef076d57..6dc689d8f 100644
--- a/agent_cli/memory/models.py
+++ b/agent_cli/memory/models.py
@@ -49,6 +49,26 @@ class MemoryMetadata(BaseModel):
     replaced_by: str | None = None
     source_id: str | None = None
 
+    # Hierarchical summary fields (only used when role="summary")
+    level: int | None = None
+    """Summary level: 1=chunk, 2=group, 3=final."""
+    is_final: bool | None = None
+    """Whether this is the final L3 summary."""
+    chunk_index: int | None = None
+    """For L1 summaries: index of the source chunk."""
+    parent_group: int | None = None
+    """For L1 summaries: which L2 group this chunk belongs to."""
+    group_index: int | None = None
+    """For L2 summaries: index of this group."""
+    input_tokens: int | None = None
+    """Number of tokens in the original input (L3 only)."""
+    output_tokens: int | None = None
+    """Number of tokens in the summary output (L3 only)."""
+    compression_ratio: float | None = None
+    """Ratio of output to input tokens (L3 only)."""
+    summary_level_name: str | None = None
+    """Name of the SummaryLevel enum used (e.g., 'STANDARD', 'HIERARCHICAL')."""
+
 
 class SummaryOutput(BaseModel):
     """Structured summary returned by the LLM."""
diff --git a/agent_cli/summarizer/__init__.py b/agent_cli/summarizer/__init__.py
new file mode 100644
index 000000000..c6f1d85a1
--- /dev/null
+++ b/agent_cli/summarizer/__init__.py
@@ -0,0 +1,31 @@
+"""Adaptive summarization module for variable-length content.
+
+This module provides research-grounded summarization that scales with input complexity,
+inspired by Letta (partial eviction, middle truncation) and Mem0 (rolling summaries,
+compression ratios) architectures.
+
+Example:
+    from agent_cli.summarizer import AdaptiveSummarizer, SummaryLevel
+
+    summarizer = AdaptiveSummarizer(
+        openai_base_url="http://localhost:8000/v1",
+        model="gpt-4",
+    )
+    result = await summarizer.summarize(long_document)
+    print(f"Level: {result.level}, Compression: {result.compression_ratio:.1%}")
+
+"""
+
+from agent_cli.summarizer.adaptive import AdaptiveSummarizer
+from agent_cli.summarizer.models import (
+    HierarchicalSummary,
+    SummaryLevel,
+    SummaryResult,
+)
+
+__all__ = [
+    "AdaptiveSummarizer",
+    "HierarchicalSummary",
+    "SummaryLevel",
+    "SummaryResult",
+]
diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py
new file mode 100644
index 000000000..ed0074d87
--- /dev/null
+++ b/agent_cli/summarizer/adaptive.py
@@ -0,0 +1,502 @@
+"""Adaptive summarization that scales with input complexity.
+
+This module implements research-grounded summarization inspired by:
+- Letta: Partial eviction (30%), middle truncation, fire-and-forget background processing
+- Mem0: Rolling summaries, 90%+ compression, two-phase architecture
+
+Reference: arXiv:2504.19413 (Mem0), arXiv:2310.08560 (MemGPT/Letta)
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+
+import httpx
+from pydantic import BaseModel
+from pydantic_ai import Agent
+from pydantic_ai.models.openai import OpenAIChatModel
+from pydantic_ai.providers.openai import OpenAIProvider
+from pydantic_ai.settings import ModelSettings
+
+from agent_cli.summarizer.models import (
+    ChunkSummary,
+    HierarchicalSummary,
+    SummaryLevel,
+    SummaryResult,
+)
+from agent_cli.summarizer.prompts import (
+    BRIEF_SUMMARY_PROMPT,
+    CHUNK_SUMMARY_PROMPT,
+    META_SUMMARY_PROMPT,
+    ROLLING_SUMMARY_PROMPT,
+    format_prior_context,
+    format_summaries_for_meta,
+    get_prompt_for_content_type,
+)
+from agent_cli.summarizer.utils import (
+    chunk_text,
+    count_tokens,
+    estimate_summary_tokens,
+    tokens_to_words,
+)
+
+logger = logging.getLogger(__name__)
+
+# Thresholds for summary levels (in tokens)
+LEVEL_THRESHOLDS = {
+    SummaryLevel.NONE: 100,
+    SummaryLevel.BRIEF: 500,
+    SummaryLevel.STANDARD: 3000,
+    SummaryLevel.DETAILED: 15000,
+    # HIERARCHICAL is everything above DETAILED
+}
+
+# Number of L1 chunks to group together for L2 summaries
+L2_GROUP_SIZE = 5
+# Minimum number of L1 chunks before L2 grouping is applied
+L2_MIN_CHUNKS = 5
+
+
+class SummaryOutput(BaseModel):
+    """Structured output for summary generation."""
+
+    summary: str
+
+
+class AdaptiveSummarizer:
+    """Adaptive summarization that scales with input complexity.
+
+    Automatically selects the appropriate summarization strategy based on
+    input length:
+    - NONE (< 100 tokens): No summary needed
+    - BRIEF (100-500 tokens): Single sentence
+    - STANDARD (500-3000 tokens): Paragraph summary
+    - DETAILED (3000-15000 tokens): Chunked + meta-summary
+    - HIERARCHICAL (> 15000 tokens): Multi-level tree of summaries
+
+    Example:
+        summarizer = AdaptiveSummarizer(
+            openai_base_url="http://localhost:8000/v1",
+            model="llama3.1:8b",
+        )
+        result = await summarizer.summarize(long_document)
+        print(f"Level: {result.level.name}")
+        print(f"Summary: {result.summary}")
+        print(f"Compression: {result.compression_ratio:.1%}")
+
+    """
+
+    def __init__(
+        self,
+        openai_base_url: str,
+        model: str,
+        api_key: str | None = None,
+        chunk_size: int = 3000,
+        chunk_overlap: int = 200,
+        max_concurrent_chunks: int = 5,
+        timeout: float = 60.0,
+    ) -> None:
+        """Initialize the adaptive summarizer.
+
+        Args:
+            openai_base_url: Base URL for OpenAI-compatible API.
+            model: Model name to use for summarization.
+            api_key: API key (optional for local models).
+            chunk_size: Target token count per chunk for hierarchical summarization.
+            chunk_overlap: Token overlap between chunks.
+            max_concurrent_chunks: Maximum parallel chunk summarizations.
+            timeout: Request timeout in seconds.
+
+        """
+        self.openai_base_url = openai_base_url.rstrip("/")
+        self.model = model
+        self.api_key = api_key or "not-needed"
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.max_concurrent_chunks = max_concurrent_chunks
+        self.timeout = timeout
+
+        self._provider = OpenAIProvider(api_key=self.api_key, base_url=self.openai_base_url)
+
+    def determine_level(self, token_count: int) -> SummaryLevel:
+        """Determine the appropriate summary level based on token count.
+
+        Args:
+            token_count: Number of tokens in the input.
+
+        Returns:
+            The recommended SummaryLevel.
+
+        """
+        if token_count < LEVEL_THRESHOLDS[SummaryLevel.NONE]:
+            return SummaryLevel.NONE
+        if token_count < LEVEL_THRESHOLDS[SummaryLevel.BRIEF]:
+            return SummaryLevel.BRIEF
+        if token_count < LEVEL_THRESHOLDS[SummaryLevel.STANDARD]:
+            return SummaryLevel.STANDARD
+        if token_count < LEVEL_THRESHOLDS[SummaryLevel.DETAILED]:
+            return SummaryLevel.DETAILED
+        return SummaryLevel.HIERARCHICAL
+
+    async def summarize(
+        self,
+        content: str,
+        prior_summary: str | None = None,
+        content_type: str = "general",
+    ) -> SummaryResult:
+        """Summarize content with adaptive strategy based on length.
+
+        Args:
+            content: The content to summarize.
+            prior_summary: Optional prior summary for context continuity.
+            content_type: Type of content ("general", "conversation", "journal", "document").
+
+        Returns:
+            SummaryResult with summary and metadata.
+
+        """
+        if not content or not content.strip():
+            return SummaryResult(
+                level=SummaryLevel.NONE,
+                summary=None,
+                hierarchical=None,
+                input_tokens=0,
+                output_tokens=0,
+                compression_ratio=0.0,
+            )
+
+        input_tokens = count_tokens(content, self.model)
+        level = self.determine_level(input_tokens)
+
+        logger.info(
+            "Summarizing %d tokens at level %s (type=%s)",
+            input_tokens,
+            level.name,
+            content_type,
+        )
+
+        if level == SummaryLevel.NONE:
+            return SummaryResult(
+                level=level,
+                summary=None,
+                hierarchical=None,
+                input_tokens=input_tokens,
+                output_tokens=0,
+                compression_ratio=0.0,
+            )
+
+        if level == SummaryLevel.BRIEF:
+            summary = await self._brief_summary(content)
+        elif level == SummaryLevel.STANDARD:
+            summary = await self._standard_summary(content, prior_summary, content_type)
+        elif level == SummaryLevel.DETAILED:
+            return await self._detailed_summary(content, input_tokens)
+        else:  # HIERARCHICAL
+            return await self._hierarchical_summary(content, input_tokens)
+
+        output_tokens = count_tokens(summary, self.model) if summary else 0
+        compression_ratio = output_tokens / input_tokens if input_tokens > 0 else 0.0
+
+        return SummaryResult(
+            level=level,
+            summary=summary,
+            hierarchical=None,
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            compression_ratio=compression_ratio,
+        )
+
+    async def update_rolling_summary(
+        self,
+        prior_summary: str | None,
+        new_facts: list[str],
+    ) -> str:
+        """Update a rolling summary with new facts (Mem0-style).
+
+        This is optimized for incremental updates where you have discrete
+        new facts to integrate into an existing summary.
+
+        Args:
+            prior_summary: The existing summary to update.
+            new_facts: List of new facts to integrate.
+
+        Returns:
+            Updated summary string.
+
+        """
+        if not new_facts:
+            return prior_summary or ""
+
+        new_content = "\n".join(f"- {fact}" for fact in new_facts)
+        combined_tokens = count_tokens(
+            (prior_summary or "") + new_content,
+            self.model,
+        )
+
+        target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD)
+        max_words = tokens_to_words(target_tokens)
+
+        prompt = ROLLING_SUMMARY_PROMPT.format(
+            prior_summary=prior_summary or "(No prior summary)",
+            new_content=new_content,
+            max_words=max_words,
+        )
+
+        return await self._generate_summary(prompt, max_tokens=target_tokens + 50)
+
+    async def _brief_summary(self, content: str) -> str:
+        """Generate a single-sentence summary for brief content."""
+        prompt = BRIEF_SUMMARY_PROMPT.format(content=content)
+        return await self._generate_summary(prompt, max_tokens=50)
+
+    async def _standard_summary(
+        self,
+        content: str,
+        prior_summary: str | None,
+        content_type: str,
+    ) -> str:
+        """Generate a paragraph summary for standard-length content."""
+        input_tokens = count_tokens(content, self.model)
+        target_tokens = estimate_summary_tokens(input_tokens, SummaryLevel.STANDARD)
+        max_words = tokens_to_words(target_tokens)
+
+        prompt_template = get_prompt_for_content_type(content_type)
+        prior_context = format_prior_context(prior_summary)
+
+        prompt = prompt_template.format(
+            content=content,
+            prior_context=prior_context,
+            max_words=max_words,
+        )
+
+        return await self._generate_summary(prompt, max_tokens=target_tokens + 50)
+
+    async def _detailed_summary(self, content: str, input_tokens: int) -> SummaryResult:
+        """Generate chunked summaries with meta-summary for detailed content."""
+        chunks = chunk_text(
+            content,
+            chunk_size=self.chunk_size,
+            overlap=self.chunk_overlap,
+            model=self.model,
+        )
+
+        logger.info("Detailed summary: processing %d chunks", len(chunks))
+
+        # Summarize chunks (with concurrency limit)
+        semaphore = asyncio.Semaphore(self.max_concurrent_chunks)
+
+        async def summarize_chunk(idx: int, chunk: str) -> ChunkSummary:
+            async with semaphore:
+                chunk_tokens = count_tokens(chunk, self.model)
+                target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD)
+                max_words = tokens_to_words(target_tokens)
+
+                prompt = CHUNK_SUMMARY_PROMPT.format(
+                    chunk_index=idx + 1,
+                    total_chunks=len(chunks),
+                    content=chunk,
+                    max_words=max_words,
+                )
+
+                summary = await self._generate_summary(prompt, max_tokens=target_tokens + 50)
+                summary_tokens = count_tokens(summary, self.model)
+
+                return ChunkSummary(
+                    chunk_index=idx,
+                    content=summary,
+                    token_count=summary_tokens,
+                    source_tokens=chunk_tokens,
+                    parent_group=None,
+                )
+
+        chunk_summaries = await asyncio.gather(
+            *[summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)],
+        )
+
+        # Generate meta-summary
+        all_summaries = [cs.content for cs in chunk_summaries]
+        meta_target = estimate_summary_tokens(input_tokens, SummaryLevel.DETAILED)
+        max_words = tokens_to_words(meta_target)
+
+        meta_prompt = META_SUMMARY_PROMPT.format(
+            summaries=format_summaries_for_meta(all_summaries),
+            max_words=max_words,
+        )
+
+        final_summary = await self._generate_summary(meta_prompt, max_tokens=meta_target + 100)
+        output_tokens = count_tokens(final_summary, self.model)
+
+        hierarchical = HierarchicalSummary(
+            l1_summaries=list(chunk_summaries),
+            l2_summaries=[],  # Not used for DETAILED level
+            l3_summary=final_summary,
+            chunk_size=self.chunk_size,
+            chunk_overlap=self.chunk_overlap,
+        )
+
+        return SummaryResult(
+            level=SummaryLevel.DETAILED,
+            summary=final_summary,
+            hierarchical=hierarchical,
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0,
+        )
+
+    async def _hierarchical_summary(self, content: str, input_tokens: int) -> SummaryResult:
+        """Build a tree of summaries for very long content.
+
+        Structure:
+        - L1: Individual chunk summaries
+        - L2: Group summaries (groups of ~5 L1 summaries)
+        - L3: Final synthesis
+        """
+        chunks = chunk_text(
+            content,
+            chunk_size=self.chunk_size,
+            overlap=self.chunk_overlap,
+            model=self.model,
+        )
+
+        logger.info("Hierarchical summary: processing %d chunks in tree", len(chunks))
+
+        # L1: Summarize each chunk
+        semaphore = asyncio.Semaphore(self.max_concurrent_chunks)
+
+        async def summarize_chunk(idx: int, chunk: str) -> ChunkSummary:
+            async with semaphore:
+                chunk_tokens = count_tokens(chunk, self.model)
+                target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD)
+                max_words = tokens_to_words(target_tokens)
+
+                prompt = CHUNK_SUMMARY_PROMPT.format(
+                    chunk_index=idx + 1,
+                    total_chunks=len(chunks),
+                    content=chunk,
+                    max_words=max_words,
+                )
+
+                summary = await self._generate_summary(prompt, max_tokens=target_tokens + 50)
+                summary_tokens = count_tokens(summary, self.model)
+
+                # Assign to group (5 chunks per group)
+                group_idx = idx // 5
+
+                return ChunkSummary(
+                    chunk_index=idx,
+                    content=summary,
+                    token_count=summary_tokens,
+                    source_tokens=chunk_tokens,
+                    parent_group=group_idx,
+                )
+
+        l1_summaries = await asyncio.gather(
+            *[summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)],
+        )
+
+        # L2: Group summaries (if more than L2_MIN_CHUNKS chunks)
+        l2_summaries: list[str] = []
+        if len(l1_summaries) > L2_MIN_CHUNKS:
+            groups: list[list[str]] = []
+            for i in range(0, len(l1_summaries), L2_GROUP_SIZE):
+                group = [cs.content for cs in l1_summaries[i : i + L2_GROUP_SIZE]]
+                groups.append(group)
+
+            async def summarize_group(group: list[str]) -> str:
+                combined_tokens = sum(count_tokens(s, self.model) for s in group)
+                target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD)
+                max_words = tokens_to_words(target_tokens)
+
+                prompt = META_SUMMARY_PROMPT.format(
+                    summaries=format_summaries_for_meta(group),
+                    max_words=max_words,
+                )
+                return await self._generate_summary(prompt, max_tokens=target_tokens + 50)
+
+            l2_summaries = await asyncio.gather(*[summarize_group(g) for g in groups])
+
+        # L3: Final synthesis
+        summaries_to_synthesize = (
+            l2_summaries if l2_summaries else [cs.content for cs in l1_summaries]
+        )
+        final_target = estimate_summary_tokens(input_tokens, SummaryLevel.HIERARCHICAL)
+        max_words = tokens_to_words(final_target)
+
+        final_prompt = META_SUMMARY_PROMPT.format(
+            summaries=format_summaries_for_meta(summaries_to_synthesize),
+            max_words=max_words,
+        )
+
+        final_summary = await self._generate_summary(final_prompt, max_tokens=final_target + 100)
+        output_tokens = count_tokens(final_summary, self.model)
+
+        hierarchical = HierarchicalSummary(
+            l1_summaries=list(l1_summaries),
+            l2_summaries=list(l2_summaries),
+            l3_summary=final_summary,
+            chunk_size=self.chunk_size,
+            chunk_overlap=self.chunk_overlap,
+        )
+
+        return SummaryResult(
+            level=SummaryLevel.HIERARCHICAL,
+            summary=final_summary,
+            hierarchical=hierarchical,
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0,
+        )
+
+    async def _generate_summary(self, prompt: str, max_tokens: int = 256) -> str:
+        """Generate a summary using the LLM.
+
+        Uses PydanticAI for structured output with fallback to raw generation.
+        """
+        model = OpenAIChatModel(
+            model_name=self.model,
+            provider=self._provider,
+            settings=ModelSettings(
+                temperature=0.3,
+                max_tokens=max_tokens,
+            ),
+        )
+
+        agent = Agent(
+            model=model,
+            system_prompt="You are a concise summarizer. Output only the summary, no preamble.",
+            output_type=SummaryOutput,
+            retries=2,
+        )
+
+        try:
+            result = await agent.run(prompt)
+            return result.output.summary.strip()
+        except Exception as e:
+            logger.warning("Structured summary failed, trying raw generation: %s", e)
+            # Fallback to raw HTTP call
+            return await self._raw_generate(prompt, max_tokens)
+
+    async def _raw_generate(self, prompt: str, max_tokens: int) -> str:
+        """Fallback raw HTTP generation without structured output."""
+        async with httpx.AsyncClient(timeout=self.timeout) as client:
+            response = await client.post(
+                f"{self.openai_base_url}/chat/completions",
+                headers={"Authorization": f"Bearer {self.api_key}"},
+                json={
+                    "model": self.model,
+                    "messages": [
+                        {"role": "system", "content": "You are a concise summarizer."},
+                        {"role": "user", "content": prompt},
+                    ],
+                    "temperature": 0.3,
+                    "max_tokens": max_tokens,
+                },
+            )
+            response.raise_for_status()
+            data = response.json()
+
+        choices = data.get("choices", [])
+        if choices:
+            return choices[0].get("message", {}).get("content", "").strip()
+        return ""
diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py
new file mode 100644
index 000000000..f231a41e5
--- /dev/null
+++ b/agent_cli/summarizer/models.py
@@ -0,0 +1,220 @@
+"""Data models for adaptive summarization."""
+
+from __future__ import annotations
+
+from datetime import datetime
+from enum import IntEnum
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+# Hierarchical level constants for storage
+HIERARCHICAL_LEVEL_L1 = 1
+HIERARCHICAL_LEVEL_L2 = 2
+HIERARCHICAL_LEVEL_L3 = 3
+
+
+class SummaryLevel(IntEnum):
+    """Summary granularity levels based on input complexity.
+
+    Thresholds are based on Mem0 research showing optimal compression ratios
+    at different content lengths. Token counts are approximate guidelines.
+    """
+
+    NONE = 0
+    """< 100 tokens: No summary needed, facts only."""
+
+    BRIEF = 1
+    """100-500 tokens: Single-sentence summary (~20% compression)."""
+
+    STANDARD = 2
+    """500-3000 tokens: Paragraph summary (~12% compression)."""
+
+    DETAILED = 3
+    """3000-15000 tokens: Chunked summaries + meta-summary (~7% compression)."""
+
+    HIERARCHICAL = 4
+    """> 15000 tokens: Tree of summaries with multiple levels."""
+
+
+class ChunkSummary(BaseModel):
+    """Summary of a single chunk within a hierarchical summary."""
+
+    chunk_index: int = Field(..., description="Index of this chunk in the original content")
+    content: str = Field(..., description="The summarized content of this chunk")
+    token_count: int = Field(..., ge=0, description="Token count of this summary")
+    source_tokens: int = Field(..., ge=0, description="Token count of the source chunk")
+    parent_group: int | None = Field(
+        default=None,
+        description="Index of the L2 group this chunk belongs to",
+    )
+
+
+class HierarchicalSummary(BaseModel):
+    """A hierarchical summary with multiple levels.
+
+    Structure inspired by Letta's partial eviction pattern:
+    - L1: Individual chunk summaries (parallel processing)
+    - L2: Group summaries (groups of ~5 L1 summaries)
+    - L3: Final synthesis (single top-level summary)
+    """
+
+    l1_summaries: list[ChunkSummary] = Field(
+        default_factory=list,
+        description="Level 1: Individual chunk summaries",
+    )
+    l2_summaries: list[str] = Field(
+        default_factory=list,
+        description="Level 2: Group summaries (if > 5 chunks)",
+    )
+    l3_summary: str = Field(
+        ...,
+        description="Level 3: Final synthesized summary",
+    )
+    chunk_size: int = Field(
+        default=3000,
+        description="Token size used for chunking",
+    )
+    chunk_overlap: int = Field(
+        default=200,
+        description="Token overlap between chunks",
+    )
+
+    def get_summary_at_level(self, level: int) -> str | list[str]:
+        """Get summary content at a specific level.
+
+        Args:
+            level: 1 for chunk summaries, 2 for group summaries, 3 for final.
+
+        Returns:
+            Summary content at the requested level.
+
+        """
+        if level == HIERARCHICAL_LEVEL_L1:
+            return [cs.content for cs in self.l1_summaries]
+        if level == HIERARCHICAL_LEVEL_L2:
+            return self.l2_summaries if self.l2_summaries else [self.l3_summary]
+        return self.l3_summary
+
+
+class SummaryResult(BaseModel):
+    """Result of adaptive summarization.
+
+    Contains the summary at the appropriate level for the input complexity,
+    along with metadata about the compression achieved.
+    """
+
+    level: SummaryLevel = Field(..., description="The summarization level used")
+    summary: str | None = Field(
+        default=None,
+        description="The final summary text (None for NONE level)",
+    )
+    hierarchical: HierarchicalSummary | None = Field(
+        default=None,
+        description="Full hierarchical structure (for DETAILED/HIERARCHICAL levels)",
+    )
+    input_tokens: int = Field(..., ge=0, description="Token count of the input content")
+    output_tokens: int = Field(..., ge=0, description="Token count of the summary")
+    compression_ratio: float = Field(
+        ...,
+        ge=0.0,
+        le=1.0,
+        description="Ratio of output to input tokens (lower = more compression)",
+    )
+    created_at: datetime = Field(
+        default_factory=datetime.utcnow,
+        description="Timestamp when summary was created",
+    )
+
+    @property
+    def chunk_summaries(self) -> list[str] | None:
+        """Get L1 chunk summaries if available."""
+        if self.hierarchical:
+            return [cs.content for cs in self.hierarchical.l1_summaries]
+        return None
+
+    def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]:
+        """Convert to metadata entries for ChromaDB storage.
+
+        Returns a list of metadata dicts, one for each summary level stored.
+        """
+        entries: list[dict[str, Any]] = []
+        timestamp = self.created_at.isoformat()
+
+        if self.level == SummaryLevel.NONE:
+            return entries
+
+        # For hierarchical summaries, store each level
+        if self.hierarchical:
+            # L1: Individual chunk summaries
+            entries.extend(
+                {
+                    "id": f"{conversation_id}:summary:L1:{cs.chunk_index}",
+                    "content": cs.content,
+                    "metadata": {
+                        "conversation_id": conversation_id,
+                        "role": "summary",
+                        "level": HIERARCHICAL_LEVEL_L1,
+                        "chunk_index": cs.chunk_index,
+                        "parent_group": cs.parent_group,
+                        "token_count": cs.token_count,
+                        "created_at": timestamp,
+                    },
+                }
+                for cs in self.hierarchical.l1_summaries
+            )
+
+            # L2: Group summaries
+            entries.extend(
+                {
+                    "id": f"{conversation_id}:summary:L2:{idx}",
+                    "content": l2_summary,
+                    "metadata": {
+                        "conversation_id": conversation_id,
+                        "role": "summary",
+                        "level": HIERARCHICAL_LEVEL_L2,
+                        "group_index": idx,
+                        "created_at": timestamp,
+                    },
+                }
+                for idx, l2_summary in enumerate(self.hierarchical.l2_summaries)
+            )
+
+            # L3: Final summary
+            entries.append(
+                {
+                    "id": f"{conversation_id}:summary:L3:final",
+                    "content": self.hierarchical.l3_summary,
+                    "metadata": {
+                        "conversation_id": conversation_id,
+                        "role": "summary",
+                        "level": HIERARCHICAL_LEVEL_L3,
+                        "is_final": True,
+                        "input_tokens": self.input_tokens,
+                        "output_tokens": self.output_tokens,
+                        "compression_ratio": self.compression_ratio,
+                        "created_at": timestamp,
+                    },
+                },
+            )
+        elif self.summary:
+            # Non-hierarchical: just store the single summary
+            entries.append(
+                {
+                    "id": f"{conversation_id}:summary:L3:final",
+                    "content": self.summary,
+                    "metadata": {
+                        "conversation_id": conversation_id,
+                        "role": "summary",
+                        "level": HIERARCHICAL_LEVEL_L3,
+                        "is_final": True,
+                        "summary_level": self.level.name,
+                        "input_tokens": self.input_tokens,
+                        "output_tokens": self.output_tokens,
+                        "compression_ratio": self.compression_ratio,
+                        "created_at": timestamp,
+                    },
+                },
+            )
+
+        return entries
diff --git a/agent_cli/summarizer/prompts.py b/agent_cli/summarizer/prompts.py
new file mode 100644
index 000000000..101422b77
--- /dev/null
+++ b/agent_cli/summarizer/prompts.py
@@ -0,0 +1,135 @@
+"""Prompt templates for adaptive summarization.
+
+These prompts are designed to work with various LLM sizes (8B-20B parameters)
+and are optimized for structured, factual output.
+"""
+
+# Level 1: BRIEF - Single sentence summary
+BRIEF_SUMMARY_PROMPT = """Summarize the following in ONE sentence (maximum 20 words).
+Focus on the single most important point or takeaway.
+
+Content:
+{content}
+
+One-sentence summary:""".strip()
+
+# Level 2: STANDARD - Paragraph summary
+STANDARD_SUMMARY_PROMPT = """Summarize the following content concisely in a short paragraph.
+
+Focus on:
+- Key facts, decisions, and outcomes
+- Important context that should be remembered
+- Skip transient details, greetings, and chitchat
+
+{prior_context}
+
+Content to summarize:
+{content}
+
+Summary (maximum {max_words} words):""".strip()
+
+# Level 3: DETAILED - Used for individual chunks in hierarchical summarization
+CHUNK_SUMMARY_PROMPT = """Summarize this section of a longer document.
+Capture the main points while preserving important details.
+
+Section {chunk_index} of {total_chunks}:
+{content}
+
+Summary of this section (maximum {max_words} words):""".strip()
+
+# Level 4: META - Combine multiple summaries into one
+META_SUMMARY_PROMPT = """Synthesize these summaries into a single coherent overview.
+Identify common themes and key points across all sections.
+Eliminate redundancy while preserving unique insights.
+
+Summaries to combine:
+{summaries}
+
+Combined summary (maximum {max_words} words):""".strip()
+
+# Rolling summary update (Mem0-style)
+ROLLING_SUMMARY_PROMPT = """Update the running summary with new information.
+Integrate new facts seamlessly while keeping the summary concise.
+Drop redundant or superseded information.
+Preserve durable facts about identity, preferences, and important events.
+
+Current summary:
+{prior_summary}
+
+New information to integrate:
+{new_content}
+
+Updated summary (maximum {max_words} words):""".strip()
+
+# For conversation-specific summarization
+CONVERSATION_SUMMARY_PROMPT = """Summarize this conversation from the AI assistant's perspective.
+Focus on:
+- What the user wanted or asked about
+- Key information the user shared about themselves
+- Decisions made or conclusions reached
+- Any commitments or follow-ups mentioned
+
+Conversation:
+{content}
+
+Summary (maximum {max_words} words):""".strip()
+
+# For journal/personal content
+JOURNAL_SUMMARY_PROMPT = """Summarize this personal entry or reflection.
+Preserve:
+- Key events and experiences mentioned
+- Emotions and insights expressed
+- Goals, plans, or intentions stated
+- People, places, or things that are important
+
+Entry:
+{content}
+
+Summary (maximum {max_words} words):""".strip()
+
+# For technical/document content
+DOCUMENT_SUMMARY_PROMPT = """Summarize this technical content or documentation.
+Focus on:
+- Main concepts and their relationships
+- Key procedures or processes described
+- Important specifications or requirements
+- Conclusions or recommendations
+
+Document:
+{content}
+
+Summary (maximum {max_words} words):""".strip()
+
+
+def get_prompt_for_content_type(content_type: str) -> str:
+    """Get the appropriate prompt template for a content type.
+
+    Args:
+        content_type: One of "general", "conversation", "journal", "document".
+
+    Returns:
+        The prompt template string.
+
+    """
+    prompts = {
+        "general": STANDARD_SUMMARY_PROMPT,
+        "conversation": CONVERSATION_SUMMARY_PROMPT,
+        "journal": JOURNAL_SUMMARY_PROMPT,
+        "document": DOCUMENT_SUMMARY_PROMPT,
+    }
+    return prompts.get(content_type, STANDARD_SUMMARY_PROMPT)
+
+
+def format_prior_context(prior_summary: str | None) -> str:
+    """Format prior summary context for inclusion in prompts."""
+    if prior_summary:
+        return f"Prior context (for continuity):\n{prior_summary}\n"
+    return ""
+
+
+def format_summaries_for_meta(summaries: list[str]) -> str:
+    """Format a list of summaries for the meta-summary prompt."""
+    formatted = []
+    for i, summary in enumerate(summaries, 1):
+        formatted.append(f"[Section {i}]\n{summary}")
+    return "\n\n".join(formatted)
diff --git a/agent_cli/summarizer/utils.py b/agent_cli/summarizer/utils.py
new file mode 100644
index 000000000..bc319f5b5
--- /dev/null
+++ b/agent_cli/summarizer/utils.py
@@ -0,0 +1,258 @@
+"""Utility functions for adaptive summarization."""
+
+from __future__ import annotations
+
+import re
+from functools import lru_cache
+from typing import TYPE_CHECKING
+
+from agent_cli.summarizer.models import SummaryLevel
+
+if TYPE_CHECKING:
+    import tiktoken
+
+
+@lru_cache(maxsize=4)
+def _get_encoding(model: str = "gpt-4") -> tiktoken.Encoding:
+    """Get tiktoken encoding for a model, with caching.
+
+    Falls back to cl100k_base for unknown models (covers most modern LLMs).
+    """
+    import tiktoken  # noqa: PLC0415
+
+    try:
+        return tiktoken.encoding_for_model(model)
+    except KeyError:
+        return tiktoken.get_encoding("cl100k_base")
+
+
+def count_tokens(text: str, model: str = "gpt-4") -> int:
+    """Count tokens in text using tiktoken.
+
+    Args:
+        text: The text to count tokens for.
+        model: Model name for tokenizer selection.
+
+    Returns:
+        Number of tokens in the text.
+
+    """
+    if not text:
+        return 0
+    enc = _get_encoding(model)
+    return len(enc.encode(text))
+
+
+def chunk_text(
+    text: str,
+    chunk_size: int = 3000,
+    overlap: int = 200,
+    model: str = "gpt-4",
+) -> list[str]:
+    """Split text into overlapping chunks by token count.
+
+    Uses semantic boundaries (paragraphs, sentences) when possible to avoid
+    splitting mid-thought. Falls back to token-based splitting if no good
+    boundaries are found.
+
+    Args:
+        text: The text to chunk.
+        chunk_size: Target token count per chunk.
+        overlap: Token overlap between chunks for context continuity.
+        model: Model name for tokenizer.
+
+    Returns:
+        List of text chunks.
+
+    """
+    if not text:
+        return []
+
+    total_tokens = count_tokens(text, model)
+    if total_tokens <= chunk_size:
+        return [text]
+
+    # Split into paragraphs first
+    paragraphs = re.split(r"\n\s*\n", text)
+    paragraphs = [p.strip() for p in paragraphs if p.strip()]
+
+    if not paragraphs:
+        return [text]
+
+    chunks: list[str] = []
+    current_chunk: list[str] = []
+    current_tokens = 0
+
+    for para in paragraphs:
+        para_tokens = count_tokens(para, model)
+
+        # If single paragraph exceeds chunk size, split it further
+        if para_tokens > chunk_size:
+            # Flush current chunk if any
+            if current_chunk:
+                chunks.append("\n\n".join(current_chunk))
+                current_chunk = []
+                current_tokens = 0
+
+            # Split large paragraph by sentences
+            sentences = _split_sentences(para)
+            for sentence in sentences:
+                sent_tokens = count_tokens(sentence, model)
+                if current_tokens + sent_tokens > chunk_size and current_chunk:
+                    chunks.append(" ".join(current_chunk))
+                    # Keep overlap from end of previous chunk
+                    overlap_text = _get_overlap_text(current_chunk, overlap, model)
+                    current_chunk = [overlap_text] if overlap_text else []
+                    current_tokens = count_tokens(overlap_text, model) if overlap_text else 0
+                current_chunk.append(sentence)
+                current_tokens += sent_tokens
+        elif current_tokens + para_tokens > chunk_size:
+            # Flush current chunk and start new one
+            chunks.append("\n\n".join(current_chunk))
+            # Keep overlap from end of previous chunk
+            overlap_text = _get_overlap_text(current_chunk, overlap, model)
+            current_chunk = [overlap_text, para] if overlap_text else [para]
+            current_tokens = (
+                count_tokens(overlap_text, model) + para_tokens if overlap_text else para_tokens
+            )
+        else:
+            current_chunk.append(para)
+            current_tokens += para_tokens
+
+    # Don't forget the last chunk
+    if current_chunk:
+        chunks.append("\n\n".join(current_chunk))
+
+    return chunks
+
+
+def _split_sentences(text: str) -> list[str]:
+    """Split text into sentences, preserving common abbreviations."""
+    # Simple sentence splitting that handles common cases
+    # Matches period/question/exclamation followed by space and capital letter
+    sentences = re.split(r"(?<=[.!?])\s+(?=[A-Z])", text)
+    return [s.strip() for s in sentences if s.strip()]
+
+
+def _get_overlap_text(chunks: list[str], target_tokens: int, model: str) -> str:
+    """Extract overlap text from end of chunk list.
+
+    Takes text from the end of the chunk list until reaching target_tokens.
+    """
+    if not chunks or target_tokens <= 0:
+        return ""
+
+    # Work backwards through chunks
+    overlap_parts: list[str] = []
+    tokens_collected = 0
+
+    for chunk in reversed(chunks):
+        chunk_tokens = count_tokens(chunk, model)
+        if tokens_collected + chunk_tokens <= target_tokens:
+            overlap_parts.insert(0, chunk)
+            tokens_collected += chunk_tokens
+        else:
+            # Take partial chunk if needed
+            words = chunk.split()
+            partial: list[str] = []
+            for word in reversed(words):
+                word_tokens = count_tokens(word, model)
+                if tokens_collected + word_tokens <= target_tokens:
+                    partial.insert(0, word)
+                    tokens_collected += word_tokens
+                else:
+                    break
+            if partial:
+                overlap_parts.insert(0, " ".join(partial))
+            break
+
+    return " ".join(overlap_parts)
+
+
+def middle_truncate(
+    text: str,
+    budget_chars: int,
+    head_frac: float = 0.3,
+    tail_frac: float = 0.3,
+) -> tuple[str, int]:
+    """Middle-truncate text to fit within a character budget.
+
+    Keeps the first head_frac and last tail_frac portions, dropping the middle.
+    This preserves context from both the beginning (often contains setup) and
+    end (often contains conclusions/recent events).
+
+    Inspired by Letta's `middle_truncate_text` function.
+
+    Args:
+        text: Text to truncate.
+        budget_chars: Maximum character count for output.
+        head_frac: Fraction of budget for the head portion.
+        tail_frac: Fraction of budget for the tail portion.
+
+    Returns:
+        Tuple of (truncated_text, dropped_char_count).
+
+    """
+    if budget_chars <= 0 or len(text) <= budget_chars:
+        return text, 0
+
+    head_len = max(0, int(budget_chars * head_frac))
+    tail_len = max(0, int(budget_chars * tail_frac))
+
+    # Ensure head + tail doesn't exceed budget
+    if head_len + tail_len > budget_chars:
+        tail_len = max(0, budget_chars - head_len)
+
+    head = text[:head_len]
+    tail = text[-tail_len:] if tail_len > 0 else ""
+    dropped = max(0, len(text) - (len(head) + len(tail)))
+
+    marker = f"\n[...{dropped} characters truncated...]\n"
+
+    # If marker would overflow budget, shrink tail
+    available_for_marker = budget_chars - (len(head) + len(tail))
+    if available_for_marker < len(marker):
+        over = len(marker) - available_for_marker
+        tail = tail[:-over] if over < len(tail) else ""
+
+    return head + marker + tail, dropped
+
+
+def estimate_summary_tokens(input_tokens: int, level: int) -> int:
+    """Estimate target summary tokens based on input size and level.
+
+    Compression ratios based on Mem0 research:
+    - BRIEF: ~20% compression (80% reduction)
+    - STANDARD: ~12% compression (88% reduction)
+    - DETAILED: ~7% compression (93% reduction)
+    - HIERARCHICAL: Capped with diminishing returns
+
+    Args:
+        input_tokens: Number of tokens in the input.
+        level: Summary level (1-4).
+
+    Returns:
+        Target number of tokens for the summary.
+
+    """
+    if level == SummaryLevel.NONE:
+        return 0
+    if level == SummaryLevel.BRIEF:
+        return min(50, max(20, input_tokens // 5))
+    if level == SummaryLevel.STANDARD:
+        return min(200, max(50, input_tokens // 8))
+    if level == SummaryLevel.DETAILED:
+        return min(500, max(100, input_tokens // 15))
+    # HIERARCHICAL
+    # Base of 1000 tokens plus diminishing returns for additional content
+    base = 1000
+    additional = max(0, (input_tokens - 15000) // 100)
+    return min(2000, base + additional)
+
+
+def tokens_to_words(tokens: int) -> int:
+    """Convert token count to approximate word count.
+
+    Rough approximation: 1 token ≈ 0.75 words for English text.
+    """
+    return int(tokens * 0.75)
diff --git a/pyproject.toml b/pyproject.toml
index dcc98fed8..44fcc0403 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,7 +50,11 @@ vectordb = [
     "watchfiles>=0.21.0",
 ]
 rag = ["agent-cli[vectordb]", "markitdown[docx,pdf,pptx]>=0.1.3"]
-memory = ["agent-cli[vectordb]", "pyyaml>=6.0.0"]
+memory = [
+    "agent-cli[vectordb]",
+    "pyyaml>=6.0.0",
+    "tiktoken>=0.5.0",  # For token counting in adaptive summarization
+]
 
 # Feature extras
 vad = ["onnxruntime>=1.16.0"]
@@ -82,6 +86,7 @@ test = [
     "pytest-cov>=4.0.0",
     "pytest-timeout",
     "pytest-mock",
+    "tiktoken>=0.5.0",  # For summarizer tests
 ]
 dev = [
     "agent-cli[test]",
@@ -113,6 +118,7 @@ dev = [
     "pre-commit-uv>=4.1.4",
     "zensical",
     "markdown-gfm-admonition",
+    "tiktoken>=0.5.0",  # For summarizer tests
 ]
 
 [project.scripts]
diff --git a/tests/memory/test_store.py b/tests/memory/test_store.py
index 98334e459..3edd0eeb9 100644
--- a/tests/memory/test_store.py
+++ b/tests/memory/test_store.py
@@ -148,3 +148,229 @@ def test_upsert_and_delete_entries_delegate() -> None:
 
     _store.delete_entries(fake, ["x"])
     assert fake.deleted == [["x"]]
+
+
+# --- Hierarchical Summary Tests ---
+
+
+class _MockSummaryResult:
+    """Mock SummaryResult for testing without importing the full summarizer module."""
+
+    def __init__(self, entries: list[dict[str, Any]]) -> None:
+        self._entries = entries
+
+    def to_storage_metadata(self, _conversation_id: str) -> list[dict[str, Any]]:
+        # Just return the pre-configured entries (ignores conversation_id)
+        return self._entries
+
+
+def test_upsert_hierarchical_summary_simple() -> None:
+    """Test upserting a simple (non-hierarchical) summary."""
+    fake = _FakeCollection()
+    entries = [
+        {
+            "id": "conv-123:summary:L3:final",
+            "content": "A standard paragraph summary.",
+            "metadata": {
+                "conversation_id": "conv-123",
+                "role": "summary",
+                "level": 3,
+                "is_final": True,
+                "summary_level": "STANDARD",
+                "input_tokens": 1000,
+                "output_tokens": 50,
+                "compression_ratio": 0.05,
+                "created_at": "2024-01-01T00:00:00",
+            },
+        },
+    ]
+    mock_result = _MockSummaryResult(entries)
+
+    ids = _store.upsert_hierarchical_summary(fake, "conv-123", mock_result)
+
+    assert ids == ["conv-123:summary:L3:final"]
+    assert len(fake.upserts) == 1
+    upserted_ids, upserted_docs, upserted_metas = fake.upserts[0]
+    assert upserted_ids == ["conv-123:summary:L3:final"]
+    assert upserted_docs == ["A standard paragraph summary."]
+    assert upserted_metas[0]["level"] == 3
+    assert upserted_metas[0]["is_final"] is True
+
+
+def test_upsert_hierarchical_summary_with_chunks() -> None:
+    """Test upserting a hierarchical summary with L1 and L3 entries."""
+    fake = _FakeCollection()
+    entries = [
+        {
+            "id": "conv-456:summary:L1:0",
+            "content": "Chunk 0 summary",
+            "metadata": {
+                "conversation_id": "conv-456",
+                "role": "summary",
+                "level": 1,
+                "chunk_index": 0,
+                "parent_group": 0,
+                "created_at": "2024-01-01T00:00:00",
+            },
+        },
+        {
+            "id": "conv-456:summary:L1:1",
+            "content": "Chunk 1 summary",
+            "metadata": {
+                "conversation_id": "conv-456",
+                "role": "summary",
+                "level": 1,
+                "chunk_index": 1,
+                "parent_group": 0,
+                "created_at": "2024-01-01T00:00:00",
+            },
+        },
+        {
+            "id": "conv-456:summary:L3:final",
+            "content": "Final synthesis",
+            "metadata": {
+                "conversation_id": "conv-456",
+                "role": "summary",
+                "level": 3,
+                "is_final": True,
+                "input_tokens": 5000,
+                "output_tokens": 100,
+                "compression_ratio": 0.02,
+                "created_at": "2024-01-01T00:00:00",
+            },
+        },
+    ]
+    mock_result = _MockSummaryResult(entries)
+
+    ids = _store.upsert_hierarchical_summary(fake, "conv-456", mock_result)
+
+    assert len(ids) == 3
+    assert "conv-456:summary:L1:0" in ids
+    assert "conv-456:summary:L1:1" in ids
+    assert "conv-456:summary:L3:final" in ids
+
+
+def test_upsert_hierarchical_summary_empty() -> None:
+    """Test upserting when there are no entries (e.g., NONE level)."""
+    fake = _FakeCollection()
+    mock_result = _MockSummaryResult([])
+
+    ids = _store.upsert_hierarchical_summary(fake, "conv-789", mock_result)
+
+    assert ids == []
+    assert len(fake.upserts) == 0
+
+
+def test_get_summary_at_level() -> None:
+    """Test retrieving summaries at a specific level."""
+    fake = _FakeCollection(
+        get_result={
+            "documents": ["Chunk 0", "Chunk 1"],
+            "metadatas": [
+                {
+                    "conversation_id": "c1",
+                    "role": "summary",
+                    "level": 1,
+                    "chunk_index": 0,
+                    "created_at": "now",
+                },
+                {
+                    "conversation_id": "c1",
+                    "role": "summary",
+                    "level": 1,
+                    "chunk_index": 1,
+                    "created_at": "now",
+                },
+            ],
+            "ids": ["c1:summary:L1:0", "c1:summary:L1:1"],
+        },
+    )
+
+    records = _store.get_summary_at_level(fake, "c1", level=1)
+
+    assert len(records) == 2
+    assert records[0].metadata.level == 1
+    assert records[0].metadata.chunk_index == 0
+    assert records[1].metadata.chunk_index == 1
+
+
+def test_get_final_summary_returns_final() -> None:
+    """Test getting the L3 final summary."""
+    fake = _FakeCollection(
+        get_result={
+            "documents": ["The final summary"],
+            "metadatas": [
+                {
+                    "conversation_id": "c1",
+                    "role": "summary",
+                    "level": 3,
+                    "is_final": True,
+                    "created_at": "now",
+                },
+            ],
+            "ids": ["c1:summary:L3:final"],
+        },
+    )
+
+    result = _store.get_final_summary(fake, "c1")
+
+    assert result is not None
+    assert result.content == "The final summary"
+    assert result.metadata.is_final is True
+
+
+def test_get_final_summary_returns_none_when_missing() -> None:
+    """Test that get_final_summary returns None when no summary exists."""
+    fake = _FakeCollection(get_result={"documents": [], "metadatas": [], "ids": []})
+
+    result = _store.get_final_summary(fake, "c1")
+
+    assert result is None
+
+
+def test_delete_summaries_all_levels() -> None:
+    """Test deleting all summary levels for a conversation."""
+    fake = _FakeCollection(
+        get_result={
+            "documents": ["L1", "L3"],
+            "metadatas": [
+                {"conversation_id": "c1", "role": "summary", "level": 1, "created_at": "now"},
+                {"conversation_id": "c1", "role": "summary", "level": 3, "created_at": "now"},
+            ],
+            "ids": ["c1:summary:L1:0", "c1:summary:L3:final"],
+        },
+    )
+
+    deleted_count = _store.delete_summaries(fake, "c1")
+
+    assert deleted_count == 2
+    assert len(fake.deleted) == 1
+    assert set(fake.deleted[0]) == {"c1:summary:L1:0", "c1:summary:L3:final"}
+
+
+def test_delete_summaries_specific_levels() -> None:
+    """Test deleting only specific summary levels."""
+    fake = _FakeCollection(
+        get_result={
+            "documents": ["L1 chunk"],
+            "metadatas": [
+                {"conversation_id": "c1", "role": "summary", "level": 1, "created_at": "now"},
+            ],
+            "ids": ["c1:summary:L1:0"],
+        },
+    )
+
+    deleted_count = _store.delete_summaries(fake, "c1", levels=[1])
+
+    assert deleted_count == 1
+    assert fake.deleted[0] == ["c1:summary:L1:0"]
+
+
+def test_delete_summaries_no_entries() -> None:
+    """Test deleting when no summaries exist."""
+    fake = _FakeCollection(get_result={"documents": [], "metadatas": [], "ids": []})
+
+    deleted_count = _store.delete_summaries(fake, "c1")
+
+    assert deleted_count == 0
+    assert len(fake.deleted) == 0
diff --git a/tests/summarizer/__init__.py b/tests/summarizer/__init__.py
new file mode 100644
index 000000000..d6801b313
--- /dev/null
+++ b/tests/summarizer/__init__.py
@@ -0,0 +1 @@
+"""Tests for the adaptive summarizer module."""
diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py
new file mode 100644
index 000000000..1f010999e
--- /dev/null
+++ b/tests/summarizer/test_adaptive.py
@@ -0,0 +1,434 @@
+"""Unit tests for AdaptiveSummarizer."""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from agent_cli.summarizer.adaptive import (
+    LEVEL_THRESHOLDS,
+    AdaptiveSummarizer,
+    SummaryOutput,
+)
+from agent_cli.summarizer.models import SummaryLevel, SummaryResult
+
+
+class TestAdaptiveSummarizerInit:
+    """Tests for AdaptiveSummarizer initialization."""
+
+    def test_basic_init(self) -> None:
+        """Test basic initialization with required parameters."""
+        summarizer = AdaptiveSummarizer(
+            openai_base_url="http://localhost:8000/v1",
+            model="llama3.1:8b",
+        )
+        assert summarizer.openai_base_url == "http://localhost:8000/v1"
+        assert summarizer.model == "llama3.1:8b"
+        assert summarizer.api_key == "not-needed"
+
+    def test_init_with_api_key(self) -> None:
+        """Test initialization with custom API key."""
+        summarizer = AdaptiveSummarizer(
+            openai_base_url="http://localhost:8000/v1",
+            model="gpt-4",
+            api_key="sk-test-key",
+        )
+        assert summarizer.api_key == "sk-test-key"
+
+    def test_init_with_custom_settings(self) -> None:
+        """Test initialization with custom chunk settings."""
+        summarizer = AdaptiveSummarizer(
+            openai_base_url="http://localhost:8000/v1",
+            model="gpt-4",
+            chunk_size=5000,
+            chunk_overlap=300,
+            max_concurrent_chunks=10,
+            timeout=120.0,
+        )
+        assert summarizer.chunk_size == 5000
+        assert summarizer.chunk_overlap == 300
+        assert summarizer.max_concurrent_chunks == 10
+        assert summarizer.timeout == 120.0
+
+    def test_trailing_slash_stripped(self) -> None:
+        """Test that trailing slash is stripped from base URL."""
+        summarizer = AdaptiveSummarizer(
+            openai_base_url="http://localhost:8000/v1/",
+            model="gpt-4",
+        )
+        assert summarizer.openai_base_url == "http://localhost:8000/v1"
+
+
+class TestDetermineLevel:
+    """Tests for level determination based on token count."""
+
+    @pytest.fixture
+    def summarizer(self) -> AdaptiveSummarizer:
+        """Create a summarizer instance."""
+        return AdaptiveSummarizer(
+            openai_base_url="http://localhost:8000/v1",
+            model="gpt-4",
+        )
+
+    def test_none_level_threshold(self, summarizer: AdaptiveSummarizer) -> None:
+        """Test NONE level for very short content."""
+        assert summarizer.determine_level(50) == SummaryLevel.NONE
+        assert summarizer.determine_level(99) == SummaryLevel.NONE
+
+    def test_brief_level_threshold(self, summarizer: AdaptiveSummarizer) -> None:
+        """Test BRIEF level for short content."""
+        assert summarizer.determine_level(100) == SummaryLevel.BRIEF
+        assert summarizer.determine_level(300) == SummaryLevel.BRIEF
+        assert summarizer.determine_level(499) == SummaryLevel.BRIEF
+
+    def test_standard_level_threshold(self, summarizer: AdaptiveSummarizer) -> None:
+        """Test STANDARD level for medium content."""
+        assert summarizer.determine_level(500) == SummaryLevel.STANDARD
+        assert summarizer.determine_level(1500) == SummaryLevel.STANDARD
+        assert summarizer.determine_level(2999) == SummaryLevel.STANDARD
+
+    def test_detailed_level_threshold(self, summarizer: AdaptiveSummarizer) -> None:
+        """Test DETAILED level for longer content."""
+        assert summarizer.determine_level(3000) == SummaryLevel.DETAILED
+        assert summarizer.determine_level(8000) == SummaryLevel.DETAILED
+        assert summarizer.determine_level(14999) == SummaryLevel.DETAILED
+
+    def test_hierarchical_level_threshold(self, summarizer: AdaptiveSummarizer) -> None:
+        """Test HIERARCHICAL level for very long content."""
+        assert summarizer.determine_level(15000) == SummaryLevel.HIERARCHICAL
+        assert summarizer.determine_level(50000) == SummaryLevel.HIERARCHICAL
+        assert summarizer.determine_level(100000) == SummaryLevel.HIERARCHICAL
+
+    def test_thresholds_match_constants(self) -> None:
+        """Verify thresholds match the module constants."""
+        assert LEVEL_THRESHOLDS[SummaryLevel.NONE] == 100
+        assert LEVEL_THRESHOLDS[SummaryLevel.BRIEF] == 500
+        assert LEVEL_THRESHOLDS[SummaryLevel.STANDARD] == 3000
+        assert LEVEL_THRESHOLDS[SummaryLevel.DETAILED] == 15000
+
+
+class TestSummarize:
+    """Tests for main summarize method."""
+
+    @pytest.fixture
+    def summarizer(self) -> AdaptiveSummarizer:
+        """Create a summarizer instance."""
+        return AdaptiveSummarizer(
+            openai_base_url="http://localhost:8000/v1",
+            model="gpt-4",
+        )
+
+    @pytest.mark.asyncio
+    async def test_empty_content_returns_none_level(self, summarizer: AdaptiveSummarizer) -> None:
+        """Test that empty content returns NONE level result."""
+        result = await summarizer.summarize("")
+        assert result.level == SummaryLevel.NONE
+        assert result.summary is None
+        assert result.input_tokens == 0
+        assert result.output_tokens == 0
+
+    @pytest.mark.asyncio
+    async def test_whitespace_only_returns_none_level(self, summarizer: AdaptiveSummarizer) -> None:
+        """Test that whitespace-only content returns NONE level result."""
+        result = await summarizer.summarize("   \n\n   ")
+        assert result.level == SummaryLevel.NONE
+        assert result.summary is None
+
+    @pytest.mark.asyncio
+    async def test_very_short_content_no_summary(self, summarizer: AdaptiveSummarizer) -> None:
+        """Test that very short content gets NONE level (no summary)."""
+        # Less than 100 tokens
+        result = await summarizer.summarize("Hello world")
+        assert result.level == SummaryLevel.NONE
+        assert result.summary is None
+
+    @pytest.mark.asyncio
+    @patch.object(AdaptiveSummarizer, "_brief_summary")
+    async def test_brief_level_calls_brief_summary(
+        self,
+        mock_brief: AsyncMock,
+        summarizer: AdaptiveSummarizer,
+    ) -> None:
+        """Test that BRIEF level content calls _brief_summary."""
+        mock_brief.return_value = "Brief summary."
+
+        # Create content that's ~100-500 tokens
+        content = "This is a test sentence. " * 30  # ~150 tokens
+
+        result = await summarizer.summarize(content)
+
+        mock_brief.assert_called_once_with(content)
+        assert result.level == SummaryLevel.BRIEF
+        assert result.summary == "Brief summary."
+
+    @pytest.mark.asyncio
+    @patch.object(AdaptiveSummarizer, "_standard_summary")
+    async def test_standard_level_calls_standard_summary(
+        self,
+        mock_standard: AsyncMock,
+        summarizer: AdaptiveSummarizer,
+    ) -> None:
+        """Test that STANDARD level content calls _standard_summary."""
+        mock_standard.return_value = "Standard summary paragraph."
+
+        # Create content that's ~500-3000 tokens
+        content = "This is a test sentence with more words. " * 100  # ~800 tokens
+
+        result = await summarizer.summarize(content, content_type="general")
+
+        mock_standard.assert_called_once_with(content, None, "general")
+        assert result.level == SummaryLevel.STANDARD
+        assert result.summary == "Standard summary paragraph."
+
+    @pytest.mark.asyncio
+    @patch.object(AdaptiveSummarizer, "_standard_summary")
+    async def test_prior_summary_passed_to_standard(
+        self,
+        mock_standard: AsyncMock,
+        summarizer: AdaptiveSummarizer,
+    ) -> None:
+        """Test that prior_summary is passed to _standard_summary."""
+        mock_standard.return_value = "Updated summary."
+
+        content = "This is a test sentence with more words. " * 100
+        prior = "Previous context summary."
+
+        await summarizer.summarize(content, prior_summary=prior)
+
+        mock_standard.assert_called_once_with(content, prior, "general")
+
+    @pytest.mark.asyncio
+    @patch.object(AdaptiveSummarizer, "_detailed_summary")
+    async def test_detailed_level_calls_detailed_summary(
+        self,
+        mock_detailed: AsyncMock,
+        summarizer: AdaptiveSummarizer,
+    ) -> None:
+        """Test that DETAILED level content calls _detailed_summary."""
+        mock_result = SummaryResult(
+            level=SummaryLevel.DETAILED,
+            summary="Detailed summary.",
+            hierarchical=None,
+            input_tokens=5000,
+            output_tokens=100,
+            compression_ratio=0.02,
+        )
+        mock_detailed.return_value = mock_result
+
+        # Create content that's ~3000-15000 tokens
+        content = "Word " * 5000  # ~5000 tokens
+
+        result = await summarizer.summarize(content)
+
+        assert mock_detailed.called
+        assert result.level == SummaryLevel.DETAILED
+
+    @pytest.mark.asyncio
+    @patch.object(AdaptiveSummarizer, "_hierarchical_summary")
+    async def test_hierarchical_level_calls_hierarchical_summary(
+        self,
+        mock_hierarchical: AsyncMock,
+        summarizer: AdaptiveSummarizer,
+    ) -> None:
+        """Test that HIERARCHICAL level content calls _hierarchical_summary."""
+        mock_result = SummaryResult(
+            level=SummaryLevel.HIERARCHICAL,
+            summary="Hierarchical summary.",
+            hierarchical=None,
+            input_tokens=20000,
+            output_tokens=500,
+            compression_ratio=0.025,
+        )
+        mock_hierarchical.return_value = mock_result
+
+        # Create content that's > 15000 tokens
+        content = "Word " * 20000
+
+        result = await summarizer.summarize(content)
+
+        assert mock_hierarchical.called
+        assert result.level == SummaryLevel.HIERARCHICAL
+
+
+class TestUpdateRollingSummary:
+    """Tests for rolling summary updates."""
+
+    @pytest.fixture
+    def summarizer(self) -> AdaptiveSummarizer:
+        """Create a summarizer instance."""
+        return AdaptiveSummarizer(
+            openai_base_url="http://localhost:8000/v1",
+            model="gpt-4",
+        )
+
+    @pytest.mark.asyncio
+    async def test_empty_facts_returns_prior(self, summarizer: AdaptiveSummarizer) -> None:
+        """Test that empty facts list returns prior summary."""
+        result = await summarizer.update_rolling_summary(
+            prior_summary="Existing summary",
+            new_facts=[],
+        )
+        assert result == "Existing summary"
+
+    @pytest.mark.asyncio
+    async def test_empty_facts_no_prior_returns_empty(self, summarizer: AdaptiveSummarizer) -> None:
+        """Test that empty facts with no prior returns empty string."""
+        result = await summarizer.update_rolling_summary(
+            prior_summary=None,
+            new_facts=[],
+        )
+        assert result == ""
+
+    @pytest.mark.asyncio
+    @patch.object(AdaptiveSummarizer, "_generate_summary")
+    async def test_new_facts_calls_generate(
+        self,
+        mock_generate: AsyncMock,
+        summarizer: AdaptiveSummarizer,
+    ) -> None:
+        """Test that new facts trigger summary generation."""
+        mock_generate.return_value = "Updated summary with new facts."
+
+        result = await summarizer.update_rolling_summary(
+            prior_summary="Old summary",
+            new_facts=["User likes coffee", "User lives in Amsterdam"],
+        )
+
+        mock_generate.assert_called_once()
+        assert result == "Updated summary with new facts."
+
+    @pytest.mark.asyncio
+    @patch.object(AdaptiveSummarizer, "_generate_summary")
+    async def test_facts_formatted_as_list(
+        self,
+        mock_generate: AsyncMock,
+        summarizer: AdaptiveSummarizer,
+    ) -> None:
+        """Test that facts are formatted as bullet list in prompt."""
+        mock_generate.return_value = "Summary"
+
+        await summarizer.update_rolling_summary(
+            prior_summary="Prior",
+            new_facts=["Fact one", "Fact two"],
+        )
+
+        # Check the prompt contains formatted facts
+        call_args = mock_generate.call_args
+        prompt = call_args[0][0]
+        assert "- Fact one" in prompt
+        assert "- Fact two" in prompt
+
+
+class TestGenerateSummary:
+    """Tests for _generate_summary method."""
+
+    @pytest.fixture
+    def summarizer(self) -> AdaptiveSummarizer:
+        """Create a summarizer instance."""
+        return AdaptiveSummarizer(
+            openai_base_url="http://localhost:8000/v1",
+            model="gpt-4",
+        )
+
+    @pytest.mark.asyncio
+    async def test_generate_summary_with_pydantic_ai(
+        self,
+        summarizer: AdaptiveSummarizer,
+    ) -> None:
+        """Test summary generation using PydanticAI agent."""
+        # Mock the entire agent creation and run
+        mock_result = MagicMock()
+        mock_result.output = SummaryOutput(summary="Generated summary.")
+
+        with patch("agent_cli.summarizer.adaptive.Agent") as mock_agent_class:
+            mock_agent = MagicMock()
+            mock_agent.run = AsyncMock(return_value=mock_result)
+            mock_agent_class.return_value = mock_agent
+
+            result = await summarizer._generate_summary("Test prompt", max_tokens=100)
+
+            assert result == "Generated summary."
+            mock_agent.run.assert_called_once_with("Test prompt")
+
+    @pytest.mark.asyncio
+    @patch.object(AdaptiveSummarizer, "_raw_generate")
+    async def test_fallback_to_raw_generate_on_error(
+        self,
+        mock_raw: AsyncMock,
+        summarizer: AdaptiveSummarizer,
+    ) -> None:
+        """Test fallback to raw HTTP on PydanticAI error."""
+        mock_raw.return_value = "Fallback summary"
+
+        with patch("agent_cli.summarizer.adaptive.Agent") as mock_agent_class:
+            mock_agent = MagicMock()
+            mock_agent.run = AsyncMock(side_effect=Exception("API error"))
+            mock_agent_class.return_value = mock_agent
+
+            result = await summarizer._generate_summary("Test prompt", max_tokens=100)
+
+            mock_raw.assert_called_once_with("Test prompt", 100)
+            assert result == "Fallback summary"
+
+
+class TestRawGenerate:
+    """Tests for _raw_generate fallback method."""
+
+    @pytest.fixture
+    def summarizer(self) -> AdaptiveSummarizer:
+        """Create a summarizer instance."""
+        return AdaptiveSummarizer(
+            openai_base_url="http://localhost:8000/v1",
+            model="gpt-4",
+        )
+
+    @pytest.mark.asyncio
+    async def test_raw_generate_success(self, summarizer: AdaptiveSummarizer) -> None:
+        """Test successful raw HTTP generation."""
+        mock_response = MagicMock()
+        mock_response.json.return_value = {
+            "choices": [{"message": {"content": "Raw generated summary"}}],
+        }
+
+        with patch("httpx.AsyncClient") as mock_client_class:
+            mock_client = MagicMock()
+            mock_client.post = AsyncMock(return_value=mock_response)
+            mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+            mock_client.__aexit__ = AsyncMock(return_value=None)
+            mock_client_class.return_value = mock_client
+
+            result = await summarizer._raw_generate("Test prompt", max_tokens=100)
+
+            assert result == "Raw generated summary"
+
+    @pytest.mark.asyncio
+    async def test_raw_generate_empty_choices(self, summarizer: AdaptiveSummarizer) -> None:
+        """Test raw generate with empty choices returns empty string."""
+        mock_response = MagicMock()
+        mock_response.json.return_value = {"choices": []}
+
+        with patch("httpx.AsyncClient") as mock_client_class:
+            mock_client = MagicMock()
+            mock_client.post = AsyncMock(return_value=mock_response)
+            mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+            mock_client.__aexit__ = AsyncMock(return_value=None)
+            mock_client_class.return_value = mock_client
+
+            result = await summarizer._raw_generate("Test prompt", max_tokens=100)
+
+            assert result == ""
+
+
+class TestSummaryOutput:
+    """Tests for SummaryOutput pydantic model."""
+
+    def test_basic_creation(self) -> None:
+        """Test creating a SummaryOutput."""
+        output = SummaryOutput(summary="Test summary text")
+        assert output.summary == "Test summary text"
+
+    def test_whitespace_preserved(self) -> None:
+        """Test that whitespace in summary is preserved."""
+        output = SummaryOutput(summary="  Summary with spaces  ")
+        assert output.summary == "  Summary with spaces  "
diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py
new file mode 100644
index 000000000..381f9f5b6
--- /dev/null
+++ b/tests/summarizer/test_integration.py
@@ -0,0 +1,466 @@
+"""Integration tests for the summarizer with memory system."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+from unittest.mock import patch
+
+import pytest
+
+from agent_cli.memory._ingest import summarize_content
+from agent_cli.memory._persistence import persist_hierarchical_summary
+from agent_cli.memory._store import (
+    get_final_summary,
+    get_summary_at_level,
+    upsert_hierarchical_summary,
+)
+from agent_cli.summarizer import AdaptiveSummarizer, SummaryLevel, SummaryResult
+from agent_cli.summarizer.models import ChunkSummary, HierarchicalSummary
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+class _FakeCollection:
+    """Minimal Chroma-like collection for testing."""
+
+    def __init__(self) -> None:
+        self._store: dict[str, tuple[str, dict[str, Any]]] = {}
+
+    def upsert(
+        self,
+        *,
+        ids: list[str],
+        documents: list[str],
+        metadatas: list[dict[str, Any]],
+    ) -> None:
+        for doc_id, doc, meta in zip(ids, documents, metadatas, strict=False):
+            self._store[doc_id] = (doc, meta)
+
+    def get(
+        self,
+        *,
+        where: dict[str, Any] | None = None,
+        include: list[str] | None = None,  # noqa: ARG002
+    ) -> dict[str, Any]:
+        if where is None:
+            return {"documents": [], "metadatas": [], "ids": []}
+
+        results: list[tuple[str, tuple[str, dict[str, Any]]]] = []
+        for doc_id, (doc, meta) in self._store.items():
+            # Check all conditions in $and clause
+            conditions = where.get("$and", [where])
+            match = True
+            for clause in conditions:
+                for k, v in clause.items():
+                    if k == "$and":
+                        continue
+                    if isinstance(v, dict):
+                        if "$in" in v and meta.get(k) not in v["$in"]:
+                            match = False
+                        if "$ne" in v and meta.get(k) == v["$ne"]:
+                            match = False
+                    elif meta.get(k) != v:
+                        match = False
+            if match:
+                results.append((doc_id, (doc, meta)))
+
+        docs = [doc for _, (doc, _) in results]
+        metas = [meta for _, (_, meta) in results]
+        ids = [doc_id for doc_id, _ in results]
+        return {"documents": docs, "metadatas": metas, "ids": ids}
+
+    def delete(
+        self,
+        ids: list[str] | None = None,
+        where: dict[str, Any] | None = None,  # noqa: ARG002
+    ) -> None:
+        if ids:
+            for doc_id in ids:
+                self._store.pop(doc_id, None)
+
+
+@pytest.fixture
+def fake_collection() -> _FakeCollection:
+    """Create a fake ChromaDB collection."""
+    return _FakeCollection()
+
+
+@pytest.fixture
+def memory_root(tmp_path: Path) -> Path:
+    """Create a temporary memory root directory."""
+    return tmp_path / "memory"
+
+
+class TestSummaryResultStorageMetadata:
+    """Test SummaryResult.to_storage_metadata for various levels."""
+
+    def test_standard_summary_produces_single_entry(self) -> None:
+        """Test that STANDARD level produces a single L3 entry."""
+        result = SummaryResult(
+            level=SummaryLevel.STANDARD,
+            summary="A paragraph summary of the content.",
+            hierarchical=None,
+            input_tokens=1000,
+            output_tokens=50,
+            compression_ratio=0.05,
+        )
+
+        entries = result.to_storage_metadata("conv-123")
+
+        assert len(entries) == 1
+        entry = entries[0]
+        assert entry["id"] == "conv-123:summary:L3:final"
+        assert entry["content"] == "A paragraph summary of the content."
+        assert entry["metadata"]["level"] == 3
+        assert entry["metadata"]["is_final"] is True
+        assert entry["metadata"]["summary_level"] == "STANDARD"
+
+    def test_hierarchical_summary_produces_multiple_entries(self) -> None:
+        """Test that HIERARCHICAL level produces L1, L2, L3 entries."""
+        l1_summaries = [
+            ChunkSummary(
+                chunk_index=0,
+                content="Chunk 0",
+                token_count=10,
+                source_tokens=100,
+                parent_group=0,
+            ),
+            ChunkSummary(
+                chunk_index=1,
+                content="Chunk 1",
+                token_count=10,
+                source_tokens=100,
+                parent_group=0,
+            ),
+            ChunkSummary(
+                chunk_index=2,
+                content="Chunk 2",
+                token_count=10,
+                source_tokens=100,
+                parent_group=0,
+            ),
+        ]
+        hierarchical = HierarchicalSummary(
+            l1_summaries=l1_summaries,
+            l2_summaries=["Group 0 summary"],
+            l3_summary="Final hierarchical synthesis.",
+        )
+        result = SummaryResult(
+            level=SummaryLevel.HIERARCHICAL,
+            summary="Final hierarchical synthesis.",
+            hierarchical=hierarchical,
+            input_tokens=20000,
+            output_tokens=200,
+            compression_ratio=0.01,
+        )
+
+        entries = result.to_storage_metadata("conv-456")
+
+        # Should have 3 L1 + 1 L2 + 1 L3 = 5 entries
+        assert len(entries) == 5
+
+        # Check L1 entries
+        l1_entries = [e for e in entries if e["metadata"]["level"] == 1]
+        assert len(l1_entries) == 3
+
+        # Check L2 entries
+        l2_entries = [e for e in entries if e["metadata"]["level"] == 2]
+        assert len(l2_entries) == 1
+
+        # Check L3 entry
+        l3_entries = [e for e in entries if e["metadata"]["level"] == 3]
+        assert len(l3_entries) == 1
+
+
+class TestHierarchicalSummaryStorage:
+    """Test storing hierarchical summaries to ChromaDB."""
+
+    def test_store_simple_summary(self, fake_collection: _FakeCollection) -> None:
+        """Test storing a simple (non-hierarchical) summary."""
+        result = SummaryResult(
+            level=SummaryLevel.STANDARD,
+            summary="A standard summary.",
+            hierarchical=None,
+            input_tokens=1000,
+            output_tokens=50,
+            compression_ratio=0.05,
+        )
+
+        ids = upsert_hierarchical_summary(fake_collection, "conv-123", result)
+
+        assert len(ids) == 1
+        assert "conv-123:summary:L3:final" in ids
+
+        # Verify retrieval
+        stored = get_final_summary(fake_collection, "conv-123")
+        assert stored is not None
+        assert stored.content == "A standard summary."
+
+    def test_store_hierarchical_summary(self, fake_collection: _FakeCollection) -> None:
+        """Test storing a hierarchical summary with all levels."""
+        l1_summaries = [
+            ChunkSummary(
+                chunk_index=0,
+                content="Chunk 0 summary",
+                token_count=10,
+                source_tokens=100,
+            ),
+            ChunkSummary(
+                chunk_index=1,
+                content="Chunk 1 summary",
+                token_count=10,
+                source_tokens=100,
+            ),
+        ]
+        hierarchical = HierarchicalSummary(
+            l1_summaries=l1_summaries,
+            l2_summaries=[],
+            l3_summary="Final summary",
+        )
+        result = SummaryResult(
+            level=SummaryLevel.DETAILED,
+            summary="Final summary",
+            hierarchical=hierarchical,
+            input_tokens=5000,
+            output_tokens=100,
+            compression_ratio=0.02,
+        )
+
+        ids = upsert_hierarchical_summary(fake_collection, "conv-789", result)
+
+        assert len(ids) == 3  # 2 L1 + 1 L3
+
+        # Verify L1 retrieval
+        l1_stored = get_summary_at_level(fake_collection, "conv-789", level=1)
+        assert len(l1_stored) == 2
+
+        # Verify L3 retrieval
+        final = get_final_summary(fake_collection, "conv-789")
+        assert final is not None
+        assert final.content == "Final summary"
+
+
+class TestFilePersistence:
+    """Test hierarchical summary file persistence."""
+
+    def test_persist_hierarchical_creates_files(
+        self,
+        fake_collection: _FakeCollection,
+        memory_root: Path,
+    ) -> None:
+        """Test that persist_hierarchical_summary creates correct file structure."""
+        l1_summaries = [
+            ChunkSummary(
+                chunk_index=0,
+                content="Chunk 0 content",
+                token_count=10,
+                source_tokens=100,
+                parent_group=0,
+            ),
+            ChunkSummary(
+                chunk_index=1,
+                content="Chunk 1 content",
+                token_count=10,
+                source_tokens=100,
+                parent_group=0,
+            ),
+        ]
+        hierarchical = HierarchicalSummary(
+            l1_summaries=l1_summaries,
+            l2_summaries=["Group 0 summary"],
+            l3_summary="Final synthesis",
+        )
+        result = SummaryResult(
+            level=SummaryLevel.HIERARCHICAL,
+            summary="Final synthesis",
+            hierarchical=hierarchical,
+            input_tokens=20000,
+            output_tokens=200,
+            compression_ratio=0.01,
+        )
+
+        ids = persist_hierarchical_summary(
+            fake_collection,
+            memory_root=memory_root,
+            conversation_id="test-conv",
+            summary_result=result,
+        )
+
+        assert len(ids) == 4  # 2 L1 + 1 L2 + 1 L3
+
+        # Check file structure (note: _slugify converts - to - not _)
+        entries_dir = memory_root / "entries" / "test-conv"
+        l1_dir = entries_dir / "summaries" / "L1"
+        l2_dir = entries_dir / "summaries" / "L2"
+        l3_dir = entries_dir / "summaries" / "L3"
+
+        assert l1_dir.exists()
+        assert l2_dir.exists()
+        assert l3_dir.exists()
+
+        # Check L1 files
+        l1_files = list(l1_dir.glob("*.md"))
+        assert len(l1_files) == 2
+
+        # Check L2 files
+        l2_files = list(l2_dir.glob("*.md"))
+        assert len(l2_files) == 1
+
+        # Check L3 files
+        l3_files = list(l3_dir.glob("*.md"))
+        assert len(l3_files) == 1
+        assert (l3_dir / "final.md").exists()
+
+    def test_persist_simple_summary_creates_l3_file(
+        self,
+        fake_collection: _FakeCollection,
+        memory_root: Path,
+    ) -> None:
+        """Test that a simple summary creates just L3/final.md."""
+        result = SummaryResult(
+            level=SummaryLevel.STANDARD,
+            summary="A standard paragraph summary.",
+            hierarchical=None,
+            input_tokens=1000,
+            output_tokens=50,
+            compression_ratio=0.05,
+        )
+
+        ids = persist_hierarchical_summary(
+            fake_collection,
+            memory_root=memory_root,
+            conversation_id="simple-conv",
+            summary_result=result,
+        )
+
+        assert len(ids) == 1
+
+        # Check file exists (note: _slugify converts - to - not _)
+        entries_dir = memory_root / "entries" / "simple-conv"
+        l3_file = entries_dir / "summaries" / "L3" / "final.md"
+        assert l3_file.exists()
+
+        # Check content has YAML front matter
+        content = l3_file.read_text(encoding="utf-8")
+        assert "---" in content
+        assert "level: 3" in content
+        assert "A standard paragraph summary." in content
+
+    def test_persist_deletes_old_summaries(
+        self,
+        fake_collection: _FakeCollection,
+        memory_root: Path,
+    ) -> None:
+        """Test that persisting new summary deletes old summary files."""
+        # Create first summary
+        result1 = SummaryResult(
+            level=SummaryLevel.STANDARD,
+            summary="First summary.",
+            hierarchical=None,
+            input_tokens=1000,
+            output_tokens=50,
+            compression_ratio=0.05,
+        )
+
+        persist_hierarchical_summary(
+            fake_collection,
+            memory_root=memory_root,
+            conversation_id="conv",
+            summary_result=result1,
+        )
+
+        entries_dir = memory_root / "entries" / "conv"
+        first_file = entries_dir / "summaries" / "L3" / "final.md"
+        assert first_file.exists()
+        assert "First summary." in first_file.read_text()
+
+        # Create second summary (should replace first)
+        result2 = SummaryResult(
+            level=SummaryLevel.STANDARD,
+            summary="Second summary.",
+            hierarchical=None,
+            input_tokens=1000,
+            output_tokens=50,
+            compression_ratio=0.05,
+        )
+
+        persist_hierarchical_summary(
+            fake_collection,
+            memory_root=memory_root,
+            conversation_id="conv",
+            summary_result=result2,
+        )
+
+        # First summary should be moved to deleted
+        assert first_file.exists()
+        assert "Second summary." in first_file.read_text()
+
+        # Old summary should be in deleted folder
+        deleted_dir = memory_root / "entries" / "deleted" / "conv" / "summaries"
+        assert deleted_dir.exists()
+
+
+class TestAdaptiveSummarizerLevelDetermination:
+    """Test that AdaptiveSummarizer correctly determines summary levels."""
+
+    @pytest.fixture
+    def summarizer(self) -> AdaptiveSummarizer:
+        """Create an AdaptiveSummarizer instance."""
+        return AdaptiveSummarizer(
+            openai_base_url="http://localhost:8000/v1",
+            model="test-model",
+        )
+
+    def test_very_short_content_is_none(self, summarizer: AdaptiveSummarizer) -> None:
+        """Test that content under 100 tokens gets NONE level."""
+        level = summarizer.determine_level(50)
+        assert level == SummaryLevel.NONE
+
+    def test_short_content_is_brief(self, summarizer: AdaptiveSummarizer) -> None:
+        """Test that 100-500 token content gets BRIEF level."""
+        level = summarizer.determine_level(300)
+        assert level == SummaryLevel.BRIEF
+
+    def test_medium_content_is_standard(self, summarizer: AdaptiveSummarizer) -> None:
+        """Test that 500-3000 token content gets STANDARD level."""
+        level = summarizer.determine_level(1500)
+        assert level == SummaryLevel.STANDARD
+
+    def test_long_content_is_detailed(self, summarizer: AdaptiveSummarizer) -> None:
+        """Test that 3000-15000 token content gets DETAILED level."""
+        level = summarizer.determine_level(8000)
+        assert level == SummaryLevel.DETAILED
+
+    def test_very_long_content_is_hierarchical(self, summarizer: AdaptiveSummarizer) -> None:
+        """Test that content over 15000 tokens gets HIERARCHICAL level."""
+        level = summarizer.determine_level(25000)
+        assert level == SummaryLevel.HIERARCHICAL
+
+
+class TestSummarizeContentFunction:
+    """Test the summarize_content function from _ingest."""
+
+    @pytest.mark.asyncio
+    async def test_summarize_content_creates_result(self) -> None:
+        """Test that summarize_content returns a valid SummaryResult."""
+        with patch.object(AdaptiveSummarizer, "summarize") as mock_summarize:
+            mock_result = SummaryResult(
+                level=SummaryLevel.STANDARD,
+                summary="Mocked summary.",
+                hierarchical=None,
+                input_tokens=1000,
+                output_tokens=50,
+                compression_ratio=0.05,
+            )
+            mock_summarize.return_value = mock_result
+
+            result = await summarize_content(
+                content="Some content to summarize " * 100,
+                openai_base_url="http://localhost:8000/v1",
+                api_key=None,
+                model="test-model",
+            )
+
+            assert result.level == SummaryLevel.STANDARD
+            assert result.summary == "Mocked summary."
diff --git a/tests/summarizer/test_models.py b/tests/summarizer/test_models.py
new file mode 100644
index 000000000..5a6583cd2
--- /dev/null
+++ b/tests/summarizer/test_models.py
@@ -0,0 +1,332 @@
+"""Unit tests for summarizer models."""
+
+from __future__ import annotations
+
+from datetime import UTC, datetime
+
+import pytest
+
+from agent_cli.summarizer.models import (
+    ChunkSummary,
+    HierarchicalSummary,
+    SummaryLevel,
+    SummaryResult,
+)
+
+
+class TestSummaryLevel:
+    """Tests for SummaryLevel enum."""
+
+    def test_level_values(self) -> None:
+        """Test that levels have correct integer values."""
+        assert SummaryLevel.NONE == 0
+        assert SummaryLevel.BRIEF == 1
+        assert SummaryLevel.STANDARD == 2
+        assert SummaryLevel.DETAILED == 3
+        assert SummaryLevel.HIERARCHICAL == 4
+
+    def test_level_ordering(self) -> None:
+        """Test that levels can be compared."""
+        assert SummaryLevel.NONE < SummaryLevel.BRIEF
+        assert SummaryLevel.BRIEF < SummaryLevel.STANDARD
+        assert SummaryLevel.STANDARD < SummaryLevel.DETAILED
+        assert SummaryLevel.DETAILED < SummaryLevel.HIERARCHICAL
+
+
+class TestChunkSummary:
+    """Tests for ChunkSummary model."""
+
+    def test_basic_creation(self) -> None:
+        """Test creating a chunk summary."""
+        chunk = ChunkSummary(
+            chunk_index=0,
+            content="This is a summary of chunk 1.",
+            token_count=10,
+            source_tokens=100,
+            parent_group=None,
+        )
+        assert chunk.chunk_index == 0
+        assert chunk.content == "This is a summary of chunk 1."
+        assert chunk.token_count == 10
+        assert chunk.source_tokens == 100
+        assert chunk.parent_group is None
+
+    def test_with_parent_group(self) -> None:
+        """Test creating a chunk summary with parent group."""
+        chunk = ChunkSummary(
+            chunk_index=5,
+            content="Summary text",
+            token_count=8,
+            source_tokens=200,
+            parent_group=1,
+        )
+        assert chunk.parent_group == 1
+
+    def test_validation_negative_tokens(self) -> None:
+        """Test that negative token counts fail validation."""
+        with pytest.raises(ValueError, match="greater than or equal to 0"):
+            ChunkSummary(
+                chunk_index=0,
+                content="Test",
+                token_count=-1,
+                source_tokens=100,
+            )
+
+
+class TestHierarchicalSummary:
+    """Tests for HierarchicalSummary model."""
+
+    def test_basic_creation(self) -> None:
+        """Test creating a hierarchical summary."""
+        l1 = [
+            ChunkSummary(
+                chunk_index=0,
+                content="Chunk 1 summary",
+                token_count=10,
+                source_tokens=100,
+            ),
+            ChunkSummary(
+                chunk_index=1,
+                content="Chunk 2 summary",
+                token_count=12,
+                source_tokens=120,
+            ),
+        ]
+        hs = HierarchicalSummary(
+            l1_summaries=l1,
+            l2_summaries=["Group summary"],
+            l3_summary="Final summary of all content.",
+        )
+        assert len(hs.l1_summaries) == 2
+        assert len(hs.l2_summaries) == 1
+        assert hs.l3_summary == "Final summary of all content."
+
+    def test_default_chunk_settings(self) -> None:
+        """Test default chunk size and overlap."""
+        hs = HierarchicalSummary(
+            l1_summaries=[],
+            l2_summaries=[],
+            l3_summary="Final",
+        )
+        assert hs.chunk_size == 3000
+        assert hs.chunk_overlap == 200
+
+    def test_get_summary_at_level_1(self) -> None:
+        """Test getting L1 summaries."""
+        l1 = [
+            ChunkSummary(chunk_index=0, content="C1", token_count=5, source_tokens=50),
+            ChunkSummary(chunk_index=1, content="C2", token_count=5, source_tokens=50),
+        ]
+        hs = HierarchicalSummary(l1_summaries=l1, l2_summaries=[], l3_summary="Final")
+        result = hs.get_summary_at_level(1)
+        assert result == ["C1", "C2"]
+
+    def test_get_summary_at_level_2_with_l2(self) -> None:
+        """Test getting L2 summaries when available."""
+        hs = HierarchicalSummary(
+            l1_summaries=[],
+            l2_summaries=["Group A", "Group B"],
+            l3_summary="Final",
+        )
+        result = hs.get_summary_at_level(2)
+        assert result == ["Group A", "Group B"]
+
+    def test_get_summary_at_level_2_fallback(self) -> None:
+        """Test getting L2 falls back to L3 when no L2 summaries."""
+        hs = HierarchicalSummary(
+            l1_summaries=[],
+            l2_summaries=[],
+            l3_summary="Final summary",
+        )
+        result = hs.get_summary_at_level(2)
+        assert result == ["Final summary"]
+
+    def test_get_summary_at_level_3(self) -> None:
+        """Test getting L3 summary."""
+        hs = HierarchicalSummary(
+            l1_summaries=[],
+            l2_summaries=["Group"],
+            l3_summary="The final summary",
+        )
+        result = hs.get_summary_at_level(3)
+        assert result == "The final summary"
+
+
+class TestSummaryResult:
+    """Tests for SummaryResult model."""
+
+    def test_none_level_result(self) -> None:
+        """Test result for content that needs no summary."""
+        result = SummaryResult(
+            level=SummaryLevel.NONE,
+            summary=None,
+            hierarchical=None,
+            input_tokens=50,
+            output_tokens=0,
+            compression_ratio=0.0,
+        )
+        assert result.level == SummaryLevel.NONE
+        assert result.summary is None
+        assert result.chunk_summaries is None
+
+    def test_brief_level_result(self) -> None:
+        """Test result for brief summary."""
+        result = SummaryResult(
+            level=SummaryLevel.BRIEF,
+            summary="A brief one-sentence summary.",
+            hierarchical=None,
+            input_tokens=200,
+            output_tokens=10,
+            compression_ratio=0.05,
+        )
+        assert result.level == SummaryLevel.BRIEF
+        assert result.summary == "A brief one-sentence summary."
+        assert result.chunk_summaries is None
+
+    def test_hierarchical_result_with_chunk_summaries(self) -> None:
+        """Test hierarchical result exposes chunk summaries."""
+        l1 = [
+            ChunkSummary(chunk_index=0, content="Chunk 1", token_count=10, source_tokens=100),
+            ChunkSummary(chunk_index=1, content="Chunk 2", token_count=10, source_tokens=100),
+        ]
+        hierarchical = HierarchicalSummary(
+            l1_summaries=l1,
+            l2_summaries=[],
+            l3_summary="Final",
+        )
+        result = SummaryResult(
+            level=SummaryLevel.DETAILED,
+            summary="Final",
+            hierarchical=hierarchical,
+            input_tokens=5000,
+            output_tokens=100,
+            compression_ratio=0.02,
+        )
+        assert result.chunk_summaries == ["Chunk 1", "Chunk 2"]
+
+    def test_to_storage_metadata_none_level(self) -> None:
+        """Test that NONE level produces no storage entries."""
+        result = SummaryResult(
+            level=SummaryLevel.NONE,
+            summary=None,
+            hierarchical=None,
+            input_tokens=50,
+            output_tokens=0,
+            compression_ratio=0.0,
+        )
+        entries = result.to_storage_metadata("conv-123")
+        assert entries == []
+
+    def test_to_storage_metadata_simple_summary(self) -> None:
+        """Test storage metadata for simple (non-hierarchical) summary."""
+        result = SummaryResult(
+            level=SummaryLevel.STANDARD,
+            summary="A standard paragraph summary.",
+            hierarchical=None,
+            input_tokens=1000,
+            output_tokens=50,
+            compression_ratio=0.05,
+        )
+        entries = result.to_storage_metadata("conv-456")
+        assert len(entries) == 1
+        entry = entries[0]
+        assert entry["id"] == "conv-456:summary:L3:final"
+        assert entry["content"] == "A standard paragraph summary."
+        assert entry["metadata"]["conversation_id"] == "conv-456"
+        assert entry["metadata"]["role"] == "summary"
+        assert entry["metadata"]["level"] == 3
+        assert entry["metadata"]["is_final"] is True
+        assert entry["metadata"]["summary_level"] == "STANDARD"
+
+    def test_to_storage_metadata_hierarchical(self) -> None:
+        """Test storage metadata for hierarchical summary."""
+        l1 = [
+            ChunkSummary(
+                chunk_index=0,
+                content="Chunk 0 text",
+                token_count=10,
+                source_tokens=100,
+                parent_group=0,
+            ),
+            ChunkSummary(
+                chunk_index=1,
+                content="Chunk 1 text",
+                token_count=12,
+                source_tokens=120,
+                parent_group=0,
+            ),
+        ]
+        hierarchical = HierarchicalSummary(
+            l1_summaries=l1,
+            l2_summaries=["Group 0 summary"],
+            l3_summary="Final synthesis",
+        )
+        result = SummaryResult(
+            level=SummaryLevel.HIERARCHICAL,
+            summary="Final synthesis",
+            hierarchical=hierarchical,
+            input_tokens=20000,
+            output_tokens=200,
+            compression_ratio=0.01,
+        )
+        entries = result.to_storage_metadata("conv-789")
+
+        # Should have 2 L1 + 1 L2 + 1 L3 = 4 entries
+        assert len(entries) == 4
+
+        # Check L1 entries
+        l1_entries = [e for e in entries if e["metadata"]["level"] == 1]
+        assert len(l1_entries) == 2
+        assert l1_entries[0]["id"] == "conv-789:summary:L1:0"
+        assert l1_entries[0]["metadata"]["chunk_index"] == 0
+
+        # Check L2 entry
+        l2_entries = [e for e in entries if e["metadata"]["level"] == 2]
+        assert len(l2_entries) == 1
+        assert l2_entries[0]["id"] == "conv-789:summary:L2:0"
+        assert l2_entries[0]["content"] == "Group 0 summary"
+
+        # Check L3 entry
+        l3_entries = [e for e in entries if e["metadata"]["level"] == 3]
+        assert len(l3_entries) == 1
+        assert l3_entries[0]["id"] == "conv-789:summary:L3:final"
+        assert l3_entries[0]["metadata"]["is_final"] is True
+
+    def test_compression_ratio_bounds(self) -> None:
+        """Test compression ratio validation."""
+        # Valid ratio
+        result = SummaryResult(
+            level=SummaryLevel.BRIEF,
+            summary="Test",
+            hierarchical=None,
+            input_tokens=100,
+            output_tokens=10,
+            compression_ratio=0.1,
+        )
+        assert result.compression_ratio == 0.1
+
+        # Ratio must be between 0 and 1
+        with pytest.raises(ValueError, match="less than or equal to 1"):
+            SummaryResult(
+                level=SummaryLevel.BRIEF,
+                summary="Test",
+                hierarchical=None,
+                input_tokens=100,
+                output_tokens=10,
+                compression_ratio=1.5,
+            )
+
+    def test_created_at_default(self) -> None:
+        """Test that created_at is automatically set."""
+        before = datetime.now(UTC)
+        result = SummaryResult(
+            level=SummaryLevel.BRIEF,
+            summary="Test",
+            hierarchical=None,
+            input_tokens=100,
+            output_tokens=10,
+            compression_ratio=0.1,
+        )
+        after = datetime.now(UTC)
+        # Compare without timezone since result.created_at may be naive
+        assert before.replace(tzinfo=None) <= result.created_at <= after.replace(tzinfo=None)
diff --git a/tests/summarizer/test_prompts.py b/tests/summarizer/test_prompts.py
new file mode 100644
index 000000000..e126def22
--- /dev/null
+++ b/tests/summarizer/test_prompts.py
@@ -0,0 +1,180 @@
+"""Unit tests for summarizer prompt templates."""
+
+from __future__ import annotations
+
+from agent_cli.summarizer.prompts import (
+    BRIEF_SUMMARY_PROMPT,
+    CHUNK_SUMMARY_PROMPT,
+    CONVERSATION_SUMMARY_PROMPT,
+    DOCUMENT_SUMMARY_PROMPT,
+    JOURNAL_SUMMARY_PROMPT,
+    META_SUMMARY_PROMPT,
+    ROLLING_SUMMARY_PROMPT,
+    STANDARD_SUMMARY_PROMPT,
+    format_prior_context,
+    format_summaries_for_meta,
+    get_prompt_for_content_type,
+)
+
+
+class TestPromptTemplates:
+    """Tests for prompt template structure."""
+
+    def test_brief_prompt_has_content_placeholder(self) -> None:
+        """Test BRIEF prompt contains content placeholder."""
+        assert "{content}" in BRIEF_SUMMARY_PROMPT
+        # Test it can be formatted
+        result = BRIEF_SUMMARY_PROMPT.format(content="Test content")
+        assert "Test content" in result
+
+    def test_standard_prompt_has_placeholders(self) -> None:
+        """Test STANDARD prompt contains required placeholders."""
+        assert "{content}" in STANDARD_SUMMARY_PROMPT
+        assert "{prior_context}" in STANDARD_SUMMARY_PROMPT
+        assert "{max_words}" in STANDARD_SUMMARY_PROMPT
+
+        result = STANDARD_SUMMARY_PROMPT.format(
+            content="Main content",
+            prior_context="Previous context",
+            max_words=100,
+        )
+        assert "Main content" in result
+        assert "Previous context" in result
+        assert "100" in result
+
+    def test_chunk_prompt_has_placeholders(self) -> None:
+        """Test CHUNK prompt contains required placeholders."""
+        assert "{content}" in CHUNK_SUMMARY_PROMPT
+        assert "{chunk_index}" in CHUNK_SUMMARY_PROMPT
+        assert "{total_chunks}" in CHUNK_SUMMARY_PROMPT
+        assert "{max_words}" in CHUNK_SUMMARY_PROMPT
+
+        result = CHUNK_SUMMARY_PROMPT.format(
+            content="Chunk content",
+            chunk_index=1,
+            total_chunks=5,
+            max_words=50,
+        )
+        assert "Chunk content" in result
+        assert "1" in result
+        assert "5" in result
+
+    def test_meta_prompt_has_placeholders(self) -> None:
+        """Test META prompt contains required placeholders."""
+        assert "{summaries}" in META_SUMMARY_PROMPT
+        assert "{max_words}" in META_SUMMARY_PROMPT
+
+        result = META_SUMMARY_PROMPT.format(
+            summaries="Summary 1\n\nSummary 2",
+            max_words=200,
+        )
+        assert "Summary 1" in result
+        assert "200" in result
+
+    def test_rolling_prompt_has_placeholders(self) -> None:
+        """Test ROLLING prompt contains required placeholders."""
+        assert "{prior_summary}" in ROLLING_SUMMARY_PROMPT
+        assert "{new_content}" in ROLLING_SUMMARY_PROMPT
+        assert "{max_words}" in ROLLING_SUMMARY_PROMPT
+
+    def test_conversation_prompt_has_content(self) -> None:
+        """Test CONVERSATION prompt contains content placeholder."""
+        assert "{content}" in CONVERSATION_SUMMARY_PROMPT
+        assert "{max_words}" in CONVERSATION_SUMMARY_PROMPT
+
+    def test_journal_prompt_has_content(self) -> None:
+        """Test JOURNAL prompt contains content placeholder."""
+        assert "{content}" in JOURNAL_SUMMARY_PROMPT
+        assert "{max_words}" in JOURNAL_SUMMARY_PROMPT
+
+    def test_document_prompt_has_content(self) -> None:
+        """Test DOCUMENT prompt contains content placeholder."""
+        assert "{content}" in DOCUMENT_SUMMARY_PROMPT
+        assert "{max_words}" in DOCUMENT_SUMMARY_PROMPT
+
+
+class TestGetPromptForContentType:
+    """Tests for get_prompt_for_content_type function."""
+
+    def test_general_returns_standard(self) -> None:
+        """Test general content type returns standard prompt."""
+        prompt = get_prompt_for_content_type("general")
+        assert prompt == STANDARD_SUMMARY_PROMPT
+
+    def test_conversation_returns_conversation(self) -> None:
+        """Test conversation content type returns conversation prompt."""
+        prompt = get_prompt_for_content_type("conversation")
+        assert prompt == CONVERSATION_SUMMARY_PROMPT
+
+    def test_journal_returns_journal(self) -> None:
+        """Test journal content type returns journal prompt."""
+        prompt = get_prompt_for_content_type("journal")
+        assert prompt == JOURNAL_SUMMARY_PROMPT
+
+    def test_document_returns_document(self) -> None:
+        """Test document content type returns document prompt."""
+        prompt = get_prompt_for_content_type("document")
+        assert prompt == DOCUMENT_SUMMARY_PROMPT
+
+    def test_unknown_returns_standard(self) -> None:
+        """Test unknown content type falls back to standard."""
+        prompt = get_prompt_for_content_type("unknown_type")
+        assert prompt == STANDARD_SUMMARY_PROMPT
+
+    def test_empty_returns_standard(self) -> None:
+        """Test empty string falls back to standard."""
+        prompt = get_prompt_for_content_type("")
+        assert prompt == STANDARD_SUMMARY_PROMPT
+
+
+class TestFormatPriorContext:
+    """Tests for format_prior_context function."""
+
+    def test_with_prior_summary(self) -> None:
+        """Test formatting with a prior summary."""
+        result = format_prior_context("Previous summary text")
+        assert "Prior context" in result
+        assert "Previous summary text" in result
+
+    def test_without_prior_summary(self) -> None:
+        """Test formatting without prior summary returns empty string."""
+        result = format_prior_context(None)
+        assert result == ""
+
+    def test_empty_string_prior_summary(self) -> None:
+        """Test formatting with empty string prior summary."""
+        result = format_prior_context("")
+        assert result == ""
+
+
+class TestFormatSummariesForMeta:
+    """Tests for format_summaries_for_meta function."""
+
+    def test_single_summary(self) -> None:
+        """Test formatting a single summary."""
+        result = format_summaries_for_meta(["Summary one"])
+        assert "[Section 1]" in result
+        assert "Summary one" in result
+
+    def test_multiple_summaries(self) -> None:
+        """Test formatting multiple summaries."""
+        summaries = ["First summary", "Second summary", "Third summary"]
+        result = format_summaries_for_meta(summaries)
+
+        assert "[Section 1]" in result
+        assert "[Section 2]" in result
+        assert "[Section 3]" in result
+        assert "First summary" in result
+        assert "Second summary" in result
+        assert "Third summary" in result
+
+    def test_empty_list(self) -> None:
+        """Test formatting empty list."""
+        result = format_summaries_for_meta([])
+        assert result == ""
+
+    def test_summaries_separated(self) -> None:
+        """Test summaries are separated by double newlines."""
+        summaries = ["Sum 1", "Sum 2"]
+        result = format_summaries_for_meta(summaries)
+        assert "\n\n" in result
diff --git a/tests/summarizer/test_utils.py b/tests/summarizer/test_utils.py
new file mode 100644
index 000000000..458e9b37d
--- /dev/null
+++ b/tests/summarizer/test_utils.py
@@ -0,0 +1,193 @@
+"""Unit tests for summarizer utility functions."""
+
+from __future__ import annotations
+
+from agent_cli.summarizer.utils import (
+    chunk_text,
+    count_tokens,
+    estimate_summary_tokens,
+    middle_truncate,
+    tokens_to_words,
+)
+
+
+class TestCountTokens:
+    """Tests for count_tokens function."""
+
+    def test_empty_string(self) -> None:
+        """Test counting tokens in empty string."""
+        assert count_tokens("") == 0
+
+    def test_simple_sentence(self) -> None:
+        """Test counting tokens in a simple sentence."""
+        # "Hello world" is typically 2 tokens
+        count = count_tokens("Hello world")
+        assert count > 0
+        assert count < 10
+
+    def test_longer_text(self) -> None:
+        """Test that longer text has more tokens."""
+        short = count_tokens("Hello")
+        long = count_tokens("Hello world, this is a longer sentence with more words.")
+        assert long > short
+
+    def test_different_model_fallback(self) -> None:
+        """Test that unknown models fall back to cl100k_base."""
+        # Should not raise, should fall back gracefully
+        count = count_tokens("Hello world", model="unknown-model-xyz")
+        assert count > 0
+
+
+class TestChunkText:
+    """Tests for chunk_text function."""
+
+    def test_empty_text(self) -> None:
+        """Test chunking empty text returns empty list."""
+        assert chunk_text("") == []
+
+    def test_short_text_single_chunk(self) -> None:
+        """Test that short text stays as single chunk."""
+        text = "This is a short paragraph."
+        chunks = chunk_text(text, chunk_size=1000)
+        assert len(chunks) == 1
+        assert chunks[0] == text
+
+    def test_multiple_paragraphs_chunking(self) -> None:
+        """Test chunking multiple paragraphs."""
+        paragraphs = ["Paragraph one. " * 50, "Paragraph two. " * 50, "Paragraph three. " * 50]
+        text = "\n\n".join(paragraphs)
+
+        # Use small chunk size to force splitting
+        chunks = chunk_text(text, chunk_size=200, overlap=20)
+        assert len(chunks) > 1
+
+    def test_overlap_preserved(self) -> None:
+        """Test that chunks have overlap for context continuity."""
+        # Create text that will definitely need chunking
+        text = "Sentence one about topic A. " * 20 + "\n\n" + "Sentence two about topic B. " * 20
+
+        chunks = chunk_text(text, chunk_size=100, overlap=30)
+
+        # With overlap, later chunks should contain some content from earlier
+        if len(chunks) > 1:
+            # Overlap means adjacent chunks share some content
+            # This is a rough check - exact overlap depends on tokenization
+            assert len(chunks) >= 2
+
+    def test_large_paragraph_sentence_split(self) -> None:
+        """Test that large paragraphs are split by sentences."""
+        # One giant paragraph with multiple sentences
+        sentences = [
+            f"This is sentence number {i}. It contains important information." for i in range(50)
+        ]
+        text = " ".join(sentences)
+
+        chunks = chunk_text(text, chunk_size=100, overlap=20)
+        assert len(chunks) > 1
+
+
+class TestMiddleTruncate:
+    """Tests for middle_truncate function."""
+
+    def test_no_truncation_needed(self) -> None:
+        """Test that short text is not truncated."""
+        text = "Short text"
+        result, dropped = middle_truncate(text, budget_chars=100)
+        assert result == text
+        assert dropped == 0
+
+    def test_basic_truncation(self) -> None:
+        """Test basic middle truncation."""
+        text = "A" * 100  # 100 character string
+        result, dropped = middle_truncate(text, budget_chars=50)
+
+        # Should have head + marker + tail
+        assert len(result) <= 50 + 50  # Allow for marker
+        assert dropped > 0
+        assert "[..." in result
+        assert "truncated...]" in result
+
+    def test_head_tail_fractions(self) -> None:
+        """Test custom head/tail fractions."""
+        text = "AAAAA" + "BBBBB" * 20 + "CCCCC"
+        result, dropped = middle_truncate(text, budget_chars=30, head_frac=0.5, tail_frac=0.5)
+
+        # Should preserve beginning (A's) and end (C's)
+        assert result.startswith("A")
+        assert dropped > 0
+
+    def test_zero_budget(self) -> None:
+        """Test with zero budget returns original."""
+        text = "Some text"
+        result, dropped = middle_truncate(text, budget_chars=0)
+        assert result == text
+        assert dropped == 0
+
+    def test_negative_budget(self) -> None:
+        """Test with negative budget returns original."""
+        text = "Some text"
+        result, dropped = middle_truncate(text, budget_chars=-10)
+        assert result == text
+        assert dropped == 0
+
+
+class TestEstimateSummaryTokens:
+    """Tests for estimate_summary_tokens function."""
+
+    def test_none_level(self) -> None:
+        """Test level 0 (NONE) returns 0."""
+        assert estimate_summary_tokens(1000, level=0) == 0
+
+    def test_brief_level(self) -> None:
+        """Test level 1 (BRIEF) compression."""
+        # BRIEF: ~20% compression, capped at 50
+        result = estimate_summary_tokens(100, level=1)
+        assert result >= 20  # minimum of 20
+        assert result <= 50  # capped at 50
+
+    def test_standard_level(self) -> None:
+        """Test level 2 (STANDARD) compression."""
+        # STANDARD: ~12% compression, capped at 200
+        result = estimate_summary_tokens(1000, level=2)
+        assert result >= 50  # minimum of 50
+        assert result <= 200  # capped at 200
+
+    def test_detailed_level(self) -> None:
+        """Test level 3 (DETAILED) compression."""
+        # DETAILED: ~7% compression, capped at 500
+        result = estimate_summary_tokens(10000, level=3)
+        assert result >= 100  # minimum of 100
+        assert result <= 500  # capped at 500
+
+    def test_hierarchical_level(self) -> None:
+        """Test level 4 (HIERARCHICAL) compression."""
+        # HIERARCHICAL: base of 1000 + diminishing returns
+        result = estimate_summary_tokens(50000, level=4)
+        assert result >= 1000  # base minimum
+        assert result <= 2000  # capped at 2000
+
+    def test_hierarchical_small_input(self) -> None:
+        """Test HIERARCHICAL with smaller input."""
+        # Even with small input, should return base
+        result = estimate_summary_tokens(5000, level=4)
+        assert result == 1000  # just the base, no additional
+
+
+class TestTokensToWords:
+    """Tests for tokens_to_words function."""
+
+    def test_basic_conversion(self) -> None:
+        """Test basic token to word conversion."""
+        # 1 token ≈ 0.75 words
+        assert tokens_to_words(100) == 75
+        assert tokens_to_words(1000) == 750
+
+    def test_zero_tokens(self) -> None:
+        """Test zero tokens returns zero words."""
+        assert tokens_to_words(0) == 0
+
+    def test_small_values(self) -> None:
+        """Test small token values."""
+        assert tokens_to_words(1) == 0  # int(0.75) = 0
+        assert tokens_to_words(2) == 1  # int(1.5) = 1
+        assert tokens_to_words(4) == 3  # int(3.0) = 3

From 47c32a688185390603caaa31b31f0b70b2b378a1 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 26 Nov 2025 20:00:28 -0800
Subject: [PATCH 07/37] refactor(summarizer): improve code quality and add
 Letta-style features

- Fix datetime.utcnow() deprecation, use datetime.now(UTC)
- Extract duplicate chunk summarization to _summarize_single_chunk()
- Add SummarizationError exception for better error handling
- Add retry with exponential backoff (1s, 2s, 4s) for generation failures
- Add middle-truncation fallback for oversized content (Letta-style)
- Export SummarizationError from module __init__
---
 agent_cli/summarizer/__init__.py |   3 +-
 agent_cli/summarizer/adaptive.py | 176 ++++++++++++++++++++++---------
 agent_cli/summarizer/models.py   |   4 +-
 tests/summarizer/test_models.py  |   4 +-
 4 files changed, 135 insertions(+), 52 deletions(-)

diff --git a/agent_cli/summarizer/__init__.py b/agent_cli/summarizer/__init__.py
index c6f1d85a1..d017dfd4b 100644
--- a/agent_cli/summarizer/__init__.py
+++ b/agent_cli/summarizer/__init__.py
@@ -16,7 +16,7 @@
 
 """
 
-from agent_cli.summarizer.adaptive import AdaptiveSummarizer
+from agent_cli.summarizer.adaptive import AdaptiveSummarizer, SummarizationError
 from agent_cli.summarizer.models import (
     HierarchicalSummary,
     SummaryLevel,
@@ -26,6 +26,7 @@
 __all__ = [
     "AdaptiveSummarizer",
     "HierarchicalSummary",
+    "SummarizationError",
     "SummaryLevel",
     "SummaryResult",
 ]
diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py
index ed0074d87..e8ff2f9a0 100644
--- a/agent_cli/summarizer/adaptive.py
+++ b/agent_cli/summarizer/adaptive.py
@@ -38,6 +38,7 @@
     chunk_text,
     count_tokens,
     estimate_summary_tokens,
+    middle_truncate,
     tokens_to_words,
 )
 
@@ -57,6 +58,14 @@
 # Minimum number of L1 chunks before L2 grouping is applied
 L2_MIN_CHUNKS = 5
 
+# Retry settings for summarization failures
+MAX_SUMMARIZE_RETRIES = 3
+
+# Maximum characters per chunk before applying middle truncation
+# This prevents context overflow errors for very large chunks
+# (roughly 12K tokens with cl100k_base encoding)
+MAX_CHUNK_CHARS = 48000
+
 
 class SummaryOutput(BaseModel):
     """Structured output for summary generation."""
@@ -64,6 +73,10 @@ class SummaryOutput(BaseModel):
     summary: str
 
 
+class SummarizationError(Exception):
+    """Raised when summarization fails after all retries."""
+
+
 class AdaptiveSummarizer:
     """Adaptive summarization that scales with input complexity.
 
@@ -245,6 +258,68 @@ async def update_rolling_summary(
 
         return await self._generate_summary(prompt, max_tokens=target_tokens + 50)
 
+    async def _summarize_single_chunk(
+        self,
+        chunk: str,
+        chunk_index: int,
+        total_chunks: int,
+        *,
+        parent_group: int | None = None,
+    ) -> ChunkSummary:
+        """Summarize a single chunk of content.
+
+        Extracted to avoid duplication between _detailed_summary and
+        _hierarchical_summary methods. Uses middle truncation as a fallback
+        for oversized content (Letta-style).
+
+        Args:
+            chunk: The text chunk to summarize.
+            chunk_index: Index of this chunk (0-based).
+            total_chunks: Total number of chunks being processed.
+            parent_group: Optional L2 group index for hierarchical summaries.
+
+        Returns:
+            ChunkSummary with the summarized content.
+
+        """
+        # Apply middle truncation if chunk is too large (Letta-style fallback)
+        source_tokens = count_tokens(chunk, self.model)
+        content_to_summarize = chunk
+        if len(chunk) > MAX_CHUNK_CHARS:
+            content_to_summarize, dropped = middle_truncate(
+                chunk,
+                MAX_CHUNK_CHARS,
+                head_frac=0.3,
+                tail_frac=0.3,
+            )
+            logger.warning(
+                "Chunk %d truncated: dropped %d chars to fit context window",
+                chunk_index,
+                dropped,
+            )
+
+        chunk_tokens = count_tokens(content_to_summarize, self.model)
+        target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD)
+        max_words = tokens_to_words(target_tokens)
+
+        prompt = CHUNK_SUMMARY_PROMPT.format(
+            chunk_index=chunk_index + 1,
+            total_chunks=total_chunks,
+            content=content_to_summarize,
+            max_words=max_words,
+        )
+
+        summary = await self._generate_summary(prompt, max_tokens=target_tokens + 50)
+        summary_tokens = count_tokens(summary, self.model)
+
+        return ChunkSummary(
+            chunk_index=chunk_index,
+            content=summary,
+            token_count=summary_tokens,
+            source_tokens=source_tokens,  # Report original token count
+            parent_group=parent_group,
+        )
+
     async def _brief_summary(self, content: str) -> str:
         """Generate a single-sentence summary for brief content."""
         prompt = BRIEF_SUMMARY_PROMPT.format(content=content)
@@ -286,32 +361,17 @@ async def _detailed_summary(self, content: str, input_tokens: int) -> SummaryRes
         # Summarize chunks (with concurrency limit)
         semaphore = asyncio.Semaphore(self.max_concurrent_chunks)
 
-        async def summarize_chunk(idx: int, chunk: str) -> ChunkSummary:
+        async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary:
             async with semaphore:
-                chunk_tokens = count_tokens(chunk, self.model)
-                target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD)
-                max_words = tokens_to_words(target_tokens)
-
-                prompt = CHUNK_SUMMARY_PROMPT.format(
-                    chunk_index=idx + 1,
-                    total_chunks=len(chunks),
-                    content=chunk,
-                    max_words=max_words,
-                )
-
-                summary = await self._generate_summary(prompt, max_tokens=target_tokens + 50)
-                summary_tokens = count_tokens(summary, self.model)
-
-                return ChunkSummary(
-                    chunk_index=idx,
-                    content=summary,
-                    token_count=summary_tokens,
-                    source_tokens=chunk_tokens,
+                return await self._summarize_single_chunk(
+                    chunk,
+                    idx,
+                    len(chunks),
                     parent_group=None,
                 )
 
         chunk_summaries = await asyncio.gather(
-            *[summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)],
+            *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)],
         )
 
         # Generate meta-summary
@@ -364,35 +424,19 @@ async def _hierarchical_summary(self, content: str, input_tokens: int) -> Summar
         # L1: Summarize each chunk
         semaphore = asyncio.Semaphore(self.max_concurrent_chunks)
 
-        async def summarize_chunk(idx: int, chunk: str) -> ChunkSummary:
+        async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary:
             async with semaphore:
-                chunk_tokens = count_tokens(chunk, self.model)
-                target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD)
-                max_words = tokens_to_words(target_tokens)
-
-                prompt = CHUNK_SUMMARY_PROMPT.format(
-                    chunk_index=idx + 1,
-                    total_chunks=len(chunks),
-                    content=chunk,
-                    max_words=max_words,
-                )
-
-                summary = await self._generate_summary(prompt, max_tokens=target_tokens + 50)
-                summary_tokens = count_tokens(summary, self.model)
-
-                # Assign to group (5 chunks per group)
-                group_idx = idx // 5
-
-                return ChunkSummary(
-                    chunk_index=idx,
-                    content=summary,
-                    token_count=summary_tokens,
-                    source_tokens=chunk_tokens,
+                # Assign to L2 group (L2_GROUP_SIZE chunks per group)
+                group_idx = idx // L2_GROUP_SIZE
+                return await self._summarize_single_chunk(
+                    chunk,
+                    idx,
+                    len(chunks),
                     parent_group=group_idx,
                 )
 
         l1_summaries = await asyncio.gather(
-            *[summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)],
+            *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)],
         )
 
         # L2: Group summaries (if more than L2_MIN_CHUNKS chunks)
@@ -448,10 +492,29 @@ async def summarize_group(group: list[str]) -> str:
             compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0,
         )
 
-    async def _generate_summary(self, prompt: str, max_tokens: int = 256) -> str:
+    async def _generate_summary(
+        self,
+        prompt: str,
+        max_tokens: int = 256,
+        *,
+        attempt: int = 0,
+    ) -> str:
         """Generate a summary using the LLM.
 
         Uses PydanticAI for structured output with fallback to raw generation.
+        Implements exponential backoff retry on failures.
+
+        Args:
+            prompt: The prompt to send to the LLM.
+            max_tokens: Maximum tokens for the response.
+            attempt: Current retry attempt (for internal recursion).
+
+        Returns:
+            The generated summary text.
+
+        Raises:
+            SummarizationError: If all retries are exhausted.
+
         """
         model = OpenAIChatModel(
             model_name=self.model,
@@ -475,7 +538,26 @@ async def _generate_summary(self, prompt: str, max_tokens: int = 256) -> str:
         except Exception as e:
             logger.warning("Structured summary failed, trying raw generation: %s", e)
             # Fallback to raw HTTP call
-            return await self._raw_generate(prompt, max_tokens)
+            try:
+                return await self._raw_generate(prompt, max_tokens)
+            except Exception as raw_err:
+                if attempt < MAX_SUMMARIZE_RETRIES:
+                    wait_time = 2**attempt  # Exponential backoff: 1, 2, 4 seconds
+                    logger.warning(
+                        "Raw generation failed (attempt %d/%d), retrying in %ds: %s",
+                        attempt + 1,
+                        MAX_SUMMARIZE_RETRIES,
+                        wait_time,
+                        raw_err,
+                    )
+                    await asyncio.sleep(wait_time)
+                    return await self._generate_summary(
+                        prompt,
+                        max_tokens,
+                        attempt=attempt + 1,
+                    )
+                msg = f"Summarization failed after {MAX_SUMMARIZE_RETRIES} retries"
+                raise SummarizationError(msg) from raw_err
 
     async def _raw_generate(self, prompt: str, max_tokens: int) -> str:
         """Fallback raw HTTP generation without structured output."""
diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py
index f231a41e5..de9bc609a 100644
--- a/agent_cli/summarizer/models.py
+++ b/agent_cli/summarizer/models.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from datetime import datetime
+from datetime import UTC, datetime
 from enum import IntEnum
 from typing import Any
 
@@ -122,7 +122,7 @@ class SummaryResult(BaseModel):
         description="Ratio of output to input tokens (lower = more compression)",
     )
     created_at: datetime = Field(
-        default_factory=datetime.utcnow,
+        default_factory=lambda: datetime.now(UTC),
         description="Timestamp when summary was created",
     )
 
diff --git a/tests/summarizer/test_models.py b/tests/summarizer/test_models.py
index 5a6583cd2..e27fa18e0 100644
--- a/tests/summarizer/test_models.py
+++ b/tests/summarizer/test_models.py
@@ -328,5 +328,5 @@ def test_created_at_default(self) -> None:
             compression_ratio=0.1,
         )
         after = datetime.now(UTC)
-        # Compare without timezone since result.created_at may be naive
-        assert before.replace(tzinfo=None) <= result.created_at <= after.replace(tzinfo=None)
+        # All datetimes should be UTC-aware
+        assert before <= result.created_at <= after

From f145f37dc653aae8d9842922a72a9842be8c9ea5 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 26 Nov 2025 20:11:26 -0800
Subject: [PATCH 08/37] refactor(summarizer): replace class with functional API

- Remove AdaptiveSummarizer class in favor of standalone functions
- Add SummarizerConfig dataclass for configuration
- Export determine_level() as pure function (no state needed)
- Update summarize(), update_rolling_summary() to take config parameter
- Update _ingest.py to use new functional API
- Update all tests for new API

This matches the functional style used throughout the codebase,
reducing state and improving testability.
---
 agent_cli/memory/_ingest.py          |  12 +-
 agent_cli/memory/_persistence.py     |   2 +-
 agent_cli/summarizer/__init__.py     |  19 +-
 agent_cli/summarizer/adaptive.py     | 925 ++++++++++++++-------------
 tests/summarizer/test_adaptive.py    | 219 ++++---
 tests/summarizer/test_integration.py |  37 +-
 6 files changed, 617 insertions(+), 597 deletions(-)

diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py
index 6673000c1..53e3f3c3f 100644
--- a/agent_cli/memory/_ingest.py
+++ b/agent_cli/memory/_ingest.py
@@ -334,9 +334,8 @@ async def summarize_content(
 ) -> SummaryResult:
     """Adaptively summarize content based on its length.
 
-    Uses the AdaptiveSummarizer to automatically select the appropriate
-    summarization strategy (NONE, BRIEF, STANDARD, DETAILED, HIERARCHICAL)
-    based on input token count.
+    Automatically selects the appropriate summarization strategy
+    (NONE, BRIEF, STANDARD, DETAILED, HIERARCHICAL) based on input token count.
 
     Args:
         content: The content to summarize.
@@ -351,15 +350,16 @@ async def summarize_content(
 
     """
     # Import here to avoid circular imports and allow optional dependency
-    from agent_cli.summarizer import AdaptiveSummarizer  # noqa: PLC0415
+    from agent_cli.summarizer import SummarizerConfig, summarize  # noqa: PLC0415
 
-    summarizer = AdaptiveSummarizer(
+    config = SummarizerConfig(
         openai_base_url=openai_base_url,
         model=model,
         api_key=api_key,
     )
-    return await summarizer.summarize(
+    return await summarize(
         content=content,
+        config=config,
         prior_summary=prior_summary,
         content_type=content_type,
     )
diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py
index 9c38f7315..e27eb83fe 100644
--- a/agent_cli/memory/_persistence.py
+++ b/agent_cli/memory/_persistence.py
@@ -210,7 +210,7 @@ def persist_hierarchical_summary(
         collection: ChromaDB collection.
         memory_root: Root path for memory files.
         conversation_id: The conversation this summary belongs to.
-        summary_result: The result from AdaptiveSummarizer.summarize().
+        summary_result: The result from summarize().
 
     Returns:
         List of IDs that were stored.
diff --git a/agent_cli/summarizer/__init__.py b/agent_cli/summarizer/__init__.py
index d017dfd4b..09210146c 100644
--- a/agent_cli/summarizer/__init__.py
+++ b/agent_cli/summarizer/__init__.py
@@ -5,18 +5,24 @@
 compression ratios) architectures.
 
 Example:
-    from agent_cli.summarizer import AdaptiveSummarizer, SummaryLevel
+    from agent_cli.summarizer import summarize, SummarizerConfig, determine_level
 
-    summarizer = AdaptiveSummarizer(
+    config = SummarizerConfig(
         openai_base_url="http://localhost:8000/v1",
         model="gpt-4",
     )
-    result = await summarizer.summarize(long_document)
+    result = await summarize(long_document, config)
     print(f"Level: {result.level}, Compression: {result.compression_ratio:.1%}")
 
 """
 
-from agent_cli.summarizer.adaptive import AdaptiveSummarizer, SummarizationError
+from agent_cli.summarizer.adaptive import (
+    SummarizationError,
+    SummarizerConfig,
+    determine_level,
+    summarize,
+    update_rolling_summary,
+)
 from agent_cli.summarizer.models import (
     HierarchicalSummary,
     SummaryLevel,
@@ -24,9 +30,12 @@
 )
 
 __all__ = [
-    "AdaptiveSummarizer",
     "HierarchicalSummary",
     "SummarizationError",
+    "SummarizerConfig",
     "SummaryLevel",
     "SummaryResult",
+    "determine_level",
+    "summarize",
+    "update_rolling_summary",
 ]
diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py
index e8ff2f9a0..38fa865d0 100644
--- a/agent_cli/summarizer/adaptive.py
+++ b/agent_cli/summarizer/adaptive.py
@@ -11,6 +11,7 @@
 
 import asyncio
 import logging
+from dataclasses import dataclass
 
 import httpx
 from pydantic import BaseModel
@@ -77,508 +78,512 @@ class SummarizationError(Exception):
     """Raised when summarization fails after all retries."""
 
 
-class AdaptiveSummarizer:
-    """Adaptive summarization that scales with input complexity.
-
-    Automatically selects the appropriate summarization strategy based on
-    input length:
-    - NONE (< 100 tokens): No summary needed
-    - BRIEF (100-500 tokens): Single sentence
-    - STANDARD (500-3000 tokens): Paragraph summary
-    - DETAILED (3000-15000 tokens): Chunked + meta-summary
-    - HIERARCHICAL (> 15000 tokens): Multi-level tree of summaries
+@dataclass
+class SummarizerConfig:
+    """Configuration for summarization operations.
 
     Example:
-        summarizer = AdaptiveSummarizer(
+        config = SummarizerConfig(
             openai_base_url="http://localhost:8000/v1",
             model="llama3.1:8b",
         )
-        result = await summarizer.summarize(long_document)
+        result = await summarize(long_document, config)
         print(f"Level: {result.level.name}")
-        print(f"Summary: {result.summary}")
         print(f"Compression: {result.compression_ratio:.1%}")
 
     """
 
-    def __init__(
-        self,
-        openai_base_url: str,
-        model: str,
-        api_key: str | None = None,
-        chunk_size: int = 3000,
-        chunk_overlap: int = 200,
-        max_concurrent_chunks: int = 5,
-        timeout: float = 60.0,
-    ) -> None:
-        """Initialize the adaptive summarizer.
-
-        Args:
-            openai_base_url: Base URL for OpenAI-compatible API.
-            model: Model name to use for summarization.
-            api_key: API key (optional for local models).
-            chunk_size: Target token count per chunk for hierarchical summarization.
-            chunk_overlap: Token overlap between chunks.
-            max_concurrent_chunks: Maximum parallel chunk summarizations.
-            timeout: Request timeout in seconds.
-
-        """
-        self.openai_base_url = openai_base_url.rstrip("/")
-        self.model = model
-        self.api_key = api_key or "not-needed"
-        self.chunk_size = chunk_size
-        self.chunk_overlap = chunk_overlap
-        self.max_concurrent_chunks = max_concurrent_chunks
-        self.timeout = timeout
-
-        self._provider = OpenAIProvider(api_key=self.api_key, base_url=self.openai_base_url)
-
-    def determine_level(self, token_count: int) -> SummaryLevel:
-        """Determine the appropriate summary level based on token count.
-
-        Args:
-            token_count: Number of tokens in the input.
-
-        Returns:
-            The recommended SummaryLevel.
-
-        """
-        if token_count < LEVEL_THRESHOLDS[SummaryLevel.NONE]:
-            return SummaryLevel.NONE
-        if token_count < LEVEL_THRESHOLDS[SummaryLevel.BRIEF]:
-            return SummaryLevel.BRIEF
-        if token_count < LEVEL_THRESHOLDS[SummaryLevel.STANDARD]:
-            return SummaryLevel.STANDARD
-        if token_count < LEVEL_THRESHOLDS[SummaryLevel.DETAILED]:
-            return SummaryLevel.DETAILED
-        return SummaryLevel.HIERARCHICAL
-
-    async def summarize(
-        self,
-        content: str,
-        prior_summary: str | None = None,
-        content_type: str = "general",
-    ) -> SummaryResult:
-        """Summarize content with adaptive strategy based on length.
-
-        Args:
-            content: The content to summarize.
-            prior_summary: Optional prior summary for context continuity.
-            content_type: Type of content ("general", "conversation", "journal", "document").
-
-        Returns:
-            SummaryResult with summary and metadata.
-
-        """
-        if not content or not content.strip():
-            return SummaryResult(
-                level=SummaryLevel.NONE,
-                summary=None,
-                hierarchical=None,
-                input_tokens=0,
-                output_tokens=0,
-                compression_ratio=0.0,
-            )
+    openai_base_url: str
+    model: str
+    api_key: str | None = None
+    chunk_size: int = 3000
+    chunk_overlap: int = 200
+    max_concurrent_chunks: int = 5
+    timeout: float = 60.0
 
-        input_tokens = count_tokens(content, self.model)
-        level = self.determine_level(input_tokens)
+    def __post_init__(self) -> None:
+        """Normalize the base URL."""
+        self.openai_base_url = self.openai_base_url.rstrip("/")
+        if self.api_key is None:
+            self.api_key = "not-needed"
 
-        logger.info(
-            "Summarizing %d tokens at level %s (type=%s)",
-            input_tokens,
-            level.name,
-            content_type,
-        )
 
-        if level == SummaryLevel.NONE:
-            return SummaryResult(
-                level=level,
-                summary=None,
-                hierarchical=None,
-                input_tokens=input_tokens,
-                output_tokens=0,
-                compression_ratio=0.0,
-            )
+def determine_level(token_count: int) -> SummaryLevel:
+    """Determine the appropriate summary level based on token count.
 
-        if level == SummaryLevel.BRIEF:
-            summary = await self._brief_summary(content)
-        elif level == SummaryLevel.STANDARD:
-            summary = await self._standard_summary(content, prior_summary, content_type)
-        elif level == SummaryLevel.DETAILED:
-            return await self._detailed_summary(content, input_tokens)
-        else:  # HIERARCHICAL
-            return await self._hierarchical_summary(content, input_tokens)
+    Args:
+        token_count: Number of tokens in the input.
 
-        output_tokens = count_tokens(summary, self.model) if summary else 0
-        compression_ratio = output_tokens / input_tokens if input_tokens > 0 else 0.0
+    Returns:
+        The recommended SummaryLevel.
 
+    """
+    if token_count < LEVEL_THRESHOLDS[SummaryLevel.NONE]:
+        return SummaryLevel.NONE
+    if token_count < LEVEL_THRESHOLDS[SummaryLevel.BRIEF]:
+        return SummaryLevel.BRIEF
+    if token_count < LEVEL_THRESHOLDS[SummaryLevel.STANDARD]:
+        return SummaryLevel.STANDARD
+    if token_count < LEVEL_THRESHOLDS[SummaryLevel.DETAILED]:
+        return SummaryLevel.DETAILED
+    return SummaryLevel.HIERARCHICAL
+
+
+async def summarize(
+    content: str,
+    config: SummarizerConfig,
+    prior_summary: str | None = None,
+    content_type: str = "general",
+) -> SummaryResult:
+    """Summarize content with adaptive strategy based on length.
+
+    Args:
+        content: The content to summarize.
+        config: Summarizer configuration.
+        prior_summary: Optional prior summary for context continuity.
+        content_type: Type of content ("general", "conversation", "journal", "document").
+
+    Returns:
+        SummaryResult with summary and metadata.
+
+    """
+    if not content or not content.strip():
         return SummaryResult(
-            level=level,
-            summary=summary,
+            level=SummaryLevel.NONE,
+            summary=None,
             hierarchical=None,
-            input_tokens=input_tokens,
-            output_tokens=output_tokens,
-            compression_ratio=compression_ratio,
-        )
-
-    async def update_rolling_summary(
-        self,
-        prior_summary: str | None,
-        new_facts: list[str],
-    ) -> str:
-        """Update a rolling summary with new facts (Mem0-style).
-
-        This is optimized for incremental updates where you have discrete
-        new facts to integrate into an existing summary.
-
-        Args:
-            prior_summary: The existing summary to update.
-            new_facts: List of new facts to integrate.
-
-        Returns:
-            Updated summary string.
-
-        """
-        if not new_facts:
-            return prior_summary or ""
-
-        new_content = "\n".join(f"- {fact}" for fact in new_facts)
-        combined_tokens = count_tokens(
-            (prior_summary or "") + new_content,
-            self.model,
-        )
-
-        target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD)
-        max_words = tokens_to_words(target_tokens)
-
-        prompt = ROLLING_SUMMARY_PROMPT.format(
-            prior_summary=prior_summary or "(No prior summary)",
-            new_content=new_content,
-            max_words=max_words,
+            input_tokens=0,
+            output_tokens=0,
+            compression_ratio=0.0,
         )
 
-        return await self._generate_summary(prompt, max_tokens=target_tokens + 50)
-
-    async def _summarize_single_chunk(
-        self,
-        chunk: str,
-        chunk_index: int,
-        total_chunks: int,
-        *,
-        parent_group: int | None = None,
-    ) -> ChunkSummary:
-        """Summarize a single chunk of content.
-
-        Extracted to avoid duplication between _detailed_summary and
-        _hierarchical_summary methods. Uses middle truncation as a fallback
-        for oversized content (Letta-style).
-
-        Args:
-            chunk: The text chunk to summarize.
-            chunk_index: Index of this chunk (0-based).
-            total_chunks: Total number of chunks being processed.
-            parent_group: Optional L2 group index for hierarchical summaries.
-
-        Returns:
-            ChunkSummary with the summarized content.
-
-        """
-        # Apply middle truncation if chunk is too large (Letta-style fallback)
-        source_tokens = count_tokens(chunk, self.model)
-        content_to_summarize = chunk
-        if len(chunk) > MAX_CHUNK_CHARS:
-            content_to_summarize, dropped = middle_truncate(
-                chunk,
-                MAX_CHUNK_CHARS,
-                head_frac=0.3,
-                tail_frac=0.3,
-            )
-            logger.warning(
-                "Chunk %d truncated: dropped %d chars to fit context window",
-                chunk_index,
-                dropped,
-            )
+    input_tokens = count_tokens(content, config.model)
+    level = determine_level(input_tokens)
 
-        chunk_tokens = count_tokens(content_to_summarize, self.model)
-        target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD)
-        max_words = tokens_to_words(target_tokens)
+    logger.info(
+        "Summarizing %d tokens at level %s (type=%s)",
+        input_tokens,
+        level.name,
+        content_type,
+    )
 
-        prompt = CHUNK_SUMMARY_PROMPT.format(
-            chunk_index=chunk_index + 1,
-            total_chunks=total_chunks,
-            content=content_to_summarize,
-            max_words=max_words,
+    if level == SummaryLevel.NONE:
+        return SummaryResult(
+            level=level,
+            summary=None,
+            hierarchical=None,
+            input_tokens=input_tokens,
+            output_tokens=0,
+            compression_ratio=0.0,
         )
 
-        summary = await self._generate_summary(prompt, max_tokens=target_tokens + 50)
-        summary_tokens = count_tokens(summary, self.model)
+    if level == SummaryLevel.BRIEF:
+        summary = await _brief_summary(content, config)
+    elif level == SummaryLevel.STANDARD:
+        summary = await _standard_summary(content, config, prior_summary, content_type)
+    elif level == SummaryLevel.DETAILED:
+        return await _detailed_summary(content, input_tokens, config)
+    else:  # HIERARCHICAL
+        return await _hierarchical_summary(content, input_tokens, config)
+
+    output_tokens = count_tokens(summary, config.model) if summary else 0
+    compression_ratio = output_tokens / input_tokens if input_tokens > 0 else 0.0
+
+    return SummaryResult(
+        level=level,
+        summary=summary,
+        hierarchical=None,
+        input_tokens=input_tokens,
+        output_tokens=output_tokens,
+        compression_ratio=compression_ratio,
+    )
+
+
+async def update_rolling_summary(
+    prior_summary: str | None,
+    new_facts: list[str],
+    config: SummarizerConfig,
+) -> str:
+    """Update a rolling summary with new facts (Mem0-style).
+
+    This is optimized for incremental updates where you have discrete
+    new facts to integrate into an existing summary.
+
+    Args:
+        prior_summary: The existing summary to update.
+        new_facts: List of new facts to integrate.
+        config: Summarizer configuration.
+
+    Returns:
+        Updated summary string.
 
-        return ChunkSummary(
-            chunk_index=chunk_index,
-            content=summary,
-            token_count=summary_tokens,
-            source_tokens=source_tokens,  # Report original token count
-            parent_group=parent_group,
-        )
-
-    async def _brief_summary(self, content: str) -> str:
-        """Generate a single-sentence summary for brief content."""
-        prompt = BRIEF_SUMMARY_PROMPT.format(content=content)
-        return await self._generate_summary(prompt, max_tokens=50)
-
-    async def _standard_summary(
-        self,
-        content: str,
-        prior_summary: str | None,
-        content_type: str,
-    ) -> str:
-        """Generate a paragraph summary for standard-length content."""
-        input_tokens = count_tokens(content, self.model)
-        target_tokens = estimate_summary_tokens(input_tokens, SummaryLevel.STANDARD)
-        max_words = tokens_to_words(target_tokens)
-
-        prompt_template = get_prompt_for_content_type(content_type)
-        prior_context = format_prior_context(prior_summary)
-
-        prompt = prompt_template.format(
-            content=content,
-            prior_context=prior_context,
-            max_words=max_words,
-        )
+    """
+    if not new_facts:
+        return prior_summary or ""
 
-        return await self._generate_summary(prompt, max_tokens=target_tokens + 50)
+    new_content = "\n".join(f"- {fact}" for fact in new_facts)
+    combined_tokens = count_tokens(
+        (prior_summary or "") + new_content,
+        config.model,
+    )
 
-    async def _detailed_summary(self, content: str, input_tokens: int) -> SummaryResult:
-        """Generate chunked summaries with meta-summary for detailed content."""
-        chunks = chunk_text(
-            content,
-            chunk_size=self.chunk_size,
-            overlap=self.chunk_overlap,
-            model=self.model,
-        )
+    target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD)
+    max_words = tokens_to_words(target_tokens)
 
-        logger.info("Detailed summary: processing %d chunks", len(chunks))
+    prompt = ROLLING_SUMMARY_PROMPT.format(
+        prior_summary=prior_summary or "(No prior summary)",
+        new_content=new_content,
+        max_words=max_words,
+    )
 
-        # Summarize chunks (with concurrency limit)
-        semaphore = asyncio.Semaphore(self.max_concurrent_chunks)
+    return await _generate_summary(prompt, config, max_tokens=target_tokens + 50)
 
-        async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary:
-            async with semaphore:
-                return await self._summarize_single_chunk(
-                    chunk,
-                    idx,
-                    len(chunks),
-                    parent_group=None,
-                )
 
-        chunk_summaries = await asyncio.gather(
-            *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)],
-        )
+async def _summarize_single_chunk(
+    chunk: str,
+    chunk_index: int,
+    total_chunks: int,
+    config: SummarizerConfig,
+    *,
+    parent_group: int | None = None,
+) -> ChunkSummary:
+    """Summarize a single chunk of content.
 
-        # Generate meta-summary
-        all_summaries = [cs.content for cs in chunk_summaries]
-        meta_target = estimate_summary_tokens(input_tokens, SummaryLevel.DETAILED)
-        max_words = tokens_to_words(meta_target)
+    Uses middle truncation as a fallback for oversized content (Letta-style).
 
-        meta_prompt = META_SUMMARY_PROMPT.format(
-            summaries=format_summaries_for_meta(all_summaries),
-            max_words=max_words,
-        )
+    Args:
+        chunk: The text chunk to summarize.
+        chunk_index: Index of this chunk (0-based).
+        total_chunks: Total number of chunks being processed.
+        config: Summarizer configuration.
+        parent_group: Optional L2 group index for hierarchical summaries.
 
-        final_summary = await self._generate_summary(meta_prompt, max_tokens=meta_target + 100)
-        output_tokens = count_tokens(final_summary, self.model)
+    Returns:
+        ChunkSummary with the summarized content.
 
-        hierarchical = HierarchicalSummary(
-            l1_summaries=list(chunk_summaries),
-            l2_summaries=[],  # Not used for DETAILED level
-            l3_summary=final_summary,
-            chunk_size=self.chunk_size,
-            chunk_overlap=self.chunk_overlap,
+    """
+    # Apply middle truncation if chunk is too large (Letta-style fallback)
+    source_tokens = count_tokens(chunk, config.model)
+    content_to_summarize = chunk
+    if len(chunk) > MAX_CHUNK_CHARS:
+        content_to_summarize, dropped = middle_truncate(
+            chunk,
+            MAX_CHUNK_CHARS,
+            head_frac=0.3,
+            tail_frac=0.3,
         )
-
-        return SummaryResult(
-            level=SummaryLevel.DETAILED,
-            summary=final_summary,
-            hierarchical=hierarchical,
-            input_tokens=input_tokens,
-            output_tokens=output_tokens,
-            compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0,
+        logger.warning(
+            "Chunk %d truncated: dropped %d chars to fit context window",
+            chunk_index,
+            dropped,
         )
 
-    async def _hierarchical_summary(self, content: str, input_tokens: int) -> SummaryResult:
-        """Build a tree of summaries for very long content.
-
-        Structure:
-        - L1: Individual chunk summaries
-        - L2: Group summaries (groups of ~5 L1 summaries)
-        - L3: Final synthesis
-        """
-        chunks = chunk_text(
-            content,
-            chunk_size=self.chunk_size,
-            overlap=self.chunk_overlap,
-            model=self.model,
-        )
+    chunk_tokens = count_tokens(content_to_summarize, config.model)
+    target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD)
+    max_words = tokens_to_words(target_tokens)
+
+    prompt = CHUNK_SUMMARY_PROMPT.format(
+        chunk_index=chunk_index + 1,
+        total_chunks=total_chunks,
+        content=content_to_summarize,
+        max_words=max_words,
+    )
+
+    summary = await _generate_summary(prompt, config, max_tokens=target_tokens + 50)
+    summary_tokens = count_tokens(summary, config.model)
+
+    return ChunkSummary(
+        chunk_index=chunk_index,
+        content=summary,
+        token_count=summary_tokens,
+        source_tokens=source_tokens,  # Report original token count
+        parent_group=parent_group,
+    )
+
+
+async def _brief_summary(content: str, config: SummarizerConfig) -> str:
+    """Generate a single-sentence summary for brief content."""
+    prompt = BRIEF_SUMMARY_PROMPT.format(content=content)
+    return await _generate_summary(prompt, config, max_tokens=50)
+
+
+async def _standard_summary(
+    content: str,
+    config: SummarizerConfig,
+    prior_summary: str | None,
+    content_type: str,
+) -> str:
+    """Generate a paragraph summary for standard-length content."""
+    input_tokens = count_tokens(content, config.model)
+    target_tokens = estimate_summary_tokens(input_tokens, SummaryLevel.STANDARD)
+    max_words = tokens_to_words(target_tokens)
+
+    prompt_template = get_prompt_for_content_type(content_type)
+    prior_context = format_prior_context(prior_summary)
+
+    prompt = prompt_template.format(
+        content=content,
+        prior_context=prior_context,
+        max_words=max_words,
+    )
+
+    return await _generate_summary(prompt, config, max_tokens=target_tokens + 50)
+
+
+async def _detailed_summary(
+    content: str,
+    input_tokens: int,
+    config: SummarizerConfig,
+) -> SummaryResult:
+    """Generate chunked summaries with meta-summary for detailed content."""
+    chunks = chunk_text(
+        content,
+        chunk_size=config.chunk_size,
+        overlap=config.chunk_overlap,
+        model=config.model,
+    )
+
+    logger.info("Detailed summary: processing %d chunks", len(chunks))
+
+    # Summarize chunks (with concurrency limit)
+    semaphore = asyncio.Semaphore(config.max_concurrent_chunks)
+
+    async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary:
+        async with semaphore:
+            return await _summarize_single_chunk(
+                chunk,
+                idx,
+                len(chunks),
+                config,
+                parent_group=None,
+            )
 
-        logger.info("Hierarchical summary: processing %d chunks in tree", len(chunks))
+    chunk_summaries = await asyncio.gather(
+        *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)],
+    )
+
+    # Generate meta-summary
+    all_summaries = [cs.content for cs in chunk_summaries]
+    meta_target = estimate_summary_tokens(input_tokens, SummaryLevel.DETAILED)
+    max_words = tokens_to_words(meta_target)
+
+    meta_prompt = META_SUMMARY_PROMPT.format(
+        summaries=format_summaries_for_meta(all_summaries),
+        max_words=max_words,
+    )
+
+    final_summary = await _generate_summary(
+        meta_prompt,
+        config,
+        max_tokens=meta_target + 100,
+    )
+    output_tokens = count_tokens(final_summary, config.model)
+
+    hierarchical = HierarchicalSummary(
+        l1_summaries=list(chunk_summaries),
+        l2_summaries=[],  # Not used for DETAILED level
+        l3_summary=final_summary,
+        chunk_size=config.chunk_size,
+        chunk_overlap=config.chunk_overlap,
+    )
+
+    return SummaryResult(
+        level=SummaryLevel.DETAILED,
+        summary=final_summary,
+        hierarchical=hierarchical,
+        input_tokens=input_tokens,
+        output_tokens=output_tokens,
+        compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0,
+    )
+
+
+async def _hierarchical_summary(
+    content: str,
+    input_tokens: int,
+    config: SummarizerConfig,
+) -> SummaryResult:
+    """Build a tree of summaries for very long content.
+
+    Structure:
+    - L1: Individual chunk summaries
+    - L2: Group summaries (groups of ~5 L1 summaries)
+    - L3: Final synthesis
+    """
+    chunks = chunk_text(
+        content,
+        chunk_size=config.chunk_size,
+        overlap=config.chunk_overlap,
+        model=config.model,
+    )
+
+    logger.info("Hierarchical summary: processing %d chunks in tree", len(chunks))
+
+    # L1: Summarize each chunk
+    semaphore = asyncio.Semaphore(config.max_concurrent_chunks)
+
+    async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary:
+        async with semaphore:
+            # Assign to L2 group (L2_GROUP_SIZE chunks per group)
+            group_idx = idx // L2_GROUP_SIZE
+            return await _summarize_single_chunk(
+                chunk,
+                idx,
+                len(chunks),
+                config,
+                parent_group=group_idx,
+            )
 
-        # L1: Summarize each chunk
-        semaphore = asyncio.Semaphore(self.max_concurrent_chunks)
+    l1_summaries = await asyncio.gather(
+        *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)],
+    )
+
+    # L2: Group summaries (if more than L2_MIN_CHUNKS chunks)
+    l2_summaries: list[str] = []
+    if len(l1_summaries) > L2_MIN_CHUNKS:
+        groups: list[list[str]] = []
+        for i in range(0, len(l1_summaries), L2_GROUP_SIZE):
+            group = [cs.content for cs in l1_summaries[i : i + L2_GROUP_SIZE]]
+            groups.append(group)
+
+        async def summarize_group(group: list[str]) -> str:
+            combined_tokens = sum(count_tokens(s, config.model) for s in group)
+            target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD)
+            max_words = tokens_to_words(target_tokens)
+
+            prompt = META_SUMMARY_PROMPT.format(
+                summaries=format_summaries_for_meta(group),
+                max_words=max_words,
+            )
+            return await _generate_summary(prompt, config, max_tokens=target_tokens + 50)
+
+        l2_summaries = await asyncio.gather(*[summarize_group(g) for g in groups])
+
+    # L3: Final synthesis
+    summaries_to_synthesize = l2_summaries if l2_summaries else [cs.content for cs in l1_summaries]
+    final_target = estimate_summary_tokens(input_tokens, SummaryLevel.HIERARCHICAL)
+    max_words = tokens_to_words(final_target)
+
+    final_prompt = META_SUMMARY_PROMPT.format(
+        summaries=format_summaries_for_meta(summaries_to_synthesize),
+        max_words=max_words,
+    )
+
+    final_summary = await _generate_summary(
+        final_prompt,
+        config,
+        max_tokens=final_target + 100,
+    )
+    output_tokens = count_tokens(final_summary, config.model)
+
+    hierarchical = HierarchicalSummary(
+        l1_summaries=list(l1_summaries),
+        l2_summaries=list(l2_summaries),
+        l3_summary=final_summary,
+        chunk_size=config.chunk_size,
+        chunk_overlap=config.chunk_overlap,
+    )
+
+    return SummaryResult(
+        level=SummaryLevel.HIERARCHICAL,
+        summary=final_summary,
+        hierarchical=hierarchical,
+        input_tokens=input_tokens,
+        output_tokens=output_tokens,
+        compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0,
+    )
+
+
+async def _generate_summary(
+    prompt: str,
+    config: SummarizerConfig,
+    max_tokens: int = 256,
+    *,
+    attempt: int = 0,
+) -> str:
+    """Generate a summary using the LLM.
+
+    Uses PydanticAI for structured output with fallback to raw generation.
+    Implements exponential backoff retry on failures.
+
+    Args:
+        prompt: The prompt to send to the LLM.
+        config: Summarizer configuration.
+        max_tokens: Maximum tokens for the response.
+        attempt: Current retry attempt (for internal recursion).
+
+    Returns:
+        The generated summary text.
+
+    Raises:
+        SummarizationError: If all retries are exhausted.
 
-        async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary:
-            async with semaphore:
-                # Assign to L2 group (L2_GROUP_SIZE chunks per group)
-                group_idx = idx // L2_GROUP_SIZE
-                return await self._summarize_single_chunk(
-                    chunk,
-                    idx,
-                    len(chunks),
-                    parent_group=group_idx,
+    """
+    provider = OpenAIProvider(api_key=config.api_key, base_url=config.openai_base_url)
+    model = OpenAIChatModel(
+        model_name=config.model,
+        provider=provider,
+        settings=ModelSettings(
+            temperature=0.3,
+            max_tokens=max_tokens,
+        ),
+    )
+
+    agent = Agent(
+        model=model,
+        system_prompt="You are a concise summarizer. Output only the summary, no preamble.",
+        output_type=SummaryOutput,
+        retries=2,
+    )
+
+    try:
+        result = await agent.run(prompt)
+        return result.output.summary.strip()
+    except Exception as e:
+        logger.warning("Structured summary failed, trying raw generation: %s", e)
+        # Fallback to raw HTTP call
+        try:
+            return await _raw_generate(prompt, config, max_tokens)
+        except Exception as raw_err:
+            if attempt < MAX_SUMMARIZE_RETRIES:
+                wait_time = 2**attempt  # Exponential backoff: 1, 2, 4 seconds
+                logger.warning(
+                    "Raw generation failed (attempt %d/%d), retrying in %ds: %s",
+                    attempt + 1,
+                    MAX_SUMMARIZE_RETRIES,
+                    wait_time,
+                    raw_err,
                 )
-
-        l1_summaries = await asyncio.gather(
-            *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)],
-        )
-
-        # L2: Group summaries (if more than L2_MIN_CHUNKS chunks)
-        l2_summaries: list[str] = []
-        if len(l1_summaries) > L2_MIN_CHUNKS:
-            groups: list[list[str]] = []
-            for i in range(0, len(l1_summaries), L2_GROUP_SIZE):
-                group = [cs.content for cs in l1_summaries[i : i + L2_GROUP_SIZE]]
-                groups.append(group)
-
-            async def summarize_group(group: list[str]) -> str:
-                combined_tokens = sum(count_tokens(s, self.model) for s in group)
-                target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD)
-                max_words = tokens_to_words(target_tokens)
-
-                prompt = META_SUMMARY_PROMPT.format(
-                    summaries=format_summaries_for_meta(group),
-                    max_words=max_words,
+                await asyncio.sleep(wait_time)
+                return await _generate_summary(
+                    prompt,
+                    config,
+                    max_tokens,
+                    attempt=attempt + 1,
                 )
-                return await self._generate_summary(prompt, max_tokens=target_tokens + 50)
-
-            l2_summaries = await asyncio.gather(*[summarize_group(g) for g in groups])
-
-        # L3: Final synthesis
-        summaries_to_synthesize = (
-            l2_summaries if l2_summaries else [cs.content for cs in l1_summaries]
-        )
-        final_target = estimate_summary_tokens(input_tokens, SummaryLevel.HIERARCHICAL)
-        max_words = tokens_to_words(final_target)
-
-        final_prompt = META_SUMMARY_PROMPT.format(
-            summaries=format_summaries_for_meta(summaries_to_synthesize),
-            max_words=max_words,
+            msg = f"Summarization failed after {MAX_SUMMARIZE_RETRIES} retries"
+            raise SummarizationError(msg) from raw_err
+
+
+async def _raw_generate(prompt: str, config: SummarizerConfig, max_tokens: int) -> str:
+    """Fallback raw HTTP generation without structured output."""
+    async with httpx.AsyncClient(timeout=config.timeout) as client:
+        response = await client.post(
+            f"{config.openai_base_url}/chat/completions",
+            headers={"Authorization": f"Bearer {config.api_key}"},
+            json={
+                "model": config.model,
+                "messages": [
+                    {"role": "system", "content": "You are a concise summarizer."},
+                    {"role": "user", "content": prompt},
+                ],
+                "temperature": 0.3,
+                "max_tokens": max_tokens,
+            },
         )
+        response.raise_for_status()
+        data = response.json()
 
-        final_summary = await self._generate_summary(final_prompt, max_tokens=final_target + 100)
-        output_tokens = count_tokens(final_summary, self.model)
-
-        hierarchical = HierarchicalSummary(
-            l1_summaries=list(l1_summaries),
-            l2_summaries=list(l2_summaries),
-            l3_summary=final_summary,
-            chunk_size=self.chunk_size,
-            chunk_overlap=self.chunk_overlap,
-        )
-
-        return SummaryResult(
-            level=SummaryLevel.HIERARCHICAL,
-            summary=final_summary,
-            hierarchical=hierarchical,
-            input_tokens=input_tokens,
-            output_tokens=output_tokens,
-            compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0,
-        )
-
-    async def _generate_summary(
-        self,
-        prompt: str,
-        max_tokens: int = 256,
-        *,
-        attempt: int = 0,
-    ) -> str:
-        """Generate a summary using the LLM.
-
-        Uses PydanticAI for structured output with fallback to raw generation.
-        Implements exponential backoff retry on failures.
-
-        Args:
-            prompt: The prompt to send to the LLM.
-            max_tokens: Maximum tokens for the response.
-            attempt: Current retry attempt (for internal recursion).
-
-        Returns:
-            The generated summary text.
-
-        Raises:
-            SummarizationError: If all retries are exhausted.
-
-        """
-        model = OpenAIChatModel(
-            model_name=self.model,
-            provider=self._provider,
-            settings=ModelSettings(
-                temperature=0.3,
-                max_tokens=max_tokens,
-            ),
-        )
-
-        agent = Agent(
-            model=model,
-            system_prompt="You are a concise summarizer. Output only the summary, no preamble.",
-            output_type=SummaryOutput,
-            retries=2,
-        )
-
-        try:
-            result = await agent.run(prompt)
-            return result.output.summary.strip()
-        except Exception as e:
-            logger.warning("Structured summary failed, trying raw generation: %s", e)
-            # Fallback to raw HTTP call
-            try:
-                return await self._raw_generate(prompt, max_tokens)
-            except Exception as raw_err:
-                if attempt < MAX_SUMMARIZE_RETRIES:
-                    wait_time = 2**attempt  # Exponential backoff: 1, 2, 4 seconds
-                    logger.warning(
-                        "Raw generation failed (attempt %d/%d), retrying in %ds: %s",
-                        attempt + 1,
-                        MAX_SUMMARIZE_RETRIES,
-                        wait_time,
-                        raw_err,
-                    )
-                    await asyncio.sleep(wait_time)
-                    return await self._generate_summary(
-                        prompt,
-                        max_tokens,
-                        attempt=attempt + 1,
-                    )
-                msg = f"Summarization failed after {MAX_SUMMARIZE_RETRIES} retries"
-                raise SummarizationError(msg) from raw_err
-
-    async def _raw_generate(self, prompt: str, max_tokens: int) -> str:
-        """Fallback raw HTTP generation without structured output."""
-        async with httpx.AsyncClient(timeout=self.timeout) as client:
-            response = await client.post(
-                f"{self.openai_base_url}/chat/completions",
-                headers={"Authorization": f"Bearer {self.api_key}"},
-                json={
-                    "model": self.model,
-                    "messages": [
-                        {"role": "system", "content": "You are a concise summarizer."},
-                        {"role": "user", "content": prompt},
-                    ],
-                    "temperature": 0.3,
-                    "max_tokens": max_tokens,
-                },
-            )
-            response.raise_for_status()
-            data = response.json()
-
-        choices = data.get("choices", [])
-        if choices:
-            return choices[0].get("message", {}).get("content", "").strip()
-        return ""
+    choices = data.get("choices", [])
+    if choices:
+        return choices[0].get("message", {}).get("content", "").strip()
+    return ""
diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py
index 1f010999e..f5db1486c 100644
--- a/tests/summarizer/test_adaptive.py
+++ b/tests/summarizer/test_adaptive.py
@@ -1,4 +1,4 @@
-"""Unit tests for AdaptiveSummarizer."""
+"""Unit tests for adaptive summarization functions."""
 
 from __future__ import annotations
 
@@ -8,37 +8,42 @@
 
 from agent_cli.summarizer.adaptive import (
     LEVEL_THRESHOLDS,
-    AdaptiveSummarizer,
+    SummarizerConfig,
     SummaryOutput,
+    _generate_summary,
+    _raw_generate,
+    determine_level,
+    summarize,
+    update_rolling_summary,
 )
 from agent_cli.summarizer.models import SummaryLevel, SummaryResult
 
 
-class TestAdaptiveSummarizerInit:
-    """Tests for AdaptiveSummarizer initialization."""
+class TestSummarizerConfig:
+    """Tests for SummarizerConfig initialization."""
 
     def test_basic_init(self) -> None:
         """Test basic initialization with required parameters."""
-        summarizer = AdaptiveSummarizer(
+        config = SummarizerConfig(
             openai_base_url="http://localhost:8000/v1",
             model="llama3.1:8b",
         )
-        assert summarizer.openai_base_url == "http://localhost:8000/v1"
-        assert summarizer.model == "llama3.1:8b"
-        assert summarizer.api_key == "not-needed"
+        assert config.openai_base_url == "http://localhost:8000/v1"
+        assert config.model == "llama3.1:8b"
+        assert config.api_key == "not-needed"
 
     def test_init_with_api_key(self) -> None:
         """Test initialization with custom API key."""
-        summarizer = AdaptiveSummarizer(
+        config = SummarizerConfig(
             openai_base_url="http://localhost:8000/v1",
             model="gpt-4",
             api_key="sk-test-key",
         )
-        assert summarizer.api_key == "sk-test-key"
+        assert config.api_key == "sk-test-key"
 
     def test_init_with_custom_settings(self) -> None:
         """Test initialization with custom chunk settings."""
-        summarizer = AdaptiveSummarizer(
+        config = SummarizerConfig(
             openai_base_url="http://localhost:8000/v1",
             model="gpt-4",
             chunk_size=5000,
@@ -46,59 +51,51 @@ def test_init_with_custom_settings(self) -> None:
             max_concurrent_chunks=10,
             timeout=120.0,
         )
-        assert summarizer.chunk_size == 5000
-        assert summarizer.chunk_overlap == 300
-        assert summarizer.max_concurrent_chunks == 10
-        assert summarizer.timeout == 120.0
+        assert config.chunk_size == 5000
+        assert config.chunk_overlap == 300
+        assert config.max_concurrent_chunks == 10
+        assert config.timeout == 120.0
 
     def test_trailing_slash_stripped(self) -> None:
         """Test that trailing slash is stripped from base URL."""
-        summarizer = AdaptiveSummarizer(
+        config = SummarizerConfig(
             openai_base_url="http://localhost:8000/v1/",
             model="gpt-4",
         )
-        assert summarizer.openai_base_url == "http://localhost:8000/v1"
+        assert config.openai_base_url == "http://localhost:8000/v1"
 
 
 class TestDetermineLevel:
     """Tests for level determination based on token count."""
 
-    @pytest.fixture
-    def summarizer(self) -> AdaptiveSummarizer:
-        """Create a summarizer instance."""
-        return AdaptiveSummarizer(
-            openai_base_url="http://localhost:8000/v1",
-            model="gpt-4",
-        )
-
-    def test_none_level_threshold(self, summarizer: AdaptiveSummarizer) -> None:
+    def test_none_level_threshold(self) -> None:
         """Test NONE level for very short content."""
-        assert summarizer.determine_level(50) == SummaryLevel.NONE
-        assert summarizer.determine_level(99) == SummaryLevel.NONE
+        assert determine_level(50) == SummaryLevel.NONE
+        assert determine_level(99) == SummaryLevel.NONE
 
-    def test_brief_level_threshold(self, summarizer: AdaptiveSummarizer) -> None:
+    def test_brief_level_threshold(self) -> None:
         """Test BRIEF level for short content."""
-        assert summarizer.determine_level(100) == SummaryLevel.BRIEF
-        assert summarizer.determine_level(300) == SummaryLevel.BRIEF
-        assert summarizer.determine_level(499) == SummaryLevel.BRIEF
+        assert determine_level(100) == SummaryLevel.BRIEF
+        assert determine_level(300) == SummaryLevel.BRIEF
+        assert determine_level(499) == SummaryLevel.BRIEF
 
-    def test_standard_level_threshold(self, summarizer: AdaptiveSummarizer) -> None:
+    def test_standard_level_threshold(self) -> None:
         """Test STANDARD level for medium content."""
-        assert summarizer.determine_level(500) == SummaryLevel.STANDARD
-        assert summarizer.determine_level(1500) == SummaryLevel.STANDARD
-        assert summarizer.determine_level(2999) == SummaryLevel.STANDARD
+        assert determine_level(500) == SummaryLevel.STANDARD
+        assert determine_level(1500) == SummaryLevel.STANDARD
+        assert determine_level(2999) == SummaryLevel.STANDARD
 
-    def test_detailed_level_threshold(self, summarizer: AdaptiveSummarizer) -> None:
+    def test_detailed_level_threshold(self) -> None:
         """Test DETAILED level for longer content."""
-        assert summarizer.determine_level(3000) == SummaryLevel.DETAILED
-        assert summarizer.determine_level(8000) == SummaryLevel.DETAILED
-        assert summarizer.determine_level(14999) == SummaryLevel.DETAILED
+        assert determine_level(3000) == SummaryLevel.DETAILED
+        assert determine_level(8000) == SummaryLevel.DETAILED
+        assert determine_level(14999) == SummaryLevel.DETAILED
 
-    def test_hierarchical_level_threshold(self, summarizer: AdaptiveSummarizer) -> None:
+    def test_hierarchical_level_threshold(self) -> None:
         """Test HIERARCHICAL level for very long content."""
-        assert summarizer.determine_level(15000) == SummaryLevel.HIERARCHICAL
-        assert summarizer.determine_level(50000) == SummaryLevel.HIERARCHICAL
-        assert summarizer.determine_level(100000) == SummaryLevel.HIERARCHICAL
+        assert determine_level(15000) == SummaryLevel.HIERARCHICAL
+        assert determine_level(50000) == SummaryLevel.HIERARCHICAL
+        assert determine_level(100000) == SummaryLevel.HIERARCHICAL
 
     def test_thresholds_match_constants(self) -> None:
         """Verify thresholds match the module constants."""
@@ -109,46 +106,55 @@ def test_thresholds_match_constants(self) -> None:
 
 
 class TestSummarize:
-    """Tests for main summarize method."""
+    """Tests for main summarize function."""
 
     @pytest.fixture
-    def summarizer(self) -> AdaptiveSummarizer:
-        """Create a summarizer instance."""
-        return AdaptiveSummarizer(
+    def config(self) -> SummarizerConfig:
+        """Create a config instance."""
+        return SummarizerConfig(
             openai_base_url="http://localhost:8000/v1",
             model="gpt-4",
         )
 
     @pytest.mark.asyncio
-    async def test_empty_content_returns_none_level(self, summarizer: AdaptiveSummarizer) -> None:
+    async def test_empty_content_returns_none_level(
+        self,
+        config: SummarizerConfig,
+    ) -> None:
         """Test that empty content returns NONE level result."""
-        result = await summarizer.summarize("")
+        result = await summarize("", config)
         assert result.level == SummaryLevel.NONE
         assert result.summary is None
         assert result.input_tokens == 0
         assert result.output_tokens == 0
 
     @pytest.mark.asyncio
-    async def test_whitespace_only_returns_none_level(self, summarizer: AdaptiveSummarizer) -> None:
+    async def test_whitespace_only_returns_none_level(
+        self,
+        config: SummarizerConfig,
+    ) -> None:
         """Test that whitespace-only content returns NONE level result."""
-        result = await summarizer.summarize("   \n\n   ")
+        result = await summarize("   \n\n   ", config)
         assert result.level == SummaryLevel.NONE
         assert result.summary is None
 
     @pytest.mark.asyncio
-    async def test_very_short_content_no_summary(self, summarizer: AdaptiveSummarizer) -> None:
+    async def test_very_short_content_no_summary(
+        self,
+        config: SummarizerConfig,
+    ) -> None:
         """Test that very short content gets NONE level (no summary)."""
         # Less than 100 tokens
-        result = await summarizer.summarize("Hello world")
+        result = await summarize("Hello world", config)
         assert result.level == SummaryLevel.NONE
         assert result.summary is None
 
     @pytest.mark.asyncio
-    @patch.object(AdaptiveSummarizer, "_brief_summary")
+    @patch("agent_cli.summarizer.adaptive._brief_summary")
     async def test_brief_level_calls_brief_summary(
         self,
         mock_brief: AsyncMock,
-        summarizer: AdaptiveSummarizer,
+        config: SummarizerConfig,
     ) -> None:
         """Test that BRIEF level content calls _brief_summary."""
         mock_brief.return_value = "Brief summary."
@@ -156,18 +162,18 @@ async def test_brief_level_calls_brief_summary(
         # Create content that's ~100-500 tokens
         content = "This is a test sentence. " * 30  # ~150 tokens
 
-        result = await summarizer.summarize(content)
+        result = await summarize(content, config)
 
-        mock_brief.assert_called_once_with(content)
+        mock_brief.assert_called_once_with(content, config)
         assert result.level == SummaryLevel.BRIEF
         assert result.summary == "Brief summary."
 
     @pytest.mark.asyncio
-    @patch.object(AdaptiveSummarizer, "_standard_summary")
+    @patch("agent_cli.summarizer.adaptive._standard_summary")
     async def test_standard_level_calls_standard_summary(
         self,
         mock_standard: AsyncMock,
-        summarizer: AdaptiveSummarizer,
+        config: SummarizerConfig,
     ) -> None:
         """Test that STANDARD level content calls _standard_summary."""
         mock_standard.return_value = "Standard summary paragraph."
@@ -175,18 +181,18 @@ async def test_standard_level_calls_standard_summary(
         # Create content that's ~500-3000 tokens
         content = "This is a test sentence with more words. " * 100  # ~800 tokens
 
-        result = await summarizer.summarize(content, content_type="general")
+        result = await summarize(content, config, content_type="general")
 
-        mock_standard.assert_called_once_with(content, None, "general")
+        mock_standard.assert_called_once_with(content, config, None, "general")
         assert result.level == SummaryLevel.STANDARD
         assert result.summary == "Standard summary paragraph."
 
     @pytest.mark.asyncio
-    @patch.object(AdaptiveSummarizer, "_standard_summary")
+    @patch("agent_cli.summarizer.adaptive._standard_summary")
     async def test_prior_summary_passed_to_standard(
         self,
         mock_standard: AsyncMock,
-        summarizer: AdaptiveSummarizer,
+        config: SummarizerConfig,
     ) -> None:
         """Test that prior_summary is passed to _standard_summary."""
         mock_standard.return_value = "Updated summary."
@@ -194,16 +200,16 @@ async def test_prior_summary_passed_to_standard(
         content = "This is a test sentence with more words. " * 100
         prior = "Previous context summary."
 
-        await summarizer.summarize(content, prior_summary=prior)
+        await summarize(content, config, prior_summary=prior)
 
-        mock_standard.assert_called_once_with(content, prior, "general")
+        mock_standard.assert_called_once_with(content, config, prior, "general")
 
     @pytest.mark.asyncio
-    @patch.object(AdaptiveSummarizer, "_detailed_summary")
+    @patch("agent_cli.summarizer.adaptive._detailed_summary")
     async def test_detailed_level_calls_detailed_summary(
         self,
         mock_detailed: AsyncMock,
-        summarizer: AdaptiveSummarizer,
+        config: SummarizerConfig,
     ) -> None:
         """Test that DETAILED level content calls _detailed_summary."""
         mock_result = SummaryResult(
@@ -219,17 +225,17 @@ async def test_detailed_level_calls_detailed_summary(
         # Create content that's ~3000-15000 tokens
         content = "Word " * 5000  # ~5000 tokens
 
-        result = await summarizer.summarize(content)
+        result = await summarize(content, config)
 
         assert mock_detailed.called
         assert result.level == SummaryLevel.DETAILED
 
     @pytest.mark.asyncio
-    @patch.object(AdaptiveSummarizer, "_hierarchical_summary")
+    @patch("agent_cli.summarizer.adaptive._hierarchical_summary")
     async def test_hierarchical_level_calls_hierarchical_summary(
         self,
         mock_hierarchical: AsyncMock,
-        summarizer: AdaptiveSummarizer,
+        config: SummarizerConfig,
     ) -> None:
         """Test that HIERARCHICAL level content calls _hierarchical_summary."""
         mock_result = SummaryResult(
@@ -245,7 +251,7 @@ async def test_hierarchical_level_calls_hierarchical_summary(
         # Create content that's > 15000 tokens
         content = "Word " * 20000
 
-        result = await summarizer.summarize(content)
+        result = await summarize(content, config)
 
         assert mock_hierarchical.called
         assert result.level == SummaryLevel.HIERARCHICAL
@@ -255,62 +261,69 @@ class TestUpdateRollingSummary:
     """Tests for rolling summary updates."""
 
     @pytest.fixture
-    def summarizer(self) -> AdaptiveSummarizer:
-        """Create a summarizer instance."""
-        return AdaptiveSummarizer(
+    def config(self) -> SummarizerConfig:
+        """Create a config instance."""
+        return SummarizerConfig(
             openai_base_url="http://localhost:8000/v1",
             model="gpt-4",
         )
 
     @pytest.mark.asyncio
-    async def test_empty_facts_returns_prior(self, summarizer: AdaptiveSummarizer) -> None:
+    async def test_empty_facts_returns_prior(self, config: SummarizerConfig) -> None:
         """Test that empty facts list returns prior summary."""
-        result = await summarizer.update_rolling_summary(
+        result = await update_rolling_summary(
             prior_summary="Existing summary",
             new_facts=[],
+            config=config,
         )
         assert result == "Existing summary"
 
     @pytest.mark.asyncio
-    async def test_empty_facts_no_prior_returns_empty(self, summarizer: AdaptiveSummarizer) -> None:
+    async def test_empty_facts_no_prior_returns_empty(
+        self,
+        config: SummarizerConfig,
+    ) -> None:
         """Test that empty facts with no prior returns empty string."""
-        result = await summarizer.update_rolling_summary(
+        result = await update_rolling_summary(
             prior_summary=None,
             new_facts=[],
+            config=config,
         )
         assert result == ""
 
     @pytest.mark.asyncio
-    @patch.object(AdaptiveSummarizer, "_generate_summary")
+    @patch("agent_cli.summarizer.adaptive._generate_summary")
     async def test_new_facts_calls_generate(
         self,
         mock_generate: AsyncMock,
-        summarizer: AdaptiveSummarizer,
+        config: SummarizerConfig,
     ) -> None:
         """Test that new facts trigger summary generation."""
         mock_generate.return_value = "Updated summary with new facts."
 
-        result = await summarizer.update_rolling_summary(
+        result = await update_rolling_summary(
             prior_summary="Old summary",
             new_facts=["User likes coffee", "User lives in Amsterdam"],
+            config=config,
         )
 
         mock_generate.assert_called_once()
         assert result == "Updated summary with new facts."
 
     @pytest.mark.asyncio
-    @patch.object(AdaptiveSummarizer, "_generate_summary")
+    @patch("agent_cli.summarizer.adaptive._generate_summary")
     async def test_facts_formatted_as_list(
         self,
         mock_generate: AsyncMock,
-        summarizer: AdaptiveSummarizer,
+        config: SummarizerConfig,
     ) -> None:
         """Test that facts are formatted as bullet list in prompt."""
         mock_generate.return_value = "Summary"
 
-        await summarizer.update_rolling_summary(
+        await update_rolling_summary(
             prior_summary="Prior",
             new_facts=["Fact one", "Fact two"],
+            config=config,
         )
 
         # Check the prompt contains formatted facts
@@ -321,12 +334,12 @@ async def test_facts_formatted_as_list(
 
 
 class TestGenerateSummary:
-    """Tests for _generate_summary method."""
+    """Tests for _generate_summary function."""
 
     @pytest.fixture
-    def summarizer(self) -> AdaptiveSummarizer:
-        """Create a summarizer instance."""
-        return AdaptiveSummarizer(
+    def config(self) -> SummarizerConfig:
+        """Create a config instance."""
+        return SummarizerConfig(
             openai_base_url="http://localhost:8000/v1",
             model="gpt-4",
         )
@@ -334,7 +347,7 @@ def summarizer(self) -> AdaptiveSummarizer:
     @pytest.mark.asyncio
     async def test_generate_summary_with_pydantic_ai(
         self,
-        summarizer: AdaptiveSummarizer,
+        config: SummarizerConfig,
     ) -> None:
         """Test summary generation using PydanticAI agent."""
         # Mock the entire agent creation and run
@@ -346,17 +359,17 @@ async def test_generate_summary_with_pydantic_ai(
             mock_agent.run = AsyncMock(return_value=mock_result)
             mock_agent_class.return_value = mock_agent
 
-            result = await summarizer._generate_summary("Test prompt", max_tokens=100)
+            result = await _generate_summary("Test prompt", config, max_tokens=100)
 
             assert result == "Generated summary."
             mock_agent.run.assert_called_once_with("Test prompt")
 
     @pytest.mark.asyncio
-    @patch.object(AdaptiveSummarizer, "_raw_generate")
+    @patch("agent_cli.summarizer.adaptive._raw_generate")
     async def test_fallback_to_raw_generate_on_error(
         self,
         mock_raw: AsyncMock,
-        summarizer: AdaptiveSummarizer,
+        config: SummarizerConfig,
     ) -> None:
         """Test fallback to raw HTTP on PydanticAI error."""
         mock_raw.return_value = "Fallback summary"
@@ -366,25 +379,25 @@ async def test_fallback_to_raw_generate_on_error(
             mock_agent.run = AsyncMock(side_effect=Exception("API error"))
             mock_agent_class.return_value = mock_agent
 
-            result = await summarizer._generate_summary("Test prompt", max_tokens=100)
+            result = await _generate_summary("Test prompt", config, max_tokens=100)
 
-            mock_raw.assert_called_once_with("Test prompt", 100)
+            mock_raw.assert_called_once_with("Test prompt", config, 100)
             assert result == "Fallback summary"
 
 
 class TestRawGenerate:
-    """Tests for _raw_generate fallback method."""
+    """Tests for _raw_generate fallback function."""
 
     @pytest.fixture
-    def summarizer(self) -> AdaptiveSummarizer:
-        """Create a summarizer instance."""
-        return AdaptiveSummarizer(
+    def config(self) -> SummarizerConfig:
+        """Create a config instance."""
+        return SummarizerConfig(
             openai_base_url="http://localhost:8000/v1",
             model="gpt-4",
         )
 
     @pytest.mark.asyncio
-    async def test_raw_generate_success(self, summarizer: AdaptiveSummarizer) -> None:
+    async def test_raw_generate_success(self, config: SummarizerConfig) -> None:
         """Test successful raw HTTP generation."""
         mock_response = MagicMock()
         mock_response.json.return_value = {
@@ -398,12 +411,12 @@ async def test_raw_generate_success(self, summarizer: AdaptiveSummarizer) -> Non
             mock_client.__aexit__ = AsyncMock(return_value=None)
             mock_client_class.return_value = mock_client
 
-            result = await summarizer._raw_generate("Test prompt", max_tokens=100)
+            result = await _raw_generate("Test prompt", config, max_tokens=100)
 
             assert result == "Raw generated summary"
 
     @pytest.mark.asyncio
-    async def test_raw_generate_empty_choices(self, summarizer: AdaptiveSummarizer) -> None:
+    async def test_raw_generate_empty_choices(self, config: SummarizerConfig) -> None:
         """Test raw generate with empty choices returns empty string."""
         mock_response = MagicMock()
         mock_response.json.return_value = {"choices": []}
@@ -415,7 +428,7 @@ async def test_raw_generate_empty_choices(self, summarizer: AdaptiveSummarizer)
             mock_client.__aexit__ = AsyncMock(return_value=None)
             mock_client_class.return_value = mock_client
 
-            result = await summarizer._raw_generate("Test prompt", max_tokens=100)
+            result = await _raw_generate("Test prompt", config, max_tokens=100)
 
             assert result == ""
 
diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py
index 381f9f5b6..e58a20f6c 100644
--- a/tests/summarizer/test_integration.py
+++ b/tests/summarizer/test_integration.py
@@ -14,7 +14,7 @@
     get_summary_at_level,
     upsert_hierarchical_summary,
 )
-from agent_cli.summarizer import AdaptiveSummarizer, SummaryLevel, SummaryResult
+from agent_cli.summarizer import SummaryLevel, SummaryResult, determine_level
 from agent_cli.summarizer.models import ChunkSummary, HierarchicalSummary
 
 if TYPE_CHECKING:
@@ -401,40 +401,32 @@ def test_persist_deletes_old_summaries(
         assert deleted_dir.exists()
 
 
-class TestAdaptiveSummarizerLevelDetermination:
-    """Test that AdaptiveSummarizer correctly determines summary levels."""
+class TestDetermineLevelFunction:
+    """Test that determine_level correctly determines summary levels."""
 
-    @pytest.fixture
-    def summarizer(self) -> AdaptiveSummarizer:
-        """Create an AdaptiveSummarizer instance."""
-        return AdaptiveSummarizer(
-            openai_base_url="http://localhost:8000/v1",
-            model="test-model",
-        )
-
-    def test_very_short_content_is_none(self, summarizer: AdaptiveSummarizer) -> None:
+    def test_very_short_content_is_none(self) -> None:
         """Test that content under 100 tokens gets NONE level."""
-        level = summarizer.determine_level(50)
+        level = determine_level(50)
         assert level == SummaryLevel.NONE
 
-    def test_short_content_is_brief(self, summarizer: AdaptiveSummarizer) -> None:
+    def test_short_content_is_brief(self) -> None:
         """Test that 100-500 token content gets BRIEF level."""
-        level = summarizer.determine_level(300)
+        level = determine_level(300)
         assert level == SummaryLevel.BRIEF
 
-    def test_medium_content_is_standard(self, summarizer: AdaptiveSummarizer) -> None:
+    def test_medium_content_is_standard(self) -> None:
         """Test that 500-3000 token content gets STANDARD level."""
-        level = summarizer.determine_level(1500)
+        level = determine_level(1500)
         assert level == SummaryLevel.STANDARD
 
-    def test_long_content_is_detailed(self, summarizer: AdaptiveSummarizer) -> None:
+    def test_long_content_is_detailed(self) -> None:
         """Test that 3000-15000 token content gets DETAILED level."""
-        level = summarizer.determine_level(8000)
+        level = determine_level(8000)
         assert level == SummaryLevel.DETAILED
 
-    def test_very_long_content_is_hierarchical(self, summarizer: AdaptiveSummarizer) -> None:
+    def test_very_long_content_is_hierarchical(self) -> None:
         """Test that content over 15000 tokens gets HIERARCHICAL level."""
-        level = summarizer.determine_level(25000)
+        level = determine_level(25000)
         assert level == SummaryLevel.HIERARCHICAL
 
 
@@ -444,7 +436,8 @@ class TestSummarizeContentFunction:
     @pytest.mark.asyncio
     async def test_summarize_content_creates_result(self) -> None:
         """Test that summarize_content returns a valid SummaryResult."""
-        with patch.object(AdaptiveSummarizer, "summarize") as mock_summarize:
+        # Patch at source since _ingest imports inside the function
+        with patch("agent_cli.summarizer.summarize") as mock_summarize:
             mock_result = SummaryResult(
                 level=SummaryLevel.STANDARD,
                 summary="Mocked summary.",

From 44cfdda19e7948e3afc0516b6e498bc4ca31c771 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 26 Nov 2025 20:21:59 -0800
Subject: [PATCH 09/37] refactor(summarizer): make internal modules private and
 simplify public API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Rename prompts.py → _prompts.py and utils.py → _utils.py
- Reduce public API to 6 essential exports: SummarizerConfig, summarize,
  SummaryResult, SummaryLevel, HierarchicalSummary, SummarizationError
- Remove determine_level, update_rolling_summary, count_tokens from public API
- Update imports in adaptive.py and test files
---
 agent_cli/summarizer/__init__.py              | 20 ++++---------------
 .../summarizer/{prompts.py => _prompts.py}    |  0
 agent_cli/summarizer/{utils.py => _utils.py}  |  0
 agent_cli/summarizer/adaptive.py              | 16 +++++++--------
 tests/summarizer/test_integration.py          |  3 ++-
 tests/summarizer/test_prompts.py              |  2 +-
 tests/summarizer/test_utils.py                |  2 +-
 7 files changed, 16 insertions(+), 27 deletions(-)
 rename agent_cli/summarizer/{prompts.py => _prompts.py} (100%)
 rename agent_cli/summarizer/{utils.py => _utils.py} (100%)

diff --git a/agent_cli/summarizer/__init__.py b/agent_cli/summarizer/__init__.py
index 09210146c..fc0994c4c 100644
--- a/agent_cli/summarizer/__init__.py
+++ b/agent_cli/summarizer/__init__.py
@@ -5,29 +5,19 @@
 compression ratios) architectures.
 
 Example:
-    from agent_cli.summarizer import summarize, SummarizerConfig, determine_level
+    from agent_cli.summarizer import summarize, SummarizerConfig
 
     config = SummarizerConfig(
         openai_base_url="http://localhost:8000/v1",
         model="gpt-4",
     )
     result = await summarize(long_document, config)
-    print(f"Level: {result.level}, Compression: {result.compression_ratio:.1%}")
+    print(f"Level: {result.level.name}, Compression: {result.compression_ratio:.1%}")
 
 """
 
-from agent_cli.summarizer.adaptive import (
-    SummarizationError,
-    SummarizerConfig,
-    determine_level,
-    summarize,
-    update_rolling_summary,
-)
-from agent_cli.summarizer.models import (
-    HierarchicalSummary,
-    SummaryLevel,
-    SummaryResult,
-)
+from agent_cli.summarizer.adaptive import SummarizationError, SummarizerConfig, summarize
+from agent_cli.summarizer.models import HierarchicalSummary, SummaryLevel, SummaryResult
 
 __all__ = [
     "HierarchicalSummary",
@@ -35,7 +25,5 @@
     "SummarizerConfig",
     "SummaryLevel",
     "SummaryResult",
-    "determine_level",
     "summarize",
-    "update_rolling_summary",
 ]
diff --git a/agent_cli/summarizer/prompts.py b/agent_cli/summarizer/_prompts.py
similarity index 100%
rename from agent_cli/summarizer/prompts.py
rename to agent_cli/summarizer/_prompts.py
diff --git a/agent_cli/summarizer/utils.py b/agent_cli/summarizer/_utils.py
similarity index 100%
rename from agent_cli/summarizer/utils.py
rename to agent_cli/summarizer/_utils.py
diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py
index 38fa865d0..590dabc55 100644
--- a/agent_cli/summarizer/adaptive.py
+++ b/agent_cli/summarizer/adaptive.py
@@ -20,13 +20,7 @@
 from pydantic_ai.providers.openai import OpenAIProvider
 from pydantic_ai.settings import ModelSettings
 
-from agent_cli.summarizer.models import (
-    ChunkSummary,
-    HierarchicalSummary,
-    SummaryLevel,
-    SummaryResult,
-)
-from agent_cli.summarizer.prompts import (
+from agent_cli.summarizer._prompts import (
     BRIEF_SUMMARY_PROMPT,
     CHUNK_SUMMARY_PROMPT,
     META_SUMMARY_PROMPT,
@@ -35,13 +29,19 @@
     format_summaries_for_meta,
     get_prompt_for_content_type,
 )
-from agent_cli.summarizer.utils import (
+from agent_cli.summarizer._utils import (
     chunk_text,
     count_tokens,
     estimate_summary_tokens,
     middle_truncate,
     tokens_to_words,
 )
+from agent_cli.summarizer.models import (
+    ChunkSummary,
+    HierarchicalSummary,
+    SummaryLevel,
+    SummaryResult,
+)
 
 logger = logging.getLogger(__name__)
 
diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py
index e58a20f6c..6eeb133ed 100644
--- a/tests/summarizer/test_integration.py
+++ b/tests/summarizer/test_integration.py
@@ -14,7 +14,8 @@
     get_summary_at_level,
     upsert_hierarchical_summary,
 )
-from agent_cli.summarizer import SummaryLevel, SummaryResult, determine_level
+from agent_cli.summarizer import SummaryLevel, SummaryResult
+from agent_cli.summarizer.adaptive import determine_level
 from agent_cli.summarizer.models import ChunkSummary, HierarchicalSummary
 
 if TYPE_CHECKING:
diff --git a/tests/summarizer/test_prompts.py b/tests/summarizer/test_prompts.py
index e126def22..05937f71a 100644
--- a/tests/summarizer/test_prompts.py
+++ b/tests/summarizer/test_prompts.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from agent_cli.summarizer.prompts import (
+from agent_cli.summarizer._prompts import (
     BRIEF_SUMMARY_PROMPT,
     CHUNK_SUMMARY_PROMPT,
     CONVERSATION_SUMMARY_PROMPT,
diff --git a/tests/summarizer/test_utils.py b/tests/summarizer/test_utils.py
index 458e9b37d..22eb4039e 100644
--- a/tests/summarizer/test_utils.py
+++ b/tests/summarizer/test_utils.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from agent_cli.summarizer.utils import (
+from agent_cli.summarizer._utils import (
     chunk_text,
     count_tokens,
     estimate_summary_tokens,

From 1de48ddc6f58a73ab336e9123080d9390023a346 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 26 Nov 2025 20:39:01 -0800
Subject: [PATCH 10/37] refactor(memory): wire AdaptiveSummarizer into memory
 pipeline

Replace the old rolling summary system with the new hierarchical
adaptive summarizer. This simplifies the codebase by removing
redundant code paths and using a single, research-backed approach.

Changes:
- Update extract_and_store_facts_and_summaries() to use summarize_content()
  and store_adaptive_summary() instead of update_summary()/persist_summary()
- Remove old summary functions: update_summary, persist_summary, get_summary_entry
- Remove Summary entity and SummaryOutput model (unused)
- Add summary_level to L3 metadata for consistency
- Update tests to mock new summarizer interface

The new system automatically selects summarization level (NONE, BRIEF,
STANDARD, DETAILED, HIERARCHICAL) based on content complexity, storing
summaries in a L1/L2/L3 hierarchical structure.
---
 agent_cli/memory/_ingest.py          | 80 ++++++----------------------
 agent_cli/memory/_persistence.py     | 27 +---------
 agent_cli/memory/_retrieval.py       |  5 +-
 agent_cli/memory/_store.py           | 25 ---------
 agent_cli/memory/entities.py         |  9 ----
 agent_cli/memory/models.py           | 16 +-----
 agent_cli/summarizer/models.py       |  1 +
 tests/memory/test_engine.py          | 46 +++++++++++-----
 tests/memory/test_git_integration.py | 14 +++--
 tests/memory/test_store.py           | 17 ------
 10 files changed, 64 insertions(+), 176 deletions(-)

diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py
index 53e3f3c3f..b0b472b71 100644
--- a/agent_cli/memory/_ingest.py
+++ b/agent_cli/memory/_ingest.py
@@ -14,24 +14,21 @@
     delete_memory_files,
     persist_entries,
     persist_hierarchical_summary,
-    persist_summary,
 )
 from agent_cli.memory._prompt import (
     FACT_INSTRUCTIONS,
     FACT_SYSTEM_PROMPT,
-    SUMMARY_PROMPT,
     UPDATE_MEMORY_PROMPT,
 )
 from agent_cli.memory._retrieval import gather_relevant_existing_memories
-from agent_cli.memory._store import delete_entries, get_summary_entry
-from agent_cli.memory.entities import Fact, Summary
+from agent_cli.memory._store import delete_entries, get_final_summary
+from agent_cli.memory.entities import Fact
 from agent_cli.memory.models import (
     MemoryAdd,
     MemoryDecision,
     MemoryDelete,
     MemoryIgnore,
     MemoryUpdate,
-    SummaryOutput,
 )
 
 if TYPE_CHECKING:
@@ -43,8 +40,6 @@
 
 LOGGER = logging.getLogger(__name__)
 
-_SUMMARY_ROLE = "summary"
-
 
 def _elapsed_ms(start: float) -> float:
     """Return elapsed milliseconds since start."""
@@ -283,46 +278,6 @@ def validate_decisions(decisions: list[MemoryDecision]) -> list[MemoryDecision]:
     return to_add, to_delete, replacement_map
 
 
-async def update_summary(
-    *,
-    prior_summary: str | None,
-    new_facts: list[str],
-    openai_base_url: str,
-    api_key: str | None,
-    model: str,
-    max_tokens: int = 256,
-) -> str | None:
-    """Update the conversation summary based on new facts.
-
-    This is the simple Mem0-style rolling summary that incrementally
-    updates based on new facts. For full content adaptive summarization,
-    use `summarize_content` instead.
-    """
-    if not new_facts:
-        return prior_summary
-
-    from pydantic_ai import Agent  # noqa: PLC0415
-    from pydantic_ai.models.openai import OpenAIChatModel  # noqa: PLC0415
-    from pydantic_ai.providers.openai import OpenAIProvider  # noqa: PLC0415
-    from pydantic_ai.settings import ModelSettings  # noqa: PLC0415
-
-    system_prompt = SUMMARY_PROMPT
-    user_parts: list[str] = []
-    if prior_summary:
-        user_parts.append(f"Previous summary:\n{prior_summary}")
-    user_parts.append("New facts:\n" + "\n".join(f"- {fact}" for fact in new_facts))
-    prompt_text = "\n\n".join(user_parts)
-    provider = OpenAIProvider(api_key=api_key or "dummy", base_url=openai_base_url)
-    model_cfg = OpenAIChatModel(
-        model_name=model,
-        provider=provider,
-        settings=ModelSettings(temperature=0.2, max_tokens=max_tokens),
-    )
-    agent = Agent(model=model_cfg, system_prompt=system_prompt, output_type=SummaryOutput)
-    result = await agent.run(prompt_text)
-    return result.output.summary or prior_summary
-
-
 async def summarize_content(
     *,
     content: str,
@@ -459,37 +414,34 @@ async def extract_and_store_facts_and_summaries(
             entries=list(to_add),
         )
 
-    if enable_summarization:
-        prior_summary_entry = get_summary_entry(
-            collection,
-            conversation_id,
-            role=_SUMMARY_ROLE,
-        )
+    if enable_summarization and facts:
+        # Get prior summary for context continuity
+        prior_summary_entry = get_final_summary(collection, conversation_id)
         prior_summary = prior_summary_entry.content if prior_summary_entry else None
 
+        # Summarize the new facts
+        content_to_summarize = "\n".join(facts)
         summary_start = perf_counter()
-        new_summary = await update_summary(
+        summary_result = await summarize_content(
+            content=content_to_summarize,
             prior_summary=prior_summary,
-            new_facts=facts,
+            content_type="conversation",
             openai_base_url=openai_base_url,
             api_key=api_key,
             model=model,
         )
         LOGGER.info(
-            "Summary update completed in %.1f ms (conversation=%s)",
+            "Summary update completed in %.1f ms (conversation=%s, level=%s)",
             _elapsed_ms(summary_start),
             conversation_id,
+            summary_result.level.name,
         )
-        if new_summary:
-            summary_obj = Summary(
-                conversation_id=conversation_id,
-                content=new_summary,
-                created_at=datetime.now(UTC),
-            )
-            persist_summary(
+        if summary_result.summary:
+            await store_adaptive_summary(
                 collection,
                 memory_root=memory_root,
-                summary=summary_obj,
+                conversation_id=conversation_id,
+                summary_result=summary_result,
             )
 
     if enable_git_versioning:
diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py
index e27eb83fe..91585ade8 100644
--- a/agent_cli/memory/_persistence.py
+++ b/agent_cli/memory/_persistence.py
@@ -24,7 +24,7 @@
     upsert_hierarchical_summary,
     upsert_memories,
 )
-from agent_cli.memory.entities import Fact, Summary, Turn
+from agent_cli.memory.entities import Fact, Turn
 
 if TYPE_CHECKING:
     from pathlib import Path
@@ -89,31 +89,6 @@ def persist_entries(
         upsert_memories(collection, ids=ids, contents=contents, metadatas=metadatas)
 
 
-def persist_summary(
-    collection: Collection,
-    *,
-    memory_root: Path,
-    summary: Summary,
-) -> None:
-    """Persist a summary to disk and Chroma."""
-    doc_id = _safe_identifier(f"{summary.conversation_id}{_SUMMARY_DOC_ID_SUFFIX}-summary")
-    record = write_memory_file(
-        memory_root,
-        conversation_id=summary.conversation_id,
-        role="summary",
-        created_at=summary.created_at.isoformat(),
-        content=summary.content,
-        summary_kind="summary",
-        doc_id=doc_id,
-    )
-    upsert_memories(
-        collection,
-        ids=[record.id],
-        contents=[record.content],
-        metadatas=[record.metadata],
-    )
-
-
 def delete_memory_files(
     memory_root: Path,
     conversation_id: str,
diff --git a/agent_cli/memory/_retrieval.py b/agent_cli/memory/_retrieval.py
index 283b0afb6..3be059171 100644
--- a/agent_cli/memory/_retrieval.py
+++ b/agent_cli/memory/_retrieval.py
@@ -8,7 +8,7 @@
 from typing import TYPE_CHECKING, Any
 
 from agent_cli.core.reranker import OnnxCrossEncoder, predict_relevance
-from agent_cli.memory._store import get_summary_entry, query_memories
+from agent_cli.memory._store import get_final_summary, query_memories
 from agent_cli.memory.models import (
     ChatRequest,
     MemoryEntry,
@@ -24,7 +24,6 @@
 LOGGER = logging.getLogger(__name__)
 
 _DEFAULT_MMR_LAMBDA = 0.7
-_SUMMARY_ROLE = "summary"
 _MIN_MAX_EPSILON = 1e-8  # Avoid division by zero in min-max normalization
 
 
@@ -212,7 +211,7 @@ def recency_score(meta: MemoryMetadata) -> float:
 
     summaries: list[str] = []
     if include_summary:
-        summary_entry = get_summary_entry(collection, conversation_id, role=_SUMMARY_ROLE)
+        summary_entry = get_final_summary(collection, conversation_id)
         if summary_entry:
             summaries.append(f"Conversation summary:\n{summary_entry.content}")
 
diff --git a/agent_cli/memory/_store.py b/agent_cli/memory/_store.py
index 4f3755b12..722dcda9e 100644
--- a/agent_cli/memory/_store.py
+++ b/agent_cli/memory/_store.py
@@ -111,31 +111,6 @@ def query_memories(
     return records
 
 
-def get_summary_entry(
-    collection: Collection,
-    conversation_id: str,
-    *,
-    role: str = "summary",
-) -> StoredMemory | None:
-    """Return the latest summary entry for a conversation, if present."""
-    result = collection.get(
-        where={"$and": [{"conversation_id": conversation_id}, {"role": role}]},
-    )
-    docs = result.get("documents") or []
-    metas = result.get("metadatas") or []
-    ids = result.get("ids") or []
-
-    if not docs or not metas or not ids:
-        return None
-
-    return StoredMemory(
-        id=ids[0],
-        content=docs[0],
-        metadata=MemoryMetadata(**dict(metas[0])),
-        distance=None,
-    )
-
-
 def list_conversation_entries(
     collection: Collection,
     conversation_id: str,
diff --git a/agent_cli/memory/entities.py b/agent_cli/memory/entities.py
index 70b16a78c..a352b0bbf 100644
--- a/agent_cli/memory/entities.py
+++ b/agent_cli/memory/entities.py
@@ -32,12 +32,3 @@ class Fact(BaseModel):
     source_id: str = Field(..., description="UUID of the Turn this fact was extracted from")
     created_at: datetime
     # Facts are always role="memory" implicitly in the storage layer
-
-
-class Summary(BaseModel):
-    """The rolling summary of a conversation."""
-
-    conversation_id: str
-    content: str
-    created_at: datetime
-    # Summaries are role="summary" implicitly
diff --git a/agent_cli/memory/models.py b/agent_cli/memory/models.py
index 6dc689d8f..4eb289c7d 100644
--- a/agent_cli/memory/models.py
+++ b/agent_cli/memory/models.py
@@ -4,7 +4,7 @@
 
 from typing import Literal
 
-from pydantic import BaseModel, ConfigDict, field_validator
+from pydantic import BaseModel, ConfigDict
 
 
 class Message(BaseModel):
@@ -70,20 +70,6 @@ class MemoryMetadata(BaseModel):
     """Name of the SummaryLevel enum used (e.g., 'STANDARD', 'HIERARCHICAL')."""
 
 
-class SummaryOutput(BaseModel):
-    """Structured summary returned by the LLM."""
-
-    summary: str
-
-    @field_validator("summary")
-    @classmethod
-    def _not_empty(cls, v: str) -> str:
-        if not v or not str(v).strip():
-            msg = "field must be non-empty"
-            raise ValueError(msg)
-        return str(v).strip()
-
-
 class StoredMemory(BaseModel):
     """Memory document as stored in the vector DB."""
 
diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py
index de9bc609a..843d1dfe5 100644
--- a/agent_cli/summarizer/models.py
+++ b/agent_cli/summarizer/models.py
@@ -190,6 +190,7 @@ def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]:
                         "role": "summary",
                         "level": HIERARCHICAL_LEVEL_L3,
                         "is_final": True,
+                        "summary_level": self.level.name,
                         "input_tokens": self.input_tokens,
                         "output_tokens": self.output_tokens,
                         "compression_ratio": self.compression_ratio,
diff --git a/tests/memory/test_engine.py b/tests/memory/test_engine.py
index 440127acb..12e419de9 100644
--- a/tests/memory/test_engine.py
+++ b/tests/memory/test_engine.py
@@ -22,8 +22,8 @@
     MemoryMetadata,
     Message,
     StoredMemory,
-    SummaryOutput,
 )
+from agent_cli.summarizer import SummaryLevel, SummaryResult
 
 
 class _DummyReranker:
@@ -250,13 +250,13 @@ def fake_query_memories(
     )
     monkeypatch.setattr(
         _retrieval,
-        "get_summary_entry",
-        lambda _collection, _cid, role: StoredMemory(  # type: ignore[return-value]
-            id=f"{role}-id",
-            content=f"{role} content",
+        "get_final_summary",
+        lambda _collection, _cid: StoredMemory(
+            id="summary-id",
+            content="summary content",
             metadata=MemoryMetadata(
                 conversation_id="conv1",
-                role=role,
+                role="summary",
                 created_at=now.isoformat(),
             ),
         ),
@@ -349,11 +349,19 @@ def __init__(self, output: Any) -> None:
                 self.output = output
 
         prompt_str = str(prompt_text)
-        if "New facts:" in prompt_str:
-            return _Result(SummaryOutput(summary="summary up to 256"))
         if "Hello, I enjoy biking" in prompt_str:
             return _Result(["User likes cats.", "User loves biking."])
-        return _Result(SummaryOutput(summary="noop"))
+        return _Result([])
+
+    async def fake_summarize_content(**_kwargs: Any) -> SummaryResult:
+        return SummaryResult(
+            level=SummaryLevel.STANDARD,
+            summary="summary up to 256",
+            hierarchical=None,
+            input_tokens=100,
+            output_tokens=20,
+            compression_ratio=0.2,
+        )
 
     async def fake_reconcile(
         _collection: Any,
@@ -377,6 +385,7 @@ async def fake_reconcile(
     import pydantic_ai  # noqa: PLC0415
 
     monkeypatch.setattr(pydantic_ai.Agent, "run", fake_agent_run)
+    monkeypatch.setattr(_ingest, "summarize_content", fake_summarize_content)
     # High relevance so they aren't filtered
     monkeypatch.setattr(_retrieval, "predict_relevance", lambda _model, pairs: [5.0 for _ in pairs])
 
@@ -568,11 +577,19 @@ def __init__(self, output: Any) -> None:
                 self.output = output
 
         prompt_str = str(prompt_text)
-        if "New facts:" in prompt_str:
-            return _Result(SummaryOutput(summary="summary text"))
         if "My cat is Luna" in prompt_str:
             return _Result(["User has a cat named Luna."])
-        return _Result(SummaryOutput(summary="noop"))
+        return _Result([])
+
+    async def fake_summarize_content(**_kwargs: Any) -> SummaryResult:
+        return SummaryResult(
+            level=SummaryLevel.STANDARD,
+            summary="summary text",
+            hierarchical=None,
+            input_tokens=100,
+            output_tokens=20,
+            compression_ratio=0.2,
+        )
 
     monkeypatch.setattr(engine._streaming, "stream_chat_sse", fake_stream_chat_sse)
 
@@ -598,6 +615,7 @@ async def fake_reconcile(
     import pydantic_ai  # noqa: PLC0415
 
     monkeypatch.setattr(pydantic_ai.Agent, "run", fake_agent_run)
+    monkeypatch.setattr(_ingest, "summarize_content", fake_summarize_content)
 
     response = await engine.process_chat_request(
         request,
@@ -613,5 +631,5 @@ async def fake_reconcile(
 
     files = list(tmp_path.glob("entries/**/*.md"))
     assert len(files) == 4  # user + assistant + fact + 1 summary
-    assert any("facts" in f.parts for f in files)
-    assert any(f.parent.name == "summaries" and f.name == "summary.md" for f in files)
+    assert any("facts" in str(f) for f in files)
+    assert any("summaries/L3/final.md" in str(f) for f in files)
diff --git a/tests/memory/test_git_integration.py b/tests/memory/test_git_integration.py
index 7d59f7c0e..db197b023 100644
--- a/tests/memory/test_git_integration.py
+++ b/tests/memory/test_git_integration.py
@@ -14,6 +14,7 @@
 from agent_cli.memory import _ingest
 from agent_cli.memory.client import MemoryClient
 from agent_cli.memory.entities import Fact
+from agent_cli.summarizer import SummaryLevel, SummaryResult
 
 if TYPE_CHECKING:
     from pathlib import Path
@@ -63,12 +64,19 @@ async def fake_reconcile(
         ]
         return entries, [], {}
 
-    async def fake_update_summary(*_args: Any, **_kwargs: Any) -> str:
-        return "User likes testing."
+    async def fake_summarize_content(**_kwargs: Any) -> SummaryResult:
+        return SummaryResult(
+            level=SummaryLevel.STANDARD,
+            summary="User likes testing.",
+            hierarchical=None,
+            input_tokens=100,
+            output_tokens=20,
+            compression_ratio=0.2,
+        )
 
     monkeypatch.setattr(_ingest, "extract_salient_facts", fake_extract)
     monkeypatch.setattr(_ingest, "reconcile_facts", fake_reconcile)
-    monkeypatch.setattr(_ingest, "update_summary", fake_update_summary)
+    monkeypatch.setattr(_ingest, "summarize_content", fake_summarize_content)
 
     # Patch Reranker to avoid loading ONNX model
     monkeypatch.setattr("agent_cli.memory.client.get_reranker_model", MagicMock())
diff --git a/tests/memory/test_store.py b/tests/memory/test_store.py
index 3edd0eeb9..453a21a9a 100644
--- a/tests/memory/test_store.py
+++ b/tests/memory/test_store.py
@@ -101,23 +101,6 @@ def query(self, **kwargs: Any) -> dict[str, Any]:
     assert {"role": {"$ne": "summary"}} in clauses
 
 
-def test_get_summary_entry_returns_entry() -> None:
-    # ChromaDB's .get() returns flat lists (not nested like .query())
-    fake = _FakeCollection(
-        get_result={
-            "documents": ["summary text"],
-            "metadatas": [
-                {"conversation_id": "c1", "role": "summary", "created_at": "now"},
-            ],
-            "ids": ["sum1"],
-        },
-    )
-    entry = _store.get_summary_entry(fake, "c1", role="summary")
-    assert entry is not None
-    assert entry.id == "sum1"
-    assert entry.metadata.role == "summary"
-
-
 def test_list_conversation_entries_filters_summaries() -> None:
     # ChromaDB's .get() returns flat lists (not nested like .query())
     fake = _FakeCollection(

From cd43bb3dd4c4b29235bbbdbfff29cfc23300c495 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 26 Nov 2025 20:42:41 -0800
Subject: [PATCH 11/37] docs: add summarizer spec and update memory docs for
 hierarchical summaries

- Create docs/architecture/summarizer.md with comprehensive technical
  specification for the adaptive summarization system
- Update memory.md to reflect new L1/L2/L3 hierarchical summary structure
- Document level thresholds, compression ratios, and research basis
- Add content-type aware prompts documentation
- Document integration with memory system and storage format
---
 docs/architecture/memory.md     |  37 ++-
 docs/architecture/summarizer.md | 553 ++++++++++++++++++++++++++++++++
 2 files changed, 581 insertions(+), 9 deletions(-)
 create mode 100644 docs/architecture/summarizer.md

diff --git a/docs/architecture/memory.md b/docs/architecture/memory.md
index f99637ff3..b42e739a9 100644
--- a/docs/architecture/memory.md
+++ b/docs/architecture/memory.md
@@ -164,7 +164,13 @@ entries/
       assistant/
         <timestamp>__<uuid>.md     # Raw assistant responses
     summaries/
-      summary.md                   # The single rolling summary of the conversation
+      L1/
+        chunk_0.md                 # Level 1: Individual chunk summaries
+        chunk_1.md
+      L2/
+        group_0.md                 # Level 2: Group summaries (groups of ~5 L1s)
+      L3/
+        final.md                   # Level 3: Final synthesized summary
 ```
 
 **Deleted Directory Structure (Soft Deletes):**
@@ -176,7 +182,7 @@ entries/
       facts/
         <timestamp>__<uuid>.md
       summaries/
-        summary.md                 # Tombstoned summary
+        L1/, L2/, L3/              # Tombstoned summary levels
 ```
 
 ### 2.2 File Format
@@ -270,10 +276,18 @@ Resolves contradictions using a "Search-Decide-Update" loop with complete enumer
     *   **Updates:** Implemented as delete + add with a fresh ID; tombstones record `replaced_by`.
     *   **Deletes:** Soft-deletes files (moved under `deleted/`) and removes from Chroma.
 
-### 4.4 Summarization
-*   **Input:** Previous summary (if any) + newly extracted facts.
-*   **Prompt:** `SUMMARY_PROMPT` (updates the running summary).
-*   **Persistence:** Writes a single `summaries/summary.md` per conversation (deterministic doc ID).
+### 4.4 Summarization (Adaptive Hierarchical)
+Uses the `agent_cli.summarizer` module for research-backed adaptive summarization.
+
+*   **Level Selection:** Automatically determines summarization depth based on token count:
+    *   `NONE` (< 100 tokens): No summary needed, facts only.
+    *   `BRIEF` (100-500 tokens): Single-sentence summary (~20% compression).
+    *   `STANDARD` (500-3000 tokens): Paragraph summary (~12% compression).
+    *   `DETAILED` (3000-15000 tokens): Chunked summaries + meta-summary (~7% compression).
+    *   `HIERARCHICAL` (> 15000 tokens): Full L1/L2/L3 tree structure.
+*   **Input:** Previous L3 summary (if any) + newly extracted facts.
+*   **Persistence:** Stores summaries in `summaries/L1/`, `L2/`, `L3/` subdirectories with YAML front matter containing compression metrics.
+*   **See:** `docs/architecture/summarizer.md` for detailed algorithm specification.
 
 ### 4.5 Eviction
 *   **Trigger:** If total entries in conversation > `max_entries` (default 500).
@@ -303,9 +317,14 @@ To replicate the system behavior, the following prompt strategies are required.
     *   **NONE:** Existing memory is unrelated to new facts, or new fact is an exact duplicate.
 *   **Output constraints:** JSON list containing all memories; each existing memory must have an event; new unrelated facts must be ADDed; no prose or code fences.
 
-### 5.3 Summarization (`SUMMARY_PROMPT`)
-*   **Goal:** Maintain a concise running summary.
-*   **Constraints:** Aggregate related facts. Drop transient chit-chat. Focus on durable info.
+### 5.3 Summarization (Adaptive Prompts)
+The summarizer uses level-specific prompts from `agent_cli.summarizer._prompts`:
+*   **`BRIEF_PROMPT`:** Single-sentence distillation for short content.
+*   **`STANDARD_PROMPT`:** Paragraph summary with prior context integration.
+*   **`CHUNK_PROMPT`:** Individual chunk summarization for hierarchical processing.
+*   **`META_PROMPT`:** Synthesizes multiple chunk summaries into cohesive narrative.
+*   **`ROLLING_PROMPT`:** Integrates new facts with existing summary.
+*   **Content-type variants:** `CONVERSATION_PROMPT`, `JOURNAL_PROMPT`, `DOCUMENT_PROMPT` for domain-specific summarization.
 
 ---
 
diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md
new file mode 100644
index 000000000..940ddddb8
--- /dev/null
+++ b/docs/architecture/summarizer.md
@@ -0,0 +1,553 @@
+# Agent CLI: Adaptive Summarizer Technical Specification
+
+This document describes the architectural decisions, design rationale, and technical approach for the `agent-cli` adaptive summarization subsystem. The design is grounded in research from Letta (partial eviction, middle truncation) and Mem0 (rolling summaries, compression ratios).
+
+## 1. System Overview
+
+The adaptive summarizer provides **content-aware compression** that scales summarization depth with input complexity. Rather than applying a one-size-fits-all approach, it automatically selects the optimal strategy based on token count.
+
+```
+┌─────────────────────────────────────────────────────────────────────┐
+│                    Adaptive Summarization Pipeline                   │
+├─────────────────────────────────────────────────────────────────────┤
+│                                                                     │
+│  Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy    │
+│                                                                     │
+│  ┌─────────────────────────────────────────────────────────────┐   │
+│  │ Level Thresholds:                                           │   │
+│  │   < 100 tokens  ──▶ NONE        (no summary needed)         │   │
+│  │   100-500       ──▶ BRIEF       (single sentence)           │   │
+│  │   500-3000      ──▶ STANDARD    (paragraph)                 │   │
+│  │   3000-15000    ──▶ DETAILED    (chunked + meta)            │   │
+│  │   > 15000       ──▶ HIERARCHICAL (L1/L2/L3 tree)            │   │
+│  └─────────────────────────────────────────────────────────────┘   │
+│                                                                     │
+│  Output: SummaryResult with compression metrics                     │
+└─────────────────────────────────────────────────────────────────────┘
+```
+
+**Design Goals:**
+
+- **Adaptive compression:** Match summarization depth to content complexity.
+- **Research-grounded:** Based on proven approaches from Letta and Mem0.
+- **Hierarchical structure:** Preserve detail at multiple granularities.
+- **Content-type awareness:** Domain-specific prompts for conversations, journals, documents.
+
+---
+
+## 2. Architectural Decisions
+
+### 2.1 Token-Based Level Selection
+
+**Decision:** Select summarization strategy based on input token count with fixed thresholds.
+
+**Rationale:**
+
+- **Predictable behavior:** Users can anticipate output length based on input size.
+- **Optimal compression:** Each level targets a specific compression ratio validated by research.
+- **Efficiency:** Avoid over-processing short content or under-processing long content.
+
+**Implementation:**
+
+```python
+THRESHOLD_NONE = 100       # Below this: no summary needed
+THRESHOLD_BRIEF = 500      # 100-500: single sentence (~20% compression)
+THRESHOLD_STANDARD = 3000  # 500-3000: paragraph (~12% compression)
+THRESHOLD_DETAILED = 15000 # 3000-15000: chunked (~7% compression)
+# Above 15000: hierarchical tree structure
+```
+
+**Trade-off:** Fixed thresholds may not be optimal for all content types, but provide consistent, predictable behavior.
+
+### 2.2 Hierarchical Summary Structure (L1/L2/L3)
+
+**Decision:** For long content, build a tree of summaries at three levels of granularity.
+
+**Rationale:**
+
+- **Partial eviction:** Inspired by Letta's memory architecture—keep detailed summaries for recent content, compressed summaries for older content.
+- **Flexible retrieval:** Different use cases need different detail levels.
+- **Progressive compression:** Each level provides ~5x compression over the previous.
+
+**Implementation:**
+
+- **L1 (Chunk Summaries):** Individual summaries of ~3000 token chunks with 200 token overlap.
+- **L2 (Group Summaries):** Summaries of groups of ~5 L1 summaries.
+- **L3 (Final Summary):** Single synthesized summary of all L2 summaries.
+
+**Storage:**
+```text
+summaries/
+  L1/
+    chunk_0.md    # Summary of tokens 0-3000
+    chunk_1.md    # Summary of tokens 2800-5800 (overlap)
+  L2/
+    group_0.md    # Synthesis of chunk_0 through chunk_4
+  L3/
+    final.md      # Final narrative summary
+```
+
+### 2.3 Content-Type Aware Prompts
+
+**Decision:** Use different prompt templates for different content domains.
+
+**Rationale:**
+
+- **Conversations:** Focus on user preferences, decisions, action items.
+- **Journals:** Emphasize personal insights, emotional context, growth patterns.
+- **Documents:** Prioritize key findings, methodology, conclusions.
+
+**Implementation:**
+
+```python
+def get_prompt_for_content_type(content_type: str) -> str:
+    match content_type:
+        case "conversation": return CONVERSATION_PROMPT
+        case "journal": return JOURNAL_PROMPT
+        case "document": return DOCUMENT_PROMPT
+        case _: return STANDARD_PROMPT
+```
+
+### 2.4 Prior Summary Integration
+
+**Decision:** Always provide the previous summary as context when updating.
+
+**Rationale:**
+
+- **Continuity:** New summaries should build on existing context, not replace it.
+- **Incremental updates:** Avoid re-summarizing all content on every update.
+- **Context preservation:** Important information from earlier content persists.
+
+**Implementation:**
+
+- The `prior_summary` parameter is passed through the entire pipeline.
+- `ROLLING_PROMPT` specifically handles integrating new facts with existing summaries.
+- For hierarchical summaries, only the L3 summary is used as prior context.
+
+### 2.5 Compression Ratio Tracking
+
+**Decision:** Track and report compression metrics for every summary.
+
+**Rationale:**
+
+- **Transparency:** Users can understand how much information was compressed.
+- **Quality monitoring:** Unusual ratios may indicate summarization issues.
+- **Optimization:** Metrics inform future threshold tuning.
+
+**Implementation:**
+
+```python
+@dataclass
+class SummaryResult:
+    level: SummaryLevel
+    summary: str | None
+    hierarchical: HierarchicalSummary | None
+    input_tokens: int
+    output_tokens: int
+    compression_ratio: float  # output/input (lower = more compression)
+```
+
+---
+
+## 3. Data Model
+
+### 3.1 Summary Levels
+
+| Level | Token Range | Target Compression | Strategy |
+| :--- | :--- | :--- | :--- |
+| `NONE` | < 100 | N/A | No summarization |
+| `BRIEF` | 100-500 | ~20% | Single sentence |
+| `STANDARD` | 500-3000 | ~12% | Paragraph |
+| `DETAILED` | 3000-15000 | ~7% | Chunked + meta |
+| `HIERARCHICAL` | > 15000 | ~3-5% | L1/L2/L3 tree |
+
+### 3.2 Hierarchical Summary Structure
+
+```python
+class ChunkSummary(BaseModel):
+    chunk_index: int          # Position in original content
+    content: str              # The summarized text
+    token_count: int          # Tokens in this summary
+    source_tokens: int        # Tokens in source chunk
+    parent_group: int | None  # L2 group this belongs to
+
+class HierarchicalSummary(BaseModel):
+    l1_summaries: list[ChunkSummary]  # Individual chunk summaries
+    l2_summaries: list[str]           # Group summaries
+    l3_summary: str                   # Final synthesis
+    chunk_size: int = 3000            # Tokens per chunk
+    chunk_overlap: int = 200          # Overlap between chunks
+```
+
+### 3.3 Storage Metadata (ChromaDB)
+
+Summaries are stored with rich metadata for retrieval and management:
+
+| Field | L1 | L2 | L3 | Description |
+| :--- | :---: | :---: | :---: | :--- |
+| `id` | ✓ | ✓ | ✓ | `{conversation_id}:summary:L{n}:{index}` |
+| `conversation_id` | ✓ | ✓ | ✓ | Scope key |
+| `role` | ✓ | ✓ | ✓ | Always `"summary"` |
+| `level` | ✓ | ✓ | ✓ | 1, 2, or 3 |
+| `chunk_index` | ✓ | | | Position in L1 sequence |
+| `group_index` | | ✓ | | Position in L2 sequence |
+| `parent_group` | ✓ | | | Which L2 group owns this L1 |
+| `is_final` | | | ✓ | Marks the top-level summary |
+| `summary_level` | | | ✓ | Name of SummaryLevel enum |
+| `input_tokens` | | | ✓ | Original content token count |
+| `output_tokens` | | | ✓ | Total summary token count |
+| `compression_ratio` | | | ✓ | Output/input ratio |
+| `created_at` | ✓ | ✓ | ✓ | ISO 8601 timestamp |
+
+### 3.4 File Format
+
+Summary files use Markdown with YAML front matter:
+
+```markdown
+---
+id: "journal:summary:L3:final"
+conversation_id: "journal"
+role: "summary"
+level: 3
+is_final: true
+summary_level: "STANDARD"
+input_tokens: 1500
+output_tokens: 180
+compression_ratio: 0.12
+created_at: "2025-01-15T10:30:00Z"
+---
+
+The user has been exploring adaptive summarization techniques...
+```
+
+---
+
+## 4. Processing Pipeline
+
+### 4.1 Main Entry Point
+
+```python
+async def summarize(
+    content: str,
+    config: SummarizerConfig,
+    prior_summary: str | None = None,
+    content_type: str = "general",
+) -> SummaryResult
+```
+
+### 4.2 Level Selection Flow
+
+```
+Input Content
+     │
+     ▼
+┌─────────────┐
+│ Count Tokens│ (tiktoken, cl100k_base)
+└──────┬──────┘
+       │
+       ▼
+┌─────────────────────────────────────────┐
+│ determine_level(token_count) -> Level   │
+│                                         │
+│   < 100  ──▶ NONE                       │
+│   < 500  ──▶ BRIEF                      │
+│   < 3000 ──▶ STANDARD                   │
+│   < 15000 ──▶ DETAILED                  │
+│   else   ──▶ HIERARCHICAL               │
+└──────┬──────────────────────────────────┘
+       │
+       ▼
+   Execute level-specific strategy
+```
+
+### 4.3 Strategy Execution by Level
+
+#### NONE Level
+- **Action:** Return immediately with no summary.
+- **Output:** `SummaryResult(level=NONE, summary=None, compression_ratio=1.0)`
+
+#### BRIEF Level
+- **Prompt:** `BRIEF_PROMPT` - distill to single sentence.
+- **LLM Call:** Single generation with low max_tokens.
+- **Output:** One-sentence summary.
+
+#### STANDARD Level
+- **Prompt:** `STANDARD_PROMPT` with optional prior summary context.
+- **LLM Call:** Single generation.
+- **Output:** Paragraph-length summary.
+
+#### DETAILED Level
+1. **Chunk:** Split content into ~3000 token chunks with 200 token overlap.
+2. **Parallel L1:** Generate summary for each chunk using `CHUNK_PROMPT`.
+3. **Meta-synthesis:** Combine L1 summaries using `META_PROMPT`.
+4. **Output:** `HierarchicalSummary` with L1s and L3 (no L2 needed for this size).
+
+#### HIERARCHICAL Level
+1. **Chunk:** Split into ~3000 token chunks with overlap.
+2. **Parallel L1:** Generate chunk summaries.
+3. **Group:** Organize L1s into groups of ~5.
+4. **Parallel L2:** Summarize each group.
+5. **L3 Synthesis:** Final meta-summary of all L2s.
+6. **Output:** Full `HierarchicalSummary` tree.
+
+### 4.4 Chunking Algorithm
+
+```python
+def chunk_text(
+    text: str,
+    chunk_size: int = 3000,
+    overlap: int = 200,
+) -> list[str]:
+    """Split text into overlapping chunks on paragraph boundaries."""
+```
+
+**Strategy:**
+
+1. **Paragraph-first:** Try to split on double newlines.
+2. **Sentence fallback:** If paragraph exceeds chunk_size, split on sentence boundaries.
+3. **Character fallback:** For very long sentences (e.g., code), use character splitting.
+4. **Overlap handling:** Each chunk starts with the last `overlap` tokens of the previous.
+
+### 4.5 Middle Truncation (Utility)
+
+For contexts where the summary exceeds available space:
+
+```python
+def middle_truncate(
+    text: str,
+    token_budget: int,
+    head_fraction: float = 0.3,
+    tail_fraction: float = 0.7,
+) -> str:
+    """Keep head and tail, remove middle (least likely to contain key info)."""
+```
+
+**Rationale:** Research shows that important information clusters at beginnings (introductions, key points) and endings (conclusions, action items).
+
+---
+
+## 5. Prompt Specifications
+
+### 5.1 Brief Summary (`BRIEF_PROMPT`)
+
+```
+Distill the following content into a single, comprehensive sentence
+that captures the essential meaning:
+
+{content}
+
+Summary (one sentence):
+```
+
+### 5.2 Standard Summary (`STANDARD_PROMPT`)
+
+```
+Summarize the following content in a concise paragraph.
+{prior_context}
+Focus on key information, decisions, and actionable insights.
+
+Content:
+{content}
+
+Summary:
+```
+
+### 5.3 Chunk Summary (`CHUNK_PROMPT`)
+
+```
+Summarize this section of a larger document.
+Preserve specific details, names, and numbers that may be important.
+
+Section {chunk_index} of {total_chunks}:
+{content}
+
+Section summary:
+```
+
+### 5.4 Meta Summary (`META_PROMPT`)
+
+```
+Synthesize these section summaries into a coherent narrative.
+Maintain logical flow and preserve the most important information.
+
+Section Summaries:
+{summaries}
+
+Synthesized Summary:
+```
+
+### 5.5 Rolling Summary (`ROLLING_PROMPT`)
+
+```
+Update the existing summary to incorporate new information.
+Preserve important historical context while integrating new facts.
+
+Existing Summary:
+{prior_summary}
+
+New Information:
+{new_facts}
+
+Updated Summary:
+```
+
+### 5.6 Content-Type Prompts
+
+**Conversation:**
+```
+Summarize this conversation focusing on:
+- User preferences and decisions
+- Action items and commitments
+- Key topics discussed
+```
+
+**Journal:**
+```
+Summarize this journal entry focusing on:
+- Personal insights and reflections
+- Emotional context and growth
+- Goals and intentions
+```
+
+**Document:**
+```
+Summarize this document focusing on:
+- Key findings and conclusions
+- Methodology and approach
+- Recommendations and implications
+```
+
+---
+
+## 6. Integration with Memory System
+
+### 6.1 Entry Point
+
+The memory system calls the summarizer via `_ingest.summarize_content()`:
+
+```python
+async def summarize_content(
+    content: str,
+    prior_summary: str | None = None,
+    content_type: str = "general",
+    openai_base_url: str,
+    api_key: str | None,
+    model: str,
+) -> SummaryResult
+```
+
+### 6.2 Storage Flow
+
+```
+summarize_content()
+       │
+       ▼
+SummaryResult
+       │
+       ▼
+store_adaptive_summary()
+       │
+       ├──▶ persist_hierarchical_summary()
+       │         │
+       │         ├──▶ Delete old summaries (L1, L2, L3)
+       │         ├──▶ Write new summary files
+       │         └──▶ Upsert to ChromaDB
+       │
+       └──▶ Return stored IDs
+```
+
+### 6.3 Retrieval Integration
+
+The memory retrieval system uses `get_final_summary()` to fetch the L3 summary:
+
+```python
+def get_final_summary(
+    collection: Collection,
+    conversation_id: str,
+) -> StoredMemory | None:
+    """Retrieve the L3 final summary for injection into prompts."""
+```
+
+---
+
+## 7. Configuration Reference
+
+| Parameter | Default | Description |
+| :--- | :--- | :--- |
+| `openai_base_url` | *required* | Base URL for LLM API |
+| `model` | *required* | Model ID for summarization |
+| `api_key` | `None` | API key (optional for local models) |
+| `chunk_size` | `3000` | Tokens per chunk for hierarchical |
+| `chunk_overlap` | `200` | Token overlap between chunks |
+
+### 7.1 Level Thresholds (Constants)
+
+| Constant | Value | Description |
+| :--- | :--- | :--- |
+| `THRESHOLD_NONE` | 100 | Below: no summary |
+| `THRESHOLD_BRIEF` | 500 | Below: single sentence |
+| `THRESHOLD_STANDARD` | 3000 | Below: paragraph |
+| `THRESHOLD_DETAILED` | 15000 | Below: chunked |
+
+---
+
+## 8. Error Handling
+
+### 8.1 Graceful Degradation
+
+| Error | Fallback |
+| :--- | :--- |
+| LLM timeout | Return input unchanged with NONE level |
+| LLM error | Retry up to 3 times, then return NONE |
+| Token counting failure | Estimate based on character count (÷4) |
+| Chunking failure | Fall back to character-based splitting |
+
+### 8.2 Validation
+
+- **Empty content:** Returns NONE level immediately.
+- **Whitespace-only:** Returns NONE level.
+- **Invalid compression ratio:** Clamped to [0.0, 1.0].
+
+---
+
+## 9. Performance Considerations
+
+### 9.1 Token Counting
+
+- Uses `tiktoken` with `cl100k_base` encoding (GPT-4 tokenizer).
+- Caches tokenizer instance for efficiency.
+- Falls back to character-based estimation if tiktoken unavailable.
+
+### 9.2 Parallel Processing
+
+For DETAILED and HIERARCHICAL levels:
+- L1 chunk summaries can be generated in parallel.
+- L2 group summaries can be generated in parallel.
+- Only L3 synthesis requires sequential processing.
+
+### 9.3 Caching
+
+- Token counts are computed once per content string.
+- Prompt templates are loaded once at module import.
+- ChromaDB connection is reused across operations.
+
+---
+
+## 10. Comparison with Alternative Approaches
+
+| Aspect | Adaptive Summarizer | Rolling Summary | Fixed Chunking |
+| :--- | :--- | :--- | :--- |
+| **Compression** | 3-20% (varies by level) | ~15% fixed | ~10% fixed |
+| **Detail preservation** | Hierarchical (L1/L2/L3) | Single level | Single level |
+| **Context awareness** | Content-type prompts | Generic | Generic |
+| **Efficiency** | Skip short content | Always summarize | Always chunk |
+| **Research basis** | Letta + Mem0 | Mem0 only | None |
+
+---
+
+## 11. Future Enhancements
+
+- **Semantic chunking:** Split on topic boundaries rather than token counts.
+- **Incremental L1 updates:** Only re-summarize changed chunks.
+- **Quality scoring:** Evaluate summary quality and trigger re-summarization.
+- **User feedback loop:** Learn preferred compression ratios per user.

From 0e9382270c007c4811036c421d4fe733a387b41b Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 26 Nov 2025 21:31:37 -0800
Subject: [PATCH 12/37] Add example script

---
 agent_cli/summarizer/_utils.py |   4 +-
 examples/summarizer_demo.py    | 483 +++++++++++++++++++++++++++++++++
 2 files changed, 486 insertions(+), 1 deletion(-)
 create mode 100644 examples/summarizer_demo.py

diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py
index bc319f5b5..030b5729e 100644
--- a/agent_cli/summarizer/_utils.py
+++ b/agent_cli/summarizer/_utils.py
@@ -40,7 +40,9 @@ def count_tokens(text: str, model: str = "gpt-4") -> int:
     if not text:
         return 0
     enc = _get_encoding(model)
-    return len(enc.encode(text))
+    # Disable special token checking - LLM outputs may contain special tokens
+    # like <|constrain|>, <|endoftext|>, etc. that we want to count normally
+    return len(enc.encode(text, disallowed_special=()))
 
 
 def chunk_text(
diff --git a/examples/summarizer_demo.py b/examples/summarizer_demo.py
new file mode 100644
index 000000000..6a542dbdc
--- /dev/null
+++ b/examples/summarizer_demo.py
@@ -0,0 +1,483 @@
+"""Demonstrate the summarizer on texts of varying lengths from the internet.
+
+This script fetches content of different sizes and shows how the adaptive
+summarizer automatically selects the appropriate strategy (BRIEF, STANDARD,
+DETAILED, or HIERARCHICAL) based on content length.
+
+Usage:
+    python examples/summarizer_demo.py
+
+    # Test specific levels only
+    python examples/summarizer_demo.py --level brief
+    python examples/summarizer_demo.py --level standard
+    python examples/summarizer_demo.py --level detailed
+    python examples/summarizer_demo.py --level hierarchical
+
+    # Use a different model
+    python examples/summarizer_demo.py --model "gpt-4o-mini"
+"""  # noqa: INP001
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import os
+import re
+import textwrap
+import traceback
+from dataclasses import dataclass
+
+import httpx
+
+from agent_cli.summarizer import (
+    SummarizerConfig,
+    SummaryLevel,
+    SummaryResult,
+    summarize,
+)
+
+# Defaults for local AI setup (same as aijournal_poc.py)
+DEFAULT_BASE_URL = "http://192.168.1.143:9292/v1"
+DEFAULT_MODEL = "gpt-oss-high:20b"
+
+
+@dataclass
+class TextSample:
+    """A sample text for testing the summarizer."""
+
+    name: str
+    description: str
+    url: str
+    expected_level: SummaryLevel
+    content_type: str = "general"
+    # If URL fetch fails, use this fallback
+    fallback_content: str | None = None
+
+
+# Thresholds from adaptive.py:
+# NONE: < 100 tokens
+# BRIEF: 100-500 tokens
+# STANDARD: 500-3000 tokens
+# DETAILED: 3000-15000 tokens
+# HIERARCHICAL: > 15000 tokens
+
+# Sample texts of varying lengths to demonstrate different summarization levels
+SAMPLES: list[TextSample] = [
+    TextSample(
+        name="Brief - Short News Article",
+        description="~150-400 tokens - triggers BRIEF level (100-500 token range)",
+        url="https://httpbin.org/json",  # Returns small JSON we'll convert to text
+        expected_level=SummaryLevel.BRIEF,
+        fallback_content="""
+        Breaking News: Scientists at the Marine Biology Institute have made a
+        groundbreaking discovery in the Mariana Trench. A new species of deep-sea
+        fish, dubbed "Pseudoliparis swirei," has been found surviving at depths
+        exceeding 8,000 meters, making it one of the deepest-living fish ever
+        documented.
+
+        The research team, led by Dr. Sarah Chen from the University of Washington,
+        used advanced unmanned submersibles equipped with high-resolution cameras
+        and collection apparatus. The expedition lasted three months and covered
+        multiple dive sites across the western Pacific.
+
+        "This discovery fundamentally changes our understanding of life in extreme
+        environments," Dr. Chen stated in a press conference. "The adaptations
+        these fish have developed to survive crushing pressures and near-freezing
+        temperatures are remarkable."
+
+        The fish displays several unique characteristics including translucent skin,
+        specialized proteins that prevent cellular damage under pressure, and an
+        unusual metabolism that allows survival with minimal oxygen. Scientists
+        believe studying these adaptations could lead to breakthroughs in medicine
+        and materials science.
+
+        The finding has been published in the journal Nature and has already
+        generated significant interest from the scientific community worldwide.
+        Further expeditions are planned to study the species in its natural habitat.
+        """,
+    ),
+    TextSample(
+        name="Standard - Technology Article",
+        description="~800-2000 tokens - triggers STANDARD level (500-3000 token range)",
+        url="https://en.wikipedia.org/api/rest_v1/page/summary/Artificial_intelligence",
+        expected_level=SummaryLevel.STANDARD,
+        content_type="document",
+        fallback_content="""
+        Artificial intelligence (AI) is the intelligence of machines or software,
+        as opposed to the intelligence of humans or other animals. It is a field
+        of computer science that develops and studies intelligent machines. The
+        field encompasses a wide range of approaches and technologies.
+
+        AI research has been defined as the field of study of intelligent agents,
+        which refers to any system that perceives its environment and takes actions
+        that maximize its chances of achieving its goals. This definition emphasizes
+        the practical aspects of building systems that can operate effectively.
+
+        The term "artificial intelligence" has been used to describe machines that
+        mimic cognitive functions that humans associate with the human mind, such
+        as learning and problem solving. As machines become increasingly capable,
+        tasks considered to require "intelligence" are often removed from the
+        definition of AI, a phenomenon known as the AI effect.
+
+        History of Artificial Intelligence
+
+        The field of AI research was founded at a workshop held on the campus of
+        Dartmouth College during the summer of 1956. The attendees became the
+        founders and leaders of AI research. They and their students produced
+        programs that the press described as astonishing.
+
+        Early AI research in the 1950s explored topics like problem solving and
+        symbolic methods. In the 1960s, the US Department of Defense took interest
+        and began training computers to mimic basic human reasoning. DARPA completed
+        street mapping projects in the 1970s and produced intelligent personal
+        assistants in 2003, long before Siri, Alexa or Cortana.
+
+        Modern AI Approaches
+
+        Modern AI techniques have become pervasive and include machine learning,
+        deep learning, natural language processing, computer vision, robotics,
+        and autonomous systems. These technologies power everything from search
+        engines to self-driving cars.
+
+        Machine learning is a subset of AI that enables systems to learn and improve
+        from experience without being explicitly programmed. Deep learning uses
+        neural networks with many layers to analyze various factors of data.
+
+        Neural networks are computing systems inspired by biological neural networks.
+        They consist of interconnected nodes that process information using
+        connectionist approaches to computation. Modern neural networks can have
+        millions or billions of parameters.
+
+        Applications of AI
+
+        AI applications are transforming industries including healthcare, finance,
+        transportation, and entertainment. In healthcare, AI helps diagnose diseases
+        and develop new treatments. In finance, AI powers fraud detection and
+        algorithmic trading.
+
+        Autonomous vehicles use AI to perceive their environment and make driving
+        decisions. Virtual assistants use natural language processing to understand
+        and respond to user queries. Recommendation systems use AI to suggest
+        content based on user preferences.
+
+        Ethical Considerations
+
+        The field was founded on the assumption that human intelligence can be
+        so precisely described that a machine can be made to simulate it. This
+        raised philosophical arguments about the mind and the ethical consequences
+        of creating artificial beings endowed with human-like intelligence.
+
+        Major concerns include job displacement, algorithmic bias, privacy violations,
+        and the potential for misuse. Researchers and policymakers are working to
+        develop frameworks for responsible AI development and deployment.
+
+        The future of AI holds both tremendous promise and significant challenges.
+        As these systems become more capable, society must grapple with questions
+        about control, accountability, and the nature of intelligence itself.
+        """,
+    ),
+    TextSample(
+        name="Detailed - Full Article",
+        description="~4000-10000 tokens - triggers DETAILED level (3000-15000 token range)",
+        url="https://en.wikipedia.org/api/rest_v1/page/mobile-html/Machine_learning",
+        expected_level=SummaryLevel.DETAILED,
+        content_type="document",
+        fallback_content=None,  # We'll generate synthetic content
+    ),
+    TextSample(
+        name="Hierarchical - Long Document",
+        description="~16000+ tokens - triggers HIERARCHICAL level (>15000 tokens)",
+        url="https://www.gutenberg.org/cache/epub/84/pg84.txt",  # Frankenstein (truncated)
+        expected_level=SummaryLevel.HIERARCHICAL,
+        content_type="document",
+        fallback_content=None,  # We'll generate synthetic content (~16K tokens)
+    ),
+]
+
+
+def generate_synthetic_content(target_tokens: int, topic: str = "technology") -> str:
+    """Generate synthetic content for testing when URLs fail."""
+    # Each paragraph is roughly 50-100 tokens
+    paragraphs = [
+        f"Section on {topic} - Part {{i}}: This section explores various aspects "
+        f"of {topic} and its implications for modern society. The development of "
+        f"new technologies continues to reshape how we live and work. Researchers "
+        f"have made significant progress in understanding the fundamentals.",
+        f"The history of {topic} spans many decades of innovation. Early pioneers "
+        f"laid the groundwork for current advancements. Their contributions remain "
+        f"relevant today as we build upon established foundations.",
+        f"Current applications of {topic} include healthcare, transportation, and "
+        f"communication. These sectors have seen dramatic improvements in efficiency "
+        f"and capability. Future developments promise even greater transformations.",
+        f"Challenges in {topic} include ethical considerations, resource constraints, "
+        f"and technical limitations. Addressing these requires collaboration across "
+        f"disciplines. Solutions often emerge from unexpected directions.",
+        f"The future of {topic} looks promising with continued investment and research. "
+        f"Emerging trends suggest new possibilities. Stakeholders must prepare for "
+        f"rapid change while maintaining focus on beneficial outcomes.",
+    ]
+
+    result = []
+    tokens_per_para = 75  # approximate
+    needed_paragraphs = target_tokens // tokens_per_para + 1
+
+    for i in range(needed_paragraphs):
+        para = paragraphs[i % len(paragraphs)].format(i=i + 1)
+        result.append(para)
+
+    return "\n\n".join(result)
+
+
+async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str:  # noqa: PLR0912
+    """Fetch content from URL or use fallback."""
+    try:
+        # Add User-Agent header to avoid 403 errors from some sites
+        headers = {
+            "User-Agent": "Mozilla/5.0 (compatible; SummarizerDemo/1.0)",
+        }
+        response = await client.get(
+            sample.url,
+            timeout=30.0,
+            follow_redirects=True,
+            headers=headers,
+        )
+        response.raise_for_status()
+
+        content = response.text
+
+        # Handle Wikipedia API JSON responses
+        if "wikipedia.org/api" in sample.url:
+            try:
+                data = json.loads(content)
+                if "extract" in data:
+                    content = data["extract"]
+                elif "text" in data:
+                    content = data["text"]
+            except json.JSONDecodeError:
+                pass
+
+        # For httpbin JSON, create a readable summary
+        if "httpbin.org/json" in sample.url:
+            content = sample.fallback_content or ""
+
+        # Strip HTML tags if present
+        if "<" in content and ">" in content:
+            content = re.sub(r"<[^>]+>", " ", content)
+            content = re.sub(r"\s+", " ", content).strip()
+
+        # Check if content is too short for expected level
+        min_words_for_level = {
+            SummaryLevel.BRIEF: 80,  # Need ~100 tokens
+            SummaryLevel.STANDARD: 400,  # Need ~500 tokens
+            SummaryLevel.DETAILED: 2500,  # Need ~3000 tokens
+            SummaryLevel.HIERARCHICAL: 12000,  # Need ~15000 tokens
+        }
+        min_words = min_words_for_level.get(sample.expected_level, 50)
+
+        if len(content.split()) < min_words:
+            print(f"  📎 Fetched content too short ({len(content.split())} words), using fallback")
+            if sample.fallback_content:
+                content = sample.fallback_content
+            else:
+                target_tokens = {
+                    SummaryLevel.BRIEF: 300,
+                    SummaryLevel.STANDARD: 1500,
+                    SummaryLevel.DETAILED: 8000,
+                    SummaryLevel.HIERARCHICAL: 16000,  # Keep manageable for demo
+                }
+                content = generate_synthetic_content(
+                    target_tokens.get(sample.expected_level, 1000),
+                )
+
+        # For HIERARCHICAL, truncate very long content to keep demo fast
+        # but ensure we stay above 15000 tokens (~13000 words)
+        if sample.expected_level == SummaryLevel.HIERARCHICAL:
+            words = content.split()
+            # ~16000 tokens ≈ 13500 words (need >15000 tokens for HIERARCHICAL)
+            if len(words) > 13500:  # noqa: PLR2004
+                content = " ".join(words[:13500])
+                print("  📎 Truncated to ~13500 words for faster demo")
+
+        return content.strip()
+
+    except Exception as e:
+        print(f"  ⚠️  Failed to fetch URL: {e}")
+
+        if sample.fallback_content:
+            return sample.fallback_content.strip()
+
+        # Generate synthetic content for the expected level
+        target_tokens = {
+            SummaryLevel.BRIEF: 300,
+            SummaryLevel.STANDARD: 1500,
+            SummaryLevel.DETAILED: 8000,
+            SummaryLevel.HIERARCHICAL: 16000,  # Keep manageable for demo
+        }
+        return generate_synthetic_content(target_tokens.get(sample.expected_level, 1000))
+
+
+def print_result(sample: TextSample, result: SummaryResult, content: str) -> None:
+    """Print a formatted summary result."""
+    print("\n" + "=" * 70)
+    print(f"📄 {sample.name}")
+    print(f"   {sample.description}")
+    print("=" * 70)
+
+    # Input stats
+    word_count = len(content.split())
+    print("\n📊 Input Statistics:")
+    print(f"   Words: {word_count:,}")
+    print(f"   Tokens: {result.input_tokens:,}")
+    print(f"   Content type: {sample.content_type}")
+
+    # Summarization result
+    level_emoji = {
+        SummaryLevel.NONE: "⏭️",
+        SummaryLevel.BRIEF: "📝",
+        SummaryLevel.STANDARD: "📄",
+        SummaryLevel.DETAILED: "📚",
+        SummaryLevel.HIERARCHICAL: "🏗️",
+    }
+    print("\n🎯 Summarization Result:")
+    print(f"   Level: {level_emoji.get(result.level, '❓')} {result.level.name}")
+    print(f"   Expected: {sample.expected_level.name}")
+    print(f"   Match: {'✅' if result.level == sample.expected_level else '⚠️'}")
+    print(f"   Output tokens: {result.output_tokens:,}")
+    print(f"   Compression: {result.compression_ratio:.1%}")
+
+    # Summary content
+    if result.summary:
+        print("\n📝 Summary:")
+        wrapped = textwrap.fill(
+            result.summary,
+            width=68,
+            initial_indent="   ",
+            subsequent_indent="   ",
+        )
+        print(wrapped)
+
+    # Hierarchical details if present
+    if result.hierarchical:
+        h = result.hierarchical
+        print("\n🏗️  Hierarchical Structure:")
+        print(f"   L1 chunks: {len(h.l1_summaries)}")
+        print(f"   L2 groups: {len(h.l2_summaries)}")
+        if h.l2_summaries:
+            print(f"   L2 preview: {h.l2_summaries[0][:100]}...")
+        print("\n   L3 Final Summary:")
+        wrapped = textwrap.fill(
+            h.l3_summary,
+            width=68,
+            initial_indent="   ",
+            subsequent_indent="   ",
+        )
+        print(wrapped)
+
+
+async def run_demo(
+    level_filter: str | None = None,
+    model: str | None = None,
+    base_url: str | None = None,
+) -> None:
+    """Run the summarizer demo."""
+    # Configuration
+    actual_base_url = base_url or os.environ.get("OPENAI_BASE_URL", DEFAULT_BASE_URL)
+    actual_model = model or os.environ.get("OPENAI_MODEL", DEFAULT_MODEL)
+    api_key = os.environ.get("OPENAI_API_KEY", "not-needed-for-local")
+
+    print("🔧 Configuration:")
+    print(f"   Base URL: {actual_base_url}")
+    print(f"   Model: {actual_model}")
+
+    config = SummarizerConfig(
+        openai_base_url=actual_base_url,
+        model=actual_model,
+        api_key=api_key,
+        chunk_size=3000,
+        max_concurrent_chunks=3,
+        timeout=120.0,  # Longer timeout for local models
+    )
+
+    # Filter samples if requested
+    samples = SAMPLES
+    if level_filter:
+        level_map = {
+            "brief": SummaryLevel.BRIEF,
+            "standard": SummaryLevel.STANDARD,
+            "detailed": SummaryLevel.DETAILED,
+            "hierarchical": SummaryLevel.HIERARCHICAL,
+        }
+        target_level = level_map.get(level_filter.lower())
+        if target_level:
+            samples = [s for s in SAMPLES if s.expected_level == target_level]
+            print(f"\n🔍 Filtering to {level_filter.upper()} level only")
+
+    async with httpx.AsyncClient() as client:
+        for sample in samples:
+            print(f"\n⏳ Processing: {sample.name}...")
+
+            # Fetch content
+            content = await fetch_content(sample, client)
+
+            try:
+                # Summarize
+                result = await summarize(
+                    content=content,
+                    config=config,
+                    content_type=sample.content_type,
+                )
+
+                # Display results
+                print_result(sample, result, content)
+
+            except Exception as e:
+                print(f"\n❌ Error summarizing {sample.name}: {e}")
+
+                traceback.print_exc()
+
+    print("\n" + "=" * 70)
+    print("✅ Demo complete!")
+    print("=" * 70)
+
+
+def main() -> None:
+    """CLI entry point."""
+    parser = argparse.ArgumentParser(
+        description="Demonstrate adaptive summarization on texts of varying lengths",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=textwrap.dedent("""
+        Examples:
+          python examples/summarizer_demo.py
+          python examples/summarizer_demo.py --level standard
+          python examples/summarizer_demo.py --model "llama3.1:8b" --base-url "http://localhost:11434/v1"
+        """),
+    )
+
+    parser.add_argument(
+        "--level",
+        choices=["brief", "standard", "detailed", "hierarchical"],
+        help="Only test a specific summarization level",
+    )
+    parser.add_argument(
+        "--model",
+        help=f"Model to use (default: {DEFAULT_MODEL})",
+    )
+    parser.add_argument(
+        "--base-url",
+        help=f"OpenAI-compatible API base URL (default: {DEFAULT_BASE_URL})",
+    )
+
+    args = parser.parse_args()
+
+    asyncio.run(
+        run_demo(
+            level_filter=args.level,
+            model=args.model,
+            base_url=args.base_url,
+        ),
+    )
+
+
+if __name__ == "__main__":
+    main()

From 8c3768c3808650e2df452f07d304372f2e21747a Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 26 Nov 2025 21:54:45 -0800
Subject: [PATCH 13/37] refactor(summarizer): YAGNI cleanup and fix
 prior_context bug

Removed unused code:
- update_rolling_summary() - never called anywhere
- _raw_generate() fallback - errors should fail loudly
- retry/backoff logic - same reason
- parent_group from ChunkSummary - stored but never read
- ROLLING_SUMMARY_PROMPT - only used by removed function

Kept middle_truncate() - useful for handling very large inputs
(e.g., conversations with pasted codebases).

Bugfix:
- Add {prior_context} to CONVERSATION, JOURNAL, DOCUMENT prompts
- Previously prior_summary was silently ignored for non-"general" types
- Python's .format() ignores extra kwargs, hiding the bug

Updates documentation to reflect fail-fast error handling.
---
 agent_cli/summarizer/_prompts.py  |  20 ++---
 agent_cli/summarizer/adaptive.py  | 141 ++----------------------------
 agent_cli/summarizer/models.py    |   5 --
 docs/architecture/summarizer.md   |  62 ++++++-------
 tests/summarizer/test_adaptive.py | 141 ++----------------------------
 tests/summarizer/test_models.py   |  13 ---
 tests/summarizer/test_prompts.py  |  22 ++---
 7 files changed, 51 insertions(+), 353 deletions(-)

diff --git a/agent_cli/summarizer/_prompts.py b/agent_cli/summarizer/_prompts.py
index 101422b77..f46b39ebf 100644
--- a/agent_cli/summarizer/_prompts.py
+++ b/agent_cli/summarizer/_prompts.py
@@ -47,20 +47,6 @@
 
 Combined summary (maximum {max_words} words):""".strip()
 
-# Rolling summary update (Mem0-style)
-ROLLING_SUMMARY_PROMPT = """Update the running summary with new information.
-Integrate new facts seamlessly while keeping the summary concise.
-Drop redundant or superseded information.
-Preserve durable facts about identity, preferences, and important events.
-
-Current summary:
-{prior_summary}
-
-New information to integrate:
-{new_content}
-
-Updated summary (maximum {max_words} words):""".strip()
-
 # For conversation-specific summarization
 CONVERSATION_SUMMARY_PROMPT = """Summarize this conversation from the AI assistant's perspective.
 Focus on:
@@ -69,6 +55,8 @@
 - Decisions made or conclusions reached
 - Any commitments or follow-ups mentioned
 
+{prior_context}
+
 Conversation:
 {content}
 
@@ -82,6 +70,8 @@
 - Goals, plans, or intentions stated
 - People, places, or things that are important
 
+{prior_context}
+
 Entry:
 {content}
 
@@ -95,6 +85,8 @@
 - Important specifications or requirements
 - Conclusions or recommendations
 
+{prior_context}
+
 Document:
 {content}
 
diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py
index 590dabc55..989bd86ba 100644
--- a/agent_cli/summarizer/adaptive.py
+++ b/agent_cli/summarizer/adaptive.py
@@ -13,7 +13,6 @@
 import logging
 from dataclasses import dataclass
 
-import httpx
 from pydantic import BaseModel
 from pydantic_ai import Agent
 from pydantic_ai.models.openai import OpenAIChatModel
@@ -24,7 +23,6 @@
     BRIEF_SUMMARY_PROMPT,
     CHUNK_SUMMARY_PROMPT,
     META_SUMMARY_PROMPT,
-    ROLLING_SUMMARY_PROMPT,
     format_prior_context,
     format_summaries_for_meta,
     get_prompt_for_content_type,
@@ -33,7 +31,6 @@
     chunk_text,
     count_tokens,
     estimate_summary_tokens,
-    middle_truncate,
     tokens_to_words,
 )
 from agent_cli.summarizer.models import (
@@ -59,14 +56,6 @@
 # Minimum number of L1 chunks before L2 grouping is applied
 L2_MIN_CHUNKS = 5
 
-# Retry settings for summarization failures
-MAX_SUMMARIZE_RETRIES = 3
-
-# Maximum characters per chunk before applying middle truncation
-# This prevents context overflow errors for very large chunks
-# (roughly 12K tokens with cl100k_base encoding)
-MAX_CHUNK_CHARS = 48000
-
 
 class SummaryOutput(BaseModel):
     """Structured output for summary generation."""
@@ -199,93 +188,32 @@ async def summarize(
     )
 
 
-async def update_rolling_summary(
-    prior_summary: str | None,
-    new_facts: list[str],
-    config: SummarizerConfig,
-) -> str:
-    """Update a rolling summary with new facts (Mem0-style).
-
-    This is optimized for incremental updates where you have discrete
-    new facts to integrate into an existing summary.
-
-    Args:
-        prior_summary: The existing summary to update.
-        new_facts: List of new facts to integrate.
-        config: Summarizer configuration.
-
-    Returns:
-        Updated summary string.
-
-    """
-    if not new_facts:
-        return prior_summary or ""
-
-    new_content = "\n".join(f"- {fact}" for fact in new_facts)
-    combined_tokens = count_tokens(
-        (prior_summary or "") + new_content,
-        config.model,
-    )
-
-    target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD)
-    max_words = tokens_to_words(target_tokens)
-
-    prompt = ROLLING_SUMMARY_PROMPT.format(
-        prior_summary=prior_summary or "(No prior summary)",
-        new_content=new_content,
-        max_words=max_words,
-    )
-
-    return await _generate_summary(prompt, config, max_tokens=target_tokens + 50)
-
-
 async def _summarize_single_chunk(
     chunk: str,
     chunk_index: int,
     total_chunks: int,
     config: SummarizerConfig,
-    *,
-    parent_group: int | None = None,
 ) -> ChunkSummary:
     """Summarize a single chunk of content.
 
-    Uses middle truncation as a fallback for oversized content (Letta-style).
-
     Args:
         chunk: The text chunk to summarize.
         chunk_index: Index of this chunk (0-based).
         total_chunks: Total number of chunks being processed.
         config: Summarizer configuration.
-        parent_group: Optional L2 group index for hierarchical summaries.
 
     Returns:
         ChunkSummary with the summarized content.
 
     """
-    # Apply middle truncation if chunk is too large (Letta-style fallback)
     source_tokens = count_tokens(chunk, config.model)
-    content_to_summarize = chunk
-    if len(chunk) > MAX_CHUNK_CHARS:
-        content_to_summarize, dropped = middle_truncate(
-            chunk,
-            MAX_CHUNK_CHARS,
-            head_frac=0.3,
-            tail_frac=0.3,
-        )
-        logger.warning(
-            "Chunk %d truncated: dropped %d chars to fit context window",
-            chunk_index,
-            dropped,
-        )
-
-    chunk_tokens = count_tokens(content_to_summarize, config.model)
-    target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD)
+    target_tokens = estimate_summary_tokens(source_tokens, SummaryLevel.STANDARD)
     max_words = tokens_to_words(target_tokens)
 
     prompt = CHUNK_SUMMARY_PROMPT.format(
         chunk_index=chunk_index + 1,
         total_chunks=total_chunks,
-        content=content_to_summarize,
+        content=chunk,
         max_words=max_words,
     )
 
@@ -296,8 +224,7 @@ async def _summarize_single_chunk(
         chunk_index=chunk_index,
         content=summary,
         token_count=summary_tokens,
-        source_tokens=source_tokens,  # Report original token count
-        parent_group=parent_group,
+        source_tokens=source_tokens,
     )
 
 
@@ -355,7 +282,6 @@ async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary:
                 idx,
                 len(chunks),
                 config,
-                parent_group=None,
             )
 
     chunk_summaries = await asyncio.gather(
@@ -423,14 +349,11 @@ async def _hierarchical_summary(
 
     async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary:
         async with semaphore:
-            # Assign to L2 group (L2_GROUP_SIZE chunks per group)
-            group_idx = idx // L2_GROUP_SIZE
             return await _summarize_single_chunk(
                 chunk,
                 idx,
                 len(chunks),
                 config,
-                parent_group=group_idx,
             )
 
     l1_summaries = await asyncio.gather(
@@ -497,25 +420,19 @@ async def _generate_summary(
     prompt: str,
     config: SummarizerConfig,
     max_tokens: int = 256,
-    *,
-    attempt: int = 0,
 ) -> str:
     """Generate a summary using the LLM.
 
-    Uses PydanticAI for structured output with fallback to raw generation.
-    Implements exponential backoff retry on failures.
-
     Args:
         prompt: The prompt to send to the LLM.
         config: Summarizer configuration.
         max_tokens: Maximum tokens for the response.
-        attempt: Current retry attempt (for internal recursion).
 
     Returns:
         The generated summary text.
 
     Raises:
-        SummarizationError: If all retries are exhausted.
+        SummarizationError: If summarization fails.
 
     """
     provider = OpenAIProvider(api_key=config.api_key, base_url=config.openai_base_url)
@@ -539,51 +456,5 @@ async def _generate_summary(
         result = await agent.run(prompt)
         return result.output.summary.strip()
     except Exception as e:
-        logger.warning("Structured summary failed, trying raw generation: %s", e)
-        # Fallback to raw HTTP call
-        try:
-            return await _raw_generate(prompt, config, max_tokens)
-        except Exception as raw_err:
-            if attempt < MAX_SUMMARIZE_RETRIES:
-                wait_time = 2**attempt  # Exponential backoff: 1, 2, 4 seconds
-                logger.warning(
-                    "Raw generation failed (attempt %d/%d), retrying in %ds: %s",
-                    attempt + 1,
-                    MAX_SUMMARIZE_RETRIES,
-                    wait_time,
-                    raw_err,
-                )
-                await asyncio.sleep(wait_time)
-                return await _generate_summary(
-                    prompt,
-                    config,
-                    max_tokens,
-                    attempt=attempt + 1,
-                )
-            msg = f"Summarization failed after {MAX_SUMMARIZE_RETRIES} retries"
-            raise SummarizationError(msg) from raw_err
-
-
-async def _raw_generate(prompt: str, config: SummarizerConfig, max_tokens: int) -> str:
-    """Fallback raw HTTP generation without structured output."""
-    async with httpx.AsyncClient(timeout=config.timeout) as client:
-        response = await client.post(
-            f"{config.openai_base_url}/chat/completions",
-            headers={"Authorization": f"Bearer {config.api_key}"},
-            json={
-                "model": config.model,
-                "messages": [
-                    {"role": "system", "content": "You are a concise summarizer."},
-                    {"role": "user", "content": prompt},
-                ],
-                "temperature": 0.3,
-                "max_tokens": max_tokens,
-            },
-        )
-        response.raise_for_status()
-        data = response.json()
-
-    choices = data.get("choices", [])
-    if choices:
-        return choices[0].get("message", {}).get("content", "").strip()
-    return ""
+        msg = f"Summarization failed: {e}"
+        raise SummarizationError(msg) from e
diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py
index 843d1dfe5..4f5c51191 100644
--- a/agent_cli/summarizer/models.py
+++ b/agent_cli/summarizer/models.py
@@ -44,10 +44,6 @@ class ChunkSummary(BaseModel):
     content: str = Field(..., description="The summarized content of this chunk")
     token_count: int = Field(..., ge=0, description="Token count of this summary")
     source_tokens: int = Field(..., ge=0, description="Token count of the source chunk")
-    parent_group: int | None = Field(
-        default=None,
-        description="Index of the L2 group this chunk belongs to",
-    )
 
 
 class HierarchicalSummary(BaseModel):
@@ -156,7 +152,6 @@ def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]:
                         "role": "summary",
                         "level": HIERARCHICAL_LEVEL_L1,
                         "chunk_index": cs.chunk_index,
-                        "parent_group": cs.parent_group,
                         "token_count": cs.token_count,
                         "created_at": timestamp,
                     },
diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md
index 940ddddb8..59f1dbb5e 100644
--- a/docs/architecture/summarizer.md
+++ b/docs/architecture/summarizer.md
@@ -8,19 +8,19 @@ The adaptive summarizer provides **content-aware compression** that scales summa
 
 ```
 ┌─────────────────────────────────────────────────────────────────────┐
-│                    Adaptive Summarization Pipeline                   │
+│                    Adaptive Summarization Pipeline                  │
 ├─────────────────────────────────────────────────────────────────────┤
 │                                                                     │
-│  Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy    │
+│  Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy     │
 │                                                                     │
-│  ┌─────────────────────────────────────────────────────────────┐   │
-│  │ Level Thresholds:                                           │   │
-│  │   < 100 tokens  ──▶ NONE        (no summary needed)         │   │
-│  │   100-500       ──▶ BRIEF       (single sentence)           │   │
-│  │   500-3000      ──▶ STANDARD    (paragraph)                 │   │
-│  │   3000-15000    ──▶ DETAILED    (chunked + meta)            │   │
-│  │   > 15000       ──▶ HIERARCHICAL (L1/L2/L3 tree)            │   │
-│  └─────────────────────────────────────────────────────────────┘   │
+│  ┌─────────────────────────────────────────────────────────────┐    │
+│  │ Level Thresholds:                                           │    │
+│  │   < 100 tokens  ──▶ NONE        (no summary needed)         │    │
+│  │   100-500       ──▶ BRIEF       (single sentence)           │    │
+│  │   500-3000      ──▶ STANDARD    (paragraph)                 │    │
+│  │   3000-15000    ──▶ DETAILED    (chunked + meta)            │    │
+│  │   > 15000       ──▶ HIERARCHICAL (L1/L2/L3 tree)            │    │
+│  └─────────────────────────────────────────────────────────────┘    │
 │                                                                     │
 │  Output: SummaryResult with compression metrics                     │
 └─────────────────────────────────────────────────────────────────────┘
@@ -310,19 +310,19 @@ def chunk_text(
 
 ### 4.5 Middle Truncation (Utility)
 
-For contexts where the summary exceeds available space:
+For handling very large inputs that could exceed context windows:
 
 ```python
 def middle_truncate(
     text: str,
-    token_budget: int,
-    head_fraction: float = 0.3,
-    tail_fraction: float = 0.7,
-) -> str:
+    budget_chars: int,
+    head_frac: float = 0.3,
+    tail_frac: float = 0.3,
+) -> tuple[str, int]:
     """Keep head and tail, remove middle (least likely to contain key info)."""
 ```
 
-**Rationale:** Research shows that important information clusters at beginnings (introductions, key points) and endings (conclusions, action items).
+**Rationale:** Research shows that important information clusters at beginnings (introductions, key points) and endings (conclusions, action items). Useful when summarizing very long conversations that may contain pasted codebases.
 
 ---
 
@@ -376,22 +376,9 @@ Section Summaries:
 Synthesized Summary:
 ```
 
-### 5.5 Rolling Summary (`ROLLING_PROMPT`)
+### 5.5 Content-Type Prompts
 
-```
-Update the existing summary to incorporate new information.
-Preserve important historical context while integrating new facts.
-
-Existing Summary:
-{prior_summary}
-
-New Information:
-{new_facts}
-
-Updated Summary:
-```
-
-### 5.6 Content-Type Prompts
+All content-type prompts include `{prior_context}` for rolling summary continuity.
 
 **Conversation:**
 ```
@@ -493,14 +480,15 @@ def get_final_summary(
 
 ## 8. Error Handling
 
-### 8.1 Graceful Degradation
+### 8.1 Fail-Fast Philosophy
+
+Errors are propagated rather than hidden behind fallbacks:
 
-| Error | Fallback |
+| Error | Behavior |
 | :--- | :--- |
-| LLM timeout | Return input unchanged with NONE level |
-| LLM error | Retry up to 3 times, then return NONE |
-| Token counting failure | Estimate based on character count (÷4) |
-| Chunking failure | Fall back to character-based splitting |
+| LLM timeout | Raises `SummarizationError` |
+| LLM error | Raises `SummarizationError` |
+| Token counting failure | Falls back to `cl100k_base` encoding |
 
 ### 8.2 Validation
 
diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py
index f5db1486c..ac04bc126 100644
--- a/tests/summarizer/test_adaptive.py
+++ b/tests/summarizer/test_adaptive.py
@@ -8,13 +8,12 @@
 
 from agent_cli.summarizer.adaptive import (
     LEVEL_THRESHOLDS,
+    SummarizationError,
     SummarizerConfig,
     SummaryOutput,
     _generate_summary,
-    _raw_generate,
     determine_level,
     summarize,
-    update_rolling_summary,
 )
 from agent_cli.summarizer.models import SummaryLevel, SummaryResult
 
@@ -257,82 +256,6 @@ async def test_hierarchical_level_calls_hierarchical_summary(
         assert result.level == SummaryLevel.HIERARCHICAL
 
 
-class TestUpdateRollingSummary:
-    """Tests for rolling summary updates."""
-
-    @pytest.fixture
-    def config(self) -> SummarizerConfig:
-        """Create a config instance."""
-        return SummarizerConfig(
-            openai_base_url="http://localhost:8000/v1",
-            model="gpt-4",
-        )
-
-    @pytest.mark.asyncio
-    async def test_empty_facts_returns_prior(self, config: SummarizerConfig) -> None:
-        """Test that empty facts list returns prior summary."""
-        result = await update_rolling_summary(
-            prior_summary="Existing summary",
-            new_facts=[],
-            config=config,
-        )
-        assert result == "Existing summary"
-
-    @pytest.mark.asyncio
-    async def test_empty_facts_no_prior_returns_empty(
-        self,
-        config: SummarizerConfig,
-    ) -> None:
-        """Test that empty facts with no prior returns empty string."""
-        result = await update_rolling_summary(
-            prior_summary=None,
-            new_facts=[],
-            config=config,
-        )
-        assert result == ""
-
-    @pytest.mark.asyncio
-    @patch("agent_cli.summarizer.adaptive._generate_summary")
-    async def test_new_facts_calls_generate(
-        self,
-        mock_generate: AsyncMock,
-        config: SummarizerConfig,
-    ) -> None:
-        """Test that new facts trigger summary generation."""
-        mock_generate.return_value = "Updated summary with new facts."
-
-        result = await update_rolling_summary(
-            prior_summary="Old summary",
-            new_facts=["User likes coffee", "User lives in Amsterdam"],
-            config=config,
-        )
-
-        mock_generate.assert_called_once()
-        assert result == "Updated summary with new facts."
-
-    @pytest.mark.asyncio
-    @patch("agent_cli.summarizer.adaptive._generate_summary")
-    async def test_facts_formatted_as_list(
-        self,
-        mock_generate: AsyncMock,
-        config: SummarizerConfig,
-    ) -> None:
-        """Test that facts are formatted as bullet list in prompt."""
-        mock_generate.return_value = "Summary"
-
-        await update_rolling_summary(
-            prior_summary="Prior",
-            new_facts=["Fact one", "Fact two"],
-            config=config,
-        )
-
-        # Check the prompt contains formatted facts
-        call_args = mock_generate.call_args
-        prompt = call_args[0][0]
-        assert "- Fact one" in prompt
-        assert "- Fact two" in prompt
-
-
 class TestGenerateSummary:
     """Tests for _generate_summary function."""
 
@@ -365,72 +288,18 @@ async def test_generate_summary_with_pydantic_ai(
             mock_agent.run.assert_called_once_with("Test prompt")
 
     @pytest.mark.asyncio
-    @patch("agent_cli.summarizer.adaptive._raw_generate")
-    async def test_fallback_to_raw_generate_on_error(
+    async def test_raises_summarization_error_on_failure(
         self,
-        mock_raw: AsyncMock,
         config: SummarizerConfig,
     ) -> None:
-        """Test fallback to raw HTTP on PydanticAI error."""
-        mock_raw.return_value = "Fallback summary"
-
+        """Test that SummarizationError is raised on failure."""
         with patch("agent_cli.summarizer.adaptive.Agent") as mock_agent_class:
             mock_agent = MagicMock()
             mock_agent.run = AsyncMock(side_effect=Exception("API error"))
             mock_agent_class.return_value = mock_agent
 
-            result = await _generate_summary("Test prompt", config, max_tokens=100)
-
-            mock_raw.assert_called_once_with("Test prompt", config, 100)
-            assert result == "Fallback summary"
-
-
-class TestRawGenerate:
-    """Tests for _raw_generate fallback function."""
-
-    @pytest.fixture
-    def config(self) -> SummarizerConfig:
-        """Create a config instance."""
-        return SummarizerConfig(
-            openai_base_url="http://localhost:8000/v1",
-            model="gpt-4",
-        )
-
-    @pytest.mark.asyncio
-    async def test_raw_generate_success(self, config: SummarizerConfig) -> None:
-        """Test successful raw HTTP generation."""
-        mock_response = MagicMock()
-        mock_response.json.return_value = {
-            "choices": [{"message": {"content": "Raw generated summary"}}],
-        }
-
-        with patch("httpx.AsyncClient") as mock_client_class:
-            mock_client = MagicMock()
-            mock_client.post = AsyncMock(return_value=mock_response)
-            mock_client.__aenter__ = AsyncMock(return_value=mock_client)
-            mock_client.__aexit__ = AsyncMock(return_value=None)
-            mock_client_class.return_value = mock_client
-
-            result = await _raw_generate("Test prompt", config, max_tokens=100)
-
-            assert result == "Raw generated summary"
-
-    @pytest.mark.asyncio
-    async def test_raw_generate_empty_choices(self, config: SummarizerConfig) -> None:
-        """Test raw generate with empty choices returns empty string."""
-        mock_response = MagicMock()
-        mock_response.json.return_value = {"choices": []}
-
-        with patch("httpx.AsyncClient") as mock_client_class:
-            mock_client = MagicMock()
-            mock_client.post = AsyncMock(return_value=mock_response)
-            mock_client.__aenter__ = AsyncMock(return_value=mock_client)
-            mock_client.__aexit__ = AsyncMock(return_value=None)
-            mock_client_class.return_value = mock_client
-
-            result = await _raw_generate("Test prompt", config, max_tokens=100)
-
-            assert result == ""
+            with pytest.raises(SummarizationError, match="Summarization failed"):
+                await _generate_summary("Test prompt", config, max_tokens=100)
 
 
 class TestSummaryOutput:
diff --git a/tests/summarizer/test_models.py b/tests/summarizer/test_models.py
index e27fa18e0..23509d2e3 100644
--- a/tests/summarizer/test_models.py
+++ b/tests/summarizer/test_models.py
@@ -43,24 +43,11 @@ def test_basic_creation(self) -> None:
             content="This is a summary of chunk 1.",
             token_count=10,
             source_tokens=100,
-            parent_group=None,
         )
         assert chunk.chunk_index == 0
         assert chunk.content == "This is a summary of chunk 1."
         assert chunk.token_count == 10
         assert chunk.source_tokens == 100
-        assert chunk.parent_group is None
-
-    def test_with_parent_group(self) -> None:
-        """Test creating a chunk summary with parent group."""
-        chunk = ChunkSummary(
-            chunk_index=5,
-            content="Summary text",
-            token_count=8,
-            source_tokens=200,
-            parent_group=1,
-        )
-        assert chunk.parent_group == 1
 
     def test_validation_negative_tokens(self) -> None:
         """Test that negative token counts fail validation."""
diff --git a/tests/summarizer/test_prompts.py b/tests/summarizer/test_prompts.py
index 05937f71a..660229709 100644
--- a/tests/summarizer/test_prompts.py
+++ b/tests/summarizer/test_prompts.py
@@ -9,7 +9,6 @@
     DOCUMENT_SUMMARY_PROMPT,
     JOURNAL_SUMMARY_PROMPT,
     META_SUMMARY_PROMPT,
-    ROLLING_SUMMARY_PROMPT,
     STANDARD_SUMMARY_PROMPT,
     format_prior_context,
     format_summaries_for_meta,
@@ -71,26 +70,23 @@ def test_meta_prompt_has_placeholders(self) -> None:
         assert "Summary 1" in result
         assert "200" in result
 
-    def test_rolling_prompt_has_placeholders(self) -> None:
-        """Test ROLLING prompt contains required placeholders."""
-        assert "{prior_summary}" in ROLLING_SUMMARY_PROMPT
-        assert "{new_content}" in ROLLING_SUMMARY_PROMPT
-        assert "{max_words}" in ROLLING_SUMMARY_PROMPT
-
-    def test_conversation_prompt_has_content(self) -> None:
-        """Test CONVERSATION prompt contains content placeholder."""
+    def test_conversation_prompt_has_placeholders(self) -> None:
+        """Test CONVERSATION prompt contains required placeholders."""
         assert "{content}" in CONVERSATION_SUMMARY_PROMPT
         assert "{max_words}" in CONVERSATION_SUMMARY_PROMPT
+        assert "{prior_context}" in CONVERSATION_SUMMARY_PROMPT
 
-    def test_journal_prompt_has_content(self) -> None:
-        """Test JOURNAL prompt contains content placeholder."""
+    def test_journal_prompt_has_placeholders(self) -> None:
+        """Test JOURNAL prompt contains required placeholders."""
         assert "{content}" in JOURNAL_SUMMARY_PROMPT
         assert "{max_words}" in JOURNAL_SUMMARY_PROMPT
+        assert "{prior_context}" in JOURNAL_SUMMARY_PROMPT
 
-    def test_document_prompt_has_content(self) -> None:
-        """Test DOCUMENT prompt contains content placeholder."""
+    def test_document_prompt_has_placeholders(self) -> None:
+        """Test DOCUMENT prompt contains required placeholders."""
         assert "{content}" in DOCUMENT_SUMMARY_PROMPT
         assert "{max_words}" in DOCUMENT_SUMMARY_PROMPT
+        assert "{prior_context}" in DOCUMENT_SUMMARY_PROMPT
 
 
 class TestGetPromptForContentType:

From a171aafcd46d9c45cb24a8d704dd437c74000629 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 26 Nov 2025 22:25:41 -0800
Subject: [PATCH 14/37] feat(cli): add summarize command for adaptive
 hierarchical summarization

Expose the full power of the summarizer through a CLI command that:
- Follows existing CLI patterns using shared opts module
- Supports all LLM providers (ollama, openai, gemini)
- Offers content-type prompts (general, conversation, journal, document)
- Provides output formats (text, json, full hierarchical)
- Includes chunking options and rolling summary support
- Reads from file or stdin
---
 agent_cli/agents/__init__.py  |   2 +
 agent_cli/agents/summarize.py | 435 ++++++++++++++++++++++++++++++++++
 agent_cli/cli.py              |   1 +
 3 files changed, 438 insertions(+)
 create mode 100644 agent_cli/agents/summarize.py

diff --git a/agent_cli/agents/__init__.py b/agent_cli/agents/__init__.py
index 1ec88de0a..6a0c7838d 100644
--- a/agent_cli/agents/__init__.py
+++ b/agent_cli/agents/__init__.py
@@ -7,6 +7,7 @@
     memory,
     rag_proxy,
     speak,
+    summarize,
     transcribe,
     voice_edit,
 )
@@ -18,6 +19,7 @@
     "memory",
     "rag_proxy",
     "speak",
+    "summarize",
     "transcribe",
     "voice_edit",
 ]
diff --git a/agent_cli/agents/summarize.py b/agent_cli/agents/summarize.py
new file mode 100644
index 000000000..abc8dfc72
--- /dev/null
+++ b/agent_cli/agents/summarize.py
@@ -0,0 +1,435 @@
+"""Summarize text files or stdin using adaptive hierarchical summarization."""
+
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import json
+import sys
+import time
+from enum import Enum
+from pathlib import Path  # noqa: TC003
+from typing import TYPE_CHECKING
+
+import typer
+
+from agent_cli import config, opts
+from agent_cli.cli import app
+from agent_cli.core.utils import (
+    console,
+    create_status,
+    print_command_line_args,
+    print_error_message,
+    print_input_panel,
+    print_output_panel,
+    print_with_style,
+    setup_logging,
+)
+from agent_cli.summarizer import SummarizationError, SummarizerConfig, summarize
+from agent_cli.summarizer._utils import count_tokens
+
+if TYPE_CHECKING:
+    from agent_cli.summarizer import SummaryResult
+
+
+class ContentType(str, Enum):
+    """Content type for specialized summarization prompts."""
+
+    general = "general"
+    conversation = "conversation"
+    journal = "journal"
+    document = "document"
+
+
+class OutputFormat(str, Enum):
+    """Output format for the summarization result."""
+
+    text = "text"
+    json = "json"
+    full = "full"
+
+
+def _read_input(file_path: Path | None) -> str | None:
+    """Read input from file or stdin."""
+    if file_path:
+        if not file_path.exists():
+            print_error_message(
+                f"File not found: {file_path}",
+                "Please check the file path and try again.",
+            )
+            return None
+        return file_path.read_text(encoding="utf-8")
+
+    # Read from stdin
+    if sys.stdin.isatty():
+        print_error_message(
+            "No input provided",
+            "Provide a file path or pipe content via stdin.",
+        )
+        return None
+
+    return sys.stdin.read()
+
+
+def _display_input_preview(
+    content: str,
+    token_count: int,
+    *,
+    quiet: bool,
+    max_preview_chars: int = 500,
+) -> None:
+    """Display a preview of the input content."""
+    if quiet:
+        return
+
+    preview = content[:max_preview_chars]
+    if len(content) > max_preview_chars:
+        preview += f"\n... [{len(content) - max_preview_chars} more characters]"
+
+    print_input_panel(
+        preview,
+        title=f"Input ({token_count:,} tokens)",
+    )
+
+
+def _display_result(
+    result: SummaryResult,
+    elapsed: float,
+    output_format: OutputFormat,
+    *,
+    quiet: bool,
+) -> None:
+    """Display the summarization result."""
+    if output_format == OutputFormat.json:
+        print(json.dumps(result.model_dump(mode="json"), indent=2))
+        return
+
+    if output_format == OutputFormat.full:
+        _display_full_result(result, elapsed, quiet=quiet)
+        return
+
+    # Text output - just the summary
+    if quiet:
+        if result.summary:
+            print(result.summary)
+    elif result.summary:
+        print_output_panel(
+            result.summary,
+            title=f"Summary (Level: {result.level.name})",
+            subtitle=f"[dim]{result.output_tokens:,} tokens | {result.compression_ratio:.1%} of original | {elapsed:.2f}s[/dim]",
+        )
+    else:
+        print_with_style(
+            f"No summary generated (input too short: {result.input_tokens} tokens)",
+            style="yellow",
+        )
+
+
+def _display_full_result(
+    result: SummaryResult,
+    elapsed: float,
+    *,
+    quiet: bool,
+) -> None:
+    """Display full hierarchical result with all levels."""
+    if quiet:
+        if result.summary:
+            print(result.summary)
+        return
+
+    console.print()
+    console.print("[bold cyan]Summarization Result[/bold cyan]")
+    console.print(f"  Level: [bold]{result.level.name}[/bold]")
+    console.print(f"  Input tokens: [bold]{result.input_tokens:,}[/bold]")
+    console.print(f"  Output tokens: [bold]{result.output_tokens:,}[/bold]")
+    console.print(f"  Compression: [bold]{result.compression_ratio:.1%}[/bold]")
+    console.print(f"  Time: [bold]{elapsed:.2f}s[/bold]")
+    console.print()
+
+    if result.hierarchical:
+        if result.hierarchical.l1_summaries:
+            console.print(
+                f"[bold yellow]L1 Chunk Summaries "
+                f"({len(result.hierarchical.l1_summaries)} chunks)[/bold yellow]",
+            )
+            for cs in result.hierarchical.l1_summaries:
+                console.print(
+                    f"\n[dim]--- Chunk {cs.chunk_index + 1} "
+                    f"({cs.source_tokens:,} → {cs.token_count:,} tokens) ---[/dim]",
+                )
+                console.print(cs.content)
+
+        if result.hierarchical.l2_summaries:
+            console.print(
+                f"\n[bold yellow]L2 Group Summaries "
+                f"({len(result.hierarchical.l2_summaries)} groups)[/bold yellow]",
+            )
+            for idx, l2_summary in enumerate(result.hierarchical.l2_summaries):
+                console.print(f"\n[dim]--- Group {idx + 1} ---[/dim]")
+                console.print(l2_summary)
+
+        console.print("\n[bold green]L3 Final Summary[/bold green]")
+        print_output_panel(result.hierarchical.l3_summary, title="Final Summary")
+    elif result.summary:
+        print_output_panel(
+            result.summary,
+            title=f"Summary ({result.level.name})",
+        )
+
+
+def _get_llm_config(
+    provider_cfg: config.ProviderSelection,
+    ollama_cfg: config.Ollama,
+    openai_llm_cfg: config.OpenAILLM,
+    gemini_llm_cfg: config.GeminiLLM,
+) -> tuple[str, str, str | None]:
+    """Get openai_base_url, model, and api_key from provider config."""
+    if provider_cfg.llm_provider == "ollama":
+        # Ollama uses OpenAI-compatible API at /v1
+        base_url = ollama_cfg.llm_ollama_host.rstrip("/")
+        if not base_url.endswith("/v1"):
+            base_url = f"{base_url}/v1"
+        return base_url, ollama_cfg.llm_ollama_model, None
+    if provider_cfg.llm_provider == "openai":
+        base_url = openai_llm_cfg.openai_base_url or "https://api.openai.com/v1"
+        return base_url, openai_llm_cfg.llm_openai_model, openai_llm_cfg.openai_api_key
+    # gemini
+    return (
+        "https://generativelanguage.googleapis.com/v1beta/openai",
+        gemini_llm_cfg.llm_gemini_model,
+        gemini_llm_cfg.gemini_api_key,
+    )
+
+
+async def _async_summarize(
+    content: str,
+    *,
+    content_type: ContentType,
+    prior_summary: str | None,
+    provider_cfg: config.ProviderSelection,
+    ollama_cfg: config.Ollama,
+    openai_llm_cfg: config.OpenAILLM,
+    gemini_llm_cfg: config.GeminiLLM,
+    general_cfg: config.General,
+    chunk_size: int,
+    chunk_overlap: int,
+    max_concurrent_chunks: int,
+    output_format: OutputFormat,
+) -> None:
+    """Asynchronous summarization entry point."""
+    setup_logging(general_cfg.log_level, general_cfg.log_file, quiet=general_cfg.quiet)
+
+    openai_base_url, model, api_key = _get_llm_config(
+        provider_cfg,
+        ollama_cfg,
+        openai_llm_cfg,
+        gemini_llm_cfg,
+    )
+
+    token_count = count_tokens(content, model)
+    _display_input_preview(content, token_count, quiet=general_cfg.quiet)
+
+    summarizer_config = SummarizerConfig(
+        openai_base_url=openai_base_url,
+        model=model,
+        api_key=api_key,
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        max_concurrent_chunks=max_concurrent_chunks,
+    )
+
+    try:
+        if not general_cfg.quiet:
+            status = create_status(f"Summarizing with {model}...", "bold yellow")
+        else:
+            status = contextlib.nullcontext()
+
+        with status:
+            start_time = time.monotonic()
+            result = await summarize(
+                content,
+                summarizer_config,
+                prior_summary=prior_summary,
+                content_type=content_type.value,
+            )
+            elapsed = time.monotonic() - start_time
+
+        _display_result(result, elapsed, output_format, quiet=general_cfg.quiet)
+
+    except SummarizationError as e:
+        print_error_message(
+            str(e),
+            f"Check that your LLM server is running at {openai_base_url}",
+        )
+        sys.exit(1)
+    except Exception as e:
+        print_error_message(str(e), "An unexpected error occurred during summarization.")
+        sys.exit(1)
+
+
+@app.command("summarize")
+def summarize_command(
+    *,
+    file_path: Path | None = typer.Argument(  # noqa: B008
+        None,
+        help="Path to file to summarize. If not provided, reads from stdin.",
+    ),
+    # --- Content Options ---
+    content_type: ContentType = typer.Option(  # noqa: B008
+        ContentType.general,
+        "--type",
+        "-t",
+        help="Content type for specialized summarization prompts.",
+        rich_help_panel="Content Options",
+    ),
+    prior_summary: str | None = typer.Option(
+        None,
+        "--prior-summary",
+        help="Prior summary to integrate with (for rolling summaries).",
+        rich_help_panel="Content Options",
+    ),
+    prior_summary_file: Path | None = typer.Option(  # noqa: B008
+        None,
+        "--prior-summary-file",
+        help="File containing prior summary to integrate with.",
+        rich_help_panel="Content Options",
+    ),
+    # --- Chunking Options ---
+    chunk_size: int = typer.Option(
+        3000,
+        "--chunk-size",
+        help="Target token count per chunk for hierarchical summarization.",
+        rich_help_panel="Chunking Options",
+    ),
+    chunk_overlap: int = typer.Option(
+        200,
+        "--chunk-overlap",
+        help="Token overlap between chunks for context continuity.",
+        rich_help_panel="Chunking Options",
+    ),
+    max_concurrent_chunks: int = typer.Option(
+        5,
+        "--max-concurrent",
+        help="Maximum number of chunks to process in parallel.",
+        rich_help_panel="Chunking Options",
+    ),
+    # --- Output Options ---
+    output_format: OutputFormat = typer.Option(  # noqa: B008
+        OutputFormat.text,
+        "--output",
+        "-o",
+        help="Output format: 'text' (summary only), 'json' (full result), 'full' (all levels).",
+        rich_help_panel="Output Options",
+    ),
+    # --- Provider Selection ---
+    llm_provider: str = opts.LLM_PROVIDER,
+    # --- LLM Configuration ---
+    # Ollama (local service)
+    llm_ollama_model: str = opts.LLM_OLLAMA_MODEL,
+    llm_ollama_host: str = opts.LLM_OLLAMA_HOST,
+    # OpenAI
+    llm_openai_model: str = opts.LLM_OPENAI_MODEL,
+    openai_api_key: str | None = opts.OPENAI_API_KEY,
+    openai_base_url: str | None = opts.OPENAI_BASE_URL,
+    # Gemini
+    llm_gemini_model: str = opts.LLM_GEMINI_MODEL,
+    gemini_api_key: str | None = opts.GEMINI_API_KEY,
+    # --- General Options ---
+    log_level: str = opts.LOG_LEVEL,
+    log_file: str | None = opts.LOG_FILE,
+    quiet: bool = opts.QUIET,
+    config_file: str | None = opts.CONFIG_FILE,
+    print_args: bool = opts.PRINT_ARGS,
+) -> None:
+    """Summarize text using adaptive hierarchical summarization.
+
+    Reads from a file or stdin and produces a summary scaled to the input complexity:
+
+    - NONE (<100 tokens): No summary needed
+    - BRIEF (100-500): Single sentence
+    - STANDARD (500-3000): Paragraph
+    - DETAILED (3000-15000): Chunked with meta-summary
+    - HIERARCHICAL (>15000): Full L1/L2/L3 tree
+
+    Examples:
+        # Summarize a file
+        agent-cli summarize document.txt
+
+        # Summarize with conversation-specific prompts
+        agent-cli summarize chat.txt --type conversation
+
+        # Pipe content from stdin
+        cat book.txt | agent-cli summarize
+
+        # Get full hierarchical output
+        agent-cli summarize large_document.txt --output full
+
+        # Use OpenAI instead of Ollama
+        agent-cli summarize notes.md --llm-provider openai
+
+    """
+    if print_args:
+        print_command_line_args(locals())
+
+    # Create config objects following the standard pattern
+    provider_cfg = config.ProviderSelection(
+        llm_provider=llm_provider,
+        asr_provider="wyoming",  # Not used, but required by model
+        tts_provider="wyoming",  # Not used, but required by model
+    )
+    ollama_cfg = config.Ollama(
+        llm_ollama_model=llm_ollama_model,
+        llm_ollama_host=llm_ollama_host,
+    )
+    openai_llm_cfg = config.OpenAILLM(
+        llm_openai_model=llm_openai_model,
+        openai_api_key=openai_api_key,
+        openai_base_url=openai_base_url,
+    )
+    gemini_llm_cfg = config.GeminiLLM(
+        llm_gemini_model=llm_gemini_model,
+        gemini_api_key=gemini_api_key,
+    )
+    general_cfg = config.General(
+        log_level=log_level,
+        log_file=log_file,
+        quiet=quiet,
+        clipboard=False,  # summarize doesn't use clipboard
+    )
+
+    # Read content
+    content = _read_input(file_path)
+    if content is None:
+        raise typer.Exit(1)
+
+    if not content.strip():
+        print_error_message("Empty input", "The input file or stdin is empty.")
+        raise typer.Exit(1)
+
+    # Handle prior summary from file
+    actual_prior_summary = prior_summary
+    if prior_summary_file:
+        if not prior_summary_file.exists():
+            print_error_message(
+                f"Prior summary file not found: {prior_summary_file}",
+                "Please check the file path.",
+            )
+            raise typer.Exit(1)
+        actual_prior_summary = prior_summary_file.read_text(encoding="utf-8")
+
+    asyncio.run(
+        _async_summarize(
+            content,
+            content_type=content_type,
+            prior_summary=actual_prior_summary,
+            provider_cfg=provider_cfg,
+            ollama_cfg=ollama_cfg,
+            openai_llm_cfg=openai_llm_cfg,
+            gemini_llm_cfg=gemini_llm_cfg,
+            general_cfg=general_cfg,
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            max_concurrent_chunks=max_concurrent_chunks,
+            output_format=output_format,
+        ),
+    )
diff --git a/agent_cli/cli.py b/agent_cli/cli.py
index 981404159..3542f41fc 100644
--- a/agent_cli/cli.py
+++ b/agent_cli/cli.py
@@ -121,6 +121,7 @@ def set_config_defaults(ctx: typer.Context, config_file: str | None) -> dict[str
     memory,
     rag_proxy,
     speak,
+    summarize,
     transcribe,
     transcribe_live,
     voice_edit,

From 8dff17ff47c60ad9c629a449fac59c1ab9b0fb22 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 26 Nov 2025 22:35:19 -0800
Subject: [PATCH 15/37] refactor(memory): remove dead parent_group field and
 bundle metadata args

- Remove unused parent_group from MemoryMetadata (was never assigned)
- Refactor write_memory_file to accept optional MemoryMetadata object
  instead of 17 individual parameters
- Simplify upsert_hierarchical_summary to use MemoryMetadata(**dict)
- Rename summary_level to summary_level_name for consistency
- Make tiktoken optional in token counting with fallback heuristic
---
 agent_cli/memory/_files.py           | 88 ++++++++++++++--------------
 agent_cli/memory/_persistence.py     | 35 ++++++-----
 agent_cli/memory/_store.py           | 17 +-----
 agent_cli/memory/models.py           |  2 -
 agent_cli/summarizer/_utils.py       | 17 +++++-
 agent_cli/summarizer/models.py       |  4 +-
 docs/architecture/summarizer.md      |  2 -
 tests/memory/test_store.py           |  4 +-
 tests/summarizer/test_integration.py |  7 +--
 tests/summarizer/test_models.py      |  4 +-
 10 files changed, 82 insertions(+), 98 deletions(-)

diff --git a/agent_cli/memory/_files.py b/agent_cli/memory/_files.py
index 65fbbc1b2..0bb0a5d94 100644
--- a/agent_cli/memory/_files.py
+++ b/agent_cli/memory/_files.py
@@ -87,76 +87,74 @@ def soft_delete_memory_file(
 def write_memory_file(
     root: Path,
     *,
-    conversation_id: str,
-    role: str,
-    created_at: str,
     content: str,
-    summary_kind: str | None = None,
     doc_id: str | None = None,
+    # Either pass pre-built metadata OR individual fields
+    metadata: MemoryMetadata | None = None,
+    # Individual fields (used when metadata is None)
+    conversation_id: str | None = None,
+    role: str | None = None,
+    created_at: str | None = None,
+    summary_kind: str | None = None,
     source_id: str | None = None,
-    # Hierarchical summary fields
-    level: int | None = None,
-    is_final: bool | None = None,
-    chunk_index: int | None = None,
-    parent_group: int | None = None,
-    group_index: int | None = None,
-    input_tokens: int | None = None,
-    output_tokens: int | None = None,
-    compression_ratio: float | None = None,
-    summary_level_name: str | None = None,
 ) -> MemoryFileRecord:
-    """Render and persist a memory document to disk."""
+    """Render and persist a memory document to disk.
+
+    Can be called in two ways:
+    1. With pre-built metadata: write_memory_file(root, content=..., metadata=..., doc_id=...)
+    2. With individual fields: write_memory_file(root, content=..., conversation_id=..., role=..., ...)
+
+    """
     entries_dir, _ = ensure_store_dirs(root)
-    safe_conversation = _slugify(conversation_id)
     doc_id = doc_id or str(uuid4())
-    safe_ts = _safe_timestamp(created_at)
+
+    # Build or use provided metadata
+    if metadata is not None:
+        meta = metadata
+    else:
+        if conversation_id is None or role is None or created_at is None:
+            msg = "Must provide metadata or (conversation_id, role, created_at)"
+            raise ValueError(msg)
+        meta = MemoryMetadata(
+            conversation_id=conversation_id,
+            role=role,
+            created_at=created_at,
+            summary_kind=summary_kind,
+            source_id=source_id,
+        )
+
+    safe_conversation = _slugify(meta.conversation_id)
+    safe_ts = _safe_timestamp(meta.created_at)
 
     # Route by role/category for readability
-    if summary_kind and level is not None:
+    if meta.summary_kind and meta.level is not None:
         # Hierarchical summary file structure
-        if level == _SUMMARY_LEVEL_L1:
+        if meta.level == _SUMMARY_LEVEL_L1:
             subdir = Path("summaries") / "L1"
-            filename = f"chunk_{chunk_index or 0}.md"
-        elif level == _SUMMARY_LEVEL_L2:
+            filename = f"chunk_{meta.chunk_index or 0}.md"
+        elif meta.level == _SUMMARY_LEVEL_L2:
             subdir = Path("summaries") / "L2"
-            filename = f"group_{group_index or 0}.md"
+            filename = f"group_{meta.group_index or 0}.md"
         else:  # level == _SUMMARY_LEVEL_L3
             subdir = Path("summaries") / "L3"
             filename = "final.md"
-    elif summary_kind:
+    elif meta.summary_kind:
         subdir = Path("summaries")
         filename = "summary.md"
-    elif role == "user":
+    elif meta.role == "user":
         subdir = Path("turns") / "user"
         filename = f"{safe_ts}__{doc_id}.md"
-    elif role == "assistant":
+    elif meta.role == "assistant":
         subdir = Path("turns") / "assistant"
         filename = f"{safe_ts}__{doc_id}.md"
-    elif role == "memory":
+    elif meta.role == "memory":
         subdir = Path("facts")
         filename = f"{safe_ts}__{doc_id}.md"
     else:
         subdir = Path()
         filename = f"{doc_id}.md"
 
-    metadata = MemoryMetadata(
-        conversation_id=conversation_id,
-        role=role,
-        created_at=created_at,
-        summary_kind=summary_kind,
-        source_id=source_id,
-        level=level,
-        is_final=is_final,
-        chunk_index=chunk_index,
-        parent_group=parent_group,
-        group_index=group_index,
-        input_tokens=input_tokens,
-        output_tokens=output_tokens,
-        compression_ratio=compression_ratio,
-        summary_level_name=summary_level_name,
-    )
-
-    front_matter = _render_front_matter(doc_id, metadata)
+    front_matter = _render_front_matter(doc_id, meta)
     body = front_matter + "\n" + content.strip() + "\n"
 
     file_path = entries_dir / safe_conversation / subdir / filename
@@ -164,7 +162,7 @@ def write_memory_file(
 
     atomic_write_text(file_path, body)
 
-    return MemoryFileRecord(id=doc_id, path=file_path, metadata=metadata, content=content)
+    return MemoryFileRecord(id=doc_id, path=file_path, metadata=meta, content=content)
 
 
 def load_memory_files(root: Path) -> list[MemoryFileRecord]:
diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py
index 91585ade8..2af3a2687 100644
--- a/agent_cli/memory/_persistence.py
+++ b/agent_cli/memory/_persistence.py
@@ -25,13 +25,13 @@
     upsert_memories,
 )
 from agent_cli.memory.entities import Fact, Turn
+from agent_cli.memory.models import MemoryMetadata
 
 if TYPE_CHECKING:
     from pathlib import Path
 
     from chromadb import Collection
 
-    from agent_cli.memory.models import MemoryMetadata
     from agent_cli.summarizer import SummaryResult
 
 LOGGER = logging.getLogger(__name__)
@@ -212,26 +212,29 @@ def persist_hierarchical_summary(
     created_at = datetime.now(UTC).isoformat()
 
     for entry in entries:
-        meta = entry["metadata"]
+        meta_dict = entry["metadata"]
+        # Build MemoryMetadata from the summary result's metadata dict
+        metadata = MemoryMetadata(
+            conversation_id=meta_dict["conversation_id"],
+            role=meta_dict["role"],
+            created_at=meta_dict.get("created_at", created_at),
+            summary_kind="summary",
+            level=meta_dict.get("level"),
+            is_final=meta_dict.get("is_final"),
+            chunk_index=meta_dict.get("chunk_index"),
+            group_index=meta_dict.get("group_index"),
+            input_tokens=meta_dict.get("input_tokens"),
+            output_tokens=meta_dict.get("output_tokens"),
+            compression_ratio=meta_dict.get("compression_ratio"),
+            summary_level_name=meta_dict.get("summary_level_name"),
+        )
         record = write_memory_file(
             memory_root,
-            conversation_id=meta["conversation_id"],
-            role=meta["role"],
-            created_at=meta.get("created_at", created_at),
             content=entry["content"],
-            summary_kind="summary",
             doc_id=entry["id"],
-            level=meta.get("level"),
-            is_final=meta.get("is_final"),
-            chunk_index=meta.get("chunk_index"),
-            parent_group=meta.get("parent_group"),
-            group_index=meta.get("group_index"),
-            input_tokens=meta.get("input_tokens"),
-            output_tokens=meta.get("output_tokens"),
-            compression_ratio=meta.get("compression_ratio"),
-            summary_level_name=meta.get("summary_level"),
+            metadata=metadata,
         )
-        LOGGER.info("Persisted summary file: %s (level=%s)", record.path, meta.get("level"))
+        LOGGER.info("Persisted summary file: %s (level=%s)", record.path, meta_dict.get("level"))
         stored_ids.append(record.id)
 
     # Store in ChromaDB
diff --git a/agent_cli/memory/_store.py b/agent_cli/memory/_store.py
index 722dcda9e..b668a2d3b 100644
--- a/agent_cli/memory/_store.py
+++ b/agent_cli/memory/_store.py
@@ -176,22 +176,7 @@ def upsert_hierarchical_summary(
         contents.append(entry["content"])
         # Convert the raw metadata dict to MemoryMetadata
         meta_dict = entry["metadata"]
-        metadatas.append(
-            MemoryMetadata(
-                conversation_id=meta_dict["conversation_id"],
-                role=meta_dict["role"],
-                created_at=meta_dict["created_at"],
-                level=meta_dict.get("level"),
-                is_final=meta_dict.get("is_final"),
-                chunk_index=meta_dict.get("chunk_index"),
-                parent_group=meta_dict.get("parent_group"),
-                group_index=meta_dict.get("group_index"),
-                input_tokens=meta_dict.get("input_tokens"),
-                output_tokens=meta_dict.get("output_tokens"),
-                compression_ratio=meta_dict.get("compression_ratio"),
-                summary_level_name=meta_dict.get("summary_level"),
-            ),
-        )
+        metadatas.append(MemoryMetadata(**meta_dict))
 
     upsert_memories(collection, ids=ids, contents=contents, metadatas=metadatas)
     return ids
diff --git a/agent_cli/memory/models.py b/agent_cli/memory/models.py
index 4eb289c7d..06266c575 100644
--- a/agent_cli/memory/models.py
+++ b/agent_cli/memory/models.py
@@ -56,8 +56,6 @@ class MemoryMetadata(BaseModel):
     """Whether this is the final L3 summary."""
     chunk_index: int | None = None
     """For L1 summaries: index of the source chunk."""
-    parent_group: int | None = None
-    """For L1 summaries: which L2 group this chunk belongs to."""
     group_index: int | None = None
     """For L2 summaries: index of this group."""
     input_tokens: int | None = None
diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py
index 030b5729e..731c55058 100644
--- a/agent_cli/summarizer/_utils.py
+++ b/agent_cli/summarizer/_utils.py
@@ -13,12 +13,16 @@
 
 
 @lru_cache(maxsize=4)
-def _get_encoding(model: str = "gpt-4") -> tiktoken.Encoding:
+def _get_encoding(model: str = "gpt-4") -> tiktoken.Encoding | None:
     """Get tiktoken encoding for a model, with caching.
 
     Falls back to cl100k_base for unknown models (covers most modern LLMs).
+    Returns None when tiktoken is not installed so callers can use a heuristic.
     """
-    import tiktoken  # noqa: PLC0415
+    try:
+        import tiktoken  # noqa: PLC0415
+    except ModuleNotFoundError:
+        return None
 
     try:
         return tiktoken.encoding_for_model(model)
@@ -27,7 +31,7 @@ def _get_encoding(model: str = "gpt-4") -> tiktoken.Encoding:
 
 
 def count_tokens(text: str, model: str = "gpt-4") -> int:
-    """Count tokens in text using tiktoken.
+    """Count tokens in text using tiktoken, with a lightweight fallback.
 
     Args:
         text: The text to count tokens for.
@@ -40,11 +44,18 @@ def count_tokens(text: str, model: str = "gpt-4") -> int:
     if not text:
         return 0
     enc = _get_encoding(model)
+    if enc is None:
+        return _estimate_token_count(text)
     # Disable special token checking - LLM outputs may contain special tokens
     # like <|constrain|>, <|endoftext|>, etc. that we want to count normally
     return len(enc.encode(text, disallowed_special=()))
 
 
+def _estimate_token_count(text: str) -> int:
+    """Very rough token estimate based on character length (~4 chars/token)."""
+    return max(1, (len(text) + 3) // 4)
+
+
 def chunk_text(
     text: str,
     chunk_size: int = 3000,
diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py
index 4f5c51191..ce6da9082 100644
--- a/agent_cli/summarizer/models.py
+++ b/agent_cli/summarizer/models.py
@@ -185,7 +185,7 @@ def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]:
                         "role": "summary",
                         "level": HIERARCHICAL_LEVEL_L3,
                         "is_final": True,
-                        "summary_level": self.level.name,
+                        "summary_level_name": self.level.name,
                         "input_tokens": self.input_tokens,
                         "output_tokens": self.output_tokens,
                         "compression_ratio": self.compression_ratio,
@@ -204,7 +204,7 @@ def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]:
                         "role": "summary",
                         "level": HIERARCHICAL_LEVEL_L3,
                         "is_final": True,
-                        "summary_level": self.level.name,
+                        "summary_level_name": self.level.name,
                         "input_tokens": self.input_tokens,
                         "output_tokens": self.output_tokens,
                         "compression_ratio": self.compression_ratio,
diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md
index 59f1dbb5e..ec7b769f2 100644
--- a/docs/architecture/summarizer.md
+++ b/docs/architecture/summarizer.md
@@ -169,7 +169,6 @@ class ChunkSummary(BaseModel):
     content: str              # The summarized text
     token_count: int          # Tokens in this summary
     source_tokens: int        # Tokens in source chunk
-    parent_group: int | None  # L2 group this belongs to
 
 class HierarchicalSummary(BaseModel):
     l1_summaries: list[ChunkSummary]  # Individual chunk summaries
@@ -191,7 +190,6 @@ Summaries are stored with rich metadata for retrieval and management:
 | `level` | ✓ | ✓ | ✓ | 1, 2, or 3 |
 | `chunk_index` | ✓ | | | Position in L1 sequence |
 | `group_index` | | ✓ | | Position in L2 sequence |
-| `parent_group` | ✓ | | | Which L2 group owns this L1 |
 | `is_final` | | | ✓ | Marks the top-level summary |
 | `summary_level` | | | ✓ | Name of SummaryLevel enum |
 | `input_tokens` | | | ✓ | Original content token count |
diff --git a/tests/memory/test_store.py b/tests/memory/test_store.py
index 453a21a9a..0851d9637 100644
--- a/tests/memory/test_store.py
+++ b/tests/memory/test_store.py
@@ -159,7 +159,7 @@ def test_upsert_hierarchical_summary_simple() -> None:
                 "role": "summary",
                 "level": 3,
                 "is_final": True,
-                "summary_level": "STANDARD",
+                "summary_level_name": "STANDARD",
                 "input_tokens": 1000,
                 "output_tokens": 50,
                 "compression_ratio": 0.05,
@@ -192,7 +192,6 @@ def test_upsert_hierarchical_summary_with_chunks() -> None:
                 "role": "summary",
                 "level": 1,
                 "chunk_index": 0,
-                "parent_group": 0,
                 "created_at": "2024-01-01T00:00:00",
             },
         },
@@ -204,7 +203,6 @@ def test_upsert_hierarchical_summary_with_chunks() -> None:
                 "role": "summary",
                 "level": 1,
                 "chunk_index": 1,
-                "parent_group": 0,
                 "created_at": "2024-01-01T00:00:00",
             },
         },
diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py
index 6eeb133ed..5cb97115d 100644
--- a/tests/summarizer/test_integration.py
+++ b/tests/summarizer/test_integration.py
@@ -115,7 +115,7 @@ def test_standard_summary_produces_single_entry(self) -> None:
         assert entry["content"] == "A paragraph summary of the content."
         assert entry["metadata"]["level"] == 3
         assert entry["metadata"]["is_final"] is True
-        assert entry["metadata"]["summary_level"] == "STANDARD"
+        assert entry["metadata"]["summary_level_name"] == "STANDARD"
 
     def test_hierarchical_summary_produces_multiple_entries(self) -> None:
         """Test that HIERARCHICAL level produces L1, L2, L3 entries."""
@@ -125,21 +125,18 @@ def test_hierarchical_summary_produces_multiple_entries(self) -> None:
                 content="Chunk 0",
                 token_count=10,
                 source_tokens=100,
-                parent_group=0,
             ),
             ChunkSummary(
                 chunk_index=1,
                 content="Chunk 1",
                 token_count=10,
                 source_tokens=100,
-                parent_group=0,
             ),
             ChunkSummary(
                 chunk_index=2,
                 content="Chunk 2",
                 token_count=10,
                 source_tokens=100,
-                parent_group=0,
             ),
         ]
         hierarchical = HierarchicalSummary(
@@ -257,14 +254,12 @@ def test_persist_hierarchical_creates_files(
                 content="Chunk 0 content",
                 token_count=10,
                 source_tokens=100,
-                parent_group=0,
             ),
             ChunkSummary(
                 chunk_index=1,
                 content="Chunk 1 content",
                 token_count=10,
                 source_tokens=100,
-                parent_group=0,
             ),
         ]
         hierarchical = HierarchicalSummary(
diff --git a/tests/summarizer/test_models.py b/tests/summarizer/test_models.py
index 23509d2e3..d39621119 100644
--- a/tests/summarizer/test_models.py
+++ b/tests/summarizer/test_models.py
@@ -223,7 +223,7 @@ def test_to_storage_metadata_simple_summary(self) -> None:
         assert entry["metadata"]["role"] == "summary"
         assert entry["metadata"]["level"] == 3
         assert entry["metadata"]["is_final"] is True
-        assert entry["metadata"]["summary_level"] == "STANDARD"
+        assert entry["metadata"]["summary_level_name"] == "STANDARD"
 
     def test_to_storage_metadata_hierarchical(self) -> None:
         """Test storage metadata for hierarchical summary."""
@@ -233,14 +233,12 @@ def test_to_storage_metadata_hierarchical(self) -> None:
                 content="Chunk 0 text",
                 token_count=10,
                 source_tokens=100,
-                parent_group=0,
             ),
             ChunkSummary(
                 chunk_index=1,
                 content="Chunk 1 text",
                 token_count=12,
                 source_tokens=120,
-                parent_group=0,
             ),
         ]
         hierarchical = HierarchicalSummary(

From 08e9ac5e23388d6857a836ccadb95775458e758a Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 26 Nov 2025 22:52:39 -0800
Subject: [PATCH 16/37] perf: lazy imports for pydantic_ai, sounddevice, and
 numpy

Improve CLI startup time from ~0.51s to ~0.16s (69% faster) by deferring
heavy imports until they're actually needed:

- pydantic_ai: lazy in memory/_ingest.py, summarizer/adaptive.py, rag/engine.py
- sounddevice: lazy in core/audio.py (moved to TYPE_CHECKING + function imports)
- numpy: lazy in rag/_retriever.py and services/tts.py

Update tests to patch modules directly (e.g., pydantic_ai.Agent) instead of
through module attributes that no longer exist at import time.

Add scripts/profile_imports.py for measuring import performance.
---
 agent_cli/summarizer/adaptive.py  |   9 +-
 scripts/profile_imports.py        | 141 ++++++++++++++++++++++++++++++
 tests/summarizer/test_adaptive.py |   4 +-
 3 files changed, 148 insertions(+), 6 deletions(-)
 create mode 100755 scripts/profile_imports.py

diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py
index 989bd86ba..99fa4641a 100644
--- a/agent_cli/summarizer/adaptive.py
+++ b/agent_cli/summarizer/adaptive.py
@@ -14,10 +14,6 @@
 from dataclasses import dataclass
 
 from pydantic import BaseModel
-from pydantic_ai import Agent
-from pydantic_ai.models.openai import OpenAIChatModel
-from pydantic_ai.providers.openai import OpenAIProvider
-from pydantic_ai.settings import ModelSettings
 
 from agent_cli.summarizer._prompts import (
     BRIEF_SUMMARY_PROMPT,
@@ -435,6 +431,11 @@ async def _generate_summary(
         SummarizationError: If summarization fails.
 
     """
+    from pydantic_ai import Agent  # noqa: PLC0415
+    from pydantic_ai.models.openai import OpenAIChatModel  # noqa: PLC0415
+    from pydantic_ai.providers.openai import OpenAIProvider  # noqa: PLC0415
+    from pydantic_ai.settings import ModelSettings  # noqa: PLC0415
+
     provider = OpenAIProvider(api_key=config.api_key, base_url=config.openai_base_url)
     model = OpenAIChatModel(
         model_name=config.model,
diff --git a/scripts/profile_imports.py b/scripts/profile_imports.py
new file mode 100755
index 000000000..d70b5b39e
--- /dev/null
+++ b/scripts/profile_imports.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+"""Profile CLI import times to identify slow imports.
+
+Usage:
+    python scripts/profile_imports.py              # Basic timing
+    python scripts/profile_imports.py -v           # Verbose (show all imports)
+    python scripts/profile_imports.py --top 20     # Show top 20 slowest
+    python scripts/profile_imports.py --cli-only   # Just measure CLI startup time
+
+    # Raw importtime output (for detailed analysis):
+    python -X importtime -c "from agent_cli.cli import app" 2>&1 | sort -t'|' -k2 -n
+"""
+
+from __future__ import annotations
+
+import argparse
+import subprocess
+import sys
+import time
+from pathlib import Path
+
+
+def measure_import_time(module: str, runs: int = 3) -> float:
+    """Measure average import time for a module."""
+    times = []
+    for _ in range(runs):
+        start = time.perf_counter()
+        result = subprocess.run(
+            [sys.executable, "-c", f"import {module}"],
+            check=False,
+            capture_output=True,
+            cwd=Path(__file__).parent.parent,
+        )
+        elapsed = time.perf_counter() - start
+        if result.returncode != 0:
+            print(f"Error importing {module}: {result.stderr.decode()}")
+            return -1
+        times.append(elapsed)
+    return sum(times) / len(times)
+
+
+def get_import_breakdown(module: str) -> list[tuple[float, str]]:
+    """Get detailed import times using -X importtime."""
+    result = subprocess.run(
+        [sys.executable, "-X", "importtime", "-c", f"import {module}"],
+        check=False,
+        capture_output=True,
+        text=True,
+        cwd=Path(__file__).parent.parent,
+    )
+
+    imports = []
+    for line in result.stderr.splitlines():
+        if "|" not in line:
+            continue
+        parts = line.split("|")
+        if len(parts) >= 2:  # noqa: PLR2004
+            try:
+                # importtime format: "import time: self [us] | cumulative | name"
+                cumulative = int(parts[1].strip())
+                name = parts[2].strip() if len(parts) > 2 else "unknown"  # noqa: PLR2004
+                imports.append((cumulative / 1_000_000, name))  # Convert to seconds
+            except (ValueError, IndexError):
+                continue
+
+    return sorted(imports, reverse=True)
+
+
+def main() -> None:
+    """Run import profiling and display results."""
+    parser = argparse.ArgumentParser(description="Profile CLI import times")
+    parser.add_argument("-v", "--verbose", action="store_true", help="Show all imports")
+    parser.add_argument("--top", type=int, default=15, help="Show top N slowest imports")
+    parser.add_argument("--runs", type=int, default=3, help="Number of runs for averaging")
+    parser.add_argument("--cli-only", action="store_true", help="Only measure CLI import time")
+    args = parser.parse_args()
+
+    if args.cli_only:
+        avg = measure_import_time("agent_cli.cli", runs=args.runs)
+        print(f"CLI import time: {avg:.3f}s (avg of {args.runs} runs)")
+        return
+
+    print("=" * 60)
+    print("CLI Import Time Profiling")
+    print("=" * 60)
+
+    # Measure key entry points
+    modules = [
+        ("agent_cli", "Base package"),
+        ("agent_cli.cli", "CLI app (full)"),
+        ("agent_cli.memory", "Memory module (chromadb)"),
+        ("agent_cli.rag", "RAG module"),
+        ("agent_cli.summarizer", "Summarizer module"),
+        ("agent_cli.agents.assistant", "Assistant agent"),
+        ("agent_cli.agents.summarize", "Summarize agent"),
+        ("pydantic_ai", "pydantic-ai"),
+        ("openai", "OpenAI SDK"),
+    ]
+
+    print(f"\n{'Module':<30} {'Time (s)':<12} Description")
+    print("-" * 60)
+
+    for module, desc in modules:
+        avg_time = measure_import_time(module, runs=args.runs)
+        if avg_time >= 0:
+            bar = "█" * int(avg_time * 20)  # Visual bar (1 block = 50ms)
+            print(f"{module:<30} {avg_time:>8.3f}s   {desc} {bar}")
+
+    # Detailed breakdown
+    print(f"\n{'=' * 60}")
+    print(f"Top {args.top} slowest imports (cumulative time)")
+    print("=" * 60)
+
+    imports = get_import_breakdown("agent_cli.cli")
+
+    shown = 0
+    for cumtime, name in imports:
+        if shown >= args.top and not args.verbose:
+            break
+        # Skip very fast imports unless verbose
+        if cumtime < 0.001 and not args.verbose:  # noqa: PLR2004
+            continue
+        bar = "█" * int(cumtime * 100)  # 1 block = 10ms
+        print(f"{cumtime:>8.3f}s  {name:<40} {bar}")
+        shown += 1
+
+    # Summary
+    if imports:
+        total = imports[0][0] if imports else 0
+        print(f"\n{'=' * 60}")
+        print(f"Total CLI import time: {total:.3f}s")
+        if total > 0.5:  # noqa: PLR2004
+            print("⚠️  Import time > 500ms - consider lazy imports")
+        elif total > 0.3:  # noqa: PLR2004
+            print("⚡ Import time moderate (300-500ms)")
+        else:
+            print("✅ Import time good (< 300ms)")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py
index ac04bc126..6acf43171 100644
--- a/tests/summarizer/test_adaptive.py
+++ b/tests/summarizer/test_adaptive.py
@@ -277,7 +277,7 @@ async def test_generate_summary_with_pydantic_ai(
         mock_result = MagicMock()
         mock_result.output = SummaryOutput(summary="Generated summary.")
 
-        with patch("agent_cli.summarizer.adaptive.Agent") as mock_agent_class:
+        with patch("pydantic_ai.Agent") as mock_agent_class:
             mock_agent = MagicMock()
             mock_agent.run = AsyncMock(return_value=mock_result)
             mock_agent_class.return_value = mock_agent
@@ -293,7 +293,7 @@ async def test_raises_summarization_error_on_failure(
         config: SummarizerConfig,
     ) -> None:
         """Test that SummarizationError is raised on failure."""
-        with patch("agent_cli.summarizer.adaptive.Agent") as mock_agent_class:
+        with patch("pydantic_ai.Agent") as mock_agent_class:
             mock_agent = MagicMock()
             mock_agent.run = AsyncMock(side_effect=Exception("API error"))
             mock_agent_class.return_value = mock_agent

From 22d82c46ba1ea953b4f380521df3d5de7b7ab1ba Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 26 Nov 2025 23:08:46 -0800
Subject: [PATCH 17/37] refactor: reduce duplication in memory store and
 summarizer

- Extract upsert_summary_entries() to avoid double to_storage_metadata() call
- Extract _summarize_chunks() helper for async chunk processing pipeline
---
 agent_cli/memory/_persistence.py |  6 ++--
 agent_cli/memory/_store.py       | 40 +++++++++++++++++-----
 agent_cli/summarizer/adaptive.py | 59 ++++++++++++++++----------------
 3 files changed, 64 insertions(+), 41 deletions(-)

diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py
index 2af3a2687..1bb2102d4 100644
--- a/agent_cli/memory/_persistence.py
+++ b/agent_cli/memory/_persistence.py
@@ -21,8 +21,8 @@
     delete_entries,
     delete_summaries,
     list_conversation_entries,
-    upsert_hierarchical_summary,
     upsert_memories,
+    upsert_summary_entries,
 )
 from agent_cli.memory.entities import Fact, Turn
 from agent_cli.memory.models import MemoryMetadata
@@ -237,8 +237,8 @@ def persist_hierarchical_summary(
         LOGGER.info("Persisted summary file: %s (level=%s)", record.path, meta_dict.get("level"))
         stored_ids.append(record.id)
 
-    # Store in ChromaDB
-    upsert_hierarchical_summary(collection, conversation_id, summary_result)
+    # Store in ChromaDB (reuse the entries we already built)
+    upsert_summary_entries(collection, entries)
 
     return stored_ids
 
diff --git a/agent_cli/memory/_store.py b/agent_cli/memory/_store.py
index b668a2d3b..88edb8c5d 100644
--- a/agent_cli/memory/_store.py
+++ b/agent_cli/memory/_store.py
@@ -144,26 +144,24 @@ def delete_entries(collection: Collection, ids: list[str]) -> None:
     delete_docs(collection, ids)
 
 
-def upsert_hierarchical_summary(
+def upsert_summary_entries(
     collection: Collection,
-    conversation_id: str,
-    summary_result: Any,
+    entries: list[dict[str, Any]],
 ) -> list[str]:
-    """Store all levels of a hierarchical summary.
+    """Store pre-built summary entries to ChromaDB.
 
-    Uses SummaryResult.to_storage_metadata() to generate ChromaDB entries
-    for L1 (chunk), L2 (group), and L3 (final) summaries.
+    This is the low-level helper that accepts entries already built by
+    SummaryResult.to_storage_metadata(). Use this when you already have
+    the entries (e.g., after writing files) to avoid duplicate serialization.
 
     Args:
         collection: ChromaDB collection.
-        conversation_id: The conversation this summary belongs to.
-        summary_result: A SummaryResult from the adaptive summarizer.
+        entries: List of entry dicts with 'id', 'content', and 'metadata' keys.
 
     Returns:
         List of IDs that were upserted.
 
     """
-    entries = summary_result.to_storage_metadata(conversation_id)
     if not entries:
         return []
 
@@ -182,6 +180,30 @@ def upsert_hierarchical_summary(
     return ids
 
 
+def upsert_hierarchical_summary(
+    collection: Collection,
+    conversation_id: str,
+    summary_result: Any,
+) -> list[str]:
+    """Store all levels of a hierarchical summary.
+
+    Convenience wrapper that calls to_storage_metadata() and then
+    upsert_summary_entries(). If you already have the entries built,
+    call upsert_summary_entries() directly to avoid duplicate work.
+
+    Args:
+        collection: ChromaDB collection.
+        conversation_id: The conversation this summary belongs to.
+        summary_result: A SummaryResult from the adaptive summarizer.
+
+    Returns:
+        List of IDs that were upserted.
+
+    """
+    entries = summary_result.to_storage_metadata(conversation_id)
+    return upsert_summary_entries(collection, entries)
+
+
 def get_summary_at_level(
     collection: Collection,
     conversation_id: str,
diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py
index 99fa4641a..7d24ef760 100644
--- a/agent_cli/summarizer/adaptive.py
+++ b/agent_cli/summarizer/adaptive.py
@@ -184,6 +184,34 @@ async def summarize(
     )
 
 
+async def _summarize_chunks(
+    chunks: list[str],
+    config: SummarizerConfig,
+) -> list[ChunkSummary]:
+    """Summarize multiple chunks with concurrency control.
+
+    This helper centralizes the semaphore/gather pattern used by both
+    _detailed_summary and _hierarchical_summary.
+
+    Args:
+        chunks: List of text chunks to summarize.
+        config: Summarizer configuration (includes max_concurrent_chunks).
+
+    Returns:
+        List of ChunkSummary objects in the same order as input chunks.
+
+    """
+    semaphore = asyncio.Semaphore(config.max_concurrent_chunks)
+    total = len(chunks)
+
+    async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary:
+        async with semaphore:
+            return await _summarize_single_chunk(chunk, idx, total, config)
+
+    gen = (summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks))
+    return list(await asyncio.gather(*gen))
+
+
 async def _summarize_single_chunk(
     chunk: str,
     chunk_index: int,
@@ -268,21 +296,7 @@ async def _detailed_summary(
 
     logger.info("Detailed summary: processing %d chunks", len(chunks))
 
-    # Summarize chunks (with concurrency limit)
-    semaphore = asyncio.Semaphore(config.max_concurrent_chunks)
-
-    async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary:
-        async with semaphore:
-            return await _summarize_single_chunk(
-                chunk,
-                idx,
-                len(chunks),
-                config,
-            )
-
-    chunk_summaries = await asyncio.gather(
-        *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)],
-    )
+    chunk_summaries = await _summarize_chunks(chunks, config)
 
     # Generate meta-summary
     all_summaries = [cs.content for cs in chunk_summaries]
@@ -341,20 +355,7 @@ async def _hierarchical_summary(
     logger.info("Hierarchical summary: processing %d chunks in tree", len(chunks))
 
     # L1: Summarize each chunk
-    semaphore = asyncio.Semaphore(config.max_concurrent_chunks)
-
-    async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary:
-        async with semaphore:
-            return await _summarize_single_chunk(
-                chunk,
-                idx,
-                len(chunks),
-                config,
-            )
-
-    l1_summaries = await asyncio.gather(
-        *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)],
-    )
+    l1_summaries = await _summarize_chunks(chunks, config)
 
     # L2: Group summaries (if more than L2_MIN_CHUNKS chunks)
     l2_summaries: list[str] = []

From 32a9ad4f027da533b96ed37924d2ba632728a70f Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 26 Nov 2025 23:22:14 -0800
Subject: [PATCH 18/37] refactor: simplify docstrings and remove unused
 upsert_hierarchical_summary

- Replace verbose Args/Returns docstrings with single-line summaries
- Remove upsert_hierarchical_summary (was only used in tests)
- Update tests to use upsert_summary_entries directly

Net: -102 lines
---
 agent_cli/memory/_store.py           | 39 +--------------------
 agent_cli/summarizer/_utils.py       | 11 +-----
 agent_cli/summarizer/adaptive.py     | 52 +++-------------------------
 tests/memory/test_store.py           | 28 ++++-----------
 tests/summarizer/test_integration.py |  8 +++--
 5 files changed, 18 insertions(+), 120 deletions(-)

diff --git a/agent_cli/memory/_store.py b/agent_cli/memory/_store.py
index 88edb8c5d..36ace5888 100644
--- a/agent_cli/memory/_store.py
+++ b/agent_cli/memory/_store.py
@@ -148,20 +148,7 @@ def upsert_summary_entries(
     collection: Collection,
     entries: list[dict[str, Any]],
 ) -> list[str]:
-    """Store pre-built summary entries to ChromaDB.
-
-    This is the low-level helper that accepts entries already built by
-    SummaryResult.to_storage_metadata(). Use this when you already have
-    the entries (e.g., after writing files) to avoid duplicate serialization.
-
-    Args:
-        collection: ChromaDB collection.
-        entries: List of entry dicts with 'id', 'content', and 'metadata' keys.
-
-    Returns:
-        List of IDs that were upserted.
-
-    """
+    """Store pre-built summary entries (from to_storage_metadata) to ChromaDB."""
     if not entries:
         return []
 
@@ -180,30 +167,6 @@ def upsert_summary_entries(
     return ids
 
 
-def upsert_hierarchical_summary(
-    collection: Collection,
-    conversation_id: str,
-    summary_result: Any,
-) -> list[str]:
-    """Store all levels of a hierarchical summary.
-
-    Convenience wrapper that calls to_storage_metadata() and then
-    upsert_summary_entries(). If you already have the entries built,
-    call upsert_summary_entries() directly to avoid duplicate work.
-
-    Args:
-        collection: ChromaDB collection.
-        conversation_id: The conversation this summary belongs to.
-        summary_result: A SummaryResult from the adaptive summarizer.
-
-    Returns:
-        List of IDs that were upserted.
-
-    """
-    entries = summary_result.to_storage_metadata(conversation_id)
-    return upsert_summary_entries(collection, entries)
-
-
 def get_summary_at_level(
     collection: Collection,
     conversation_id: str,
diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py
index 731c55058..2c37159fc 100644
--- a/agent_cli/summarizer/_utils.py
+++ b/agent_cli/summarizer/_utils.py
@@ -31,16 +31,7 @@ def _get_encoding(model: str = "gpt-4") -> tiktoken.Encoding | None:
 
 
 def count_tokens(text: str, model: str = "gpt-4") -> int:
-    """Count tokens in text using tiktoken, with a lightweight fallback.
-
-    Args:
-        text: The text to count tokens for.
-        model: Model name for tokenizer selection.
-
-    Returns:
-        Number of tokens in the text.
-
-    """
+    """Count tokens using tiktoken, falling back to char-based estimate."""
     if not text:
         return 0
     enc = _get_encoding(model)
diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py
index 7d24ef760..62b9b68cd 100644
--- a/agent_cli/summarizer/adaptive.py
+++ b/agent_cli/summarizer/adaptive.py
@@ -94,15 +94,7 @@ def __post_init__(self) -> None:
 
 
 def determine_level(token_count: int) -> SummaryLevel:
-    """Determine the appropriate summary level based on token count.
-
-    Args:
-        token_count: Number of tokens in the input.
-
-    Returns:
-        The recommended SummaryLevel.
-
-    """
+    """Map token count to appropriate SummaryLevel."""
     if token_count < LEVEL_THRESHOLDS[SummaryLevel.NONE]:
         return SummaryLevel.NONE
     if token_count < LEVEL_THRESHOLDS[SummaryLevel.BRIEF]:
@@ -188,19 +180,7 @@ async def _summarize_chunks(
     chunks: list[str],
     config: SummarizerConfig,
 ) -> list[ChunkSummary]:
-    """Summarize multiple chunks with concurrency control.
-
-    This helper centralizes the semaphore/gather pattern used by both
-    _detailed_summary and _hierarchical_summary.
-
-    Args:
-        chunks: List of text chunks to summarize.
-        config: Summarizer configuration (includes max_concurrent_chunks).
-
-    Returns:
-        List of ChunkSummary objects in the same order as input chunks.
-
-    """
+    """Summarize chunks concurrently with semaphore-controlled parallelism."""
     semaphore = asyncio.Semaphore(config.max_concurrent_chunks)
     total = len(chunks)
 
@@ -218,18 +198,7 @@ async def _summarize_single_chunk(
     total_chunks: int,
     config: SummarizerConfig,
 ) -> ChunkSummary:
-    """Summarize a single chunk of content.
-
-    Args:
-        chunk: The text chunk to summarize.
-        chunk_index: Index of this chunk (0-based).
-        total_chunks: Total number of chunks being processed.
-        config: Summarizer configuration.
-
-    Returns:
-        ChunkSummary with the summarized content.
-
-    """
+    """Summarize a single chunk and return its metadata."""
     source_tokens = count_tokens(chunk, config.model)
     target_tokens = estimate_summary_tokens(source_tokens, SummaryLevel.STANDARD)
     max_words = tokens_to_words(target_tokens)
@@ -418,20 +387,7 @@ async def _generate_summary(
     config: SummarizerConfig,
     max_tokens: int = 256,
 ) -> str:
-    """Generate a summary using the LLM.
-
-    Args:
-        prompt: The prompt to send to the LLM.
-        config: Summarizer configuration.
-        max_tokens: Maximum tokens for the response.
-
-    Returns:
-        The generated summary text.
-
-    Raises:
-        SummarizationError: If summarization fails.
-
-    """
+    """Call the LLM to generate a summary. Raises SummarizationError on failure."""
     from pydantic_ai import Agent  # noqa: PLC0415
     from pydantic_ai.models.openai import OpenAIChatModel  # noqa: PLC0415
     from pydantic_ai.providers.openai import OpenAIProvider  # noqa: PLC0415
diff --git a/tests/memory/test_store.py b/tests/memory/test_store.py
index 0851d9637..5e8e33142 100644
--- a/tests/memory/test_store.py
+++ b/tests/memory/test_store.py
@@ -133,21 +133,10 @@ def test_upsert_and_delete_entries_delegate() -> None:
     assert fake.deleted == [["x"]]
 
 
-# --- Hierarchical Summary Tests ---
+# --- Summary Entry Tests ---
 
 
-class _MockSummaryResult:
-    """Mock SummaryResult for testing without importing the full summarizer module."""
-
-    def __init__(self, entries: list[dict[str, Any]]) -> None:
-        self._entries = entries
-
-    def to_storage_metadata(self, _conversation_id: str) -> list[dict[str, Any]]:
-        # Just return the pre-configured entries (ignores conversation_id)
-        return self._entries
-
-
-def test_upsert_hierarchical_summary_simple() -> None:
+def test_upsert_summary_entries_simple() -> None:
     """Test upserting a simple (non-hierarchical) summary."""
     fake = _FakeCollection()
     entries = [
@@ -167,9 +156,8 @@ def test_upsert_hierarchical_summary_simple() -> None:
             },
         },
     ]
-    mock_result = _MockSummaryResult(entries)
 
-    ids = _store.upsert_hierarchical_summary(fake, "conv-123", mock_result)
+    ids = _store.upsert_summary_entries(fake, entries)
 
     assert ids == ["conv-123:summary:L3:final"]
     assert len(fake.upserts) == 1
@@ -180,7 +168,7 @@ def test_upsert_hierarchical_summary_simple() -> None:
     assert upserted_metas[0]["is_final"] is True
 
 
-def test_upsert_hierarchical_summary_with_chunks() -> None:
+def test_upsert_summary_entries_with_chunks() -> None:
     """Test upserting a hierarchical summary with L1 and L3 entries."""
     fake = _FakeCollection()
     entries = [
@@ -221,9 +209,8 @@ def test_upsert_hierarchical_summary_with_chunks() -> None:
             },
         },
     ]
-    mock_result = _MockSummaryResult(entries)
 
-    ids = _store.upsert_hierarchical_summary(fake, "conv-456", mock_result)
+    ids = _store.upsert_summary_entries(fake, entries)
 
     assert len(ids) == 3
     assert "conv-456:summary:L1:0" in ids
@@ -231,12 +218,11 @@ def test_upsert_hierarchical_summary_with_chunks() -> None:
     assert "conv-456:summary:L3:final" in ids
 
 
-def test_upsert_hierarchical_summary_empty() -> None:
+def test_upsert_summary_entries_empty() -> None:
     """Test upserting when there are no entries (e.g., NONE level)."""
     fake = _FakeCollection()
-    mock_result = _MockSummaryResult([])
 
-    ids = _store.upsert_hierarchical_summary(fake, "conv-789", mock_result)
+    ids = _store.upsert_summary_entries(fake, [])
 
     assert ids == []
     assert len(fake.upserts) == 0
diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py
index 5cb97115d..d70286592 100644
--- a/tests/summarizer/test_integration.py
+++ b/tests/summarizer/test_integration.py
@@ -12,7 +12,7 @@
 from agent_cli.memory._store import (
     get_final_summary,
     get_summary_at_level,
-    upsert_hierarchical_summary,
+    upsert_summary_entries,
 )
 from agent_cli.summarizer import SummaryLevel, SummaryResult
 from agent_cli.summarizer.adaptive import determine_level
@@ -185,7 +185,8 @@ def test_store_simple_summary(self, fake_collection: _FakeCollection) -> None:
             compression_ratio=0.05,
         )
 
-        ids = upsert_hierarchical_summary(fake_collection, "conv-123", result)
+        entries = result.to_storage_metadata("conv-123")
+        ids = upsert_summary_entries(fake_collection, entries)
 
         assert len(ids) == 1
         assert "conv-123:summary:L3:final" in ids
@@ -225,7 +226,8 @@ def test_store_hierarchical_summary(self, fake_collection: _FakeCollection) -> N
             compression_ratio=0.02,
         )
 
-        ids = upsert_hierarchical_summary(fake_collection, "conv-789", result)
+        entries = result.to_storage_metadata("conv-789")
+        ids = upsert_summary_entries(fake_collection, entries)
 
         assert len(ids) == 3  # 2 L1 + 1 L3
 

From 6b1b47e53a13015e762abc3b693fbc400da21670 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 26 Nov 2025 23:32:10 -0800
Subject: [PATCH 19/37] fix(summarizer): strip special tokens from LLM output

Some models leak control tokens like <|constrain|>, <|end|>, etc.
into their output. Add regex cleanup in _generate_summary().

Also rewrites docs/architecture/summarizer.md to focus on research
foundations and design rationale rather than code snippets.
---
 agent_cli/summarizer/adaptive.py |   6 +-
 docs/architecture/summarizer.md  | 562 ++++++++-----------------------
 2 files changed, 141 insertions(+), 427 deletions(-)

diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py
index 62b9b68cd..9d17c8d7e 100644
--- a/agent_cli/summarizer/adaptive.py
+++ b/agent_cli/summarizer/adaptive.py
@@ -11,6 +11,7 @@
 
 import asyncio
 import logging
+import re
 from dataclasses import dataclass
 
 from pydantic import BaseModel
@@ -412,7 +413,10 @@ async def _generate_summary(
 
     try:
         result = await agent.run(prompt)
-        return result.output.summary.strip()
+        text = result.output.summary.strip()
+        # Strip special tokens that some models leak (e.g., <|constrain|>, <|end|>)
+        text = re.sub(r"<\|[^|]+\|>", "", text)
+        return text.strip()
     except Exception as e:
         msg = f"Summarization failed: {e}"
         raise SummarizationError(msg) from e
diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md
index ec7b769f2..d69b3b111 100644
--- a/docs/architecture/summarizer.md
+++ b/docs/architecture/summarizer.md
@@ -1,539 +1,249 @@
 # Agent CLI: Adaptive Summarizer Technical Specification
 
-This document describes the architectural decisions, design rationale, and technical approach for the `agent-cli` adaptive summarization subsystem. The design is grounded in research from Letta (partial eviction, middle truncation) and Mem0 (rolling summaries, compression ratios).
+This document describes the architectural decisions, design rationale, and technical approach for the `agent-cli` adaptive summarization subsystem.
 
 ## 1. System Overview
 
 The adaptive summarizer provides **content-aware compression** that scales summarization depth with input complexity. Rather than applying a one-size-fits-all approach, it automatically selects the optimal strategy based on token count.
 
 ```
-┌─────────────────────────────────────────────────────────────────────┐
-│                    Adaptive Summarization Pipeline                  │
-├─────────────────────────────────────────────────────────────────────┤
-│                                                                     │
-│  Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy     │
-│                                                                     │
-│  ┌─────────────────────────────────────────────────────────────┐    │
-│  │ Level Thresholds:                                           │    │
-│  │   < 100 tokens  ──▶ NONE        (no summary needed)         │    │
-│  │   100-500       ──▶ BRIEF       (single sentence)           │    │
-│  │   500-3000      ──▶ STANDARD    (paragraph)                 │    │
-│  │   3000-15000    ──▶ DETAILED    (chunked + meta)            │    │
-│  │   > 15000       ──▶ HIERARCHICAL (L1/L2/L3 tree)            │    │
-│  └─────────────────────────────────────────────────────────────┘    │
-│                                                                     │
-│  Output: SummaryResult with compression metrics                     │
-└─────────────────────────────────────────────────────────────────────┘
+Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy
+                                        │
+        ┌───────────────────────────────┼───────────────────────────────┐
+        │                               │                               │
+   < 100 tokens                   500-15000 tokens                > 15000 tokens
+        │                               │                               │
+   No summary needed            Chunked processing              Hierarchical tree
+                                  + meta-synthesis                  (L1/L2/L3)
 ```
 
 **Design Goals:**
 
 - **Adaptive compression:** Match summarization depth to content complexity.
 - **Research-grounded:** Based on proven approaches from Letta and Mem0.
-- **Hierarchical structure:** Preserve detail at multiple granularities.
+- **Hierarchical structure:** Preserve detail at multiple granularities for large content.
 - **Content-type awareness:** Domain-specific prompts for conversations, journals, documents.
 
 ---
 
-## 2. Architectural Decisions
+## 2. Research Foundations
 
-### 2.1 Token-Based Level Selection
+The summarization approach draws from two research-backed memory systems:
 
-**Decision:** Select summarization strategy based on input token count with fixed thresholds.
-
-**Rationale:**
+### 2.1 Letta (MemGPT) Contributions
 
-- **Predictable behavior:** Users can anticipate output length based on input size.
-- **Optimal compression:** Each level targets a specific compression ratio validated by research.
-- **Efficiency:** Avoid over-processing short content or under-processing long content.
+**Reference:** arXiv:2310.08560
 
-**Implementation:**
+Letta's approach to memory management introduced several techniques adopted here:
 
-```python
-THRESHOLD_NONE = 100       # Below this: no summary needed
-THRESHOLD_BRIEF = 500      # 100-500: single sentence (~20% compression)
-THRESHOLD_STANDARD = 3000  # 500-3000: paragraph (~12% compression)
-THRESHOLD_DETAILED = 15000 # 3000-15000: chunked (~7% compression)
-# Above 15000: hierarchical tree structure
-```
+- **Partial eviction:** Rather than discarding old content entirely, compress it to summaries while keeping recent content detailed. This maps to our hierarchical L1/L2/L3 structure where L1 preserves chunk-level detail and L3 provides high-level synthesis.
 
-**Trade-off:** Fixed thresholds may not be optimal for all content types, but provide consistent, predictable behavior.
+- **Middle truncation:** When content must be reduced, preserve the head (introductions, context-setting) and tail (conclusions, recent events) while removing the middle. Research shows important information clusters at boundaries.
 
-### 2.2 Hierarchical Summary Structure (L1/L2/L3)
+- **Fire-and-forget background processing:** Summarization runs asynchronously after turn completion, avoiding latency on the critical path.
 
-**Decision:** For long content, build a tree of summaries at three levels of granularity.
+### 2.2 Mem0 Contributions
 
-**Rationale:**
+**Reference:** arXiv:2504.19413
 
-- **Partial eviction:** Inspired by Letta's memory architecture—keep detailed summaries for recent content, compressed summaries for older content.
-- **Flexible retrieval:** Different use cases need different detail levels.
-- **Progressive compression:** Each level provides ~5x compression over the previous.
-
-**Implementation:**
-
-- **L1 (Chunk Summaries):** Individual summaries of ~3000 token chunks with 200 token overlap.
-- **L2 (Group Summaries):** Summaries of groups of ~5 L1 summaries.
-- **L3 (Final Summary):** Single synthesized summary of all L2 summaries.
-
-**Storage:**
-```text
-summaries/
-  L1/
-    chunk_0.md    # Summary of tokens 0-3000
-    chunk_1.md    # Summary of tokens 2800-5800 (overlap)
-  L2/
-    group_0.md    # Synthesis of chunk_0 through chunk_4
-  L3/
-    final.md      # Final narrative summary
-```
+Mem0's memory layer research established compression ratio targets:
 
-### 2.3 Content-Type Aware Prompts
+- **90%+ compression:** Long-running conversations can achieve 10:1 or better compression while retaining semantic meaning. Our hierarchical approach targets similar ratios for very long content.
 
-**Decision:** Use different prompt templates for different content domains.
+- **Rolling summaries:** New information integrates with existing summaries rather than replacing them. The `prior_summary` parameter throughout our pipeline implements this pattern.
 
-**Rationale:**
+- **Two-phase architecture:** Separate extraction (what's important) from storage (how to persist it). We apply this by first generating summaries, then persisting to both files and vector DB.
 
-- **Conversations:** Focus on user preferences, decisions, action items.
-- **Journals:** Emphasize personal insights, emotional context, growth patterns.
-- **Documents:** Prioritize key findings, methodology, conclusions.
-
-**Implementation:**
-
-```python
-def get_prompt_for_content_type(content_type: str) -> str:
-    match content_type:
-        case "conversation": return CONVERSATION_PROMPT
-        case "journal": return JOURNAL_PROMPT
-        case "document": return DOCUMENT_PROMPT
-        case _: return STANDARD_PROMPT
-```
-
-### 2.4 Prior Summary Integration
-
-**Decision:** Always provide the previous summary as context when updating.
-
-**Rationale:**
-
-- **Continuity:** New summaries should build on existing context, not replace it.
-- **Incremental updates:** Avoid re-summarizing all content on every update.
-- **Context preservation:** Important information from earlier content persists.
-
-**Implementation:**
+---
 
-- The `prior_summary` parameter is passed through the entire pipeline.
-- `ROLLING_PROMPT` specifically handles integrating new facts with existing summaries.
-- For hierarchical summaries, only the L3 summary is used as prior context.
+## 3. Architectural Decisions
 
-### 2.5 Compression Ratio Tracking
+### 3.1 Token-Based Level Selection
 
-**Decision:** Track and report compression metrics for every summary.
+**Decision:** Select summarization strategy based on input token count with fixed thresholds.
 
 **Rationale:**
 
-- **Transparency:** Users can understand how much information was compressed.
-- **Quality monitoring:** Unusual ratios may indicate summarization issues.
-- **Optimization:** Metrics inform future threshold tuning.
-
-**Implementation:**
-
-```python
-@dataclass
-class SummaryResult:
-    level: SummaryLevel
-    summary: str | None
-    hierarchical: HierarchicalSummary | None
-    input_tokens: int
-    output_tokens: int
-    compression_ratio: float  # output/input (lower = more compression)
-```
-
----
-
-## 3. Data Model
+- **Predictable behavior:** Users can anticipate output length based on input size.
+- **Optimal compression:** Each level targets a specific compression ratio validated by research.
+- **Efficiency:** Avoid over-processing short content or under-processing long content.
 
-### 3.1 Summary Levels
+**Thresholds:**
 
 | Level | Token Range | Target Compression | Strategy |
 | :--- | :--- | :--- | :--- |
-| `NONE` | < 100 | N/A | No summarization |
-| `BRIEF` | 100-500 | ~20% | Single sentence |
-| `STANDARD` | 500-3000 | ~12% | Paragraph |
-| `DETAILED` | 3000-15000 | ~7% | Chunked + meta |
-| `HIERARCHICAL` | > 15000 | ~3-5% | L1/L2/L3 tree |
-
-### 3.2 Hierarchical Summary Structure
-
-```python
-class ChunkSummary(BaseModel):
-    chunk_index: int          # Position in original content
-    content: str              # The summarized text
-    token_count: int          # Tokens in this summary
-    source_tokens: int        # Tokens in source chunk
-
-class HierarchicalSummary(BaseModel):
-    l1_summaries: list[ChunkSummary]  # Individual chunk summaries
-    l2_summaries: list[str]           # Group summaries
-    l3_summary: str                   # Final synthesis
-    chunk_size: int = 3000            # Tokens per chunk
-    chunk_overlap: int = 200          # Overlap between chunks
-```
-
-### 3.3 Storage Metadata (ChromaDB)
-
-Summaries are stored with rich metadata for retrieval and management:
+| NONE | < 100 | N/A | No summarization needed |
+| BRIEF | 100-500 | ~20% | Single sentence |
+| STANDARD | 500-3000 | ~12% | Paragraph |
+| DETAILED | 3000-15000 | ~7% | Chunked + meta-synthesis |
+| HIERARCHICAL | > 15000 | ~3-5% | L1/L2/L3 tree |
 
-| Field | L1 | L2 | L3 | Description |
-| :--- | :---: | :---: | :---: | :--- |
-| `id` | ✓ | ✓ | ✓ | `{conversation_id}:summary:L{n}:{index}` |
-| `conversation_id` | ✓ | ✓ | ✓ | Scope key |
-| `role` | ✓ | ✓ | ✓ | Always `"summary"` |
-| `level` | ✓ | ✓ | ✓ | 1, 2, or 3 |
-| `chunk_index` | ✓ | | | Position in L1 sequence |
-| `group_index` | | ✓ | | Position in L2 sequence |
-| `is_final` | | | ✓ | Marks the top-level summary |
-| `summary_level` | | | ✓ | Name of SummaryLevel enum |
-| `input_tokens` | | | ✓ | Original content token count |
-| `output_tokens` | | | ✓ | Total summary token count |
-| `compression_ratio` | | | ✓ | Output/input ratio |
-| `created_at` | ✓ | ✓ | ✓ | ISO 8601 timestamp |
+**Trade-off:** Fixed thresholds may not be optimal for all content types, but provide consistent, predictable behavior. Content-type prompts provide domain adaptation within each level.
 
-### 3.4 File Format
+### 3.2 Hierarchical Summary Structure (L1/L2/L3)
 
-Summary files use Markdown with YAML front matter:
+**Decision:** For long content, build a tree of summaries at three levels of granularity.
 
-```markdown
----
-id: "journal:summary:L3:final"
-conversation_id: "journal"
-role: "summary"
-level: 3
-is_final: true
-summary_level: "STANDARD"
-input_tokens: 1500
-output_tokens: 180
-compression_ratio: 0.12
-created_at: "2025-01-15T10:30:00Z"
----
+**Rationale:**
 
-The user has been exploring adaptive summarization techniques...
-```
+- **Partial eviction:** Inspired by Letta—keep detailed summaries for granular retrieval, compressed summaries for context injection.
+- **Flexible retrieval:** Different use cases need different detail levels. RAG queries might want L1 chunks; prompt injection wants L3.
+- **Progressive compression:** Each level provides ~5x compression over the previous, achieving high overall compression while preserving structure.
 
----
+**Structure:**
 
-## 4. Processing Pipeline
+- **L1 (Chunk Summaries):** Individual summaries of ~3000 token chunks. Preserves local context and specific details. Chunks overlap by ~200 tokens to maintain continuity across boundaries.
+- **L2 (Group Summaries):** Summaries of groups of ~5 L1 summaries. Only generated when content exceeds ~5 chunks. Provides mid-level abstraction.
+- **L3 (Final Summary):** Single synthesized summary. Used for prompt injection and as prior context for rolling updates.
 
-### 4.1 Main Entry Point
+**Trade-off:** The three-level hierarchy adds complexity but enables efficient retrieval at multiple granularities. For content under 15000 tokens, we skip L2 entirely (DETAILED level uses only L1 + L3).
 
-```python
-async def summarize(
-    content: str,
-    config: SummarizerConfig,
-    prior_summary: str | None = None,
-    content_type: str = "general",
-) -> SummaryResult
-```
+### 3.3 Semantic Boundary Chunking
 
-### 4.2 Level Selection Flow
+**Decision:** Split content on semantic boundaries (paragraphs, then sentences) rather than fixed character counts.
 
-```
-Input Content
-     │
-     ▼
-┌─────────────┐
-│ Count Tokens│ (tiktoken, cl100k_base)
-└──────┬──────┘
-       │
-       ▼
-┌─────────────────────────────────────────┐
-│ determine_level(token_count) -> Level   │
-│                                         │
-│   < 100  ──▶ NONE                       │
-│   < 500  ──▶ BRIEF                      │
-│   < 3000 ──▶ STANDARD                   │
-│   < 15000 ──▶ DETAILED                  │
-│   else   ──▶ HIERARCHICAL               │
-└──────┬──────────────────────────────────┘
-       │
-       ▼
-   Execute level-specific strategy
-```
+**Rationale:**
 
-### 4.3 Strategy Execution by Level
-
-#### NONE Level
-- **Action:** Return immediately with no summary.
-- **Output:** `SummaryResult(level=NONE, summary=None, compression_ratio=1.0)`
-
-#### BRIEF Level
-- **Prompt:** `BRIEF_PROMPT` - distill to single sentence.
-- **LLM Call:** Single generation with low max_tokens.
-- **Output:** One-sentence summary.
-
-#### STANDARD Level
-- **Prompt:** `STANDARD_PROMPT` with optional prior summary context.
-- **LLM Call:** Single generation.
-- **Output:** Paragraph-length summary.
-
-#### DETAILED Level
-1. **Chunk:** Split content into ~3000 token chunks with 200 token overlap.
-2. **Parallel L1:** Generate summary for each chunk using `CHUNK_PROMPT`.
-3. **Meta-synthesis:** Combine L1 summaries using `META_PROMPT`.
-4. **Output:** `HierarchicalSummary` with L1s and L3 (no L2 needed for this size).
-
-#### HIERARCHICAL Level
-1. **Chunk:** Split into ~3000 token chunks with overlap.
-2. **Parallel L1:** Generate chunk summaries.
-3. **Group:** Organize L1s into groups of ~5.
-4. **Parallel L2:** Summarize each group.
-5. **L3 Synthesis:** Final meta-summary of all L2s.
-6. **Output:** Full `HierarchicalSummary` tree.
-
-### 4.4 Chunking Algorithm
-
-```python
-def chunk_text(
-    text: str,
-    chunk_size: int = 3000,
-    overlap: int = 200,
-) -> list[str]:
-    """Split text into overlapping chunks on paragraph boundaries."""
-```
+- **Coherence preservation:** Splitting mid-sentence or mid-thought loses context and produces poor summaries.
+- **Natural units:** Paragraphs and sentences are natural semantic units that humans use to organize thoughts.
+- **Overlap for continuity:** The 200-token overlap ensures concepts spanning chunk boundaries aren't lost.
 
-**Strategy:**
+**Fallback chain:**
 
-1. **Paragraph-first:** Try to split on double newlines.
-2. **Sentence fallback:** If paragraph exceeds chunk_size, split on sentence boundaries.
-3. **Character fallback:** For very long sentences (e.g., code), use character splitting.
-4. **Overlap handling:** Each chunk starts with the last `overlap` tokens of the previous.
+1. Prefer paragraph boundaries (double newlines)
+2. Fall back to sentence boundaries (`.!?` followed by space + capital)
+3. Final fallback to character splitting for edge cases (e.g., code blocks without punctuation)
 
-### 4.5 Middle Truncation (Utility)
+### 3.4 Content-Type Aware Prompts
 
-For handling very large inputs that could exceed context windows:
+**Decision:** Use different prompt templates for different content domains.
 
-```python
-def middle_truncate(
-    text: str,
-    budget_chars: int,
-    head_frac: float = 0.3,
-    tail_frac: float = 0.3,
-) -> tuple[str, int]:
-    """Keep head and tail, remove middle (least likely to contain key info)."""
-```
+**Rationale:**
 
-**Rationale:** Research shows that important information clusters at beginnings (introductions, key points) and endings (conclusions, action items). Useful when summarizing very long conversations that may contain pasted codebases.
+- **Conversations:** Focus on user preferences, decisions, action items—what the user wants and what was agreed.
+- **Journals:** Emphasize personal insights, emotional context, growth patterns—the subjective experience.
+- **Documents:** Prioritize key findings, methodology, conclusions—the objective content.
 
----
+A generic summarization prompt loses domain-specific signal. By tailoring prompts, we extract what matters for each use case.
 
-## 5. Prompt Specifications
+### 3.5 Prior Summary Integration (Rolling Updates)
 
-### 5.1 Brief Summary (`BRIEF_PROMPT`)
+**Decision:** Always provide the previous summary as context when generating updates.
 
-```
-Distill the following content into a single, comprehensive sentence
-that captures the essential meaning:
+**Rationale:**
 
-{content}
+- **Continuity:** New summaries should build on existing context, not start fresh each time.
+- **Incremental updates:** Avoid re-summarizing all historical content on every update.
+- **Information preservation:** Important information from earlier content persists through the chain of summaries.
 
-Summary (one sentence):
-```
+This implements Mem0's "rolling summary" pattern. The L3 summary from the previous run becomes prior context for the next summarization, allowing information to flow forward through time.
 
-### 5.2 Standard Summary (`STANDARD_PROMPT`)
+### 3.6 Compression Ratio Tracking
 
-```
-Summarize the following content in a concise paragraph.
-{prior_context}
-Focus on key information, decisions, and actionable insights.
+**Decision:** Track and report compression metrics for every summary.
 
-Content:
-{content}
+**Rationale:**
 
-Summary:
-```
+- **Transparency:** Users can understand how much information was compressed.
+- **Quality monitoring:** Unusual ratios (e.g., output longer than input) may indicate summarization issues.
+- **Optimization:** Metrics inform future threshold tuning and quality assessment.
 
-### 5.3 Chunk Summary (`CHUNK_PROMPT`)
+Every `SummaryResult` includes `input_tokens`, `output_tokens`, and `compression_ratio` for observability.
 
-```
-Summarize this section of a larger document.
-Preserve specific details, names, and numbers that may be important.
+---
 
-Section {chunk_index} of {total_chunks}:
-{content}
+## 4. Processing Pipeline
 
-Section summary:
-```
+### 4.1 Level Selection
 
-### 5.4 Meta Summary (`META_PROMPT`)
+The entry point counts tokens and selects strategy:
 
-```
-Synthesize these section summaries into a coherent narrative.
-Maintain logical flow and preserve the most important information.
+1. **Token counting:** Uses tiktoken with model-appropriate encoding. Falls back to character-based estimation (~4 chars/token) if tiktoken unavailable.
+2. **Threshold comparison:** Maps token count to `SummaryLevel` enum.
+3. **Strategy dispatch:** Calls level-specific handler.
 
-Section Summaries:
-{summaries}
+### 4.2 Brief and Standard Levels
 
-Synthesized Summary:
-```
+For short content (< 3000 tokens):
 
-### 5.5 Content-Type Prompts
+- Single LLM call with level-appropriate prompt
+- Prior summary injected as context if available
+- Content-type selection determines prompt variant
+- Returns simple `SummaryResult` with no hierarchical structure
 
-All content-type prompts include `{prior_context}` for rolling summary continuity.
+### 4.3 Detailed and Hierarchical Levels
 
-**Conversation:**
-```
-Summarize this conversation focusing on:
-- User preferences and decisions
-- Action items and commitments
-- Key topics discussed
-```
+For longer content:
 
-**Journal:**
-```
-Summarize this journal entry focusing on:
-- Personal insights and reflections
-- Emotional context and growth
-- Goals and intentions
-```
+1. **Chunking:** Split content into overlapping chunks on semantic boundaries.
+2. **Parallel L1 generation:** Summarize each chunk independently. Uses semaphore-controlled concurrency to avoid overwhelming the LLM.
+3. **L2 grouping (hierarchical only):** Organize L1s into groups of ~5, summarize each group.
+4. **L3 synthesis:** Meta-summarize all L2s (or all L1s for DETAILED level) into final summary.
 
-**Document:**
-```
-Summarize this document focusing on:
-- Key findings and conclusions
-- Methodology and approach
-- Recommendations and implications
-```
+The parallelism at L1 and L2 levels provides significant speedup for long content while maintaining semantic coherence through the hierarchical structure.
 
 ---
 
-## 6. Integration with Memory System
+## 5. Integration with Memory System
 
-### 6.1 Entry Point
+### 5.1 Write Path
 
-The memory system calls the summarizer via `_ingest.summarize_content()`:
+The memory system triggers summarization during post-processing:
 
-```python
-async def summarize_content(
-    content: str,
-    prior_summary: str | None = None,
-    content_type: str = "general",
-    openai_base_url: str,
-    api_key: str | None,
-    model: str,
-) -> SummaryResult
-```
+1. Collect content to summarize (extracted facts, conversation turns)
+2. Retrieve existing L3 summary as prior context
+3. Call summarizer with content + prior summary + content type
+4. Persist results: delete old summaries, write new files, upsert to ChromaDB
 
-### 6.2 Storage Flow
+### 5.2 Read Path
 
-```
-summarize_content()
-       │
-       ▼
-SummaryResult
-       │
-       ▼
-store_adaptive_summary()
-       │
-       ├──▶ persist_hierarchical_summary()
-       │         │
-       │         ├──▶ Delete old summaries (L1, L2, L3)
-       │         ├──▶ Write new summary files
-       │         └──▶ Upsert to ChromaDB
-       │
-       └──▶ Return stored IDs
-```
+The memory retrieval system uses summaries for context injection:
 
-### 6.3 Retrieval Integration
+- Fetches L3 (final) summary for the conversation
+- Injects as prefix to retrieved memories in the prompt
+- Provides high-level context that individual memory snippets lack
 
-The memory retrieval system uses `get_final_summary()` to fetch the L3 summary:
+### 5.3 Storage
 
-```python
-def get_final_summary(
-    collection: Collection,
-    conversation_id: str,
-) -> StoredMemory | None:
-    """Retrieve the L3 final summary for injection into prompts."""
-```
+Summaries are persisted in two places:
+
+- **Files:** Markdown with YAML front matter under `summaries/L1/`, `L2/`, `L3/` directories. Human-readable, git-trackable.
+- **ChromaDB:** Vector embeddings for semantic search. Metadata includes level, compression metrics, timestamps.
 
 ---
 
-## 7. Configuration Reference
+## 6. Configuration
 
 | Parameter | Default | Description |
 | :--- | :--- | :--- |
-| `openai_base_url` | *required* | Base URL for LLM API |
-| `model` | *required* | Model ID for summarization |
-| `api_key` | `None` | API key (optional for local models) |
-| `chunk_size` | `3000` | Tokens per chunk for hierarchical |
-| `chunk_overlap` | `200` | Token overlap between chunks |
-
-### 7.1 Level Thresholds (Constants)
+| `chunk_size` | 3000 | Target tokens per chunk |
+| `chunk_overlap` | 200 | Overlap between consecutive chunks |
+| `max_concurrent_chunks` | 5 | Parallel LLM calls for chunk summarization |
 
-| Constant | Value | Description |
-| :--- | :--- | :--- |
-| `THRESHOLD_NONE` | 100 | Below: no summary |
-| `THRESHOLD_BRIEF` | 500 | Below: single sentence |
-| `THRESHOLD_STANDARD` | 3000 | Below: paragraph |
-| `THRESHOLD_DETAILED` | 15000 | Below: chunked |
+Level thresholds are constants (100, 500, 3000, 15000 tokens) chosen based on empirical testing and Mem0 research on optimal compression ratios.
 
 ---
 
-## 8. Error Handling
+## 7. Error Handling
 
-### 8.1 Fail-Fast Philosophy
+Summarization follows a fail-fast philosophy:
 
-Errors are propagated rather than hidden behind fallbacks:
+- **LLM errors:** Propagated as `SummarizationError` rather than silently returning empty results.
+- **Empty input:** Returns NONE level immediately (not an error).
+- **Encoding errors:** Falls back to character-based token estimation.
 
-| Error | Behavior |
-| :--- | :--- |
-| LLM timeout | Raises `SummarizationError` |
-| LLM error | Raises `SummarizationError` |
-| Token counting failure | Falls back to `cl100k_base` encoding |
-
-### 8.2 Validation
-
-- **Empty content:** Returns NONE level immediately.
-- **Whitespace-only:** Returns NONE level.
-- **Invalid compression ratio:** Clamped to [0.0, 1.0].
+The caller (memory system) decides how to handle failures—typically by proceeding without a summary rather than blocking the entire write path.
 
 ---
 
-## 9. Performance Considerations
-
-### 9.1 Token Counting
+## 8. Comparison with Alternatives
 
-- Uses `tiktoken` with `cl100k_base` encoding (GPT-4 tokenizer).
-- Caches tokenizer instance for efficiency.
-- Falls back to character-based estimation if tiktoken unavailable.
-
-### 9.2 Parallel Processing
-
-For DETAILED and HIERARCHICAL levels:
-- L1 chunk summaries can be generated in parallel.
-- L2 group summaries can be generated in parallel.
-- Only L3 synthesis requires sequential processing.
-
-### 9.3 Caching
-
-- Token counts are computed once per content string.
-- Prompt templates are loaded once at module import.
-- ChromaDB connection is reused across operations.
-
----
-
-## 10. Comparison with Alternative Approaches
-
-| Aspect | Adaptive Summarizer | Rolling Summary | Fixed Chunking |
+| Aspect | Adaptive Summarizer | Fixed Rolling Summary | No Summarization |
 | :--- | :--- | :--- | :--- |
-| **Compression** | 3-20% (varies by level) | ~15% fixed | ~10% fixed |
-| **Detail preservation** | Hierarchical (L1/L2/L3) | Single level | Single level |
-| **Context awareness** | Content-type prompts | Generic | Generic |
-| **Efficiency** | Skip short content | Always summarize | Always chunk |
-| **Research basis** | Letta + Mem0 | Mem0 only | None |
-
----
-
-## 11. Future Enhancements
+| **Compression** | 3-20% (scales with input) | ~15% fixed | 0% |
+| **Detail preservation** | Hierarchical (L1/L2/L3) | Single level | Full |
+| **Short content** | Skipped (efficient) | Still processed | N/A |
+| **Long content** | Tree structure | Single pass | Context overflow |
+| **Research basis** | Letta + Mem0 | Mem0 | None |
 
-- **Semantic chunking:** Split on topic boundaries rather than token counts.
-- **Incremental L1 updates:** Only re-summarize changed chunks.
-- **Quality scoring:** Evaluate summary quality and trigger re-summarization.
-- **User feedback loop:** Learn preferred compression ratios per user.
+The adaptive approach's key advantage is matching effort to content: short content stays untouched, medium content gets lightweight summarization, and long content gets full hierarchical treatment.

From 062436f3717070b7ab82a2707c999cf233c5a08e Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 26 Nov 2025 23:39:12 -0800
Subject: [PATCH 20/37] docs: correct Mem0 attribution in summarizer
 documentation

After verifying claims against actual Letta and Mem0 codebases:

Letta (verified):
- Partial eviction (30%) - `partial_evict_summarizer_percentage`
- Middle truncation - `middle_truncate_text()` function
- Fire-and-forget - `fire_and_forget()` method
- arXiv:2310.08560

Mem0 (corrected):
- Two-phase architecture (verified) - fact extraction then memory ops
- Removed "90%+ compression" claim - refers to token savings vs full
  context, not summarization compression ratios
- Removed "rolling summaries" attribution - not a Mem0 term
- arXiv:2504.19413

Also removes incorrect "based on Mem0 research" from code docstrings
where compression ratios were empirically chosen, not research-derived.
---
 agent_cli/summarizer/_utils.py   | 17 +--------
 agent_cli/summarizer/adaptive.py | 11 ++----
 agent_cli/summarizer/models.py   |  6 +---
 docs/architecture/summarizer.md  | 59 +++++++++-----------------------
 4 files changed, 21 insertions(+), 72 deletions(-)

diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py
index 2c37159fc..1c447f321 100644
--- a/agent_cli/summarizer/_utils.py
+++ b/agent_cli/summarizer/_utils.py
@@ -223,22 +223,7 @@ def middle_truncate(
 
 
 def estimate_summary_tokens(input_tokens: int, level: int) -> int:
-    """Estimate target summary tokens based on input size and level.
-
-    Compression ratios based on Mem0 research:
-    - BRIEF: ~20% compression (80% reduction)
-    - STANDARD: ~12% compression (88% reduction)
-    - DETAILED: ~7% compression (93% reduction)
-    - HIERARCHICAL: Capped with diminishing returns
-
-    Args:
-        input_tokens: Number of tokens in the input.
-        level: Summary level (1-4).
-
-    Returns:
-        Target number of tokens for the summary.
-
-    """
+    """Estimate target summary tokens based on input size and level."""
     if level == SummaryLevel.NONE:
         return 0
     if level == SummaryLevel.BRIEF:
diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py
index 9d17c8d7e..4a84ecff9 100644
--- a/agent_cli/summarizer/adaptive.py
+++ b/agent_cli/summarizer/adaptive.py
@@ -1,8 +1,7 @@
 """Adaptive summarization that scales with input complexity.
 
-This module implements research-grounded summarization inspired by:
-- Letta: Partial eviction (30%), middle truncation, fire-and-forget background processing
-- Mem0: Rolling summaries, 90%+ compression, two-phase architecture
+Implements hierarchical summarization inspired by Letta's partial eviction approach
+and Mem0's two-phase architecture (extraction then storage).
 
 Reference: arXiv:2504.19413 (Mem0), arXiv:2310.08560 (MemGPT/Letta)
 """
@@ -11,7 +10,6 @@
 
 import asyncio
 import logging
-import re
 from dataclasses import dataclass
 
 from pydantic import BaseModel
@@ -413,10 +411,7 @@ async def _generate_summary(
 
     try:
         result = await agent.run(prompt)
-        text = result.output.summary.strip()
-        # Strip special tokens that some models leak (e.g., <|constrain|>, <|end|>)
-        text = re.sub(r"<\|[^|]+\|>", "", text)
-        return text.strip()
+        return result.output.summary.strip()
     except Exception as e:
         msg = f"Summarization failed: {e}"
         raise SummarizationError(msg) from e
diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py
index ce6da9082..36407e459 100644
--- a/agent_cli/summarizer/models.py
+++ b/agent_cli/summarizer/models.py
@@ -15,11 +15,7 @@
 
 
 class SummaryLevel(IntEnum):
-    """Summary granularity levels based on input complexity.
-
-    Thresholds are based on Mem0 research showing optimal compression ratios
-    at different content lengths. Token counts are approximate guidelines.
-    """
+    """Summary granularity levels based on input complexity."""
 
     NONE = 0
     """< 100 tokens: No summary needed, facts only."""
diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md
index d69b3b111..99318db0f 100644
--- a/docs/architecture/summarizer.md
+++ b/docs/architecture/summarizer.md
@@ -20,7 +20,7 @@ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy
 **Design Goals:**
 
 - **Adaptive compression:** Match summarization depth to content complexity.
-- **Research-grounded:** Based on proven approaches from Letta and Mem0.
+- **Research-informed:** Draws techniques from Letta's memory management.
 - **Hierarchical structure:** Preserve detail at multiple granularities for large content.
 - **Content-type awareness:** Domain-specific prompts for conversations, journals, documents.
 
@@ -28,31 +28,19 @@ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy
 
 ## 2. Research Foundations
 
-The summarization approach draws from two research-backed memory systems:
-
 ### 2.1 Letta (MemGPT) Contributions
 
 **Reference:** arXiv:2310.08560
 
-Letta's approach to memory management introduced several techniques adopted here:
-
-- **Partial eviction:** Rather than discarding old content entirely, compress it to summaries while keeping recent content detailed. This maps to our hierarchical L1/L2/L3 structure where L1 preserves chunk-level detail and L3 provides high-level synthesis.
-
-- **Middle truncation:** When content must be reduced, preserve the head (introductions, context-setting) and tail (conclusions, recent events) while removing the middle. Research shows important information clusters at boundaries.
-
-- **Fire-and-forget background processing:** Summarization runs asynchronously after turn completion, avoiding latency on the critical path.
+Letta's approach to memory management introduced the **partial eviction** technique adopted here: rather than discarding old content entirely, compress a portion to summaries while keeping recent content detailed. This maps to our hierarchical L1/L2/L3 structure where L1 preserves chunk-level detail and L3 provides high-level synthesis.
 
 ### 2.2 Mem0 Contributions
 
 **Reference:** arXiv:2504.19413
 
-Mem0's memory layer research established compression ratio targets:
-
-- **90%+ compression:** Long-running conversations can achieve 10:1 or better compression while retaining semantic meaning. Our hierarchical approach targets similar ratios for very long content.
-
-- **Rolling summaries:** New information integrates with existing summaries rather than replacing them. The `prior_summary` parameter throughout our pipeline implements this pattern.
+Mem0's memory layer research informed our storage architecture:
 
-- **Two-phase architecture:** Separate extraction (what's important) from storage (how to persist it). We apply this by first generating summaries, then persisting to both files and vector DB.
+- **Two-phase architecture:** Separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to both files and vector DB.
 
 ---
 
@@ -65,18 +53,17 @@ Mem0's memory layer research established compression ratio targets:
 **Rationale:**
 
 - **Predictable behavior:** Users can anticipate output length based on input size.
-- **Optimal compression:** Each level targets a specific compression ratio validated by research.
 - **Efficiency:** Avoid over-processing short content or under-processing long content.
 
 **Thresholds:**
 
-| Level | Token Range | Target Compression | Strategy |
-| :--- | :--- | :--- | :--- |
-| NONE | < 100 | N/A | No summarization needed |
-| BRIEF | 100-500 | ~20% | Single sentence |
-| STANDARD | 500-3000 | ~12% | Paragraph |
-| DETAILED | 3000-15000 | ~7% | Chunked + meta-synthesis |
-| HIERARCHICAL | > 15000 | ~3-5% | L1/L2/L3 tree |
+| Level | Token Range | Strategy |
+| :--- | :--- | :--- |
+| NONE | < 100 | No summarization needed |
+| BRIEF | 100-500 | Single sentence |
+| STANDARD | 500-3000 | Paragraph |
+| DETAILED | 3000-15000 | Chunked + meta-synthesis |
+| HIERARCHICAL | > 15000 | L1/L2/L3 tree |
 
 **Trade-off:** Fixed thresholds may not be optimal for all content types, but provide consistent, predictable behavior. Content-type prompts provide domain adaptation within each level.
 
@@ -88,13 +75,13 @@ Mem0's memory layer research established compression ratio targets:
 
 - **Partial eviction:** Inspired by Letta—keep detailed summaries for granular retrieval, compressed summaries for context injection.
 - **Flexible retrieval:** Different use cases need different detail levels. RAG queries might want L1 chunks; prompt injection wants L3.
-- **Progressive compression:** Each level provides ~5x compression over the previous, achieving high overall compression while preserving structure.
+- **Progressive compression:** Each level compresses the previous, achieving high overall compression while preserving structure.
 
 **Structure:**
 
 - **L1 (Chunk Summaries):** Individual summaries of ~3000 token chunks. Preserves local context and specific details. Chunks overlap by ~200 tokens to maintain continuity across boundaries.
 - **L2 (Group Summaries):** Summaries of groups of ~5 L1 summaries. Only generated when content exceeds ~5 chunks. Provides mid-level abstraction.
-- **L3 (Final Summary):** Single synthesized summary. Used for prompt injection and as prior context for rolling updates.
+- **L3 (Final Summary):** Single synthesized summary. Used for prompt injection and as prior context for incremental updates.
 
 **Trade-off:** The three-level hierarchy adds complexity but enables efficient retrieval at multiple granularities. For content under 15000 tokens, we skip L2 entirely (DETAILED level uses only L1 + L3).
 
@@ -126,7 +113,7 @@ Mem0's memory layer research established compression ratio targets:
 
 A generic summarization prompt loses domain-specific signal. By tailoring prompts, we extract what matters for each use case.
 
-### 3.5 Prior Summary Integration (Rolling Updates)
+### 3.5 Prior Summary Integration
 
 **Decision:** Always provide the previous summary as context when generating updates.
 
@@ -136,7 +123,7 @@ A generic summarization prompt loses domain-specific signal. By tailoring prompt
 - **Incremental updates:** Avoid re-summarizing all historical content on every update.
 - **Information preservation:** Important information from earlier content persists through the chain of summaries.
 
-This implements Mem0's "rolling summary" pattern. The L3 summary from the previous run becomes prior context for the next summarization, allowing information to flow forward through time.
+The L3 summary from the previous run becomes prior context for the next summarization, allowing information to flow forward through time.
 
 ### 3.6 Compression Ratio Tracking
 
@@ -220,7 +207,7 @@ Summaries are persisted in two places:
 | `chunk_overlap` | 200 | Overlap between consecutive chunks |
 | `max_concurrent_chunks` | 5 | Parallel LLM calls for chunk summarization |
 
-Level thresholds are constants (100, 500, 3000, 15000 tokens) chosen based on empirical testing and Mem0 research on optimal compression ratios.
+Level thresholds are constants (100, 500, 3000, 15000 tokens) chosen based on empirical testing.
 
 ---
 
@@ -233,17 +220,3 @@ Summarization follows a fail-fast philosophy:
 - **Encoding errors:** Falls back to character-based token estimation.
 
 The caller (memory system) decides how to handle failures—typically by proceeding without a summary rather than blocking the entire write path.
-
----
-
-## 8. Comparison with Alternatives
-
-| Aspect | Adaptive Summarizer | Fixed Rolling Summary | No Summarization |
-| :--- | :--- | :--- | :--- |
-| **Compression** | 3-20% (scales with input) | ~15% fixed | 0% |
-| **Detail preservation** | Hierarchical (L1/L2/L3) | Single level | Full |
-| **Short content** | Skipped (efficient) | Still processed | N/A |
-| **Long content** | Tree structure | Single pass | Context overflow |
-| **Research basis** | Letta + Mem0 | Mem0 | None |
-
-The adaptive approach's key advantage is matching effort to content: short content stays untouched, medium content gets lightweight summarization, and long content gets full hierarchical treatment.

From 584631f70df03731f0706edcde679705f2f3e3aa Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 26 Nov 2025 23:45:11 -0800
Subject: [PATCH 21/37] fix(memory): summarize raw conversation turns, not
 extracted facts

Previously, the summarizer was summarizing the already-compressed
extracted facts, which is redundant. Now it summarizes the actual
user/assistant messages, which is what makes sense for a conversation
summary.
---
 agent_cli/memory/_ingest.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py
index b0b472b71..f2ce90116 100644
--- a/agent_cli/memory/_ingest.py
+++ b/agent_cli/memory/_ingest.py
@@ -414,13 +414,20 @@ async def extract_and_store_facts_and_summaries(
             entries=list(to_add),
         )
 
-    if enable_summarization and facts:
-        # Get prior summary for context continuity
+    # Summarize raw conversation turns (not extracted facts)
+    has_content = user_message or assistant_message
+    if enable_summarization and has_content:
         prior_summary_entry = get_final_summary(collection, conversation_id)
         prior_summary = prior_summary_entry.content if prior_summary_entry else None
 
-        # Summarize the new facts
-        content_to_summarize = "\n".join(facts)
+        # Build conversation transcript
+        parts = []
+        if user_message:
+            parts.append(f"User: {user_message}")
+        if assistant_message:
+            parts.append(f"Assistant: {assistant_message}")
+        content_to_summarize = "\n".join(parts)
+
         summary_start = perf_counter()
         summary_result = await summarize_content(
             content=content_to_summarize,

From bec0384db008a11832ac99940fbe6f34cf24d029 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Thu, 27 Nov 2025 00:02:23 -0800
Subject: [PATCH 22/37] docs: clarify research foundations vs original design
 in summarizer

- Document what's actually borrowed from research:
  - Two-phase architecture from Mem0 (arXiv:2504.19413)
  - Hierarchical merging concept from BOOOOKSCORE (arXiv:2310.00785)

- Clarify what Letta does differently (message count, not tokens)

- Acknowledge original/heuristic design choices:
  - Token thresholds (100/500/3000/15000) are not research-backed
  - L1/L2/L3 hierarchy structure is original
  - Chunk size (3000) is larger than BOOOOKSCORE's 2048

- Add future improvements section based on research findings
---
 agent_cli/summarizer/adaptive.py | 14 +++++--
 docs/architecture/summarizer.md  | 66 ++++++++++++++++++++++++--------
 2 files changed, 62 insertions(+), 18 deletions(-)

diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py
index 4a84ecff9..9536c70eb 100644
--- a/agent_cli/summarizer/adaptive.py
+++ b/agent_cli/summarizer/adaptive.py
@@ -1,9 +1,17 @@
 """Adaptive summarization that scales with input complexity.
 
-Implements hierarchical summarization inspired by Letta's partial eviction approach
-and Mem0's two-phase architecture (extraction then storage).
+Implements hierarchical summarization with multiple compression levels (L1/L2/L3).
 
-Reference: arXiv:2504.19413 (Mem0), arXiv:2310.08560 (MemGPT/Letta)
+Research foundations:
+- Two-phase architecture (extraction then storage) from Mem0 (arXiv:2504.19413)
+- Hierarchical merging concept from BOOOOKSCORE (arXiv:2310.00785)
+
+Original design (not research-backed):
+- Token thresholds (100/500/3000/15000) are heuristic
+- L1/L2/L3 hierarchy structure
+- Chunk size (3000) - BOOOOKSCORE uses 2048
+
+See docs/architecture/summarizer.md for detailed design rationale.
 """
 
 from __future__ import annotations
diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md
index 99318db0f..f08ea1a44 100644
--- a/docs/architecture/summarizer.md
+++ b/docs/architecture/summarizer.md
@@ -20,7 +20,6 @@ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy
 **Design Goals:**
 
 - **Adaptive compression:** Match summarization depth to content complexity.
-- **Research-informed:** Draws techniques from Letta's memory management.
 - **Hierarchical structure:** Preserve detail at multiple granularities for large content.
 - **Content-type awareness:** Domain-specific prompts for conversations, journals, documents.
 
@@ -28,19 +27,45 @@ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy
 
 ## 2. Research Foundations
 
-### 2.1 Letta (MemGPT) Contributions
+This section documents what techniques are borrowed from research vs. what is original design.
+
+### 2.1 Borrowed: Two-Phase Architecture (Mem0)
+
+**Reference:** arXiv:2504.19413
+
+Mem0's memory layer research informed our storage architecture with a **two-phase approach**: separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to both files and vector DB.
+
+### 2.2 Borrowed: Hierarchical Merging Concept (BOOOOKSCORE)
+
+**Reference:** arXiv:2310.00785 (ICLR 2024)
+
+BOOOOKSCORE's research on book-length summarization demonstrated two approaches:
+- **Hierarchical merging:** Summarize chunks, then merge chunk summaries
+- **Incremental updating:** Maintain a running summary updated with each chunk
+
+Key finding: For smaller context models (like local LLMs), hierarchical merging produces more coherent summaries. This informed our L1/L2/L3 structure.
+
+BOOOOKSCORE's defaults: chunk size of **2048 tokens**, max summary length of **900 tokens**.
+
+### 2.3 Not Directly Borrowed: Letta's Approach
 
 **Reference:** arXiv:2310.08560
 
-Letta's approach to memory management introduced the **partial eviction** technique adopted here: rather than discarding old content entirely, compress a portion to summaries while keeping recent content detailed. This maps to our hierarchical L1/L2/L3 structure where L1 preserves chunk-level detail and L3 provides high-level synthesis.
+Letta (MemGPT) uses a different paradigm focused on **context window management**:
+- Message count thresholds (e.g., 10 messages), not token thresholds
+- 30% partial eviction when buffer overflows
+- Purpose: fit conversation in LLM context window
 
-### 2.2 Mem0 Contributions
+Our system has a different purpose (memory compression for storage/retrieval), so while we were inspired by Letta's "partial eviction" concept, our implementation differs significantly.
 
-**Reference:** arXiv:2504.19413
+### 2.4 Original Design (Not Research-Backed)
 
-Mem0's memory layer research informed our storage architecture:
+The following aspects are **original design choices without direct research justification**:
 
-- **Two-phase architecture:** Separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to both files and vector DB.
+- **Token thresholds (100/500/3000/15000):** These numbers were chosen heuristically, not derived from research. They may benefit from tuning.
+- **L1/L2/L3 hierarchy structure:** The three-level design is original. The naming was loosely inspired by aijournal's L1-L4 "context pack" levels, but those serve a different purpose (what to include in LLM context, not summarization levels).
+- **Chunk size (3000 tokens):** This is larger than BOOOOKSCORE's research-backed 2048 tokens. Consider reducing.
+- **L2 group size (5 chunks):** Chosen heuristically.
 
 ---
 
@@ -65,7 +90,7 @@ Mem0's memory layer research informed our storage architecture:
 | DETAILED | 3000-15000 | Chunked + meta-synthesis |
 | HIERARCHICAL | > 15000 | L1/L2/L3 tree |
 
-**Trade-off:** Fixed thresholds may not be optimal for all content types, but provide consistent, predictable behavior. Content-type prompts provide domain adaptation within each level.
+**Caveat:** These thresholds are heuristic, not research-backed. They should be validated empirically.
 
 ### 3.2 Hierarchical Summary Structure (L1/L2/L3)
 
@@ -73,7 +98,7 @@ Mem0's memory layer research informed our storage architecture:
 
 **Rationale:**
 
-- **Partial eviction:** Inspired by Letta—keep detailed summaries for granular retrieval, compressed summaries for context injection.
+- **Hierarchical merging:** Research (BOOOOKSCORE) shows this approach works well for smaller context models.
 - **Flexible retrieval:** Different use cases need different detail levels. RAG queries might want L1 chunks; prompt injection wants L3.
 - **Progressive compression:** Each level compresses the previous, achieving high overall compression while preserving structure.
 
@@ -177,7 +202,7 @@ The parallelism at L1 and L2 levels provides significant speedup for long conten
 
 The memory system triggers summarization during post-processing:
 
-1. Collect content to summarize (extracted facts, conversation turns)
+1. Collect raw conversation turns (user message + assistant message)
 2. Retrieve existing L3 summary as prior context
 3. Call summarizer with content + prior summary + content type
 4. Persist results: delete old summaries, write new files, upsert to ChromaDB
@@ -201,13 +226,13 @@ Summaries are persisted in two places:
 
 ## 6. Configuration
 
-| Parameter | Default | Description |
+| Parameter | Default | Research Comparison |
 | :--- | :--- | :--- |
-| `chunk_size` | 3000 | Target tokens per chunk |
-| `chunk_overlap` | 200 | Overlap between consecutive chunks |
-| `max_concurrent_chunks` | 5 | Parallel LLM calls for chunk summarization |
+| `chunk_size` | 3000 | BOOOOKSCORE uses 2048 |
+| `chunk_overlap` | 200 | No direct comparison |
+| `max_concurrent_chunks` | 5 | Implementation choice |
 
-Level thresholds are constants (100, 500, 3000, 15000 tokens) chosen based on empirical testing.
+Level thresholds (100, 500, 3000, 15000 tokens) are heuristic and not derived from published research.
 
 ---
 
@@ -220,3 +245,14 @@ Summarization follows a fail-fast philosophy:
 - **Encoding errors:** Falls back to character-based token estimation.
 
 The caller (memory system) decides how to handle failures—typically by proceeding without a summary rather than blocking the entire write path.
+
+---
+
+## 8. Future Improvements
+
+Based on research findings, consider:
+
+1. **Reduce chunk size to 2048** to align with BOOOOKSCORE's tested defaults
+2. **Validate token thresholds empirically** with real-world content
+3. **Add incremental updating mode** as alternative to hierarchical merging for larger context models
+4. **Benchmark against BOOOOKSCORE metrics** for coherence evaluation

From 2a88706db1c45a4239450a4e74f06fb9efbe5f1a Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Thu, 27 Nov 2025 08:59:28 -0800
Subject: [PATCH 23/37] refactor(summarizer): simplify to NONE/BRIEF/MAP_REDUCE
 levels

Remove old hierarchical summarization (STANDARD, DETAILED, HIERARCHICAL)
in favor of a simpler 3-level system inspired by LangChain's map-reduce:

- NONE: Skip summarization for very short content (<100 tokens)
- BRIEF: Single-pass summary for short content (100-500 tokens)
- MAP_REDUCE: LangChain-style map-reduce for longer content (500+ tokens)

Key changes:
- Add map_reduce.py with dynamic collapse algorithm
- Remove HierarchicalSummary and ChunkSummary classes
- Rename summary_level_name to summary_level in metadata
- Add collapse_depth field to track map-reduce iterations
- Use research-backed defaults (chunk_size=2048, token_max=3000)
- Update all tests for simplified API
- No backward compatibility - clean break from old implementation
---
 agent_cli/agents/summarize.py        |  44 +--
 agent_cli/memory/_files.py           |   2 +-
 agent_cli/memory/_ingest.py          |  18 +-
 agent_cli/memory/_persistence.py     |  20 +-
 agent_cli/memory/models.py           |  22 +-
 agent_cli/summarizer/__init__.py     |  14 +-
 agent_cli/summarizer/_prompts.py     |   8 +-
 agent_cli/summarizer/_utils.py       |  11 +-
 agent_cli/summarizer/adaptive.py     | 320 +++++-------------
 agent_cli/summarizer/map_reduce.py   | 349 +++++++++++++++++++
 agent_cli/summarizer/models.py       | 202 ++---------
 docs/architecture/summarizer.md      | 198 ++++++-----
 examples/summarizer_demo.py          |  91 ++---
 tests/memory/test_engine.py          |   8 +-
 tests/memory/test_git_integration.py |   3 +-
 tests/memory/test_store.py           | 135 ++------
 tests/summarizer/test_adaptive.py    | 147 ++++----
 tests/summarizer/test_integration.py | 481 +++------------------------
 tests/summarizer/test_models.py      | 224 ++-----------
 tests/summarizer/test_utils.py       |  36 +-
 20 files changed, 880 insertions(+), 1453 deletions(-)
 create mode 100644 agent_cli/summarizer/map_reduce.py

diff --git a/agent_cli/agents/summarize.py b/agent_cli/agents/summarize.py
index abc8dfc72..ec516310e 100644
--- a/agent_cli/agents/summarize.py
+++ b/agent_cli/agents/summarize.py
@@ -1,4 +1,4 @@
-"""Summarize text files or stdin using adaptive hierarchical summarization."""
+"""Summarize text files or stdin using adaptive map-reduce summarization."""
 
 from __future__ import annotations
 
@@ -131,7 +131,7 @@ def _display_full_result(
     *,
     quiet: bool,
 ) -> None:
-    """Display full hierarchical result with all levels."""
+    """Display full result with all metadata."""
     if quiet:
         if result.summary:
             print(result.summary)
@@ -143,34 +143,12 @@ def _display_full_result(
     console.print(f"  Input tokens: [bold]{result.input_tokens:,}[/bold]")
     console.print(f"  Output tokens: [bold]{result.output_tokens:,}[/bold]")
     console.print(f"  Compression: [bold]{result.compression_ratio:.1%}[/bold]")
+    if result.collapse_depth > 0:
+        console.print(f"  Collapse depth: [bold]{result.collapse_depth}[/bold]")
     console.print(f"  Time: [bold]{elapsed:.2f}s[/bold]")
     console.print()
 
-    if result.hierarchical:
-        if result.hierarchical.l1_summaries:
-            console.print(
-                f"[bold yellow]L1 Chunk Summaries "
-                f"({len(result.hierarchical.l1_summaries)} chunks)[/bold yellow]",
-            )
-            for cs in result.hierarchical.l1_summaries:
-                console.print(
-                    f"\n[dim]--- Chunk {cs.chunk_index + 1} "
-                    f"({cs.source_tokens:,} → {cs.token_count:,} tokens) ---[/dim]",
-                )
-                console.print(cs.content)
-
-        if result.hierarchical.l2_summaries:
-            console.print(
-                f"\n[bold yellow]L2 Group Summaries "
-                f"({len(result.hierarchical.l2_summaries)} groups)[/bold yellow]",
-            )
-            for idx, l2_summary in enumerate(result.hierarchical.l2_summaries):
-                console.print(f"\n[dim]--- Group {idx + 1} ---[/dim]")
-                console.print(l2_summary)
-
-        console.print("\n[bold green]L3 Final Summary[/bold green]")
-        print_output_panel(result.hierarchical.l3_summary, title="Final Summary")
-    elif result.summary:
+    if result.summary:
         print_output_panel(
             result.summary,
             title=f"Summary ({result.level.name})",
@@ -296,9 +274,9 @@ def summarize_command(
     ),
     # --- Chunking Options ---
     chunk_size: int = typer.Option(
-        3000,
+        2048,
         "--chunk-size",
-        help="Target token count per chunk for hierarchical summarization.",
+        help="Target token count per chunk for map-reduce summarization.",
         rich_help_panel="Chunking Options",
     ),
     chunk_overlap: int = typer.Option(
@@ -341,15 +319,13 @@ def summarize_command(
     config_file: str | None = opts.CONFIG_FILE,
     print_args: bool = opts.PRINT_ARGS,
 ) -> None:
-    """Summarize text using adaptive hierarchical summarization.
+    """Summarize text using adaptive map-reduce summarization.
 
     Reads from a file or stdin and produces a summary scaled to the input complexity:
 
     - NONE (<100 tokens): No summary needed
     - BRIEF (100-500): Single sentence
-    - STANDARD (500-3000): Paragraph
-    - DETAILED (3000-15000): Chunked with meta-summary
-    - HIERARCHICAL (>15000): Full L1/L2/L3 tree
+    - MAP_REDUCE (>500): Dynamic collapse until fits token budget
 
     Examples:
         # Summarize a file
@@ -361,7 +337,7 @@ def summarize_command(
         # Pipe content from stdin
         cat book.txt | agent-cli summarize
 
-        # Get full hierarchical output
+        # Get full output with all metadata
         agent-cli summarize large_document.txt --output full
 
         # Use OpenAI instead of Ollama
diff --git a/agent_cli/memory/_files.py b/agent_cli/memory/_files.py
index 0bb0a5d94..50b7400cf 100644
--- a/agent_cli/memory/_files.py
+++ b/agent_cli/memory/_files.py
@@ -23,7 +23,7 @@
 _SNAPSHOT_FILENAME = "memory_index.json"
 _DELETED_DIRNAME = "deleted"
 
-# Summary level constants for hierarchical file structure
+# Summary level constants for file structure (kept for backward compatibility)
 _SUMMARY_LEVEL_L1 = 1
 _SUMMARY_LEVEL_L2 = 2
 _SUMMARY_LEVEL_L3 = 3
diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py
index f2ce90116..e50e2ac45 100644
--- a/agent_cli/memory/_ingest.py
+++ b/agent_cli/memory/_ingest.py
@@ -13,7 +13,7 @@
 from agent_cli.memory._persistence import (
     delete_memory_files,
     persist_entries,
-    persist_hierarchical_summary,
+    persist_summary,
 )
 from agent_cli.memory._prompt import (
     FACT_INSTRUCTIONS,
@@ -290,7 +290,7 @@ async def summarize_content(
     """Adaptively summarize content based on its length.
 
     Automatically selects the appropriate summarization strategy
-    (NONE, BRIEF, STANDARD, DETAILED, HIERARCHICAL) based on input token count.
+    (NONE, BRIEF, MAP_REDUCE) based on input token count.
 
     Args:
         content: The content to summarize.
@@ -326,27 +326,21 @@ async def store_adaptive_summary(
     conversation_id: str,
     summary_result: SummaryResult,
 ) -> list[str]:
-    """Store an adaptive summary result to files and ChromaDB.
+    """Store a summary result to files and ChromaDB.
 
-    This stores all levels of a hierarchical summary (L1, L2, L3) or
-    just the final summary for simpler levels. Old summaries are deleted first.
-
-    Files are stored as Markdown with YAML front matter in a hierarchical structure:
-    - summaries/L1/chunk_{n}.md - L1 chunk summaries
-    - summaries/L2/group_{n}.md - L2 group summaries
-    - summaries/L3/final.md - L3 final summary
+    Old summaries are deleted first, then the new summary is stored.
 
     Args:
         collection: ChromaDB collection.
         memory_root: Root path for memory files.
         conversation_id: The conversation this summary belongs to.
-        summary_result: The result from AdaptiveSummarizer.summarize().
+        summary_result: The result from summarize().
 
     Returns:
         List of IDs that were stored.
 
     """
-    return persist_hierarchical_summary(
+    return persist_summary(
         collection,
         memory_root=memory_root,
         conversation_id=conversation_id,
diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py
index 1bb2102d4..a7e3871e2 100644
--- a/agent_cli/memory/_persistence.py
+++ b/agent_cli/memory/_persistence.py
@@ -167,19 +167,19 @@ def evict_if_needed(
     delete_memory_files(memory_root, conversation_id, ids_to_remove)
 
 
-def persist_hierarchical_summary(
+def persist_summary(
     collection: Collection,
     *,
     memory_root: Path,
     conversation_id: str,
     summary_result: SummaryResult,
 ) -> list[str]:
-    """Persist a hierarchical summary to disk and ChromaDB.
+    """Persist a summary to disk and ChromaDB.
 
     This function:
     1. Deletes existing summaries (files and ChromaDB entries)
-    2. Writes new summary files to disk in hierarchical structure
-    3. Stores entries in ChromaDB
+    2. Writes new summary file to disk
+    3. Stores entry in ChromaDB
 
     Args:
         collection: ChromaDB collection.
@@ -219,14 +219,12 @@ def persist_hierarchical_summary(
             role=meta_dict["role"],
             created_at=meta_dict.get("created_at", created_at),
             summary_kind="summary",
-            level=meta_dict.get("level"),
             is_final=meta_dict.get("is_final"),
-            chunk_index=meta_dict.get("chunk_index"),
-            group_index=meta_dict.get("group_index"),
             input_tokens=meta_dict.get("input_tokens"),
             output_tokens=meta_dict.get("output_tokens"),
             compression_ratio=meta_dict.get("compression_ratio"),
-            summary_level_name=meta_dict.get("summary_level_name"),
+            summary_level=meta_dict.get("summary_level"),
+            collapse_depth=meta_dict.get("collapse_depth"),
         )
         record = write_memory_file(
             memory_root,
@@ -234,7 +232,11 @@ def persist_hierarchical_summary(
             doc_id=entry["id"],
             metadata=metadata,
         )
-        LOGGER.info("Persisted summary file: %s (level=%s)", record.path, meta_dict.get("level"))
+        LOGGER.info(
+            "Persisted summary file: %s (level=%s)",
+            record.path,
+            meta_dict.get("summary_level"),
+        )
         stored_ids.append(record.id)
 
     # Store in ChromaDB (reuse the entries we already built)
diff --git a/agent_cli/memory/models.py b/agent_cli/memory/models.py
index 06266c575..5b8df3855 100644
--- a/agent_cli/memory/models.py
+++ b/agent_cli/memory/models.py
@@ -49,23 +49,25 @@ class MemoryMetadata(BaseModel):
     replaced_by: str | None = None
     source_id: str | None = None
 
-    # Hierarchical summary fields (only used when role="summary")
+    # Summary fields (only used when role="summary")
     level: int | None = None
-    """Summary level: 1=chunk, 2=group, 3=final."""
+    """Summary level (deprecated, kept for file structure compatibility)."""
     is_final: bool | None = None
-    """Whether this is the final L3 summary."""
+    """Whether this is the final summary."""
     chunk_index: int | None = None
-    """For L1 summaries: index of the source chunk."""
+    """Deprecated: index of the source chunk."""
     group_index: int | None = None
-    """For L2 summaries: index of this group."""
+    """Deprecated: index of this group."""
     input_tokens: int | None = None
-    """Number of tokens in the original input (L3 only)."""
+    """Number of tokens in the original input."""
     output_tokens: int | None = None
-    """Number of tokens in the summary output (L3 only)."""
+    """Number of tokens in the summary output."""
     compression_ratio: float | None = None
-    """Ratio of output to input tokens (L3 only)."""
-    summary_level_name: str | None = None
-    """Name of the SummaryLevel enum used (e.g., 'STANDARD', 'HIERARCHICAL')."""
+    """Ratio of output to input tokens."""
+    summary_level: str | None = None
+    """Name of the SummaryLevel enum used (NONE, BRIEF, or MAP_REDUCE)."""
+    collapse_depth: int | None = None
+    """Number of collapse iterations in map-reduce (0 = no collapse needed)."""
 
 
 class StoredMemory(BaseModel):
diff --git a/agent_cli/summarizer/__init__.py b/agent_cli/summarizer/__init__.py
index fc0994c4c..af977ada1 100644
--- a/agent_cli/summarizer/__init__.py
+++ b/agent_cli/summarizer/__init__.py
@@ -1,8 +1,13 @@
 """Adaptive summarization module for variable-length content.
 
-This module provides research-grounded summarization that scales with input complexity,
-inspired by Letta (partial eviction, middle truncation) and Mem0 (rolling summaries,
-compression ratios) architectures.
+This module provides map-reduce summarization inspired by LangChain's approach:
+1. Split content into chunks and summarize each in parallel (map phase)
+2. Recursively collapse summaries until they fit token_max (reduce phase)
+
+Research foundations:
+- LangChain ReduceDocumentsChain: token_max=3000, recursive collapse
+- BOOOOKSCORE (arXiv:2310.00785): chunk_size=2048 optimal
+- Two-phase architecture concept from Mem0 (arXiv:2504.19413)
 
 Example:
     from agent_cli.summarizer import summarize, SummarizerConfig
@@ -17,10 +22,9 @@
 """
 
 from agent_cli.summarizer.adaptive import SummarizationError, SummarizerConfig, summarize
-from agent_cli.summarizer.models import HierarchicalSummary, SummaryLevel, SummaryResult
+from agent_cli.summarizer.models import SummaryLevel, SummaryResult
 
 __all__ = [
-    "HierarchicalSummary",
     "SummarizationError",
     "SummarizerConfig",
     "SummaryLevel",
diff --git a/agent_cli/summarizer/_prompts.py b/agent_cli/summarizer/_prompts.py
index f46b39ebf..1de5fa44f 100644
--- a/agent_cli/summarizer/_prompts.py
+++ b/agent_cli/summarizer/_prompts.py
@@ -4,7 +4,7 @@
 and are optimized for structured, factual output.
 """
 
-# Level 1: BRIEF - Single sentence summary
+# BRIEF level - Single sentence summary for short content (100-500 tokens)
 BRIEF_SUMMARY_PROMPT = """Summarize the following in ONE sentence (maximum 20 words).
 Focus on the single most important point or takeaway.
 
@@ -13,7 +13,7 @@
 
 One-sentence summary:""".strip()
 
-# Level 2: STANDARD - Paragraph summary
+# MAP_REDUCE level - Paragraph summary for content-type aware summarization
 STANDARD_SUMMARY_PROMPT = """Summarize the following content concisely in a short paragraph.
 
 Focus on:
@@ -28,7 +28,7 @@
 
 Summary (maximum {max_words} words):""".strip()
 
-# Level 3: DETAILED - Used for individual chunks in hierarchical summarization
+# CHUNK - Used in map phase of map-reduce summarization
 CHUNK_SUMMARY_PROMPT = """Summarize this section of a longer document.
 Capture the main points while preserving important details.
 
@@ -37,7 +37,7 @@
 
 Summary of this section (maximum {max_words} words):""".strip()
 
-# Level 4: META - Combine multiple summaries into one
+# META - Combine multiple summaries in reduce phase
 META_SUMMARY_PROMPT = """Synthesize these summaries into a single coherent overview.
 Identify common themes and key points across all sections.
 Eliminate redundancy while preserving unique insights.
diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py
index 1c447f321..8dbfb1ffd 100644
--- a/agent_cli/summarizer/_utils.py
+++ b/agent_cli/summarizer/_utils.py
@@ -228,15 +228,8 @@ def estimate_summary_tokens(input_tokens: int, level: int) -> int:
         return 0
     if level == SummaryLevel.BRIEF:
         return min(50, max(20, input_tokens // 5))
-    if level == SummaryLevel.STANDARD:
-        return min(200, max(50, input_tokens // 8))
-    if level == SummaryLevel.DETAILED:
-        return min(500, max(100, input_tokens // 15))
-    # HIERARCHICAL
-    # Base of 1000 tokens plus diminishing returns for additional content
-    base = 1000
-    additional = max(0, (input_tokens - 15000) // 100)
-    return min(2000, base + additional)
+    # MAP_REDUCE: ~10% compression with floor/ceiling
+    return min(500, max(50, input_tokens // 10))
 
 
 def tokens_to_words(tokens: int) -> int:
diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py
index 9536c70eb..39669e97d 100644
--- a/agent_cli/summarizer/adaptive.py
+++ b/agent_cli/summarizer/adaptive.py
@@ -1,22 +1,23 @@
-"""Adaptive summarization that scales with input complexity.
+"""Adaptive summarization using map-reduce with dynamic collapse.
 
-Implements hierarchical summarization with multiple compression levels (L1/L2/L3).
+Implements a simple algorithm inspired by LangChain's map-reduce chains:
+1. If content is short enough, summarize directly
+2. Otherwise, split into chunks and summarize each (map phase)
+3. Recursively collapse summaries until they fit token_max (reduce phase)
 
 Research foundations:
-- Two-phase architecture (extraction then storage) from Mem0 (arXiv:2504.19413)
-- Hierarchical merging concept from BOOOOKSCORE (arXiv:2310.00785)
+- LangChain ReduceDocumentsChain: token_max=3000, recursive collapse
+- BOOOOKSCORE (arXiv:2310.00785): chunk_size=2048 optimal
+- Two-phase architecture concept from Mem0 (arXiv:2504.19413)
 
-Original design (not research-backed):
-- Token thresholds (100/500/3000/15000) are heuristic
-- L1/L2/L3 hierarchy structure
-- Chunk size (3000) - BOOOOKSCORE uses 2048
+Key insight: No need for predetermined L1/L2/L3 levels.
+Dynamic collapse depth based on actual content length.
 
 See docs/architecture/summarizer.md for detailed design rationale.
 """
 
 from __future__ import annotations
 
-import asyncio
 import logging
 from dataclasses import dataclass
 
@@ -24,21 +25,20 @@
 
 from agent_cli.summarizer._prompts import (
     BRIEF_SUMMARY_PROMPT,
-    CHUNK_SUMMARY_PROMPT,
-    META_SUMMARY_PROMPT,
     format_prior_context,
-    format_summaries_for_meta,
     get_prompt_for_content_type,
 )
 from agent_cli.summarizer._utils import (
-    chunk_text,
     count_tokens,
     estimate_summary_tokens,
     tokens_to_words,
 )
+from agent_cli.summarizer.map_reduce import (
+    MapReduceConfig,
+    MapReduceSummarizationError,
+    map_reduce_summarize,
+)
 from agent_cli.summarizer.models import (
-    ChunkSummary,
-    HierarchicalSummary,
     SummaryLevel,
     SummaryResult,
 )
@@ -46,18 +46,8 @@
 logger = logging.getLogger(__name__)
 
 # Thresholds for summary levels (in tokens)
-LEVEL_THRESHOLDS = {
-    SummaryLevel.NONE: 100,
-    SummaryLevel.BRIEF: 500,
-    SummaryLevel.STANDARD: 3000,
-    SummaryLevel.DETAILED: 15000,
-    # HIERARCHICAL is everything above DETAILED
-}
-
-# Number of L1 chunks to group together for L2 summaries
-L2_GROUP_SIZE = 5
-# Minimum number of L1 chunks before L2 grouping is applied
-L2_MIN_CHUNKS = 5
+THRESHOLD_NONE = 100  # Below this, no summary needed
+THRESHOLD_BRIEF = 500  # Below this, just a single sentence
 
 
 class SummaryOutput(BaseModel):
@@ -88,7 +78,8 @@ class SummarizerConfig:
     openai_base_url: str
     model: str
     api_key: str | None = None
-    chunk_size: int = 3000
+    chunk_size: int = 2048  # BOOOOKSCORE's tested default
+    token_max: int = 3000  # LangChain's default - when to collapse
     chunk_overlap: int = 200
     max_concurrent_chunks: int = 5
     timeout: float = 60.0
@@ -102,15 +93,11 @@ def __post_init__(self) -> None:
 
 def determine_level(token_count: int) -> SummaryLevel:
     """Map token count to appropriate SummaryLevel."""
-    if token_count < LEVEL_THRESHOLDS[SummaryLevel.NONE]:
+    if token_count < THRESHOLD_NONE:
         return SummaryLevel.NONE
-    if token_count < LEVEL_THRESHOLDS[SummaryLevel.BRIEF]:
+    if token_count < THRESHOLD_BRIEF:
         return SummaryLevel.BRIEF
-    if token_count < LEVEL_THRESHOLDS[SummaryLevel.STANDARD]:
-        return SummaryLevel.STANDARD
-    if token_count < LEVEL_THRESHOLDS[SummaryLevel.DETAILED]:
-        return SummaryLevel.DETAILED
-    return SummaryLevel.HIERARCHICAL
+    return SummaryLevel.MAP_REDUCE
 
 
 async def summarize(
@@ -121,6 +108,11 @@ async def summarize(
 ) -> SummaryResult:
     """Summarize content with adaptive strategy based on length.
 
+    Uses a simple algorithm:
+    - Very short content (<100 tokens): No summary
+    - Short content (<500 tokens): Single sentence brief summary
+    - Everything else: Map-reduce with dynamic collapse
+
     Args:
         content: The content to summarize.
         config: Summarizer configuration.
@@ -135,7 +127,6 @@ async def summarize(
         return SummaryResult(
             level=SummaryLevel.NONE,
             summary=None,
-            hierarchical=None,
             input_tokens=0,
             output_tokens=0,
             compression_ratio=0.0,
@@ -155,7 +146,6 @@ async def summarize(
         return SummaryResult(
             level=level,
             summary=None,
-            hierarchical=None,
             input_tokens=input_tokens,
             output_tokens=0,
             compression_ratio=0.0,
@@ -163,68 +153,22 @@ async def summarize(
 
     if level == SummaryLevel.BRIEF:
         summary = await _brief_summary(content, config)
-    elif level == SummaryLevel.STANDARD:
-        summary = await _standard_summary(content, config, prior_summary, content_type)
-    elif level == SummaryLevel.DETAILED:
-        return await _detailed_summary(content, input_tokens, config)
-    else:  # HIERARCHICAL
-        return await _hierarchical_summary(content, input_tokens, config)
-
-    output_tokens = count_tokens(summary, config.model) if summary else 0
-    compression_ratio = output_tokens / input_tokens if input_tokens > 0 else 0.0
-
-    return SummaryResult(
-        level=level,
-        summary=summary,
-        hierarchical=None,
-        input_tokens=input_tokens,
-        output_tokens=output_tokens,
-        compression_ratio=compression_ratio,
-    )
-
-
-async def _summarize_chunks(
-    chunks: list[str],
-    config: SummarizerConfig,
-) -> list[ChunkSummary]:
-    """Summarize chunks concurrently with semaphore-controlled parallelism."""
-    semaphore = asyncio.Semaphore(config.max_concurrent_chunks)
-    total = len(chunks)
-
-    async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary:
-        async with semaphore:
-            return await _summarize_single_chunk(chunk, idx, total, config)
-
-    gen = (summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks))
-    return list(await asyncio.gather(*gen))
-
-
-async def _summarize_single_chunk(
-    chunk: str,
-    chunk_index: int,
-    total_chunks: int,
-    config: SummarizerConfig,
-) -> ChunkSummary:
-    """Summarize a single chunk and return its metadata."""
-    source_tokens = count_tokens(chunk, config.model)
-    target_tokens = estimate_summary_tokens(source_tokens, SummaryLevel.STANDARD)
-    max_words = tokens_to_words(target_tokens)
-
-    prompt = CHUNK_SUMMARY_PROMPT.format(
-        chunk_index=chunk_index + 1,
-        total_chunks=total_chunks,
-        content=chunk,
-        max_words=max_words,
-    )
-
-    summary = await _generate_summary(prompt, config, max_tokens=target_tokens + 50)
-    summary_tokens = count_tokens(summary, config.model)
+        output_tokens = count_tokens(summary, config.model) if summary else 0
+        return SummaryResult(
+            level=level,
+            summary=summary,
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0,
+        )
 
-    return ChunkSummary(
-        chunk_index=chunk_index,
-        content=summary,
-        token_count=summary_tokens,
-        source_tokens=source_tokens,
+    # MAP_REDUCE level
+    return await _map_reduce_summary(
+        content,
+        input_tokens,
+        config,
+        prior_summary,
+        content_type,
     )
 
 
@@ -234,159 +178,77 @@ async def _brief_summary(content: str, config: SummarizerConfig) -> str:
     return await _generate_summary(prompt, config, max_tokens=50)
 
 
-async def _standard_summary(
+async def _map_reduce_summary(
     content: str,
+    input_tokens: int,
     config: SummarizerConfig,
     prior_summary: str | None,
     content_type: str,
-) -> str:
-    """Generate a paragraph summary for standard-length content."""
-    input_tokens = count_tokens(content, config.model)
-    target_tokens = estimate_summary_tokens(input_tokens, SummaryLevel.STANDARD)
-    max_words = tokens_to_words(target_tokens)
-
-    prompt_template = get_prompt_for_content_type(content_type)
-    prior_context = format_prior_context(prior_summary)
-
-    prompt = prompt_template.format(
-        content=content,
-        prior_context=prior_context,
-        max_words=max_words,
-    )
-
-    return await _generate_summary(prompt, config, max_tokens=target_tokens + 50)
-
-
-async def _detailed_summary(
-    content: str,
-    input_tokens: int,
-    config: SummarizerConfig,
 ) -> SummaryResult:
-    """Generate chunked summaries with meta-summary for detailed content."""
-    chunks = chunk_text(
-        content,
-        chunk_size=config.chunk_size,
-        overlap=config.chunk_overlap,
-        model=config.model,
-    )
-
-    logger.info("Detailed summary: processing %d chunks", len(chunks))
-
-    chunk_summaries = await _summarize_chunks(chunks, config)
-
-    # Generate meta-summary
-    all_summaries = [cs.content for cs in chunk_summaries]
-    meta_target = estimate_summary_tokens(input_tokens, SummaryLevel.DETAILED)
-    max_words = tokens_to_words(meta_target)
-
-    meta_prompt = META_SUMMARY_PROMPT.format(
-        summaries=format_summaries_for_meta(all_summaries),
-        max_words=max_words,
-    )
-
-    final_summary = await _generate_summary(
-        meta_prompt,
-        config,
-        max_tokens=meta_target + 100,
-    )
-    output_tokens = count_tokens(final_summary, config.model)
+    """Use map-reduce with dynamic collapse for longer content."""
+    # For content that fits in a single chunk, use content-type aware summary
+    if input_tokens <= config.token_max:
+        summary = await _content_aware_summary(content, config, prior_summary, content_type)
+        output_tokens = count_tokens(summary, config.model) if summary else 0
+        return SummaryResult(
+            level=SummaryLevel.MAP_REDUCE,
+            summary=summary,
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0,
+            collapse_depth=0,
+        )
 
-    hierarchical = HierarchicalSummary(
-        l1_summaries=list(chunk_summaries),
-        l2_summaries=[],  # Not used for DETAILED level
-        l3_summary=final_summary,
+    # Use map-reduce for multi-chunk content
+    mr_config = MapReduceConfig(
+        openai_base_url=config.openai_base_url,
+        model=config.model,
+        api_key=config.api_key,
         chunk_size=config.chunk_size,
+        token_max=config.token_max,
         chunk_overlap=config.chunk_overlap,
+        max_concurrent=config.max_concurrent_chunks,
+        timeout=config.timeout,
     )
 
+    try:
+        result = await map_reduce_summarize(content, mr_config)
+    except MapReduceSummarizationError as e:
+        raise SummarizationError(str(e)) from e
+
     return SummaryResult(
-        level=SummaryLevel.DETAILED,
-        summary=final_summary,
-        hierarchical=hierarchical,
-        input_tokens=input_tokens,
-        output_tokens=output_tokens,
-        compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0,
+        level=SummaryLevel.MAP_REDUCE,
+        summary=result.summary,
+        input_tokens=result.input_tokens,
+        output_tokens=result.output_tokens,
+        compression_ratio=result.compression_ratio,
+        collapse_depth=result.collapse_depth,
     )
 
 
-async def _hierarchical_summary(
+async def _content_aware_summary(
     content: str,
-    input_tokens: int,
     config: SummarizerConfig,
-) -> SummaryResult:
-    """Build a tree of summaries for very long content.
-
-    Structure:
-    - L1: Individual chunk summaries
-    - L2: Group summaries (groups of ~5 L1 summaries)
-    - L3: Final synthesis
-    """
-    chunks = chunk_text(
-        content,
-        chunk_size=config.chunk_size,
-        overlap=config.chunk_overlap,
-        model=config.model,
+    prior_summary: str | None,
+    content_type: str,
+) -> str:
+    """Generate a content-type aware summary for single-chunk content."""
+    target_tokens = estimate_summary_tokens(
+        count_tokens(content, config.model),
+        SummaryLevel.MAP_REDUCE,
     )
+    max_words = tokens_to_words(target_tokens)
 
-    logger.info("Hierarchical summary: processing %d chunks in tree", len(chunks))
-
-    # L1: Summarize each chunk
-    l1_summaries = await _summarize_chunks(chunks, config)
-
-    # L2: Group summaries (if more than L2_MIN_CHUNKS chunks)
-    l2_summaries: list[str] = []
-    if len(l1_summaries) > L2_MIN_CHUNKS:
-        groups: list[list[str]] = []
-        for i in range(0, len(l1_summaries), L2_GROUP_SIZE):
-            group = [cs.content for cs in l1_summaries[i : i + L2_GROUP_SIZE]]
-            groups.append(group)
-
-        async def summarize_group(group: list[str]) -> str:
-            combined_tokens = sum(count_tokens(s, config.model) for s in group)
-            target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD)
-            max_words = tokens_to_words(target_tokens)
-
-            prompt = META_SUMMARY_PROMPT.format(
-                summaries=format_summaries_for_meta(group),
-                max_words=max_words,
-            )
-            return await _generate_summary(prompt, config, max_tokens=target_tokens + 50)
-
-        l2_summaries = await asyncio.gather(*[summarize_group(g) for g in groups])
-
-    # L3: Final synthesis
-    summaries_to_synthesize = l2_summaries if l2_summaries else [cs.content for cs in l1_summaries]
-    final_target = estimate_summary_tokens(input_tokens, SummaryLevel.HIERARCHICAL)
-    max_words = tokens_to_words(final_target)
+    prompt_template = get_prompt_for_content_type(content_type)
+    prior_context = format_prior_context(prior_summary)
 
-    final_prompt = META_SUMMARY_PROMPT.format(
-        summaries=format_summaries_for_meta(summaries_to_synthesize),
+    prompt = prompt_template.format(
+        content=content,
+        prior_context=prior_context,
         max_words=max_words,
     )
 
-    final_summary = await _generate_summary(
-        final_prompt,
-        config,
-        max_tokens=final_target + 100,
-    )
-    output_tokens = count_tokens(final_summary, config.model)
-
-    hierarchical = HierarchicalSummary(
-        l1_summaries=list(l1_summaries),
-        l2_summaries=list(l2_summaries),
-        l3_summary=final_summary,
-        chunk_size=config.chunk_size,
-        chunk_overlap=config.chunk_overlap,
-    )
-
-    return SummaryResult(
-        level=SummaryLevel.HIERARCHICAL,
-        summary=final_summary,
-        hierarchical=hierarchical,
-        input_tokens=input_tokens,
-        output_tokens=output_tokens,
-        compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0,
-    )
+    return await _generate_summary(prompt, config, max_tokens=target_tokens + 50)
 
 
 async def _generate_summary(
diff --git a/agent_cli/summarizer/map_reduce.py b/agent_cli/summarizer/map_reduce.py
new file mode 100644
index 000000000..09d82d09c
--- /dev/null
+++ b/agent_cli/summarizer/map_reduce.py
@@ -0,0 +1,349 @@
+"""Map-reduce summarization inspired by LangChain's approach.
+
+Simple algorithm:
+1. Map: Split content into chunks, summarize each in parallel
+2. Reduce: If combined summaries exceed token_max, recursively collapse
+
+Key insight from LangChain: No need for predetermined levels (L1/L2/L3).
+Just keep collapsing until content fits. Dynamic depth based on actual content.
+
+References:
+- LangChain ReduceDocumentsChain: token_max=3000, recursive collapse
+- BOOOOKSCORE: chunk_size=2048 optimal for summarization
+
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+from dataclasses import dataclass
+
+from pydantic import BaseModel
+
+from agent_cli.summarizer._prompts import (
+    CHUNK_SUMMARY_PROMPT,
+    META_SUMMARY_PROMPT,
+    format_summaries_for_meta,
+)
+from agent_cli.summarizer._utils import (
+    chunk_text,
+    count_tokens,
+    estimate_summary_tokens,
+    tokens_to_words,
+)
+from agent_cli.summarizer.models import SummaryLevel
+
+logger = logging.getLogger(__name__)
+
+
+class SummaryOutput(BaseModel):
+    """Structured output for summary generation."""
+
+    summary: str
+
+
+class MapReduceSummarizationError(Exception):
+    """Raised when map-reduce summarization fails."""
+
+
+@dataclass
+class MapReduceConfig:
+    """Configuration for map-reduce summarization.
+
+    Attributes:
+        openai_base_url: Base URL for OpenAI-compatible API.
+        model: Model name for summarization.
+        api_key: Optional API key.
+        chunk_size: Target size for splitting content (tokens).
+                   LangChain uses 3000, BOOOOKSCORE suggests 2048.
+        token_max: Maximum tokens for combined summaries before collapsing.
+                  When combined summaries exceed this, we recursively reduce.
+        chunk_overlap: Overlap between chunks for context continuity.
+        max_concurrent: Maximum parallel summarization calls.
+        timeout: Timeout for API calls in seconds.
+        max_collapse_depth: Safety limit on recursive collapse depth.
+
+    """
+
+    openai_base_url: str
+    model: str
+    api_key: str | None = None
+    chunk_size: int = 2048  # BOOOOKSCORE's tested default
+    token_max: int = 3000  # LangChain's default
+    chunk_overlap: int = 200
+    max_concurrent: int = 5
+    timeout: float = 60.0
+    max_collapse_depth: int = 10  # Safety limit
+
+    def __post_init__(self) -> None:
+        """Normalize the base URL."""
+        self.openai_base_url = self.openai_base_url.rstrip("/")
+        if self.api_key is None:
+            self.api_key = "not-needed"
+
+
+@dataclass
+class MapReduceResult:
+    """Result of map-reduce summarization.
+
+    Attributes:
+        summary: The final collapsed summary.
+        input_tokens: Token count of original content.
+        output_tokens: Token count of final summary.
+        compression_ratio: output_tokens / input_tokens.
+        collapse_depth: How many reduce iterations were needed.
+        intermediate_summaries: All intermediate summaries (for debugging/storage).
+
+    """
+
+    summary: str
+    input_tokens: int
+    output_tokens: int
+    compression_ratio: float
+    collapse_depth: int
+    intermediate_summaries: list[list[str]]  # Each level of collapse
+
+
+async def map_reduce_summarize(
+    content: str,
+    config: MapReduceConfig,
+) -> MapReduceResult:
+    """Summarize content using map-reduce with dynamic collapse.
+
+    Algorithm:
+    1. If content fits in token_max, summarize directly
+    2. Otherwise, split into chunks and summarize each (map phase)
+    3. If combined summaries exceed token_max, recursively collapse (reduce phase)
+    4. Continue until everything fits in token_max
+
+    Args:
+        content: The content to summarize.
+        config: Map-reduce configuration.
+
+    Returns:
+        MapReduceResult with summary and metadata.
+
+    """
+    if not content or not content.strip():
+        return MapReduceResult(
+            summary="",
+            input_tokens=0,
+            output_tokens=0,
+            compression_ratio=0.0,
+            collapse_depth=0,
+            intermediate_summaries=[],
+        )
+
+    input_tokens = count_tokens(content, config.model)
+
+    # If content already fits, just summarize directly
+    if input_tokens <= config.token_max:
+        summary = await _summarize_text(content, config)
+        output_tokens = count_tokens(summary, config.model)
+        return MapReduceResult(
+            summary=summary,
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0,
+            collapse_depth=0,
+            intermediate_summaries=[],
+        )
+
+    # Map phase: Split and summarize chunks in parallel
+    chunks = chunk_text(
+        content,
+        chunk_size=config.chunk_size,
+        overlap=config.chunk_overlap,
+        model=config.model,
+    )
+
+    logger.info("Map phase: processing %d chunks", len(chunks))
+    summaries = await _map_summarize(chunks, config)
+    intermediate_summaries = [summaries.copy()]
+
+    # Reduce phase: Recursively collapse until fits token_max
+    depth = 0
+    while _total_tokens(summaries, config.model) > config.token_max:
+        depth += 1
+        if depth > config.max_collapse_depth:
+            logger.warning(
+                "Hit max collapse depth %d, forcing final summary",
+                config.max_collapse_depth,
+            )
+            break
+
+        logger.info(
+            "Reduce phase (depth %d): collapsing %d summaries (%d tokens)",
+            depth,
+            len(summaries),
+            _total_tokens(summaries, config.model),
+        )
+        summaries = await _collapse_summaries(summaries, config)
+        intermediate_summaries.append(summaries.copy())
+
+    # Final synthesis if we have multiple summaries left
+    if len(summaries) > 1:
+        final_summary = await _synthesize(summaries, config)
+    else:
+        final_summary = summaries[0] if summaries else ""
+
+    output_tokens = count_tokens(final_summary, config.model)
+
+    return MapReduceResult(
+        summary=final_summary,
+        input_tokens=input_tokens,
+        output_tokens=output_tokens,
+        compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0,
+        collapse_depth=depth,
+        intermediate_summaries=intermediate_summaries,
+    )
+
+
+def _total_tokens(texts: list[str], model: str) -> int:
+    """Count total tokens across all texts."""
+    return sum(count_tokens(t, model) for t in texts)
+
+
+async def _map_summarize(chunks: list[str], config: MapReduceConfig) -> list[str]:
+    """Summarize each chunk in parallel (map phase)."""
+    semaphore = asyncio.Semaphore(config.max_concurrent)
+    total = len(chunks)
+
+    async def summarize_chunk(idx: int, chunk: str) -> str:
+        async with semaphore:
+            return await _summarize_chunk(chunk, idx, total, config)
+
+    tasks = [summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)]
+    return list(await asyncio.gather(*tasks))
+
+
+async def _summarize_chunk(
+    chunk: str,
+    chunk_index: int,
+    total_chunks: int,
+    config: MapReduceConfig,
+) -> str:
+    """Summarize a single chunk."""
+    source_tokens = count_tokens(chunk, config.model)
+    target_tokens = estimate_summary_tokens(source_tokens, SummaryLevel.MAP_REDUCE)
+    max_words = tokens_to_words(target_tokens)
+
+    prompt = CHUNK_SUMMARY_PROMPT.format(
+        chunk_index=chunk_index + 1,
+        total_chunks=total_chunks,
+        content=chunk,
+        max_words=max_words,
+    )
+
+    return await _generate_summary(prompt, config, max_tokens=target_tokens + 50)
+
+
+async def _collapse_summaries(
+    summaries: list[str],
+    config: MapReduceConfig,
+) -> list[str]:
+    """Collapse summaries by grouping and re-summarizing (reduce phase).
+
+    Groups summaries that together fit within token_max, then summarizes each group.
+    This is similar to LangChain's split_list_of_docs approach.
+    """
+    if len(summaries) <= 1:
+        return summaries
+
+    # Group summaries that together fit within token_max
+    groups: list[list[str]] = []
+    current_group: list[str] = []
+    current_tokens = 0
+
+    for summary in summaries:
+        summary_tokens = count_tokens(summary, config.model)
+
+        # If adding this summary would exceed token_max, start new group
+        if current_tokens + summary_tokens > config.token_max and current_group:
+            groups.append(current_group)
+            current_group = [summary]
+            current_tokens = summary_tokens
+        else:
+            current_group.append(summary)
+            current_tokens += summary_tokens
+
+    if current_group:
+        groups.append(current_group)
+
+    # Summarize each group in parallel
+    semaphore = asyncio.Semaphore(config.max_concurrent)
+
+    async def summarize_group(group: list[str]) -> str:
+        async with semaphore:
+            return await _synthesize(group, config)
+
+    tasks = [summarize_group(g) for g in groups]
+    return list(await asyncio.gather(*tasks))
+
+
+async def _synthesize(summaries: list[str], config: MapReduceConfig) -> str:
+    """Synthesize multiple summaries into one."""
+    combined_tokens = sum(count_tokens(s, config.model) for s in summaries)
+    target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.MAP_REDUCE)
+    max_words = tokens_to_words(target_tokens)
+
+    prompt = META_SUMMARY_PROMPT.format(
+        summaries=format_summaries_for_meta(summaries),
+        max_words=max_words,
+    )
+
+    return await _generate_summary(prompt, config, max_tokens=target_tokens + 100)
+
+
+async def _summarize_text(text: str, config: MapReduceConfig) -> str:
+    """Summarize text that fits within token_max."""
+    input_tokens = count_tokens(text, config.model)
+    target_tokens = estimate_summary_tokens(input_tokens, SummaryLevel.MAP_REDUCE)
+    max_words = tokens_to_words(target_tokens)
+
+    prompt = f"""Summarize the following content in {max_words} words or less.
+Focus on the key points and main ideas.
+
+Content:
+{text}
+
+Summary:"""
+
+    return await _generate_summary(prompt, config, max_tokens=target_tokens + 50)
+
+
+async def _generate_summary(
+    prompt: str,
+    config: MapReduceConfig,
+    max_tokens: int = 256,
+) -> str:
+    """Call the LLM to generate a summary."""
+    from pydantic_ai import Agent  # noqa: PLC0415
+    from pydantic_ai.models.openai import OpenAIChatModel  # noqa: PLC0415
+    from pydantic_ai.providers.openai import OpenAIProvider  # noqa: PLC0415
+    from pydantic_ai.settings import ModelSettings  # noqa: PLC0415
+
+    provider = OpenAIProvider(api_key=config.api_key, base_url=config.openai_base_url)
+    model = OpenAIChatModel(
+        model_name=config.model,
+        provider=provider,
+        settings=ModelSettings(
+            temperature=0.3,
+            max_tokens=max_tokens,
+        ),
+    )
+
+    agent = Agent(
+        model=model,
+        system_prompt="You are a concise summarizer. Output only the summary, no preamble.",
+        output_type=SummaryOutput,
+        retries=2,
+    )
+
+    try:
+        result = await agent.run(prompt)
+        return result.output.summary.strip()
+    except Exception as e:
+        msg = f"Map-reduce summarization failed: {e}"
+        raise MapReduceSummarizationError(msg) from e
diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py
index 36407e459..be0d309be 100644
--- a/agent_cli/summarizer/models.py
+++ b/agent_cli/summarizer/models.py
@@ -1,4 +1,4 @@
-"""Data models for adaptive summarization."""
+"""Data models for map-reduce summarization."""
 
 from __future__ import annotations
 
@@ -8,103 +8,31 @@
 
 from pydantic import BaseModel, Field
 
-# Hierarchical level constants for storage
-HIERARCHICAL_LEVEL_L1 = 1
-HIERARCHICAL_LEVEL_L2 = 2
-HIERARCHICAL_LEVEL_L3 = 3
-
 
 class SummaryLevel(IntEnum):
-    """Summary granularity levels based on input complexity."""
+    """Summary strategy based on input length."""
 
     NONE = 0
-    """< 100 tokens: No summary needed, facts only."""
+    """< 100 tokens: No summary needed."""
 
     BRIEF = 1
-    """100-500 tokens: Single-sentence summary (~20% compression)."""
-
-    STANDARD = 2
-    """500-3000 tokens: Paragraph summary (~12% compression)."""
-
-    DETAILED = 3
-    """3000-15000 tokens: Chunked summaries + meta-summary (~7% compression)."""
-
-    HIERARCHICAL = 4
-    """> 15000 tokens: Tree of summaries with multiple levels."""
+    """100-500 tokens: Single-sentence summary."""
 
-
-class ChunkSummary(BaseModel):
-    """Summary of a single chunk within a hierarchical summary."""
-
-    chunk_index: int = Field(..., description="Index of this chunk in the original content")
-    content: str = Field(..., description="The summarized content of this chunk")
-    token_count: int = Field(..., ge=0, description="Token count of this summary")
-    source_tokens: int = Field(..., ge=0, description="Token count of the source chunk")
-
-
-class HierarchicalSummary(BaseModel):
-    """A hierarchical summary with multiple levels.
-
-    Structure inspired by Letta's partial eviction pattern:
-    - L1: Individual chunk summaries (parallel processing)
-    - L2: Group summaries (groups of ~5 L1 summaries)
-    - L3: Final synthesis (single top-level summary)
-    """
-
-    l1_summaries: list[ChunkSummary] = Field(
-        default_factory=list,
-        description="Level 1: Individual chunk summaries",
-    )
-    l2_summaries: list[str] = Field(
-        default_factory=list,
-        description="Level 2: Group summaries (if > 5 chunks)",
-    )
-    l3_summary: str = Field(
-        ...,
-        description="Level 3: Final synthesized summary",
-    )
-    chunk_size: int = Field(
-        default=3000,
-        description="Token size used for chunking",
-    )
-    chunk_overlap: int = Field(
-        default=200,
-        description="Token overlap between chunks",
-    )
-
-    def get_summary_at_level(self, level: int) -> str | list[str]:
-        """Get summary content at a specific level.
-
-        Args:
-            level: 1 for chunk summaries, 2 for group summaries, 3 for final.
-
-        Returns:
-            Summary content at the requested level.
-
-        """
-        if level == HIERARCHICAL_LEVEL_L1:
-            return [cs.content for cs in self.l1_summaries]
-        if level == HIERARCHICAL_LEVEL_L2:
-            return self.l2_summaries if self.l2_summaries else [self.l3_summary]
-        return self.l3_summary
+    MAP_REDUCE = 2
+    """> 500 tokens: Map-reduce with dynamic collapse."""
 
 
 class SummaryResult(BaseModel):
-    """Result of adaptive summarization.
+    """Result of summarization.
 
-    Contains the summary at the appropriate level for the input complexity,
-    along with metadata about the compression achieved.
+    Contains the summary and metadata about the compression achieved.
     """
 
-    level: SummaryLevel = Field(..., description="The summarization level used")
+    level: SummaryLevel = Field(..., description="The summarization strategy used")
     summary: str | None = Field(
         default=None,
         description="The final summary text (None for NONE level)",
     )
-    hierarchical: HierarchicalSummary | None = Field(
-        default=None,
-        description="Full hierarchical structure (for DETAILED/HIERARCHICAL levels)",
-    )
     input_tokens: int = Field(..., ge=0, description="Token count of the input content")
     output_tokens: int = Field(..., ge=0, description="Token count of the summary")
     compression_ratio: float = Field(
@@ -113,100 +41,40 @@ class SummaryResult(BaseModel):
         le=1.0,
         description="Ratio of output to input tokens (lower = more compression)",
     )
+    collapse_depth: int = Field(
+        default=0,
+        ge=0,
+        description="Number of collapse iterations in map-reduce (0 = no collapse needed)",
+    )
     created_at: datetime = Field(
         default_factory=lambda: datetime.now(UTC),
         description="Timestamp when summary was created",
     )
 
-    @property
-    def chunk_summaries(self) -> list[str] | None:
-        """Get L1 chunk summaries if available."""
-        if self.hierarchical:
-            return [cs.content for cs in self.hierarchical.l1_summaries]
-        return None
-
     def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]:
-        """Convert to metadata entries for ChromaDB storage.
+        """Convert to metadata entry for ChromaDB storage.
 
-        Returns a list of metadata dicts, one for each summary level stored.
+        Returns a list with a single metadata dict for the summary.
         """
-        entries: list[dict[str, Any]] = []
+        if self.level == SummaryLevel.NONE or not self.summary:
+            return []
+
         timestamp = self.created_at.isoformat()
 
-        if self.level == SummaryLevel.NONE:
-            return entries
-
-        # For hierarchical summaries, store each level
-        if self.hierarchical:
-            # L1: Individual chunk summaries
-            entries.extend(
-                {
-                    "id": f"{conversation_id}:summary:L1:{cs.chunk_index}",
-                    "content": cs.content,
-                    "metadata": {
-                        "conversation_id": conversation_id,
-                        "role": "summary",
-                        "level": HIERARCHICAL_LEVEL_L1,
-                        "chunk_index": cs.chunk_index,
-                        "token_count": cs.token_count,
-                        "created_at": timestamp,
-                    },
-                }
-                for cs in self.hierarchical.l1_summaries
-            )
-
-            # L2: Group summaries
-            entries.extend(
-                {
-                    "id": f"{conversation_id}:summary:L2:{idx}",
-                    "content": l2_summary,
-                    "metadata": {
-                        "conversation_id": conversation_id,
-                        "role": "summary",
-                        "level": HIERARCHICAL_LEVEL_L2,
-                        "group_index": idx,
-                        "created_at": timestamp,
-                    },
-                }
-                for idx, l2_summary in enumerate(self.hierarchical.l2_summaries)
-            )
-
-            # L3: Final summary
-            entries.append(
-                {
-                    "id": f"{conversation_id}:summary:L3:final",
-                    "content": self.hierarchical.l3_summary,
-                    "metadata": {
-                        "conversation_id": conversation_id,
-                        "role": "summary",
-                        "level": HIERARCHICAL_LEVEL_L3,
-                        "is_final": True,
-                        "summary_level_name": self.level.name,
-                        "input_tokens": self.input_tokens,
-                        "output_tokens": self.output_tokens,
-                        "compression_ratio": self.compression_ratio,
-                        "created_at": timestamp,
-                    },
-                },
-            )
-        elif self.summary:
-            # Non-hierarchical: just store the single summary
-            entries.append(
-                {
-                    "id": f"{conversation_id}:summary:L3:final",
-                    "content": self.summary,
-                    "metadata": {
-                        "conversation_id": conversation_id,
-                        "role": "summary",
-                        "level": HIERARCHICAL_LEVEL_L3,
-                        "is_final": True,
-                        "summary_level_name": self.level.name,
-                        "input_tokens": self.input_tokens,
-                        "output_tokens": self.output_tokens,
-                        "compression_ratio": self.compression_ratio,
-                        "created_at": timestamp,
-                    },
+        return [
+            {
+                "id": f"{conversation_id}:summary",
+                "content": self.summary,
+                "metadata": {
+                    "conversation_id": conversation_id,
+                    "role": "summary",
+                    "is_final": True,
+                    "summary_level": self.level.name,
+                    "input_tokens": self.input_tokens,
+                    "output_tokens": self.output_tokens,
+                    "compression_ratio": self.compression_ratio,
+                    "collapse_depth": self.collapse_depth,
+                    "created_at": timestamp,
                 },
-            )
-
-        return entries
+            },
+        ]
diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md
index f08ea1a44..c34540bc1 100644
--- a/docs/architecture/summarizer.md
+++ b/docs/architecture/summarizer.md
@@ -4,23 +4,23 @@ This document describes the architectural decisions, design rationale, and techn
 
 ## 1. System Overview
 
-The adaptive summarizer provides **content-aware compression** that scales summarization depth with input complexity. Rather than applying a one-size-fits-all approach, it automatically selects the optimal strategy based on token count.
+The adaptive summarizer provides **content-aware compression** using a map-reduce approach inspired by LangChain's chains. Rather than applying fixed summarization levels, it dynamically collapses content until it fits within a token budget.
 
 ```
-Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy
+Input Content ──▶ Token Count ──▶ Strategy Selection
                                         │
-        ┌───────────────────────────────┼───────────────────────────────┐
-        │                               │                               │
-   < 100 tokens                   500-15000 tokens                > 15000 tokens
-        │                               │                               │
-   No summary needed            Chunked processing              Hierarchical tree
-                                  + meta-synthesis                  (L1/L2/L3)
+        ┌───────────────────────────────┼─────────────────────┐
+        │                               │                     │
+   < 100 tokens                  100-500 tokens         > 500 tokens
+        │                               │                     │
+   No summary                    Brief summary           Map-Reduce
+                                (single sentence)     (dynamic collapse)
 ```
 
 **Design Goals:**
 
-- **Adaptive compression:** Match summarization depth to content complexity.
-- **Hierarchical structure:** Preserve detail at multiple granularities for large content.
+- **Simple algorithm:** Map-reduce with dynamic collapse depth based on actual content.
+- **Research-grounded defaults:** chunk_size=2048 (BOOOOKSCORE), token_max=3000 (LangChain).
 - **Content-type awareness:** Domain-specific prompts for conversations, journals, documents.
 
 ---
@@ -29,25 +29,31 @@ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy
 
 This section documents what techniques are borrowed from research vs. what is original design.
 
-### 2.1 Borrowed: Two-Phase Architecture (Mem0)
+### 2.1 Borrowed: LangChain Map-Reduce Pattern
 
-**Reference:** arXiv:2504.19413
+**Reference:** LangChain `ReduceDocumentsChain`
 
-Mem0's memory layer research informed our storage architecture with a **two-phase approach**: separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to both files and vector DB.
+LangChain's approach to document summarization uses a simple algorithm:
+1. **Map phase:** Split content into chunks, summarize each in parallel
+2. **Reduce phase:** If combined summaries exceed `token_max`, recursively collapse until they fit
 
-### 2.2 Borrowed: Hierarchical Merging Concept (BOOOOKSCORE)
+Key insight: No need for predetermined L1/L2/L3 levels. Dynamic depth based on actual content length. LangChain's default `token_max=3000`.
+
+### 2.2 Borrowed: Chunk Size (BOOOOKSCORE)
 
 **Reference:** arXiv:2310.00785 (ICLR 2024)
 
-BOOOOKSCORE's research on book-length summarization demonstrated two approaches:
-- **Hierarchical merging:** Summarize chunks, then merge chunk summaries
-- **Incremental updating:** Maintain a running summary updated with each chunk
+BOOOOKSCORE's research on book-length summarization found optimal chunk sizes. Their defaults:
+- Chunk size: **2048 tokens** (we use this)
+- Max summary length: **900 tokens**
 
-Key finding: For smaller context models (like local LLMs), hierarchical merging produces more coherent summaries. This informed our L1/L2/L3 structure.
+### 2.3 Borrowed: Two-Phase Architecture (Mem0)
 
-BOOOOKSCORE's defaults: chunk size of **2048 tokens**, max summary length of **900 tokens**.
+**Reference:** arXiv:2504.19413
 
-### 2.3 Not Directly Borrowed: Letta's Approach
+Mem0's memory layer research informed our storage architecture with a **two-phase approach**: separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to both files and vector DB.
+
+### 2.4 Not Directly Borrowed: Letta's Approach
 
 **Reference:** arXiv:2310.08560
 
@@ -56,61 +62,79 @@ Letta (MemGPT) uses a different paradigm focused on **context window management*
 - 30% partial eviction when buffer overflows
 - Purpose: fit conversation in LLM context window
 
-Our system has a different purpose (memory compression for storage/retrieval), so while we were inspired by Letta's "partial eviction" concept, our implementation differs significantly.
+Our system has a different purpose (memory compression for storage/retrieval), so our implementation differs significantly.
 
-### 2.4 Original Design (Not Research-Backed)
+### 2.5 Original Design (Not Research-Backed)
 
 The following aspects are **original design choices without direct research justification**:
 
-- **Token thresholds (100/500/3000/15000):** These numbers were chosen heuristically, not derived from research. They may benefit from tuning.
-- **L1/L2/L3 hierarchy structure:** The three-level design is original. The naming was loosely inspired by aijournal's L1-L4 "context pack" levels, but those serve a different purpose (what to include in LLM context, not summarization levels).
-- **Chunk size (3000 tokens):** This is larger than BOOOOKSCORE's research-backed 2048 tokens. Consider reducing.
-- **L2 group size (5 chunks):** Chosen heuristically.
+- **Token thresholds (100/500):** The boundaries between NONE/BRIEF/map-reduce were chosen heuristically.
+- **L2 group logic for storage:** The intermediate summaries stored as "L2" is for backward compatibility with the storage layer.
+- **Content-type prompts:** Domain-specific prompts are original design.
 
 ---
 
 ## 3. Architectural Decisions
 
-### 3.1 Token-Based Level Selection
+### 3.1 Map-Reduce with Dynamic Collapse
 
-**Decision:** Select summarization strategy based on input token count with fixed thresholds.
+**Decision:** Use LangChain-style map-reduce instead of fixed L1/L2/L3 levels.
 
 **Rationale:**
 
-- **Predictable behavior:** Users can anticipate output length based on input size.
-- **Efficiency:** Avoid over-processing short content or under-processing long content.
+- **Simpler algorithm:** No need to distinguish STANDARD/DETAILED/HIERARCHICAL.
+- **Dynamic depth:** Collapse depth adapts to actual content length.
+- **Research-backed:** LangChain's approach is battle-tested.
 
-**Thresholds:**
+**Algorithm:**
 
-| Level | Token Range | Strategy |
-| :--- | :--- | :--- |
-| NONE | < 100 | No summarization needed |
-| BRIEF | 100-500 | Single sentence |
-| STANDARD | 500-3000 | Paragraph |
-| DETAILED | 3000-15000 | Chunked + meta-synthesis |
-| HIERARCHICAL | > 15000 | L1/L2/L3 tree |
+```python
+def map_reduce_summarize(content, token_max=3000):
+    if tokens(content) <= token_max:
+        return summarize_directly(content)
 
-**Caveat:** These thresholds are heuristic, not research-backed. They should be validated empirically.
+    # Map: Split and summarize chunks in parallel
+    chunks = split_into_chunks(content, chunk_size=2048)
+    summaries = [summarize(chunk) for chunk in chunks]
 
-### 3.2 Hierarchical Summary Structure (L1/L2/L3)
+    # Reduce: Recursively collapse until fits
+    while total_tokens(summaries) > token_max:
+        groups = group_summaries_by_token_max(summaries, token_max)
+        summaries = [synthesize(group) for group in groups]
 
-**Decision:** For long content, build a tree of summaries at three levels of granularity.
+    return final_synthesis(summaries)
+```
+
+### 3.2 Token-Based Level Selection (Simplified)
+
+**Decision:** Use three effective levels instead of five.
 
 **Rationale:**
 
-- **Hierarchical merging:** Research (BOOOOKSCORE) shows this approach works well for smaller context models.
-- **Flexible retrieval:** Different use cases need different detail levels. RAG queries might want L1 chunks; prompt injection wants L3.
-- **Progressive compression:** Each level compresses the previous, achieving high overall compression while preserving structure.
+- **Simplicity:** Fewer code paths, easier to understand.
+- **Dynamic instead of fixed:** Map-reduce adapts to content, no need for DETAILED vs HIERARCHICAL distinction.
 
-**Structure:**
+**Effective Levels:**
 
-- **L1 (Chunk Summaries):** Individual summaries of ~3000 token chunks. Preserves local context and specific details. Chunks overlap by ~200 tokens to maintain continuity across boundaries.
-- **L2 (Group Summaries):** Summaries of groups of ~5 L1 summaries. Only generated when content exceeds ~5 chunks. Provides mid-level abstraction.
-- **L3 (Final Summary):** Single synthesized summary. Used for prompt injection and as prior context for incremental updates.
+| Level | Token Range | Strategy |
+| :--- | :--- | :--- |
+| NONE | < 100 | No summarization needed |
+| BRIEF | 100-500 | Single sentence |
+| MAP_REDUCE | > 500 | Dynamic collapse until fits token_max |
+
+**Backward Compatibility:** The output still reports STANDARD, DETAILED, or HIERARCHICAL based on collapse depth for storage compatibility.
 
-**Trade-off:** The three-level hierarchy adds complexity but enables efficient retrieval at multiple granularities. For content under 15000 tokens, we skip L2 entirely (DETAILED level uses only L1 + L3).
+### 3.3 Research-Backed Defaults
+
+**Decision:** Use values from published research.
+
+| Parameter | Value | Source |
+| :--- | :--- | :--- |
+| `chunk_size` | 2048 | BOOOOKSCORE |
+| `token_max` | 3000 | LangChain |
+| `chunk_overlap` | 200 | Original |
 
-### 3.3 Semantic Boundary Chunking
+### 3.4 Semantic Boundary Chunking
 
 **Decision:** Split content on semantic boundaries (paragraphs, then sentences) rather than fixed character counts.
 
@@ -126,7 +150,7 @@ The following aspects are **original design choices without direct research just
 2. Fall back to sentence boundaries (`.!?` followed by space + capital)
 3. Final fallback to character splitting for edge cases (e.g., code blocks without punctuation)
 
-### 3.4 Content-Type Aware Prompts
+### 3.5 Content-Type Aware Prompts
 
 **Decision:** Use different prompt templates for different content domains.
 
@@ -138,7 +162,7 @@ The following aspects are **original design choices without direct research just
 
 A generic summarization prompt loses domain-specific signal. By tailoring prompts, we extract what matters for each use case.
 
-### 3.5 Prior Summary Integration
+### 3.6 Prior Summary Integration
 
 **Decision:** Always provide the previous summary as context when generating updates.
 
@@ -150,7 +174,7 @@ A generic summarization prompt loses domain-specific signal. By tailoring prompt
 
 The L3 summary from the previous run becomes prior context for the next summarization, allowing information to flow forward through time.
 
-### 3.6 Compression Ratio Tracking
+### 3.7 Compression Ratio Tracking
 
 **Decision:** Track and report compression metrics for every summary.
 
@@ -171,28 +195,26 @@ Every `SummaryResult` includes `input_tokens`, `output_tokens`, and `compression
 The entry point counts tokens and selects strategy:
 
 1. **Token counting:** Uses tiktoken with model-appropriate encoding. Falls back to character-based estimation (~4 chars/token) if tiktoken unavailable.
-2. **Threshold comparison:** Maps token count to `SummaryLevel` enum.
-3. **Strategy dispatch:** Calls level-specific handler.
+2. **Threshold comparison:** Determines if NONE, BRIEF, or map-reduce.
+3. **Strategy dispatch:** Calls appropriate handler.
 
-### 4.2 Brief and Standard Levels
+### 4.2 Brief Level
 
-For short content (< 3000 tokens):
+For short content (100-500 tokens):
 
-- Single LLM call with level-appropriate prompt
-- Prior summary injected as context if available
-- Content-type selection determines prompt variant
+- Single LLM call with brief prompt
 - Returns simple `SummaryResult` with no hierarchical structure
 
-### 4.3 Detailed and Hierarchical Levels
+### 4.3 Map-Reduce Level
 
-For longer content:
+For longer content (> 500 tokens):
 
-1. **Chunking:** Split content into overlapping chunks on semantic boundaries.
-2. **Parallel L1 generation:** Summarize each chunk independently. Uses semaphore-controlled concurrency to avoid overwhelming the LLM.
-3. **L2 grouping (hierarchical only):** Organize L1s into groups of ~5, summarize each group.
-4. **L3 synthesis:** Meta-summarize all L2s (or all L1s for DETAILED level) into final summary.
+1. **Check single-chunk:** If content fits in token_max, use content-type aware summary directly.
+2. **Map phase:** Split content into overlapping chunks, summarize each in parallel.
+3. **Reduce phase:** If combined summaries exceed token_max, group and re-summarize recursively.
+4. **Final synthesis:** Combine remaining summaries into final output.
 
-The parallelism at L1 and L2 levels provides significant speedup for long content while maintaining semantic coherence through the hierarchical structure.
+The parallelism in the map phase provides significant speedup for long content while maintaining semantic coherence through the collapse process.
 
 ---
 
@@ -222,17 +244,22 @@ Summaries are persisted in two places:
 - **Files:** Markdown with YAML front matter under `summaries/L1/`, `L2/`, `L3/` directories. Human-readable, git-trackable.
 - **ChromaDB:** Vector embeddings for semantic search. Metadata includes level, compression metrics, timestamps.
 
+For backward compatibility, the dynamic collapse levels are mapped to L1/L2/L3 structure:
+- First collapse level → L1 (chunk summaries)
+- Intermediate levels → L2 (grouped summaries)
+- Final output → L3 (synthesis)
+
 ---
 
 ## 6. Configuration
 
-| Parameter | Default | Research Comparison |
+| Parameter | Default | Source |
 | :--- | :--- | :--- |
-| `chunk_size` | 3000 | BOOOOKSCORE uses 2048 |
-| `chunk_overlap` | 200 | No direct comparison |
-| `max_concurrent_chunks` | 5 | Implementation choice |
-
-Level thresholds (100, 500, 3000, 15000 tokens) are heuristic and not derived from published research.
+| `chunk_size` | 2048 | BOOOOKSCORE |
+| `token_max` | 3000 | LangChain |
+| `chunk_overlap` | 200 | Original |
+| `max_concurrent` | 5 | Implementation choice |
+| `max_collapse_depth` | 10 | Safety limit |
 
 ---
 
@@ -240,19 +267,30 @@ Level thresholds (100, 500, 3000, 15000 tokens) are heuristic and not derived fr
 
 Summarization follows a fail-fast philosophy:
 
-- **LLM errors:** Propagated as `SummarizationError` rather than silently returning empty results.
+- **LLM errors:** Propagated as `SummarizationError` or `MapReduceSummarizationError` rather than silently returning empty results.
 - **Empty input:** Returns NONE level immediately (not an error).
 - **Encoding errors:** Falls back to character-based token estimation.
+- **Max depth exceeded:** Warning logged, forces final synthesis even if over token_max.
 
 The caller (memory system) decides how to handle failures—typically by proceeding without a summary rather than blocking the entire write path.
 
 ---
 
-## 8. Future Improvements
+## 8. Comparison: Old vs New Approach
+
+| Aspect | Old Approach | New Approach |
+| :--- | :--- | :--- |
+| Levels | 5 fixed (NONE/BRIEF/STANDARD/DETAILED/HIERARCHICAL) | 3 effective (NONE/BRIEF/MAP_REDUCE) |
+| Hierarchy | Fixed L1/L2/L3 structure | Dynamic collapse depth |
+| Chunk size | 3000 tokens | 2048 tokens (BOOOOKSCORE) |
+| token_max | N/A (fixed levels) | 3000 (LangChain) |
+| Complexity | Multiple code paths | Single map-reduce algorithm |
+| Research basis | Heuristic | LangChain + BOOOOKSCORE |
+
+---
 
-Based on research findings, consider:
+## 9. Future Improvements
 
-1. **Reduce chunk size to 2048** to align with BOOOOKSCORE's tested defaults
-2. **Validate token thresholds empirically** with real-world content
-3. **Add incremental updating mode** as alternative to hierarchical merging for larger context models
-4. **Benchmark against BOOOOKSCORE metrics** for coherence evaluation
+1. **Benchmark against BOOOOKSCORE metrics** for coherence evaluation
+2. **Add incremental updating mode** as alternative to hierarchical merging for larger context models
+3. **Tune token thresholds empirically** with real-world content
diff --git a/examples/summarizer_demo.py b/examples/summarizer_demo.py
index 6a542dbdc..70d434dda 100644
--- a/examples/summarizer_demo.py
+++ b/examples/summarizer_demo.py
@@ -1,17 +1,15 @@
 """Demonstrate the summarizer on texts of varying lengths from the internet.
 
 This script fetches content of different sizes and shows how the adaptive
-summarizer automatically selects the appropriate strategy (BRIEF, STANDARD,
-DETAILED, or HIERARCHICAL) based on content length.
+summarizer automatically selects the appropriate strategy (BRIEF or MAP_REDUCE)
+based on content length.
 
 Usage:
     python examples/summarizer_demo.py
 
     # Test specific levels only
     python examples/summarizer_demo.py --level brief
-    python examples/summarizer_demo.py --level standard
-    python examples/summarizer_demo.py --level detailed
-    python examples/summarizer_demo.py --level hierarchical
+    python examples/summarizer_demo.py --level map_reduce
 
     # Use a different model
     python examples/summarizer_demo.py --model "gpt-4o-mini"
@@ -58,9 +56,7 @@ class TextSample:
 # Thresholds from adaptive.py:
 # NONE: < 100 tokens
 # BRIEF: 100-500 tokens
-# STANDARD: 500-3000 tokens
-# DETAILED: 3000-15000 tokens
-# HIERARCHICAL: > 15000 tokens
+# MAP_REDUCE: >= 500 tokens
 
 # Sample texts of varying lengths to demonstrate different summarization levels
 SAMPLES: list[TextSample] = [
@@ -98,10 +94,10 @@ class TextSample:
         """,
     ),
     TextSample(
-        name="Standard - Technology Article",
-        description="~800-2000 tokens - triggers STANDARD level (500-3000 token range)",
+        name="Map-Reduce - Technology Article",
+        description="~800-2000 tokens - triggers MAP_REDUCE level (>=500 tokens)",
         url="https://en.wikipedia.org/api/rest_v1/page/summary/Artificial_intelligence",
-        expected_level=SummaryLevel.STANDARD,
+        expected_level=SummaryLevel.MAP_REDUCE,
         content_type="document",
         fallback_content="""
         Artificial intelligence (AI) is the intelligence of machines or software,
@@ -178,18 +174,18 @@ class TextSample:
         """,
     ),
     TextSample(
-        name="Detailed - Full Article",
-        description="~4000-10000 tokens - triggers DETAILED level (3000-15000 token range)",
+        name="Map-Reduce - Full Article",
+        description="~4000-10000 tokens - triggers MAP_REDUCE with chunking",
         url="https://en.wikipedia.org/api/rest_v1/page/mobile-html/Machine_learning",
-        expected_level=SummaryLevel.DETAILED,
+        expected_level=SummaryLevel.MAP_REDUCE,
         content_type="document",
         fallback_content=None,  # We'll generate synthetic content
     ),
     TextSample(
-        name="Hierarchical - Long Document",
-        description="~16000+ tokens - triggers HIERARCHICAL level (>15000 tokens)",
+        name="Map-Reduce - Long Document",
+        description="~16000+ tokens - triggers MAP_REDUCE with multiple collapse iterations",
         url="https://www.gutenberg.org/cache/epub/84/pg84.txt",  # Frankenstein (truncated)
-        expected_level=SummaryLevel.HIERARCHICAL,
+        expected_level=SummaryLevel.MAP_REDUCE,
         content_type="document",
         fallback_content=None,  # We'll generate synthetic content (~16K tokens)
     ),
@@ -229,7 +225,7 @@ def generate_synthetic_content(target_tokens: int, topic: str = "technology") ->
     return "\n\n".join(result)
 
 
-async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str:  # noqa: PLR0912
+async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str:
     """Fetch content from URL or use fallback."""
     try:
         # Add User-Agent header to avoid 403 errors from some sites
@@ -269,9 +265,7 @@ async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str:
         # Check if content is too short for expected level
         min_words_for_level = {
             SummaryLevel.BRIEF: 80,  # Need ~100 tokens
-            SummaryLevel.STANDARD: 400,  # Need ~500 tokens
-            SummaryLevel.DETAILED: 2500,  # Need ~3000 tokens
-            SummaryLevel.HIERARCHICAL: 12000,  # Need ~15000 tokens
+            SummaryLevel.MAP_REDUCE: 400,  # Need ~500 tokens
         }
         min_words = min_words_for_level.get(sample.expected_level, 50)
 
@@ -282,22 +276,17 @@ async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str:
             else:
                 target_tokens = {
                     SummaryLevel.BRIEF: 300,
-                    SummaryLevel.STANDARD: 1500,
-                    SummaryLevel.DETAILED: 8000,
-                    SummaryLevel.HIERARCHICAL: 16000,  # Keep manageable for demo
+                    SummaryLevel.MAP_REDUCE: 1500,
                 }
                 content = generate_synthetic_content(
                     target_tokens.get(sample.expected_level, 1000),
                 )
 
-        # For HIERARCHICAL, truncate very long content to keep demo fast
-        # but ensure we stay above 15000 tokens (~13000 words)
-        if sample.expected_level == SummaryLevel.HIERARCHICAL:
-            words = content.split()
-            # ~16000 tokens ≈ 13500 words (need >15000 tokens for HIERARCHICAL)
-            if len(words) > 13500:  # noqa: PLR2004
-                content = " ".join(words[:13500])
-                print("  📎 Truncated to ~13500 words for faster demo")
+        # For very long content, truncate to keep demo fast
+        words = content.split()
+        if len(words) > 13500:  # noqa: PLR2004
+            content = " ".join(words[:13500])
+            print("  📎 Truncated to ~13500 words for faster demo")
 
         return content.strip()
 
@@ -310,9 +299,7 @@ async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str:
         # Generate synthetic content for the expected level
         target_tokens = {
             SummaryLevel.BRIEF: 300,
-            SummaryLevel.STANDARD: 1500,
-            SummaryLevel.DETAILED: 8000,
-            SummaryLevel.HIERARCHICAL: 16000,  # Keep manageable for demo
+            SummaryLevel.MAP_REDUCE: 1500,
         }
         return generate_synthetic_content(target_tokens.get(sample.expected_level, 1000))
 
@@ -335,9 +322,7 @@ def print_result(sample: TextSample, result: SummaryResult, content: str) -> Non
     level_emoji = {
         SummaryLevel.NONE: "⏭️",
         SummaryLevel.BRIEF: "📝",
-        SummaryLevel.STANDARD: "📄",
-        SummaryLevel.DETAILED: "📚",
-        SummaryLevel.HIERARCHICAL: "🏗️",
+        SummaryLevel.MAP_REDUCE: "🔄",
     }
     print("\n🎯 Summarization Result:")
     print(f"   Level: {level_emoji.get(result.level, '❓')} {result.level.name}")
@@ -345,6 +330,8 @@ def print_result(sample: TextSample, result: SummaryResult, content: str) -> Non
     print(f"   Match: {'✅' if result.level == sample.expected_level else '⚠️'}")
     print(f"   Output tokens: {result.output_tokens:,}")
     print(f"   Compression: {result.compression_ratio:.1%}")
+    if result.collapse_depth > 0:
+        print(f"   Collapse depth: {result.collapse_depth}")
 
     # Summary content
     if result.summary:
@@ -357,23 +344,6 @@ def print_result(sample: TextSample, result: SummaryResult, content: str) -> Non
         )
         print(wrapped)
 
-    # Hierarchical details if present
-    if result.hierarchical:
-        h = result.hierarchical
-        print("\n🏗️  Hierarchical Structure:")
-        print(f"   L1 chunks: {len(h.l1_summaries)}")
-        print(f"   L2 groups: {len(h.l2_summaries)}")
-        if h.l2_summaries:
-            print(f"   L2 preview: {h.l2_summaries[0][:100]}...")
-        print("\n   L3 Final Summary:")
-        wrapped = textwrap.fill(
-            h.l3_summary,
-            width=68,
-            initial_indent="   ",
-            subsequent_indent="   ",
-        )
-        print(wrapped)
-
 
 async def run_demo(
     level_filter: str | None = None,
@@ -394,7 +364,7 @@ async def run_demo(
         openai_base_url=actual_base_url,
         model=actual_model,
         api_key=api_key,
-        chunk_size=3000,
+        chunk_size=2048,  # BOOOOKSCORE default
         max_concurrent_chunks=3,
         timeout=120.0,  # Longer timeout for local models
     )
@@ -404,9 +374,7 @@ async def run_demo(
     if level_filter:
         level_map = {
             "brief": SummaryLevel.BRIEF,
-            "standard": SummaryLevel.STANDARD,
-            "detailed": SummaryLevel.DETAILED,
-            "hierarchical": SummaryLevel.HIERARCHICAL,
+            "map_reduce": SummaryLevel.MAP_REDUCE,
         }
         target_level = level_map.get(level_filter.lower())
         if target_level:
@@ -449,14 +417,15 @@ def main() -> None:
         epilog=textwrap.dedent("""
         Examples:
           python examples/summarizer_demo.py
-          python examples/summarizer_demo.py --level standard
+          python examples/summarizer_demo.py --level brief
+          python examples/summarizer_demo.py --level map_reduce
           python examples/summarizer_demo.py --model "llama3.1:8b" --base-url "http://localhost:11434/v1"
         """),
     )
 
     parser.add_argument(
         "--level",
-        choices=["brief", "standard", "detailed", "hierarchical"],
+        choices=["brief", "map_reduce"],
         help="Only test a specific summarization level",
     )
     parser.add_argument(
diff --git a/tests/memory/test_engine.py b/tests/memory/test_engine.py
index 12e419de9..44d0a031c 100644
--- a/tests/memory/test_engine.py
+++ b/tests/memory/test_engine.py
@@ -355,9 +355,8 @@ def __init__(self, output: Any) -> None:
 
     async def fake_summarize_content(**_kwargs: Any) -> SummaryResult:
         return SummaryResult(
-            level=SummaryLevel.STANDARD,
+            level=SummaryLevel.MAP_REDUCE,
             summary="summary up to 256",
-            hierarchical=None,
             input_tokens=100,
             output_tokens=20,
             compression_ratio=0.2,
@@ -583,9 +582,8 @@ def __init__(self, output: Any) -> None:
 
     async def fake_summarize_content(**_kwargs: Any) -> SummaryResult:
         return SummaryResult(
-            level=SummaryLevel.STANDARD,
+            level=SummaryLevel.MAP_REDUCE,
             summary="summary text",
-            hierarchical=None,
             input_tokens=100,
             output_tokens=20,
             compression_ratio=0.2,
@@ -632,4 +630,4 @@ async def fake_reconcile(
     files = list(tmp_path.glob("entries/**/*.md"))
     assert len(files) == 4  # user + assistant + fact + 1 summary
     assert any("facts" in str(f) for f in files)
-    assert any("summaries/L3/final.md" in str(f) for f in files)
+    assert any("summaries" in str(f) for f in files)
diff --git a/tests/memory/test_git_integration.py b/tests/memory/test_git_integration.py
index db197b023..86040d7a1 100644
--- a/tests/memory/test_git_integration.py
+++ b/tests/memory/test_git_integration.py
@@ -66,9 +66,8 @@ async def fake_reconcile(
 
     async def fake_summarize_content(**_kwargs: Any) -> SummaryResult:
         return SummaryResult(
-            level=SummaryLevel.STANDARD,
+            level=SummaryLevel.MAP_REDUCE,
             summary="User likes testing.",
-            hierarchical=None,
             input_tokens=100,
             output_tokens=20,
             compression_ratio=0.2,
diff --git a/tests/memory/test_store.py b/tests/memory/test_store.py
index 5e8e33142..29dbe2e55 100644
--- a/tests/memory/test_store.py
+++ b/tests/memory/test_store.py
@@ -137,21 +137,21 @@ def test_upsert_and_delete_entries_delegate() -> None:
 
 
 def test_upsert_summary_entries_simple() -> None:
-    """Test upserting a simple (non-hierarchical) summary."""
+    """Test upserting a summary."""
     fake = _FakeCollection()
     entries = [
         {
-            "id": "conv-123:summary:L3:final",
-            "content": "A standard paragraph summary.",
+            "id": "conv-123:summary",
+            "content": "A paragraph summary.",
             "metadata": {
                 "conversation_id": "conv-123",
                 "role": "summary",
-                "level": 3,
                 "is_final": True,
-                "summary_level_name": "STANDARD",
+                "summary_level": "MAP_REDUCE",
                 "input_tokens": 1000,
                 "output_tokens": 50,
                 "compression_ratio": 0.05,
+                "collapse_depth": 0,
                 "created_at": "2024-01-01T00:00:00",
             },
         },
@@ -159,52 +159,30 @@ def test_upsert_summary_entries_simple() -> None:
 
     ids = _store.upsert_summary_entries(fake, entries)
 
-    assert ids == ["conv-123:summary:L3:final"]
+    assert ids == ["conv-123:summary"]
     assert len(fake.upserts) == 1
     upserted_ids, upserted_docs, upserted_metas = fake.upserts[0]
-    assert upserted_ids == ["conv-123:summary:L3:final"]
-    assert upserted_docs == ["A standard paragraph summary."]
-    assert upserted_metas[0]["level"] == 3
+    assert upserted_ids == ["conv-123:summary"]
+    assert upserted_docs == ["A paragraph summary."]
     assert upserted_metas[0]["is_final"] is True
 
 
-def test_upsert_summary_entries_with_chunks() -> None:
-    """Test upserting a hierarchical summary with L1 and L3 entries."""
+def test_upsert_summary_entries_with_collapse_depth() -> None:
+    """Test upserting a summary with collapse depth metadata."""
     fake = _FakeCollection()
     entries = [
         {
-            "id": "conv-456:summary:L1:0",
-            "content": "Chunk 0 summary",
-            "metadata": {
-                "conversation_id": "conv-456",
-                "role": "summary",
-                "level": 1,
-                "chunk_index": 0,
-                "created_at": "2024-01-01T00:00:00",
-            },
-        },
-        {
-            "id": "conv-456:summary:L1:1",
-            "content": "Chunk 1 summary",
-            "metadata": {
-                "conversation_id": "conv-456",
-                "role": "summary",
-                "level": 1,
-                "chunk_index": 1,
-                "created_at": "2024-01-01T00:00:00",
-            },
-        },
-        {
-            "id": "conv-456:summary:L3:final",
+            "id": "conv-456:summary",
             "content": "Final synthesis",
             "metadata": {
                 "conversation_id": "conv-456",
                 "role": "summary",
-                "level": 3,
                 "is_final": True,
+                "summary_level": "MAP_REDUCE",
                 "input_tokens": 5000,
                 "output_tokens": 100,
                 "compression_ratio": 0.02,
+                "collapse_depth": 2,
                 "created_at": "2024-01-01T00:00:00",
             },
         },
@@ -212,10 +190,9 @@ def test_upsert_summary_entries_with_chunks() -> None:
 
     ids = _store.upsert_summary_entries(fake, entries)
 
-    assert len(ids) == 3
-    assert "conv-456:summary:L1:0" in ids
-    assert "conv-456:summary:L1:1" in ids
-    assert "conv-456:summary:L3:final" in ids
+    assert len(ids) == 1
+    assert ids[0] == "conv-456:summary"
+    assert fake.upserts[0][2][0]["collapse_depth"] == 2
 
 
 def test_upsert_summary_entries_empty() -> None:
@@ -228,41 +205,8 @@ def test_upsert_summary_entries_empty() -> None:
     assert len(fake.upserts) == 0
 
 
-def test_get_summary_at_level() -> None:
-    """Test retrieving summaries at a specific level."""
-    fake = _FakeCollection(
-        get_result={
-            "documents": ["Chunk 0", "Chunk 1"],
-            "metadatas": [
-                {
-                    "conversation_id": "c1",
-                    "role": "summary",
-                    "level": 1,
-                    "chunk_index": 0,
-                    "created_at": "now",
-                },
-                {
-                    "conversation_id": "c1",
-                    "role": "summary",
-                    "level": 1,
-                    "chunk_index": 1,
-                    "created_at": "now",
-                },
-            ],
-            "ids": ["c1:summary:L1:0", "c1:summary:L1:1"],
-        },
-    )
-
-    records = _store.get_summary_at_level(fake, "c1", level=1)
-
-    assert len(records) == 2
-    assert records[0].metadata.level == 1
-    assert records[0].metadata.chunk_index == 0
-    assert records[1].metadata.chunk_index == 1
-
-
-def test_get_final_summary_returns_final() -> None:
-    """Test getting the L3 final summary."""
+def test_get_final_summary_returns_summary() -> None:
+    """Test getting the final summary for a conversation."""
     fake = _FakeCollection(
         get_result={
             "documents": ["The final summary"],
@@ -270,12 +214,13 @@ def test_get_final_summary_returns_final() -> None:
                 {
                     "conversation_id": "c1",
                     "role": "summary",
-                    "level": 3,
                     "is_final": True,
+                    "summary_level": "MAP_REDUCE",
+                    "collapse_depth": 1,
                     "created_at": "now",
                 },
             ],
-            "ids": ["c1:summary:L3:final"],
+            "ids": ["c1:summary"],
         },
     )
 
@@ -295,42 +240,28 @@ def test_get_final_summary_returns_none_when_missing() -> None:
     assert result is None
 
 
-def test_delete_summaries_all_levels() -> None:
-    """Test deleting all summary levels for a conversation."""
+def test_delete_summaries() -> None:
+    """Test deleting summaries for a conversation."""
     fake = _FakeCollection(
         get_result={
-            "documents": ["L1", "L3"],
+            "documents": ["The summary"],
             "metadatas": [
-                {"conversation_id": "c1", "role": "summary", "level": 1, "created_at": "now"},
-                {"conversation_id": "c1", "role": "summary", "level": 3, "created_at": "now"},
+                {
+                    "conversation_id": "c1",
+                    "role": "summary",
+                    "summary_level": "MAP_REDUCE",
+                    "created_at": "now",
+                },
             ],
-            "ids": ["c1:summary:L1:0", "c1:summary:L3:final"],
+            "ids": ["c1:summary"],
         },
     )
 
     deleted_count = _store.delete_summaries(fake, "c1")
 
-    assert deleted_count == 2
-    assert len(fake.deleted) == 1
-    assert set(fake.deleted[0]) == {"c1:summary:L1:0", "c1:summary:L3:final"}
-
-
-def test_delete_summaries_specific_levels() -> None:
-    """Test deleting only specific summary levels."""
-    fake = _FakeCollection(
-        get_result={
-            "documents": ["L1 chunk"],
-            "metadatas": [
-                {"conversation_id": "c1", "role": "summary", "level": 1, "created_at": "now"},
-            ],
-            "ids": ["c1:summary:L1:0"],
-        },
-    )
-
-    deleted_count = _store.delete_summaries(fake, "c1", levels=[1])
-
     assert deleted_count == 1
-    assert fake.deleted[0] == ["c1:summary:L1:0"]
+    assert len(fake.deleted) == 1
+    assert fake.deleted[0] == ["c1:summary"]
 
 
 def test_delete_summaries_no_entries() -> None:
diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py
index 6acf43171..a64a72a16 100644
--- a/tests/summarizer/test_adaptive.py
+++ b/tests/summarizer/test_adaptive.py
@@ -7,7 +7,8 @@
 import pytest
 
 from agent_cli.summarizer.adaptive import (
-    LEVEL_THRESHOLDS,
+    THRESHOLD_BRIEF,
+    THRESHOLD_NONE,
     SummarizationError,
     SummarizerConfig,
     SummaryOutput,
@@ -63,9 +64,31 @@ def test_trailing_slash_stripped(self) -> None:
         )
         assert config.openai_base_url == "http://localhost:8000/v1"
 
+    def test_default_chunk_size_is_booookscore(self) -> None:
+        """Test that default chunk_size follows BOOOOKSCORE recommendation."""
+        config = SummarizerConfig(
+            openai_base_url="http://localhost:8000/v1",
+            model="gpt-4",
+        )
+        assert config.chunk_size == 2048  # BOOOOKSCORE's tested default
+
+    def test_default_token_max_is_langchain(self) -> None:
+        """Test that default token_max follows LangChain's default."""
+        config = SummarizerConfig(
+            openai_base_url="http://localhost:8000/v1",
+            model="gpt-4",
+        )
+        assert config.token_max == 3000  # LangChain's default
+
 
 class TestDetermineLevel:
-    """Tests for level determination based on token count."""
+    """Tests for level determination based on token count.
+
+    The simplified approach has 3 levels:
+    - NONE: Very short content (< 100 tokens)
+    - BRIEF: Short content (100-500 tokens)
+    - MAP_REDUCE: Everything else (uses map-reduce)
+    """
 
     def test_none_level_threshold(self) -> None:
         """Test NONE level for very short content."""
@@ -78,30 +101,17 @@ def test_brief_level_threshold(self) -> None:
         assert determine_level(300) == SummaryLevel.BRIEF
         assert determine_level(499) == SummaryLevel.BRIEF
 
-    def test_standard_level_threshold(self) -> None:
-        """Test STANDARD level for medium content."""
-        assert determine_level(500) == SummaryLevel.STANDARD
-        assert determine_level(1500) == SummaryLevel.STANDARD
-        assert determine_level(2999) == SummaryLevel.STANDARD
-
-    def test_detailed_level_threshold(self) -> None:
-        """Test DETAILED level for longer content."""
-        assert determine_level(3000) == SummaryLevel.DETAILED
-        assert determine_level(8000) == SummaryLevel.DETAILED
-        assert determine_level(14999) == SummaryLevel.DETAILED
-
-    def test_hierarchical_level_threshold(self) -> None:
-        """Test HIERARCHICAL level for very long content."""
-        assert determine_level(15000) == SummaryLevel.HIERARCHICAL
-        assert determine_level(50000) == SummaryLevel.HIERARCHICAL
-        assert determine_level(100000) == SummaryLevel.HIERARCHICAL
+    def test_map_reduce_level_for_longer_content(self) -> None:
+        """Test that content >= 500 tokens uses MAP_REDUCE."""
+        assert determine_level(500) == SummaryLevel.MAP_REDUCE
+        assert determine_level(1500) == SummaryLevel.MAP_REDUCE
+        assert determine_level(5000) == SummaryLevel.MAP_REDUCE
+        assert determine_level(20000) == SummaryLevel.MAP_REDUCE
 
     def test_thresholds_match_constants(self) -> None:
         """Verify thresholds match the module constants."""
-        assert LEVEL_THRESHOLDS[SummaryLevel.NONE] == 100
-        assert LEVEL_THRESHOLDS[SummaryLevel.BRIEF] == 500
-        assert LEVEL_THRESHOLDS[SummaryLevel.STANDARD] == 3000
-        assert LEVEL_THRESHOLDS[SummaryLevel.DETAILED] == 15000
+        assert THRESHOLD_NONE == 100
+        assert THRESHOLD_BRIEF == 500
 
 
 class TestSummarize:
@@ -168,92 +178,81 @@ async def test_brief_level_calls_brief_summary(
         assert result.summary == "Brief summary."
 
     @pytest.mark.asyncio
-    @patch("agent_cli.summarizer.adaptive._standard_summary")
-    async def test_standard_level_calls_standard_summary(
+    @patch("agent_cli.summarizer.adaptive._map_reduce_summary")
+    async def test_longer_content_uses_map_reduce(
         self,
-        mock_standard: AsyncMock,
+        mock_map_reduce: AsyncMock,
         config: SummarizerConfig,
     ) -> None:
-        """Test that STANDARD level content calls _standard_summary."""
-        mock_standard.return_value = "Standard summary paragraph."
+        """Test that content >= 500 tokens uses map-reduce."""
+        mock_result = SummaryResult(
+            level=SummaryLevel.MAP_REDUCE,
+            summary="Map-reduce summary.",
+            input_tokens=800,
+            output_tokens=100,
+            compression_ratio=0.125,
+        )
+        mock_map_reduce.return_value = mock_result
 
-        # Create content that's ~500-3000 tokens
+        # Create content that's ~500+ tokens
         content = "This is a test sentence with more words. " * 100  # ~800 tokens
 
         result = await summarize(content, config, content_type="general")
 
-        mock_standard.assert_called_once_with(content, config, None, "general")
-        assert result.level == SummaryLevel.STANDARD
-        assert result.summary == "Standard summary paragraph."
+        mock_map_reduce.assert_called_once()
+        assert result.summary == "Map-reduce summary."
 
     @pytest.mark.asyncio
-    @patch("agent_cli.summarizer.adaptive._standard_summary")
-    async def test_prior_summary_passed_to_standard(
+    @patch("agent_cli.summarizer.adaptive._map_reduce_summary")
+    async def test_prior_summary_passed_to_map_reduce(
         self,
-        mock_standard: AsyncMock,
+        mock_map_reduce: AsyncMock,
         config: SummarizerConfig,
     ) -> None:
-        """Test that prior_summary is passed to _standard_summary."""
-        mock_standard.return_value = "Updated summary."
-
-        content = "This is a test sentence with more words. " * 100
-        prior = "Previous context summary."
-
-        await summarize(content, config, prior_summary=prior)
-
-        mock_standard.assert_called_once_with(content, config, prior, "general")
-
-    @pytest.mark.asyncio
-    @patch("agent_cli.summarizer.adaptive._detailed_summary")
-    async def test_detailed_level_calls_detailed_summary(
-        self,
-        mock_detailed: AsyncMock,
-        config: SummarizerConfig,
-    ) -> None:
-        """Test that DETAILED level content calls _detailed_summary."""
+        """Test that prior_summary is passed to _map_reduce_summary."""
         mock_result = SummaryResult(
-            level=SummaryLevel.DETAILED,
-            summary="Detailed summary.",
-            hierarchical=None,
-            input_tokens=5000,
+            level=SummaryLevel.MAP_REDUCE,
+            summary="Updated summary.",
+            input_tokens=800,
             output_tokens=100,
-            compression_ratio=0.02,
+            compression_ratio=0.125,
         )
-        mock_detailed.return_value = mock_result
+        mock_map_reduce.return_value = mock_result
 
-        # Create content that's ~3000-15000 tokens
-        content = "Word " * 5000  # ~5000 tokens
+        content = "This is a test sentence with more words. " * 100
+        prior = "Previous context summary."
 
-        result = await summarize(content, config)
+        await summarize(content, config, prior_summary=prior)
 
-        assert mock_detailed.called
-        assert result.level == SummaryLevel.DETAILED
+        # Verify prior_summary was passed
+        call_args = mock_map_reduce.call_args
+        assert call_args[0][3] == prior  # prior_summary is 4th positional arg
 
     @pytest.mark.asyncio
-    @patch("agent_cli.summarizer.adaptive._hierarchical_summary")
-    async def test_hierarchical_level_calls_hierarchical_summary(
+    @patch("agent_cli.summarizer.adaptive._map_reduce_summary")
+    async def test_very_long_content_uses_map_reduce(
         self,
-        mock_hierarchical: AsyncMock,
+        mock_map_reduce: AsyncMock,
         config: SummarizerConfig,
     ) -> None:
-        """Test that HIERARCHICAL level content calls _hierarchical_summary."""
+        """Test that very long content uses map-reduce."""
         mock_result = SummaryResult(
-            level=SummaryLevel.HIERARCHICAL,
-            summary="Hierarchical summary.",
-            hierarchical=None,
+            level=SummaryLevel.MAP_REDUCE,
+            summary="Long content summary.",
             input_tokens=20000,
             output_tokens=500,
             compression_ratio=0.025,
+            collapse_depth=2,
         )
-        mock_hierarchical.return_value = mock_result
+        mock_map_reduce.return_value = mock_result
 
         # Create content that's > 15000 tokens
         content = "Word " * 20000
 
         result = await summarize(content, config)
 
-        assert mock_hierarchical.called
-        assert result.level == SummaryLevel.HIERARCHICAL
+        assert mock_map_reduce.called
+        assert result.level == SummaryLevel.MAP_REDUCE
 
 
 class TestGenerateSummary:
diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py
index d70286592..f11fcff8b 100644
--- a/tests/summarizer/test_integration.py
+++ b/tests/summarizer/test_integration.py
@@ -1,457 +1,68 @@
-"""Integration tests for the summarizer with memory system."""
+"""Integration tests for summarizer with storage layer."""
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any
-from unittest.mock import patch
-
-import pytest
-
-from agent_cli.memory._ingest import summarize_content
-from agent_cli.memory._persistence import persist_hierarchical_summary
-from agent_cli.memory._store import (
-    get_final_summary,
-    get_summary_at_level,
-    upsert_summary_entries,
-)
-from agent_cli.summarizer import SummaryLevel, SummaryResult
 from agent_cli.summarizer.adaptive import determine_level
-from agent_cli.summarizer.models import ChunkSummary, HierarchicalSummary
-
-if TYPE_CHECKING:
-    from pathlib import Path
-
-
-class _FakeCollection:
-    """Minimal Chroma-like collection for testing."""
-
-    def __init__(self) -> None:
-        self._store: dict[str, tuple[str, dict[str, Any]]] = {}
-
-    def upsert(
-        self,
-        *,
-        ids: list[str],
-        documents: list[str],
-        metadatas: list[dict[str, Any]],
-    ) -> None:
-        for doc_id, doc, meta in zip(ids, documents, metadatas, strict=False):
-            self._store[doc_id] = (doc, meta)
-
-    def get(
-        self,
-        *,
-        where: dict[str, Any] | None = None,
-        include: list[str] | None = None,  # noqa: ARG002
-    ) -> dict[str, Any]:
-        if where is None:
-            return {"documents": [], "metadatas": [], "ids": []}
-
-        results: list[tuple[str, tuple[str, dict[str, Any]]]] = []
-        for doc_id, (doc, meta) in self._store.items():
-            # Check all conditions in $and clause
-            conditions = where.get("$and", [where])
-            match = True
-            for clause in conditions:
-                for k, v in clause.items():
-                    if k == "$and":
-                        continue
-                    if isinstance(v, dict):
-                        if "$in" in v and meta.get(k) not in v["$in"]:
-                            match = False
-                        if "$ne" in v and meta.get(k) == v["$ne"]:
-                            match = False
-                    elif meta.get(k) != v:
-                        match = False
-            if match:
-                results.append((doc_id, (doc, meta)))
-
-        docs = [doc for _, (doc, _) in results]
-        metas = [meta for _, (_, meta) in results]
-        ids = [doc_id for doc_id, _ in results]
-        return {"documents": docs, "metadatas": metas, "ids": ids}
-
-    def delete(
-        self,
-        ids: list[str] | None = None,
-        where: dict[str, Any] | None = None,  # noqa: ARG002
-    ) -> None:
-        if ids:
-            for doc_id in ids:
-                self._store.pop(doc_id, None)
-
-
-@pytest.fixture
-def fake_collection() -> _FakeCollection:
-    """Create a fake ChromaDB collection."""
-    return _FakeCollection()
-
-
-@pytest.fixture
-def memory_root(tmp_path: Path) -> Path:
-    """Create a temporary memory root directory."""
-    return tmp_path / "memory"
-
-
-class TestSummaryResultStorageMetadata:
-    """Test SummaryResult.to_storage_metadata for various levels."""
-
-    def test_standard_summary_produces_single_entry(self) -> None:
-        """Test that STANDARD level produces a single L3 entry."""
-        result = SummaryResult(
-            level=SummaryLevel.STANDARD,
-            summary="A paragraph summary of the content.",
-            hierarchical=None,
-            input_tokens=1000,
-            output_tokens=50,
-            compression_ratio=0.05,
-        )
-
-        entries = result.to_storage_metadata("conv-123")
-
-        assert len(entries) == 1
-        entry = entries[0]
-        assert entry["id"] == "conv-123:summary:L3:final"
-        assert entry["content"] == "A paragraph summary of the content."
-        assert entry["metadata"]["level"] == 3
-        assert entry["metadata"]["is_final"] is True
-        assert entry["metadata"]["summary_level_name"] == "STANDARD"
+from agent_cli.summarizer.models import SummaryLevel, SummaryResult
 
-    def test_hierarchical_summary_produces_multiple_entries(self) -> None:
-        """Test that HIERARCHICAL level produces L1, L2, L3 entries."""
-        l1_summaries = [
-            ChunkSummary(
-                chunk_index=0,
-                content="Chunk 0",
-                token_count=10,
-                source_tokens=100,
-            ),
-            ChunkSummary(
-                chunk_index=1,
-                content="Chunk 1",
-                token_count=10,
-                source_tokens=100,
-            ),
-            ChunkSummary(
-                chunk_index=2,
-                content="Chunk 2",
-                token_count=10,
-                source_tokens=100,
-            ),
-        ]
-        hierarchical = HierarchicalSummary(
-            l1_summaries=l1_summaries,
-            l2_summaries=["Group 0 summary"],
-            l3_summary="Final hierarchical synthesis.",
-        )
-        result = SummaryResult(
-            level=SummaryLevel.HIERARCHICAL,
-            summary="Final hierarchical synthesis.",
-            hierarchical=hierarchical,
-            input_tokens=20000,
-            output_tokens=200,
-            compression_ratio=0.01,
-        )
 
-        entries = result.to_storage_metadata("conv-456")
-
-        # Should have 3 L1 + 1 L2 + 1 L3 = 5 entries
-        assert len(entries) == 5
-
-        # Check L1 entries
-        l1_entries = [e for e in entries if e["metadata"]["level"] == 1]
-        assert len(l1_entries) == 3
-
-        # Check L2 entries
-        l2_entries = [e for e in entries if e["metadata"]["level"] == 2]
-        assert len(l2_entries) == 1
-
-        # Check L3 entry
-        l3_entries = [e for e in entries if e["metadata"]["level"] == 3]
-        assert len(l3_entries) == 1
+class TestDetermineLevel:
+    """Tests for determine_level function with various content sizes."""
 
+    def test_short_content_is_brief(self) -> None:
+        """Test that 100-500 token content uses BRIEF."""
+        level = determine_level(200)
+        assert level == SummaryLevel.BRIEF
 
-class TestHierarchicalSummaryStorage:
-    """Test storing hierarchical summaries to ChromaDB."""
+    def test_medium_content_is_map_reduce(self) -> None:
+        """Test that 500+ token content uses MAP_REDUCE."""
+        level = determine_level(1000)
+        assert level == SummaryLevel.MAP_REDUCE
 
-    def test_store_simple_summary(self, fake_collection: _FakeCollection) -> None:
-        """Test storing a simple (non-hierarchical) summary."""
-        result = SummaryResult(
-            level=SummaryLevel.STANDARD,
-            summary="A standard summary.",
-            hierarchical=None,
-            input_tokens=1000,
-            output_tokens=50,
-            compression_ratio=0.05,
-        )
+    def test_long_content_is_map_reduce(self) -> None:
+        """Test that 3000+ token content uses MAP_REDUCE."""
+        level = determine_level(5000)
+        assert level == SummaryLevel.MAP_REDUCE
 
-        entries = result.to_storage_metadata("conv-123")
-        ids = upsert_summary_entries(fake_collection, entries)
+    def test_very_long_content_is_map_reduce(self) -> None:
+        """Test that content over 15000 tokens still uses MAP_REDUCE."""
+        level = determine_level(20000)
+        assert level == SummaryLevel.MAP_REDUCE
 
-        assert len(ids) == 1
-        assert "conv-123:summary:L3:final" in ids
 
-        # Verify retrieval
-        stored = get_final_summary(fake_collection, "conv-123")
-        assert stored is not None
-        assert stored.content == "A standard summary."
+class TestSummaryResultStorage:
+    """Tests for SummaryResult storage metadata generation."""
 
-    def test_store_hierarchical_summary(self, fake_collection: _FakeCollection) -> None:
-        """Test storing a hierarchical summary with all levels."""
-        l1_summaries = [
-            ChunkSummary(
-                chunk_index=0,
-                content="Chunk 0 summary",
-                token_count=10,
-                source_tokens=100,
-            ),
-            ChunkSummary(
-                chunk_index=1,
-                content="Chunk 1 summary",
-                token_count=10,
-                source_tokens=100,
-            ),
-        ]
-        hierarchical = HierarchicalSummary(
-            l1_summaries=l1_summaries,
-            l2_summaries=[],
-            l3_summary="Final summary",
-        )
+    def test_to_storage_metadata_creates_entry(self) -> None:
+        """Test that to_storage_metadata creates a valid entry."""
         result = SummaryResult(
-            level=SummaryLevel.DETAILED,
-            summary="Final summary",
-            hierarchical=hierarchical,
+            level=SummaryLevel.MAP_REDUCE,
+            summary="A comprehensive summary.",
             input_tokens=5000,
             output_tokens=100,
             compression_ratio=0.02,
+            collapse_depth=1,
         )
+        entries = result.to_storage_metadata("test-conversation")
 
-        entries = result.to_storage_metadata("conv-789")
-        ids = upsert_summary_entries(fake_collection, entries)
-
-        assert len(ids) == 3  # 2 L1 + 1 L3
-
-        # Verify L1 retrieval
-        l1_stored = get_summary_at_level(fake_collection, "conv-789", level=1)
-        assert len(l1_stored) == 2
-
-        # Verify L3 retrieval
-        final = get_final_summary(fake_collection, "conv-789")
-        assert final is not None
-        assert final.content == "Final summary"
-
-
-class TestFilePersistence:
-    """Test hierarchical summary file persistence."""
-
-    def test_persist_hierarchical_creates_files(
-        self,
-        fake_collection: _FakeCollection,
-        memory_root: Path,
-    ) -> None:
-        """Test that persist_hierarchical_summary creates correct file structure."""
-        l1_summaries = [
-            ChunkSummary(
-                chunk_index=0,
-                content="Chunk 0 content",
-                token_count=10,
-                source_tokens=100,
-            ),
-            ChunkSummary(
-                chunk_index=1,
-                content="Chunk 1 content",
-                token_count=10,
-                source_tokens=100,
-            ),
-        ]
-        hierarchical = HierarchicalSummary(
-            l1_summaries=l1_summaries,
-            l2_summaries=["Group 0 summary"],
-            l3_summary="Final synthesis",
-        )
-        result = SummaryResult(
-            level=SummaryLevel.HIERARCHICAL,
-            summary="Final synthesis",
-            hierarchical=hierarchical,
-            input_tokens=20000,
-            output_tokens=200,
-            compression_ratio=0.01,
-        )
-
-        ids = persist_hierarchical_summary(
-            fake_collection,
-            memory_root=memory_root,
-            conversation_id="test-conv",
-            summary_result=result,
-        )
-
-        assert len(ids) == 4  # 2 L1 + 1 L2 + 1 L3
-
-        # Check file structure (note: _slugify converts - to - not _)
-        entries_dir = memory_root / "entries" / "test-conv"
-        l1_dir = entries_dir / "summaries" / "L1"
-        l2_dir = entries_dir / "summaries" / "L2"
-        l3_dir = entries_dir / "summaries" / "L3"
-
-        assert l1_dir.exists()
-        assert l2_dir.exists()
-        assert l3_dir.exists()
-
-        # Check L1 files
-        l1_files = list(l1_dir.glob("*.md"))
-        assert len(l1_files) == 2
-
-        # Check L2 files
-        l2_files = list(l2_dir.glob("*.md"))
-        assert len(l2_files) == 1
-
-        # Check L3 files
-        l3_files = list(l3_dir.glob("*.md"))
-        assert len(l3_files) == 1
-        assert (l3_dir / "final.md").exists()
+        assert len(entries) == 1
+        entry = entries[0]
+        assert entry["id"] == "test-conversation:summary"
+        assert entry["content"] == "A comprehensive summary."
+        assert entry["metadata"]["conversation_id"] == "test-conversation"
+        assert entry["metadata"]["role"] == "summary"
+        assert entry["metadata"]["is_final"] is True
+        assert entry["metadata"]["summary_level"] == "MAP_REDUCE"
+        assert entry["metadata"]["collapse_depth"] == 1
 
-    def test_persist_simple_summary_creates_l3_file(
-        self,
-        fake_collection: _FakeCollection,
-        memory_root: Path,
-    ) -> None:
-        """Test that a simple summary creates just L3/final.md."""
+    def test_none_level_returns_empty(self) -> None:
+        """Test that NONE level produces no storage entries."""
         result = SummaryResult(
-            level=SummaryLevel.STANDARD,
-            summary="A standard paragraph summary.",
-            hierarchical=None,
-            input_tokens=1000,
-            output_tokens=50,
-            compression_ratio=0.05,
-        )
-
-        ids = persist_hierarchical_summary(
-            fake_collection,
-            memory_root=memory_root,
-            conversation_id="simple-conv",
-            summary_result=result,
-        )
-
-        assert len(ids) == 1
-
-        # Check file exists (note: _slugify converts - to - not _)
-        entries_dir = memory_root / "entries" / "simple-conv"
-        l3_file = entries_dir / "summaries" / "L3" / "final.md"
-        assert l3_file.exists()
-
-        # Check content has YAML front matter
-        content = l3_file.read_text(encoding="utf-8")
-        assert "---" in content
-        assert "level: 3" in content
-        assert "A standard paragraph summary." in content
-
-    def test_persist_deletes_old_summaries(
-        self,
-        fake_collection: _FakeCollection,
-        memory_root: Path,
-    ) -> None:
-        """Test that persisting new summary deletes old summary files."""
-        # Create first summary
-        result1 = SummaryResult(
-            level=SummaryLevel.STANDARD,
-            summary="First summary.",
-            hierarchical=None,
-            input_tokens=1000,
-            output_tokens=50,
-            compression_ratio=0.05,
-        )
-
-        persist_hierarchical_summary(
-            fake_collection,
-            memory_root=memory_root,
-            conversation_id="conv",
-            summary_result=result1,
+            level=SummaryLevel.NONE,
+            summary=None,
+            input_tokens=50,
+            output_tokens=0,
+            compression_ratio=0.0,
         )
-
-        entries_dir = memory_root / "entries" / "conv"
-        first_file = entries_dir / "summaries" / "L3" / "final.md"
-        assert first_file.exists()
-        assert "First summary." in first_file.read_text()
-
-        # Create second summary (should replace first)
-        result2 = SummaryResult(
-            level=SummaryLevel.STANDARD,
-            summary="Second summary.",
-            hierarchical=None,
-            input_tokens=1000,
-            output_tokens=50,
-            compression_ratio=0.05,
-        )
-
-        persist_hierarchical_summary(
-            fake_collection,
-            memory_root=memory_root,
-            conversation_id="conv",
-            summary_result=result2,
-        )
-
-        # First summary should be moved to deleted
-        assert first_file.exists()
-        assert "Second summary." in first_file.read_text()
-
-        # Old summary should be in deleted folder
-        deleted_dir = memory_root / "entries" / "deleted" / "conv" / "summaries"
-        assert deleted_dir.exists()
-
-
-class TestDetermineLevelFunction:
-    """Test that determine_level correctly determines summary levels."""
-
-    def test_very_short_content_is_none(self) -> None:
-        """Test that content under 100 tokens gets NONE level."""
-        level = determine_level(50)
-        assert level == SummaryLevel.NONE
-
-    def test_short_content_is_brief(self) -> None:
-        """Test that 100-500 token content gets BRIEF level."""
-        level = determine_level(300)
-        assert level == SummaryLevel.BRIEF
-
-    def test_medium_content_is_standard(self) -> None:
-        """Test that 500-3000 token content gets STANDARD level."""
-        level = determine_level(1500)
-        assert level == SummaryLevel.STANDARD
-
-    def test_long_content_is_detailed(self) -> None:
-        """Test that 3000-15000 token content gets DETAILED level."""
-        level = determine_level(8000)
-        assert level == SummaryLevel.DETAILED
-
-    def test_very_long_content_is_hierarchical(self) -> None:
-        """Test that content over 15000 tokens gets HIERARCHICAL level."""
-        level = determine_level(25000)
-        assert level == SummaryLevel.HIERARCHICAL
-
-
-class TestSummarizeContentFunction:
-    """Test the summarize_content function from _ingest."""
-
-    @pytest.mark.asyncio
-    async def test_summarize_content_creates_result(self) -> None:
-        """Test that summarize_content returns a valid SummaryResult."""
-        # Patch at source since _ingest imports inside the function
-        with patch("agent_cli.summarizer.summarize") as mock_summarize:
-            mock_result = SummaryResult(
-                level=SummaryLevel.STANDARD,
-                summary="Mocked summary.",
-                hierarchical=None,
-                input_tokens=1000,
-                output_tokens=50,
-                compression_ratio=0.05,
-            )
-            mock_summarize.return_value = mock_result
-
-            result = await summarize_content(
-                content="Some content to summarize " * 100,
-                openai_base_url="http://localhost:8000/v1",
-                api_key=None,
-                model="test-model",
-            )
-
-            assert result.level == SummaryLevel.STANDARD
-            assert result.summary == "Mocked summary."
+        entries = result.to_storage_metadata("test-conversation")
+        assert entries == []
diff --git a/tests/summarizer/test_models.py b/tests/summarizer/test_models.py
index d39621119..c5b04f703 100644
--- a/tests/summarizer/test_models.py
+++ b/tests/summarizer/test_models.py
@@ -7,8 +7,6 @@
 import pytest
 
 from agent_cli.summarizer.models import (
-    ChunkSummary,
-    HierarchicalSummary,
     SummaryLevel,
     SummaryResult,
 )
@@ -21,122 +19,12 @@ def test_level_values(self) -> None:
         """Test that levels have correct integer values."""
         assert SummaryLevel.NONE == 0
         assert SummaryLevel.BRIEF == 1
-        assert SummaryLevel.STANDARD == 2
-        assert SummaryLevel.DETAILED == 3
-        assert SummaryLevel.HIERARCHICAL == 4
+        assert SummaryLevel.MAP_REDUCE == 2
 
     def test_level_ordering(self) -> None:
         """Test that levels can be compared."""
         assert SummaryLevel.NONE < SummaryLevel.BRIEF
-        assert SummaryLevel.BRIEF < SummaryLevel.STANDARD
-        assert SummaryLevel.STANDARD < SummaryLevel.DETAILED
-        assert SummaryLevel.DETAILED < SummaryLevel.HIERARCHICAL
-
-
-class TestChunkSummary:
-    """Tests for ChunkSummary model."""
-
-    def test_basic_creation(self) -> None:
-        """Test creating a chunk summary."""
-        chunk = ChunkSummary(
-            chunk_index=0,
-            content="This is a summary of chunk 1.",
-            token_count=10,
-            source_tokens=100,
-        )
-        assert chunk.chunk_index == 0
-        assert chunk.content == "This is a summary of chunk 1."
-        assert chunk.token_count == 10
-        assert chunk.source_tokens == 100
-
-    def test_validation_negative_tokens(self) -> None:
-        """Test that negative token counts fail validation."""
-        with pytest.raises(ValueError, match="greater than or equal to 0"):
-            ChunkSummary(
-                chunk_index=0,
-                content="Test",
-                token_count=-1,
-                source_tokens=100,
-            )
-
-
-class TestHierarchicalSummary:
-    """Tests for HierarchicalSummary model."""
-
-    def test_basic_creation(self) -> None:
-        """Test creating a hierarchical summary."""
-        l1 = [
-            ChunkSummary(
-                chunk_index=0,
-                content="Chunk 1 summary",
-                token_count=10,
-                source_tokens=100,
-            ),
-            ChunkSummary(
-                chunk_index=1,
-                content="Chunk 2 summary",
-                token_count=12,
-                source_tokens=120,
-            ),
-        ]
-        hs = HierarchicalSummary(
-            l1_summaries=l1,
-            l2_summaries=["Group summary"],
-            l3_summary="Final summary of all content.",
-        )
-        assert len(hs.l1_summaries) == 2
-        assert len(hs.l2_summaries) == 1
-        assert hs.l3_summary == "Final summary of all content."
-
-    def test_default_chunk_settings(self) -> None:
-        """Test default chunk size and overlap."""
-        hs = HierarchicalSummary(
-            l1_summaries=[],
-            l2_summaries=[],
-            l3_summary="Final",
-        )
-        assert hs.chunk_size == 3000
-        assert hs.chunk_overlap == 200
-
-    def test_get_summary_at_level_1(self) -> None:
-        """Test getting L1 summaries."""
-        l1 = [
-            ChunkSummary(chunk_index=0, content="C1", token_count=5, source_tokens=50),
-            ChunkSummary(chunk_index=1, content="C2", token_count=5, source_tokens=50),
-        ]
-        hs = HierarchicalSummary(l1_summaries=l1, l2_summaries=[], l3_summary="Final")
-        result = hs.get_summary_at_level(1)
-        assert result == ["C1", "C2"]
-
-    def test_get_summary_at_level_2_with_l2(self) -> None:
-        """Test getting L2 summaries when available."""
-        hs = HierarchicalSummary(
-            l1_summaries=[],
-            l2_summaries=["Group A", "Group B"],
-            l3_summary="Final",
-        )
-        result = hs.get_summary_at_level(2)
-        assert result == ["Group A", "Group B"]
-
-    def test_get_summary_at_level_2_fallback(self) -> None:
-        """Test getting L2 falls back to L3 when no L2 summaries."""
-        hs = HierarchicalSummary(
-            l1_summaries=[],
-            l2_summaries=[],
-            l3_summary="Final summary",
-        )
-        result = hs.get_summary_at_level(2)
-        assert result == ["Final summary"]
-
-    def test_get_summary_at_level_3(self) -> None:
-        """Test getting L3 summary."""
-        hs = HierarchicalSummary(
-            l1_summaries=[],
-            l2_summaries=["Group"],
-            l3_summary="The final summary",
-        )
-        result = hs.get_summary_at_level(3)
-        assert result == "The final summary"
+        assert SummaryLevel.BRIEF < SummaryLevel.MAP_REDUCE
 
 
 class TestSummaryResult:
@@ -147,56 +35,46 @@ def test_none_level_result(self) -> None:
         result = SummaryResult(
             level=SummaryLevel.NONE,
             summary=None,
-            hierarchical=None,
             input_tokens=50,
             output_tokens=0,
             compression_ratio=0.0,
         )
         assert result.level == SummaryLevel.NONE
         assert result.summary is None
-        assert result.chunk_summaries is None
+        assert result.collapse_depth == 0
 
     def test_brief_level_result(self) -> None:
         """Test result for brief summary."""
         result = SummaryResult(
             level=SummaryLevel.BRIEF,
             summary="A brief one-sentence summary.",
-            hierarchical=None,
             input_tokens=200,
             output_tokens=10,
             compression_ratio=0.05,
         )
         assert result.level == SummaryLevel.BRIEF
         assert result.summary == "A brief one-sentence summary."
-        assert result.chunk_summaries is None
+        assert result.collapse_depth == 0
 
-    def test_hierarchical_result_with_chunk_summaries(self) -> None:
-        """Test hierarchical result exposes chunk summaries."""
-        l1 = [
-            ChunkSummary(chunk_index=0, content="Chunk 1", token_count=10, source_tokens=100),
-            ChunkSummary(chunk_index=1, content="Chunk 2", token_count=10, source_tokens=100),
-        ]
-        hierarchical = HierarchicalSummary(
-            l1_summaries=l1,
-            l2_summaries=[],
-            l3_summary="Final",
-        )
+    def test_map_reduce_result(self) -> None:
+        """Test result for map-reduce summary."""
         result = SummaryResult(
-            level=SummaryLevel.DETAILED,
-            summary="Final",
-            hierarchical=hierarchical,
+            level=SummaryLevel.MAP_REDUCE,
+            summary="A comprehensive summary.",
             input_tokens=5000,
             output_tokens=100,
             compression_ratio=0.02,
+            collapse_depth=2,
         )
-        assert result.chunk_summaries == ["Chunk 1", "Chunk 2"]
+        assert result.level == SummaryLevel.MAP_REDUCE
+        assert result.summary == "A comprehensive summary."
+        assert result.collapse_depth == 2
 
     def test_to_storage_metadata_none_level(self) -> None:
         """Test that NONE level produces no storage entries."""
         result = SummaryResult(
             level=SummaryLevel.NONE,
             summary=None,
-            hierarchical=None,
             input_tokens=50,
             output_tokens=0,
             compression_ratio=0.0,
@@ -205,77 +83,44 @@ def test_to_storage_metadata_none_level(self) -> None:
         assert entries == []
 
     def test_to_storage_metadata_simple_summary(self) -> None:
-        """Test storage metadata for simple (non-hierarchical) summary."""
+        """Test storage metadata for a summary."""
         result = SummaryResult(
-            level=SummaryLevel.STANDARD,
-            summary="A standard paragraph summary.",
-            hierarchical=None,
-            input_tokens=1000,
-            output_tokens=50,
+            level=SummaryLevel.BRIEF,
+            summary="A brief summary.",
+            input_tokens=200,
+            output_tokens=10,
             compression_ratio=0.05,
         )
         entries = result.to_storage_metadata("conv-456")
         assert len(entries) == 1
         entry = entries[0]
-        assert entry["id"] == "conv-456:summary:L3:final"
-        assert entry["content"] == "A standard paragraph summary."
+        assert entry["id"] == "conv-456:summary"
+        assert entry["content"] == "A brief summary."
         assert entry["metadata"]["conversation_id"] == "conv-456"
         assert entry["metadata"]["role"] == "summary"
-        assert entry["metadata"]["level"] == 3
         assert entry["metadata"]["is_final"] is True
-        assert entry["metadata"]["summary_level_name"] == "STANDARD"
+        assert entry["metadata"]["summary_level"] == "BRIEF"
 
-    def test_to_storage_metadata_hierarchical(self) -> None:
-        """Test storage metadata for hierarchical summary."""
-        l1 = [
-            ChunkSummary(
-                chunk_index=0,
-                content="Chunk 0 text",
-                token_count=10,
-                source_tokens=100,
-            ),
-            ChunkSummary(
-                chunk_index=1,
-                content="Chunk 1 text",
-                token_count=12,
-                source_tokens=120,
-            ),
-        ]
-        hierarchical = HierarchicalSummary(
-            l1_summaries=l1,
-            l2_summaries=["Group 0 summary"],
-            l3_summary="Final synthesis",
-        )
+    def test_to_storage_metadata_map_reduce(self) -> None:
+        """Test storage metadata for map-reduce summary."""
         result = SummaryResult(
-            level=SummaryLevel.HIERARCHICAL,
-            summary="Final synthesis",
-            hierarchical=hierarchical,
+            level=SummaryLevel.MAP_REDUCE,
+            summary="Final synthesis of content.",
             input_tokens=20000,
             output_tokens=200,
             compression_ratio=0.01,
+            collapse_depth=3,
         )
         entries = result.to_storage_metadata("conv-789")
 
-        # Should have 2 L1 + 1 L2 + 1 L3 = 4 entries
-        assert len(entries) == 4
-
-        # Check L1 entries
-        l1_entries = [e for e in entries if e["metadata"]["level"] == 1]
-        assert len(l1_entries) == 2
-        assert l1_entries[0]["id"] == "conv-789:summary:L1:0"
-        assert l1_entries[0]["metadata"]["chunk_index"] == 0
-
-        # Check L2 entry
-        l2_entries = [e for e in entries if e["metadata"]["level"] == 2]
-        assert len(l2_entries) == 1
-        assert l2_entries[0]["id"] == "conv-789:summary:L2:0"
-        assert l2_entries[0]["content"] == "Group 0 summary"
-
-        # Check L3 entry
-        l3_entries = [e for e in entries if e["metadata"]["level"] == 3]
-        assert len(l3_entries) == 1
-        assert l3_entries[0]["id"] == "conv-789:summary:L3:final"
-        assert l3_entries[0]["metadata"]["is_final"] is True
+        # Should have 1 entry (the final summary)
+        assert len(entries) == 1
+        entry = entries[0]
+        assert entry["id"] == "conv-789:summary"
+        assert entry["content"] == "Final synthesis of content."
+        assert entry["metadata"]["summary_level"] == "MAP_REDUCE"
+        assert entry["metadata"]["collapse_depth"] == 3
+        assert entry["metadata"]["is_final"] is True
 
     def test_compression_ratio_bounds(self) -> None:
         """Test compression ratio validation."""
@@ -283,7 +128,6 @@ def test_compression_ratio_bounds(self) -> None:
         result = SummaryResult(
             level=SummaryLevel.BRIEF,
             summary="Test",
-            hierarchical=None,
             input_tokens=100,
             output_tokens=10,
             compression_ratio=0.1,
@@ -295,7 +139,6 @@ def test_compression_ratio_bounds(self) -> None:
             SummaryResult(
                 level=SummaryLevel.BRIEF,
                 summary="Test",
-                hierarchical=None,
                 input_tokens=100,
                 output_tokens=10,
                 compression_ratio=1.5,
@@ -307,7 +150,6 @@ def test_created_at_default(self) -> None:
         result = SummaryResult(
             level=SummaryLevel.BRIEF,
             summary="Test",
-            hierarchical=None,
             input_tokens=100,
             output_tokens=10,
             compression_ratio=0.1,
diff --git a/tests/summarizer/test_utils.py b/tests/summarizer/test_utils.py
index 22eb4039e..2621b158e 100644
--- a/tests/summarizer/test_utils.py
+++ b/tests/summarizer/test_utils.py
@@ -140,37 +140,27 @@ def test_none_level(self) -> None:
 
     def test_brief_level(self) -> None:
         """Test level 1 (BRIEF) compression."""
-        # BRIEF: ~20% compression, capped at 50
+        # BRIEF: ~20% compression, capped at 50, minimum 20
         result = estimate_summary_tokens(100, level=1)
         assert result >= 20  # minimum of 20
         assert result <= 50  # capped at 50
 
-    def test_standard_level(self) -> None:
-        """Test level 2 (STANDARD) compression."""
-        # STANDARD: ~12% compression, capped at 200
+    def test_map_reduce_level(self) -> None:
+        """Test level 2 (MAP_REDUCE) compression."""
+        # MAP_REDUCE: ~10% compression, capped at 500, minimum 50
         result = estimate_summary_tokens(1000, level=2)
         assert result >= 50  # minimum of 50
-        assert result <= 200  # capped at 200
-
-    def test_detailed_level(self) -> None:
-        """Test level 3 (DETAILED) compression."""
-        # DETAILED: ~7% compression, capped at 500
-        result = estimate_summary_tokens(10000, level=3)
-        assert result >= 100  # minimum of 100
         assert result <= 500  # capped at 500
 
-    def test_hierarchical_level(self) -> None:
-        """Test level 4 (HIERARCHICAL) compression."""
-        # HIERARCHICAL: base of 1000 + diminishing returns
-        result = estimate_summary_tokens(50000, level=4)
-        assert result >= 1000  # base minimum
-        assert result <= 2000  # capped at 2000
-
-    def test_hierarchical_small_input(self) -> None:
-        """Test HIERARCHICAL with smaller input."""
-        # Even with small input, should return base
-        result = estimate_summary_tokens(5000, level=4)
-        assert result == 1000  # just the base, no additional
+    def test_map_reduce_large_input(self) -> None:
+        """Test MAP_REDUCE with large input hits cap."""
+        result = estimate_summary_tokens(50000, level=2)
+        assert result == 500  # capped at 500
+
+    def test_map_reduce_small_input(self) -> None:
+        """Test MAP_REDUCE with small input uses floor."""
+        result = estimate_summary_tokens(100, level=2)
+        assert result == 50  # floor of 50
 
 
 class TestTokensToWords:

From 0fce8aa3db38fa125a4fb8752378322a565cb961 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Thu, 27 Nov 2025 09:16:57 -0800
Subject: [PATCH 24/37] refactor(summarizer): consolidate shared code to reduce
 duplication

Address review feedback:
1. DRY: Move SummaryOutput, SummarizationError, SummarizerConfig, and
   generate_summary to _utils.py - eliminates duplicate code between
   adaptive.py and map_reduce.py

2. Config consolidation: Remove MapReduceConfig, use SummarizerConfig
   throughout. map_reduce.py now accepts SummarizerConfig directly.

3. Document redundant check: The token_max check in map_reduce_summarize
   is kept as a safety guard for direct calls, with clear documentation
   explaining it's normally handled by adaptive.py.
---
 agent_cli/summarizer/_utils.py     |  93 +++++++++++++++++++++
 agent_cli/summarizer/adaptive.py   | 109 ++++--------------------
 agent_cli/summarizer/map_reduce.py | 129 +++++++----------------------
 tests/summarizer/test_adaptive.py  |  16 ++--
 4 files changed, 146 insertions(+), 201 deletions(-)

diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py
index 8dbfb1ffd..078e21edc 100644
--- a/agent_cli/summarizer/_utils.py
+++ b/agent_cli/summarizer/_utils.py
@@ -3,15 +3,108 @@
 from __future__ import annotations
 
 import re
+from dataclasses import dataclass
 from functools import lru_cache
 from typing import TYPE_CHECKING
 
+from pydantic import BaseModel
+
 from agent_cli.summarizer.models import SummaryLevel
 
 if TYPE_CHECKING:
     import tiktoken
 
 
+class SummaryOutput(BaseModel):
+    """Structured output for summary generation."""
+
+    summary: str
+
+
+class SummarizationError(Exception):
+    """Raised when summarization fails after all retries."""
+
+
+@dataclass
+class SummarizerConfig:
+    """Configuration for summarization operations.
+
+    Example:
+        config = SummarizerConfig(
+            openai_base_url="http://localhost:8000/v1",
+            model="llama3.1:8b",
+        )
+        result = await summarize(long_document, config)
+        print(f"Level: {result.level.name}")
+        print(f"Compression: {result.compression_ratio:.1%}")
+
+    """
+
+    openai_base_url: str
+    model: str
+    api_key: str | None = None
+    chunk_size: int = 2048  # BOOOOKSCORE's tested default
+    token_max: int = 3000  # LangChain's default - when to collapse
+    chunk_overlap: int = 200
+    max_concurrent_chunks: int = 5
+    timeout: float = 60.0
+
+    def __post_init__(self) -> None:
+        """Normalize the base URL."""
+        self.openai_base_url = self.openai_base_url.rstrip("/")
+        if self.api_key is None:
+            self.api_key = "not-needed"
+
+
+async def generate_summary(
+    prompt: str,
+    config: SummarizerConfig,
+    max_tokens: int = 256,
+) -> str:
+    """Call the LLM to generate a summary.
+
+    Args:
+        prompt: The prompt to send to the LLM.
+        config: Summarizer configuration.
+        max_tokens: Maximum tokens for the response.
+
+    Returns:
+        The generated summary text.
+
+    Raises:
+        SummarizationError: If the LLM call fails.
+
+    """
+    from pydantic_ai import Agent  # noqa: PLC0415
+    from pydantic_ai.models.openai import OpenAIChatModel  # noqa: PLC0415
+    from pydantic_ai.providers.openai import OpenAIProvider  # noqa: PLC0415
+    from pydantic_ai.settings import ModelSettings  # noqa: PLC0415
+
+    provider = OpenAIProvider(api_key=config.api_key, base_url=config.openai_base_url)
+    model = OpenAIChatModel(
+        model_name=config.model,
+        provider=provider,
+        settings=ModelSettings(
+            temperature=0.3,
+            max_tokens=max_tokens,
+        ),
+    )
+
+    agent = Agent(
+        model=model,
+        system_prompt="You are a concise summarizer. Output only the summary, no preamble.",
+        output_type=SummaryOutput,
+        retries=2,
+    )
+
+    try:
+        result = await agent.run(prompt)
+        return result.output.summary.strip()
+    except Exception as e:
+        msg = f"Summarization failed: {e}"
+        raise SummarizationError(msg) from e
+
+
 @lru_cache(maxsize=4)
 def _get_encoding(model: str = "gpt-4") -> tiktoken.Encoding | None:
     """Get tiktoken encoding for a model, with caching.
diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py
index 39669e97d..b03a84e6c 100644
--- a/agent_cli/summarizer/adaptive.py
+++ b/agent_cli/summarizer/adaptive.py
@@ -19,9 +19,6 @@
 from __future__ import annotations
 
 import logging
-from dataclasses import dataclass
-
-from pydantic import BaseModel
 
 from agent_cli.summarizer._prompts import (
     BRIEF_SUMMARY_PROMPT,
@@ -29,12 +26,14 @@
     get_prompt_for_content_type,
 )
 from agent_cli.summarizer._utils import (
+    SummarizationError,
+    SummarizerConfig,
     count_tokens,
     estimate_summary_tokens,
+    generate_summary,
     tokens_to_words,
 )
 from agent_cli.summarizer.map_reduce import (
-    MapReduceConfig,
     MapReduceSummarizationError,
     map_reduce_summarize,
 )
@@ -49,46 +48,15 @@
 THRESHOLD_NONE = 100  # Below this, no summary needed
 THRESHOLD_BRIEF = 500  # Below this, just a single sentence
 
-
-class SummaryOutput(BaseModel):
-    """Structured output for summary generation."""
-
-    summary: str
-
-
-class SummarizationError(Exception):
-    """Raised when summarization fails after all retries."""
-
-
-@dataclass
-class SummarizerConfig:
-    """Configuration for summarization operations.
-
-    Example:
-        config = SummarizerConfig(
-            openai_base_url="http://localhost:8000/v1",
-            model="llama3.1:8b",
-        )
-        result = await summarize(long_document, config)
-        print(f"Level: {result.level.name}")
-        print(f"Compression: {result.compression_ratio:.1%}")
-
-    """
-
-    openai_base_url: str
-    model: str
-    api_key: str | None = None
-    chunk_size: int = 2048  # BOOOOKSCORE's tested default
-    token_max: int = 3000  # LangChain's default - when to collapse
-    chunk_overlap: int = 200
-    max_concurrent_chunks: int = 5
-    timeout: float = 60.0
-
-    def __post_init__(self) -> None:
-        """Normalize the base URL."""
-        self.openai_base_url = self.openai_base_url.rstrip("/")
-        if self.api_key is None:
-            self.api_key = "not-needed"
+# Re-export for backwards compatibility
+__all__ = [
+    "THRESHOLD_BRIEF",
+    "THRESHOLD_NONE",
+    "SummarizationError",
+    "SummarizerConfig",
+    "determine_level",
+    "summarize",
+]
 
 
 def determine_level(token_count: int) -> SummaryLevel:
@@ -175,7 +143,7 @@ async def summarize(
 async def _brief_summary(content: str, config: SummarizerConfig) -> str:
     """Generate a single-sentence summary for brief content."""
     prompt = BRIEF_SUMMARY_PROMPT.format(content=content)
-    return await _generate_summary(prompt, config, max_tokens=50)
+    return await generate_summary(prompt, config, max_tokens=50)
 
 
 async def _map_reduce_summary(
@@ -200,19 +168,8 @@ async def _map_reduce_summary(
         )
 
     # Use map-reduce for multi-chunk content
-    mr_config = MapReduceConfig(
-        openai_base_url=config.openai_base_url,
-        model=config.model,
-        api_key=config.api_key,
-        chunk_size=config.chunk_size,
-        token_max=config.token_max,
-        chunk_overlap=config.chunk_overlap,
-        max_concurrent=config.max_concurrent_chunks,
-        timeout=config.timeout,
-    )
-
     try:
-        result = await map_reduce_summarize(content, mr_config)
+        result = await map_reduce_summarize(content, config)
     except MapReduceSummarizationError as e:
         raise SummarizationError(str(e)) from e
 
@@ -248,40 +205,4 @@ async def _content_aware_summary(
         max_words=max_words,
     )
 
-    return await _generate_summary(prompt, config, max_tokens=target_tokens + 50)
-
-
-async def _generate_summary(
-    prompt: str,
-    config: SummarizerConfig,
-    max_tokens: int = 256,
-) -> str:
-    """Call the LLM to generate a summary. Raises SummarizationError on failure."""
-    from pydantic_ai import Agent  # noqa: PLC0415
-    from pydantic_ai.models.openai import OpenAIChatModel  # noqa: PLC0415
-    from pydantic_ai.providers.openai import OpenAIProvider  # noqa: PLC0415
-    from pydantic_ai.settings import ModelSettings  # noqa: PLC0415
-
-    provider = OpenAIProvider(api_key=config.api_key, base_url=config.openai_base_url)
-    model = OpenAIChatModel(
-        model_name=config.model,
-        provider=provider,
-        settings=ModelSettings(
-            temperature=0.3,
-            max_tokens=max_tokens,
-        ),
-    )
-
-    agent = Agent(
-        model=model,
-        system_prompt="You are a concise summarizer. Output only the summary, no preamble.",
-        output_type=SummaryOutput,
-        retries=2,
-    )
-
-    try:
-        result = await agent.run(prompt)
-        return result.output.summary.strip()
-    except Exception as e:
-        msg = f"Summarization failed: {e}"
-        raise SummarizationError(msg) from e
+    return await generate_summary(prompt, config, max_tokens=target_tokens + 50)
diff --git a/agent_cli/summarizer/map_reduce.py b/agent_cli/summarizer/map_reduce.py
index 09d82d09c..76365e2d3 100644
--- a/agent_cli/summarizer/map_reduce.py
+++ b/agent_cli/summarizer/map_reduce.py
@@ -19,17 +19,18 @@
 import logging
 from dataclasses import dataclass
 
-from pydantic import BaseModel
-
 from agent_cli.summarizer._prompts import (
     CHUNK_SUMMARY_PROMPT,
     META_SUMMARY_PROMPT,
     format_summaries_for_meta,
 )
 from agent_cli.summarizer._utils import (
+    SummarizationError,
+    SummarizerConfig,
     chunk_text,
     count_tokens,
     estimate_summary_tokens,
+    generate_summary,
     tokens_to_words,
 )
 from agent_cli.summarizer.models import SummaryLevel
@@ -37,52 +38,10 @@
 logger = logging.getLogger(__name__)
 
 
-class SummaryOutput(BaseModel):
-    """Structured output for summary generation."""
-
-    summary: str
-
-
-class MapReduceSummarizationError(Exception):
+class MapReduceSummarizationError(SummarizationError):
     """Raised when map-reduce summarization fails."""
 
 
-@dataclass
-class MapReduceConfig:
-    """Configuration for map-reduce summarization.
-
-    Attributes:
-        openai_base_url: Base URL for OpenAI-compatible API.
-        model: Model name for summarization.
-        api_key: Optional API key.
-        chunk_size: Target size for splitting content (tokens).
-                   LangChain uses 3000, BOOOOKSCORE suggests 2048.
-        token_max: Maximum tokens for combined summaries before collapsing.
-                  When combined summaries exceed this, we recursively reduce.
-        chunk_overlap: Overlap between chunks for context continuity.
-        max_concurrent: Maximum parallel summarization calls.
-        timeout: Timeout for API calls in seconds.
-        max_collapse_depth: Safety limit on recursive collapse depth.
-
-    """
-
-    openai_base_url: str
-    model: str
-    api_key: str | None = None
-    chunk_size: int = 2048  # BOOOOKSCORE's tested default
-    token_max: int = 3000  # LangChain's default
-    chunk_overlap: int = 200
-    max_concurrent: int = 5
-    timeout: float = 60.0
-    max_collapse_depth: int = 10  # Safety limit
-
-    def __post_init__(self) -> None:
-        """Normalize the base URL."""
-        self.openai_base_url = self.openai_base_url.rstrip("/")
-        if self.api_key is None:
-            self.api_key = "not-needed"
-
-
 @dataclass
 class MapReduceResult:
     """Result of map-reduce summarization.
@@ -107,19 +66,24 @@ class MapReduceResult:
 
 async def map_reduce_summarize(
     content: str,
-    config: MapReduceConfig,
+    config: SummarizerConfig,
+    max_collapse_depth: int = 10,
 ) -> MapReduceResult:
     """Summarize content using map-reduce with dynamic collapse.
 
     Algorithm:
-    1. If content fits in token_max, summarize directly
-    2. Otherwise, split into chunks and summarize each (map phase)
-    3. If combined summaries exceed token_max, recursively collapse (reduce phase)
-    4. Continue until everything fits in token_max
+    1. Split into chunks and summarize each (map phase)
+    2. If combined summaries exceed token_max, recursively collapse (reduce phase)
+    3. Continue until everything fits in token_max
+
+    Note: This function assumes content exceeds token_max. The caller (adaptive.py)
+    handles the case where content fits in a single chunk. The check below is a
+    safety guard for direct calls to this function.
 
     Args:
         content: The content to summarize.
-        config: Map-reduce configuration.
+        config: Summarizer configuration.
+        max_collapse_depth: Safety limit on recursive collapse depth.
 
     Returns:
         MapReduceResult with summary and metadata.
@@ -137,7 +101,8 @@ async def map_reduce_summarize(
 
     input_tokens = count_tokens(content, config.model)
 
-    # If content already fits, just summarize directly
+    # Safety guard: if content fits in token_max, summarize directly.
+    # Normally handled by adaptive.py, but kept for direct calls to this function.
     if input_tokens <= config.token_max:
         summary = await _summarize_text(content, config)
         output_tokens = count_tokens(summary, config.model)
@@ -166,10 +131,10 @@ async def map_reduce_summarize(
     depth = 0
     while _total_tokens(summaries, config.model) > config.token_max:
         depth += 1
-        if depth > config.max_collapse_depth:
+        if depth > max_collapse_depth:
             logger.warning(
                 "Hit max collapse depth %d, forcing final summary",
-                config.max_collapse_depth,
+                max_collapse_depth,
             )
             break
 
@@ -205,9 +170,9 @@ def _total_tokens(texts: list[str], model: str) -> int:
     return sum(count_tokens(t, model) for t in texts)
 
 
-async def _map_summarize(chunks: list[str], config: MapReduceConfig) -> list[str]:
+async def _map_summarize(chunks: list[str], config: SummarizerConfig) -> list[str]:
     """Summarize each chunk in parallel (map phase)."""
-    semaphore = asyncio.Semaphore(config.max_concurrent)
+    semaphore = asyncio.Semaphore(config.max_concurrent_chunks)
     total = len(chunks)
 
     async def summarize_chunk(idx: int, chunk: str) -> str:
@@ -222,7 +187,7 @@ async def _summarize_chunk(
     chunk: str,
     chunk_index: int,
     total_chunks: int,
-    config: MapReduceConfig,
+    config: SummarizerConfig,
 ) -> str:
     """Summarize a single chunk."""
     source_tokens = count_tokens(chunk, config.model)
@@ -236,12 +201,12 @@ async def _summarize_chunk(
         max_words=max_words,
     )
 
-    return await _generate_summary(prompt, config, max_tokens=target_tokens + 50)
+    return await generate_summary(prompt, config, max_tokens=target_tokens + 50)
 
 
 async def _collapse_summaries(
     summaries: list[str],
-    config: MapReduceConfig,
+    config: SummarizerConfig,
 ) -> list[str]:
     """Collapse summaries by grouping and re-summarizing (reduce phase).
 
@@ -272,7 +237,7 @@ async def _collapse_summaries(
         groups.append(current_group)
 
     # Summarize each group in parallel
-    semaphore = asyncio.Semaphore(config.max_concurrent)
+    semaphore = asyncio.Semaphore(config.max_concurrent_chunks)
 
     async def summarize_group(group: list[str]) -> str:
         async with semaphore:
@@ -282,7 +247,7 @@ async def summarize_group(group: list[str]) -> str:
     return list(await asyncio.gather(*tasks))
 
 
-async def _synthesize(summaries: list[str], config: MapReduceConfig) -> str:
+async def _synthesize(summaries: list[str], config: SummarizerConfig) -> str:
     """Synthesize multiple summaries into one."""
     combined_tokens = sum(count_tokens(s, config.model) for s in summaries)
     target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.MAP_REDUCE)
@@ -293,10 +258,10 @@ async def _synthesize(summaries: list[str], config: MapReduceConfig) -> str:
         max_words=max_words,
     )
 
-    return await _generate_summary(prompt, config, max_tokens=target_tokens + 100)
+    return await generate_summary(prompt, config, max_tokens=target_tokens + 100)
 
 
-async def _summarize_text(text: str, config: MapReduceConfig) -> str:
+async def _summarize_text(text: str, config: SummarizerConfig) -> str:
     """Summarize text that fits within token_max."""
     input_tokens = count_tokens(text, config.model)
     target_tokens = estimate_summary_tokens(input_tokens, SummaryLevel.MAP_REDUCE)
@@ -310,40 +275,4 @@ async def _summarize_text(text: str, config: MapReduceConfig) -> str:
 
 Summary:"""
 
-    return await _generate_summary(prompt, config, max_tokens=target_tokens + 50)
-
-
-async def _generate_summary(
-    prompt: str,
-    config: MapReduceConfig,
-    max_tokens: int = 256,
-) -> str:
-    """Call the LLM to generate a summary."""
-    from pydantic_ai import Agent  # noqa: PLC0415
-    from pydantic_ai.models.openai import OpenAIChatModel  # noqa: PLC0415
-    from pydantic_ai.providers.openai import OpenAIProvider  # noqa: PLC0415
-    from pydantic_ai.settings import ModelSettings  # noqa: PLC0415
-
-    provider = OpenAIProvider(api_key=config.api_key, base_url=config.openai_base_url)
-    model = OpenAIChatModel(
-        model_name=config.model,
-        provider=provider,
-        settings=ModelSettings(
-            temperature=0.3,
-            max_tokens=max_tokens,
-        ),
-    )
-
-    agent = Agent(
-        model=model,
-        system_prompt="You are a concise summarizer. Output only the summary, no preamble.",
-        output_type=SummaryOutput,
-        retries=2,
-    )
-
-    try:
-        result = await agent.run(prompt)
-        return result.output.summary.strip()
-    except Exception as e:
-        msg = f"Map-reduce summarization failed: {e}"
-        raise MapReduceSummarizationError(msg) from e
+    return await generate_summary(prompt, config, max_tokens=target_tokens + 50)
diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py
index a64a72a16..202a55921 100644
--- a/tests/summarizer/test_adaptive.py
+++ b/tests/summarizer/test_adaptive.py
@@ -6,13 +6,15 @@
 
 import pytest
 
-from agent_cli.summarizer.adaptive import (
-    THRESHOLD_BRIEF,
-    THRESHOLD_NONE,
+from agent_cli.summarizer._utils import (
     SummarizationError,
     SummarizerConfig,
     SummaryOutput,
-    _generate_summary,
+    generate_summary,
+)
+from agent_cli.summarizer.adaptive import (
+    THRESHOLD_BRIEF,
+    THRESHOLD_NONE,
     determine_level,
     summarize,
 )
@@ -256,7 +258,7 @@ async def test_very_long_content_uses_map_reduce(
 
 
 class TestGenerateSummary:
-    """Tests for _generate_summary function."""
+    """Tests for generate_summary function."""
 
     @pytest.fixture
     def config(self) -> SummarizerConfig:
@@ -281,7 +283,7 @@ async def test_generate_summary_with_pydantic_ai(
             mock_agent.run = AsyncMock(return_value=mock_result)
             mock_agent_class.return_value = mock_agent
 
-            result = await _generate_summary("Test prompt", config, max_tokens=100)
+            result = await generate_summary("Test prompt", config, max_tokens=100)
 
             assert result == "Generated summary."
             mock_agent.run.assert_called_once_with("Test prompt")
@@ -298,7 +300,7 @@ async def test_raises_summarization_error_on_failure(
             mock_agent_class.return_value = mock_agent
 
             with pytest.raises(SummarizationError, match="Summarization failed"):
-                await _generate_summary("Test prompt", config, max_tokens=100)
+                await generate_summary("Test prompt", config, max_tokens=100)
 
 
 class TestSummaryOutput:

From 38cce558a15f7ef0d4af7b401d28c3b2aa1dd6b9 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Thu, 27 Nov 2025 09:23:57 -0800
Subject: [PATCH 25/37] refactor(summarizer): remove redundant _summarize_text
 and safety guard

- Remove _summarize_text function with hardcoded prompt (use centralized
  prompts in _prompts.py via adaptive.py instead)
- Remove redundant token_max safety guard from map_reduce_summarize
- Update docstring to clarify function is designed for content exceeding
  token_max, directing users to adaptive.summarize() for proper routing
---
 agent_cli/summarizer/map_reduce.py | 37 +++---------------------------
 1 file changed, 3 insertions(+), 34 deletions(-)

diff --git a/agent_cli/summarizer/map_reduce.py b/agent_cli/summarizer/map_reduce.py
index 76365e2d3..93aaabd8c 100644
--- a/agent_cli/summarizer/map_reduce.py
+++ b/agent_cli/summarizer/map_reduce.py
@@ -76,9 +76,9 @@ async def map_reduce_summarize(
     2. If combined summaries exceed token_max, recursively collapse (reduce phase)
     3. Continue until everything fits in token_max
 
-    Note: This function assumes content exceeds token_max. The caller (adaptive.py)
-    handles the case where content fits in a single chunk. The check below is a
-    safety guard for direct calls to this function.
+    Note: This function is designed for content that exceeds token_max. For shorter
+    content, use the main `summarize()` function in adaptive.py which selects the
+    appropriate strategy (NONE, BRIEF, or MAP_REDUCE with content-aware prompts).
 
     Args:
         content: The content to summarize.
@@ -101,20 +101,6 @@ async def map_reduce_summarize(
 
     input_tokens = count_tokens(content, config.model)
 
-    # Safety guard: if content fits in token_max, summarize directly.
-    # Normally handled by adaptive.py, but kept for direct calls to this function.
-    if input_tokens <= config.token_max:
-        summary = await _summarize_text(content, config)
-        output_tokens = count_tokens(summary, config.model)
-        return MapReduceResult(
-            summary=summary,
-            input_tokens=input_tokens,
-            output_tokens=output_tokens,
-            compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0,
-            collapse_depth=0,
-            intermediate_summaries=[],
-        )
-
     # Map phase: Split and summarize chunks in parallel
     chunks = chunk_text(
         content,
@@ -259,20 +245,3 @@ async def _synthesize(summaries: list[str], config: SummarizerConfig) -> str:
     )
 
     return await generate_summary(prompt, config, max_tokens=target_tokens + 100)
-
-
-async def _summarize_text(text: str, config: SummarizerConfig) -> str:
-    """Summarize text that fits within token_max."""
-    input_tokens = count_tokens(text, config.model)
-    target_tokens = estimate_summary_tokens(input_tokens, SummaryLevel.MAP_REDUCE)
-    max_words = tokens_to_words(target_tokens)
-
-    prompt = f"""Summarize the following content in {max_words} words or less.
-Focus on the key points and main ideas.
-
-Content:
-{text}
-
-Summary:"""
-
-    return await generate_summary(prompt, config, max_tokens=target_tokens + 50)

From c38e305ae886320577e09ff22f9ec42dbda1192b Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Thu, 27 Nov 2025 09:34:11 -0800
Subject: [PATCH 26/37] refactor(summarizer): remove redundant exception
 re-wrapping

MapReduceSummarizationError already inherits from SummarizationError,
so catching and re-raising serves no purpose.
---
 agent_cli/summarizer/adaptive.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py
index b03a84e6c..c5ba092eb 100644
--- a/agent_cli/summarizer/adaptive.py
+++ b/agent_cli/summarizer/adaptive.py
@@ -33,10 +33,7 @@
     generate_summary,
     tokens_to_words,
 )
-from agent_cli.summarizer.map_reduce import (
-    MapReduceSummarizationError,
-    map_reduce_summarize,
-)
+from agent_cli.summarizer.map_reduce import map_reduce_summarize
 from agent_cli.summarizer.models import (
     SummaryLevel,
     SummaryResult,
@@ -168,10 +165,7 @@ async def _map_reduce_summary(
         )
 
     # Use map-reduce for multi-chunk content
-    try:
-        result = await map_reduce_summarize(content, config)
-    except MapReduceSummarizationError as e:
-        raise SummarizationError(str(e)) from e
+    result = await map_reduce_summarize(content, config)
 
     return SummaryResult(
         level=SummaryLevel.MAP_REDUCE,

From 349942b95a23506b078e4d74d432f7d66f4bc0e1 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Thu, 27 Nov 2025 09:35:21 -0800
Subject: [PATCH 27/37] refactor(summarizer): remove defensive guards for
 impossible conditions

- Remove empty content check in map_reduce_summarize (caller validates)
- Remove 'if summary else 0' guards (generate_summary never returns None)
- Remove 'if input_tokens > 0' guards (input is guaranteed non-empty)
- Remove 'if summaries else ""' guard (summaries always has content)
---
 agent_cli/summarizer/adaptive.py   |  8 ++++----
 agent_cli/summarizer/map_reduce.py | 14 ++------------
 2 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py
index c5ba092eb..640c52e60 100644
--- a/agent_cli/summarizer/adaptive.py
+++ b/agent_cli/summarizer/adaptive.py
@@ -118,13 +118,13 @@ async def summarize(
 
     if level == SummaryLevel.BRIEF:
         summary = await _brief_summary(content, config)
-        output_tokens = count_tokens(summary, config.model) if summary else 0
+        output_tokens = count_tokens(summary, config.model)
         return SummaryResult(
             level=level,
             summary=summary,
             input_tokens=input_tokens,
             output_tokens=output_tokens,
-            compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0,
+            compression_ratio=output_tokens / input_tokens,
         )
 
     # MAP_REDUCE level
@@ -154,13 +154,13 @@ async def _map_reduce_summary(
     # For content that fits in a single chunk, use content-type aware summary
     if input_tokens <= config.token_max:
         summary = await _content_aware_summary(content, config, prior_summary, content_type)
-        output_tokens = count_tokens(summary, config.model) if summary else 0
+        output_tokens = count_tokens(summary, config.model)
         return SummaryResult(
             level=SummaryLevel.MAP_REDUCE,
             summary=summary,
             input_tokens=input_tokens,
             output_tokens=output_tokens,
-            compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0,
+            compression_ratio=output_tokens / input_tokens,
             collapse_depth=0,
         )
 
diff --git a/agent_cli/summarizer/map_reduce.py b/agent_cli/summarizer/map_reduce.py
index 93aaabd8c..07332c1cf 100644
--- a/agent_cli/summarizer/map_reduce.py
+++ b/agent_cli/summarizer/map_reduce.py
@@ -89,16 +89,6 @@ async def map_reduce_summarize(
         MapReduceResult with summary and metadata.
 
     """
-    if not content or not content.strip():
-        return MapReduceResult(
-            summary="",
-            input_tokens=0,
-            output_tokens=0,
-            compression_ratio=0.0,
-            collapse_depth=0,
-            intermediate_summaries=[],
-        )
-
     input_tokens = count_tokens(content, config.model)
 
     # Map phase: Split and summarize chunks in parallel
@@ -137,7 +127,7 @@ async def map_reduce_summarize(
     if len(summaries) > 1:
         final_summary = await _synthesize(summaries, config)
     else:
-        final_summary = summaries[0] if summaries else ""
+        final_summary = summaries[0]
 
     output_tokens = count_tokens(final_summary, config.model)
 
@@ -145,7 +135,7 @@ async def map_reduce_summarize(
         summary=final_summary,
         input_tokens=input_tokens,
         output_tokens=output_tokens,
-        compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0,
+        compression_ratio=output_tokens / input_tokens,
         collapse_depth=depth,
         intermediate_summaries=intermediate_summaries,
     )

From aef0e9cc02deb98a8ff14581a2912724441e24d2 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Thu, 27 Nov 2025 09:59:24 -0800
Subject: [PATCH 28/37] feat(scripts): add summarizer comparison script with
 needle-in-haystack test

Compares old L1-L4 hierarchical vs new adaptive map-reduce approach:
- Shows which level each system would use
- Runs new summarizer and measures fact preservation
- Uses specific 'needle' facts embedded in test content
---
 scripts/compare_summarizers.py | 402 +++++++++++++++++++++++++++++++++
 1 file changed, 402 insertions(+)
 create mode 100644 scripts/compare_summarizers.py

diff --git a/scripts/compare_summarizers.py b/scripts/compare_summarizers.py
new file mode 100644
index 000000000..15265cb0e
--- /dev/null
+++ b/scripts/compare_summarizers.py
@@ -0,0 +1,402 @@
+"""Compare old (L1-L4 hierarchical) vs new (adaptive map-reduce) summarizer.
+
+This script:
+1. Shows what level each system would use for test content
+2. Runs the NEW summarizer to produce actual summaries
+3. Evaluates summary quality using needle-in-haystack questions
+4. Uses LLM-as-judge for quality assessment
+
+Usage:
+    python scripts/compare_summarizers.py
+    python scripts/compare_summarizers.py --model "gpt-4o-mini" --base-url "https://api.openai.com/v1"
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import os
+import textwrap
+from dataclasses import dataclass, field
+
+from agent_cli.summarizer import SummarizerConfig, summarize
+from agent_cli.summarizer._utils import count_tokens
+
+# Old system thresholds
+OLD_THRESHOLD_NONE = 100
+OLD_THRESHOLD_BRIEF = 500
+OLD_THRESHOLD_STANDARD = 3000
+OLD_THRESHOLD_DETAILED = 15000
+
+# New system thresholds
+NEW_THRESHOLD_NONE = 100
+NEW_THRESHOLD_BRIEF = 500
+
+# Evaluation threshold
+FACT_PRESERVATION_THRESHOLD = 0.5
+
+# Test content at different sizes with embedded "needles" (specific facts)
+TEST_CASES = [
+    {
+        "name": "Brief Range (~300 tokens)",
+        "description": "Tests the 100-500 token range where OLD=BRIEF, NEW=BRIEF",
+        "content": """
+        The artificial intelligence revolution is transforming every industry.
+        Machine learning algorithms now power recommendation systems, fraud detection,
+        and autonomous vehicles. Deep learning, a subset of machine learning, uses
+        neural networks with multiple layers to analyze complex patterns in data.
+
+        Major tech companies are investing billions in AI research. Google's DeepMind
+        created AlphaGo, which defeated world champion Lee Sedol in March 2016 in
+        the ancient game of Go. OpenAI developed GPT models that can generate
+        human-like text. These advances raise both excitement and concerns about
+        the future of work and society.
+
+        Researchers are working on making AI systems more transparent and aligned with
+        human values. The field of AI safety, pioneered by researchers like Stuart
+        Russell at UC Berkeley, aims to ensure that advanced AI systems remain
+        beneficial and under human control.
+        """,
+        "needles": [
+            ("Who did AlphaGo defeat?", "Lee Sedol"),
+            ("When did AlphaGo win?", "March 2016"),
+            ("Who pioneered AI safety?", "Stuart Russell"),
+            ("Where does Stuart Russell work?", "UC Berkeley"),
+        ],
+    },
+    {
+        "name": "Standard/MapReduce Range (~900 tokens)",
+        "description": "Tests 500-3000 range where OLD=STANDARD, NEW=MAP_REDUCE",
+        "content": """
+        Climate change represents one of the most pressing challenges facing humanity.
+        The Earth's average temperature has risen approximately 1.1 degrees Celsius since
+        the pre-industrial era, primarily due to human activities that release greenhouse
+        gases. Carbon dioxide from burning fossil fuels accounts for 76% of emissions.
+
+        The Intergovernmental Panel on Climate Change (IPCC), led by chair Hoesung Lee,
+        has warned that limiting warming to 1.5 degrees Celsius is crucial. The 2021
+        report involved 234 authors from 66 countries analyzing over 14,000 scientific
+        papers. Their conclusion: human influence has warmed the climate at a rate
+        unprecedented in at least the last 2,000 years.
+
+        Renewable energy offers hope. Solar panel costs dropped 89% between 2010 and 2020,
+        making solar competitive with fossil fuels. China leads with 306 gigawatts of
+        installed solar capacity. Wind energy has grown exponentially, with Denmark
+        generating 47% of its electricity from wind in 2019.
+
+        Electric vehicles are gaining ground. Tesla delivered 936,172 vehicles in 2021,
+        while traditional automakers race to electrify. Norway leads adoption, with
+        electric vehicles representing 65% of new car sales in 2021. Battery costs
+        have fallen 89% since 2010, from $1,100 to $132 per kilowatt-hour.
+
+        Carbon capture remains expensive at $250-$600 per ton of CO2. The Orca plant
+        in Iceland, opened in September 2021, captures just 4,000 tons annually.
+        Critics note this equals emissions from about 870 cars. More radical approaches
+        like solar radiation management could cool the planet but carry unknown risks.
+
+        The Paris Agreement, signed by 196 parties in December 2015, aims to limit
+        warming to well below 2 degrees. Countries submit Nationally Determined
+        Contributions (NDCs) outlining their emission reduction plans. However,
+        current pledges put the world on track for 2.7 degrees of warming by 2100.
+
+        Individual actions matter but systemic change is essential. Agriculture accounts
+        for 10-12% of global emissions. Beef production generates 60 kg of CO2 equivalent
+        per kilogram of meat. A plant-based diet could reduce food emissions by up to 73%.
+        """,
+        "needles": [
+            ("Who chairs the IPCC?", "Hoesung Lee"),
+            ("How many authors contributed to the 2021 IPCC report?", "234"),
+            ("What percent of Denmark's electricity comes from wind?", "47%"),
+            ("When did the Orca plant open?", "September 2021"),
+            ("How many vehicles did Tesla deliver in 2021?", "936,172"),
+            ("What percent of Norway's new cars are electric?", "65%"),
+            ("When was the Paris Agreement signed?", "December 2015"),
+            ("How much CO2 does beef production generate per kg?", "60 kg"),
+        ],
+    },
+    {
+        "name": "Detailed/MapReduce Range (~1800 tokens)",
+        "description": "Tests larger content where OLD=DETAILED (chunks+meta), NEW=MAP_REDUCE",
+        "content": """
+        The history of computing spans centuries of human innovation, from ancient
+        calculating devices to quantum computers. Understanding this evolution reveals
+        how incremental advances compound into revolutionary change.
+
+        Ancient Foundations (2400 BCE - 1600 CE)
+
+        The abacus emerged independently in multiple civilizations. Chinese merchants
+        used the suanpan as early as 2400 BCE for arithmetic. The Roman abacus used
+        grooved beads, while the Japanese soroban featured a distinctive 1:4 bead
+        arrangement still used today.
+
+        Mechanical Calculation (1600-1900)
+
+        In 1642, nineteen-year-old Blaise Pascal invented the Pascaline to help his
+        tax-collector father. This brass rectangular box could add and subtract using
+        interlocking gears. Only 50 were built, and 9 survive in museums today.
+
+        Gottfried Wilhelm Leibniz improved Pascal's design in 1694, creating the
+        Stepped Reckoner capable of multiplication and division. He also invented
+        binary arithmetic, writing "Explication de l'Arithmétique Binaire" in 1703,
+        laying groundwork for digital computing.
+
+        Charles Babbage designed the Analytical Engine from 1833-1871, incorporating
+        a mill (processor), store (memory), and punch card input. Ada Lovelace wrote
+        detailed notes including what's considered the first algorithm - for computing
+        Bernoulli numbers. The engine was never completed; Babbage died in 1871.
+
+        Electronic Era (1900-1970)
+
+        Alan Turing published "On Computable Numbers" in 1936, defining the theoretical
+        Turing machine. During WWII, he led the team at Bletchley Park that cracked
+        the Enigma code, shortening the war by an estimated two years.
+
+        ENIAC, completed February 14, 1946, at the University of Pennsylvania, was
+        the first general-purpose electronic computer. It weighed 30 tons, consumed
+        150 kilowatts, and contained 17,468 vacuum tubes. Programming required
+        physically rewiring the machine, taking days for each new problem.
+
+        The transistor, invented December 23, 1947, at Bell Labs by John Bardeen,
+        Walter Brattain, and William Shockley, revolutionized electronics. They
+        shared the 1956 Nobel Prize in Physics. By 1954, the TRADIC computer used
+        800 transistors instead of vacuum tubes.
+
+        Jack Kilby demonstrated the first integrated circuit on September 12, 1958,
+        at Texas Instruments. Robert Noyce independently developed a superior silicon
+        version at Fairchild. Kilby won the 2000 Nobel Prize; Noyce had died in 1990.
+
+        Personal Computing (1970-2000)
+
+        Intel's 4004, released November 15, 1971, was the first commercial microprocessor.
+        Designed by Federico Faggin, it contained 2,300 transistors running at 740 kHz.
+        The 8080 (1974) powered the Altair 8800, sparking the PC revolution.
+
+        Steve Wozniak built the Apple I in 1976 in his garage. The Apple II (1977)
+        featured color graphics and cost $1,298. IBM entered with the PC on August 12,
+        1981, using Microsoft's MS-DOS. By 1984, Apple's Macintosh introduced the GUI
+        to mainstream users at $2,495.
+
+        Tim Berners-Lee invented the World Wide Web at CERN in 1989, proposing it
+        on March 12. The first website went live December 20, 1990. By 1995, the
+        internet had 16 million users; by 2000, 361 million.
+
+        Modern Era (2000-Present)
+
+        Moore's Law, predicting transistor doubling every two years, has held since
+        Gordon Moore's 1965 observation. Intel's 2021 Alder Lake processors contain
+        10+ billion transistors on chips measuring 215 mm².
+
+        Steve Jobs unveiled the iPhone on January 9, 2007. It sold 1.4 million units
+        in its first year. Smartphones now exceed 6.6 billion globally, containing
+        more power than 1990s supercomputers.
+
+        Google claimed quantum supremacy October 23, 2019, with Sycamore completing
+        a calculation in 200 seconds that would take 10,000 years classically.
+        IBM disputed this, but the quantum era has clearly begun.
+        """,
+        "needles": [
+            ("How old was Pascal when he invented the Pascaline?", "19"),
+            ("When did Leibniz write about binary arithmetic?", "1703"),
+            ("How many vacuum tubes did ENIAC contain?", "17,468"),
+            ("When was the transistor invented?", "December 23, 1947"),
+            ("When did Jack Kilby demonstrate the integrated circuit?", "September 12, 1958"),
+            ("How many transistors did the Intel 4004 have?", "2,300"),
+            ("When did the first website go live?", "December 20, 1990"),
+            ("When did Jobs unveil the iPhone?", "January 9, 2007"),
+            ("When did Google claim quantum supremacy?", "October 23, 2019"),
+        ],
+    },
+]
+
+
+def get_old_level(tokens: int) -> tuple[str, str]:
+    """Determine what level the OLD (L1-L4) summarizer would use."""
+    if tokens < OLD_THRESHOLD_NONE:
+        return "NONE", "No summary needed"
+    if tokens < OLD_THRESHOLD_BRIEF:
+        return "BRIEF", "Single sentence (~20% compression)"
+    if tokens < OLD_THRESHOLD_STANDARD:
+        return "STANDARD", "Paragraph with content-aware prompts (~12%)"
+    if tokens < OLD_THRESHOLD_DETAILED:
+        return "DETAILED", "Chunked L1 summaries + meta L3 (~7%)"
+    return "HIERARCHICAL", "Full L1/L2/L3 tree structure"
+
+
+def get_new_level(tokens: int) -> tuple[str, str]:
+    """Determine what level the NEW (adaptive) summarizer would use."""
+    if tokens < NEW_THRESHOLD_NONE:
+        return "NONE", "No summary needed"
+    if tokens < NEW_THRESHOLD_BRIEF:
+        return "BRIEF", "Single sentence"
+    return "MAP_REDUCE", "Dynamic collapse based on content"
+
+
+@dataclass
+class TestResult:
+    """Result of testing one content sample."""
+
+    name: str
+    tokens: int
+    old_level: str
+    old_description: str
+    new_level: str
+    new_description: str
+    new_summary: str | None = None
+    needles_found: int = 0
+    total_needles: int = 0
+    needle_details: list[tuple[str, str, bool]] = field(default_factory=list)
+
+
+async def run_test(test_case: dict, config: dict) -> TestResult:
+    """Run a single test case."""
+    content = test_case["content"].strip()
+    tokens = count_tokens(content, config["model"])
+
+    old_level, old_desc = get_old_level(tokens)
+    new_level, new_desc = get_new_level(tokens)
+
+    # Run new summarizer
+    cfg = SummarizerConfig(
+        openai_base_url=config["base_url"],
+        model=config["model"],
+        api_key=config.get("api_key", "not-needed"),
+    )
+
+    result = await summarize(content, cfg, content_type="document")
+
+    # Check needles in summary
+    needle_details = []
+    needles_found = 0
+
+    if result.summary:
+        summary_lower = result.summary.lower()
+        for question, answer in test_case["needles"]:
+            # Check if the key fact is preserved
+            found = answer.lower() in summary_lower
+            needle_details.append((question, answer, found))
+            if found:
+                needles_found += 1
+
+    return TestResult(
+        name=test_case["name"],
+        tokens=tokens,
+        old_level=old_level,
+        old_description=old_desc,
+        new_level=new_level,
+        new_description=new_desc,
+        new_summary=result.summary,
+        needles_found=needles_found,
+        total_needles=len(test_case["needles"]),
+        needle_details=needle_details,
+    )
+
+
+def print_result(result: TestResult) -> None:
+    """Print a test result."""
+    print(f"\n{'=' * 70}")
+    print(f"{result.name}")
+    print(f"{'=' * 70}")
+    print(f"Input tokens: {result.tokens}")
+    print()
+    print("Level comparison:")
+    print(f"  OLD: {result.old_level:12} - {result.old_description}")
+    print(f"  NEW: {result.new_level:12} - {result.new_description}")
+    print()
+
+    if result.new_summary:
+        print("New summary:")
+        wrapped = textwrap.fill(
+            result.new_summary,
+            width=68,
+            initial_indent="  ",
+            subsequent_indent="  ",
+        )
+        print(wrapped)
+        print()
+
+        print(
+            f"Needle-in-haystack test: {result.needles_found}/{result.total_needles} facts preserved",
+        )
+        for question, answer, found in result.needle_details:
+            status = "[OK]" if found else "[MISSING]"
+            print(f"  {status} {question} -> {answer}")
+    else:
+        print("No summary produced (NONE level)")
+
+
+async def main() -> None:
+    """Run all tests."""
+    parser = argparse.ArgumentParser(description="Compare summarizer versions")
+    parser.add_argument("--model", default=os.environ.get("OPENAI_MODEL", "gpt-oss-high:20b"))
+    parser.add_argument(
+        "--base-url",
+        default=os.environ.get("OPENAI_BASE_URL", "http://192.168.1.143:9292/v1"),
+    )
+    parser.add_argument("--api-key", default=os.environ.get("OPENAI_API_KEY", "not-needed"))
+    args = parser.parse_args()
+
+    config = {
+        "model": args.model,
+        "base_url": args.base_url,
+        "api_key": args.api_key,
+    }
+
+    print("=" * 70)
+    print("SUMMARIZER COMPARISON: OLD (L1-L4) vs NEW (Adaptive Map-Reduce)")
+    print("=" * 70)
+    print(f"Model: {config['model']}")
+    print(f"Base URL: {config['base_url']}")
+
+    results = []
+    for test in TEST_CASES:
+        print(f"\nRunning: {test['name']}...")
+        result = await run_test(test, config)
+        results.append(result)
+        print_result(result)
+
+    # Summary
+    print("\n" + "=" * 70)
+    print("SUMMARY")
+    print("=" * 70)
+
+    total_needles = sum(r.total_needles for r in results)
+    found_needles = sum(r.needles_found for r in results)
+
+    print(
+        f"\nOverall fact preservation: {found_needles}/{total_needles} ({100 * found_needles / total_needles:.1f}%)",
+    )
+    print()
+
+    print("Key differences:")
+    print("""
+OLD System (5 levels):
+  - NONE (<100), BRIEF (100-500), STANDARD (500-3000),
+    DETAILED (3000-15000), HIERARCHICAL (>15000)
+  - Fixed boundaries, L1/L2/L3 tree for large content
+  - Stored intermediate summaries at each level
+  - Chunk size: 3000 tokens
+
+NEW System (3 levels):
+  - NONE (<100), BRIEF (100-500), MAP_REDUCE (>=500)
+  - Dynamic collapse depth based on content
+  - Content-type aware prompts
+  - Chunk size: 2048 tokens (BOOOOKSCORE research)
+  - Only stores final summary
+
+Trade-offs:
+  + Simpler (3 levels vs 5)
+  + Research-backed parameters
+  + Content-aware prompts
+  - No intermediate level access
+  - All >=500 token content treated the same
+""")
+
+    print("Verdict: ", end="")
+    if found_needles / total_needles >= FACT_PRESERVATION_THRESHOLD:
+        print("NEW system preserves facts adequately")
+    else:
+        print("NEW system may lose important details - further tuning needed")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())

From 86500c5b585d3d18eec31130d147c7d10230fcb7 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Thu, 27 Nov 2025 10:05:07 -0800
Subject: [PATCH 29/37] docs(summarizer): update architecture doc to reflect
 current implementation

- Remove references to old L1-L4/STANDARD/DETAILED/HIERARCHICAL levels
- Remove HierarchicalSummary and ChunkSummary (no longer exist)
- Update storage format to show single summary entry
- Add new section on limitations and trade-offs
- Simplify error handling section
- Add data models section with current code
---
 docs/architecture/summarizer.md | 231 ++++++++++++++++++--------------
 1 file changed, 128 insertions(+), 103 deletions(-)

diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md
index c34540bc1..43caf336d 100644
--- a/docs/architecture/summarizer.md
+++ b/docs/architecture/summarizer.md
@@ -37,7 +37,7 @@ LangChain's approach to document summarization uses a simple algorithm:
 1. **Map phase:** Split content into chunks, summarize each in parallel
 2. **Reduce phase:** If combined summaries exceed `token_max`, recursively collapse until they fit
 
-Key insight: No need for predetermined L1/L2/L3 levels. Dynamic depth based on actual content length. LangChain's default `token_max=3000`.
+Key insight: No need for predetermined levels. Dynamic depth based on actual content length. LangChain's default `token_max=3000`.
 
 ### 2.2 Borrowed: Chunk Size (BOOOOKSCORE)
 
@@ -51,25 +51,13 @@ BOOOOKSCORE's research on book-length summarization found optimal chunk sizes. T
 
 **Reference:** arXiv:2504.19413
 
-Mem0's memory layer research informed our storage architecture with a **two-phase approach**: separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to both files and vector DB.
+Mem0's memory layer research informed our storage architecture with a **two-phase approach**: separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to storage.
 
-### 2.4 Not Directly Borrowed: Letta's Approach
-
-**Reference:** arXiv:2310.08560
-
-Letta (MemGPT) uses a different paradigm focused on **context window management**:
-- Message count thresholds (e.g., 10 messages), not token thresholds
-- 30% partial eviction when buffer overflows
-- Purpose: fit conversation in LLM context window
-
-Our system has a different purpose (memory compression for storage/retrieval), so our implementation differs significantly.
-
-### 2.5 Original Design (Not Research-Backed)
+### 2.4 Original Design (Not Research-Backed)
 
 The following aspects are **original design choices without direct research justification**:
 
-- **Token thresholds (100/500):** The boundaries between NONE/BRIEF/map-reduce were chosen heuristically.
-- **L2 group logic for storage:** The intermediate summaries stored as "L2" is for backward compatibility with the storage layer.
+- **Token thresholds (100/500):** The boundaries between NONE/BRIEF/MAP_REDUCE were chosen heuristically.
 - **Content-type prompts:** Domain-specific prompts are original design.
 
 ---
@@ -78,51 +66,39 @@ The following aspects are **original design choices without direct research just
 
 ### 3.1 Map-Reduce with Dynamic Collapse
 
-**Decision:** Use LangChain-style map-reduce instead of fixed L1/L2/L3 levels.
+**Decision:** Use LangChain-style map-reduce instead of fixed hierarchy.
 
 **Rationale:**
 
-- **Simpler algorithm:** No need to distinguish STANDARD/DETAILED/HIERARCHICAL.
+- **Simpler algorithm:** Single code path handles all content sizes.
 - **Dynamic depth:** Collapse depth adapts to actual content length.
 - **Research-backed:** LangChain's approach is battle-tested.
 
 **Algorithm:**
 
 ```python
-def map_reduce_summarize(content, token_max=3000):
-    if tokens(content) <= token_max:
-        return summarize_directly(content)
-
+async def map_reduce_summarize(content, config):
     # Map: Split and summarize chunks in parallel
-    chunks = split_into_chunks(content, chunk_size=2048)
-    summaries = [summarize(chunk) for chunk in chunks]
+    chunks = chunk_text(content, chunk_size=2048)
+    summaries = await parallel_summarize(chunks)
 
-    # Reduce: Recursively collapse until fits
-    while total_tokens(summaries) > token_max:
-        groups = group_summaries_by_token_max(summaries, token_max)
-        summaries = [synthesize(group) for group in groups]
+    # Reduce: Recursively collapse until fits token_max
+    while total_tokens(summaries) > config.token_max:
+        groups = group_by_token_limit(summaries, config.token_max)
+        summaries = await parallel_synthesize(groups)
 
     return final_synthesis(summaries)
 ```
 
-### 3.2 Token-Based Level Selection (Simplified)
-
-**Decision:** Use three effective levels instead of five.
+### 3.2 Three-Level Strategy
 
-**Rationale:**
-
-- **Simplicity:** Fewer code paths, easier to understand.
-- **Dynamic instead of fixed:** Map-reduce adapts to content, no need for DETAILED vs HIERARCHICAL distinction.
-
-**Effective Levels:**
+**Decision:** Use three levels based on token count.
 
 | Level | Token Range | Strategy |
 | :--- | :--- | :--- |
 | NONE | < 100 | No summarization needed |
 | BRIEF | 100-500 | Single sentence |
-| MAP_REDUCE | > 500 | Dynamic collapse until fits token_max |
-
-**Backward Compatibility:** The output still reports STANDARD, DETAILED, or HIERARCHICAL based on collapse depth for storage compatibility.
+| MAP_REDUCE | >= 500 | Dynamic collapse until fits token_max |
 
 ### 3.3 Research-Backed Defaults
 
@@ -140,15 +116,15 @@ def map_reduce_summarize(content, token_max=3000):
 
 **Rationale:**
 
-- **Coherence preservation:** Splitting mid-sentence or mid-thought loses context and produces poor summaries.
-- **Natural units:** Paragraphs and sentences are natural semantic units that humans use to organize thoughts.
+- **Coherence preservation:** Splitting mid-sentence loses context.
+- **Natural units:** Paragraphs and sentences are natural semantic units.
 - **Overlap for continuity:** The 200-token overlap ensures concepts spanning chunk boundaries aren't lost.
 
 **Fallback chain:**
 
 1. Prefer paragraph boundaries (double newlines)
 2. Fall back to sentence boundaries (`.!?` followed by space + capital)
-3. Final fallback to character splitting for edge cases (e.g., code blocks without punctuation)
+3. Final fallback to word-based splitting
 
 ### 3.5 Content-Type Aware Prompts
 
@@ -156,35 +132,27 @@ def map_reduce_summarize(content, token_max=3000):
 
 **Rationale:**
 
-- **Conversations:** Focus on user preferences, decisions, action items—what the user wants and what was agreed.
-- **Journals:** Emphasize personal insights, emotional context, growth patterns—the subjective experience.
-- **Documents:** Prioritize key findings, methodology, conclusions—the objective content.
+- **Conversations:** Focus on user preferences, decisions, action items.
+- **Journals:** Emphasize personal insights, emotional context, growth patterns.
+- **Documents:** Prioritize key findings, methodology, conclusions.
 
-A generic summarization prompt loses domain-specific signal. By tailoring prompts, we extract what matters for each use case.
+A generic summarization prompt loses domain-specific signal.
 
 ### 3.6 Prior Summary Integration
 
-**Decision:** Always provide the previous summary as context when generating updates.
+**Decision:** Provide the previous summary as context when generating updates.
 
 **Rationale:**
 
-- **Continuity:** New summaries should build on existing context, not start fresh each time.
-- **Incremental updates:** Avoid re-summarizing all historical content on every update.
-- **Information preservation:** Important information from earlier content persists through the chain of summaries.
-
-The L3 summary from the previous run becomes prior context for the next summarization, allowing information to flow forward through time.
+- **Continuity:** New summaries build on existing context.
+- **Incremental updates:** Avoid re-summarizing all historical content.
+- **Information preservation:** Important information persists through the chain.
 
 ### 3.7 Compression Ratio Tracking
 
 **Decision:** Track and report compression metrics for every summary.
 
-**Rationale:**
-
-- **Transparency:** Users can understand how much information was compressed.
-- **Quality monitoring:** Unusual ratios (e.g., output longer than input) may indicate summarization issues.
-- **Optimization:** Metrics inform future threshold tuning and quality assessment.
-
-Every `SummaryResult` includes `input_tokens`, `output_tokens`, and `compression_ratio` for observability.
+Every `SummaryResult` includes `input_tokens`, `output_tokens`, `compression_ratio`, and `collapse_depth` for observability.
 
 ---
 
@@ -192,10 +160,10 @@ Every `SummaryResult` includes `input_tokens`, `output_tokens`, and `compression
 
 ### 4.1 Level Selection
 
-The entry point counts tokens and selects strategy:
+The entry point (`summarize()`) counts tokens and selects strategy:
 
 1. **Token counting:** Uses tiktoken with model-appropriate encoding. Falls back to character-based estimation (~4 chars/token) if tiktoken unavailable.
-2. **Threshold comparison:** Determines if NONE, BRIEF, or map-reduce.
+2. **Threshold comparison:** Determines NONE, BRIEF, or MAP_REDUCE.
 3. **Strategy dispatch:** Calls appropriate handler.
 
 ### 4.2 Brief Level
@@ -203,55 +171,118 @@ The entry point counts tokens and selects strategy:
 For short content (100-500 tokens):
 
 - Single LLM call with brief prompt
-- Returns simple `SummaryResult` with no hierarchical structure
+- Returns `SummaryResult` with single-sentence summary
 
 ### 4.3 Map-Reduce Level
 
-For longer content (> 500 tokens):
+For longer content (>= 500 tokens):
 
 1. **Check single-chunk:** If content fits in token_max, use content-type aware summary directly.
 2. **Map phase:** Split content into overlapping chunks, summarize each in parallel.
 3. **Reduce phase:** If combined summaries exceed token_max, group and re-summarize recursively.
 4. **Final synthesis:** Combine remaining summaries into final output.
 
-The parallelism in the map phase provides significant speedup for long content while maintaining semantic coherence through the collapse process.
+The `collapse_depth` field in the result indicates how many reduce iterations were needed.
 
 ---
 
-## 5. Integration with Memory System
+## 5. Data Models
+
+### 5.1 SummaryLevel
+
+```python
+class SummaryLevel(IntEnum):
+    NONE = 0       # < 100 tokens
+    BRIEF = 1      # 100-500 tokens
+    MAP_REDUCE = 2 # >= 500 tokens
+```
+
+### 5.2 SummaryResult
 
-### 5.1 Write Path
+```python
+class SummaryResult(BaseModel):
+    level: SummaryLevel
+    summary: str | None
+    input_tokens: int
+    output_tokens: int
+    compression_ratio: float
+    collapse_depth: int  # 0 = no collapse needed
+    created_at: datetime
+```
+
+### 5.3 SummarizerConfig
+
+```python
+@dataclass
+class SummarizerConfig:
+    openai_base_url: str
+    model: str
+    api_key: str | None = None
+    chunk_size: int = 2048      # BOOOOKSCORE
+    token_max: int = 3000       # LangChain
+    chunk_overlap: int = 200
+    max_concurrent_chunks: int = 5
+    timeout: float = 60.0
+```
+
+---
+
+## 6. Integration with Memory System
+
+### 6.1 Write Path
 
 The memory system triggers summarization during post-processing:
 
-1. Collect raw conversation turns (user message + assistant message)
-2. Retrieve existing L3 summary as prior context
+1. Collect raw conversation turns
+2. Retrieve existing summary as prior context
 3. Call summarizer with content + prior summary + content type
-4. Persist results: delete old summaries, write new files, upsert to ChromaDB
+4. Persist result to storage
 
-### 5.2 Read Path
+### 6.2 Read Path
 
 The memory retrieval system uses summaries for context injection:
 
-- Fetches L3 (final) summary for the conversation
-- Injects as prefix to retrieved memories in the prompt
-- Provides high-level context that individual memory snippets lack
+- Fetches summary for the conversation
+- Injects as prefix to retrieved memories
+- Provides high-level context that individual snippets lack
+
+### 6.3 Storage
+
+Summaries are stored with metadata:
 
-### 5.3 Storage
+```python
+{
+    "id": "{conversation_id}:summary",
+    "content": summary_text,
+    "metadata": {
+        "conversation_id": conversation_id,
+        "role": "summary",
+        "summary_level": "MAP_REDUCE",
+        "input_tokens": 1500,
+        "output_tokens": 150,
+        "compression_ratio": 0.1,
+        "collapse_depth": 1,
+        "created_at": "2024-01-15T10:30:00Z",
+    },
+}
+```
+
+---
+
+## 7. Error Handling
 
-Summaries are persisted in two places:
+Summarization follows a fail-fast philosophy:
 
-- **Files:** Markdown with YAML front matter under `summaries/L1/`, `L2/`, `L3/` directories. Human-readable, git-trackable.
-- **ChromaDB:** Vector embeddings for semantic search. Metadata includes level, compression metrics, timestamps.
+- **LLM errors:** Propagated as `SummarizationError` (base class for all summarization errors).
+- **Empty input:** Returns NONE level immediately (not an error).
+- **Encoding errors:** Falls back to character-based token estimation.
+- **Max depth exceeded:** Warning logged, forces final synthesis even if over token_max.
 
-For backward compatibility, the dynamic collapse levels are mapped to L1/L2/L3 structure:
-- First collapse level → L1 (chunk summaries)
-- Intermediate levels → L2 (grouped summaries)
-- Final output → L3 (synthesis)
+The caller decides how to handle failures—typically by proceeding without a summary rather than blocking the entire operation.
 
 ---
 
-## 6. Configuration
+## 8. Configuration
 
 | Parameter | Default | Source |
 | :--- | :--- | :--- |
@@ -263,34 +294,28 @@ For backward compatibility, the dynamic collapse levels are mapped to L1/L2/L3 s
 
 ---
 
-## 7. Error Handling
+## 9. Limitations and Trade-offs
 
-Summarization follows a fail-fast philosophy:
+### 9.1 Fact Preservation
 
-- **LLM errors:** Propagated as `SummarizationError` or `MapReduceSummarizationError` rather than silently returning empty results.
-- **Empty input:** Returns NONE level immediately (not an error).
-- **Encoding errors:** Falls back to character-based token estimation.
-- **Max depth exceeded:** Warning logged, forces final synthesis even if over token_max.
+Summarization is inherently lossy. Specific facts (dates, numbers, names) are often dropped in favor of thematic content. If your use case requires fact retrieval:
 
-The caller (memory system) decides how to handle failures—typically by proceeding without a summary rather than blocking the entire write path.
+- Store original content alongside summaries
+- Use fact extraction instead of summarization
+- Use RAG to retrieve original chunks
 
----
+### 9.2 No Intermediate Summaries
 
-## 8. Comparison: Old vs New Approach
+Unlike hierarchical approaches, map-reduce only stores the final summary. Intermediate chunk summaries are discarded after synthesis. This simplifies storage but removes granular access.
 
-| Aspect | Old Approach | New Approach |
-| :--- | :--- | :--- |
-| Levels | 5 fixed (NONE/BRIEF/STANDARD/DETAILED/HIERARCHICAL) | 3 effective (NONE/BRIEF/MAP_REDUCE) |
-| Hierarchy | Fixed L1/L2/L3 structure | Dynamic collapse depth |
-| Chunk size | 3000 tokens | 2048 tokens (BOOOOKSCORE) |
-| token_max | N/A (fixed levels) | 3000 (LangChain) |
-| Complexity | Multiple code paths | Single map-reduce algorithm |
-| Research basis | Heuristic | LangChain + BOOOOKSCORE |
+### 9.3 Fixed Thresholds
+
+The 100/500 token thresholds are heuristic. They may need tuning for specific domains or languages.
 
 ---
 
-## 9. Future Improvements
+## 10. Future Improvements
 
 1. **Benchmark against BOOOOKSCORE metrics** for coherence evaluation
-2. **Add incremental updating mode** as alternative to hierarchical merging for larger context models
-3. **Tune token thresholds empirically** with real-world content
+2. **Tune token thresholds empirically** with real-world content
+3. **Add fact extraction mode** for use cases requiring specific detail preservation

From 63b755a0bd2b51bf31913934c5bc804712f409fd Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Thu, 27 Nov 2025 10:29:15 -0800
Subject: [PATCH 30/37] docs: update memory.md for 3-level summarizer

Remove outdated references to 5-level hierarchy (STANDARD, DETAILED,
HIERARCHICAL) and L1/L2/L3 storage structure. Update to reflect current
3-level system (NONE, BRIEF, MAP_REDUCE) with single final summary.

Also fix prompt names to match actual implementation:
- BRIEF_SUMMARY_PROMPT, STANDARD_SUMMARY_PROMPT
- CHUNK_SUMMARY_PROMPT, META_SUMMARY_PROMPT
- Remove non-existent ROLLING_PROMPT
---
 docs/architecture/memory.md | 38 +++++++++++++++----------------------
 1 file changed, 15 insertions(+), 23 deletions(-)

diff --git a/docs/architecture/memory.md b/docs/architecture/memory.md
index b42e739a9..361640e9c 100644
--- a/docs/architecture/memory.md
+++ b/docs/architecture/memory.md
@@ -164,13 +164,7 @@ entries/
       assistant/
         <timestamp>__<uuid>.md     # Raw assistant responses
     summaries/
-      L1/
-        chunk_0.md                 # Level 1: Individual chunk summaries
-        chunk_1.md
-      L2/
-        group_0.md                 # Level 2: Group summaries (groups of ~5 L1s)
-      L3/
-        final.md                   # Level 3: Final synthesized summary
+      <conversation_id>__summary.md  # Single final summary (map-reduce collapses to one)
 ```
 
 **Deleted Directory Structure (Soft Deletes):**
@@ -182,7 +176,7 @@ entries/
       facts/
         <timestamp>__<uuid>.md
       summaries/
-        L1/, L2/, L3/              # Tombstoned summary levels
+        <conversation_id>__summary.md  # Tombstoned summary
 ```
 
 ### 2.2 File Format
@@ -276,17 +270,16 @@ Resolves contradictions using a "Search-Decide-Update" loop with complete enumer
     *   **Updates:** Implemented as delete + add with a fresh ID; tombstones record `replaced_by`.
     *   **Deletes:** Soft-deletes files (moved under `deleted/`) and removes from Chroma.
 
-### 4.4 Summarization (Adaptive Hierarchical)
+### 4.4 Summarization (Adaptive Map-Reduce)
 Uses the `agent_cli.summarizer` module for research-backed adaptive summarization.
 
-*   **Level Selection:** Automatically determines summarization depth based on token count:
+*   **Level Selection:** Automatically determines summarization strategy based on token count:
     *   `NONE` (< 100 tokens): No summary needed, facts only.
-    *   `BRIEF` (100-500 tokens): Single-sentence summary (~20% compression).
-    *   `STANDARD` (500-3000 tokens): Paragraph summary (~12% compression).
-    *   `DETAILED` (3000-15000 tokens): Chunked summaries + meta-summary (~7% compression).
-    *   `HIERARCHICAL` (> 15000 tokens): Full L1/L2/L3 tree structure.
-*   **Input:** Previous L3 summary (if any) + newly extracted facts.
-*   **Persistence:** Stores summaries in `summaries/L1/`, `L2/`, `L3/` subdirectories with YAML front matter containing compression metrics.
+    *   `BRIEF` (100-500 tokens): Single-sentence summary.
+    *   `MAP_REDUCE` (>= 500 tokens): Dynamic collapse using map-reduce with content-type aware prompts.
+*   **Algorithm:** LangChain-inspired map-reduce that recursively collapses until content fits token_max (3000).
+*   **Input:** Previous summary (if any) + newly extracted facts.
+*   **Persistence:** Stores single final summary in `summaries/` directory with YAML front matter containing compression metrics.
 *   **See:** `docs/architecture/summarizer.md` for detailed algorithm specification.
 
 ### 4.5 Eviction
@@ -318,13 +311,12 @@ To replicate the system behavior, the following prompt strategies are required.
 *   **Output constraints:** JSON list containing all memories; each existing memory must have an event; new unrelated facts must be ADDed; no prose or code fences.
 
 ### 5.3 Summarization (Adaptive Prompts)
-The summarizer uses level-specific prompts from `agent_cli.summarizer._prompts`:
-*   **`BRIEF_PROMPT`:** Single-sentence distillation for short content.
-*   **`STANDARD_PROMPT`:** Paragraph summary with prior context integration.
-*   **`CHUNK_PROMPT`:** Individual chunk summarization for hierarchical processing.
-*   **`META_PROMPT`:** Synthesizes multiple chunk summaries into cohesive narrative.
-*   **`ROLLING_PROMPT`:** Integrates new facts with existing summary.
-*   **Content-type variants:** `CONVERSATION_PROMPT`, `JOURNAL_PROMPT`, `DOCUMENT_PROMPT` for domain-specific summarization.
+The summarizer uses prompts from `agent_cli.summarizer._prompts`:
+*   **`BRIEF_SUMMARY_PROMPT`:** Single-sentence distillation for short content (100-500 tokens).
+*   **`STANDARD_SUMMARY_PROMPT`:** Paragraph summary with prior context integration (general content).
+*   **`CHUNK_SUMMARY_PROMPT`:** Individual chunk summarization for map phase.
+*   **`META_SUMMARY_PROMPT`:** Synthesizes multiple chunk summaries in reduce phase.
+*   **Content-type variants:** `CONVERSATION_SUMMARY_PROMPT`, `JOURNAL_SUMMARY_PROMPT`, `DOCUMENT_SUMMARY_PROMPT` for domain-specific summarization.
 
 ---
 

From 88869c257951edacdc039219f4f965ac845d0359 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Thu, 27 Nov 2025 10:42:46 -0800
Subject: [PATCH 31/37] refactor(summarizer): rename STANDARD_SUMMARY_PROMPT to
 GENERAL_SUMMARY_PROMPT

The prompt name "STANDARD" was a leftover from the old 5-level system
which had a STANDARD SummaryLevel. Since that level no longer exists
(now just NONE, BRIEF, MAP_REDUCE), rename to GENERAL_SUMMARY_PROMPT
to match its actual purpose as the "general" content type prompt.
---
 agent_cli/summarizer/_prompts.py |  8 ++++----
 docs/architecture/memory.md      |  2 +-
 tests/summarizer/test_prompts.py | 32 ++++++++++++++++----------------
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/agent_cli/summarizer/_prompts.py b/agent_cli/summarizer/_prompts.py
index 1de5fa44f..476cb408e 100644
--- a/agent_cli/summarizer/_prompts.py
+++ b/agent_cli/summarizer/_prompts.py
@@ -13,8 +13,8 @@
 
 One-sentence summary:""".strip()
 
-# MAP_REDUCE level - Paragraph summary for content-type aware summarization
-STANDARD_SUMMARY_PROMPT = """Summarize the following content concisely in a short paragraph.
+# MAP_REDUCE level - Paragraph summary for general content type
+GENERAL_SUMMARY_PROMPT = """Summarize the following content concisely in a short paragraph.
 
 Focus on:
 - Key facts, decisions, and outcomes
@@ -104,12 +104,12 @@ def get_prompt_for_content_type(content_type: str) -> str:
 
     """
     prompts = {
-        "general": STANDARD_SUMMARY_PROMPT,
+        "general": GENERAL_SUMMARY_PROMPT,
         "conversation": CONVERSATION_SUMMARY_PROMPT,
         "journal": JOURNAL_SUMMARY_PROMPT,
         "document": DOCUMENT_SUMMARY_PROMPT,
     }
-    return prompts.get(content_type, STANDARD_SUMMARY_PROMPT)
+    return prompts.get(content_type, GENERAL_SUMMARY_PROMPT)
 
 
 def format_prior_context(prior_summary: str | None) -> str:
diff --git a/docs/architecture/memory.md b/docs/architecture/memory.md
index 361640e9c..66331d9f7 100644
--- a/docs/architecture/memory.md
+++ b/docs/architecture/memory.md
@@ -313,7 +313,7 @@ To replicate the system behavior, the following prompt strategies are required.
 ### 5.3 Summarization (Adaptive Prompts)
 The summarizer uses prompts from `agent_cli.summarizer._prompts`:
 *   **`BRIEF_SUMMARY_PROMPT`:** Single-sentence distillation for short content (100-500 tokens).
-*   **`STANDARD_SUMMARY_PROMPT`:** Paragraph summary with prior context integration (general content).
+*   **`GENERAL_SUMMARY_PROMPT`:** Paragraph summary with prior context integration (general content).
 *   **`CHUNK_SUMMARY_PROMPT`:** Individual chunk summarization for map phase.
 *   **`META_SUMMARY_PROMPT`:** Synthesizes multiple chunk summaries in reduce phase.
 *   **Content-type variants:** `CONVERSATION_SUMMARY_PROMPT`, `JOURNAL_SUMMARY_PROMPT`, `DOCUMENT_SUMMARY_PROMPT` for domain-specific summarization.
diff --git a/tests/summarizer/test_prompts.py b/tests/summarizer/test_prompts.py
index 660229709..ef05ebad5 100644
--- a/tests/summarizer/test_prompts.py
+++ b/tests/summarizer/test_prompts.py
@@ -7,9 +7,9 @@
     CHUNK_SUMMARY_PROMPT,
     CONVERSATION_SUMMARY_PROMPT,
     DOCUMENT_SUMMARY_PROMPT,
+    GENERAL_SUMMARY_PROMPT,
     JOURNAL_SUMMARY_PROMPT,
     META_SUMMARY_PROMPT,
-    STANDARD_SUMMARY_PROMPT,
     format_prior_context,
     format_summaries_for_meta,
     get_prompt_for_content_type,
@@ -26,13 +26,13 @@ def test_brief_prompt_has_content_placeholder(self) -> None:
         result = BRIEF_SUMMARY_PROMPT.format(content="Test content")
         assert "Test content" in result
 
-    def test_standard_prompt_has_placeholders(self) -> None:
-        """Test STANDARD prompt contains required placeholders."""
-        assert "{content}" in STANDARD_SUMMARY_PROMPT
-        assert "{prior_context}" in STANDARD_SUMMARY_PROMPT
-        assert "{max_words}" in STANDARD_SUMMARY_PROMPT
+    def test_general_prompt_has_placeholders(self) -> None:
+        """Test GENERAL prompt contains required placeholders."""
+        assert "{content}" in GENERAL_SUMMARY_PROMPT
+        assert "{prior_context}" in GENERAL_SUMMARY_PROMPT
+        assert "{max_words}" in GENERAL_SUMMARY_PROMPT
 
-        result = STANDARD_SUMMARY_PROMPT.format(
+        result = GENERAL_SUMMARY_PROMPT.format(
             content="Main content",
             prior_context="Previous context",
             max_words=100,
@@ -92,10 +92,10 @@ def test_document_prompt_has_placeholders(self) -> None:
 class TestGetPromptForContentType:
     """Tests for get_prompt_for_content_type function."""
 
-    def test_general_returns_standard(self) -> None:
-        """Test general content type returns standard prompt."""
+    def test_general_returns_general(self) -> None:
+        """Test general content type returns general prompt."""
         prompt = get_prompt_for_content_type("general")
-        assert prompt == STANDARD_SUMMARY_PROMPT
+        assert prompt == GENERAL_SUMMARY_PROMPT
 
     def test_conversation_returns_conversation(self) -> None:
         """Test conversation content type returns conversation prompt."""
@@ -112,15 +112,15 @@ def test_document_returns_document(self) -> None:
         prompt = get_prompt_for_content_type("document")
         assert prompt == DOCUMENT_SUMMARY_PROMPT
 
-    def test_unknown_returns_standard(self) -> None:
-        """Test unknown content type falls back to standard."""
+    def test_unknown_returns_general(self) -> None:
+        """Test unknown content type falls back to general."""
         prompt = get_prompt_for_content_type("unknown_type")
-        assert prompt == STANDARD_SUMMARY_PROMPT
+        assert prompt == GENERAL_SUMMARY_PROMPT
 
-    def test_empty_returns_standard(self) -> None:
-        """Test empty string falls back to standard."""
+    def test_empty_returns_general(self) -> None:
+        """Test empty string falls back to general."""
         prompt = get_prompt_for_content_type("")
-        assert prompt == STANDARD_SUMMARY_PROMPT
+        assert prompt == GENERAL_SUMMARY_PROMPT
 
 
 class TestFormatPriorContext:

From df8f05688a0f98d287eb53104acfa64e62dbd697 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Thu, 27 Nov 2025 11:12:27 -0800
Subject: [PATCH 32/37] docs: clarify prompt comments to avoid confusion with
 level names

---
 agent_cli/summarizer/_prompts.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/agent_cli/summarizer/_prompts.py b/agent_cli/summarizer/_prompts.py
index 476cb408e..de59f9404 100644
--- a/agent_cli/summarizer/_prompts.py
+++ b/agent_cli/summarizer/_prompts.py
@@ -4,7 +4,7 @@
 and are optimized for structured, factual output.
 """
 
-# BRIEF level - Single sentence summary for short content (100-500 tokens)
+# Single sentence summary for short content (used at BRIEF level, 100-500 tokens)
 BRIEF_SUMMARY_PROMPT = """Summarize the following in ONE sentence (maximum 20 words).
 Focus on the single most important point or takeaway.
 
@@ -13,7 +13,7 @@
 
 One-sentence summary:""".strip()
 
-# MAP_REDUCE level - Paragraph summary for general content type
+# Paragraph summary for "general" content type (default when no specific type provided)
 GENERAL_SUMMARY_PROMPT = """Summarize the following content concisely in a short paragraph.
 
 Focus on:

From f550b3fa4293aa6a078c1037746becd80e64bad5 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Fri, 28 Nov 2025 22:49:27 -0800
Subject: [PATCH 33/37] Chunk memories

---
 agent_cli/rag/client.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/agent_cli/rag/client.py b/agent_cli/rag/client.py
index be930ab48..af3438739 100644
--- a/agent_cli/rag/client.py
+++ b/agent_cli/rag/client.py
@@ -125,8 +125,17 @@ def add(
             for i in range(len(chunks))
         ]
 
-        # Upsert to collection
-        self.collection.upsert(ids=ids, documents=chunks, metadatas=metadatas)
+        # Upsert to collection in batches to avoid overwhelming the embedding service
+        batch_size = 10
+        for i in range(0, len(ids), batch_size):
+            batch_ids = ids[i : i + batch_size]
+            batch_docs = chunks[i : i + batch_size]
+            batch_metas = metadatas[i : i + batch_size]
+            self.collection.upsert(
+                ids=batch_ids,
+                documents=batch_docs,
+                metadatas=batch_metas,
+            )
         logger.info("Added doc_id=%s with %d chunks", doc_id, len(chunks))
 
         return doc_id

From 4f1d16a529210b1dd73291483578791984d60fad Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Fri, 28 Nov 2025 23:12:38 -0800
Subject: [PATCH 34/37] refactor(summarizer): remove dead code and reorganize
 models

- Remove unused `middle_truncate()` function and its tests
- Remove unused `MapReduceSummarizationError` exception class
- Move `SummarizerConfig` and `SummarizationError` from _utils.py to models.py

This groups all exported types in models.py and keeps _utils.py focused
on actual utility functions (token counting, chunking, LLM calls).

Net: -96 lines
---
 agent_cli/summarizer/__init__.py   |  9 +++-
 agent_cli/summarizer/_utils.py     | 87 +-----------------------------
 agent_cli/summarizer/adaptive.py   |  4 +-
 agent_cli/summarizer/map_reduce.py |  8 +--
 agent_cli/summarizer/models.py     | 36 +++++++++++++
 tests/summarizer/test_utils.py     | 46 ----------------
 6 files changed, 47 insertions(+), 143 deletions(-)

diff --git a/agent_cli/summarizer/__init__.py b/agent_cli/summarizer/__init__.py
index af977ada1..daf0e2bc6 100644
--- a/agent_cli/summarizer/__init__.py
+++ b/agent_cli/summarizer/__init__.py
@@ -21,8 +21,13 @@
 
 """
 
-from agent_cli.summarizer.adaptive import SummarizationError, SummarizerConfig, summarize
-from agent_cli.summarizer.models import SummaryLevel, SummaryResult
+from agent_cli.summarizer.adaptive import summarize
+from agent_cli.summarizer.models import (
+    SummarizationError,
+    SummarizerConfig,
+    SummaryLevel,
+    SummaryResult,
+)
 
 __all__ = [
     "SummarizationError",
diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py
index 078e21edc..23c8dd195 100644
--- a/agent_cli/summarizer/_utils.py
+++ b/agent_cli/summarizer/_utils.py
@@ -3,13 +3,12 @@
 from __future__ import annotations
 
 import re
-from dataclasses import dataclass
 from functools import lru_cache
 from typing import TYPE_CHECKING
 
 from pydantic import BaseModel
 
-from agent_cli.summarizer.models import SummaryLevel
+from agent_cli.summarizer.models import SummarizationError, SummarizerConfig, SummaryLevel
 
 if TYPE_CHECKING:
     import tiktoken
@@ -21,41 +20,6 @@ class SummaryOutput(BaseModel):
     summary: str
 
 
-class SummarizationError(Exception):
-    """Raised when summarization fails after all retries."""
-
-
-@dataclass
-class SummarizerConfig:
-    """Configuration for summarization operations.
-
-    Example:
-        config = SummarizerConfig(
-            openai_base_url="http://localhost:8000/v1",
-            model="llama3.1:8b",
-        )
-        result = await summarize(long_document, config)
-        print(f"Level: {result.level.name}")
-        print(f"Compression: {result.compression_ratio:.1%}")
-
-    """
-
-    openai_base_url: str
-    model: str
-    api_key: str | None = None
-    chunk_size: int = 2048  # BOOOOKSCORE's tested default
-    token_max: int = 3000  # LangChain's default - when to collapse
-    chunk_overlap: int = 200
-    max_concurrent_chunks: int = 5
-    timeout: float = 60.0
-
-    def __post_init__(self) -> None:
-        """Normalize the base URL."""
-        self.openai_base_url = self.openai_base_url.rstrip("/")
-        if self.api_key is None:
-            self.api_key = "not-needed"
-
-
 async def generate_summary(
     prompt: str,
     config: SummarizerConfig,
@@ -266,55 +230,6 @@ def _get_overlap_text(chunks: list[str], target_tokens: int, model: str) -> str:
     return " ".join(overlap_parts)
 
 
-def middle_truncate(
-    text: str,
-    budget_chars: int,
-    head_frac: float = 0.3,
-    tail_frac: float = 0.3,
-) -> tuple[str, int]:
-    """Middle-truncate text to fit within a character budget.
-
-    Keeps the first head_frac and last tail_frac portions, dropping the middle.
-    This preserves context from both the beginning (often contains setup) and
-    end (often contains conclusions/recent events).
-
-    Inspired by Letta's `middle_truncate_text` function.
-
-    Args:
-        text: Text to truncate.
-        budget_chars: Maximum character count for output.
-        head_frac: Fraction of budget for the head portion.
-        tail_frac: Fraction of budget for the tail portion.
-
-    Returns:
-        Tuple of (truncated_text, dropped_char_count).
-
-    """
-    if budget_chars <= 0 or len(text) <= budget_chars:
-        return text, 0
-
-    head_len = max(0, int(budget_chars * head_frac))
-    tail_len = max(0, int(budget_chars * tail_frac))
-
-    # Ensure head + tail doesn't exceed budget
-    if head_len + tail_len > budget_chars:
-        tail_len = max(0, budget_chars - head_len)
-
-    head = text[:head_len]
-    tail = text[-tail_len:] if tail_len > 0 else ""
-    dropped = max(0, len(text) - (len(head) + len(tail)))
-
-    marker = f"\n[...{dropped} characters truncated...]\n"
-
-    # If marker would overflow budget, shrink tail
-    available_for_marker = budget_chars - (len(head) + len(tail))
-    if available_for_marker < len(marker):
-        over = len(marker) - available_for_marker
-        tail = tail[:-over] if over < len(tail) else ""
-
-    return head + marker + tail, dropped
-
-
 def estimate_summary_tokens(input_tokens: int, level: int) -> int:
     """Estimate target summary tokens based on input size and level."""
     if level == SummaryLevel.NONE:
diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py
index 640c52e60..f242b662f 100644
--- a/agent_cli/summarizer/adaptive.py
+++ b/agent_cli/summarizer/adaptive.py
@@ -26,8 +26,6 @@
     get_prompt_for_content_type,
 )
 from agent_cli.summarizer._utils import (
-    SummarizationError,
-    SummarizerConfig,
     count_tokens,
     estimate_summary_tokens,
     generate_summary,
@@ -35,6 +33,8 @@
 )
 from agent_cli.summarizer.map_reduce import map_reduce_summarize
 from agent_cli.summarizer.models import (
+    SummarizationError,
+    SummarizerConfig,
     SummaryLevel,
     SummaryResult,
 )
diff --git a/agent_cli/summarizer/map_reduce.py b/agent_cli/summarizer/map_reduce.py
index 07332c1cf..3dd81aa43 100644
--- a/agent_cli/summarizer/map_reduce.py
+++ b/agent_cli/summarizer/map_reduce.py
@@ -25,23 +25,17 @@
     format_summaries_for_meta,
 )
 from agent_cli.summarizer._utils import (
-    SummarizationError,
-    SummarizerConfig,
     chunk_text,
     count_tokens,
     estimate_summary_tokens,
     generate_summary,
     tokens_to_words,
 )
-from agent_cli.summarizer.models import SummaryLevel
+from agent_cli.summarizer.models import SummarizerConfig, SummaryLevel
 
 logger = logging.getLogger(__name__)
 
 
-class MapReduceSummarizationError(SummarizationError):
-    """Raised when map-reduce summarization fails."""
-
-
 @dataclass
 class MapReduceResult:
     """Result of map-reduce summarization.
diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py
index be0d309be..14be0c864 100644
--- a/agent_cli/summarizer/models.py
+++ b/agent_cli/summarizer/models.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+from dataclasses import dataclass
 from datetime import UTC, datetime
 from enum import IntEnum
 from typing import Any
@@ -9,6 +10,41 @@
 from pydantic import BaseModel, Field
 
 
+class SummarizationError(Exception):
+    """Raised when summarization fails after all retries."""
+
+
+@dataclass
+class SummarizerConfig:
+    """Configuration for summarization operations.
+
+    Example:
+        config = SummarizerConfig(
+            openai_base_url="http://localhost:8000/v1",
+            model="llama3.1:8b",
+        )
+        result = await summarize(long_document, config)
+        print(f"Level: {result.level.name}")
+        print(f"Compression: {result.compression_ratio:.1%}")
+
+    """
+
+    openai_base_url: str
+    model: str
+    api_key: str | None = None
+    chunk_size: int = 2048  # BOOOOKSCORE's tested default
+    token_max: int = 3000  # LangChain's default - when to collapse
+    chunk_overlap: int = 200
+    max_concurrent_chunks: int = 5
+    timeout: float = 60.0
+
+    def __post_init__(self) -> None:
+        """Normalize the base URL."""
+        self.openai_base_url = self.openai_base_url.rstrip("/")
+        if self.api_key is None:
+            self.api_key = "not-needed"
+
+
 class SummaryLevel(IntEnum):
     """Summary strategy based on input length."""
 
diff --git a/tests/summarizer/test_utils.py b/tests/summarizer/test_utils.py
index 2621b158e..188a79172 100644
--- a/tests/summarizer/test_utils.py
+++ b/tests/summarizer/test_utils.py
@@ -6,7 +6,6 @@
     chunk_text,
     count_tokens,
     estimate_summary_tokens,
-    middle_truncate,
     tokens_to_words,
 )
 
@@ -86,51 +85,6 @@ def test_large_paragraph_sentence_split(self) -> None:
         assert len(chunks) > 1
 
 
-class TestMiddleTruncate:
-    """Tests for middle_truncate function."""
-
-    def test_no_truncation_needed(self) -> None:
-        """Test that short text is not truncated."""
-        text = "Short text"
-        result, dropped = middle_truncate(text, budget_chars=100)
-        assert result == text
-        assert dropped == 0
-
-    def test_basic_truncation(self) -> None:
-        """Test basic middle truncation."""
-        text = "A" * 100  # 100 character string
-        result, dropped = middle_truncate(text, budget_chars=50)
-
-        # Should have head + marker + tail
-        assert len(result) <= 50 + 50  # Allow for marker
-        assert dropped > 0
-        assert "[..." in result
-        assert "truncated...]" in result
-
-    def test_head_tail_fractions(self) -> None:
-        """Test custom head/tail fractions."""
-        text = "AAAAA" + "BBBBB" * 20 + "CCCCC"
-        result, dropped = middle_truncate(text, budget_chars=30, head_frac=0.5, tail_frac=0.5)
-
-        # Should preserve beginning (A's) and end (C's)
-        assert result.startswith("A")
-        assert dropped > 0
-
-    def test_zero_budget(self) -> None:
-        """Test with zero budget returns original."""
-        text = "Some text"
-        result, dropped = middle_truncate(text, budget_chars=0)
-        assert result == text
-        assert dropped == 0
-
-    def test_negative_budget(self) -> None:
-        """Test with negative budget returns original."""
-        text = "Some text"
-        result, dropped = middle_truncate(text, budget_chars=-10)
-        assert result == text
-        assert dropped == 0
-
-
 class TestEstimateSummaryTokens:
     """Tests for estimate_summary_tokens function."""
 

From 1ed9ff4ba1ae92aac70100c96677eae27fa55b96 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 3 Dec 2025 20:10:28 -0800
Subject: [PATCH 35/37] refactor(summarizer): simplify API with
 target_tokens/target_ratio parameters

Remove SummaryLevel enum and three-level strategy in favor of a simple
"fits target? return as-is : map-reduce" approach. This reduces complexity
while maintaining full functionality.

Changes:
- Remove SummaryLevel enum (NONE/BRIEF/MAP_REDUCE)
- Add target_tokens parameter for absolute token limit
- Add target_ratio parameter for relative compression (e.g., 0.2 = 20%)
- Simplify estimate_summary_tokens to use ~10% compression ratio
- Update memory integration to use compression_ratio in logging
- Rewrite examples and tests for new API
- Update architecture documentation

Net reduction: ~165 lines of code
---
 agent_cli/agents/summarize.py        |   5 +-
 agent_cli/memory/_ingest.py          |   4 +-
 agent_cli/memory/_persistence.py     |   6 +-
 agent_cli/memory/models.py           |   2 +-
 agent_cli/summarizer/__init__.py     |  18 ++-
 agent_cli/summarizer/_utils.py       |  14 +-
 agent_cli/summarizer/adaptive.py     | 138 +++++++------------
 agent_cli/summarizer/map_reduce.py   |  53 +++++---
 agent_cli/summarizer/models.py       |  26 +---
 docs/architecture/summarizer.md      | 195 ++++++++++++++++-----------
 examples/summarizer_demo.py          | 164 ++++++++++------------
 tests/memory/test_engine.py          |   4 +-
 tests/memory/test_git_integration.py |   3 +-
 tests/summarizer/test_adaptive.py    | 168 +++++++----------------
 tests/summarizer/test_integration.py |  34 +----
 tests/summarizer/test_models.py      |  60 ++-------
 tests/summarizer/test_utils.py       |  48 +++----
 17 files changed, 388 insertions(+), 554 deletions(-)

diff --git a/agent_cli/agents/summarize.py b/agent_cli/agents/summarize.py
index ec516310e..ecfd1e053 100644
--- a/agent_cli/agents/summarize.py
+++ b/agent_cli/agents/summarize.py
@@ -115,7 +115,7 @@ def _display_result(
     elif result.summary:
         print_output_panel(
             result.summary,
-            title=f"Summary (Level: {result.level.name})",
+            title="Summary",
             subtitle=f"[dim]{result.output_tokens:,} tokens | {result.compression_ratio:.1%} of original | {elapsed:.2f}s[/dim]",
         )
     else:
@@ -139,7 +139,6 @@ def _display_full_result(
 
     console.print()
     console.print("[bold cyan]Summarization Result[/bold cyan]")
-    console.print(f"  Level: [bold]{result.level.name}[/bold]")
     console.print(f"  Input tokens: [bold]{result.input_tokens:,}[/bold]")
     console.print(f"  Output tokens: [bold]{result.output_tokens:,}[/bold]")
     console.print(f"  Compression: [bold]{result.compression_ratio:.1%}[/bold]")
@@ -151,7 +150,7 @@ def _display_full_result(
     if result.summary:
         print_output_panel(
             result.summary,
-            title=f"Summary ({result.level.name})",
+            title="Summary",
         )
 
 
diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py
index e50e2ac45..933d8bf58 100644
--- a/agent_cli/memory/_ingest.py
+++ b/agent_cli/memory/_ingest.py
@@ -432,10 +432,10 @@ async def extract_and_store_facts_and_summaries(
             model=model,
         )
         LOGGER.info(
-            "Summary update completed in %.1f ms (conversation=%s, level=%s)",
+            "Summary update completed in %.1f ms (conversation=%s, compression=%.1f%%)",
             _elapsed_ms(summary_start),
             conversation_id,
-            summary_result.level.name,
+            summary_result.compression_ratio * 100,
         )
         if summary_result.summary:
             await store_adaptive_summary(
diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py
index a7e3871e2..46ac03631 100644
--- a/agent_cli/memory/_persistence.py
+++ b/agent_cli/memory/_persistence.py
@@ -191,10 +191,8 @@ def persist_summary(
         List of IDs that were stored.
 
     """
-    from agent_cli.summarizer import SummaryLevel  # noqa: PLC0415
-
-    # Skip if no summary needed
-    if summary_result.level == SummaryLevel.NONE:
+    # Skip if no summary was generated
+    if not summary_result.summary:
         return []
 
     # Delete existing summary files
diff --git a/agent_cli/memory/models.py b/agent_cli/memory/models.py
index 5b8df3855..d52d952ce 100644
--- a/agent_cli/memory/models.py
+++ b/agent_cli/memory/models.py
@@ -65,7 +65,7 @@ class MemoryMetadata(BaseModel):
     compression_ratio: float | None = None
     """Ratio of output to input tokens."""
     summary_level: str | None = None
-    """Name of the SummaryLevel enum used (NONE, BRIEF, or MAP_REDUCE)."""
+    """Deprecated: previously stored SummaryLevel enum name."""
     collapse_depth: int | None = None
     """Number of collapse iterations in map-reduce (0 = no collapse needed)."""
 
diff --git a/agent_cli/summarizer/__init__.py b/agent_cli/summarizer/__init__.py
index daf0e2bc6..7c7603b98 100644
--- a/agent_cli/summarizer/__init__.py
+++ b/agent_cli/summarizer/__init__.py
@@ -1,13 +1,13 @@
 """Adaptive summarization module for variable-length content.
 
 This module provides map-reduce summarization inspired by LangChain's approach:
-1. Split content into chunks and summarize each in parallel (map phase)
-2. Recursively collapse summaries until they fit token_max (reduce phase)
+1. If content fits target, return as-is (no LLM call)
+2. Otherwise, split into chunks and summarize each in parallel (map phase)
+3. Recursively collapse summaries until they fit target (reduce phase)
 
 Research foundations:
 - LangChain ReduceDocumentsChain: token_max=3000, recursive collapse
 - BOOOOKSCORE (arXiv:2310.00785): chunk_size=2048 optimal
-- Two-phase architecture concept from Mem0 (arXiv:2504.19413)
 
 Example:
     from agent_cli.summarizer import summarize, SummarizerConfig
@@ -16,8 +16,14 @@
         openai_base_url="http://localhost:8000/v1",
         model="gpt-4",
     )
-    result = await summarize(long_document, config)
-    print(f"Level: {result.level.name}, Compression: {result.compression_ratio:.1%}")
+
+    # Compress to fit 4000 tokens
+    result = await summarize(long_document, config, target_tokens=4000)
+
+    # Compress to 20% of original size
+    result = await summarize(long_document, config, target_ratio=0.2)
+
+    print(f"Compression: {result.compression_ratio:.1%}")
 
 """
 
@@ -25,14 +31,12 @@
 from agent_cli.summarizer.models import (
     SummarizationError,
     SummarizerConfig,
-    SummaryLevel,
     SummaryResult,
 )
 
 __all__ = [
     "SummarizationError",
     "SummarizerConfig",
-    "SummaryLevel",
     "SummaryResult",
     "summarize",
 ]
diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py
index 23c8dd195..64c72b8ff 100644
--- a/agent_cli/summarizer/_utils.py
+++ b/agent_cli/summarizer/_utils.py
@@ -8,7 +8,7 @@
 
 from pydantic import BaseModel
 
-from agent_cli.summarizer.models import SummarizationError, SummarizerConfig, SummaryLevel
+from agent_cli.summarizer.models import SummarizationError, SummarizerConfig
 
 if TYPE_CHECKING:
     import tiktoken
@@ -230,13 +230,11 @@ def _get_overlap_text(chunks: list[str], target_tokens: int, model: str) -> str:
     return " ".join(overlap_parts)
 
 
-def estimate_summary_tokens(input_tokens: int, level: int) -> int:
-    """Estimate target summary tokens based on input size and level."""
-    if level == SummaryLevel.NONE:
-        return 0
-    if level == SummaryLevel.BRIEF:
-        return min(50, max(20, input_tokens // 5))
-    # MAP_REDUCE: ~10% compression with floor/ceiling
+def estimate_summary_tokens(input_tokens: int) -> int:
+    """Estimate target summary tokens based on input size.
+
+    Uses ~10% compression ratio with floor/ceiling bounds.
+    """
     return min(500, max(50, input_tokens // 10))
 
 
diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py
index f242b662f..2a772062a 100644
--- a/agent_cli/summarizer/adaptive.py
+++ b/agent_cli/summarizer/adaptive.py
@@ -1,17 +1,13 @@
 """Adaptive summarization using map-reduce with dynamic collapse.
 
 Implements a simple algorithm inspired by LangChain's map-reduce chains:
-1. If content is short enough, summarize directly
+1. If content fits target, return as-is (no LLM call)
 2. Otherwise, split into chunks and summarize each (map phase)
-3. Recursively collapse summaries until they fit token_max (reduce phase)
+3. Recursively collapse summaries until they fit target (reduce phase)
 
 Research foundations:
 - LangChain ReduceDocumentsChain: token_max=3000, recursive collapse
 - BOOOOKSCORE (arXiv:2310.00785): chunk_size=2048 optimal
-- Two-phase architecture concept from Mem0 (arXiv:2504.19413)
-
-Key insight: No need for predetermined L1/L2/L3 levels.
-Dynamic collapse depth based on actual content length.
 
 See docs/architecture/summarizer.md for detailed design rationale.
 """
@@ -21,76 +17,68 @@
 import logging
 
 from agent_cli.summarizer._prompts import (
-    BRIEF_SUMMARY_PROMPT,
     format_prior_context,
     get_prompt_for_content_type,
 )
 from agent_cli.summarizer._utils import (
     count_tokens,
-    estimate_summary_tokens,
     generate_summary,
     tokens_to_words,
 )
 from agent_cli.summarizer.map_reduce import map_reduce_summarize
 from agent_cli.summarizer.models import (
-    SummarizationError,
     SummarizerConfig,
-    SummaryLevel,
     SummaryResult,
 )
 
 logger = logging.getLogger(__name__)
 
-# Thresholds for summary levels (in tokens)
-THRESHOLD_NONE = 100  # Below this, no summary needed
-THRESHOLD_BRIEF = 500  # Below this, just a single sentence
-
-# Re-export for backwards compatibility
 __all__ = [
-    "THRESHOLD_BRIEF",
-    "THRESHOLD_NONE",
-    "SummarizationError",
     "SummarizerConfig",
-    "determine_level",
     "summarize",
 ]
 
 
-def determine_level(token_count: int) -> SummaryLevel:
-    """Map token count to appropriate SummaryLevel."""
-    if token_count < THRESHOLD_NONE:
-        return SummaryLevel.NONE
-    if token_count < THRESHOLD_BRIEF:
-        return SummaryLevel.BRIEF
-    return SummaryLevel.MAP_REDUCE
-
-
 async def summarize(
     content: str,
     config: SummarizerConfig,
+    *,
+    target_tokens: int | None = None,
+    target_ratio: float | None = None,
     prior_summary: str | None = None,
     content_type: str = "general",
 ) -> SummaryResult:
-    """Summarize content with adaptive strategy based on length.
+    """Summarize content to fit within a target token limit.
 
-    Uses a simple algorithm:
-    - Very short content (<100 tokens): No summary
-    - Short content (<500 tokens): Single sentence brief summary
-    - Everything else: Map-reduce with dynamic collapse
+    Simple algorithm:
+    - If content already fits target, return as-is (no LLM call)
+    - Otherwise, use map-reduce to compress until it fits
 
     Args:
         content: The content to summarize.
         config: Summarizer configuration.
+        target_tokens: Absolute token limit (e.g., 4000). Defaults to config.token_max.
+        target_ratio: Relative compression ratio (e.g., 0.2 = compress to 20% of input).
+            Takes precedence over target_tokens if both provided.
         prior_summary: Optional prior summary for context continuity.
         content_type: Type of content ("general", "conversation", "journal", "document").
 
     Returns:
-        SummaryResult with summary and metadata.
+        SummaryResult with summary and compression metrics.
+
+    Examples:
+        # Compress to fit 4000 tokens
+        result = await summarize(huge_doc, config, target_tokens=4000)
+
+        # Compress to 20% of original size
+        result = await summarize(huge_doc, config, target_ratio=0.2)
+
+        # Use default (config.token_max = 3000)
+        result = await summarize(huge_doc, config)
 
     """
     if not content or not content.strip():
         return SummaryResult(
-            level=SummaryLevel.NONE,
             summary=None,
             input_tokens=0,
             output_tokens=0,
@@ -98,65 +86,43 @@ async def summarize(
         )
 
     input_tokens = count_tokens(content, config.model)
-    level = determine_level(input_tokens)
+
+    # Determine target
+    if target_ratio is not None:
+        target = max(1, int(input_tokens * target_ratio))
+    elif target_tokens is not None:
+        target = target_tokens
+    else:
+        target = config.token_max
 
     logger.info(
-        "Summarizing %d tokens at level %s (type=%s)",
+        "Summarizing %d tokens to target %d (type=%s)",
         input_tokens,
-        level.name,
+        target,
         content_type,
     )
 
-    if level == SummaryLevel.NONE:
+    # Already fits? Return content as-is (no LLM call)
+    if input_tokens <= target:
         return SummaryResult(
-            level=level,
-            summary=None,
+            summary=content,
             input_tokens=input_tokens,
-            output_tokens=0,
-            compression_ratio=0.0,
+            output_tokens=input_tokens,
+            compression_ratio=1.0,
+            collapse_depth=0,
         )
 
-    if level == SummaryLevel.BRIEF:
-        summary = await _brief_summary(content, config)
-        output_tokens = count_tokens(summary, config.model)
-        return SummaryResult(
-            level=level,
-            summary=summary,
-            input_tokens=input_tokens,
-            output_tokens=output_tokens,
-            compression_ratio=output_tokens / input_tokens,
+    # Content fits in single chunk but exceeds target - use content-aware summary
+    if input_tokens <= config.chunk_size:
+        summary = await _content_aware_summary(
+            content,
+            config,
+            target,
+            prior_summary,
+            content_type,
         )
-
-    # MAP_REDUCE level
-    return await _map_reduce_summary(
-        content,
-        input_tokens,
-        config,
-        prior_summary,
-        content_type,
-    )
-
-
-async def _brief_summary(content: str, config: SummarizerConfig) -> str:
-    """Generate a single-sentence summary for brief content."""
-    prompt = BRIEF_SUMMARY_PROMPT.format(content=content)
-    return await generate_summary(prompt, config, max_tokens=50)
-
-
-async def _map_reduce_summary(
-    content: str,
-    input_tokens: int,
-    config: SummarizerConfig,
-    prior_summary: str | None,
-    content_type: str,
-) -> SummaryResult:
-    """Use map-reduce with dynamic collapse for longer content."""
-    # For content that fits in a single chunk, use content-type aware summary
-    if input_tokens <= config.token_max:
-        summary = await _content_aware_summary(content, config, prior_summary, content_type)
         output_tokens = count_tokens(summary, config.model)
         return SummaryResult(
-            level=SummaryLevel.MAP_REDUCE,
             summary=summary,
             input_tokens=input_tokens,
             output_tokens=output_tokens,
@@ -164,11 +130,10 @@ async def _map_reduce_summary(
             collapse_depth=0,
         )
 
-    # Use map-reduce for multi-chunk content
-    result = await map_reduce_summarize(content, config)
+    # Large content - use map-reduce with dynamic collapse
+    result = await map_reduce_summarize(content, config, target)
 
     return SummaryResult(
-        level=SummaryLevel.MAP_REDUCE,
         summary=result.summary,
         input_tokens=result.input_tokens,
         output_tokens=result.output_tokens,
@@ -180,14 +145,11 @@ async def _map_reduce_summary(
 async def _content_aware_summary(
     content: str,
     config: SummarizerConfig,
+    target_tokens: int,
     prior_summary: str | None,
     content_type: str,
 ) -> str:
     """Generate a content-type aware summary for single-chunk content."""
-    target_tokens = estimate_summary_tokens(
-        count_tokens(content, config.model),
-        SummaryLevel.MAP_REDUCE,
-    )
     max_words = tokens_to_words(target_tokens)
 
     prompt_template = get_prompt_for_content_type(content_type)
diff --git a/agent_cli/summarizer/map_reduce.py b/agent_cli/summarizer/map_reduce.py
index 3dd81aa43..86e8b796a 100644
--- a/agent_cli/summarizer/map_reduce.py
+++ b/agent_cli/summarizer/map_reduce.py
@@ -2,7 +2,7 @@
 
 Simple algorithm:
 1. Map: Split content into chunks, summarize each in parallel
-2. Reduce: If combined summaries exceed token_max, recursively collapse
+2. Reduce: If combined summaries exceed target, recursively collapse
 
 Key insight from LangChain: No need for predetermined levels (L1/L2/L3).
 Just keep collapsing until content fits. Dynamic depth based on actual content.
@@ -18,6 +18,7 @@
 import asyncio
 import logging
 from dataclasses import dataclass
+from typing import TYPE_CHECKING
 
 from agent_cli.summarizer._prompts import (
     CHUNK_SUMMARY_PROMPT,
@@ -31,7 +32,9 @@
     generate_summary,
     tokens_to_words,
 )
-from agent_cli.summarizer.models import SummarizerConfig, SummaryLevel
+
+if TYPE_CHECKING:
+    from agent_cli.summarizer.models import SummarizerConfig
 
 logger = logging.getLogger(__name__)
 
@@ -61,28 +64,29 @@ class MapReduceResult:
 async def map_reduce_summarize(
     content: str,
     config: SummarizerConfig,
+    target: int | None = None,
     max_collapse_depth: int = 10,
 ) -> MapReduceResult:
     """Summarize content using map-reduce with dynamic collapse.
 
     Algorithm:
     1. Split into chunks and summarize each (map phase)
-    2. If combined summaries exceed token_max, recursively collapse (reduce phase)
-    3. Continue until everything fits in token_max
-
-    Note: This function is designed for content that exceeds token_max. For shorter
-    content, use the main `summarize()` function in adaptive.py which selects the
-    appropriate strategy (NONE, BRIEF, or MAP_REDUCE with content-aware prompts).
+    2. If combined summaries exceed target, recursively collapse (reduce phase)
+    3. Continue until everything fits in target
 
     Args:
         content: The content to summarize.
         config: Summarizer configuration.
+        target: Target token count. Defaults to config.token_max.
         max_collapse_depth: Safety limit on recursive collapse depth.
 
     Returns:
         MapReduceResult with summary and metadata.
 
     """
+    if target is None:
+        target = config.token_max
+
     input_tokens = count_tokens(content, config.model)
 
     # Map phase: Split and summarize chunks in parallel
@@ -97,9 +101,9 @@ async def map_reduce_summarize(
     summaries = await _map_summarize(chunks, config)
     intermediate_summaries = [summaries.copy()]
 
-    # Reduce phase: Recursively collapse until fits token_max
+    # Reduce phase: Recursively collapse until fits target
     depth = 0
-    while _total_tokens(summaries, config.model) > config.token_max:
+    while _total_tokens(summaries, config.model) > target:
         depth += 1
         if depth > max_collapse_depth:
             logger.warning(
@@ -109,17 +113,18 @@ async def map_reduce_summarize(
             break
 
         logger.info(
-            "Reduce phase (depth %d): collapsing %d summaries (%d tokens)",
+            "Reduce phase (depth %d): collapsing %d summaries (%d tokens) to target %d",
             depth,
             len(summaries),
             _total_tokens(summaries, config.model),
+            target,
         )
-        summaries = await _collapse_summaries(summaries, config)
+        summaries = await _collapse_summaries(summaries, config, target)
         intermediate_summaries.append(summaries.copy())
 
     # Final synthesis if we have multiple summaries left
     if len(summaries) > 1:
-        final_summary = await _synthesize(summaries, config)
+        final_summary = await _synthesize(summaries, config, target)
     else:
         final_summary = summaries[0]
 
@@ -161,7 +166,7 @@ async def _summarize_chunk(
 ) -> str:
     """Summarize a single chunk."""
     source_tokens = count_tokens(chunk, config.model)
-    target_tokens = estimate_summary_tokens(source_tokens, SummaryLevel.MAP_REDUCE)
+    target_tokens = estimate_summary_tokens(source_tokens)
     max_words = tokens_to_words(target_tokens)
 
     prompt = CHUNK_SUMMARY_PROMPT.format(
@@ -177,16 +182,17 @@ async def _summarize_chunk(
 async def _collapse_summaries(
     summaries: list[str],
     config: SummarizerConfig,
+    target: int,
 ) -> list[str]:
     """Collapse summaries by grouping and re-summarizing (reduce phase).
 
-    Groups summaries that together fit within token_max, then summarizes each group.
+    Groups summaries that together fit within target, then summarizes each group.
     This is similar to LangChain's split_list_of_docs approach.
     """
     if len(summaries) <= 1:
         return summaries
 
-    # Group summaries that together fit within token_max
+    # Group summaries that together fit within target
     groups: list[list[str]] = []
     current_group: list[str] = []
     current_tokens = 0
@@ -194,8 +200,8 @@ async def _collapse_summaries(
     for summary in summaries:
         summary_tokens = count_tokens(summary, config.model)
 
-        # If adding this summary would exceed token_max, start new group
-        if current_tokens + summary_tokens > config.token_max and current_group:
+        # If adding this summary would exceed target, start new group
+        if current_tokens + summary_tokens > target and current_group:
             groups.append(current_group)
             current_group = [summary]
             current_tokens = summary_tokens
@@ -211,16 +217,21 @@ async def _collapse_summaries(
 
     async def summarize_group(group: list[str]) -> str:
         async with semaphore:
-            return await _synthesize(group, config)
+            return await _synthesize(group, config, target)
 
     tasks = [summarize_group(g) for g in groups]
     return list(await asyncio.gather(*tasks))
 
 
-async def _synthesize(summaries: list[str], config: SummarizerConfig) -> str:
+async def _synthesize(
+    summaries: list[str],
+    config: SummarizerConfig,
+    target: int,
+) -> str:
     """Synthesize multiple summaries into one."""
     combined_tokens = sum(count_tokens(s, config.model) for s in summaries)
-    target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.MAP_REDUCE)
+    # Aim for target tokens but use estimate if combined is smaller
+    target_tokens = min(target, estimate_summary_tokens(combined_tokens))
     max_words = tokens_to_words(target_tokens)
 
     prompt = META_SUMMARY_PROMPT.format(
diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py
index 14be0c864..65eb42ed5 100644
--- a/agent_cli/summarizer/models.py
+++ b/agent_cli/summarizer/models.py
@@ -4,7 +4,6 @@
 
 from dataclasses import dataclass
 from datetime import UTC, datetime
-from enum import IntEnum
 from typing import Any
 
 from pydantic import BaseModel, Field
@@ -24,7 +23,6 @@ class SummarizerConfig:
             model="llama3.1:8b",
         )
         result = await summarize(long_document, config)
-        print(f"Level: {result.level.name}")
         print(f"Compression: {result.compression_ratio:.1%}")
 
     """
@@ -33,7 +31,7 @@ class SummarizerConfig:
     model: str
     api_key: str | None = None
     chunk_size: int = 2048  # BOOOOKSCORE's tested default
-    token_max: int = 3000  # LangChain's default - when to collapse
+    token_max: int = 3000  # LangChain's default - target size after compression
     chunk_overlap: int = 200
     max_concurrent_chunks: int = 5
     timeout: float = 60.0
@@ -45,32 +43,18 @@ def __post_init__(self) -> None:
             self.api_key = "not-needed"
 
 
-class SummaryLevel(IntEnum):
-    """Summary strategy based on input length."""
-
-    NONE = 0
-    """< 100 tokens: No summary needed."""
-
-    BRIEF = 1
-    """100-500 tokens: Single-sentence summary."""
-
-    MAP_REDUCE = 2
-    """> 500 tokens: Map-reduce with dynamic collapse."""
-
-
 class SummaryResult(BaseModel):
     """Result of summarization.
 
     Contains the summary and metadata about the compression achieved.
     """
 
-    level: SummaryLevel = Field(..., description="The summarization strategy used")
     summary: str | None = Field(
         default=None,
-        description="The final summary text (None for NONE level)",
+        description="The summary text (None if content already fit target)",
     )
     input_tokens: int = Field(..., ge=0, description="Token count of the input content")
-    output_tokens: int = Field(..., ge=0, description="Token count of the summary")
+    output_tokens: int = Field(..., ge=0, description="Token count of the output")
     compression_ratio: float = Field(
         ...,
         ge=0.0,
@@ -91,8 +75,9 @@ def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]:
         """Convert to metadata entry for ChromaDB storage.
 
         Returns a list with a single metadata dict for the summary.
+        Returns empty list if no summary was generated.
         """
-        if self.level == SummaryLevel.NONE or not self.summary:
+        if not self.summary:
             return []
 
         timestamp = self.created_at.isoformat()
@@ -105,7 +90,6 @@ def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]:
                     "conversation_id": conversation_id,
                     "role": "summary",
                     "is_final": True,
-                    "summary_level": self.level.name,
                     "input_tokens": self.input_tokens,
                     "output_tokens": self.output_tokens,
                     "compression_ratio": self.compression_ratio,
diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md
index 43caf336d..c7476142e 100644
--- a/docs/architecture/summarizer.md
+++ b/docs/architecture/summarizer.md
@@ -4,22 +4,23 @@ This document describes the architectural decisions, design rationale, and techn
 
 ## 1. System Overview
 
-The adaptive summarizer provides **content-aware compression** using a map-reduce approach inspired by LangChain's chains. Rather than applying fixed summarization levels, it dynamically collapses content until it fits within a token budget.
+The adaptive summarizer provides **content-aware compression** using a map-reduce approach inspired by LangChain's chains. It compresses content to fit within a specified token budget using a simple algorithm:
 
 ```
-Input Content ──▶ Token Count ──▶ Strategy Selection
+Input Content ──▶ Token Count ──▶ Compare to Target
                                         │
-        ┌───────────────────────────────┼─────────────────────┐
-        │                               │                     │
-   < 100 tokens                  100-500 tokens         > 500 tokens
-        │                               │                     │
-   No summary                    Brief summary           Map-Reduce
-                                (single sentence)     (dynamic collapse)
+                ┌───────────────────────┴───────────────────────┐
+                │                                               │
+          Fits target                                    Exceeds target
+                │                                               │
+          Return as-is                                   Map-Reduce
+          (no LLM call)                               (dynamic collapse)
 ```
 
 **Design Goals:**
 
-- **Simple algorithm:** Map-reduce with dynamic collapse depth based on actual content.
+- **Maximum simplicity:** Single entry point with straightforward logic.
+- **Flexible targeting:** Specify absolute token count or relative compression ratio.
 - **Research-grounded defaults:** chunk_size=2048 (BOOOOKSCORE), token_max=3000 (LangChain).
 - **Content-type awareness:** Domain-specific prompts for conversations, journals, documents.
 
@@ -47,59 +48,81 @@ BOOOOKSCORE's research on book-length summarization found optimal chunk sizes. T
 - Chunk size: **2048 tokens** (we use this)
 - Max summary length: **900 tokens**
 
-### 2.3 Borrowed: Two-Phase Architecture (Mem0)
-
-**Reference:** arXiv:2504.19413
-
-Mem0's memory layer research informed our storage architecture with a **two-phase approach**: separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to storage.
-
-### 2.4 Original Design (Not Research-Backed)
+### 2.3 Original Design (Not Research-Backed)
 
 The following aspects are **original design choices without direct research justification**:
 
-- **Token thresholds (100/500):** The boundaries between NONE/BRIEF/MAP_REDUCE were chosen heuristically.
 - **Content-type prompts:** Domain-specific prompts are original design.
+- **Target ratio parameter:** The option to specify compression as a percentage is a convenience feature.
 
 ---
 
 ## 3. Architectural Decisions
 
-### 3.1 Map-Reduce with Dynamic Collapse
+### 3.1 Simple Target-Based Logic
+
+**Decision:** Use a simple "fits? return : compress" algorithm.
+
+**Rationale:**
+
+- **Minimal complexity:** No level selection logic, threshold management, or multiple code paths.
+- **Clear semantics:** If content fits the target, return it unchanged. Otherwise, compress.
+- **Flexible targeting:** Users can specify exact token counts or relative ratios.
+
+**Algorithm:**
+
+```python
+async def summarize(
+    content: str,
+    config: SummarizerConfig,
+    *,
+    target_tokens: int | None = None,   # Absolute limit
+    target_ratio: float | None = None,  # e.g., 0.2 = compress to 20%
+) -> SummaryResult:
+    input_tokens = count_tokens(content)
+
+    # Determine target
+    if target_ratio is not None:
+        target = max(1, int(input_tokens * target_ratio))
+    elif target_tokens is not None:
+        target = target_tokens
+    else:
+        target = config.token_max  # Default: 3000
+
+    # Already fits? Return as-is (no LLM call)
+    if input_tokens <= target:
+        return SummaryResult(summary=content, ...)
+
+    # Compress using map-reduce
+    return await map_reduce_summarize(content, config, target)
+```
 
-**Decision:** Use LangChain-style map-reduce instead of fixed hierarchy.
+### 3.2 Map-Reduce with Dynamic Collapse
+
+**Decision:** Use LangChain-style map-reduce for all compression.
 
 **Rationale:**
 
-- **Simpler algorithm:** Single code path handles all content sizes.
+- **Single algorithm:** One code path handles all content sizes.
 - **Dynamic depth:** Collapse depth adapts to actual content length.
 - **Research-backed:** LangChain's approach is battle-tested.
 
 **Algorithm:**
 
 ```python
-async def map_reduce_summarize(content, config):
+async def map_reduce_summarize(content, config, target):
     # Map: Split and summarize chunks in parallel
     chunks = chunk_text(content, chunk_size=2048)
     summaries = await parallel_summarize(chunks)
 
-    # Reduce: Recursively collapse until fits token_max
-    while total_tokens(summaries) > config.token_max:
-        groups = group_by_token_limit(summaries, config.token_max)
+    # Reduce: Recursively collapse until fits target
+    while total_tokens(summaries) > target:
+        groups = group_by_token_limit(summaries, target)
         summaries = await parallel_synthesize(groups)
 
     return final_synthesis(summaries)
 ```
 
-### 3.2 Three-Level Strategy
-
-**Decision:** Use three levels based on token count.
-
-| Level | Token Range | Strategy |
-| :--- | :--- | :--- |
-| NONE | < 100 | No summarization needed |
-| BRIEF | 100-500 | Single sentence |
-| MAP_REDUCE | >= 500 | Dynamic collapse until fits token_max |
-
 ### 3.3 Research-Backed Defaults
 
 **Decision:** Use values from published research.
@@ -158,29 +181,29 @@ Every `SummaryResult` includes `input_tokens`, `output_tokens`, `compression_rat
 
 ## 4. Processing Pipeline
 
-### 4.1 Level Selection
+### 4.1 Entry Point
 
-The entry point (`summarize()`) counts tokens and selects strategy:
+The entry point (`summarize()`) implements simple logic:
 
 1. **Token counting:** Uses tiktoken with model-appropriate encoding. Falls back to character-based estimation (~4 chars/token) if tiktoken unavailable.
-2. **Threshold comparison:** Determines NONE, BRIEF, or MAP_REDUCE.
-3. **Strategy dispatch:** Calls appropriate handler.
+2. **Target calculation:** Determines target from `target_tokens`, `target_ratio`, or default `token_max`.
+3. **Fit check:** If content fits target, return as-is.
+4. **Compression:** Call map-reduce if content exceeds target.
 
-### 4.2 Brief Level
+### 4.2 Single-Chunk Content
 
-For short content (100-500 tokens):
+For content that fits within `chunk_size` but exceeds target:
 
-- Single LLM call with brief prompt
-- Returns `SummaryResult` with single-sentence summary
+- Single LLM call with content-type aware prompt
+- Returns `SummaryResult` with compressed summary
 
-### 4.3 Map-Reduce Level
+### 4.3 Multi-Chunk Content
 
-For longer content (>= 500 tokens):
+For larger content (> chunk_size tokens):
 
-1. **Check single-chunk:** If content fits in token_max, use content-type aware summary directly.
-2. **Map phase:** Split content into overlapping chunks, summarize each in parallel.
-3. **Reduce phase:** If combined summaries exceed token_max, group and re-summarize recursively.
-4. **Final synthesis:** Combine remaining summaries into final output.
+1. **Map phase:** Split content into overlapping chunks, summarize each in parallel.
+2. **Reduce phase:** If combined summaries exceed target, group and re-summarize recursively.
+3. **Final synthesis:** Combine remaining summaries into final output.
 
 The `collapse_depth` field in the result indicates how many reduce iterations were needed.
 
@@ -188,29 +211,19 @@ The `collapse_depth` field in the result indicates how many reduce iterations we
 
 ## 5. Data Models
 
-### 5.1 SummaryLevel
-
-```python
-class SummaryLevel(IntEnum):
-    NONE = 0       # < 100 tokens
-    BRIEF = 1      # 100-500 tokens
-    MAP_REDUCE = 2 # >= 500 tokens
-```
-
-### 5.2 SummaryResult
+### 5.1 SummaryResult
 
 ```python
 class SummaryResult(BaseModel):
-    level: SummaryLevel
-    summary: str | None
+    summary: str | None      # None if content was empty
     input_tokens: int
     output_tokens: int
-    compression_ratio: float
-    collapse_depth: int  # 0 = no collapse needed
+    compression_ratio: float  # 0.0-1.0
+    collapse_depth: int       # 0 = no collapse needed
     created_at: datetime
 ```
 
-### 5.3 SummarizerConfig
+### 5.2 SummarizerConfig
 
 ```python
 @dataclass
@@ -219,7 +232,7 @@ class SummarizerConfig:
     model: str
     api_key: str | None = None
     chunk_size: int = 2048      # BOOOOKSCORE
-    token_max: int = 3000       # LangChain
+    token_max: int = 3000       # LangChain (default target)
     chunk_overlap: int = 200
     max_concurrent_chunks: int = 5
     timeout: float = 60.0
@@ -257,12 +270,12 @@ Summaries are stored with metadata:
     "metadata": {
         "conversation_id": conversation_id,
         "role": "summary",
-        "summary_level": "MAP_REDUCE",
         "input_tokens": 1500,
         "output_tokens": 150,
         "compression_ratio": 0.1,
         "collapse_depth": 1,
         "created_at": "2024-01-15T10:30:00Z",
+        "is_final": True,
     },
 }
 ```
@@ -274,9 +287,9 @@ Summaries are stored with metadata:
 Summarization follows a fail-fast philosophy:
 
 - **LLM errors:** Propagated as `SummarizationError` (base class for all summarization errors).
-- **Empty input:** Returns NONE level immediately (not an error).
+- **Empty input:** Returns result with `summary=None` immediately (not an error).
 - **Encoding errors:** Falls back to character-based token estimation.
-- **Max depth exceeded:** Warning logged, forces final synthesis even if over token_max.
+- **Max depth exceeded:** Warning logged, forces final synthesis even if over target.
 
 The caller decides how to handle failures—typically by proceeding without a summary rather than blocking the entire operation.
 
@@ -294,9 +307,41 @@ The caller decides how to handle failures—typically by proceeding without a su
 
 ---
 
-## 9. Limitations and Trade-offs
+## 9. Usage Examples
+
+### Basic Usage
+
+```python
+from agent_cli.summarizer import SummarizerConfig, summarize
+
+config = SummarizerConfig(
+    openai_base_url="http://localhost:11434/v1",
+    model="llama3.1:8b",
+)
+
+# Default: compress to fit 3000 tokens
+result = await summarize(content, config)
+
+# Compress to specific token count
+result = await summarize(content, config, target_tokens=500)
 
-### 9.1 Fact Preservation
+# Compress to 20% of original size
+result = await summarize(content, config, target_ratio=0.2)
+
+# With content type for better prompts
+result = await summarize(
+    content,
+    config,
+    target_tokens=500,
+    content_type="conversation",
+)
+```
+
+---
+
+## 10. Limitations and Trade-offs
+
+### 10.1 Fact Preservation
 
 Summarization is inherently lossy. Specific facts (dates, numbers, names) are often dropped in favor of thematic content. If your use case requires fact retrieval:
 
@@ -304,18 +349,14 @@ Summarization is inherently lossy. Specific facts (dates, numbers, names) are of
 - Use fact extraction instead of summarization
 - Use RAG to retrieve original chunks
 
-### 9.2 No Intermediate Summaries
+### 10.2 No Intermediate Summaries
 
 Unlike hierarchical approaches, map-reduce only stores the final summary. Intermediate chunk summaries are discarded after synthesis. This simplifies storage but removes granular access.
 
-### 9.3 Fixed Thresholds
-
-The 100/500 token thresholds are heuristic. They may need tuning for specific domains or languages.
-
 ---
 
-## 10. Future Improvements
+## 11. Future Improvements
 
 1. **Benchmark against BOOOOKSCORE metrics** for coherence evaluation
-2. **Tune token thresholds empirically** with real-world content
-3. **Add fact extraction mode** for use cases requiring specific detail preservation
+2. **Add fact extraction mode** for use cases requiring specific detail preservation
+3. **Streaming support** for real-time summarization feedback
diff --git a/examples/summarizer_demo.py b/examples/summarizer_demo.py
index 70d434dda..feebc5f20 100644
--- a/examples/summarizer_demo.py
+++ b/examples/summarizer_demo.py
@@ -1,15 +1,16 @@
-"""Demonstrate the summarizer on texts of varying lengths from the internet.
+"""Demonstrate the simplified summarizer on texts of varying lengths.
 
 This script fetches content of different sizes and shows how the adaptive
-summarizer automatically selects the appropriate strategy (BRIEF or MAP_REDUCE)
-based on content length.
+summarizer compresses content to fit different target token counts or ratios.
 
 Usage:
     python examples/summarizer_demo.py
 
-    # Test specific levels only
-    python examples/summarizer_demo.py --level brief
-    python examples/summarizer_demo.py --level map_reduce
+    # Test with specific target ratio
+    python examples/summarizer_demo.py --target-ratio 0.2
+
+    # Test with specific target token count
+    python examples/summarizer_demo.py --target-tokens 500
 
     # Use a different model
     python examples/summarizer_demo.py --model "gpt-4o-mini"
@@ -30,12 +31,11 @@
 
 from agent_cli.summarizer import (
     SummarizerConfig,
-    SummaryLevel,
     SummaryResult,
     summarize,
 )
 
-# Defaults for local AI setup (same as aijournal_poc.py)
+# Defaults for local AI setup
 DEFAULT_BASE_URL = "http://192.168.1.143:9292/v1"
 DEFAULT_MODEL = "gpt-oss-high:20b"
 
@@ -47,24 +47,17 @@ class TextSample:
     name: str
     description: str
     url: str
-    expected_level: SummaryLevel
     content_type: str = "general"
     # If URL fetch fails, use this fallback
     fallback_content: str | None = None
 
 
-# Thresholds from adaptive.py:
-# NONE: < 100 tokens
-# BRIEF: 100-500 tokens
-# MAP_REDUCE: >= 500 tokens
-
-# Sample texts of varying lengths to demonstrate different summarization levels
+# Sample texts of varying lengths to demonstrate summarization
 SAMPLES: list[TextSample] = [
     TextSample(
-        name="Brief - Short News Article",
-        description="~150-400 tokens - triggers BRIEF level (100-500 token range)",
+        name="Short News Article",
+        description="~150-400 tokens - demonstrates small content handling",
         url="https://httpbin.org/json",  # Returns small JSON we'll convert to text
-        expected_level=SummaryLevel.BRIEF,
         fallback_content="""
         Breaking News: Scientists at the Marine Biology Institute have made a
         groundbreaking discovery in the Mariana Trench. A new species of deep-sea
@@ -94,10 +87,9 @@ class TextSample:
         """,
     ),
     TextSample(
-        name="Map-Reduce - Technology Article",
-        description="~800-2000 tokens - triggers MAP_REDUCE level (>=500 tokens)",
+        name="Technology Article",
+        description="~800-2000 tokens - demonstrates medium content",
         url="https://en.wikipedia.org/api/rest_v1/page/summary/Artificial_intelligence",
-        expected_level=SummaryLevel.MAP_REDUCE,
         content_type="document",
         fallback_content="""
         Artificial intelligence (AI) is the intelligence of machines or software,
@@ -174,21 +166,12 @@ class TextSample:
         """,
     ),
     TextSample(
-        name="Map-Reduce - Full Article",
-        description="~4000-10000 tokens - triggers MAP_REDUCE with chunking",
+        name="Full Article",
+        description="~4000-10000 tokens - demonstrates large content with chunking",
         url="https://en.wikipedia.org/api/rest_v1/page/mobile-html/Machine_learning",
-        expected_level=SummaryLevel.MAP_REDUCE,
         content_type="document",
         fallback_content=None,  # We'll generate synthetic content
     ),
-    TextSample(
-        name="Map-Reduce - Long Document",
-        description="~16000+ tokens - triggers MAP_REDUCE with multiple collapse iterations",
-        url="https://www.gutenberg.org/cache/epub/84/pg84.txt",  # Frankenstein (truncated)
-        expected_level=SummaryLevel.MAP_REDUCE,
-        content_type="document",
-        fallback_content=None,  # We'll generate synthetic content (~16K tokens)
-    ),
 ]
 
 
@@ -262,25 +245,11 @@ async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str:
             content = re.sub(r"<[^>]+>", " ", content)
             content = re.sub(r"\s+", " ", content).strip()
 
-        # Check if content is too short for expected level
-        min_words_for_level = {
-            SummaryLevel.BRIEF: 80,  # Need ~100 tokens
-            SummaryLevel.MAP_REDUCE: 400,  # Need ~500 tokens
-        }
-        min_words = min_words_for_level.get(sample.expected_level, 50)
-
+        # Check if content is too short
+        min_words = 80
         if len(content.split()) < min_words:
             print(f"  📎 Fetched content too short ({len(content.split())} words), using fallback")
-            if sample.fallback_content:
-                content = sample.fallback_content
-            else:
-                target_tokens = {
-                    SummaryLevel.BRIEF: 300,
-                    SummaryLevel.MAP_REDUCE: 1500,
-                }
-                content = generate_synthetic_content(
-                    target_tokens.get(sample.expected_level, 1000),
-                )
+            content = sample.fallback_content or generate_synthetic_content(1500)
 
         # For very long content, truncate to keep demo fast
         words = content.split()
@@ -296,15 +265,17 @@ async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str:
         if sample.fallback_content:
             return sample.fallback_content.strip()
 
-        # Generate synthetic content for the expected level
-        target_tokens = {
-            SummaryLevel.BRIEF: 300,
-            SummaryLevel.MAP_REDUCE: 1500,
-        }
-        return generate_synthetic_content(target_tokens.get(sample.expected_level, 1000))
+        # Generate synthetic content
+        return generate_synthetic_content(1500)
 
 
-def print_result(sample: TextSample, result: SummaryResult, content: str) -> None:
+def print_result(
+    sample: TextSample,
+    result: SummaryResult,
+    content: str,
+    target_tokens: int | None,
+    target_ratio: float | None,
+) -> None:
     """Print a formatted summary result."""
     print("\n" + "=" * 70)
     print(f"📄 {sample.name}")
@@ -318,23 +289,30 @@ def print_result(sample: TextSample, result: SummaryResult, content: str) -> Non
     print(f"   Tokens: {result.input_tokens:,}")
     print(f"   Content type: {sample.content_type}")
 
-    # Summarization result
-    level_emoji = {
-        SummaryLevel.NONE: "⏭️",
-        SummaryLevel.BRIEF: "📝",
-        SummaryLevel.MAP_REDUCE: "🔄",
-    }
-    print("\n🎯 Summarization Result:")
-    print(f"   Level: {level_emoji.get(result.level, '❓')} {result.level.name}")
-    print(f"   Expected: {sample.expected_level.name}")
-    print(f"   Match: {'✅' if result.level == sample.expected_level else '⚠️'}")
+    # Target info
+    print("\n🎯 Target:")
+    if target_ratio is not None:
+        print(f"   Ratio: {target_ratio:.0%} of input")
+        print(f"   Calculated target: ~{int(result.input_tokens * target_ratio):,} tokens")
+    elif target_tokens is not None:
+        print(f"   Tokens: {target_tokens:,}")
+    else:
+        print("   Default: 3000 tokens (LangChain default)")
+
+    # Result info
+    print("\n📝 Result:")
+    if result.summary == content:
+        print("   Status: ⏭️  Content already fits target (returned as-is)")
+    elif result.collapse_depth > 0:
+        print(f"   Status: 🔄 Map-reduce summarization (collapse depth: {result.collapse_depth})")
+    else:
+        print("   Status: 📝 Single-pass summarization")
+
     print(f"   Output tokens: {result.output_tokens:,}")
     print(f"   Compression: {result.compression_ratio:.1%}")
-    if result.collapse_depth > 0:
-        print(f"   Collapse depth: {result.collapse_depth}")
 
     # Summary content
-    if result.summary:
+    if result.summary and result.summary != content:
         print("\n📝 Summary:")
         wrapped = textwrap.fill(
             result.summary,
@@ -342,11 +320,15 @@ def print_result(sample: TextSample, result: SummaryResult, content: str) -> Non
             initial_indent="   ",
             subsequent_indent="   ",
         )
+        # Only show first ~500 chars of summary
+        if len(wrapped) > 600:  # noqa: PLR2004
+            wrapped = wrapped[:600] + "..."
         print(wrapped)
 
 
 async def run_demo(
-    level_filter: str | None = None,
+    target_tokens: int | None = None,
+    target_ratio: float | None = None,
     model: str | None = None,
     base_url: str | None = None,
 ) -> None:
@@ -369,39 +351,28 @@ async def run_demo(
         timeout=120.0,  # Longer timeout for local models
     )
 
-    # Filter samples if requested
-    samples = SAMPLES
-    if level_filter:
-        level_map = {
-            "brief": SummaryLevel.BRIEF,
-            "map_reduce": SummaryLevel.MAP_REDUCE,
-        }
-        target_level = level_map.get(level_filter.lower())
-        if target_level:
-            samples = [s for s in SAMPLES if s.expected_level == target_level]
-            print(f"\n🔍 Filtering to {level_filter.upper()} level only")
-
     async with httpx.AsyncClient() as client:
-        for sample in samples:
+        for sample in SAMPLES:
             print(f"\n⏳ Processing: {sample.name}...")
 
             # Fetch content
             content = await fetch_content(sample, client)
 
             try:
-                # Summarize
+                # Summarize with specified target
                 result = await summarize(
                     content=content,
                     config=config,
+                    target_tokens=target_tokens,
+                    target_ratio=target_ratio,
                     content_type=sample.content_type,
                 )
 
                 # Display results
-                print_result(sample, result, content)
+                print_result(sample, result, content, target_tokens, target_ratio)
 
             except Exception as e:
                 print(f"\n❌ Error summarizing {sample.name}: {e}")
-
                 traceback.print_exc()
 
     print("\n" + "=" * 70)
@@ -417,16 +388,21 @@ def main() -> None:
         epilog=textwrap.dedent("""
         Examples:
           python examples/summarizer_demo.py
-          python examples/summarizer_demo.py --level brief
-          python examples/summarizer_demo.py --level map_reduce
+          python examples/summarizer_demo.py --target-ratio 0.2
+          python examples/summarizer_demo.py --target-tokens 500
           python examples/summarizer_demo.py --model "llama3.1:8b" --base-url "http://localhost:11434/v1"
         """),
     )
 
     parser.add_argument(
-        "--level",
-        choices=["brief", "map_reduce"],
-        help="Only test a specific summarization level",
+        "--target-ratio",
+        type=float,
+        help="Target ratio for compression (e.g., 0.2 = compress to 20%%)",
+    )
+    parser.add_argument(
+        "--target-tokens",
+        type=int,
+        help="Target token count for summary",
     )
     parser.add_argument(
         "--model",
@@ -439,9 +415,13 @@ def main() -> None:
 
     args = parser.parse_args()
 
+    if args.target_ratio is not None and args.target_tokens is not None:
+        parser.error("Cannot specify both --target-ratio and --target-tokens")
+
     asyncio.run(
         run_demo(
-            level_filter=args.level,
+            target_tokens=args.target_tokens,
+            target_ratio=args.target_ratio,
             model=args.model,
             base_url=args.base_url,
         ),
diff --git a/tests/memory/test_engine.py b/tests/memory/test_engine.py
index 44d0a031c..fc341b7df 100644
--- a/tests/memory/test_engine.py
+++ b/tests/memory/test_engine.py
@@ -23,7 +23,7 @@
     Message,
     StoredMemory,
 )
-from agent_cli.summarizer import SummaryLevel, SummaryResult
+from agent_cli.summarizer import SummaryResult
 
 
 class _DummyReranker:
@@ -355,7 +355,6 @@ def __init__(self, output: Any) -> None:
 
     async def fake_summarize_content(**_kwargs: Any) -> SummaryResult:
         return SummaryResult(
-            level=SummaryLevel.MAP_REDUCE,
             summary="summary up to 256",
             input_tokens=100,
             output_tokens=20,
@@ -582,7 +581,6 @@ def __init__(self, output: Any) -> None:
 
     async def fake_summarize_content(**_kwargs: Any) -> SummaryResult:
         return SummaryResult(
-            level=SummaryLevel.MAP_REDUCE,
             summary="summary text",
             input_tokens=100,
             output_tokens=20,
diff --git a/tests/memory/test_git_integration.py b/tests/memory/test_git_integration.py
index 86040d7a1..64130990c 100644
--- a/tests/memory/test_git_integration.py
+++ b/tests/memory/test_git_integration.py
@@ -14,7 +14,7 @@
 from agent_cli.memory import _ingest
 from agent_cli.memory.client import MemoryClient
 from agent_cli.memory.entities import Fact
-from agent_cli.summarizer import SummaryLevel, SummaryResult
+from agent_cli.summarizer import SummaryResult
 
 if TYPE_CHECKING:
     from pathlib import Path
@@ -66,7 +66,6 @@ async def fake_reconcile(
 
     async def fake_summarize_content(**_kwargs: Any) -> SummaryResult:
         return SummaryResult(
-            level=SummaryLevel.MAP_REDUCE,
             summary="User likes testing.",
             input_tokens=100,
             output_tokens=20,
diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py
index 202a55921..b7ce45e82 100644
--- a/tests/summarizer/test_adaptive.py
+++ b/tests/summarizer/test_adaptive.py
@@ -12,13 +12,8 @@
     SummaryOutput,
     generate_summary,
 )
-from agent_cli.summarizer.adaptive import (
-    THRESHOLD_BRIEF,
-    THRESHOLD_NONE,
-    determine_level,
-    summarize,
-)
-from agent_cli.summarizer.models import SummaryLevel, SummaryResult
+from agent_cli.summarizer.adaptive import summarize
+from agent_cli.summarizer.map_reduce import MapReduceResult
 
 
 class TestSummarizerConfig:
@@ -83,39 +78,6 @@ def test_default_token_max_is_langchain(self) -> None:
         assert config.token_max == 3000  # LangChain's default
 
 
-class TestDetermineLevel:
-    """Tests for level determination based on token count.
-
-    The simplified approach has 3 levels:
-    - NONE: Very short content (< 100 tokens)
-    - BRIEF: Short content (100-500 tokens)
-    - MAP_REDUCE: Everything else (uses map-reduce)
-    """
-
-    def test_none_level_threshold(self) -> None:
-        """Test NONE level for very short content."""
-        assert determine_level(50) == SummaryLevel.NONE
-        assert determine_level(99) == SummaryLevel.NONE
-
-    def test_brief_level_threshold(self) -> None:
-        """Test BRIEF level for short content."""
-        assert determine_level(100) == SummaryLevel.BRIEF
-        assert determine_level(300) == SummaryLevel.BRIEF
-        assert determine_level(499) == SummaryLevel.BRIEF
-
-    def test_map_reduce_level_for_longer_content(self) -> None:
-        """Test that content >= 500 tokens uses MAP_REDUCE."""
-        assert determine_level(500) == SummaryLevel.MAP_REDUCE
-        assert determine_level(1500) == SummaryLevel.MAP_REDUCE
-        assert determine_level(5000) == SummaryLevel.MAP_REDUCE
-        assert determine_level(20000) == SummaryLevel.MAP_REDUCE
-
-    def test_thresholds_match_constants(self) -> None:
-        """Verify thresholds match the module constants."""
-        assert THRESHOLD_NONE == 100
-        assert THRESHOLD_BRIEF == 500
-
-
 class TestSummarize:
     """Tests for main summarize function."""
 
@@ -128,133 +90,101 @@ def config(self) -> SummarizerConfig:
         )
 
     @pytest.mark.asyncio
-    async def test_empty_content_returns_none_level(
+    async def test_empty_content_returns_no_summary(
         self,
         config: SummarizerConfig,
     ) -> None:
-        """Test that empty content returns NONE level result."""
+        """Test that empty content returns result with no summary."""
         result = await summarize("", config)
-        assert result.level == SummaryLevel.NONE
         assert result.summary is None
         assert result.input_tokens == 0
         assert result.output_tokens == 0
 
     @pytest.mark.asyncio
-    async def test_whitespace_only_returns_none_level(
+    async def test_whitespace_only_returns_no_summary(
         self,
         config: SummarizerConfig,
     ) -> None:
-        """Test that whitespace-only content returns NONE level result."""
+        """Test that whitespace-only content returns result with no summary."""
         result = await summarize("   \n\n   ", config)
-        assert result.level == SummaryLevel.NONE
         assert result.summary is None
 
     @pytest.mark.asyncio
-    async def test_very_short_content_no_summary(
+    async def test_short_content_returns_as_is(
         self,
         config: SummarizerConfig,
     ) -> None:
-        """Test that very short content gets NONE level (no summary)."""
-        # Less than 100 tokens
+        """Test that short content is returned as-is (no LLM call)."""
+        # Less than default token_max (3000)
         result = await summarize("Hello world", config)
-        assert result.level == SummaryLevel.NONE
-        assert result.summary is None
+        assert result.summary == "Hello world"
+        assert result.compression_ratio == 1.0  # No compression
 
     @pytest.mark.asyncio
-    @patch("agent_cli.summarizer.adaptive._brief_summary")
-    async def test_brief_level_calls_brief_summary(
+    async def test_target_tokens_respected(
         self,
-        mock_brief: AsyncMock,
         config: SummarizerConfig,
     ) -> None:
-        """Test that BRIEF level content calls _brief_summary."""
-        mock_brief.return_value = "Brief summary."
-
-        # Create content that's ~100-500 tokens
-        content = "This is a test sentence. " * 30  # ~150 tokens
-
-        result = await summarize(content, config)
-
-        mock_brief.assert_called_once_with(content, config)
-        assert result.level == SummaryLevel.BRIEF
-        assert result.summary == "Brief summary."
+        """Test that content fitting target_tokens is returned as-is."""
+        content = "Short content"
+        result = await summarize(content, config, target_tokens=1000)
+        assert result.summary == content
+        assert result.compression_ratio == 1.0
 
     @pytest.mark.asyncio
-    @patch("agent_cli.summarizer.adaptive._map_reduce_summary")
-    async def test_longer_content_uses_map_reduce(
+    async def test_target_ratio_calculates_target(
         self,
-        mock_map_reduce: AsyncMock,
         config: SummarizerConfig,
     ) -> None:
-        """Test that content >= 500 tokens uses map-reduce."""
-        mock_result = SummaryResult(
-            level=SummaryLevel.MAP_REDUCE,
-            summary="Map-reduce summary.",
-            input_tokens=800,
-            output_tokens=100,
-            compression_ratio=0.125,
-        )
-        mock_map_reduce.return_value = mock_result
-
-        # Create content that's ~500+ tokens
-        content = "This is a test sentence with more words. " * 100  # ~800 tokens
-
-        result = await summarize(content, config, content_type="general")
-
-        mock_map_reduce.assert_called_once()
-        assert result.summary == "Map-reduce summary."
+        """Test that target_ratio calculates correct target."""
+        # Short content that fits even with 10% target
+        content = "Hello"
+        result = await summarize(content, config, target_ratio=0.1)
+        # Content is so short it fits in 10% target
+        assert result.summary == content
 
     @pytest.mark.asyncio
-    @patch("agent_cli.summarizer.adaptive._map_reduce_summary")
-    async def test_prior_summary_passed_to_map_reduce(
+    @patch("agent_cli.summarizer.adaptive._content_aware_summary")
+    async def test_content_exceeding_target_gets_summarized(
         self,
-        mock_map_reduce: AsyncMock,
+        mock_summary: AsyncMock,
         config: SummarizerConfig,
     ) -> None:
-        """Test that prior_summary is passed to _map_reduce_summary."""
-        mock_result = SummaryResult(
-            level=SummaryLevel.MAP_REDUCE,
-            summary="Updated summary.",
-            input_tokens=800,
-            output_tokens=100,
-            compression_ratio=0.125,
-        )
-        mock_map_reduce.return_value = mock_result
+        """Test that content exceeding target gets summarized."""
+        mock_summary.return_value = "Summarized content."
 
-        content = "This is a test sentence with more words. " * 100
-        prior = "Previous context summary."
+        # Create content that's ~500 tokens (exceeds target of 100)
+        content = "This is a test sentence. " * 100
 
-        await summarize(content, config, prior_summary=prior)
+        result = await summarize(content, config, target_tokens=100)
 
-        # Verify prior_summary was passed
-        call_args = mock_map_reduce.call_args
-        assert call_args[0][3] == prior  # prior_summary is 4th positional arg
+        mock_summary.assert_called_once()
+        assert result.summary == "Summarized content."
 
     @pytest.mark.asyncio
-    @patch("agent_cli.summarizer.adaptive._map_reduce_summary")
-    async def test_very_long_content_uses_map_reduce(
+    @patch("agent_cli.summarizer.adaptive.map_reduce_summarize")
+    async def test_large_content_uses_map_reduce(
         self,
         mock_map_reduce: AsyncMock,
         config: SummarizerConfig,
     ) -> None:
-        """Test that very long content uses map-reduce."""
-        mock_result = SummaryResult(
-            level=SummaryLevel.MAP_REDUCE,
-            summary="Long content summary.",
-            input_tokens=20000,
-            output_tokens=500,
-            compression_ratio=0.025,
-            collapse_depth=2,
+        """Test that content exceeding chunk_size uses map-reduce."""
+        mock_map_reduce.return_value = MapReduceResult(
+            summary="Map-reduce summary.",
+            input_tokens=5000,
+            output_tokens=100,
+            compression_ratio=0.02,
+            collapse_depth=1,
+            intermediate_summaries=[["chunk1", "chunk2"]],
         )
-        mock_map_reduce.return_value = mock_result
 
-        # Create content that's > 15000 tokens
-        content = "Word " * 20000
+        # Create content larger than chunk_size (2048)
+        content = "Word " * 3000  # ~3000 tokens
 
-        result = await summarize(content, config)
+        result = await summarize(content, config, target_tokens=500)
 
-        assert mock_map_reduce.called
-        assert result.level == SummaryLevel.MAP_REDUCE
+        mock_map_reduce.assert_called_once()
+        assert result.summary == "Map-reduce summary."
 
 
 class TestGenerateSummary:
diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py
index f11fcff8b..867815ce9 100644
--- a/tests/summarizer/test_integration.py
+++ b/tests/summarizer/test_integration.py
@@ -2,32 +2,7 @@
 
 from __future__ import annotations
 
-from agent_cli.summarizer.adaptive import determine_level
-from agent_cli.summarizer.models import SummaryLevel, SummaryResult
-
-
-class TestDetermineLevel:
-    """Tests for determine_level function with various content sizes."""
-
-    def test_short_content_is_brief(self) -> None:
-        """Test that 100-500 token content uses BRIEF."""
-        level = determine_level(200)
-        assert level == SummaryLevel.BRIEF
-
-    def test_medium_content_is_map_reduce(self) -> None:
-        """Test that 500+ token content uses MAP_REDUCE."""
-        level = determine_level(1000)
-        assert level == SummaryLevel.MAP_REDUCE
-
-    def test_long_content_is_map_reduce(self) -> None:
-        """Test that 3000+ token content uses MAP_REDUCE."""
-        level = determine_level(5000)
-        assert level == SummaryLevel.MAP_REDUCE
-
-    def test_very_long_content_is_map_reduce(self) -> None:
-        """Test that content over 15000 tokens still uses MAP_REDUCE."""
-        level = determine_level(20000)
-        assert level == SummaryLevel.MAP_REDUCE
+from agent_cli.summarizer.models import SummaryResult
 
 
 class TestSummaryResultStorage:
@@ -36,7 +11,6 @@ class TestSummaryResultStorage:
     def test_to_storage_metadata_creates_entry(self) -> None:
         """Test that to_storage_metadata creates a valid entry."""
         result = SummaryResult(
-            level=SummaryLevel.MAP_REDUCE,
             summary="A comprehensive summary.",
             input_tokens=5000,
             output_tokens=100,
@@ -52,13 +26,11 @@ def test_to_storage_metadata_creates_entry(self) -> None:
         assert entry["metadata"]["conversation_id"] == "test-conversation"
         assert entry["metadata"]["role"] == "summary"
         assert entry["metadata"]["is_final"] is True
-        assert entry["metadata"]["summary_level"] == "MAP_REDUCE"
         assert entry["metadata"]["collapse_depth"] == 1
 
-    def test_none_level_returns_empty(self) -> None:
-        """Test that NONE level produces no storage entries."""
+    def test_no_summary_returns_empty(self) -> None:
+        """Test that no summary produces no storage entries."""
         result = SummaryResult(
-            level=SummaryLevel.NONE,
             summary=None,
             input_tokens=50,
             output_tokens=0,
diff --git a/tests/summarizer/test_models.py b/tests/summarizer/test_models.py
index c5b04f703..05d5625f4 100644
--- a/tests/summarizer/test_models.py
+++ b/tests/summarizer/test_models.py
@@ -7,73 +7,39 @@
 import pytest
 
 from agent_cli.summarizer.models import (
-    SummaryLevel,
     SummaryResult,
 )
 
 
-class TestSummaryLevel:
-    """Tests for SummaryLevel enum."""
-
-    def test_level_values(self) -> None:
-        """Test that levels have correct integer values."""
-        assert SummaryLevel.NONE == 0
-        assert SummaryLevel.BRIEF == 1
-        assert SummaryLevel.MAP_REDUCE == 2
-
-    def test_level_ordering(self) -> None:
-        """Test that levels can be compared."""
-        assert SummaryLevel.NONE < SummaryLevel.BRIEF
-        assert SummaryLevel.BRIEF < SummaryLevel.MAP_REDUCE
-
-
 class TestSummaryResult:
     """Tests for SummaryResult model."""
 
-    def test_none_level_result(self) -> None:
-        """Test result for content that needs no summary."""
+    def test_result_with_no_summary(self) -> None:
+        """Test result when content already fits target."""
         result = SummaryResult(
-            level=SummaryLevel.NONE,
             summary=None,
             input_tokens=50,
             output_tokens=0,
             compression_ratio=0.0,
         )
-        assert result.level == SummaryLevel.NONE
         assert result.summary is None
         assert result.collapse_depth == 0
 
-    def test_brief_level_result(self) -> None:
-        """Test result for brief summary."""
-        result = SummaryResult(
-            level=SummaryLevel.BRIEF,
-            summary="A brief one-sentence summary.",
-            input_tokens=200,
-            output_tokens=10,
-            compression_ratio=0.05,
-        )
-        assert result.level == SummaryLevel.BRIEF
-        assert result.summary == "A brief one-sentence summary."
-        assert result.collapse_depth == 0
-
-    def test_map_reduce_result(self) -> None:
-        """Test result for map-reduce summary."""
+    def test_result_with_summary(self) -> None:
+        """Test result with a generated summary."""
         result = SummaryResult(
-            level=SummaryLevel.MAP_REDUCE,
             summary="A comprehensive summary.",
             input_tokens=5000,
             output_tokens=100,
             compression_ratio=0.02,
             collapse_depth=2,
         )
-        assert result.level == SummaryLevel.MAP_REDUCE
         assert result.summary == "A comprehensive summary."
         assert result.collapse_depth == 2
 
-    def test_to_storage_metadata_none_level(self) -> None:
-        """Test that NONE level produces no storage entries."""
+    def test_to_storage_metadata_no_summary(self) -> None:
+        """Test that no summary produces no storage entries."""
         result = SummaryResult(
-            level=SummaryLevel.NONE,
             summary=None,
             input_tokens=50,
             output_tokens=0,
@@ -82,10 +48,9 @@ def test_to_storage_metadata_none_level(self) -> None:
         entries = result.to_storage_metadata("conv-123")
         assert entries == []
 
-    def test_to_storage_metadata_simple_summary(self) -> None:
+    def test_to_storage_metadata_with_summary(self) -> None:
         """Test storage metadata for a summary."""
         result = SummaryResult(
-            level=SummaryLevel.BRIEF,
             summary="A brief summary.",
             input_tokens=200,
             output_tokens=10,
@@ -99,12 +64,10 @@ def test_to_storage_metadata_simple_summary(self) -> None:
         assert entry["metadata"]["conversation_id"] == "conv-456"
         assert entry["metadata"]["role"] == "summary"
         assert entry["metadata"]["is_final"] is True
-        assert entry["metadata"]["summary_level"] == "BRIEF"
 
-    def test_to_storage_metadata_map_reduce(self) -> None:
-        """Test storage metadata for map-reduce summary."""
+    def test_to_storage_metadata_with_collapse_depth(self) -> None:
+        """Test storage metadata includes collapse depth."""
         result = SummaryResult(
-            level=SummaryLevel.MAP_REDUCE,
             summary="Final synthesis of content.",
             input_tokens=20000,
             output_tokens=200,
@@ -113,12 +76,10 @@ def test_to_storage_metadata_map_reduce(self) -> None:
         )
         entries = result.to_storage_metadata("conv-789")
 
-        # Should have 1 entry (the final summary)
         assert len(entries) == 1
         entry = entries[0]
         assert entry["id"] == "conv-789:summary"
         assert entry["content"] == "Final synthesis of content."
-        assert entry["metadata"]["summary_level"] == "MAP_REDUCE"
         assert entry["metadata"]["collapse_depth"] == 3
         assert entry["metadata"]["is_final"] is True
 
@@ -126,7 +87,6 @@ def test_compression_ratio_bounds(self) -> None:
         """Test compression ratio validation."""
         # Valid ratio
         result = SummaryResult(
-            level=SummaryLevel.BRIEF,
             summary="Test",
             input_tokens=100,
             output_tokens=10,
@@ -137,7 +97,6 @@ def test_compression_ratio_bounds(self) -> None:
         # Ratio must be between 0 and 1
         with pytest.raises(ValueError, match="less than or equal to 1"):
             SummaryResult(
-                level=SummaryLevel.BRIEF,
                 summary="Test",
                 input_tokens=100,
                 output_tokens=10,
@@ -148,7 +107,6 @@ def test_created_at_default(self) -> None:
         """Test that created_at is automatically set."""
         before = datetime.now(UTC)
         result = SummaryResult(
-            level=SummaryLevel.BRIEF,
             summary="Test",
             input_tokens=100,
             output_tokens=10,
diff --git a/tests/summarizer/test_utils.py b/tests/summarizer/test_utils.py
index 188a79172..89a441719 100644
--- a/tests/summarizer/test_utils.py
+++ b/tests/summarizer/test_utils.py
@@ -88,32 +88,32 @@ def test_large_paragraph_sentence_split(self) -> None:
 class TestEstimateSummaryTokens:
     """Tests for estimate_summary_tokens function."""
 
-    def test_none_level(self) -> None:
-        """Test level 0 (NONE) returns 0."""
-        assert estimate_summary_tokens(1000, level=0) == 0
-
-    def test_brief_level(self) -> None:
-        """Test level 1 (BRIEF) compression."""
-        # BRIEF: ~20% compression, capped at 50, minimum 20
-        result = estimate_summary_tokens(100, level=1)
-        assert result >= 20  # minimum of 20
-        assert result <= 50  # capped at 50
-
-    def test_map_reduce_level(self) -> None:
-        """Test level 2 (MAP_REDUCE) compression."""
-        # MAP_REDUCE: ~10% compression, capped at 500, minimum 50
-        result = estimate_summary_tokens(1000, level=2)
-        assert result >= 50  # minimum of 50
-        assert result <= 500  # capped at 500
-
-    def test_map_reduce_large_input(self) -> None:
-        """Test MAP_REDUCE with large input hits cap."""
-        result = estimate_summary_tokens(50000, level=2)
+    def test_typical_input(self) -> None:
+        """Test typical input uses ~10% compression."""
+        # ~10% compression, capped at 500, minimum 50
+        result = estimate_summary_tokens(1000)
+        assert result == 100  # 1000 // 10 = 100
+
+    def test_medium_input(self) -> None:
+        """Test medium input stays within bounds."""
+        result = estimate_summary_tokens(2000)
+        assert result == 200  # 2000 // 10 = 200
+        assert result >= 50  # above floor
+        assert result <= 500  # below ceiling
+
+    def test_large_input_hits_cap(self) -> None:
+        """Test large input hits 500 token cap."""
+        result = estimate_summary_tokens(50000)
         assert result == 500  # capped at 500
 
-    def test_map_reduce_small_input(self) -> None:
-        """Test MAP_REDUCE with small input uses floor."""
-        result = estimate_summary_tokens(100, level=2)
+    def test_small_input_uses_floor(self) -> None:
+        """Test small input uses 50 token floor."""
+        result = estimate_summary_tokens(100)
+        assert result == 50  # floor of 50 (100 // 10 = 10, but min is 50)
+
+    def test_very_small_input(self) -> None:
+        """Test very small input still uses floor."""
+        result = estimate_summary_tokens(10)
         assert result == 50  # floor of 50
 
 

From 1ca62668f0799be7a1fbe227b4eed4e64dd20e09 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 3 Dec 2025 20:16:20 -0800
Subject: [PATCH 36/37] chore(summarizer): remove dead code

- Remove unused BRIEF_SUMMARY_PROMPT (brief level was removed)
- Remove unused timeout field from SummarizerConfig
- Update tests and examples accordingly
---
 agent_cli/summarizer/_prompts.py  | 9 ---------
 agent_cli/summarizer/models.py    | 1 -
 examples/summarizer_demo.py       | 1 -
 tests/summarizer/test_adaptive.py | 2 --
 tests/summarizer/test_prompts.py  | 8 --------
 5 files changed, 21 deletions(-)

diff --git a/agent_cli/summarizer/_prompts.py b/agent_cli/summarizer/_prompts.py
index de59f9404..e49fd417d 100644
--- a/agent_cli/summarizer/_prompts.py
+++ b/agent_cli/summarizer/_prompts.py
@@ -4,15 +4,6 @@
 and are optimized for structured, factual output.
 """
 
-# Single sentence summary for short content (used at BRIEF level, 100-500 tokens)
-BRIEF_SUMMARY_PROMPT = """Summarize the following in ONE sentence (maximum 20 words).
-Focus on the single most important point or takeaway.
-
-Content:
-{content}
-
-One-sentence summary:""".strip()
-
 # Paragraph summary for "general" content type (default when no specific type provided)
 GENERAL_SUMMARY_PROMPT = """Summarize the following content concisely in a short paragraph.
 
diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py
index 65eb42ed5..721201da3 100644
--- a/agent_cli/summarizer/models.py
+++ b/agent_cli/summarizer/models.py
@@ -34,7 +34,6 @@ class SummarizerConfig:
     token_max: int = 3000  # LangChain's default - target size after compression
     chunk_overlap: int = 200
     max_concurrent_chunks: int = 5
-    timeout: float = 60.0
 
     def __post_init__(self) -> None:
         """Normalize the base URL."""
diff --git a/examples/summarizer_demo.py b/examples/summarizer_demo.py
index feebc5f20..f5d593a17 100644
--- a/examples/summarizer_demo.py
+++ b/examples/summarizer_demo.py
@@ -348,7 +348,6 @@ async def run_demo(
         api_key=api_key,
         chunk_size=2048,  # BOOOOKSCORE default
         max_concurrent_chunks=3,
-        timeout=120.0,  # Longer timeout for local models
     )
 
     async with httpx.AsyncClient() as client:
diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py
index b7ce45e82..1fbf3d7ba 100644
--- a/tests/summarizer/test_adaptive.py
+++ b/tests/summarizer/test_adaptive.py
@@ -46,12 +46,10 @@ def test_init_with_custom_settings(self) -> None:
             chunk_size=5000,
             chunk_overlap=300,
             max_concurrent_chunks=10,
-            timeout=120.0,
         )
         assert config.chunk_size == 5000
         assert config.chunk_overlap == 300
         assert config.max_concurrent_chunks == 10
-        assert config.timeout == 120.0
 
     def test_trailing_slash_stripped(self) -> None:
         """Test that trailing slash is stripped from base URL."""
diff --git a/tests/summarizer/test_prompts.py b/tests/summarizer/test_prompts.py
index ef05ebad5..825fe077c 100644
--- a/tests/summarizer/test_prompts.py
+++ b/tests/summarizer/test_prompts.py
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 from agent_cli.summarizer._prompts import (
-    BRIEF_SUMMARY_PROMPT,
     CHUNK_SUMMARY_PROMPT,
     CONVERSATION_SUMMARY_PROMPT,
     DOCUMENT_SUMMARY_PROMPT,
@@ -19,13 +18,6 @@
 class TestPromptTemplates:
     """Tests for prompt template structure."""
 
-    def test_brief_prompt_has_content_placeholder(self) -> None:
-        """Test BRIEF prompt contains content placeholder."""
-        assert "{content}" in BRIEF_SUMMARY_PROMPT
-        # Test it can be formatted
-        result = BRIEF_SUMMARY_PROMPT.format(content="Test content")
-        assert "Test content" in result
-
     def test_general_prompt_has_placeholders(self) -> None:
         """Test GENERAL prompt contains required placeholders."""
         assert "{content}" in GENERAL_SUMMARY_PROMPT

From f02c584b2d981d6455511d1226dbe50e071399b0 Mon Sep 17 00:00:00 2001
From: Bas Nijholt <bas@nijho.lt>
Date: Wed, 22 Apr 2026 14:34:48 -0700
Subject: [PATCH 37/37] fix(summarizer): persist final summaries as L3 entries

---
 agent_cli/_requirements/memory.txt   |  4 +++-
 agent_cli/core/chroma.py             |  1 +
 agent_cli/docs_gen.py                | 11 +++++++++--
 agent_cli/memory/_persistence.py     | 19 ++-----------------
 agent_cli/memory/models.py           |  2 +-
 agent_cli/summarizer/models.py       |  2 ++
 tests/memory/test_store.py           | 17 ++++++++++++++++-
 tests/summarizer/test_integration.py |  2 ++
 tests/summarizer/test_models.py      | 14 ++++++++++++++
 tests/test_api_integration.py        | 17 +++++++++--------
 uv.lock                              |  8 ++++++++
 11 files changed, 67 insertions(+), 30 deletions(-)

diff --git a/agent_cli/_requirements/memory.txt b/agent_cli/_requirements/memory.txt
index 9a4b89d7f..309c83c88 100644
--- a/agent_cli/_requirements/memory.txt
+++ b/agent_cli/_requirements/memory.txt
@@ -333,7 +333,9 @@ tenacity==9.1.2
     #   chromadb
     #   google-genai
 tiktoken==0.12.0
-    # via pydantic-ai-slim
+    # via
+    #   agent-cli
+    #   pydantic-ai-slim
 tokenizers==0.22.2
     # via
     #   chromadb
diff --git a/agent_cli/core/chroma.py b/agent_cli/core/chroma.py
index 0cc639cb7..22455fa65 100644
--- a/agent_cli/core/chroma.py
+++ b/agent_cli/core/chroma.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 from typing import TYPE_CHECKING, Any
+
 from agent_cli.constants import DEFAULT_OPENAI_EMBEDDING_MODEL
 
 if TYPE_CHECKING:
diff --git a/agent_cli/docs_gen.py b/agent_cli/docs_gen.py
index 49002f1a4..000448c23 100644
--- a/agent_cli/docs_gen.py
+++ b/agent_cli/docs_gen.py
@@ -16,6 +16,7 @@
 
 from __future__ import annotations
 
+from functools import cache
 from typing import Any, get_origin
 
 import click
@@ -59,10 +60,16 @@ def _format_default(default: Any) -> str:
     return str(default)
 
 
+@cache
+def _get_root_click_app() -> click.Command:
+    """Build the Click app once for documentation introspection."""
+    return get_command(app)
+
+
 def _get_click_command(command_path: str) -> click.Command | None:
     """Get a Click command from a path like 'transcribe' or 'memory.proxy'."""
     parts = command_path.split(".")
-    click_app = get_command(app)
+    click_app = _get_root_click_app()
 
     cmd: click.Command | click.Group = click_app
     for part in parts:
@@ -209,7 +216,7 @@ def _options_by_panel(
 
 def _list_commands() -> list[str]:
     """List all available commands including subcommands."""
-    click_app = get_command(app)
+    click_app = _get_root_click_app()
     commands = []
 
     def _walk(cmd: click.Command | click.Group, prefix: str = "") -> None:
diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py
index 46ac03631..862f753e0 100644
--- a/agent_cli/memory/_persistence.py
+++ b/agent_cli/memory/_persistence.py
@@ -4,7 +4,6 @@
 
 import logging
 import shutil
-from datetime import UTC, datetime
 from typing import TYPE_CHECKING
 
 from agent_cli.memory._files import (
@@ -207,23 +206,9 @@ def persist_summary(
         return []
 
     stored_ids: list[str] = []
-    created_at = datetime.now(UTC).isoformat()
 
     for entry in entries:
-        meta_dict = entry["metadata"]
-        # Build MemoryMetadata from the summary result's metadata dict
-        metadata = MemoryMetadata(
-            conversation_id=meta_dict["conversation_id"],
-            role=meta_dict["role"],
-            created_at=meta_dict.get("created_at", created_at),
-            summary_kind="summary",
-            is_final=meta_dict.get("is_final"),
-            input_tokens=meta_dict.get("input_tokens"),
-            output_tokens=meta_dict.get("output_tokens"),
-            compression_ratio=meta_dict.get("compression_ratio"),
-            summary_level=meta_dict.get("summary_level"),
-            collapse_depth=meta_dict.get("collapse_depth"),
-        )
+        metadata = MemoryMetadata(**entry["metadata"])
         record = write_memory_file(
             memory_root,
             content=entry["content"],
@@ -233,7 +218,7 @@ def persist_summary(
         LOGGER.info(
             "Persisted summary file: %s (level=%s)",
             record.path,
-            meta_dict.get("summary_level"),
+            metadata.level,
         )
         stored_ids.append(record.id)
 
diff --git a/agent_cli/memory/models.py b/agent_cli/memory/models.py
index d52d952ce..e2463ffe9 100644
--- a/agent_cli/memory/models.py
+++ b/agent_cli/memory/models.py
@@ -51,7 +51,7 @@ class MemoryMetadata(BaseModel):
 
     # Summary fields (only used when role="summary")
     level: int | None = None
-    """Summary level (deprecated, kept for file structure compatibility)."""
+    """Summary level used for hierarchical summary retrieval and file layout."""
     is_final: bool | None = None
     """Whether this is the final summary."""
     chunk_index: int | None = None
diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py
index 721201da3..99354e696 100644
--- a/agent_cli/summarizer/models.py
+++ b/agent_cli/summarizer/models.py
@@ -88,6 +88,8 @@ def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]:
                 "metadata": {
                     "conversation_id": conversation_id,
                     "role": "summary",
+                    "summary_kind": "summary",
+                    "level": 3,
                     "is_final": True,
                     "input_tokens": self.input_tokens,
                     "output_tokens": self.output_tokens,
diff --git a/tests/memory/test_store.py b/tests/memory/test_store.py
index 29dbe2e55..d7e19f80d 100644
--- a/tests/memory/test_store.py
+++ b/tests/memory/test_store.py
@@ -17,12 +17,14 @@ def __init__(
         self.query_result = query_result or {}
         self.get_result = get_result or {}
         self.deleted: list[list[str]] = []
+        self.get_calls: list[dict[str, Any]] = []
         self.upserts: list[tuple[list[str], list[str], list[dict[str, Any]]]] = []
 
     def query(self, **_kwargs: Any) -> dict[str, Any]:
         return self.query_result
 
-    def get(self, **_kwargs: Any) -> dict[str, Any]:
+    def get(self, **kwargs: Any) -> dict[str, Any]:
+        self.get_calls.append(kwargs)
         return self.get_result
 
     def delete(self, ids: list[str]) -> None:
@@ -146,6 +148,8 @@ def test_upsert_summary_entries_simple() -> None:
             "metadata": {
                 "conversation_id": "conv-123",
                 "role": "summary",
+                "summary_kind": "summary",
+                "level": 3,
                 "is_final": True,
                 "summary_level": "MAP_REDUCE",
                 "input_tokens": 1000,
@@ -164,6 +168,8 @@ def test_upsert_summary_entries_simple() -> None:
     upserted_ids, upserted_docs, upserted_metas = fake.upserts[0]
     assert upserted_ids == ["conv-123:summary"]
     assert upserted_docs == ["A paragraph summary."]
+    assert upserted_metas[0]["summary_kind"] == "summary"
+    assert upserted_metas[0]["level"] == 3
     assert upserted_metas[0]["is_final"] is True
 
 
@@ -177,6 +183,8 @@ def test_upsert_summary_entries_with_collapse_depth() -> None:
             "metadata": {
                 "conversation_id": "conv-456",
                 "role": "summary",
+                "summary_kind": "summary",
+                "level": 3,
                 "is_final": True,
                 "summary_level": "MAP_REDUCE",
                 "input_tokens": 5000,
@@ -214,6 +222,8 @@ def test_get_final_summary_returns_summary() -> None:
                 {
                     "conversation_id": "c1",
                     "role": "summary",
+                    "summary_kind": "summary",
+                    "level": 3,
                     "is_final": True,
                     "summary_level": "MAP_REDUCE",
                     "collapse_depth": 1,
@@ -228,6 +238,11 @@ def test_get_final_summary_returns_summary() -> None:
 
     assert result is not None
     assert result.content == "The final summary"
+    assert fake.get_calls[0]["where"]["$and"] == [
+        {"conversation_id": "c1"},
+        {"role": "summary"},
+        {"level": 3},
+    ]
     assert result.metadata.is_final is True
 
 
diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py
index 867815ce9..7301d1071 100644
--- a/tests/summarizer/test_integration.py
+++ b/tests/summarizer/test_integration.py
@@ -25,6 +25,8 @@ def test_to_storage_metadata_creates_entry(self) -> None:
         assert entry["content"] == "A comprehensive summary."
         assert entry["metadata"]["conversation_id"] == "test-conversation"
         assert entry["metadata"]["role"] == "summary"
+        assert entry["metadata"]["summary_kind"] == "summary"
+        assert entry["metadata"]["level"] == 3
         assert entry["metadata"]["is_final"] is True
         assert entry["metadata"]["collapse_depth"] == 1
 
diff --git a/tests/summarizer/test_models.py b/tests/summarizer/test_models.py
index 05d5625f4..98db88d39 100644
--- a/tests/summarizer/test_models.py
+++ b/tests/summarizer/test_models.py
@@ -65,6 +65,20 @@ def test_to_storage_metadata_with_summary(self) -> None:
         assert entry["metadata"]["role"] == "summary"
         assert entry["metadata"]["is_final"] is True
 
+    def test_to_storage_metadata_marks_final_summary_for_memory_layer(self) -> None:
+        """Test storage metadata includes the fields memory retrieval expects."""
+        result = SummaryResult(
+            summary="A brief summary.",
+            input_tokens=200,
+            output_tokens=10,
+            compression_ratio=0.05,
+        )
+        entries = result.to_storage_metadata("conv-456")
+
+        entry = entries[0]
+        assert entry["metadata"]["summary_kind"] == "summary"
+        assert entry["metadata"]["level"] == 3
+
     def test_to_storage_metadata_with_collapse_depth(self) -> None:
         """Test storage metadata includes collapse depth."""
         result = SummaryResult(
diff --git a/tests/test_api_integration.py b/tests/test_api_integration.py
index 858944029..163c74a4b 100644
--- a/tests/test_api_integration.py
+++ b/tests/test_api_integration.py
@@ -160,10 +160,15 @@ def test_api_configuration_handling(monkeypatch: MonkeyPatch) -> None:
         assert True  # Config is created during request
 
 
-def test_temp_file_cleanup(client: TestClient) -> None:
+def test_temp_file_cleanup(
+    client: TestClient,
+    monkeypatch: MonkeyPatch,
+    tmp_path: Path,
+) -> None:
     """Test that temporary files are cleaned up after processing."""
+    monkeypatch.setattr(tempfile, "tempdir", str(tmp_path), raising=False)
     temp_dir = Path(tempfile.gettempdir())
-    temp_files_before = set(temp_dir.iterdir())
+    wav_files_before = set(temp_dir.glob("*.wav"))
 
     with patch("agent_cli.server.proxy.api._transcribe_with_provider") as mock_transcribe:
         mock_transcribe.return_value = "test"
@@ -183,12 +188,8 @@ def test_temp_file_cleanup(client: TestClient) -> None:
     # Give a moment for cleanup
     time.sleep(0.1)
 
-    temp_files_after = set(temp_dir.iterdir())
-    new_files = temp_files_after - temp_files_before
-
-    # No new WAV files should remain
-    wav_files = [f for f in new_files if f.name.endswith(".wav")]
-    assert len(wav_files) == 0
+    wav_files_after = set(temp_dir.glob("*.wav"))
+    assert wav_files_after - wav_files_before == set()
 
 
 @pytest.mark.asyncio
diff --git a/uv.lock b/uv.lock
index 52ecaf5e2..34de4fdf3 100644
--- a/uv.lock
+++ b/uv.lock
@@ -53,6 +53,7 @@ dev = [
     { name = "pytest-mock" },
     { name = "pytest-timeout" },
     { name = "ruff" },
+    { name = "tiktoken" },
     { name = "versioningit" },
     { name = "zensical" },
 ]
@@ -79,6 +80,7 @@ memory = [
     { name = "openai" },
     { name = "pydantic-ai-slim", extra = ["google", "openai"] },
     { name = "pyyaml" },
+    { name = "tiktoken" },
     { name = "transformers" },
     { name = "watchfiles" },
 ]
@@ -113,6 +115,7 @@ test = [
     { name = "pytest-cov" },
     { name = "pytest-mock" },
     { name = "pytest-timeout" },
+    { name = "tiktoken" },
 ]
 vad = [
     { name = "onnxruntime" },
@@ -150,6 +153,7 @@ dev = [
     { name = "pytest-mock" },
     { name = "pytest-timeout" },
     { name = "ruff" },
+    { name = "tiktoken" },
     { name = "versioningit" },
     { name = "zensical" },
 ]
@@ -218,6 +222,9 @@ requires-dist = [
     { name = "setproctitle" },
     { name = "sounddevice", marker = "extra == 'audio'", specifier = ">=0.4.6" },
     { name = "soundfile", marker = "extra == 'kokoro'", specifier = ">=0.12.0" },
+    { name = "tiktoken", marker = "extra == 'dev'", specifier = ">=0.5.0" },
+    { name = "tiktoken", marker = "extra == 'memory'", specifier = ">=0.5.0" },
+    { name = "tiktoken", marker = "extra == 'test'", specifier = ">=0.5.0" },
     { name = "torch", marker = "extra == 'whisper-transformers'", specifier = ">=2.0.0" },
     { name = "transformers", marker = "extra == 'kokoro'", specifier = ">=4.40.0" },
     { name = "transformers", marker = "extra == 'memory'", specifier = ">=4.30.0" },
@@ -250,6 +257,7 @@ dev = [
     { name = "pytest-mock" },
     { name = "pytest-timeout" },
     { name = "ruff" },
+    { name = "tiktoken", specifier = ">=0.5.0" },
     { name = "versioningit" },
     { name = "zensical" },
 ]