From 066e6c7299cb2e3bcb392a42f073e4e70be30410 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 16:37:44 -0800 Subject: [PATCH 01/37] update docs/architecture/memory.md --- docs/architecture/memory.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/architecture/memory.md b/docs/architecture/memory.md index cbff5d938..f99637ff3 100644 --- a/docs/architecture/memory.md +++ b/docs/architecture/memory.md @@ -259,13 +259,12 @@ Executed via `_postprocess_after_turn` (background task). * **Output:** JSON list of strings. Failures fall back to `[]`. ### 4.3 Reconciliation (Memory Management) -Resolves contradictions using a "Search-Decide-Update" loop. +Resolves contradictions using a "Search-Decide-Update" loop with complete enumeration. 1. **Local Search:** For each new fact, retrieve a small neighborhood of existing `role="memory"` entries for the conversation. -2. **LLM Decision:** Uses `UPDATE_MEMORY_PROMPT` (examples + strict JSON schema) to compare `new_facts` vs `existing_memories`. +2. **LLM Decision:** Uses `UPDATE_MEMORY_PROMPT` to compare `new_facts` vs `existing_memories`. The model must return **all memories** (existing + new) with explicit events for each. * **Decisions:** `ADD`, `UPDATE`, `DELETE`, `NONE`. * If no existing memories are found, all new facts are added directly. * On LLM/network failure, defaults to adding all new facts. - * Safeguard: if the model returns only deletes/empties, the new facts are still added to avoid data loss. 3. **Execution:** * **Adds:** Creates new fact files and upserts to Chroma. * **Updates:** Implemented as delete + add with a fresh ID; tombstones record `replaced_by`. @@ -295,13 +294,14 @@ To replicate the system behavior, the following prompt strategies are required. * **Example:** "My wife is Anne" -> `["The user's wife is named Anne"]`. 
### 5.2 Reconciliation (`UPDATE_MEMORY_PROMPT`) -* **Goal:** Compare `new_facts` against `existing_memories` (id + text) and output structured decisions. +* **Goal:** Compare `new_facts` against `existing_memories` and return **all memories** (existing + new) with explicit events. +* **Approach:** The model must enumerate every memory in its response, forcing deliberate decisions rather than implicit omissions. * **Operations:** - * **ADD:** New information (generates a new ID). - * **UPDATE:** Refines existing information (uses the provided short ID). - * **DELETE:** Contradicts existing information (e.g., "I hate pizza" vs "I love pizza"). **If deleting because of a replacement, the new fact must also be returned (ADD or UPDATE).** - * **NONE:** Fact already exists or is irrelevant. -* **Output constraints:** JSON list only; no prose/code fences; IDs for UPDATE/DELETE/NONE must come from the provided list. + * **ADD:** New information not present in existing memories (generates a new sequential ID). + * **UPDATE:** Refines existing information about the **same topic** (keeps the existing ID). + * **DELETE:** Explicitly contradicts existing information (e.g., "I hate pizza" vs "I love pizza"). + * **NONE:** Existing memory is unrelated to new facts, or new fact is an exact duplicate. +* **Output constraints:** JSON list containing all memories; each existing memory must have an event; new unrelated facts must be ADDed; no prose or code fences. ### 5.3 Summarization (`SUMMARY_PROMPT`) * **Goal:** Maintain a concise running summary. 
From d495e6ec9cbdf8b16b725aa7d38fe252c6d5a1a8 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 16:38:04 -0800 Subject: [PATCH 02/37] Turn off ChromaDB telemetry --- agent_cli/core/chroma.py | 1 - 1 file changed, 1 deletion(-) diff --git a/agent_cli/core/chroma.py b/agent_cli/core/chroma.py index 22455fa65..0cc639cb7 100644 --- a/agent_cli/core/chroma.py +++ b/agent_cli/core/chroma.py @@ -3,7 +3,6 @@ from __future__ import annotations from typing import TYPE_CHECKING, Any - from agent_cli.constants import DEFAULT_OPENAI_EMBEDDING_MODEL if TYPE_CHECKING: From d7b2a3d7c6a942ab5727ef8513617551a85d2cd4 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 16:44:37 -0800 Subject: [PATCH 03/37] feat(memory): add output validation with ModelRetry for reconciliation - Add @agent.output_validator to validate LLM decisions - Catch invalid UPDATE/DELETE/NONE with non-existent IDs - Send helpful error messages via ModelRetry for retry - Graceful fallback to add all facts when retries exhausted - Add AI journal POC example for testing MemoryClient - Improve reconciliation prompt with clearer examples --- examples/aijournal_poc.py | 151 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100755 examples/aijournal_poc.py diff --git a/examples/aijournal_poc.py b/examples/aijournal_poc.py new file mode 100755 index 000000000..df5934e9b --- /dev/null +++ b/examples/aijournal_poc.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 +"""Minimal AI Journal proof-of-concept using MemoryClient. + +This validates the core hypothesis: MemoryClient can serve as the +foundation for a personal knowledge system (AI journal). + +Usage: + # Add a journal entry + python examples/aijournal_poc.py add "Today I learned about quantum computing at work" + + # Search memories + python examples/aijournal_poc.py search "what did I learn?" + + # Interactive chat with memory + python examples/aijournal_poc.py chat "What have I been working on lately?" 
+""" + +from __future__ import annotations + +import argparse +import asyncio +import logging +import os +from pathlib import Path + +from agent_cli.memory.client import MemoryClient + +# Enable debug logging for memory module +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s %(name)s: %(message)s", + datefmt="%H:%M:%S", +) +# Enable DEBUG for memory ingest to see full prompts +logging.getLogger("agent_cli.memory._ingest").setLevel(logging.DEBUG) + + +# Defaults for local AI setup +DEFAULT_BASE_URL = "http://192.168.1.143:9292/v1" +DEFAULT_MODEL = "gpt-oss-high:20b" +DEFAULT_EMBEDDING_MODEL = "embeddinggemma:300m" + + +def get_client(model: str | None = None) -> tuple[MemoryClient, str]: + """Initialize the memory client with sensible defaults. + + Returns: + Tuple of (client, model_name) + + """ + base_url = os.environ.get("OPENAI_BASE_URL", DEFAULT_BASE_URL) + model_name = model or os.environ.get("OPENAI_MODEL", DEFAULT_MODEL) + embedding_model = os.environ.get("EMBEDDING_MODEL", DEFAULT_EMBEDDING_MODEL) + api_key = os.environ.get("OPENAI_API_KEY", "not-needed-for-local") + + print(f"Using: {base_url}") + print(f" Chat model: {model_name}") + print(f" Embedding model: {embedding_model}") + + return MemoryClient( + memory_path=Path("~/.aijournal").expanduser(), + openai_base_url=base_url, + chat_api_key=api_key, + embedding_api_key=api_key, + embedding_model=embedding_model, + enable_summarization=True, + enable_git_versioning=False, # Keep it simple for POC + score_threshold=0.1, # Lower threshold for local models + ), model_name + + +async def cmd_add(text: str) -> None: + """Add a journal entry.""" + client, model = get_client() + print(f"Adding entry: {text[:50]}...") + await client.add(text, conversation_id="journal", model=model) + print("✓ Entry processed and facts extracted") + + +async def cmd_search(query: str, top_k: int = 5) -> None: + """Search memories.""" + client, model = get_client() + print(f"Searching for: 
{query}\n") + + result = await client.search(query, conversation_id="journal", top_k=top_k, model=model) + + if not result.entries: + print("No relevant memories found.") + return + + for i, entry in enumerate(result.entries, 1): + print(f"{i}. [{entry.role}] {entry.content}") + print(f" Score: {entry.score:.3f} | Created: {entry.created_at[:10]}") + print() + + +async def cmd_chat(question: str) -> None: + """Chat with memory-augmented LLM.""" + client, model = get_client() + print(f"Question: {question}\n") + + response = await client.chat( + messages=[{"role": "user", "content": question}], + conversation_id="journal", + model=model, + ) + + # Extract assistant reply + choices = response.get("choices", []) + if choices: + reply = choices[0].get("message", {}).get("content", "") + print(f"Answer: {reply}") + + # Show which memories were used + hits = response.get("memory_hits", []) + if hits: + print(f"\n--- Used {len(hits)} memories ---") + for hit in hits[:3]: + print(f" • {hit['content'][:80]}...") + + +def main() -> None: + """CLI entry point.""" + parser = argparse.ArgumentParser(description="AI Journal POC") + subparsers = parser.add_subparsers(dest="command", required=True) + + # Add command + add_parser = subparsers.add_parser("add", help="Add a journal entry") + add_parser.add_argument("text", help="The journal entry text") + + # Search command + search_parser = subparsers.add_parser("search", help="Search memories") + search_parser.add_argument("query", help="Search query") + search_parser.add_argument("-k", "--top-k", type=int, default=5, help="Number of results") + + # Chat command + chat_parser = subparsers.add_parser("chat", help="Chat with memory") + chat_parser.add_argument("question", help="Question to ask") + + args = parser.parse_args() + + if args.command == "add": + asyncio.run(cmd_add(args.text)) + elif args.command == "search": + asyncio.run(cmd_search(args.query, args.top_k)) + elif args.command == "chat": + 
asyncio.run(cmd_chat(args.question)) + + +if __name__ == "__main__": + main() From 24e04843fb55e45d0aa51de61b547ac8d54777cb Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 17:04:20 -0800 Subject: [PATCH 04/37] feat(memory): add self-model features to AI journal POC - Add list_all() method to MemoryClient to retrieve all stored memories - Add 'show' command to display all stored facts about the user - Add 'profile' command to generate a structured profile summary using LLM - Enhance 'chat' command to use profile context for personalized responses The POC now demonstrates a "self-model" system that: 1. Extracts facts from user input 2. Stores and retrieves them semantically 3. Generates profile summaries on demand 4. Uses the profile to personalize conversations This validates the core hypothesis: MemoryClient can serve as the foundation for a personal knowledge system that knows who you are. --- examples/aijournal_poc.py | 146 ++++++++++++++++++++++++++++++++++---- 1 file changed, 132 insertions(+), 14 deletions(-) diff --git a/examples/aijournal_poc.py b/examples/aijournal_poc.py index df5934e9b..156c0b97a 100755 --- a/examples/aijournal_poc.py +++ b/examples/aijournal_poc.py @@ -23,6 +23,8 @@ import os from pathlib import Path +import httpx + from agent_cli.memory.client import MemoryClient # Enable debug logging for memory module @@ -94,29 +96,135 @@ async def cmd_search(query: str, top_k: int = 5) -> None: print() -async def cmd_chat(question: str) -> None: +def cmd_show() -> None: + """Show all stored memories (what the system knows about you).""" + client, _ = get_client() + print("=== What I know about you ===\n") + + entries = client.list_all(conversation_id="journal") + + if not entries: + print("No memories stored yet. 
Add some journal entries first!") + return + + # Sort by created_at + entries.sort(key=lambda x: x["created_at"], reverse=True) + + for i, entry in enumerate(entries, 1): + date = entry["created_at"][:10] if entry["created_at"] else "unknown" + print(f"{i}. [{date}] {entry['content']}") + + print(f"\n--- Total: {len(entries)} memories ---") + + +PROFILE_PROMPT = """Based on the following facts about a person, create a brief profile summary. +Organize the information into categories like: +- **Identity**: Name, relationships, occupation +- **Interests & Activities**: Hobbies, regular activities +- **Goals & Values**: What they care about, what they're working towards +- **Recent Events**: Notable recent happenings + +Only include categories that have relevant information. Be concise. + +Facts: +{facts} + +Profile Summary:""" + + +async def cmd_profile() -> None: + """Generate a profile summary from stored memories.""" + client, model = get_client() + + entries = client.list_all(conversation_id="journal") + + if not entries: + print("No memories stored yet. 
Add some journal entries first!") + return + + # Format facts for the prompt + facts = "\n".join(f"- {e['content']}" for e in entries) + prompt = PROFILE_PROMPT.format(facts=facts) + + print("=== Your Profile ===\n") + print("(Generating profile from stored memories...)\n") + + # Direct LLM call (bypasses memory storage) + base_url = os.environ.get("OPENAI_BASE_URL", DEFAULT_BASE_URL) + api_key = os.environ.get("OPENAI_API_KEY", "not-needed-for-local") + + async with httpx.AsyncClient(timeout=120.0) as http: + response = await http.post( + f"{base_url}/chat/completions", + headers={"Authorization": f"Bearer {api_key}"}, + json={ + "model": model, + "messages": [{"role": "user", "content": prompt}], + "temperature": 0.7, + }, + ) + data = response.json() + + choices = data.get("choices", []) + if choices: + profile = choices[0].get("message", {}).get("content", "") + print(profile) + + print(f"\n--- Based on {len(entries)} memories ---") + + +CHAT_SYSTEM_PROMPT = """You are a helpful AI assistant with memory of the user. + +Here's what you know about the user: +{profile} + +Use this knowledge naturally in your responses. 
Be helpful and personable.""" + + +async def cmd_chat(question: str, with_profile: bool = True) -> None: """Chat with memory-augmented LLM.""" client, model = get_client() + + # Build profile context + profile_text = "" + if with_profile: + entries = client.list_all(conversation_id="journal") + if entries: + profile_text = "\n".join(f"- {e['content']}" for e in entries) + print(f"Question: {question}\n") - response = await client.chat( - messages=[{"role": "user", "content": question}], - conversation_id="journal", - model=model, - ) + # Build messages with profile context + messages: list[dict[str, str]] = [] + if profile_text: + system_prompt = CHAT_SYSTEM_PROMPT.format(profile=profile_text) + messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": "user", "content": question}) - # Extract assistant reply - choices = response.get("choices", []) + # Direct LLM call with profile context + base_url = os.environ.get("OPENAI_BASE_URL", DEFAULT_BASE_URL) + api_key = os.environ.get("OPENAI_API_KEY", "not-needed-for-local") + + async with httpx.AsyncClient(timeout=120.0) as http: + response = await http.post( + f"{base_url}/chat/completions", + headers={"Authorization": f"Bearer {api_key}"}, + json={ + "model": model, + "messages": messages, + "temperature": 0.7, + }, + ) + data = response.json() + + choices = data.get("choices", []) if choices: reply = choices[0].get("message", {}).get("content", "") print(f"Answer: {reply}") - # Show which memories were used - hits = response.get("memory_hits", []) - if hits: - print(f"\n--- Used {len(hits)} memories ---") - for hit in hits[:3]: - print(f" • {hit['content'][:80]}...") + if profile_text: + entry_count = len(client.list_all(conversation_id="journal")) + print(f"\n--- Using profile with {entry_count} memories ---") def main() -> None: @@ -137,6 +245,12 @@ def main() -> None: chat_parser = subparsers.add_parser("chat", help="Chat with memory") chat_parser.add_argument("question", 
help="Question to ask") + # Show command - display what the system knows about you + subparsers.add_parser("show", help="Show all stored memories") + + # Profile command - generate a profile summary + subparsers.add_parser("profile", help="Generate profile from memories") + args = parser.parse_args() if args.command == "add": @@ -145,6 +259,10 @@ def main() -> None: asyncio.run(cmd_search(args.query, args.top_k)) elif args.command == "chat": asyncio.run(cmd_chat(args.question)) + elif args.command == "show": + cmd_show() + elif args.command == "profile": + asyncio.run(cmd_profile()) if __name__ == "__main__": From f083b9ee94a9f240540820d88d46f4a5db44cc2a Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 17:51:37 -0800 Subject: [PATCH 05/37] docs: add detailed comparison between AI journal POC and aijournal Analyzes architecture, features, and test results comparing our MemoryClient-based POC (~200 LOC) with the full aijournal project (~15,000+ LOC). Key findings: - POC successfully extracts facts and generates accurate profiles - Main gap is learning over time (strength tracking, decay, feedback) - Recommends adding simple strength field to close 80% of functionality gap with 20% of aijournal's complexity Includes concrete test results from ingesting 12+ blog posts. --- docs/aijournal-poc-comparison.md | 245 +++++++++++++++++++++++++++++++ 1 file changed, 245 insertions(+) create mode 100644 docs/aijournal-poc-comparison.md diff --git a/docs/aijournal-poc-comparison.md b/docs/aijournal-poc-comparison.md new file mode 100644 index 000000000..a6f928f0b --- /dev/null +++ b/docs/aijournal-poc-comparison.md @@ -0,0 +1,245 @@ +# AI Journal POC vs aijournal: Detailed Comparison + +This document analyzes the differences between our MemoryClient-based AI Journal POC and the full-featured aijournal project, identifying strengths, gaps, and potential paths forward. 
+ +## Executive Summary + +| Aspect | Our POC | aijournal | +|--------|---------|-----------| +| **Complexity** | ~200 LOC | ~15,000+ LOC | +| **Setup Time** | Instant | `aijournal init` + config | +| **Profile Storage** | Generated on-demand | Persisted YAML with versioning | +| **Claim System** | Raw fact strings | Typed atoms with strength/decay | +| **Context Layers** | Single flat layer | 4 hierarchical layers (L1-L4) | +| **Learning** | Static extraction | Feedback loops + interview probing | + +## 1. Architecture Comparison + +### 1.1 Data Model + +**Our POC:** +``` +~/.aijournal/ + entries/ + journal/ + facts/ # Extracted facts as markdown + turns/ # Chat turns + chroma/ # Vector embeddings +``` + +**aijournal:** +``` +workspace/ + data/ + journal/YYYY/MM/DD/*.md # Raw entries + normalized/YYYY-MM-DD/ # Structured YAML + profile/ + self_profile.yaml # Facets (values, goals, traits) + claims.yaml # Typed claim atoms + derived/ + summaries/ # Daily summaries + microfacts/ # Extracted facts + persona/persona_core.yaml # L1 context (~1200 tokens) + index/ # Vector store + metadata + chat_sessions/ # Conversation history + pending/profile_updates/ # Queued changes +``` + +**Analysis:** aijournal separates authoritative data (human-editable) from derived data (reproducible). Our POC conflates these, making it harder to inspect or manually correct the knowledge base. 
+ +### 1.2 Claim Representation + +**Our POC - Raw facts:** +``` +"Bas is a software engineer" +"The user loves hiking" +"The user's wife is named Anne" +``` + +**aijournal - Typed claim atoms:** +```yaml +- type: trait + subject: self + predicate: occupation + statement: "Works as a software engineer focused on AI systems" + scope: {domain: work, context: [professional]} + strength: 0.85 + status: accepted + provenance: + sources: [entry:2025-01-15-morning] + first_seen: 2025-01-15 + last_updated: 2025-01-20 +``` + +**Analysis:** aijournal's typed claims enable: +- Filtering by type (traits vs preferences vs goals) +- Confidence tracking via `strength` +- Time-decay for relevance +- Conflict detection between claims +- Source attribution for verification + +### 1.3 Context Layers + +**Our POC:** Single layer - all facts dumped into system prompt + +**aijournal - Hierarchical layers:** + +| Layer | Content | Tokens | Use Case | +|-------|---------|--------|----------| +| L1 | Persona core + top claims | ~1,200 | Quick chat, advice | +| L2 | L1 + recent summaries/facts | ~2,000 | Daily check-ins | +| L3 | L2 + full claims + facets | ~2,600 | Deep conversations | +| L4 | L3 + prompts + config + history | ~3,200 | External AI export | + +**Analysis:** Layered context prevents token overflow and allows appropriate depth for different interactions. + +## 2. Feature Comparison + +### 2.1 Fact Extraction + +| Feature | Our POC | aijournal | +|---------|---------|-----------| +| Extraction method | PydanticAI agent | Ollama + custom prompts | +| Output format | Raw strings | Typed MicroFact objects | +| Reconciliation | ADD/UPDATE/DELETE/NONE | Consolidation with strength weighting | +| Deduplication | Semantic similarity | Hash + semantic + scope matching | + +**Our POC advantage:** The reconciliation logic (PromptedOutput with JSON mode) prevents duplicate facts effectively. 
+ +**aijournal advantage:** Consolidation weights existing evidence: `strength_new = clamp01((w_prev * strength_prev + w_obs * signal) / (w_prev + w_obs))` + +### 2.2 Profile Generation + +| Feature | Our POC | aijournal | +|---------|---------|-----------| +| Generation | On-demand via LLM | Pre-built `persona_core.yaml` | +| Caching | None | Persisted with staleness tracking | +| Categories | LLM-determined | Defined schema (values, goals, traits, etc.) | +| Token budget | Unlimited (risk of overflow) | Configurable (~1,200 default) | + +**Our POC advantage:** Flexible - LLM determines categories dynamically based on content. + +**aijournal advantage:** Deterministic, auditable, and respects token limits. + +### 2.3 Chat Integration + +| Feature | Our POC | aijournal | +|---------|---------|-----------| +| Context injection | All facts in system prompt | Layer-appropriate context | +| Citations | None | `[entry:id#p]` markers | +| Feedback | None | Up/down adjustments to claim strength | +| Memory storage | Bypassed (direct LLM call) | Persisted with telemetry | + +**Our POC advantage:** Simple, no side effects. + +**aijournal advantage:** Learning loop - feedback strengthens/weakens claims over time. + +### 2.4 Missing in Our POC + +1. **Interview/Probing Mode** + - aijournal generates questions to fill knowledge gaps + - Ranks facets by `staleness × impact_weight` to prioritize probing + +2. **Time Decay** + - aijournal: `effective_strength = strength × exp(-λ × staleness)` + - Our POC: All facts treated equally regardless of age + +3. **Conflict Resolution** + - aijournal: Detects contradictions, downgrades to `tentative`, queues questions + - Our POC: UPDATE replaces old fact entirely + +4. **Advisor Mode** + - aijournal: Separate `advise` command with coaching preferences + - Our POC: Generic chat only + +5. **Export/Packs** + - aijournal: Generate context bundles for external AIs + - Our POC: No export capability + +## 3. 
Test Results Analysis + +### 3.1 Blog Post Ingestion + +We fed 12+ blog posts into our POC: + +| Metric | Result | +|--------|--------| +| Posts processed | ~12 | +| Facts extracted | 52 | +| Extraction accuracy | High - captured key themes | +| Profile quality | Excellent - identified all major interests | + +**Sample extracted facts:** +- "Bas is a software engineer" +- "Bas works on AI systems" +- "The user loves hiking" +- "You went for a 5km run this morning" +- "You discovered that local vision models like Qwen3-VL-32B can identify niche books" + +### 3.2 Profile Generation Quality + +The generated profile correctly identified: +- ✅ Professional identity (software engineer, AI focus) +- ✅ Personal relationships (wife Anne) +- ✅ Hobbies (hiking, running, learning Dutch) +- ✅ Technical interests (local AI, terminal productivity, homelab) +- ✅ Values (minimalism, security, reproducibility) + +### 3.3 Chat Intelligence + +The chat demonstrated: +- **Specific recall:** "You use the Glove80 keyboard with programmable layers" +- **Temporal understanding:** Tracked evolution of views on AI coding +- **Theme synthesis:** Connected local AI + security + productivity interests +- **Nuanced responses:** Acknowledged both benefits and limitations + +## 4. Recommendations + +### 4.1 Quick Wins (Keep POC Simple) + +1. **Persist profile summary** - Cache the LLM-generated profile to avoid regeneration +2. **Add timestamps to facts** - Already have `created_at`, use it for recency weighting +3. **Token budgeting** - Limit facts sent to chat based on relevance + recency + +### 4.2 Medium-Term Enhancements + +1. **Claim typing** - Categorize facts into types (trait, preference, goal, relationship) +2. **Strength tracking** - Increment when same fact extracted multiple times +3. **Simple decay** - Weight recent facts higher in context + +### 4.3 aijournal Features Worth Adopting + +1. **Interview mode** - Generate questions to learn more +2. 
**Feedback loop** - Up/down on responses affects claim strength +3. **Layered context** - L1 for quick chats, L4 for deep dives +4. **Citations** - Link responses to source facts + +### 4.4 What NOT to Adopt + +1. **7-stage pipeline** - Overkill for our use case +2. **Strict schema governance** - Adds friction without clear benefit for POC +3. **Markdown file storage** - ChromaDB is sufficient for our needs + +## 5. Conclusion + +Our POC validates the core hypothesis: **MemoryClient can serve as the foundation for a personal knowledge system**. With ~200 lines of code, we achieved: + +- Accurate fact extraction from unstructured text +- Coherent profile generation from diverse content +- Personalized conversations using stored knowledge + +The main gap is **learning over time** - our system doesn't strengthen beliefs based on repetition or feedback. Adding simple strength tracking and decay would close 80% of the functionality gap with 20% of aijournal's complexity. + +### Recommended Next Step + +Add a `strength` field to stored facts and implement: +```python +# On duplicate fact detection +existing.strength = min(1.0, existing.strength + 0.1) +existing.last_seen = now() + +# On retrieval +effective_strength = fact.strength * exp(-0.1 * days_since_last_seen) +``` + +This single change would transform our static knowledge base into a learning system. 
From b9ad5ceedf94c0f5ebe961e08e1a20d21b86a664 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 19:30:24 -0800 Subject: [PATCH 06/37] feat(memory): add adaptive summarization with hierarchical storage Implement research-grounded summarization inspired by Letta and Mem0: - AdaptiveSummarizer with 5 levels (NONE, BRIEF, STANDARD, DETAILED, HIERARCHICAL) - Hierarchical summary storage (L1 chunks, L2 groups, L3 final) in ChromaDB - File-based persistence with YAML front matter in markdown files - Token counting via tiktoken with fallback to cl100k_base - Level-specific compression ratios (20%, 12%, 7%, capped 2000 tokens) Structure: - agent_cli/summarizer/ - standalone reusable summarization module - summaries/L1/chunk_*.md, L2/group_*.md, L3/final.md file hierarchy - Soft-delete old summaries to deleted/ folder before replacing --- agent_cli/memory/_files.py | 37 +- agent_cli/memory/_ingest.py | 92 ++++- agent_cli/memory/_persistence.py | 105 +++++- agent_cli/memory/_store.py | 151 ++++++++ agent_cli/memory/models.py | 20 ++ agent_cli/summarizer/__init__.py | 31 ++ agent_cli/summarizer/adaptive.py | 502 +++++++++++++++++++++++++++ agent_cli/summarizer/models.py | 220 ++++++++++++ agent_cli/summarizer/prompts.py | 135 +++++++ agent_cli/summarizer/utils.py | 258 ++++++++++++++ pyproject.toml | 8 +- tests/memory/test_store.py | 226 ++++++++++++ tests/summarizer/__init__.py | 1 + tests/summarizer/test_adaptive.py | 434 +++++++++++++++++++++++ tests/summarizer/test_integration.py | 466 +++++++++++++++++++++++++ tests/summarizer/test_models.py | 332 ++++++++++++++++++ tests/summarizer/test_prompts.py | 180 ++++++++++ tests/summarizer/test_utils.py | 193 ++++++++++ 18 files changed, 3386 insertions(+), 5 deletions(-) create mode 100644 agent_cli/summarizer/__init__.py create mode 100644 agent_cli/summarizer/adaptive.py create mode 100644 agent_cli/summarizer/models.py create mode 100644 agent_cli/summarizer/prompts.py create mode 100644 
agent_cli/summarizer/utils.py create mode 100644 tests/summarizer/__init__.py create mode 100644 tests/summarizer/test_adaptive.py create mode 100644 tests/summarizer/test_integration.py create mode 100644 tests/summarizer/test_models.py create mode 100644 tests/summarizer/test_prompts.py create mode 100644 tests/summarizer/test_utils.py diff --git a/agent_cli/memory/_files.py b/agent_cli/memory/_files.py index 536e49e80..65fbbc1b2 100644 --- a/agent_cli/memory/_files.py +++ b/agent_cli/memory/_files.py @@ -23,6 +23,11 @@ _SNAPSHOT_FILENAME = "memory_index.json" _DELETED_DIRNAME = "deleted" +# Summary level constants for hierarchical file structure +_SUMMARY_LEVEL_L1 = 1 +_SUMMARY_LEVEL_L2 = 2 +_SUMMARY_LEVEL_L3 = 3 + @dataclass class MemoryFileRecord: @@ -89,6 +94,16 @@ def write_memory_file( summary_kind: str | None = None, doc_id: str | None = None, source_id: str | None = None, + # Hierarchical summary fields + level: int | None = None, + is_final: bool | None = None, + chunk_index: int | None = None, + parent_group: int | None = None, + group_index: int | None = None, + input_tokens: int | None = None, + output_tokens: int | None = None, + compression_ratio: float | None = None, + summary_level_name: str | None = None, ) -> MemoryFileRecord: """Render and persist a memory document to disk.""" entries_dir, _ = ensure_store_dirs(root) @@ -97,7 +112,18 @@ def write_memory_file( safe_ts = _safe_timestamp(created_at) # Route by role/category for readability - if summary_kind: + if summary_kind and level is not None: + # Hierarchical summary file structure + if level == _SUMMARY_LEVEL_L1: + subdir = Path("summaries") / "L1" + filename = f"chunk_{chunk_index or 0}.md" + elif level == _SUMMARY_LEVEL_L2: + subdir = Path("summaries") / "L2" + filename = f"group_{group_index or 0}.md" + else: # level == _SUMMARY_LEVEL_L3 + subdir = Path("summaries") / "L3" + filename = "final.md" + elif summary_kind: subdir = Path("summaries") filename = "summary.md" elif role == "user": 
@@ -119,6 +145,15 @@ def write_memory_file( created_at=created_at, summary_kind=summary_kind, source_id=source_id, + level=level, + is_final=is_final, + chunk_index=chunk_index, + parent_group=parent_group, + group_index=group_index, + input_tokens=input_tokens, + output_tokens=output_tokens, + compression_ratio=compression_ratio, + summary_level_name=summary_level_name, ) front_matter = _render_front_matter(doc_id, metadata) diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py index 266b9f80f..6673000c1 100644 --- a/agent_cli/memory/_ingest.py +++ b/agent_cli/memory/_ingest.py @@ -10,7 +10,12 @@ from uuid import uuid4 from agent_cli.memory._git import commit_changes -from agent_cli.memory._persistence import delete_memory_files, persist_entries, persist_summary +from agent_cli.memory._persistence import ( + delete_memory_files, + persist_entries, + persist_hierarchical_summary, + persist_summary, +) from agent_cli.memory._prompt import ( FACT_INSTRUCTIONS, FACT_SYSTEM_PROMPT, @@ -34,6 +39,8 @@ from chromadb import Collection + from agent_cli.summarizer import SummaryResult + LOGGER = logging.getLogger(__name__) _SUMMARY_ROLE = "summary" @@ -285,7 +292,12 @@ async def update_summary( model: str, max_tokens: int = 256, ) -> str | None: - """Update the conversation summary based on new facts.""" + """Update the conversation summary based on new facts. + + This is the simple Mem0-style rolling summary that incrementally + updates based on new facts. For full content adaptive summarization, + use `summarize_content` instead. + """ if not new_facts: return prior_summary @@ -311,6 +323,82 @@ async def update_summary( return result.output.summary or prior_summary +async def summarize_content( + *, + content: str, + prior_summary: str | None = None, + content_type: str = "general", + openai_base_url: str, + api_key: str | None, + model: str, +) -> SummaryResult: + """Adaptively summarize content based on its length. 
+ + Uses the AdaptiveSummarizer to automatically select the appropriate + summarization strategy (NONE, BRIEF, STANDARD, DETAILED, HIERARCHICAL) + based on input token count. + + Args: + content: The content to summarize. + prior_summary: Optional prior summary for context continuity. + content_type: Type of content ("general", "conversation", "journal", "document"). + openai_base_url: Base URL for OpenAI-compatible API. + api_key: API key for the LLM. + model: Model name to use for summarization. + + Returns: + SummaryResult with the summary and metadata. + + """ + # Import here to avoid circular imports and allow optional dependency + from agent_cli.summarizer import AdaptiveSummarizer # noqa: PLC0415 + + summarizer = AdaptiveSummarizer( + openai_base_url=openai_base_url, + model=model, + api_key=api_key, + ) + return await summarizer.summarize( + content=content, + prior_summary=prior_summary, + content_type=content_type, + ) + + +async def store_adaptive_summary( + collection: Collection, + memory_root: Path, + conversation_id: str, + summary_result: SummaryResult, +) -> list[str]: + """Store an adaptive summary result to files and ChromaDB. + + This stores all levels of a hierarchical summary (L1, L2, L3) or + just the final summary for simpler levels. Old summaries are deleted first. + + Files are stored as Markdown with YAML front matter in a hierarchical structure: + - summaries/L1/chunk_{n}.md - L1 chunk summaries + - summaries/L2/group_{n}.md - L2 group summaries + - summaries/L3/final.md - L3 final summary + + Args: + collection: ChromaDB collection. + memory_root: Root path for memory files. + conversation_id: The conversation this summary belongs to. + summary_result: The result from AdaptiveSummarizer.summarize(). + + Returns: + List of IDs that were stored. 
+ + """ + return persist_hierarchical_summary( + collection, + memory_root=memory_root, + conversation_id=conversation_id, + summary_result=summary_result, + ) + + async def extract_and_store_facts_and_summaries( *, collection: Collection, diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py index bd8f4dfd4..9c38f7315 100644 --- a/agent_cli/memory/_persistence.py +++ b/agent_cli/memory/_persistence.py @@ -3,10 +3,13 @@ from __future__ import annotations import logging +import shutil +from datetime import UTC, datetime from typing import TYPE_CHECKING from agent_cli.memory._files import ( _DELETED_DIRNAME, + _slugify, ensure_store_dirs, load_snapshot, read_memory_file, @@ -14,7 +17,13 @@ write_memory_file, write_snapshot, ) -from agent_cli.memory._store import delete_entries, list_conversation_entries, upsert_memories +from agent_cli.memory._store import ( + delete_entries, + delete_summaries, + list_conversation_entries, + upsert_hierarchical_summary, + upsert_memories, +) from agent_cli.memory.entities import Fact, Summary, Turn if TYPE_CHECKING: @@ -23,6 +32,7 @@ from chromadb import Collection from agent_cli.memory.models import MemoryMetadata + from agent_cli.summarizer import SummaryResult LOGGER = logging.getLogger(__name__) @@ -180,3 +190,96 @@ def evict_if_needed( ids_to_remove = [e.id for e in overflow] delete_entries(collection, ids_to_remove) delete_memory_files(memory_root, conversation_id, ids_to_remove) + + +def persist_hierarchical_summary( + collection: Collection, + *, + memory_root: Path, + conversation_id: str, + summary_result: SummaryResult, +) -> list[str]: + """Persist a hierarchical summary to disk and ChromaDB. + + This function: + 1. Deletes existing summaries (files and ChromaDB entries) + 2. Writes new summary files to disk in hierarchical structure + 3. Stores entries in ChromaDB + + Args: + collection: ChromaDB collection. + memory_root: Root path for memory files. 
+ conversation_id: The conversation this summary belongs to. + summary_result: The result from AdaptiveSummarizer.summarize(). + + Returns: + List of IDs that were stored. + + """ + from agent_cli.summarizer import SummaryLevel # noqa: PLC0415 + + # Skip if no summary needed + if summary_result.level == SummaryLevel.NONE: + return [] + + # Delete existing summary files + _delete_summary_files(memory_root, conversation_id) + + # Delete existing ChromaDB entries + delete_summaries(collection, conversation_id) + + # Get storage metadata from SummaryResult + entries = summary_result.to_storage_metadata(conversation_id) + if not entries: + return [] + + stored_ids: list[str] = [] + created_at = datetime.now(UTC).isoformat() + + for entry in entries: + meta = entry["metadata"] + record = write_memory_file( + memory_root, + conversation_id=meta["conversation_id"], + role=meta["role"], + created_at=meta.get("created_at", created_at), + content=entry["content"], + summary_kind="summary", + doc_id=entry["id"], + level=meta.get("level"), + is_final=meta.get("is_final"), + chunk_index=meta.get("chunk_index"), + parent_group=meta.get("parent_group"), + group_index=meta.get("group_index"), + input_tokens=meta.get("input_tokens"), + output_tokens=meta.get("output_tokens"), + compression_ratio=meta.get("compression_ratio"), + summary_level_name=meta.get("summary_level"), + ) + LOGGER.info("Persisted summary file: %s (level=%s)", record.path, meta.get("level")) + stored_ids.append(record.id) + + # Store in ChromaDB + upsert_hierarchical_summary(collection, conversation_id, summary_result) + + return stored_ids + + +def _delete_summary_files(memory_root: Path, conversation_id: str) -> None: + """Delete all summary files for a conversation.""" + entries_dir, _ = ensure_store_dirs(memory_root) + safe_conversation = _slugify(conversation_id) + summaries_dir = entries_dir / safe_conversation / "summaries" + + if summaries_dir.exists(): + # Move to deleted folder instead of hard delete + 
deleted_dir = entries_dir / _DELETED_DIRNAME / safe_conversation / "summaries" + deleted_dir.parent.mkdir(parents=True, exist_ok=True) + + # If deleted summaries already exist, remove them first + if deleted_dir.exists(): + shutil.rmtree(deleted_dir) + + # Move current summaries to deleted + shutil.move(str(summaries_dir), str(deleted_dir)) + LOGGER.info("Moved old summaries to deleted: %s", deleted_dir) diff --git a/agent_cli/memory/_store.py b/agent_cli/memory/_store.py index 96e7c66af..4f3755b12 100644 --- a/agent_cli/memory/_store.py +++ b/agent_cli/memory/_store.py @@ -167,3 +167,154 @@ def list_conversation_entries( def delete_entries(collection: Collection, ids: list[str]) -> None: """Delete entries by ID.""" delete_docs(collection, ids) + + +def upsert_hierarchical_summary( + collection: Collection, + conversation_id: str, + summary_result: Any, +) -> list[str]: + """Store all levels of a hierarchical summary. + + Uses SummaryResult.to_storage_metadata() to generate ChromaDB entries + for L1 (chunk), L2 (group), and L3 (final) summaries. + + Args: + collection: ChromaDB collection. + conversation_id: The conversation this summary belongs to. + summary_result: A SummaryResult from the adaptive summarizer. + + Returns: + List of IDs that were upserted. 
+ + """ + entries = summary_result.to_storage_metadata(conversation_id) + if not entries: + return [] + + ids: list[str] = [] + contents: list[str] = [] + metadatas: list[MemoryMetadata] = [] + + for entry in entries: + ids.append(entry["id"]) + contents.append(entry["content"]) + # Convert the raw metadata dict to MemoryMetadata + meta_dict = entry["metadata"] + metadatas.append( + MemoryMetadata( + conversation_id=meta_dict["conversation_id"], + role=meta_dict["role"], + created_at=meta_dict["created_at"], + level=meta_dict.get("level"), + is_final=meta_dict.get("is_final"), + chunk_index=meta_dict.get("chunk_index"), + parent_group=meta_dict.get("parent_group"), + group_index=meta_dict.get("group_index"), + input_tokens=meta_dict.get("input_tokens"), + output_tokens=meta_dict.get("output_tokens"), + compression_ratio=meta_dict.get("compression_ratio"), + summary_level_name=meta_dict.get("summary_level"), + ), + ) + + upsert_memories(collection, ids=ids, contents=contents, metadatas=metadatas) + return ids + + +def get_summary_at_level( + collection: Collection, + conversation_id: str, + level: int, +) -> list[StoredMemory]: + """Retrieve summaries at a specific level for a conversation. + + Args: + collection: ChromaDB collection. + conversation_id: The conversation to retrieve summaries for. + level: Summary level (1=chunk, 2=group, 3=final). + + Returns: + List of StoredMemory entries at the requested level. 
+ + """ + filters: list[dict[str, Any]] = [ + {"conversation_id": conversation_id}, + {"role": "summary"}, + {"level": level}, + ] + result = collection.get(where={"$and": filters}) + docs = result.get("documents") or [] + metas = result.get("metadatas") or [] + ids = result.get("ids") or [] + + records: list[StoredMemory] = [] + for doc, meta, entry_id in zip(docs, metas, ids, strict=False): + records.append( + StoredMemory( + id=entry_id, + content=doc, + metadata=MemoryMetadata(**dict(meta)), + distance=None, + ), + ) + return records + + +def get_final_summary( + collection: Collection, + conversation_id: str, +) -> StoredMemory | None: + """Get the L3 (final) summary for a conversation. + + This is a convenience wrapper around get_summary_at_level for the + most common use case of retrieving the top-level summary. + + Args: + collection: ChromaDB collection. + conversation_id: The conversation to retrieve the summary for. + + Returns: + The final summary entry, or None if not found. + + """ + summaries = get_summary_at_level(collection, conversation_id, level=3) + # Return the one marked as final, or the first if none marked + for summary in summaries: + if summary.metadata.is_final: + return summary + return summaries[0] if summaries else None + + +def delete_summaries( + collection: Collection, + conversation_id: str, + *, + levels: list[int] | None = None, +) -> int: + """Delete summary entries for a conversation. + + Args: + collection: ChromaDB collection. + conversation_id: The conversation to delete summaries from. + levels: Optional list of levels to delete. If None, deletes all levels. + + Returns: + Number of entries deleted. 
+ + """ + filters: list[dict[str, Any]] = [ + {"conversation_id": conversation_id}, + {"role": "summary"}, + ] + if levels: + filters.append({"level": {"$in": levels}}) + + # First get the IDs to count them + result = collection.get(where={"$and": filters}) + ids = result.get("ids") or [] + + if ids: + delete_docs(collection, list(ids)) + + return len(ids) diff --git a/agent_cli/memory/models.py b/agent_cli/memory/models.py index 9ef076d57..6dc689d8f 100644 --- a/agent_cli/memory/models.py +++ b/agent_cli/memory/models.py @@ -49,6 +49,26 @@ class MemoryMetadata(BaseModel): replaced_by: str | None = None source_id: str | None = None + # Hierarchical summary fields (only used when role="summary") + level: int | None = None + """Summary level: 1=chunk, 2=group, 3=final.""" + is_final: bool | None = None + """Whether this is the final L3 summary.""" + chunk_index: int | None = None + """For L1 summaries: index of the source chunk.""" + parent_group: int | None = None + """For L1 summaries: which L2 group this chunk belongs to.""" + group_index: int | None = None + """For L2 summaries: index of this group.""" + input_tokens: int | None = None + """Number of tokens in the original input (L3 only).""" + output_tokens: int | None = None + """Number of tokens in the summary output (L3 only).""" + compression_ratio: float | None = None + """Ratio of output to input tokens (L3 only).""" + summary_level_name: str | None = None + """Name of the SummaryLevel enum used (e.g., 'STANDARD', 'HIERARCHICAL').""" + class SummaryOutput(BaseModel): """Structured summary returned by the LLM.""" diff --git a/agent_cli/summarizer/__init__.py b/agent_cli/summarizer/__init__.py new file mode 100644 index 000000000..c6f1d85a1 --- /dev/null +++ b/agent_cli/summarizer/__init__.py @@ -0,0 +1,31 @@ +"""Adaptive summarization module for variable-length content. 
+ +This module provides research-grounded summarization that scales with input complexity, +inspired by Letta (partial eviction, middle truncation) and Mem0 (rolling summaries, +compression ratios) architectures. + +Example: + from agent_cli.summarizer import AdaptiveSummarizer, SummaryLevel + + summarizer = AdaptiveSummarizer( + openai_base_url="http://localhost:8000/v1", + model="gpt-4", + ) + result = await summarizer.summarize(long_document) + print(f"Level: {result.level}, Compression: {result.compression_ratio:.1%}") + +""" + +from agent_cli.summarizer.adaptive import AdaptiveSummarizer +from agent_cli.summarizer.models import ( + HierarchicalSummary, + SummaryLevel, + SummaryResult, +) + +__all__ = [ + "AdaptiveSummarizer", + "HierarchicalSummary", + "SummaryLevel", + "SummaryResult", +] diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py new file mode 100644 index 000000000..ed0074d87 --- /dev/null +++ b/agent_cli/summarizer/adaptive.py @@ -0,0 +1,502 @@ +"""Adaptive summarization that scales with input complexity. 
+ +This module implements research-grounded summarization inspired by: +- Letta: Partial eviction (30%), middle truncation, fire-and-forget background processing +- Mem0: Rolling summaries, 90%+ compression, two-phase architecture + +Reference: arXiv:2504.19413 (Mem0), arXiv:2310.08560 (MemGPT/Letta) +""" + +from __future__ import annotations + +import asyncio +import logging + +import httpx +from pydantic import BaseModel +from pydantic_ai import Agent +from pydantic_ai.models.openai import OpenAIChatModel +from pydantic_ai.providers.openai import OpenAIProvider +from pydantic_ai.settings import ModelSettings + +from agent_cli.summarizer.models import ( + ChunkSummary, + HierarchicalSummary, + SummaryLevel, + SummaryResult, +) +from agent_cli.summarizer.prompts import ( + BRIEF_SUMMARY_PROMPT, + CHUNK_SUMMARY_PROMPT, + META_SUMMARY_PROMPT, + ROLLING_SUMMARY_PROMPT, + format_prior_context, + format_summaries_for_meta, + get_prompt_for_content_type, +) +from agent_cli.summarizer.utils import ( + chunk_text, + count_tokens, + estimate_summary_tokens, + tokens_to_words, +) + +logger = logging.getLogger(__name__) + +# Thresholds for summary levels (in tokens) +LEVEL_THRESHOLDS = { + SummaryLevel.NONE: 100, + SummaryLevel.BRIEF: 500, + SummaryLevel.STANDARD: 3000, + SummaryLevel.DETAILED: 15000, + # HIERARCHICAL is everything above DETAILED +} + +# Number of L1 chunks to group together for L2 summaries +L2_GROUP_SIZE = 5 +# Minimum number of L1 chunks before L2 grouping is applied +L2_MIN_CHUNKS = 5 + + +class SummaryOutput(BaseModel): + """Structured output for summary generation.""" + + summary: str + + +class AdaptiveSummarizer: + """Adaptive summarization that scales with input complexity. 
+ + Automatically selects the appropriate summarization strategy based on + input length: + - NONE (< 100 tokens): No summary needed + - BRIEF (100-500 tokens): Single sentence + - STANDARD (500-3000 tokens): Paragraph summary + - DETAILED (3000-15000 tokens): Chunked + meta-summary + - HIERARCHICAL (> 15000 tokens): Multi-level tree of summaries + + Example: + summarizer = AdaptiveSummarizer( + openai_base_url="http://localhost:8000/v1", + model="llama3.1:8b", + ) + result = await summarizer.summarize(long_document) + print(f"Level: {result.level.name}") + print(f"Summary: {result.summary}") + print(f"Compression: {result.compression_ratio:.1%}") + + """ + + def __init__( + self, + openai_base_url: str, + model: str, + api_key: str | None = None, + chunk_size: int = 3000, + chunk_overlap: int = 200, + max_concurrent_chunks: int = 5, + timeout: float = 60.0, + ) -> None: + """Initialize the adaptive summarizer. + + Args: + openai_base_url: Base URL for OpenAI-compatible API. + model: Model name to use for summarization. + api_key: API key (optional for local models). + chunk_size: Target token count per chunk for hierarchical summarization. + chunk_overlap: Token overlap between chunks. + max_concurrent_chunks: Maximum parallel chunk summarizations. + timeout: Request timeout in seconds. + + """ + self.openai_base_url = openai_base_url.rstrip("/") + self.model = model + self.api_key = api_key or "not-needed" + self.chunk_size = chunk_size + self.chunk_overlap = chunk_overlap + self.max_concurrent_chunks = max_concurrent_chunks + self.timeout = timeout + + self._provider = OpenAIProvider(api_key=self.api_key, base_url=self.openai_base_url) + + def determine_level(self, token_count: int) -> SummaryLevel: + """Determine the appropriate summary level based on token count. + + Args: + token_count: Number of tokens in the input. + + Returns: + The recommended SummaryLevel. 
+ + """ + if token_count < LEVEL_THRESHOLDS[SummaryLevel.NONE]: + return SummaryLevel.NONE + if token_count < LEVEL_THRESHOLDS[SummaryLevel.BRIEF]: + return SummaryLevel.BRIEF + if token_count < LEVEL_THRESHOLDS[SummaryLevel.STANDARD]: + return SummaryLevel.STANDARD + if token_count < LEVEL_THRESHOLDS[SummaryLevel.DETAILED]: + return SummaryLevel.DETAILED + return SummaryLevel.HIERARCHICAL + + async def summarize( + self, + content: str, + prior_summary: str | None = None, + content_type: str = "general", + ) -> SummaryResult: + """Summarize content with adaptive strategy based on length. + + Args: + content: The content to summarize. + prior_summary: Optional prior summary for context continuity. + content_type: Type of content ("general", "conversation", "journal", "document"). + + Returns: + SummaryResult with summary and metadata. + + """ + if not content or not content.strip(): + return SummaryResult( + level=SummaryLevel.NONE, + summary=None, + hierarchical=None, + input_tokens=0, + output_tokens=0, + compression_ratio=0.0, + ) + + input_tokens = count_tokens(content, self.model) + level = self.determine_level(input_tokens) + + logger.info( + "Summarizing %d tokens at level %s (type=%s)", + input_tokens, + level.name, + content_type, + ) + + if level == SummaryLevel.NONE: + return SummaryResult( + level=level, + summary=None, + hierarchical=None, + input_tokens=input_tokens, + output_tokens=0, + compression_ratio=0.0, + ) + + if level == SummaryLevel.BRIEF: + summary = await self._brief_summary(content) + elif level == SummaryLevel.STANDARD: + summary = await self._standard_summary(content, prior_summary, content_type) + elif level == SummaryLevel.DETAILED: + return await self._detailed_summary(content, input_tokens) + else: # HIERARCHICAL + return await self._hierarchical_summary(content, input_tokens) + + output_tokens = count_tokens(summary, self.model) if summary else 0 + compression_ratio = output_tokens / input_tokens if input_tokens > 0 else 0.0 + + 
return SummaryResult( + level=level, + summary=summary, + hierarchical=None, + input_tokens=input_tokens, + output_tokens=output_tokens, + compression_ratio=compression_ratio, + ) + + async def update_rolling_summary( + self, + prior_summary: str | None, + new_facts: list[str], + ) -> str: + """Update a rolling summary with new facts (Mem0-style). + + This is optimized for incremental updates where you have discrete + new facts to integrate into an existing summary. + + Args: + prior_summary: The existing summary to update. + new_facts: List of new facts to integrate. + + Returns: + Updated summary string. + + """ + if not new_facts: + return prior_summary or "" + + new_content = "\n".join(f"- {fact}" for fact in new_facts) + combined_tokens = count_tokens( + (prior_summary or "") + new_content, + self.model, + ) + + target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD) + max_words = tokens_to_words(target_tokens) + + prompt = ROLLING_SUMMARY_PROMPT.format( + prior_summary=prior_summary or "(No prior summary)", + new_content=new_content, + max_words=max_words, + ) + + return await self._generate_summary(prompt, max_tokens=target_tokens + 50) + + async def _brief_summary(self, content: str) -> str: + """Generate a single-sentence summary for brief content.""" + prompt = BRIEF_SUMMARY_PROMPT.format(content=content) + return await self._generate_summary(prompt, max_tokens=50) + + async def _standard_summary( + self, + content: str, + prior_summary: str | None, + content_type: str, + ) -> str: + """Generate a paragraph summary for standard-length content.""" + input_tokens = count_tokens(content, self.model) + target_tokens = estimate_summary_tokens(input_tokens, SummaryLevel.STANDARD) + max_words = tokens_to_words(target_tokens) + + prompt_template = get_prompt_for_content_type(content_type) + prior_context = format_prior_context(prior_summary) + + prompt = prompt_template.format( + content=content, + prior_context=prior_context, + 
max_words=max_words, + ) + + return await self._generate_summary(prompt, max_tokens=target_tokens + 50) + + async def _detailed_summary(self, content: str, input_tokens: int) -> SummaryResult: + """Generate chunked summaries with meta-summary for detailed content.""" + chunks = chunk_text( + content, + chunk_size=self.chunk_size, + overlap=self.chunk_overlap, + model=self.model, + ) + + logger.info("Detailed summary: processing %d chunks", len(chunks)) + + # Summarize chunks (with concurrency limit) + semaphore = asyncio.Semaphore(self.max_concurrent_chunks) + + async def summarize_chunk(idx: int, chunk: str) -> ChunkSummary: + async with semaphore: + chunk_tokens = count_tokens(chunk, self.model) + target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD) + max_words = tokens_to_words(target_tokens) + + prompt = CHUNK_SUMMARY_PROMPT.format( + chunk_index=idx + 1, + total_chunks=len(chunks), + content=chunk, + max_words=max_words, + ) + + summary = await self._generate_summary(prompt, max_tokens=target_tokens + 50) + summary_tokens = count_tokens(summary, self.model) + + return ChunkSummary( + chunk_index=idx, + content=summary, + token_count=summary_tokens, + source_tokens=chunk_tokens, + parent_group=None, + ) + + chunk_summaries = await asyncio.gather( + *[summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)], + ) + + # Generate meta-summary + all_summaries = [cs.content for cs in chunk_summaries] + meta_target = estimate_summary_tokens(input_tokens, SummaryLevel.DETAILED) + max_words = tokens_to_words(meta_target) + + meta_prompt = META_SUMMARY_PROMPT.format( + summaries=format_summaries_for_meta(all_summaries), + max_words=max_words, + ) + + final_summary = await self._generate_summary(meta_prompt, max_tokens=meta_target + 100) + output_tokens = count_tokens(final_summary, self.model) + + hierarchical = HierarchicalSummary( + l1_summaries=list(chunk_summaries), + l2_summaries=[], # Not used for DETAILED level + 
l3_summary=final_summary, + chunk_size=self.chunk_size, + chunk_overlap=self.chunk_overlap, + ) + + return SummaryResult( + level=SummaryLevel.DETAILED, + summary=final_summary, + hierarchical=hierarchical, + input_tokens=input_tokens, + output_tokens=output_tokens, + compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + ) + + async def _hierarchical_summary(self, content: str, input_tokens: int) -> SummaryResult: + """Build a tree of summaries for very long content. + + Structure: + - L1: Individual chunk summaries + - L2: Group summaries (groups of ~5 L1 summaries) + - L3: Final synthesis + """ + chunks = chunk_text( + content, + chunk_size=self.chunk_size, + overlap=self.chunk_overlap, + model=self.model, + ) + + logger.info("Hierarchical summary: processing %d chunks in tree", len(chunks)) + + # L1: Summarize each chunk + semaphore = asyncio.Semaphore(self.max_concurrent_chunks) + + async def summarize_chunk(idx: int, chunk: str) -> ChunkSummary: + async with semaphore: + chunk_tokens = count_tokens(chunk, self.model) + target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD) + max_words = tokens_to_words(target_tokens) + + prompt = CHUNK_SUMMARY_PROMPT.format( + chunk_index=idx + 1, + total_chunks=len(chunks), + content=chunk, + max_words=max_words, + ) + + summary = await self._generate_summary(prompt, max_tokens=target_tokens + 50) + summary_tokens = count_tokens(summary, self.model) + + # Assign to group (5 chunks per group) + group_idx = idx // 5 + + return ChunkSummary( + chunk_index=idx, + content=summary, + token_count=summary_tokens, + source_tokens=chunk_tokens, + parent_group=group_idx, + ) + + l1_summaries = await asyncio.gather( + *[summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)], + ) + + # L2: Group summaries (if more than L2_MIN_CHUNKS chunks) + l2_summaries: list[str] = [] + if len(l1_summaries) > L2_MIN_CHUNKS: + groups: list[list[str]] = [] + for i in range(0, len(l1_summaries), 
L2_GROUP_SIZE): + group = [cs.content for cs in l1_summaries[i : i + L2_GROUP_SIZE]] + groups.append(group) + + async def summarize_group(group: list[str]) -> str: + combined_tokens = sum(count_tokens(s, self.model) for s in group) + target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD) + max_words = tokens_to_words(target_tokens) + + prompt = META_SUMMARY_PROMPT.format( + summaries=format_summaries_for_meta(group), + max_words=max_words, + ) + return await self._generate_summary(prompt, max_tokens=target_tokens + 50) + + l2_summaries = await asyncio.gather(*[summarize_group(g) for g in groups]) + + # L3: Final synthesis + summaries_to_synthesize = ( + l2_summaries if l2_summaries else [cs.content for cs in l1_summaries] + ) + final_target = estimate_summary_tokens(input_tokens, SummaryLevel.HIERARCHICAL) + max_words = tokens_to_words(final_target) + + final_prompt = META_SUMMARY_PROMPT.format( + summaries=format_summaries_for_meta(summaries_to_synthesize), + max_words=max_words, + ) + + final_summary = await self._generate_summary(final_prompt, max_tokens=final_target + 100) + output_tokens = count_tokens(final_summary, self.model) + + hierarchical = HierarchicalSummary( + l1_summaries=list(l1_summaries), + l2_summaries=list(l2_summaries), + l3_summary=final_summary, + chunk_size=self.chunk_size, + chunk_overlap=self.chunk_overlap, + ) + + return SummaryResult( + level=SummaryLevel.HIERARCHICAL, + summary=final_summary, + hierarchical=hierarchical, + input_tokens=input_tokens, + output_tokens=output_tokens, + compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + ) + + async def _generate_summary(self, prompt: str, max_tokens: int = 256) -> str: + """Generate a summary using the LLM. + + Uses PydanticAI for structured output with fallback to raw generation. 
+ """ + model = OpenAIChatModel( + model_name=self.model, + provider=self._provider, + settings=ModelSettings( + temperature=0.3, + max_tokens=max_tokens, + ), + ) + + agent = Agent( + model=model, + system_prompt="You are a concise summarizer. Output only the summary, no preamble.", + output_type=SummaryOutput, + retries=2, + ) + + try: + result = await agent.run(prompt) + return result.output.summary.strip() + except Exception as e: + logger.warning("Structured summary failed, trying raw generation: %s", e) + # Fallback to raw HTTP call + return await self._raw_generate(prompt, max_tokens) + + async def _raw_generate(self, prompt: str, max_tokens: int) -> str: + """Fallback raw HTTP generation without structured output.""" + async with httpx.AsyncClient(timeout=self.timeout) as client: + response = await client.post( + f"{self.openai_base_url}/chat/completions", + headers={"Authorization": f"Bearer {self.api_key}"}, + json={ + "model": self.model, + "messages": [ + {"role": "system", "content": "You are a concise summarizer."}, + {"role": "user", "content": prompt}, + ], + "temperature": 0.3, + "max_tokens": max_tokens, + }, + ) + response.raise_for_status() + data = response.json() + + choices = data.get("choices", []) + if choices: + return choices[0].get("message", {}).get("content", "").strip() + return "" diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py new file mode 100644 index 000000000..f231a41e5 --- /dev/null +++ b/agent_cli/summarizer/models.py @@ -0,0 +1,220 @@ +"""Data models for adaptive summarization.""" + +from __future__ import annotations + +from datetime import datetime +from enum import IntEnum +from typing import Any + +from pydantic import BaseModel, Field + +# Hierarchical level constants for storage +HIERARCHICAL_LEVEL_L1 = 1 +HIERARCHICAL_LEVEL_L2 = 2 +HIERARCHICAL_LEVEL_L3 = 3 + + +class SummaryLevel(IntEnum): + """Summary granularity levels based on input complexity. 
+ + Thresholds are based on Mem0 research showing optimal compression ratios + at different content lengths. Token counts are approximate guidelines. + """ + + NONE = 0 + """< 100 tokens: No summary needed, facts only.""" + + BRIEF = 1 + """100-500 tokens: Single-sentence summary (~20% compression).""" + + STANDARD = 2 + """500-3000 tokens: Paragraph summary (~12% compression).""" + + DETAILED = 3 + """3000-15000 tokens: Chunked summaries + meta-summary (~7% compression).""" + + HIERARCHICAL = 4 + """> 15000 tokens: Tree of summaries with multiple levels.""" + + +class ChunkSummary(BaseModel): + """Summary of a single chunk within a hierarchical summary.""" + + chunk_index: int = Field(..., description="Index of this chunk in the original content") + content: str = Field(..., description="The summarized content of this chunk") + token_count: int = Field(..., ge=0, description="Token count of this summary") + source_tokens: int = Field(..., ge=0, description="Token count of the source chunk") + parent_group: int | None = Field( + default=None, + description="Index of the L2 group this chunk belongs to", + ) + + +class HierarchicalSummary(BaseModel): + """A hierarchical summary with multiple levels. 
def _utc_now_naive() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Drop-in replacement for ``datetime.utcnow()``, which is deprecated since
    Python 3.12. The tzinfo is stripped on purpose so that ``isoformat()``
    keeps producing the same offset-free timestamps as before.
    """
    from datetime import timezone  # noqa: PLC0415

    return datetime.now(timezone.utc).replace(tzinfo=None)


class SummaryResult(BaseModel):
    """Result of adaptive summarization.

    Contains the summary at the appropriate level for the input complexity,
    along with metadata about the compression achieved.
    """

    level: SummaryLevel = Field(..., description="The summarization level used")
    summary: str | None = Field(
        default=None,
        description="The final summary text (None for NONE level)",
    )
    hierarchical: HierarchicalSummary | None = Field(
        default=None,
        description="Full hierarchical structure (for DETAILED/HIERARCHICAL levels)",
    )
    input_tokens: int = Field(..., ge=0, description="Token count of the input content")
    output_tokens: int = Field(..., ge=0, description="Token count of the summary")
    compression_ratio: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="Ratio of output to input tokens (lower = more compression)",
    )
    created_at: datetime = Field(
        # Naive UTC timestamp; avoids the deprecated datetime.utcnow().
        default_factory=_utc_now_naive,
        description="Timestamp when summary was created",
    )

    @property
    def chunk_summaries(self) -> list[str] | None:
        """Return the L1 chunk summary texts, or None without a hierarchy."""
        if self.hierarchical:
            return [cs.content for cs in self.hierarchical.l1_summaries]
        return None

    def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]:
        """Convert to metadata entries for ChromaDB storage.

        Args:
            conversation_id: Conversation the summary belongs to; used as the
                prefix of every entry id and stored in each metadata dict.

        Returns:
            A list of ``{"id", "content", "metadata"}`` dicts, one per stored
            summary. Empty for the NONE level, and also when there is neither
            a hierarchy nor a non-empty ``summary``.

        """
        entries: list[dict[str, Any]] = []
        # One timestamp string shared by every entry of this result.
        timestamp = self.created_at.isoformat()

        if self.level == SummaryLevel.NONE:
            return entries

        # For hierarchical summaries, store each level
        if self.hierarchical:
            # L1: Individual chunk summaries
            # NOTE(review): parent_group may be None here; confirm the target
            # store accepts None metadata values.
            entries.extend(
                {
                    "id": f"{conversation_id}:summary:L1:{cs.chunk_index}",
                    "content": cs.content,
                    "metadata": {
                        "conversation_id": conversation_id,
                        "role": "summary",
                        "level": HIERARCHICAL_LEVEL_L1,
                        "chunk_index": cs.chunk_index,
                        "parent_group": cs.parent_group,
                        "token_count": cs.token_count,
                        "created_at": timestamp,
                    },
                }
                for cs in self.hierarchical.l1_summaries
            )

            # L2: Group summaries
            entries.extend(
                {
                    "id": f"{conversation_id}:summary:L2:{idx}",
                    "content": l2_summary,
                    "metadata": {
                        "conversation_id": conversation_id,
                        "role": "summary",
                        "level": HIERARCHICAL_LEVEL_L2,
                        "group_index": idx,
                        "created_at": timestamp,
                    },
                }
                for idx, l2_summary in enumerate(self.hierarchical.l2_summaries)
            )

            # L3: Final summary (taken from the hierarchy, not self.summary)
            entries.append(
                {
                    "id": f"{conversation_id}:summary:L3:final",
                    "content": self.hierarchical.l3_summary,
                    "metadata": {
                        "conversation_id": conversation_id,
                        "role": "summary",
                        "level": HIERARCHICAL_LEVEL_L3,
                        "is_final": True,
                        "input_tokens": self.input_tokens,
                        "output_tokens": self.output_tokens,
                        "compression_ratio": self.compression_ratio,
                        "created_at": timestamp,
                    },
                },
            )
        elif self.summary:
            # Non-hierarchical: store the single summary under the same final
            # id, additionally recording which SummaryLevel produced it.
            entries.append(
                {
                    "id": f"{conversation_id}:summary:L3:final",
                    "content": self.summary,
                    "metadata": {
                        "conversation_id": conversation_id,
                        "role": "summary",
                        "level": HIERARCHICAL_LEVEL_L3,
                        "is_final": True,
                        "summary_level": self.level.name,
                        "input_tokens": self.input_tokens,
                        "output_tokens": self.output_tokens,
                        "compression_ratio": self.compression_ratio,
                        "created_at": timestamp,
                    },
                },
            )

        return entries
# All templates below are plain ``str.format`` templates; the placeholders each
# one expects are listed in its comment.

# BRIEF: single-sentence summary.
# Placeholders: {content}.
BRIEF_SUMMARY_PROMPT = """Summarize the following in ONE sentence (maximum 20 words).
Focus on the single most important point or takeaway.

Content:
{content}

One-sentence summary:""".strip()

# STANDARD: short-paragraph summary.
# Placeholders: {prior_context}, {content}, {max_words}.
# {prior_context} sits on its own line, so an empty value leaves blank lines.
STANDARD_SUMMARY_PROMPT = """Summarize the following content concisely in a short paragraph.

Focus on:
- Key facts, decisions, and outcomes
- Important context that should be remembered
- Skip transient details, greetings, and chitchat

{prior_context}

Content to summarize:
{content}

Summary (maximum {max_words} words):""".strip()

# Per-chunk prompt: summarizes one section of a longer document.
# Placeholders: {chunk_index}, {total_chunks}, {content}, {max_words}.
# NOTE(review): presumably used for the chunked DETAILED/HIERARCHICAL levels;
# confirm against the caller.
CHUNK_SUMMARY_PROMPT = """Summarize this section of a longer document.
Capture the main points while preserving important details.

Section {chunk_index} of {total_chunks}:
{content}

Summary of this section (maximum {max_words} words):""".strip()

# Meta-summary: combines several summaries into one overview.
# Placeholders: {summaries}, {max_words}.
META_SUMMARY_PROMPT = """Synthesize these summaries into a single coherent overview.
Identify common themes and key points across all sections.
Eliminate redundancy while preserving unique insights.

Summaries to combine:
{summaries}

Combined summary (maximum {max_words} words):""".strip()

# Rolling summary update (Mem0-style): merges new information into a prior summary.
# Placeholders: {prior_summary}, {new_content}, {max_words}.
ROLLING_SUMMARY_PROMPT = """Update the running summary with new information.
Integrate new facts seamlessly while keeping the summary concise.
Drop redundant or superseded information.
Preserve durable facts about identity, preferences, and important events.

Current summary:
{prior_summary}

New information to integrate:
{new_content}

Updated summary (maximum {max_words} words):""".strip()

# Conversation-specific summarization.
# Placeholders: {content}, {max_words}.
# NOTE(review): unlike STANDARD_SUMMARY_PROMPT this template has no
# {prior_context} placeholder; str.format ignores unused keyword arguments, so
# any prior context passed by a caller would be dropped silently. Confirm that
# is intended.
CONVERSATION_SUMMARY_PROMPT = """Summarize this conversation from the AI assistant's perspective.
Focus on:
- What the user wanted or asked about
- Key information the user shared about themselves
- Decisions made or conclusions reached
- Any commitments or follow-ups mentioned

Conversation:
{content}

Summary (maximum {max_words} words):""".strip()

# Journal / personal content.
# Placeholders: {content}, {max_words} (no {prior_context}; see note above
# CONVERSATION_SUMMARY_PROMPT).
JOURNAL_SUMMARY_PROMPT = """Summarize this personal entry or reflection.
Preserve:
- Key events and experiences mentioned
- Emotions and insights expressed
- Goals, plans, or intentions stated
- People, places, or things that are important

Entry:
{content}

Summary (maximum {max_words} words):""".strip()

# Technical / document content.
# Placeholders: {content}, {max_words} (no {prior_context}; see note above
# CONVERSATION_SUMMARY_PROMPT).
DOCUMENT_SUMMARY_PROMPT = """Summarize this technical content or documentation.
Focus on:
- Main concepts and their relationships
- Key procedures or processes described
- Important specifications or requirements
- Conclusions or recommendations

Document:
{content}

Summary (maximum {max_words} words):""".strip()
def format_summaries_for_meta(summaries: list[str]) -> str:
    """Format a list of summaries for the meta-summary prompt.

    Each summary is labelled ``[Section N]`` (1-based, in input order) on its
    own line, and the labelled sections are separated by one blank line.

    Args:
        summaries: Summary texts to combine, in the order they should appear.

    Returns:
        The joined sections, or an empty string when ``summaries`` is empty.

    """
    return "\n\n".join(
        f"[Section {index}]\n{summary}" for index, summary in enumerate(summaries, 1)
    )
+ + """ + if not text: + return 0 + enc = _get_encoding(model) + return len(enc.encode(text)) + + +def chunk_text( + text: str, + chunk_size: int = 3000, + overlap: int = 200, + model: str = "gpt-4", +) -> list[str]: + """Split text into overlapping chunks by token count. + + Uses semantic boundaries (paragraphs, sentences) when possible to avoid + splitting mid-thought. Falls back to token-based splitting if no good + boundaries are found. + + Args: + text: The text to chunk. + chunk_size: Target token count per chunk. + overlap: Token overlap between chunks for context continuity. + model: Model name for tokenizer. + + Returns: + List of text chunks. + + """ + if not text: + return [] + + total_tokens = count_tokens(text, model) + if total_tokens <= chunk_size: + return [text] + + # Split into paragraphs first + paragraphs = re.split(r"\n\s*\n", text) + paragraphs = [p.strip() for p in paragraphs if p.strip()] + + if not paragraphs: + return [text] + + chunks: list[str] = [] + current_chunk: list[str] = [] + current_tokens = 0 + + for para in paragraphs: + para_tokens = count_tokens(para, model) + + # If single paragraph exceeds chunk size, split it further + if para_tokens > chunk_size: + # Flush current chunk if any + if current_chunk: + chunks.append("\n\n".join(current_chunk)) + current_chunk = [] + current_tokens = 0 + + # Split large paragraph by sentences + sentences = _split_sentences(para) + for sentence in sentences: + sent_tokens = count_tokens(sentence, model) + if current_tokens + sent_tokens > chunk_size and current_chunk: + chunks.append(" ".join(current_chunk)) + # Keep overlap from end of previous chunk + overlap_text = _get_overlap_text(current_chunk, overlap, model) + current_chunk = [overlap_text] if overlap_text else [] + current_tokens = count_tokens(overlap_text, model) if overlap_text else 0 + current_chunk.append(sentence) + current_tokens += sent_tokens + elif current_tokens + para_tokens > chunk_size: + # Flush current chunk and start 
new one + chunks.append("\n\n".join(current_chunk)) + # Keep overlap from end of previous chunk + overlap_text = _get_overlap_text(current_chunk, overlap, model) + current_chunk = [overlap_text, para] if overlap_text else [para] + current_tokens = ( + count_tokens(overlap_text, model) + para_tokens if overlap_text else para_tokens + ) + else: + current_chunk.append(para) + current_tokens += para_tokens + + # Don't forget the last chunk + if current_chunk: + chunks.append("\n\n".join(current_chunk)) + + return chunks + + +def _split_sentences(text: str) -> list[str]: + """Split text into sentences, preserving common abbreviations.""" + # Simple sentence splitting that handles common cases + # Matches period/question/exclamation followed by space and capital letter + sentences = re.split(r"(?<=[.!?])\s+(?=[A-Z])", text) + return [s.strip() for s in sentences if s.strip()] + + +def _get_overlap_text(chunks: list[str], target_tokens: int, model: str) -> str: + """Extract overlap text from end of chunk list. + + Takes text from the end of the chunk list until reaching target_tokens. 
def middle_truncate(
    text: str,
    budget_chars: int,
    head_frac: float = 0.3,
    tail_frac: float = 0.3,
) -> tuple[str, int]:
    """Middle-truncate text to fit within a character budget.

    Keeps the first head_frac and last tail_frac portions, dropping the middle
    and inserting a ``[...N characters truncated...]`` marker in its place.
    This preserves context from both the beginning (often contains setup) and
    end (often contains conclusions/recent events).

    When head + marker + tail would not fit in the budget, the tail is shrunk
    first (from the side next to the cut, so the text's final characters
    survive), then the head. If even the bare marker does not fit, the text is
    cut to its first ``budget_chars`` characters without a marker.

    Args:
        text: Text to truncate.
        budget_chars: Maximum character count for output. Values <= 0 disable
            truncation and return ``text`` unchanged.
        head_frac: Fraction of budget for the head portion.
        tail_frac: Fraction of budget for the tail portion.

    Returns:
        Tuple of (truncated_text, dropped_char_count). The count is the number
        of characters of ``text`` missing from the output and matches the
        number shown in the marker.

    """
    text_len = len(text)
    if budget_chars <= 0 or text_len <= budget_chars:
        return text, 0

    # Clamp so that head alone, and head + tail together, never exceed the
    # budget (guards against fractions above 1 or below 0).
    head_len = min(budget_chars, max(0, int(budget_chars * head_frac)))
    tail_len = min(budget_chars - head_len, max(0, int(budget_chars * tail_frac)))

    # The marker embeds the dropped count, whose digit count can grow as we
    # shrink head/tail, so recompute until everything fits. Each pass removes
    # at least one character or returns, so the loop terminates.
    while True:
        dropped = text_len - head_len - tail_len
        marker = f"\n[...{dropped} characters truncated...]\n"
        overflow = head_len + len(marker) + tail_len - budget_chars
        if overflow <= 0:
            break
        if tail_len:
            tail_len -= min(tail_len, overflow)
        elif head_len:
            head_len -= min(head_len, overflow)
        else:
            # Budget is smaller than the marker itself: plain head cut.
            return text[:budget_chars], text_len - budget_chars

    head = text[:head_len]
    tail = text[text_len - tail_len :] if tail_len else ""
    return head + marker + tail, dropped
+ """ + return int(tokens * 0.75) diff --git a/pyproject.toml b/pyproject.toml index dcc98fed8..44fcc0403 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,11 @@ vectordb = [ "watchfiles>=0.21.0", ] rag = ["agent-cli[vectordb]", "markitdown[docx,pdf,pptx]>=0.1.3"] -memory = ["agent-cli[vectordb]", "pyyaml>=6.0.0"] +memory = [ + "agent-cli[vectordb]", + "pyyaml>=6.0.0", + "tiktoken>=0.5.0", # For token counting in adaptive summarization +] # Feature extras vad = ["onnxruntime>=1.16.0"] @@ -82,6 +86,7 @@ test = [ "pytest-cov>=4.0.0", "pytest-timeout", "pytest-mock", + "tiktoken>=0.5.0", # For summarizer tests ] dev = [ "agent-cli[test]", @@ -113,6 +118,7 @@ dev = [ "pre-commit-uv>=4.1.4", "zensical", "markdown-gfm-admonition", + "tiktoken>=0.5.0", # For summarizer tests ] [project.scripts] diff --git a/tests/memory/test_store.py b/tests/memory/test_store.py index 98334e459..3edd0eeb9 100644 --- a/tests/memory/test_store.py +++ b/tests/memory/test_store.py @@ -148,3 +148,229 @@ def test_upsert_and_delete_entries_delegate() -> None: _store.delete_entries(fake, ["x"]) assert fake.deleted == [["x"]] + + +# --- Hierarchical Summary Tests --- + + +class _MockSummaryResult: + """Mock SummaryResult for testing without importing the full summarizer module.""" + + def __init__(self, entries: list[dict[str, Any]]) -> None: + self._entries = entries + + def to_storage_metadata(self, _conversation_id: str) -> list[dict[str, Any]]: + # Just return the pre-configured entries (ignores conversation_id) + return self._entries + + +def test_upsert_hierarchical_summary_simple() -> None: + """Test upserting a simple (non-hierarchical) summary.""" + fake = _FakeCollection() + entries = [ + { + "id": "conv-123:summary:L3:final", + "content": "A standard paragraph summary.", + "metadata": { + "conversation_id": "conv-123", + "role": "summary", + "level": 3, + "is_final": True, + "summary_level": "STANDARD", + "input_tokens": 1000, + "output_tokens": 50, + "compression_ratio": 
0.05, + "created_at": "2024-01-01T00:00:00", + }, + }, + ] + mock_result = _MockSummaryResult(entries) + + ids = _store.upsert_hierarchical_summary(fake, "conv-123", mock_result) + + assert ids == ["conv-123:summary:L3:final"] + assert len(fake.upserts) == 1 + upserted_ids, upserted_docs, upserted_metas = fake.upserts[0] + assert upserted_ids == ["conv-123:summary:L3:final"] + assert upserted_docs == ["A standard paragraph summary."] + assert upserted_metas[0]["level"] == 3 + assert upserted_metas[0]["is_final"] is True + + +def test_upsert_hierarchical_summary_with_chunks() -> None: + """Test upserting a hierarchical summary with L1 and L3 entries.""" + fake = _FakeCollection() + entries = [ + { + "id": "conv-456:summary:L1:0", + "content": "Chunk 0 summary", + "metadata": { + "conversation_id": "conv-456", + "role": "summary", + "level": 1, + "chunk_index": 0, + "parent_group": 0, + "created_at": "2024-01-01T00:00:00", + }, + }, + { + "id": "conv-456:summary:L1:1", + "content": "Chunk 1 summary", + "metadata": { + "conversation_id": "conv-456", + "role": "summary", + "level": 1, + "chunk_index": 1, + "parent_group": 0, + "created_at": "2024-01-01T00:00:00", + }, + }, + { + "id": "conv-456:summary:L3:final", + "content": "Final synthesis", + "metadata": { + "conversation_id": "conv-456", + "role": "summary", + "level": 3, + "is_final": True, + "input_tokens": 5000, + "output_tokens": 100, + "compression_ratio": 0.02, + "created_at": "2024-01-01T00:00:00", + }, + }, + ] + mock_result = _MockSummaryResult(entries) + + ids = _store.upsert_hierarchical_summary(fake, "conv-456", mock_result) + + assert len(ids) == 3 + assert "conv-456:summary:L1:0" in ids + assert "conv-456:summary:L1:1" in ids + assert "conv-456:summary:L3:final" in ids + + +def test_upsert_hierarchical_summary_empty() -> None: + """Test upserting when there are no entries (e.g., NONE level).""" + fake = _FakeCollection() + mock_result = _MockSummaryResult([]) + + ids = 
_store.upsert_hierarchical_summary(fake, "conv-789", mock_result) + + assert ids == [] + assert len(fake.upserts) == 0 + + +def test_get_summary_at_level() -> None: + """Test retrieving summaries at a specific level.""" + fake = _FakeCollection( + get_result={ + "documents": ["Chunk 0", "Chunk 1"], + "metadatas": [ + { + "conversation_id": "c1", + "role": "summary", + "level": 1, + "chunk_index": 0, + "created_at": "now", + }, + { + "conversation_id": "c1", + "role": "summary", + "level": 1, + "chunk_index": 1, + "created_at": "now", + }, + ], + "ids": ["c1:summary:L1:0", "c1:summary:L1:1"], + }, + ) + + records = _store.get_summary_at_level(fake, "c1", level=1) + + assert len(records) == 2 + assert records[0].metadata.level == 1 + assert records[0].metadata.chunk_index == 0 + assert records[1].metadata.chunk_index == 1 + + +def test_get_final_summary_returns_final() -> None: + """Test getting the L3 final summary.""" + fake = _FakeCollection( + get_result={ + "documents": ["The final summary"], + "metadatas": [ + { + "conversation_id": "c1", + "role": "summary", + "level": 3, + "is_final": True, + "created_at": "now", + }, + ], + "ids": ["c1:summary:L3:final"], + }, + ) + + result = _store.get_final_summary(fake, "c1") + + assert result is not None + assert result.content == "The final summary" + assert result.metadata.is_final is True + + +def test_get_final_summary_returns_none_when_missing() -> None: + """Test that get_final_summary returns None when no summary exists.""" + fake = _FakeCollection(get_result={"documents": [], "metadatas": [], "ids": []}) + + result = _store.get_final_summary(fake, "c1") + + assert result is None + + +def test_delete_summaries_all_levels() -> None: + """Test deleting all summary levels for a conversation.""" + fake = _FakeCollection( + get_result={ + "documents": ["L1", "L3"], + "metadatas": [ + {"conversation_id": "c1", "role": "summary", "level": 1, "created_at": "now"}, + {"conversation_id": "c1", "role": "summary", "level": 
3, "created_at": "now"}, + ], + "ids": ["c1:summary:L1:0", "c1:summary:L3:final"], + }, + ) + + deleted_count = _store.delete_summaries(fake, "c1") + + assert deleted_count == 2 + assert len(fake.deleted) == 1 + assert set(fake.deleted[0]) == {"c1:summary:L1:0", "c1:summary:L3:final"} + + +def test_delete_summaries_specific_levels() -> None: + """Test deleting only specific summary levels.""" + fake = _FakeCollection( + get_result={ + "documents": ["L1 chunk"], + "metadatas": [ + {"conversation_id": "c1", "role": "summary", "level": 1, "created_at": "now"}, + ], + "ids": ["c1:summary:L1:0"], + }, + ) + + deleted_count = _store.delete_summaries(fake, "c1", levels=[1]) + + assert deleted_count == 1 + assert fake.deleted[0] == ["c1:summary:L1:0"] + + +def test_delete_summaries_no_entries() -> None: + """Test deleting when no summaries exist.""" + fake = _FakeCollection(get_result={"documents": [], "metadatas": [], "ids": []}) + + deleted_count = _store.delete_summaries(fake, "c1") + + assert deleted_count == 0 + assert len(fake.deleted) == 0 diff --git a/tests/summarizer/__init__.py b/tests/summarizer/__init__.py new file mode 100644 index 000000000..d6801b313 --- /dev/null +++ b/tests/summarizer/__init__.py @@ -0,0 +1 @@ +"""Tests for the adaptive summarizer module.""" diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py new file mode 100644 index 000000000..1f010999e --- /dev/null +++ b/tests/summarizer/test_adaptive.py @@ -0,0 +1,434 @@ +"""Unit tests for AdaptiveSummarizer.""" + +from __future__ import annotations + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from agent_cli.summarizer.adaptive import ( + LEVEL_THRESHOLDS, + AdaptiveSummarizer, + SummaryOutput, +) +from agent_cli.summarizer.models import SummaryLevel, SummaryResult + + +class TestAdaptiveSummarizerInit: + """Tests for AdaptiveSummarizer initialization.""" + + def test_basic_init(self) -> None: + """Test basic initialization with required 
class TestDetermineLevel:
    """Check that token counts map onto the expected SummaryLevel."""

    @pytest.fixture
    def summarizer(self) -> AdaptiveSummarizer:
        """Provide a summarizer configured with a local endpoint URL."""
        return AdaptiveSummarizer(
            openai_base_url="http://localhost:8000/v1",
            model="gpt-4",
        )

    def test_none_level_threshold(self, summarizer: AdaptiveSummarizer) -> None:
        """Very short content maps to NONE."""
        for token_count in (50, 99):
            assert summarizer.determine_level(token_count) == SummaryLevel.NONE

    def test_brief_level_threshold(self, summarizer: AdaptiveSummarizer) -> None:
        """Short content maps to BRIEF."""
        for token_count in (100, 300, 499):
            assert summarizer.determine_level(token_count) == SummaryLevel.BRIEF

    def test_standard_level_threshold(self, summarizer: AdaptiveSummarizer) -> None:
        """Medium content maps to STANDARD."""
        for token_count in (500, 1500, 2999):
            assert summarizer.determine_level(token_count) == SummaryLevel.STANDARD

    def test_detailed_level_threshold(self, summarizer: AdaptiveSummarizer) -> None:
        """Longer content maps to DETAILED."""
        for token_count in (3000, 8000, 14999):
            assert summarizer.determine_level(token_count) == SummaryLevel.DETAILED

    def test_hierarchical_level_threshold(self, summarizer: AdaptiveSummarizer) -> None:
        """Very long content maps to HIERARCHICAL."""
        for token_count in (15000, 50000, 100000):
            assert summarizer.determine_level(token_count) == SummaryLevel.HIERARCHICAL

    def test_thresholds_match_constants(self) -> None:
        """The module-level threshold table holds the expected values."""
        expected_thresholds = (
            (SummaryLevel.NONE, 100),
            (SummaryLevel.BRIEF, 500),
            (SummaryLevel.STANDARD, 3000),
            (SummaryLevel.DETAILED, 15000),
        )
        for level, threshold in expected_thresholds:
            assert LEVEL_THRESHOLDS[level] == threshold
assert result.level == SummaryLevel.NONE + assert result.summary is None + assert result.input_tokens == 0 + assert result.output_tokens == 0 + + @pytest.mark.asyncio + async def test_whitespace_only_returns_none_level(self, summarizer: AdaptiveSummarizer) -> None: + """Test that whitespace-only content returns NONE level result.""" + result = await summarizer.summarize(" \n\n ") + assert result.level == SummaryLevel.NONE + assert result.summary is None + + @pytest.mark.asyncio + async def test_very_short_content_no_summary(self, summarizer: AdaptiveSummarizer) -> None: + """Test that very short content gets NONE level (no summary).""" + # Less than 100 tokens + result = await summarizer.summarize("Hello world") + assert result.level == SummaryLevel.NONE + assert result.summary is None + + @pytest.mark.asyncio + @patch.object(AdaptiveSummarizer, "_brief_summary") + async def test_brief_level_calls_brief_summary( + self, + mock_brief: AsyncMock, + summarizer: AdaptiveSummarizer, + ) -> None: + """Test that BRIEF level content calls _brief_summary.""" + mock_brief.return_value = "Brief summary." + + # Create content that's ~100-500 tokens + content = "This is a test sentence. " * 30 # ~150 tokens + + result = await summarizer.summarize(content) + + mock_brief.assert_called_once_with(content) + assert result.level == SummaryLevel.BRIEF + assert result.summary == "Brief summary." + + @pytest.mark.asyncio + @patch.object(AdaptiveSummarizer, "_standard_summary") + async def test_standard_level_calls_standard_summary( + self, + mock_standard: AsyncMock, + summarizer: AdaptiveSummarizer, + ) -> None: + """Test that STANDARD level content calls _standard_summary.""" + mock_standard.return_value = "Standard summary paragraph." + + # Create content that's ~500-3000 tokens + content = "This is a test sentence with more words. 
" * 100 # ~800 tokens + + result = await summarizer.summarize(content, content_type="general") + + mock_standard.assert_called_once_with(content, None, "general") + assert result.level == SummaryLevel.STANDARD + assert result.summary == "Standard summary paragraph." + + @pytest.mark.asyncio + @patch.object(AdaptiveSummarizer, "_standard_summary") + async def test_prior_summary_passed_to_standard( + self, + mock_standard: AsyncMock, + summarizer: AdaptiveSummarizer, + ) -> None: + """Test that prior_summary is passed to _standard_summary.""" + mock_standard.return_value = "Updated summary." + + content = "This is a test sentence with more words. " * 100 + prior = "Previous context summary." + + await summarizer.summarize(content, prior_summary=prior) + + mock_standard.assert_called_once_with(content, prior, "general") + + @pytest.mark.asyncio + @patch.object(AdaptiveSummarizer, "_detailed_summary") + async def test_detailed_level_calls_detailed_summary( + self, + mock_detailed: AsyncMock, + summarizer: AdaptiveSummarizer, + ) -> None: + """Test that DETAILED level content calls _detailed_summary.""" + mock_result = SummaryResult( + level=SummaryLevel.DETAILED, + summary="Detailed summary.", + hierarchical=None, + input_tokens=5000, + output_tokens=100, + compression_ratio=0.02, + ) + mock_detailed.return_value = mock_result + + # Create content that's ~3000-15000 tokens + content = "Word " * 5000 # ~5000 tokens + + result = await summarizer.summarize(content) + + assert mock_detailed.called + assert result.level == SummaryLevel.DETAILED + + @pytest.mark.asyncio + @patch.object(AdaptiveSummarizer, "_hierarchical_summary") + async def test_hierarchical_level_calls_hierarchical_summary( + self, + mock_hierarchical: AsyncMock, + summarizer: AdaptiveSummarizer, + ) -> None: + """Test that HIERARCHICAL level content calls _hierarchical_summary.""" + mock_result = SummaryResult( + level=SummaryLevel.HIERARCHICAL, + summary="Hierarchical summary.", + hierarchical=None, + 
class TestUpdateRollingSummary:
    """Exercise update_rolling_summary with and without new facts."""

    @pytest.fixture
    def summarizer(self) -> AdaptiveSummarizer:
        """Provide a summarizer configured with a local endpoint URL."""
        return AdaptiveSummarizer(
            openai_base_url="http://localhost:8000/v1",
            model="gpt-4",
        )

    @pytest.mark.asyncio
    async def test_empty_facts_returns_prior(self, summarizer: AdaptiveSummarizer) -> None:
        """With no new facts the prior summary comes back unchanged."""
        updated = await summarizer.update_rolling_summary(
            prior_summary="Existing summary",
            new_facts=[],
        )
        assert updated == "Existing summary"

    @pytest.mark.asyncio
    async def test_empty_facts_no_prior_returns_empty(self, summarizer: AdaptiveSummarizer) -> None:
        """With no new facts and no prior summary the result is an empty string."""
        updated = await summarizer.update_rolling_summary(
            prior_summary=None,
            new_facts=[],
        )
        assert updated == ""

    @pytest.mark.asyncio
    @patch.object(AdaptiveSummarizer, "_generate_summary")
    async def test_new_facts_calls_generate(
        self,
        mock_generate: AsyncMock,
        summarizer: AdaptiveSummarizer,
    ) -> None:
        """New facts trigger exactly one summary generation call."""
        mock_generate.return_value = "Updated summary with new facts."
        facts = ["User likes coffee", "User lives in Amsterdam"]

        updated = await summarizer.update_rolling_summary(
            prior_summary="Old summary",
            new_facts=facts,
        )

        mock_generate.assert_called_once()
        assert updated == "Updated summary with new facts."

    @pytest.mark.asyncio
    @patch.object(AdaptiveSummarizer, "_generate_summary")
    async def test_facts_formatted_as_list(
        self,
        mock_generate: AsyncMock,
        summarizer: AdaptiveSummarizer,
    ) -> None:
        """Each fact shows up in the prompt as a ``- `` bullet."""
        mock_generate.return_value = "Summary"
        facts = ["Fact one", "Fact two"]

        await summarizer.update_rolling_summary(
            prior_summary="Prior",
            new_facts=facts,
        )

        # The prompt is the first positional argument of the generate call.
        prompt = mock_generate.call_args.args[0]
        for fact in facts:
            assert f"- {fact}" in prompt
+ mock_agent.run.assert_called_once_with("Test prompt") + + @pytest.mark.asyncio + @patch.object(AdaptiveSummarizer, "_raw_generate") + async def test_fallback_to_raw_generate_on_error( + self, + mock_raw: AsyncMock, + summarizer: AdaptiveSummarizer, + ) -> None: + """Test fallback to raw HTTP on PydanticAI error.""" + mock_raw.return_value = "Fallback summary" + + with patch("agent_cli.summarizer.adaptive.Agent") as mock_agent_class: + mock_agent = MagicMock() + mock_agent.run = AsyncMock(side_effect=Exception("API error")) + mock_agent_class.return_value = mock_agent + + result = await summarizer._generate_summary("Test prompt", max_tokens=100) + + mock_raw.assert_called_once_with("Test prompt", 100) + assert result == "Fallback summary" + + +class TestRawGenerate: + """Tests for _raw_generate fallback method.""" + + @pytest.fixture + def summarizer(self) -> AdaptiveSummarizer: + """Create a summarizer instance.""" + return AdaptiveSummarizer( + openai_base_url="http://localhost:8000/v1", + model="gpt-4", + ) + + @pytest.mark.asyncio + async def test_raw_generate_success(self, summarizer: AdaptiveSummarizer) -> None: + """Test successful raw HTTP generation.""" + mock_response = MagicMock() + mock_response.json.return_value = { + "choices": [{"message": {"content": "Raw generated summary"}}], + } + + with patch("httpx.AsyncClient") as mock_client_class: + mock_client = MagicMock() + mock_client.post = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client_class.return_value = mock_client + + result = await summarizer._raw_generate("Test prompt", max_tokens=100) + + assert result == "Raw generated summary" + + @pytest.mark.asyncio + async def test_raw_generate_empty_choices(self, summarizer: AdaptiveSummarizer) -> None: + """Test raw generate with empty choices returns empty string.""" + mock_response = MagicMock() + mock_response.json.return_value = 
{"choices": []} + + with patch("httpx.AsyncClient") as mock_client_class: + mock_client = MagicMock() + mock_client.post = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client_class.return_value = mock_client + + result = await summarizer._raw_generate("Test prompt", max_tokens=100) + + assert result == "" + + +class TestSummaryOutput: + """Tests for SummaryOutput pydantic model.""" + + def test_basic_creation(self) -> None: + """Test creating a SummaryOutput.""" + output = SummaryOutput(summary="Test summary text") + assert output.summary == "Test summary text" + + def test_whitespace_preserved(self) -> None: + """Test that whitespace in summary is preserved.""" + output = SummaryOutput(summary=" Summary with spaces ") + assert output.summary == " Summary with spaces " diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py new file mode 100644 index 000000000..381f9f5b6 --- /dev/null +++ b/tests/summarizer/test_integration.py @@ -0,0 +1,466 @@ +"""Integration tests for the summarizer with memory system.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any +from unittest.mock import patch + +import pytest + +from agent_cli.memory._ingest import summarize_content +from agent_cli.memory._persistence import persist_hierarchical_summary +from agent_cli.memory._store import ( + get_final_summary, + get_summary_at_level, + upsert_hierarchical_summary, +) +from agent_cli.summarizer import AdaptiveSummarizer, SummaryLevel, SummaryResult +from agent_cli.summarizer.models import ChunkSummary, HierarchicalSummary + +if TYPE_CHECKING: + from pathlib import Path + + +class _FakeCollection: + """Minimal Chroma-like collection for testing.""" + + def __init__(self) -> None: + self._store: dict[str, tuple[str, dict[str, Any]]] = {} + + def upsert( + self, + *, + ids: list[str], + documents: list[str], + 
metadatas: list[dict[str, Any]], + ) -> None: + for doc_id, doc, meta in zip(ids, documents, metadatas, strict=False): + self._store[doc_id] = (doc, meta) + + def get( + self, + *, + where: dict[str, Any] | None = None, + include: list[str] | None = None, # noqa: ARG002 + ) -> dict[str, Any]: + if where is None: + return {"documents": [], "metadatas": [], "ids": []} + + results: list[tuple[str, tuple[str, dict[str, Any]]]] = [] + for doc_id, (doc, meta) in self._store.items(): + # Check all conditions in $and clause + conditions = where.get("$and", [where]) + match = True + for clause in conditions: + for k, v in clause.items(): + if k == "$and": + continue + if isinstance(v, dict): + if "$in" in v and meta.get(k) not in v["$in"]: + match = False + if "$ne" in v and meta.get(k) == v["$ne"]: + match = False + elif meta.get(k) != v: + match = False + if match: + results.append((doc_id, (doc, meta))) + + docs = [doc for _, (doc, _) in results] + metas = [meta for _, (_, meta) in results] + ids = [doc_id for doc_id, _ in results] + return {"documents": docs, "metadatas": metas, "ids": ids} + + def delete( + self, + ids: list[str] | None = None, + where: dict[str, Any] | None = None, # noqa: ARG002 + ) -> None: + if ids: + for doc_id in ids: + self._store.pop(doc_id, None) + + +@pytest.fixture +def fake_collection() -> _FakeCollection: + """Create a fake ChromaDB collection.""" + return _FakeCollection() + + +@pytest.fixture +def memory_root(tmp_path: Path) -> Path: + """Create a temporary memory root directory.""" + return tmp_path / "memory" + + +class TestSummaryResultStorageMetadata: + """Test SummaryResult.to_storage_metadata for various levels.""" + + def test_standard_summary_produces_single_entry(self) -> None: + """Test that STANDARD level produces a single L3 entry.""" + result = SummaryResult( + level=SummaryLevel.STANDARD, + summary="A paragraph summary of the content.", + hierarchical=None, + input_tokens=1000, + output_tokens=50, + compression_ratio=0.05, 
+ ) + + entries = result.to_storage_metadata("conv-123") + + assert len(entries) == 1 + entry = entries[0] + assert entry["id"] == "conv-123:summary:L3:final" + assert entry["content"] == "A paragraph summary of the content." + assert entry["metadata"]["level"] == 3 + assert entry["metadata"]["is_final"] is True + assert entry["metadata"]["summary_level"] == "STANDARD" + + def test_hierarchical_summary_produces_multiple_entries(self) -> None: + """Test that HIERARCHICAL level produces L1, L2, L3 entries.""" + l1_summaries = [ + ChunkSummary( + chunk_index=0, + content="Chunk 0", + token_count=10, + source_tokens=100, + parent_group=0, + ), + ChunkSummary( + chunk_index=1, + content="Chunk 1", + token_count=10, + source_tokens=100, + parent_group=0, + ), + ChunkSummary( + chunk_index=2, + content="Chunk 2", + token_count=10, + source_tokens=100, + parent_group=0, + ), + ] + hierarchical = HierarchicalSummary( + l1_summaries=l1_summaries, + l2_summaries=["Group 0 summary"], + l3_summary="Final hierarchical synthesis.", + ) + result = SummaryResult( + level=SummaryLevel.HIERARCHICAL, + summary="Final hierarchical synthesis.", + hierarchical=hierarchical, + input_tokens=20000, + output_tokens=200, + compression_ratio=0.01, + ) + + entries = result.to_storage_metadata("conv-456") + + # Should have 3 L1 + 1 L2 + 1 L3 = 5 entries + assert len(entries) == 5 + + # Check L1 entries + l1_entries = [e for e in entries if e["metadata"]["level"] == 1] + assert len(l1_entries) == 3 + + # Check L2 entries + l2_entries = [e for e in entries if e["metadata"]["level"] == 2] + assert len(l2_entries) == 1 + + # Check L3 entry + l3_entries = [e for e in entries if e["metadata"]["level"] == 3] + assert len(l3_entries) == 1 + + +class TestHierarchicalSummaryStorage: + """Test storing hierarchical summaries to ChromaDB.""" + + def test_store_simple_summary(self, fake_collection: _FakeCollection) -> None: + """Test storing a simple (non-hierarchical) summary.""" + result = SummaryResult( + 
level=SummaryLevel.STANDARD, + summary="A standard summary.", + hierarchical=None, + input_tokens=1000, + output_tokens=50, + compression_ratio=0.05, + ) + + ids = upsert_hierarchical_summary(fake_collection, "conv-123", result) + + assert len(ids) == 1 + assert "conv-123:summary:L3:final" in ids + + # Verify retrieval + stored = get_final_summary(fake_collection, "conv-123") + assert stored is not None + assert stored.content == "A standard summary." + + def test_store_hierarchical_summary(self, fake_collection: _FakeCollection) -> None: + """Test storing a hierarchical summary with all levels.""" + l1_summaries = [ + ChunkSummary( + chunk_index=0, + content="Chunk 0 summary", + token_count=10, + source_tokens=100, + ), + ChunkSummary( + chunk_index=1, + content="Chunk 1 summary", + token_count=10, + source_tokens=100, + ), + ] + hierarchical = HierarchicalSummary( + l1_summaries=l1_summaries, + l2_summaries=[], + l3_summary="Final summary", + ) + result = SummaryResult( + level=SummaryLevel.DETAILED, + summary="Final summary", + hierarchical=hierarchical, + input_tokens=5000, + output_tokens=100, + compression_ratio=0.02, + ) + + ids = upsert_hierarchical_summary(fake_collection, "conv-789", result) + + assert len(ids) == 3 # 2 L1 + 1 L3 + + # Verify L1 retrieval + l1_stored = get_summary_at_level(fake_collection, "conv-789", level=1) + assert len(l1_stored) == 2 + + # Verify L3 retrieval + final = get_final_summary(fake_collection, "conv-789") + assert final is not None + assert final.content == "Final summary" + + +class TestFilePersistence: + """Test hierarchical summary file persistence.""" + + def test_persist_hierarchical_creates_files( + self, + fake_collection: _FakeCollection, + memory_root: Path, + ) -> None: + """Test that persist_hierarchical_summary creates correct file structure.""" + l1_summaries = [ + ChunkSummary( + chunk_index=0, + content="Chunk 0 content", + token_count=10, + source_tokens=100, + parent_group=0, + ), + ChunkSummary( + 
chunk_index=1, + content="Chunk 1 content", + token_count=10, + source_tokens=100, + parent_group=0, + ), + ] + hierarchical = HierarchicalSummary( + l1_summaries=l1_summaries, + l2_summaries=["Group 0 summary"], + l3_summary="Final synthesis", + ) + result = SummaryResult( + level=SummaryLevel.HIERARCHICAL, + summary="Final synthesis", + hierarchical=hierarchical, + input_tokens=20000, + output_tokens=200, + compression_ratio=0.01, + ) + + ids = persist_hierarchical_summary( + fake_collection, + memory_root=memory_root, + conversation_id="test-conv", + summary_result=result, + ) + + assert len(ids) == 4 # 2 L1 + 1 L2 + 1 L3 + + # Check file structure (note: _slugify converts - to - not _) + entries_dir = memory_root / "entries" / "test-conv" + l1_dir = entries_dir / "summaries" / "L1" + l2_dir = entries_dir / "summaries" / "L2" + l3_dir = entries_dir / "summaries" / "L3" + + assert l1_dir.exists() + assert l2_dir.exists() + assert l3_dir.exists() + + # Check L1 files + l1_files = list(l1_dir.glob("*.md")) + assert len(l1_files) == 2 + + # Check L2 files + l2_files = list(l2_dir.glob("*.md")) + assert len(l2_files) == 1 + + # Check L3 files + l3_files = list(l3_dir.glob("*.md")) + assert len(l3_files) == 1 + assert (l3_dir / "final.md").exists() + + def test_persist_simple_summary_creates_l3_file( + self, + fake_collection: _FakeCollection, + memory_root: Path, + ) -> None: + """Test that a simple summary creates just L3/final.md.""" + result = SummaryResult( + level=SummaryLevel.STANDARD, + summary="A standard paragraph summary.", + hierarchical=None, + input_tokens=1000, + output_tokens=50, + compression_ratio=0.05, + ) + + ids = persist_hierarchical_summary( + fake_collection, + memory_root=memory_root, + conversation_id="simple-conv", + summary_result=result, + ) + + assert len(ids) == 1 + + # Check file exists (note: _slugify converts - to - not _) + entries_dir = memory_root / "entries" / "simple-conv" + l3_file = entries_dir / "summaries" / "L3" / "final.md" 
+ assert l3_file.exists() + + # Check content has YAML front matter + content = l3_file.read_text(encoding="utf-8") + assert "---" in content + assert "level: 3" in content + assert "A standard paragraph summary." in content + + def test_persist_deletes_old_summaries( + self, + fake_collection: _FakeCollection, + memory_root: Path, + ) -> None: + """Test that persisting new summary deletes old summary files.""" + # Create first summary + result1 = SummaryResult( + level=SummaryLevel.STANDARD, + summary="First summary.", + hierarchical=None, + input_tokens=1000, + output_tokens=50, + compression_ratio=0.05, + ) + + persist_hierarchical_summary( + fake_collection, + memory_root=memory_root, + conversation_id="conv", + summary_result=result1, + ) + + entries_dir = memory_root / "entries" / "conv" + first_file = entries_dir / "summaries" / "L3" / "final.md" + assert first_file.exists() + assert "First summary." in first_file.read_text() + + # Create second summary (should replace first) + result2 = SummaryResult( + level=SummaryLevel.STANDARD, + summary="Second summary.", + hierarchical=None, + input_tokens=1000, + output_tokens=50, + compression_ratio=0.05, + ) + + persist_hierarchical_summary( + fake_collection, + memory_root=memory_root, + conversation_id="conv", + summary_result=result2, + ) + + # First summary should be moved to deleted + assert first_file.exists() + assert "Second summary." 
in first_file.read_text() + + # Old summary should be in deleted folder + deleted_dir = memory_root / "entries" / "deleted" / "conv" / "summaries" + assert deleted_dir.exists() + + +class TestAdaptiveSummarizerLevelDetermination: + """Test that AdaptiveSummarizer correctly determines summary levels.""" + + @pytest.fixture + def summarizer(self) -> AdaptiveSummarizer: + """Create an AdaptiveSummarizer instance.""" + return AdaptiveSummarizer( + openai_base_url="http://localhost:8000/v1", + model="test-model", + ) + + def test_very_short_content_is_none(self, summarizer: AdaptiveSummarizer) -> None: + """Test that content under 100 tokens gets NONE level.""" + level = summarizer.determine_level(50) + assert level == SummaryLevel.NONE + + def test_short_content_is_brief(self, summarizer: AdaptiveSummarizer) -> None: + """Test that 100-500 token content gets BRIEF level.""" + level = summarizer.determine_level(300) + assert level == SummaryLevel.BRIEF + + def test_medium_content_is_standard(self, summarizer: AdaptiveSummarizer) -> None: + """Test that 500-3000 token content gets STANDARD level.""" + level = summarizer.determine_level(1500) + assert level == SummaryLevel.STANDARD + + def test_long_content_is_detailed(self, summarizer: AdaptiveSummarizer) -> None: + """Test that 3000-15000 token content gets DETAILED level.""" + level = summarizer.determine_level(8000) + assert level == SummaryLevel.DETAILED + + def test_very_long_content_is_hierarchical(self, summarizer: AdaptiveSummarizer) -> None: + """Test that content over 15000 tokens gets HIERARCHICAL level.""" + level = summarizer.determine_level(25000) + assert level == SummaryLevel.HIERARCHICAL + + +class TestSummarizeContentFunction: + """Test the summarize_content function from _ingest.""" + + @pytest.mark.asyncio + async def test_summarize_content_creates_result(self) -> None: + """Test that summarize_content returns a valid SummaryResult.""" + with patch.object(AdaptiveSummarizer, "summarize") as 
mock_summarize: + mock_result = SummaryResult( + level=SummaryLevel.STANDARD, + summary="Mocked summary.", + hierarchical=None, + input_tokens=1000, + output_tokens=50, + compression_ratio=0.05, + ) + mock_summarize.return_value = mock_result + + result = await summarize_content( + content="Some content to summarize " * 100, + openai_base_url="http://localhost:8000/v1", + api_key=None, + model="test-model", + ) + + assert result.level == SummaryLevel.STANDARD + assert result.summary == "Mocked summary." diff --git a/tests/summarizer/test_models.py b/tests/summarizer/test_models.py new file mode 100644 index 000000000..5a6583cd2 --- /dev/null +++ b/tests/summarizer/test_models.py @@ -0,0 +1,332 @@ +"""Unit tests for summarizer models.""" + +from __future__ import annotations + +from datetime import UTC, datetime + +import pytest + +from agent_cli.summarizer.models import ( + ChunkSummary, + HierarchicalSummary, + SummaryLevel, + SummaryResult, +) + + +class TestSummaryLevel: + """Tests for SummaryLevel enum.""" + + def test_level_values(self) -> None: + """Test that levels have correct integer values.""" + assert SummaryLevel.NONE == 0 + assert SummaryLevel.BRIEF == 1 + assert SummaryLevel.STANDARD == 2 + assert SummaryLevel.DETAILED == 3 + assert SummaryLevel.HIERARCHICAL == 4 + + def test_level_ordering(self) -> None: + """Test that levels can be compared.""" + assert SummaryLevel.NONE < SummaryLevel.BRIEF + assert SummaryLevel.BRIEF < SummaryLevel.STANDARD + assert SummaryLevel.STANDARD < SummaryLevel.DETAILED + assert SummaryLevel.DETAILED < SummaryLevel.HIERARCHICAL + + +class TestChunkSummary: + """Tests for ChunkSummary model.""" + + def test_basic_creation(self) -> None: + """Test creating a chunk summary.""" + chunk = ChunkSummary( + chunk_index=0, + content="This is a summary of chunk 1.", + token_count=10, + source_tokens=100, + parent_group=None, + ) + assert chunk.chunk_index == 0 + assert chunk.content == "This is a summary of chunk 1." 
+ assert chunk.token_count == 10 + assert chunk.source_tokens == 100 + assert chunk.parent_group is None + + def test_with_parent_group(self) -> None: + """Test creating a chunk summary with parent group.""" + chunk = ChunkSummary( + chunk_index=5, + content="Summary text", + token_count=8, + source_tokens=200, + parent_group=1, + ) + assert chunk.parent_group == 1 + + def test_validation_negative_tokens(self) -> None: + """Test that negative token counts fail validation.""" + with pytest.raises(ValueError, match="greater than or equal to 0"): + ChunkSummary( + chunk_index=0, + content="Test", + token_count=-1, + source_tokens=100, + ) + + +class TestHierarchicalSummary: + """Tests for HierarchicalSummary model.""" + + def test_basic_creation(self) -> None: + """Test creating a hierarchical summary.""" + l1 = [ + ChunkSummary( + chunk_index=0, + content="Chunk 1 summary", + token_count=10, + source_tokens=100, + ), + ChunkSummary( + chunk_index=1, + content="Chunk 2 summary", + token_count=12, + source_tokens=120, + ), + ] + hs = HierarchicalSummary( + l1_summaries=l1, + l2_summaries=["Group summary"], + l3_summary="Final summary of all content.", + ) + assert len(hs.l1_summaries) == 2 + assert len(hs.l2_summaries) == 1 + assert hs.l3_summary == "Final summary of all content." 
+ + def test_default_chunk_settings(self) -> None: + """Test default chunk size and overlap.""" + hs = HierarchicalSummary( + l1_summaries=[], + l2_summaries=[], + l3_summary="Final", + ) + assert hs.chunk_size == 3000 + assert hs.chunk_overlap == 200 + + def test_get_summary_at_level_1(self) -> None: + """Test getting L1 summaries.""" + l1 = [ + ChunkSummary(chunk_index=0, content="C1", token_count=5, source_tokens=50), + ChunkSummary(chunk_index=1, content="C2", token_count=5, source_tokens=50), + ] + hs = HierarchicalSummary(l1_summaries=l1, l2_summaries=[], l3_summary="Final") + result = hs.get_summary_at_level(1) + assert result == ["C1", "C2"] + + def test_get_summary_at_level_2_with_l2(self) -> None: + """Test getting L2 summaries when available.""" + hs = HierarchicalSummary( + l1_summaries=[], + l2_summaries=["Group A", "Group B"], + l3_summary="Final", + ) + result = hs.get_summary_at_level(2) + assert result == ["Group A", "Group B"] + + def test_get_summary_at_level_2_fallback(self) -> None: + """Test getting L2 falls back to L3 when no L2 summaries.""" + hs = HierarchicalSummary( + l1_summaries=[], + l2_summaries=[], + l3_summary="Final summary", + ) + result = hs.get_summary_at_level(2) + assert result == ["Final summary"] + + def test_get_summary_at_level_3(self) -> None: + """Test getting L3 summary.""" + hs = HierarchicalSummary( + l1_summaries=[], + l2_summaries=["Group"], + l3_summary="The final summary", + ) + result = hs.get_summary_at_level(3) + assert result == "The final summary" + + +class TestSummaryResult: + """Tests for SummaryResult model.""" + + def test_none_level_result(self) -> None: + """Test result for content that needs no summary.""" + result = SummaryResult( + level=SummaryLevel.NONE, + summary=None, + hierarchical=None, + input_tokens=50, + output_tokens=0, + compression_ratio=0.0, + ) + assert result.level == SummaryLevel.NONE + assert result.summary is None + assert result.chunk_summaries is None + + def 
test_brief_level_result(self) -> None: + """Test result for brief summary.""" + result = SummaryResult( + level=SummaryLevel.BRIEF, + summary="A brief one-sentence summary.", + hierarchical=None, + input_tokens=200, + output_tokens=10, + compression_ratio=0.05, + ) + assert result.level == SummaryLevel.BRIEF + assert result.summary == "A brief one-sentence summary." + assert result.chunk_summaries is None + + def test_hierarchical_result_with_chunk_summaries(self) -> None: + """Test hierarchical result exposes chunk summaries.""" + l1 = [ + ChunkSummary(chunk_index=0, content="Chunk 1", token_count=10, source_tokens=100), + ChunkSummary(chunk_index=1, content="Chunk 2", token_count=10, source_tokens=100), + ] + hierarchical = HierarchicalSummary( + l1_summaries=l1, + l2_summaries=[], + l3_summary="Final", + ) + result = SummaryResult( + level=SummaryLevel.DETAILED, + summary="Final", + hierarchical=hierarchical, + input_tokens=5000, + output_tokens=100, + compression_ratio=0.02, + ) + assert result.chunk_summaries == ["Chunk 1", "Chunk 2"] + + def test_to_storage_metadata_none_level(self) -> None: + """Test that NONE level produces no storage entries.""" + result = SummaryResult( + level=SummaryLevel.NONE, + summary=None, + hierarchical=None, + input_tokens=50, + output_tokens=0, + compression_ratio=0.0, + ) + entries = result.to_storage_metadata("conv-123") + assert entries == [] + + def test_to_storage_metadata_simple_summary(self) -> None: + """Test storage metadata for simple (non-hierarchical) summary.""" + result = SummaryResult( + level=SummaryLevel.STANDARD, + summary="A standard paragraph summary.", + hierarchical=None, + input_tokens=1000, + output_tokens=50, + compression_ratio=0.05, + ) + entries = result.to_storage_metadata("conv-456") + assert len(entries) == 1 + entry = entries[0] + assert entry["id"] == "conv-456:summary:L3:final" + assert entry["content"] == "A standard paragraph summary." 
+ assert entry["metadata"]["conversation_id"] == "conv-456" + assert entry["metadata"]["role"] == "summary" + assert entry["metadata"]["level"] == 3 + assert entry["metadata"]["is_final"] is True + assert entry["metadata"]["summary_level"] == "STANDARD" + + def test_to_storage_metadata_hierarchical(self) -> None: + """Test storage metadata for hierarchical summary.""" + l1 = [ + ChunkSummary( + chunk_index=0, + content="Chunk 0 text", + token_count=10, + source_tokens=100, + parent_group=0, + ), + ChunkSummary( + chunk_index=1, + content="Chunk 1 text", + token_count=12, + source_tokens=120, + parent_group=0, + ), + ] + hierarchical = HierarchicalSummary( + l1_summaries=l1, + l2_summaries=["Group 0 summary"], + l3_summary="Final synthesis", + ) + result = SummaryResult( + level=SummaryLevel.HIERARCHICAL, + summary="Final synthesis", + hierarchical=hierarchical, + input_tokens=20000, + output_tokens=200, + compression_ratio=0.01, + ) + entries = result.to_storage_metadata("conv-789") + + # Should have 2 L1 + 1 L2 + 1 L3 = 4 entries + assert len(entries) == 4 + + # Check L1 entries + l1_entries = [e for e in entries if e["metadata"]["level"] == 1] + assert len(l1_entries) == 2 + assert l1_entries[0]["id"] == "conv-789:summary:L1:0" + assert l1_entries[0]["metadata"]["chunk_index"] == 0 + + # Check L2 entry + l2_entries = [e for e in entries if e["metadata"]["level"] == 2] + assert len(l2_entries) == 1 + assert l2_entries[0]["id"] == "conv-789:summary:L2:0" + assert l2_entries[0]["content"] == "Group 0 summary" + + # Check L3 entry + l3_entries = [e for e in entries if e["metadata"]["level"] == 3] + assert len(l3_entries) == 1 + assert l3_entries[0]["id"] == "conv-789:summary:L3:final" + assert l3_entries[0]["metadata"]["is_final"] is True + + def test_compression_ratio_bounds(self) -> None: + """Test compression ratio validation.""" + # Valid ratio + result = SummaryResult( + level=SummaryLevel.BRIEF, + summary="Test", + hierarchical=None, + input_tokens=100, + 
output_tokens=10, + compression_ratio=0.1, + ) + assert result.compression_ratio == 0.1 + + # Ratio must be between 0 and 1 + with pytest.raises(ValueError, match="less than or equal to 1"): + SummaryResult( + level=SummaryLevel.BRIEF, + summary="Test", + hierarchical=None, + input_tokens=100, + output_tokens=10, + compression_ratio=1.5, + ) + + def test_created_at_default(self) -> None: + """Test that created_at is automatically set.""" + before = datetime.now(UTC) + result = SummaryResult( + level=SummaryLevel.BRIEF, + summary="Test", + hierarchical=None, + input_tokens=100, + output_tokens=10, + compression_ratio=0.1, + ) + after = datetime.now(UTC) + # Compare without timezone since result.created_at may be naive + assert before.replace(tzinfo=None) <= result.created_at <= after.replace(tzinfo=None) diff --git a/tests/summarizer/test_prompts.py b/tests/summarizer/test_prompts.py new file mode 100644 index 000000000..e126def22 --- /dev/null +++ b/tests/summarizer/test_prompts.py @@ -0,0 +1,180 @@ +"""Unit tests for summarizer prompt templates.""" + +from __future__ import annotations + +from agent_cli.summarizer.prompts import ( + BRIEF_SUMMARY_PROMPT, + CHUNK_SUMMARY_PROMPT, + CONVERSATION_SUMMARY_PROMPT, + DOCUMENT_SUMMARY_PROMPT, + JOURNAL_SUMMARY_PROMPT, + META_SUMMARY_PROMPT, + ROLLING_SUMMARY_PROMPT, + STANDARD_SUMMARY_PROMPT, + format_prior_context, + format_summaries_for_meta, + get_prompt_for_content_type, +) + + +class TestPromptTemplates: + """Tests for prompt template structure.""" + + def test_brief_prompt_has_content_placeholder(self) -> None: + """Test BRIEF prompt contains content placeholder.""" + assert "{content}" in BRIEF_SUMMARY_PROMPT + # Test it can be formatted + result = BRIEF_SUMMARY_PROMPT.format(content="Test content") + assert "Test content" in result + + def test_standard_prompt_has_placeholders(self) -> None: + """Test STANDARD prompt contains required placeholders.""" + assert "{content}" in STANDARD_SUMMARY_PROMPT + assert 
"{prior_context}" in STANDARD_SUMMARY_PROMPT + assert "{max_words}" in STANDARD_SUMMARY_PROMPT + + result = STANDARD_SUMMARY_PROMPT.format( + content="Main content", + prior_context="Previous context", + max_words=100, + ) + assert "Main content" in result + assert "Previous context" in result + assert "100" in result + + def test_chunk_prompt_has_placeholders(self) -> None: + """Test CHUNK prompt contains required placeholders.""" + assert "{content}" in CHUNK_SUMMARY_PROMPT + assert "{chunk_index}" in CHUNK_SUMMARY_PROMPT + assert "{total_chunks}" in CHUNK_SUMMARY_PROMPT + assert "{max_words}" in CHUNK_SUMMARY_PROMPT + + result = CHUNK_SUMMARY_PROMPT.format( + content="Chunk content", + chunk_index=1, + total_chunks=5, + max_words=50, + ) + assert "Chunk content" in result + assert "1" in result + assert "5" in result + + def test_meta_prompt_has_placeholders(self) -> None: + """Test META prompt contains required placeholders.""" + assert "{summaries}" in META_SUMMARY_PROMPT + assert "{max_words}" in META_SUMMARY_PROMPT + + result = META_SUMMARY_PROMPT.format( + summaries="Summary 1\n\nSummary 2", + max_words=200, + ) + assert "Summary 1" in result + assert "200" in result + + def test_rolling_prompt_has_placeholders(self) -> None: + """Test ROLLING prompt contains required placeholders.""" + assert "{prior_summary}" in ROLLING_SUMMARY_PROMPT + assert "{new_content}" in ROLLING_SUMMARY_PROMPT + assert "{max_words}" in ROLLING_SUMMARY_PROMPT + + def test_conversation_prompt_has_content(self) -> None: + """Test CONVERSATION prompt contains content placeholder.""" + assert "{content}" in CONVERSATION_SUMMARY_PROMPT + assert "{max_words}" in CONVERSATION_SUMMARY_PROMPT + + def test_journal_prompt_has_content(self) -> None: + """Test JOURNAL prompt contains content placeholder.""" + assert "{content}" in JOURNAL_SUMMARY_PROMPT + assert "{max_words}" in JOURNAL_SUMMARY_PROMPT + + def test_document_prompt_has_content(self) -> None: + """Test DOCUMENT prompt contains 
content placeholder.""" + assert "{content}" in DOCUMENT_SUMMARY_PROMPT + assert "{max_words}" in DOCUMENT_SUMMARY_PROMPT + + +class TestGetPromptForContentType: + """Tests for get_prompt_for_content_type function.""" + + def test_general_returns_standard(self) -> None: + """Test general content type returns standard prompt.""" + prompt = get_prompt_for_content_type("general") + assert prompt == STANDARD_SUMMARY_PROMPT + + def test_conversation_returns_conversation(self) -> None: + """Test conversation content type returns conversation prompt.""" + prompt = get_prompt_for_content_type("conversation") + assert prompt == CONVERSATION_SUMMARY_PROMPT + + def test_journal_returns_journal(self) -> None: + """Test journal content type returns journal prompt.""" + prompt = get_prompt_for_content_type("journal") + assert prompt == JOURNAL_SUMMARY_PROMPT + + def test_document_returns_document(self) -> None: + """Test document content type returns document prompt.""" + prompt = get_prompt_for_content_type("document") + assert prompt == DOCUMENT_SUMMARY_PROMPT + + def test_unknown_returns_standard(self) -> None: + """Test unknown content type falls back to standard.""" + prompt = get_prompt_for_content_type("unknown_type") + assert prompt == STANDARD_SUMMARY_PROMPT + + def test_empty_returns_standard(self) -> None: + """Test empty string falls back to standard.""" + prompt = get_prompt_for_content_type("") + assert prompt == STANDARD_SUMMARY_PROMPT + + +class TestFormatPriorContext: + """Tests for format_prior_context function.""" + + def test_with_prior_summary(self) -> None: + """Test formatting with a prior summary.""" + result = format_prior_context("Previous summary text") + assert "Prior context" in result + assert "Previous summary text" in result + + def test_without_prior_summary(self) -> None: + """Test formatting without prior summary returns empty string.""" + result = format_prior_context(None) + assert result == "" + + def test_empty_string_prior_summary(self) -> 
None: + """Test formatting with empty string prior summary.""" + result = format_prior_context("") + assert result == "" + + +class TestFormatSummariesForMeta: + """Tests for format_summaries_for_meta function.""" + + def test_single_summary(self) -> None: + """Test formatting a single summary.""" + result = format_summaries_for_meta(["Summary one"]) + assert "[Section 1]" in result + assert "Summary one" in result + + def test_multiple_summaries(self) -> None: + """Test formatting multiple summaries.""" + summaries = ["First summary", "Second summary", "Third summary"] + result = format_summaries_for_meta(summaries) + + assert "[Section 1]" in result + assert "[Section 2]" in result + assert "[Section 3]" in result + assert "First summary" in result + assert "Second summary" in result + assert "Third summary" in result + + def test_empty_list(self) -> None: + """Test formatting empty list.""" + result = format_summaries_for_meta([]) + assert result == "" + + def test_summaries_separated(self) -> None: + """Test summaries are separated by double newlines.""" + summaries = ["Sum 1", "Sum 2"] + result = format_summaries_for_meta(summaries) + assert "\n\n" in result diff --git a/tests/summarizer/test_utils.py b/tests/summarizer/test_utils.py new file mode 100644 index 000000000..458e9b37d --- /dev/null +++ b/tests/summarizer/test_utils.py @@ -0,0 +1,193 @@ +"""Unit tests for summarizer utility functions.""" + +from __future__ import annotations + +from agent_cli.summarizer.utils import ( + chunk_text, + count_tokens, + estimate_summary_tokens, + middle_truncate, + tokens_to_words, +) + + +class TestCountTokens: + """Tests for count_tokens function.""" + + def test_empty_string(self) -> None: + """Test counting tokens in empty string.""" + assert count_tokens("") == 0 + + def test_simple_sentence(self) -> None: + """Test counting tokens in a simple sentence.""" + # "Hello world" is typically 2 tokens + count = count_tokens("Hello world") + assert count > 0 + assert 
count < 10 + + def test_longer_text(self) -> None: + """Test that longer text has more tokens.""" + short = count_tokens("Hello") + long = count_tokens("Hello world, this is a longer sentence with more words.") + assert long > short + + def test_different_model_fallback(self) -> None: + """Test that unknown models fall back to cl100k_base.""" + # Should not raise, should fall back gracefully + count = count_tokens("Hello world", model="unknown-model-xyz") + assert count > 0 + + +class TestChunkText: + """Tests for chunk_text function.""" + + def test_empty_text(self) -> None: + """Test chunking empty text returns empty list.""" + assert chunk_text("") == [] + + def test_short_text_single_chunk(self) -> None: + """Test that short text stays as single chunk.""" + text = "This is a short paragraph." + chunks = chunk_text(text, chunk_size=1000) + assert len(chunks) == 1 + assert chunks[0] == text + + def test_multiple_paragraphs_chunking(self) -> None: + """Test chunking multiple paragraphs.""" + paragraphs = ["Paragraph one. " * 50, "Paragraph two. " * 50, "Paragraph three. " * 50] + text = "\n\n".join(paragraphs) + + # Use small chunk size to force splitting + chunks = chunk_text(text, chunk_size=200, overlap=20) + assert len(chunks) > 1 + + def test_overlap_preserved(self) -> None: + """Test that chunks have overlap for context continuity.""" + # Create text that will definitely need chunking + text = "Sentence one about topic A. " * 20 + "\n\n" + "Sentence two about topic B. 
" * 20 + + chunks = chunk_text(text, chunk_size=100, overlap=30) + + # With overlap, later chunks should contain some content from earlier + if len(chunks) > 1: + # Overlap means adjacent chunks share some content + # This is a rough check - exact overlap depends on tokenization + assert len(chunks) >= 2 + + def test_large_paragraph_sentence_split(self) -> None: + """Test that large paragraphs are split by sentences.""" + # One giant paragraph with multiple sentences + sentences = [ + f"This is sentence number {i}. It contains important information." for i in range(50) + ] + text = " ".join(sentences) + + chunks = chunk_text(text, chunk_size=100, overlap=20) + assert len(chunks) > 1 + + +class TestMiddleTruncate: + """Tests for middle_truncate function.""" + + def test_no_truncation_needed(self) -> None: + """Test that short text is not truncated.""" + text = "Short text" + result, dropped = middle_truncate(text, budget_chars=100) + assert result == text + assert dropped == 0 + + def test_basic_truncation(self) -> None: + """Test basic middle truncation.""" + text = "A" * 100 # 100 character string + result, dropped = middle_truncate(text, budget_chars=50) + + # Should have head + marker + tail + assert len(result) <= 50 + 50 # Allow for marker + assert dropped > 0 + assert "[..." 
in result + assert "truncated...]" in result + + def test_head_tail_fractions(self) -> None: + """Test custom head/tail fractions.""" + text = "AAAAA" + "BBBBB" * 20 + "CCCCC" + result, dropped = middle_truncate(text, budget_chars=30, head_frac=0.5, tail_frac=0.5) + + # Should preserve beginning (A's) and end (C's) + assert result.startswith("A") + assert dropped > 0 + + def test_zero_budget(self) -> None: + """Test with zero budget returns original.""" + text = "Some text" + result, dropped = middle_truncate(text, budget_chars=0) + assert result == text + assert dropped == 0 + + def test_negative_budget(self) -> None: + """Test with negative budget returns original.""" + text = "Some text" + result, dropped = middle_truncate(text, budget_chars=-10) + assert result == text + assert dropped == 0 + + +class TestEstimateSummaryTokens: + """Tests for estimate_summary_tokens function.""" + + def test_none_level(self) -> None: + """Test level 0 (NONE) returns 0.""" + assert estimate_summary_tokens(1000, level=0) == 0 + + def test_brief_level(self) -> None: + """Test level 1 (BRIEF) compression.""" + # BRIEF: ~20% compression, capped at 50 + result = estimate_summary_tokens(100, level=1) + assert result >= 20 # minimum of 20 + assert result <= 50 # capped at 50 + + def test_standard_level(self) -> None: + """Test level 2 (STANDARD) compression.""" + # STANDARD: ~12% compression, capped at 200 + result = estimate_summary_tokens(1000, level=2) + assert result >= 50 # minimum of 50 + assert result <= 200 # capped at 200 + + def test_detailed_level(self) -> None: + """Test level 3 (DETAILED) compression.""" + # DETAILED: ~7% compression, capped at 500 + result = estimate_summary_tokens(10000, level=3) + assert result >= 100 # minimum of 100 + assert result <= 500 # capped at 500 + + def test_hierarchical_level(self) -> None: + """Test level 4 (HIERARCHICAL) compression.""" + # HIERARCHICAL: base of 1000 + diminishing returns + result = estimate_summary_tokens(50000, level=4) 
+ assert result >= 1000 # base minimum + assert result <= 2000 # capped at 2000 + + def test_hierarchical_small_input(self) -> None: + """Test HIERARCHICAL with smaller input.""" + # Even with small input, should return base + result = estimate_summary_tokens(5000, level=4) + assert result == 1000 # just the base, no additional + + +class TestTokensToWords: + """Tests for tokens_to_words function.""" + + def test_basic_conversion(self) -> None: + """Test basic token to word conversion.""" + # 1 token ≈ 0.75 words + assert tokens_to_words(100) == 75 + assert tokens_to_words(1000) == 750 + + def test_zero_tokens(self) -> None: + """Test zero tokens returns zero words.""" + assert tokens_to_words(0) == 0 + + def test_small_values(self) -> None: + """Test small token values.""" + assert tokens_to_words(1) == 0 # int(0.75) = 0 + assert tokens_to_words(2) == 1 # int(1.5) = 1 + assert tokens_to_words(4) == 3 # int(3.0) = 3 From 47c32a688185390603caaa31b31f0b70b2b378a1 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 20:00:28 -0800 Subject: [PATCH 07/37] refactor(summarizer): improve code quality and add Letta-style features - Fix datetime.utcnow() deprecation, use datetime.now(UTC) - Extract duplicate chunk summarization to _summarize_single_chunk() - Add SummarizationError exception for better error handling - Add retry with exponential backoff (1s, 2s, 4s) for generation failures - Add middle-truncation fallback for oversized content (Letta-style) - Export SummarizationError from module __init__ --- agent_cli/summarizer/__init__.py | 3 +- agent_cli/summarizer/adaptive.py | 176 ++++++++++++++++++++++--------- agent_cli/summarizer/models.py | 4 +- tests/summarizer/test_models.py | 4 +- 4 files changed, 135 insertions(+), 52 deletions(-) diff --git a/agent_cli/summarizer/__init__.py b/agent_cli/summarizer/__init__.py index c6f1d85a1..d017dfd4b 100644 --- a/agent_cli/summarizer/__init__.py +++ b/agent_cli/summarizer/__init__.py @@ -16,7 +16,7 @@ """ -from 
agent_cli.summarizer.adaptive import AdaptiveSummarizer +from agent_cli.summarizer.adaptive import AdaptiveSummarizer, SummarizationError from agent_cli.summarizer.models import ( HierarchicalSummary, SummaryLevel, @@ -26,6 +26,7 @@ __all__ = [ "AdaptiveSummarizer", "HierarchicalSummary", + "SummarizationError", "SummaryLevel", "SummaryResult", ] diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index ed0074d87..e8ff2f9a0 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -38,6 +38,7 @@ chunk_text, count_tokens, estimate_summary_tokens, + middle_truncate, tokens_to_words, ) @@ -57,6 +58,14 @@ # Minimum number of L1 chunks before L2 grouping is applied L2_MIN_CHUNKS = 5 +# Retry settings for summarization failures +MAX_SUMMARIZE_RETRIES = 3 + +# Maximum characters per chunk before applying middle truncation +# This prevents context overflow errors for very large chunks +# (roughly 12K tokens with cl100k_base encoding) +MAX_CHUNK_CHARS = 48000 + class SummaryOutput(BaseModel): """Structured output for summary generation.""" @@ -64,6 +73,10 @@ class SummaryOutput(BaseModel): summary: str +class SummarizationError(Exception): + """Raised when summarization fails after all retries.""" + + class AdaptiveSummarizer: """Adaptive summarization that scales with input complexity. @@ -245,6 +258,68 @@ async def update_rolling_summary( return await self._generate_summary(prompt, max_tokens=target_tokens + 50) + async def _summarize_single_chunk( + self, + chunk: str, + chunk_index: int, + total_chunks: int, + *, + parent_group: int | None = None, + ) -> ChunkSummary: + """Summarize a single chunk of content. + + Extracted to avoid duplication between _detailed_summary and + _hierarchical_summary methods. Uses middle truncation as a fallback + for oversized content (Letta-style). + + Args: + chunk: The text chunk to summarize. + chunk_index: Index of this chunk (0-based). 
+ total_chunks: Total number of chunks being processed. + parent_group: Optional L2 group index for hierarchical summaries. + + Returns: + ChunkSummary with the summarized content. + + """ + # Apply middle truncation if chunk is too large (Letta-style fallback) + source_tokens = count_tokens(chunk, self.model) + content_to_summarize = chunk + if len(chunk) > MAX_CHUNK_CHARS: + content_to_summarize, dropped = middle_truncate( + chunk, + MAX_CHUNK_CHARS, + head_frac=0.3, + tail_frac=0.3, + ) + logger.warning( + "Chunk %d truncated: dropped %d chars to fit context window", + chunk_index, + dropped, + ) + + chunk_tokens = count_tokens(content_to_summarize, self.model) + target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD) + max_words = tokens_to_words(target_tokens) + + prompt = CHUNK_SUMMARY_PROMPT.format( + chunk_index=chunk_index + 1, + total_chunks=total_chunks, + content=content_to_summarize, + max_words=max_words, + ) + + summary = await self._generate_summary(prompt, max_tokens=target_tokens + 50) + summary_tokens = count_tokens(summary, self.model) + + return ChunkSummary( + chunk_index=chunk_index, + content=summary, + token_count=summary_tokens, + source_tokens=source_tokens, # Report original token count + parent_group=parent_group, + ) + async def _brief_summary(self, content: str) -> str: """Generate a single-sentence summary for brief content.""" prompt = BRIEF_SUMMARY_PROMPT.format(content=content) @@ -286,32 +361,17 @@ async def _detailed_summary(self, content: str, input_tokens: int) -> SummaryRes # Summarize chunks (with concurrency limit) semaphore = asyncio.Semaphore(self.max_concurrent_chunks) - async def summarize_chunk(idx: int, chunk: str) -> ChunkSummary: + async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: async with semaphore: - chunk_tokens = count_tokens(chunk, self.model) - target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD) - max_words = tokens_to_words(target_tokens) - - 
prompt = CHUNK_SUMMARY_PROMPT.format( - chunk_index=idx + 1, - total_chunks=len(chunks), - content=chunk, - max_words=max_words, - ) - - summary = await self._generate_summary(prompt, max_tokens=target_tokens + 50) - summary_tokens = count_tokens(summary, self.model) - - return ChunkSummary( - chunk_index=idx, - content=summary, - token_count=summary_tokens, - source_tokens=chunk_tokens, + return await self._summarize_single_chunk( + chunk, + idx, + len(chunks), parent_group=None, ) chunk_summaries = await asyncio.gather( - *[summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)], + *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)], ) # Generate meta-summary @@ -364,35 +424,19 @@ async def _hierarchical_summary(self, content: str, input_tokens: int) -> Summar # L1: Summarize each chunk semaphore = asyncio.Semaphore(self.max_concurrent_chunks) - async def summarize_chunk(idx: int, chunk: str) -> ChunkSummary: + async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: async with semaphore: - chunk_tokens = count_tokens(chunk, self.model) - target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD) - max_words = tokens_to_words(target_tokens) - - prompt = CHUNK_SUMMARY_PROMPT.format( - chunk_index=idx + 1, - total_chunks=len(chunks), - content=chunk, - max_words=max_words, - ) - - summary = await self._generate_summary(prompt, max_tokens=target_tokens + 50) - summary_tokens = count_tokens(summary, self.model) - - # Assign to group (5 chunks per group) - group_idx = idx // 5 - - return ChunkSummary( - chunk_index=idx, - content=summary, - token_count=summary_tokens, - source_tokens=chunk_tokens, + # Assign to L2 group (L2_GROUP_SIZE chunks per group) + group_idx = idx // L2_GROUP_SIZE + return await self._summarize_single_chunk( + chunk, + idx, + len(chunks), parent_group=group_idx, ) l1_summaries = await asyncio.gather( - *[summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)], + *[summarize_with_limit(i, 
chunk) for i, chunk in enumerate(chunks)], ) # L2: Group summaries (if more than L2_MIN_CHUNKS chunks) @@ -448,10 +492,29 @@ async def summarize_group(group: list[str]) -> str: compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, ) - async def _generate_summary(self, prompt: str, max_tokens: int = 256) -> str: + async def _generate_summary( + self, + prompt: str, + max_tokens: int = 256, + *, + attempt: int = 0, + ) -> str: """Generate a summary using the LLM. Uses PydanticAI for structured output with fallback to raw generation. + Implements exponential backoff retry on failures. + + Args: + prompt: The prompt to send to the LLM. + max_tokens: Maximum tokens for the response. + attempt: Current retry attempt (for internal recursion). + + Returns: + The generated summary text. + + Raises: + SummarizationError: If all retries are exhausted. + """ model = OpenAIChatModel( model_name=self.model, @@ -475,7 +538,26 @@ async def _generate_summary(self, prompt: str, max_tokens: int = 256) -> str: except Exception as e: logger.warning("Structured summary failed, trying raw generation: %s", e) # Fallback to raw HTTP call - return await self._raw_generate(prompt, max_tokens) + try: + return await self._raw_generate(prompt, max_tokens) + except Exception as raw_err: + if attempt < MAX_SUMMARIZE_RETRIES: + wait_time = 2**attempt # Exponential backoff: 1, 2, 4 seconds + logger.warning( + "Raw generation failed (attempt %d/%d), retrying in %ds: %s", + attempt + 1, + MAX_SUMMARIZE_RETRIES, + wait_time, + raw_err, + ) + await asyncio.sleep(wait_time) + return await self._generate_summary( + prompt, + max_tokens, + attempt=attempt + 1, + ) + msg = f"Summarization failed after {MAX_SUMMARIZE_RETRIES} retries" + raise SummarizationError(msg) from raw_err async def _raw_generate(self, prompt: str, max_tokens: int) -> str: """Fallback raw HTTP generation without structured output.""" diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py 
index f231a41e5..de9bc609a 100644 --- a/agent_cli/summarizer/models.py +++ b/agent_cli/summarizer/models.py @@ -2,7 +2,7 @@ from __future__ import annotations -from datetime import datetime +from datetime import UTC, datetime from enum import IntEnum from typing import Any @@ -122,7 +122,7 @@ class SummaryResult(BaseModel): description="Ratio of output to input tokens (lower = more compression)", ) created_at: datetime = Field( - default_factory=datetime.utcnow, + default_factory=lambda: datetime.now(UTC), description="Timestamp when summary was created", ) diff --git a/tests/summarizer/test_models.py b/tests/summarizer/test_models.py index 5a6583cd2..e27fa18e0 100644 --- a/tests/summarizer/test_models.py +++ b/tests/summarizer/test_models.py @@ -328,5 +328,5 @@ def test_created_at_default(self) -> None: compression_ratio=0.1, ) after = datetime.now(UTC) - # Compare without timezone since result.created_at may be naive - assert before.replace(tzinfo=None) <= result.created_at <= after.replace(tzinfo=None) + # All datetimes should be UTC-aware + assert before <= result.created_at <= after From f145f37dc653aae8d9842922a72a9842be8c9ea5 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 20:11:26 -0800 Subject: [PATCH 08/37] refactor(summarizer): replace class with functional API - Remove AdaptiveSummarizer class in favor of standalone functions - Add SummarizerConfig dataclass for configuration - Export determine_level() as pure function (no state needed) - Update summarize(), update_rolling_summary() to take config parameter - Update _ingest.py to use new functional API - Update all tests for new API This matches the functional style used throughout the codebase, reducing state and improving testability. 
--- agent_cli/memory/_ingest.py | 12 +- agent_cli/memory/_persistence.py | 2 +- agent_cli/summarizer/__init__.py | 19 +- agent_cli/summarizer/adaptive.py | 925 ++++++++++++++------------- tests/summarizer/test_adaptive.py | 219 ++++--- tests/summarizer/test_integration.py | 37 +- 6 files changed, 617 insertions(+), 597 deletions(-) diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py index 6673000c1..53e3f3c3f 100644 --- a/agent_cli/memory/_ingest.py +++ b/agent_cli/memory/_ingest.py @@ -334,9 +334,8 @@ async def summarize_content( ) -> SummaryResult: """Adaptively summarize content based on its length. - Uses the AdaptiveSummarizer to automatically select the appropriate - summarization strategy (NONE, BRIEF, STANDARD, DETAILED, HIERARCHICAL) - based on input token count. + Automatically selects the appropriate summarization strategy + (NONE, BRIEF, STANDARD, DETAILED, HIERARCHICAL) based on input token count. Args: content: The content to summarize. @@ -351,15 +350,16 @@ async def summarize_content( """ # Import here to avoid circular imports and allow optional dependency - from agent_cli.summarizer import AdaptiveSummarizer # noqa: PLC0415 + from agent_cli.summarizer import SummarizerConfig, summarize # noqa: PLC0415 - summarizer = AdaptiveSummarizer( + config = SummarizerConfig( openai_base_url=openai_base_url, model=model, api_key=api_key, ) - return await summarizer.summarize( + return await summarize( content=content, + config=config, prior_summary=prior_summary, content_type=content_type, ) diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py index 9c38f7315..e27eb83fe 100644 --- a/agent_cli/memory/_persistence.py +++ b/agent_cli/memory/_persistence.py @@ -210,7 +210,7 @@ def persist_hierarchical_summary( collection: ChromaDB collection. memory_root: Root path for memory files. conversation_id: The conversation this summary belongs to. - summary_result: The result from AdaptiveSummarizer.summarize(). 
+ summary_result: The result from summarize(). Returns: List of IDs that were stored. diff --git a/agent_cli/summarizer/__init__.py b/agent_cli/summarizer/__init__.py index d017dfd4b..09210146c 100644 --- a/agent_cli/summarizer/__init__.py +++ b/agent_cli/summarizer/__init__.py @@ -5,18 +5,24 @@ compression ratios) architectures. Example: - from agent_cli.summarizer import AdaptiveSummarizer, SummaryLevel + from agent_cli.summarizer import summarize, SummarizerConfig, determine_level - summarizer = AdaptiveSummarizer( + config = SummarizerConfig( openai_base_url="http://localhost:8000/v1", model="gpt-4", ) - result = await summarizer.summarize(long_document) + result = await summarize(long_document, config) print(f"Level: {result.level}, Compression: {result.compression_ratio:.1%}") """ -from agent_cli.summarizer.adaptive import AdaptiveSummarizer, SummarizationError +from agent_cli.summarizer.adaptive import ( + SummarizationError, + SummarizerConfig, + determine_level, + summarize, + update_rolling_summary, +) from agent_cli.summarizer.models import ( HierarchicalSummary, SummaryLevel, @@ -24,9 +30,12 @@ ) __all__ = [ - "AdaptiveSummarizer", "HierarchicalSummary", "SummarizationError", + "SummarizerConfig", "SummaryLevel", "SummaryResult", + "determine_level", + "summarize", + "update_rolling_summary", ] diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index e8ff2f9a0..38fa865d0 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -11,6 +11,7 @@ import asyncio import logging +from dataclasses import dataclass import httpx from pydantic import BaseModel @@ -77,508 +78,512 @@ class SummarizationError(Exception): """Raised when summarization fails after all retries.""" -class AdaptiveSummarizer: - """Adaptive summarization that scales with input complexity. 
- - Automatically selects the appropriate summarization strategy based on - input length: - - NONE (< 100 tokens): No summary needed - - BRIEF (100-500 tokens): Single sentence - - STANDARD (500-3000 tokens): Paragraph summary - - DETAILED (3000-15000 tokens): Chunked + meta-summary - - HIERARCHICAL (> 15000 tokens): Multi-level tree of summaries +@dataclass +class SummarizerConfig: + """Configuration for summarization operations. Example: - summarizer = AdaptiveSummarizer( + config = SummarizerConfig( openai_base_url="http://localhost:8000/v1", model="llama3.1:8b", ) - result = await summarizer.summarize(long_document) + result = await summarize(long_document, config) print(f"Level: {result.level.name}") - print(f"Summary: {result.summary}") print(f"Compression: {result.compression_ratio:.1%}") """ - def __init__( - self, - openai_base_url: str, - model: str, - api_key: str | None = None, - chunk_size: int = 3000, - chunk_overlap: int = 200, - max_concurrent_chunks: int = 5, - timeout: float = 60.0, - ) -> None: - """Initialize the adaptive summarizer. - - Args: - openai_base_url: Base URL for OpenAI-compatible API. - model: Model name to use for summarization. - api_key: API key (optional for local models). - chunk_size: Target token count per chunk for hierarchical summarization. - chunk_overlap: Token overlap between chunks. - max_concurrent_chunks: Maximum parallel chunk summarizations. - timeout: Request timeout in seconds. - - """ - self.openai_base_url = openai_base_url.rstrip("/") - self.model = model - self.api_key = api_key or "not-needed" - self.chunk_size = chunk_size - self.chunk_overlap = chunk_overlap - self.max_concurrent_chunks = max_concurrent_chunks - self.timeout = timeout - - self._provider = OpenAIProvider(api_key=self.api_key, base_url=self.openai_base_url) - - def determine_level(self, token_count: int) -> SummaryLevel: - """Determine the appropriate summary level based on token count. - - Args: - token_count: Number of tokens in the input. 
- - Returns: - The recommended SummaryLevel. - - """ - if token_count < LEVEL_THRESHOLDS[SummaryLevel.NONE]: - return SummaryLevel.NONE - if token_count < LEVEL_THRESHOLDS[SummaryLevel.BRIEF]: - return SummaryLevel.BRIEF - if token_count < LEVEL_THRESHOLDS[SummaryLevel.STANDARD]: - return SummaryLevel.STANDARD - if token_count < LEVEL_THRESHOLDS[SummaryLevel.DETAILED]: - return SummaryLevel.DETAILED - return SummaryLevel.HIERARCHICAL - - async def summarize( - self, - content: str, - prior_summary: str | None = None, - content_type: str = "general", - ) -> SummaryResult: - """Summarize content with adaptive strategy based on length. - - Args: - content: The content to summarize. - prior_summary: Optional prior summary for context continuity. - content_type: Type of content ("general", "conversation", "journal", "document"). - - Returns: - SummaryResult with summary and metadata. - - """ - if not content or not content.strip(): - return SummaryResult( - level=SummaryLevel.NONE, - summary=None, - hierarchical=None, - input_tokens=0, - output_tokens=0, - compression_ratio=0.0, - ) + openai_base_url: str + model: str + api_key: str | None = None + chunk_size: int = 3000 + chunk_overlap: int = 200 + max_concurrent_chunks: int = 5 + timeout: float = 60.0 - input_tokens = count_tokens(content, self.model) - level = self.determine_level(input_tokens) + def __post_init__(self) -> None: + """Normalize the base URL.""" + self.openai_base_url = self.openai_base_url.rstrip("/") + if self.api_key is None: + self.api_key = "not-needed" - logger.info( - "Summarizing %d tokens at level %s (type=%s)", - input_tokens, - level.name, - content_type, - ) - if level == SummaryLevel.NONE: - return SummaryResult( - level=level, - summary=None, - hierarchical=None, - input_tokens=input_tokens, - output_tokens=0, - compression_ratio=0.0, - ) +def determine_level(token_count: int) -> SummaryLevel: + """Determine the appropriate summary level based on token count. 
- if level == SummaryLevel.BRIEF: - summary = await self._brief_summary(content) - elif level == SummaryLevel.STANDARD: - summary = await self._standard_summary(content, prior_summary, content_type) - elif level == SummaryLevel.DETAILED: - return await self._detailed_summary(content, input_tokens) - else: # HIERARCHICAL - return await self._hierarchical_summary(content, input_tokens) + Args: + token_count: Number of tokens in the input. - output_tokens = count_tokens(summary, self.model) if summary else 0 - compression_ratio = output_tokens / input_tokens if input_tokens > 0 else 0.0 + Returns: + The recommended SummaryLevel. + """ + if token_count < LEVEL_THRESHOLDS[SummaryLevel.NONE]: + return SummaryLevel.NONE + if token_count < LEVEL_THRESHOLDS[SummaryLevel.BRIEF]: + return SummaryLevel.BRIEF + if token_count < LEVEL_THRESHOLDS[SummaryLevel.STANDARD]: + return SummaryLevel.STANDARD + if token_count < LEVEL_THRESHOLDS[SummaryLevel.DETAILED]: + return SummaryLevel.DETAILED + return SummaryLevel.HIERARCHICAL + + +async def summarize( + content: str, + config: SummarizerConfig, + prior_summary: str | None = None, + content_type: str = "general", +) -> SummaryResult: + """Summarize content with adaptive strategy based on length. + + Args: + content: The content to summarize. + config: Summarizer configuration. + prior_summary: Optional prior summary for context continuity. + content_type: Type of content ("general", "conversation", "journal", "document"). + + Returns: + SummaryResult with summary and metadata. + + """ + if not content or not content.strip(): return SummaryResult( - level=level, - summary=summary, + level=SummaryLevel.NONE, + summary=None, hierarchical=None, - input_tokens=input_tokens, - output_tokens=output_tokens, - compression_ratio=compression_ratio, - ) - - async def update_rolling_summary( - self, - prior_summary: str | None, - new_facts: list[str], - ) -> str: - """Update a rolling summary with new facts (Mem0-style). 
- - This is optimized for incremental updates where you have discrete - new facts to integrate into an existing summary. - - Args: - prior_summary: The existing summary to update. - new_facts: List of new facts to integrate. - - Returns: - Updated summary string. - - """ - if not new_facts: - return prior_summary or "" - - new_content = "\n".join(f"- {fact}" for fact in new_facts) - combined_tokens = count_tokens( - (prior_summary or "") + new_content, - self.model, - ) - - target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD) - max_words = tokens_to_words(target_tokens) - - prompt = ROLLING_SUMMARY_PROMPT.format( - prior_summary=prior_summary or "(No prior summary)", - new_content=new_content, - max_words=max_words, + input_tokens=0, + output_tokens=0, + compression_ratio=0.0, ) - return await self._generate_summary(prompt, max_tokens=target_tokens + 50) - - async def _summarize_single_chunk( - self, - chunk: str, - chunk_index: int, - total_chunks: int, - *, - parent_group: int | None = None, - ) -> ChunkSummary: - """Summarize a single chunk of content. - - Extracted to avoid duplication between _detailed_summary and - _hierarchical_summary methods. Uses middle truncation as a fallback - for oversized content (Letta-style). - - Args: - chunk: The text chunk to summarize. - chunk_index: Index of this chunk (0-based). - total_chunks: Total number of chunks being processed. - parent_group: Optional L2 group index for hierarchical summaries. - - Returns: - ChunkSummary with the summarized content. 
- - """ - # Apply middle truncation if chunk is too large (Letta-style fallback) - source_tokens = count_tokens(chunk, self.model) - content_to_summarize = chunk - if len(chunk) > MAX_CHUNK_CHARS: - content_to_summarize, dropped = middle_truncate( - chunk, - MAX_CHUNK_CHARS, - head_frac=0.3, - tail_frac=0.3, - ) - logger.warning( - "Chunk %d truncated: dropped %d chars to fit context window", - chunk_index, - dropped, - ) + input_tokens = count_tokens(content, config.model) + level = determine_level(input_tokens) - chunk_tokens = count_tokens(content_to_summarize, self.model) - target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD) - max_words = tokens_to_words(target_tokens) + logger.info( + "Summarizing %d tokens at level %s (type=%s)", + input_tokens, + level.name, + content_type, + ) - prompt = CHUNK_SUMMARY_PROMPT.format( - chunk_index=chunk_index + 1, - total_chunks=total_chunks, - content=content_to_summarize, - max_words=max_words, + if level == SummaryLevel.NONE: + return SummaryResult( + level=level, + summary=None, + hierarchical=None, + input_tokens=input_tokens, + output_tokens=0, + compression_ratio=0.0, ) - summary = await self._generate_summary(prompt, max_tokens=target_tokens + 50) - summary_tokens = count_tokens(summary, self.model) + if level == SummaryLevel.BRIEF: + summary = await _brief_summary(content, config) + elif level == SummaryLevel.STANDARD: + summary = await _standard_summary(content, config, prior_summary, content_type) + elif level == SummaryLevel.DETAILED: + return await _detailed_summary(content, input_tokens, config) + else: # HIERARCHICAL + return await _hierarchical_summary(content, input_tokens, config) + + output_tokens = count_tokens(summary, config.model) if summary else 0 + compression_ratio = output_tokens / input_tokens if input_tokens > 0 else 0.0 + + return SummaryResult( + level=level, + summary=summary, + hierarchical=None, + input_tokens=input_tokens, + output_tokens=output_tokens, + 
compression_ratio=compression_ratio, + ) + + +async def update_rolling_summary( + prior_summary: str | None, + new_facts: list[str], + config: SummarizerConfig, +) -> str: + """Update a rolling summary with new facts (Mem0-style). + + This is optimized for incremental updates where you have discrete + new facts to integrate into an existing summary. + + Args: + prior_summary: The existing summary to update. + new_facts: List of new facts to integrate. + config: Summarizer configuration. + + Returns: + Updated summary string. - return ChunkSummary( - chunk_index=chunk_index, - content=summary, - token_count=summary_tokens, - source_tokens=source_tokens, # Report original token count - parent_group=parent_group, - ) - - async def _brief_summary(self, content: str) -> str: - """Generate a single-sentence summary for brief content.""" - prompt = BRIEF_SUMMARY_PROMPT.format(content=content) - return await self._generate_summary(prompt, max_tokens=50) - - async def _standard_summary( - self, - content: str, - prior_summary: str | None, - content_type: str, - ) -> str: - """Generate a paragraph summary for standard-length content.""" - input_tokens = count_tokens(content, self.model) - target_tokens = estimate_summary_tokens(input_tokens, SummaryLevel.STANDARD) - max_words = tokens_to_words(target_tokens) - - prompt_template = get_prompt_for_content_type(content_type) - prior_context = format_prior_context(prior_summary) - - prompt = prompt_template.format( - content=content, - prior_context=prior_context, - max_words=max_words, - ) + """ + if not new_facts: + return prior_summary or "" - return await self._generate_summary(prompt, max_tokens=target_tokens + 50) + new_content = "\n".join(f"- {fact}" for fact in new_facts) + combined_tokens = count_tokens( + (prior_summary or "") + new_content, + config.model, + ) - async def _detailed_summary(self, content: str, input_tokens: int) -> SummaryResult: - """Generate chunked summaries with meta-summary for detailed content.""" 
- chunks = chunk_text( - content, - chunk_size=self.chunk_size, - overlap=self.chunk_overlap, - model=self.model, - ) + target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD) + max_words = tokens_to_words(target_tokens) - logger.info("Detailed summary: processing %d chunks", len(chunks)) + prompt = ROLLING_SUMMARY_PROMPT.format( + prior_summary=prior_summary or "(No prior summary)", + new_content=new_content, + max_words=max_words, + ) - # Summarize chunks (with concurrency limit) - semaphore = asyncio.Semaphore(self.max_concurrent_chunks) + return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) - async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: - async with semaphore: - return await self._summarize_single_chunk( - chunk, - idx, - len(chunks), - parent_group=None, - ) - chunk_summaries = await asyncio.gather( - *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)], - ) +async def _summarize_single_chunk( + chunk: str, + chunk_index: int, + total_chunks: int, + config: SummarizerConfig, + *, + parent_group: int | None = None, +) -> ChunkSummary: + """Summarize a single chunk of content. - # Generate meta-summary - all_summaries = [cs.content for cs in chunk_summaries] - meta_target = estimate_summary_tokens(input_tokens, SummaryLevel.DETAILED) - max_words = tokens_to_words(meta_target) + Uses middle truncation as a fallback for oversized content (Letta-style). - meta_prompt = META_SUMMARY_PROMPT.format( - summaries=format_summaries_for_meta(all_summaries), - max_words=max_words, - ) + Args: + chunk: The text chunk to summarize. + chunk_index: Index of this chunk (0-based). + total_chunks: Total number of chunks being processed. + config: Summarizer configuration. + parent_group: Optional L2 group index for hierarchical summaries. 
- final_summary = await self._generate_summary(meta_prompt, max_tokens=meta_target + 100) - output_tokens = count_tokens(final_summary, self.model) + Returns: + ChunkSummary with the summarized content. - hierarchical = HierarchicalSummary( - l1_summaries=list(chunk_summaries), - l2_summaries=[], # Not used for DETAILED level - l3_summary=final_summary, - chunk_size=self.chunk_size, - chunk_overlap=self.chunk_overlap, + """ + # Apply middle truncation if chunk is too large (Letta-style fallback) + source_tokens = count_tokens(chunk, config.model) + content_to_summarize = chunk + if len(chunk) > MAX_CHUNK_CHARS: + content_to_summarize, dropped = middle_truncate( + chunk, + MAX_CHUNK_CHARS, + head_frac=0.3, + tail_frac=0.3, ) - - return SummaryResult( - level=SummaryLevel.DETAILED, - summary=final_summary, - hierarchical=hierarchical, - input_tokens=input_tokens, - output_tokens=output_tokens, - compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + logger.warning( + "Chunk %d truncated: dropped %d chars to fit context window", + chunk_index, + dropped, ) - async def _hierarchical_summary(self, content: str, input_tokens: int) -> SummaryResult: - """Build a tree of summaries for very long content. 
- - Structure: - - L1: Individual chunk summaries - - L2: Group summaries (groups of ~5 L1 summaries) - - L3: Final synthesis - """ - chunks = chunk_text( - content, - chunk_size=self.chunk_size, - overlap=self.chunk_overlap, - model=self.model, - ) + chunk_tokens = count_tokens(content_to_summarize, config.model) + target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD) + max_words = tokens_to_words(target_tokens) + + prompt = CHUNK_SUMMARY_PROMPT.format( + chunk_index=chunk_index + 1, + total_chunks=total_chunks, + content=content_to_summarize, + max_words=max_words, + ) + + summary = await _generate_summary(prompt, config, max_tokens=target_tokens + 50) + summary_tokens = count_tokens(summary, config.model) + + return ChunkSummary( + chunk_index=chunk_index, + content=summary, + token_count=summary_tokens, + source_tokens=source_tokens, # Report original token count + parent_group=parent_group, + ) + + +async def _brief_summary(content: str, config: SummarizerConfig) -> str: + """Generate a single-sentence summary for brief content.""" + prompt = BRIEF_SUMMARY_PROMPT.format(content=content) + return await _generate_summary(prompt, config, max_tokens=50) + + +async def _standard_summary( + content: str, + config: SummarizerConfig, + prior_summary: str | None, + content_type: str, +) -> str: + """Generate a paragraph summary for standard-length content.""" + input_tokens = count_tokens(content, config.model) + target_tokens = estimate_summary_tokens(input_tokens, SummaryLevel.STANDARD) + max_words = tokens_to_words(target_tokens) + + prompt_template = get_prompt_for_content_type(content_type) + prior_context = format_prior_context(prior_summary) + + prompt = prompt_template.format( + content=content, + prior_context=prior_context, + max_words=max_words, + ) + + return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) + + +async def _detailed_summary( + content: str, + input_tokens: int, + config: SummarizerConfig, +) -> 
SummaryResult: + """Generate chunked summaries with meta-summary for detailed content.""" + chunks = chunk_text( + content, + chunk_size=config.chunk_size, + overlap=config.chunk_overlap, + model=config.model, + ) + + logger.info("Detailed summary: processing %d chunks", len(chunks)) + + # Summarize chunks (with concurrency limit) + semaphore = asyncio.Semaphore(config.max_concurrent_chunks) + + async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: + async with semaphore: + return await _summarize_single_chunk( + chunk, + idx, + len(chunks), + config, + parent_group=None, + ) - logger.info("Hierarchical summary: processing %d chunks in tree", len(chunks)) + chunk_summaries = await asyncio.gather( + *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)], + ) + + # Generate meta-summary + all_summaries = [cs.content for cs in chunk_summaries] + meta_target = estimate_summary_tokens(input_tokens, SummaryLevel.DETAILED) + max_words = tokens_to_words(meta_target) + + meta_prompt = META_SUMMARY_PROMPT.format( + summaries=format_summaries_for_meta(all_summaries), + max_words=max_words, + ) + + final_summary = await _generate_summary( + meta_prompt, + config, + max_tokens=meta_target + 100, + ) + output_tokens = count_tokens(final_summary, config.model) + + hierarchical = HierarchicalSummary( + l1_summaries=list(chunk_summaries), + l2_summaries=[], # Not used for DETAILED level + l3_summary=final_summary, + chunk_size=config.chunk_size, + chunk_overlap=config.chunk_overlap, + ) + + return SummaryResult( + level=SummaryLevel.DETAILED, + summary=final_summary, + hierarchical=hierarchical, + input_tokens=input_tokens, + output_tokens=output_tokens, + compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + ) + + +async def _hierarchical_summary( + content: str, + input_tokens: int, + config: SummarizerConfig, +) -> SummaryResult: + """Build a tree of summaries for very long content. 
+ + Structure: + - L1: Individual chunk summaries + - L2: Group summaries (groups of ~5 L1 summaries) + - L3: Final synthesis + """ + chunks = chunk_text( + content, + chunk_size=config.chunk_size, + overlap=config.chunk_overlap, + model=config.model, + ) + + logger.info("Hierarchical summary: processing %d chunks in tree", len(chunks)) + + # L1: Summarize each chunk + semaphore = asyncio.Semaphore(config.max_concurrent_chunks) + + async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: + async with semaphore: + # Assign to L2 group (L2_GROUP_SIZE chunks per group) + group_idx = idx // L2_GROUP_SIZE + return await _summarize_single_chunk( + chunk, + idx, + len(chunks), + config, + parent_group=group_idx, + ) - # L1: Summarize each chunk - semaphore = asyncio.Semaphore(self.max_concurrent_chunks) + l1_summaries = await asyncio.gather( + *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)], + ) + + # L2: Group summaries (if more than L2_MIN_CHUNKS chunks) + l2_summaries: list[str] = [] + if len(l1_summaries) > L2_MIN_CHUNKS: + groups: list[list[str]] = [] + for i in range(0, len(l1_summaries), L2_GROUP_SIZE): + group = [cs.content for cs in l1_summaries[i : i + L2_GROUP_SIZE]] + groups.append(group) + + async def summarize_group(group: list[str]) -> str: + combined_tokens = sum(count_tokens(s, config.model) for s in group) + target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD) + max_words = tokens_to_words(target_tokens) + + prompt = META_SUMMARY_PROMPT.format( + summaries=format_summaries_for_meta(group), + max_words=max_words, + ) + return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) + + l2_summaries = await asyncio.gather(*[summarize_group(g) for g in groups]) + + # L3: Final synthesis + summaries_to_synthesize = l2_summaries if l2_summaries else [cs.content for cs in l1_summaries] + final_target = estimate_summary_tokens(input_tokens, SummaryLevel.HIERARCHICAL) + max_words = 
tokens_to_words(final_target) + + final_prompt = META_SUMMARY_PROMPT.format( + summaries=format_summaries_for_meta(summaries_to_synthesize), + max_words=max_words, + ) + + final_summary = await _generate_summary( + final_prompt, + config, + max_tokens=final_target + 100, + ) + output_tokens = count_tokens(final_summary, config.model) + + hierarchical = HierarchicalSummary( + l1_summaries=list(l1_summaries), + l2_summaries=list(l2_summaries), + l3_summary=final_summary, + chunk_size=config.chunk_size, + chunk_overlap=config.chunk_overlap, + ) + + return SummaryResult( + level=SummaryLevel.HIERARCHICAL, + summary=final_summary, + hierarchical=hierarchical, + input_tokens=input_tokens, + output_tokens=output_tokens, + compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + ) + + +async def _generate_summary( + prompt: str, + config: SummarizerConfig, + max_tokens: int = 256, + *, + attempt: int = 0, +) -> str: + """Generate a summary using the LLM. + + Uses PydanticAI for structured output with fallback to raw generation. + Implements exponential backoff retry on failures. + + Args: + prompt: The prompt to send to the LLM. + config: Summarizer configuration. + max_tokens: Maximum tokens for the response. + attempt: Current retry attempt (for internal recursion). + + Returns: + The generated summary text. + + Raises: + SummarizationError: If all retries are exhausted. 
- async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: - async with semaphore: - # Assign to L2 group (L2_GROUP_SIZE chunks per group) - group_idx = idx // L2_GROUP_SIZE - return await self._summarize_single_chunk( - chunk, - idx, - len(chunks), - parent_group=group_idx, + """ + provider = OpenAIProvider(api_key=config.api_key, base_url=config.openai_base_url) + model = OpenAIChatModel( + model_name=config.model, + provider=provider, + settings=ModelSettings( + temperature=0.3, + max_tokens=max_tokens, + ), + ) + + agent = Agent( + model=model, + system_prompt="You are a concise summarizer. Output only the summary, no preamble.", + output_type=SummaryOutput, + retries=2, + ) + + try: + result = await agent.run(prompt) + return result.output.summary.strip() + except Exception as e: + logger.warning("Structured summary failed, trying raw generation: %s", e) + # Fallback to raw HTTP call + try: + return await _raw_generate(prompt, config, max_tokens) + except Exception as raw_err: + if attempt < MAX_SUMMARIZE_RETRIES: + wait_time = 2**attempt # Exponential backoff: 1, 2, 4 seconds + logger.warning( + "Raw generation failed (attempt %d/%d), retrying in %ds: %s", + attempt + 1, + MAX_SUMMARIZE_RETRIES, + wait_time, + raw_err, ) - - l1_summaries = await asyncio.gather( - *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)], - ) - - # L2: Group summaries (if more than L2_MIN_CHUNKS chunks) - l2_summaries: list[str] = [] - if len(l1_summaries) > L2_MIN_CHUNKS: - groups: list[list[str]] = [] - for i in range(0, len(l1_summaries), L2_GROUP_SIZE): - group = [cs.content for cs in l1_summaries[i : i + L2_GROUP_SIZE]] - groups.append(group) - - async def summarize_group(group: list[str]) -> str: - combined_tokens = sum(count_tokens(s, self.model) for s in group) - target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD) - max_words = tokens_to_words(target_tokens) - - prompt = META_SUMMARY_PROMPT.format( - 
summaries=format_summaries_for_meta(group), - max_words=max_words, + await asyncio.sleep(wait_time) + return await _generate_summary( + prompt, + config, + max_tokens, + attempt=attempt + 1, ) - return await self._generate_summary(prompt, max_tokens=target_tokens + 50) - - l2_summaries = await asyncio.gather(*[summarize_group(g) for g in groups]) - - # L3: Final synthesis - summaries_to_synthesize = ( - l2_summaries if l2_summaries else [cs.content for cs in l1_summaries] - ) - final_target = estimate_summary_tokens(input_tokens, SummaryLevel.HIERARCHICAL) - max_words = tokens_to_words(final_target) - - final_prompt = META_SUMMARY_PROMPT.format( - summaries=format_summaries_for_meta(summaries_to_synthesize), - max_words=max_words, + msg = f"Summarization failed after {MAX_SUMMARIZE_RETRIES} retries" + raise SummarizationError(msg) from raw_err + + +async def _raw_generate(prompt: str, config: SummarizerConfig, max_tokens: int) -> str: + """Fallback raw HTTP generation without structured output.""" + async with httpx.AsyncClient(timeout=config.timeout) as client: + response = await client.post( + f"{config.openai_base_url}/chat/completions", + headers={"Authorization": f"Bearer {config.api_key}"}, + json={ + "model": config.model, + "messages": [ + {"role": "system", "content": "You are a concise summarizer."}, + {"role": "user", "content": prompt}, + ], + "temperature": 0.3, + "max_tokens": max_tokens, + }, ) + response.raise_for_status() + data = response.json() - final_summary = await self._generate_summary(final_prompt, max_tokens=final_target + 100) - output_tokens = count_tokens(final_summary, self.model) - - hierarchical = HierarchicalSummary( - l1_summaries=list(l1_summaries), - l2_summaries=list(l2_summaries), - l3_summary=final_summary, - chunk_size=self.chunk_size, - chunk_overlap=self.chunk_overlap, - ) - - return SummaryResult( - level=SummaryLevel.HIERARCHICAL, - summary=final_summary, - hierarchical=hierarchical, - input_tokens=input_tokens, - 
output_tokens=output_tokens, - compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, - ) - - async def _generate_summary( - self, - prompt: str, - max_tokens: int = 256, - *, - attempt: int = 0, - ) -> str: - """Generate a summary using the LLM. - - Uses PydanticAI for structured output with fallback to raw generation. - Implements exponential backoff retry on failures. - - Args: - prompt: The prompt to send to the LLM. - max_tokens: Maximum tokens for the response. - attempt: Current retry attempt (for internal recursion). - - Returns: - The generated summary text. - - Raises: - SummarizationError: If all retries are exhausted. - - """ - model = OpenAIChatModel( - model_name=self.model, - provider=self._provider, - settings=ModelSettings( - temperature=0.3, - max_tokens=max_tokens, - ), - ) - - agent = Agent( - model=model, - system_prompt="You are a concise summarizer. Output only the summary, no preamble.", - output_type=SummaryOutput, - retries=2, - ) - - try: - result = await agent.run(prompt) - return result.output.summary.strip() - except Exception as e: - logger.warning("Structured summary failed, trying raw generation: %s", e) - # Fallback to raw HTTP call - try: - return await self._raw_generate(prompt, max_tokens) - except Exception as raw_err: - if attempt < MAX_SUMMARIZE_RETRIES: - wait_time = 2**attempt # Exponential backoff: 1, 2, 4 seconds - logger.warning( - "Raw generation failed (attempt %d/%d), retrying in %ds: %s", - attempt + 1, - MAX_SUMMARIZE_RETRIES, - wait_time, - raw_err, - ) - await asyncio.sleep(wait_time) - return await self._generate_summary( - prompt, - max_tokens, - attempt=attempt + 1, - ) - msg = f"Summarization failed after {MAX_SUMMARIZE_RETRIES} retries" - raise SummarizationError(msg) from raw_err - - async def _raw_generate(self, prompt: str, max_tokens: int) -> str: - """Fallback raw HTTP generation without structured output.""" - async with httpx.AsyncClient(timeout=self.timeout) as client: - response 
= await client.post( - f"{self.openai_base_url}/chat/completions", - headers={"Authorization": f"Bearer {self.api_key}"}, - json={ - "model": self.model, - "messages": [ - {"role": "system", "content": "You are a concise summarizer."}, - {"role": "user", "content": prompt}, - ], - "temperature": 0.3, - "max_tokens": max_tokens, - }, - ) - response.raise_for_status() - data = response.json() - - choices = data.get("choices", []) - if choices: - return choices[0].get("message", {}).get("content", "").strip() - return "" + choices = data.get("choices", []) + if choices: + return choices[0].get("message", {}).get("content", "").strip() + return "" diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py index 1f010999e..f5db1486c 100644 --- a/tests/summarizer/test_adaptive.py +++ b/tests/summarizer/test_adaptive.py @@ -1,4 +1,4 @@ -"""Unit tests for AdaptiveSummarizer.""" +"""Unit tests for adaptive summarization functions.""" from __future__ import annotations @@ -8,37 +8,42 @@ from agent_cli.summarizer.adaptive import ( LEVEL_THRESHOLDS, - AdaptiveSummarizer, + SummarizerConfig, SummaryOutput, + _generate_summary, + _raw_generate, + determine_level, + summarize, + update_rolling_summary, ) from agent_cli.summarizer.models import SummaryLevel, SummaryResult -class TestAdaptiveSummarizerInit: - """Tests for AdaptiveSummarizer initialization.""" +class TestSummarizerConfig: + """Tests for SummarizerConfig initialization.""" def test_basic_init(self) -> None: """Test basic initialization with required parameters.""" - summarizer = AdaptiveSummarizer( + config = SummarizerConfig( openai_base_url="http://localhost:8000/v1", model="llama3.1:8b", ) - assert summarizer.openai_base_url == "http://localhost:8000/v1" - assert summarizer.model == "llama3.1:8b" - assert summarizer.api_key == "not-needed" + assert config.openai_base_url == "http://localhost:8000/v1" + assert config.model == "llama3.1:8b" + assert config.api_key == "not-needed" def 
test_init_with_api_key(self) -> None: """Test initialization with custom API key.""" - summarizer = AdaptiveSummarizer( + config = SummarizerConfig( openai_base_url="http://localhost:8000/v1", model="gpt-4", api_key="sk-test-key", ) - assert summarizer.api_key == "sk-test-key" + assert config.api_key == "sk-test-key" def test_init_with_custom_settings(self) -> None: """Test initialization with custom chunk settings.""" - summarizer = AdaptiveSummarizer( + config = SummarizerConfig( openai_base_url="http://localhost:8000/v1", model="gpt-4", chunk_size=5000, @@ -46,59 +51,51 @@ def test_init_with_custom_settings(self) -> None: max_concurrent_chunks=10, timeout=120.0, ) - assert summarizer.chunk_size == 5000 - assert summarizer.chunk_overlap == 300 - assert summarizer.max_concurrent_chunks == 10 - assert summarizer.timeout == 120.0 + assert config.chunk_size == 5000 + assert config.chunk_overlap == 300 + assert config.max_concurrent_chunks == 10 + assert config.timeout == 120.0 def test_trailing_slash_stripped(self) -> None: """Test that trailing slash is stripped from base URL.""" - summarizer = AdaptiveSummarizer( + config = SummarizerConfig( openai_base_url="http://localhost:8000/v1/", model="gpt-4", ) - assert summarizer.openai_base_url == "http://localhost:8000/v1" + assert config.openai_base_url == "http://localhost:8000/v1" class TestDetermineLevel: """Tests for level determination based on token count.""" - @pytest.fixture - def summarizer(self) -> AdaptiveSummarizer: - """Create a summarizer instance.""" - return AdaptiveSummarizer( - openai_base_url="http://localhost:8000/v1", - model="gpt-4", - ) - - def test_none_level_threshold(self, summarizer: AdaptiveSummarizer) -> None: + def test_none_level_threshold(self) -> None: """Test NONE level for very short content.""" - assert summarizer.determine_level(50) == SummaryLevel.NONE - assert summarizer.determine_level(99) == SummaryLevel.NONE + assert determine_level(50) == SummaryLevel.NONE + assert 
determine_level(99) == SummaryLevel.NONE - def test_brief_level_threshold(self, summarizer: AdaptiveSummarizer) -> None: + def test_brief_level_threshold(self) -> None: """Test BRIEF level for short content.""" - assert summarizer.determine_level(100) == SummaryLevel.BRIEF - assert summarizer.determine_level(300) == SummaryLevel.BRIEF - assert summarizer.determine_level(499) == SummaryLevel.BRIEF + assert determine_level(100) == SummaryLevel.BRIEF + assert determine_level(300) == SummaryLevel.BRIEF + assert determine_level(499) == SummaryLevel.BRIEF - def test_standard_level_threshold(self, summarizer: AdaptiveSummarizer) -> None: + def test_standard_level_threshold(self) -> None: """Test STANDARD level for medium content.""" - assert summarizer.determine_level(500) == SummaryLevel.STANDARD - assert summarizer.determine_level(1500) == SummaryLevel.STANDARD - assert summarizer.determine_level(2999) == SummaryLevel.STANDARD + assert determine_level(500) == SummaryLevel.STANDARD + assert determine_level(1500) == SummaryLevel.STANDARD + assert determine_level(2999) == SummaryLevel.STANDARD - def test_detailed_level_threshold(self, summarizer: AdaptiveSummarizer) -> None: + def test_detailed_level_threshold(self) -> None: """Test DETAILED level for longer content.""" - assert summarizer.determine_level(3000) == SummaryLevel.DETAILED - assert summarizer.determine_level(8000) == SummaryLevel.DETAILED - assert summarizer.determine_level(14999) == SummaryLevel.DETAILED + assert determine_level(3000) == SummaryLevel.DETAILED + assert determine_level(8000) == SummaryLevel.DETAILED + assert determine_level(14999) == SummaryLevel.DETAILED - def test_hierarchical_level_threshold(self, summarizer: AdaptiveSummarizer) -> None: + def test_hierarchical_level_threshold(self) -> None: """Test HIERARCHICAL level for very long content.""" - assert summarizer.determine_level(15000) == SummaryLevel.HIERARCHICAL - assert summarizer.determine_level(50000) == SummaryLevel.HIERARCHICAL - 
assert summarizer.determine_level(100000) == SummaryLevel.HIERARCHICAL + assert determine_level(15000) == SummaryLevel.HIERARCHICAL + assert determine_level(50000) == SummaryLevel.HIERARCHICAL + assert determine_level(100000) == SummaryLevel.HIERARCHICAL def test_thresholds_match_constants(self) -> None: """Verify thresholds match the module constants.""" @@ -109,46 +106,55 @@ def test_thresholds_match_constants(self) -> None: class TestSummarize: - """Tests for main summarize method.""" + """Tests for main summarize function.""" @pytest.fixture - def summarizer(self) -> AdaptiveSummarizer: - """Create a summarizer instance.""" - return AdaptiveSummarizer( + def config(self) -> SummarizerConfig: + """Create a config instance.""" + return SummarizerConfig( openai_base_url="http://localhost:8000/v1", model="gpt-4", ) @pytest.mark.asyncio - async def test_empty_content_returns_none_level(self, summarizer: AdaptiveSummarizer) -> None: + async def test_empty_content_returns_none_level( + self, + config: SummarizerConfig, + ) -> None: """Test that empty content returns NONE level result.""" - result = await summarizer.summarize("") + result = await summarize("", config) assert result.level == SummaryLevel.NONE assert result.summary is None assert result.input_tokens == 0 assert result.output_tokens == 0 @pytest.mark.asyncio - async def test_whitespace_only_returns_none_level(self, summarizer: AdaptiveSummarizer) -> None: + async def test_whitespace_only_returns_none_level( + self, + config: SummarizerConfig, + ) -> None: """Test that whitespace-only content returns NONE level result.""" - result = await summarizer.summarize(" \n\n ") + result = await summarize(" \n\n ", config) assert result.level == SummaryLevel.NONE assert result.summary is None @pytest.mark.asyncio - async def test_very_short_content_no_summary(self, summarizer: AdaptiveSummarizer) -> None: + async def test_very_short_content_no_summary( + self, + config: SummarizerConfig, + ) -> None: """Test that 
very short content gets NONE level (no summary).""" # Less than 100 tokens - result = await summarizer.summarize("Hello world") + result = await summarize("Hello world", config) assert result.level == SummaryLevel.NONE assert result.summary is None @pytest.mark.asyncio - @patch.object(AdaptiveSummarizer, "_brief_summary") + @patch("agent_cli.summarizer.adaptive._brief_summary") async def test_brief_level_calls_brief_summary( self, mock_brief: AsyncMock, - summarizer: AdaptiveSummarizer, + config: SummarizerConfig, ) -> None: """Test that BRIEF level content calls _brief_summary.""" mock_brief.return_value = "Brief summary." @@ -156,18 +162,18 @@ async def test_brief_level_calls_brief_summary( # Create content that's ~100-500 tokens content = "This is a test sentence. " * 30 # ~150 tokens - result = await summarizer.summarize(content) + result = await summarize(content, config) - mock_brief.assert_called_once_with(content) + mock_brief.assert_called_once_with(content, config) assert result.level == SummaryLevel.BRIEF assert result.summary == "Brief summary." @pytest.mark.asyncio - @patch.object(AdaptiveSummarizer, "_standard_summary") + @patch("agent_cli.summarizer.adaptive._standard_summary") async def test_standard_level_calls_standard_summary( self, mock_standard: AsyncMock, - summarizer: AdaptiveSummarizer, + config: SummarizerConfig, ) -> None: """Test that STANDARD level content calls _standard_summary.""" mock_standard.return_value = "Standard summary paragraph." @@ -175,18 +181,18 @@ async def test_standard_level_calls_standard_summary( # Create content that's ~500-3000 tokens content = "This is a test sentence with more words. 
" * 100 # ~800 tokens - result = await summarizer.summarize(content, content_type="general") + result = await summarize(content, config, content_type="general") - mock_standard.assert_called_once_with(content, None, "general") + mock_standard.assert_called_once_with(content, config, None, "general") assert result.level == SummaryLevel.STANDARD assert result.summary == "Standard summary paragraph." @pytest.mark.asyncio - @patch.object(AdaptiveSummarizer, "_standard_summary") + @patch("agent_cli.summarizer.adaptive._standard_summary") async def test_prior_summary_passed_to_standard( self, mock_standard: AsyncMock, - summarizer: AdaptiveSummarizer, + config: SummarizerConfig, ) -> None: """Test that prior_summary is passed to _standard_summary.""" mock_standard.return_value = "Updated summary." @@ -194,16 +200,16 @@ async def test_prior_summary_passed_to_standard( content = "This is a test sentence with more words. " * 100 prior = "Previous context summary." - await summarizer.summarize(content, prior_summary=prior) + await summarize(content, config, prior_summary=prior) - mock_standard.assert_called_once_with(content, prior, "general") + mock_standard.assert_called_once_with(content, config, prior, "general") @pytest.mark.asyncio - @patch.object(AdaptiveSummarizer, "_detailed_summary") + @patch("agent_cli.summarizer.adaptive._detailed_summary") async def test_detailed_level_calls_detailed_summary( self, mock_detailed: AsyncMock, - summarizer: AdaptiveSummarizer, + config: SummarizerConfig, ) -> None: """Test that DETAILED level content calls _detailed_summary.""" mock_result = SummaryResult( @@ -219,17 +225,17 @@ async def test_detailed_level_calls_detailed_summary( # Create content that's ~3000-15000 tokens content = "Word " * 5000 # ~5000 tokens - result = await summarizer.summarize(content) + result = await summarize(content, config) assert mock_detailed.called assert result.level == SummaryLevel.DETAILED @pytest.mark.asyncio - @patch.object(AdaptiveSummarizer, 
"_hierarchical_summary") + @patch("agent_cli.summarizer.adaptive._hierarchical_summary") async def test_hierarchical_level_calls_hierarchical_summary( self, mock_hierarchical: AsyncMock, - summarizer: AdaptiveSummarizer, + config: SummarizerConfig, ) -> None: """Test that HIERARCHICAL level content calls _hierarchical_summary.""" mock_result = SummaryResult( @@ -245,7 +251,7 @@ async def test_hierarchical_level_calls_hierarchical_summary( # Create content that's > 15000 tokens content = "Word " * 20000 - result = await summarizer.summarize(content) + result = await summarize(content, config) assert mock_hierarchical.called assert result.level == SummaryLevel.HIERARCHICAL @@ -255,62 +261,69 @@ class TestUpdateRollingSummary: """Tests for rolling summary updates.""" @pytest.fixture - def summarizer(self) -> AdaptiveSummarizer: - """Create a summarizer instance.""" - return AdaptiveSummarizer( + def config(self) -> SummarizerConfig: + """Create a config instance.""" + return SummarizerConfig( openai_base_url="http://localhost:8000/v1", model="gpt-4", ) @pytest.mark.asyncio - async def test_empty_facts_returns_prior(self, summarizer: AdaptiveSummarizer) -> None: + async def test_empty_facts_returns_prior(self, config: SummarizerConfig) -> None: """Test that empty facts list returns prior summary.""" - result = await summarizer.update_rolling_summary( + result = await update_rolling_summary( prior_summary="Existing summary", new_facts=[], + config=config, ) assert result == "Existing summary" @pytest.mark.asyncio - async def test_empty_facts_no_prior_returns_empty(self, summarizer: AdaptiveSummarizer) -> None: + async def test_empty_facts_no_prior_returns_empty( + self, + config: SummarizerConfig, + ) -> None: """Test that empty facts with no prior returns empty string.""" - result = await summarizer.update_rolling_summary( + result = await update_rolling_summary( prior_summary=None, new_facts=[], + config=config, ) assert result == "" @pytest.mark.asyncio - 
@patch.object(AdaptiveSummarizer, "_generate_summary") + @patch("agent_cli.summarizer.adaptive._generate_summary") async def test_new_facts_calls_generate( self, mock_generate: AsyncMock, - summarizer: AdaptiveSummarizer, + config: SummarizerConfig, ) -> None: """Test that new facts trigger summary generation.""" mock_generate.return_value = "Updated summary with new facts." - result = await summarizer.update_rolling_summary( + result = await update_rolling_summary( prior_summary="Old summary", new_facts=["User likes coffee", "User lives in Amsterdam"], + config=config, ) mock_generate.assert_called_once() assert result == "Updated summary with new facts." @pytest.mark.asyncio - @patch.object(AdaptiveSummarizer, "_generate_summary") + @patch("agent_cli.summarizer.adaptive._generate_summary") async def test_facts_formatted_as_list( self, mock_generate: AsyncMock, - summarizer: AdaptiveSummarizer, + config: SummarizerConfig, ) -> None: """Test that facts are formatted as bullet list in prompt.""" mock_generate.return_value = "Summary" - await summarizer.update_rolling_summary( + await update_rolling_summary( prior_summary="Prior", new_facts=["Fact one", "Fact two"], + config=config, ) # Check the prompt contains formatted facts @@ -321,12 +334,12 @@ async def test_facts_formatted_as_list( class TestGenerateSummary: - """Tests for _generate_summary method.""" + """Tests for _generate_summary function.""" @pytest.fixture - def summarizer(self) -> AdaptiveSummarizer: - """Create a summarizer instance.""" - return AdaptiveSummarizer( + def config(self) -> SummarizerConfig: + """Create a config instance.""" + return SummarizerConfig( openai_base_url="http://localhost:8000/v1", model="gpt-4", ) @@ -334,7 +347,7 @@ def summarizer(self) -> AdaptiveSummarizer: @pytest.mark.asyncio async def test_generate_summary_with_pydantic_ai( self, - summarizer: AdaptiveSummarizer, + config: SummarizerConfig, ) -> None: """Test summary generation using PydanticAI agent.""" # Mock the 
entire agent creation and run @@ -346,17 +359,17 @@ async def test_generate_summary_with_pydantic_ai( mock_agent.run = AsyncMock(return_value=mock_result) mock_agent_class.return_value = mock_agent - result = await summarizer._generate_summary("Test prompt", max_tokens=100) + result = await _generate_summary("Test prompt", config, max_tokens=100) assert result == "Generated summary." mock_agent.run.assert_called_once_with("Test prompt") @pytest.mark.asyncio - @patch.object(AdaptiveSummarizer, "_raw_generate") + @patch("agent_cli.summarizer.adaptive._raw_generate") async def test_fallback_to_raw_generate_on_error( self, mock_raw: AsyncMock, - summarizer: AdaptiveSummarizer, + config: SummarizerConfig, ) -> None: """Test fallback to raw HTTP on PydanticAI error.""" mock_raw.return_value = "Fallback summary" @@ -366,25 +379,25 @@ async def test_fallback_to_raw_generate_on_error( mock_agent.run = AsyncMock(side_effect=Exception("API error")) mock_agent_class.return_value = mock_agent - result = await summarizer._generate_summary("Test prompt", max_tokens=100) + result = await _generate_summary("Test prompt", config, max_tokens=100) - mock_raw.assert_called_once_with("Test prompt", 100) + mock_raw.assert_called_once_with("Test prompt", config, 100) assert result == "Fallback summary" class TestRawGenerate: - """Tests for _raw_generate fallback method.""" + """Tests for _raw_generate fallback function.""" @pytest.fixture - def summarizer(self) -> AdaptiveSummarizer: - """Create a summarizer instance.""" - return AdaptiveSummarizer( + def config(self) -> SummarizerConfig: + """Create a config instance.""" + return SummarizerConfig( openai_base_url="http://localhost:8000/v1", model="gpt-4", ) @pytest.mark.asyncio - async def test_raw_generate_success(self, summarizer: AdaptiveSummarizer) -> None: + async def test_raw_generate_success(self, config: SummarizerConfig) -> None: """Test successful raw HTTP generation.""" mock_response = MagicMock() 
mock_response.json.return_value = { @@ -398,12 +411,12 @@ async def test_raw_generate_success(self, summarizer: AdaptiveSummarizer) -> Non mock_client.__aexit__ = AsyncMock(return_value=None) mock_client_class.return_value = mock_client - result = await summarizer._raw_generate("Test prompt", max_tokens=100) + result = await _raw_generate("Test prompt", config, max_tokens=100) assert result == "Raw generated summary" @pytest.mark.asyncio - async def test_raw_generate_empty_choices(self, summarizer: AdaptiveSummarizer) -> None: + async def test_raw_generate_empty_choices(self, config: SummarizerConfig) -> None: """Test raw generate with empty choices returns empty string.""" mock_response = MagicMock() mock_response.json.return_value = {"choices": []} @@ -415,7 +428,7 @@ async def test_raw_generate_empty_choices(self, summarizer: AdaptiveSummarizer) mock_client.__aexit__ = AsyncMock(return_value=None) mock_client_class.return_value = mock_client - result = await summarizer._raw_generate("Test prompt", max_tokens=100) + result = await _raw_generate("Test prompt", config, max_tokens=100) assert result == "" diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py index 381f9f5b6..e58a20f6c 100644 --- a/tests/summarizer/test_integration.py +++ b/tests/summarizer/test_integration.py @@ -14,7 +14,7 @@ get_summary_at_level, upsert_hierarchical_summary, ) -from agent_cli.summarizer import AdaptiveSummarizer, SummaryLevel, SummaryResult +from agent_cli.summarizer import SummaryLevel, SummaryResult, determine_level from agent_cli.summarizer.models import ChunkSummary, HierarchicalSummary if TYPE_CHECKING: @@ -401,40 +401,32 @@ def test_persist_deletes_old_summaries( assert deleted_dir.exists() -class TestAdaptiveSummarizerLevelDetermination: - """Test that AdaptiveSummarizer correctly determines summary levels.""" +class TestDetermineLevelFunction: + """Test that determine_level correctly determines summary levels.""" - @pytest.fixture - def 
summarizer(self) -> AdaptiveSummarizer: - """Create an AdaptiveSummarizer instance.""" - return AdaptiveSummarizer( - openai_base_url="http://localhost:8000/v1", - model="test-model", - ) - - def test_very_short_content_is_none(self, summarizer: AdaptiveSummarizer) -> None: + def test_very_short_content_is_none(self) -> None: """Test that content under 100 tokens gets NONE level.""" - level = summarizer.determine_level(50) + level = determine_level(50) assert level == SummaryLevel.NONE - def test_short_content_is_brief(self, summarizer: AdaptiveSummarizer) -> None: + def test_short_content_is_brief(self) -> None: """Test that 100-500 token content gets BRIEF level.""" - level = summarizer.determine_level(300) + level = determine_level(300) assert level == SummaryLevel.BRIEF - def test_medium_content_is_standard(self, summarizer: AdaptiveSummarizer) -> None: + def test_medium_content_is_standard(self) -> None: """Test that 500-3000 token content gets STANDARD level.""" - level = summarizer.determine_level(1500) + level = determine_level(1500) assert level == SummaryLevel.STANDARD - def test_long_content_is_detailed(self, summarizer: AdaptiveSummarizer) -> None: + def test_long_content_is_detailed(self) -> None: """Test that 3000-15000 token content gets DETAILED level.""" - level = summarizer.determine_level(8000) + level = determine_level(8000) assert level == SummaryLevel.DETAILED - def test_very_long_content_is_hierarchical(self, summarizer: AdaptiveSummarizer) -> None: + def test_very_long_content_is_hierarchical(self) -> None: """Test that content over 15000 tokens gets HIERARCHICAL level.""" - level = summarizer.determine_level(25000) + level = determine_level(25000) assert level == SummaryLevel.HIERARCHICAL @@ -444,7 +436,8 @@ class TestSummarizeContentFunction: @pytest.mark.asyncio async def test_summarize_content_creates_result(self) -> None: """Test that summarize_content returns a valid SummaryResult.""" - with patch.object(AdaptiveSummarizer, 
"summarize") as mock_summarize: + # Patch at source since _ingest imports inside the function + with patch("agent_cli.summarizer.summarize") as mock_summarize: mock_result = SummaryResult( level=SummaryLevel.STANDARD, summary="Mocked summary.", From 44cfdda19e7948e3afc0516b6e498bc4ca31c771 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 20:21:59 -0800 Subject: [PATCH 09/37] refactor(summarizer): make internal modules private and simplify public API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename prompts.py → _prompts.py and utils.py → _utils.py - Reduce public API to 6 essential exports: SummarizerConfig, summarize, SummaryResult, SummaryLevel, HierarchicalSummary, SummarizationError - Remove determine_level, update_rolling_summary, count_tokens from public API - Update imports in adaptive.py and test files --- agent_cli/summarizer/__init__.py | 20 ++++--------------- .../summarizer/{prompts.py => _prompts.py} | 0 agent_cli/summarizer/{utils.py => _utils.py} | 0 agent_cli/summarizer/adaptive.py | 16 +++++++-------- tests/summarizer/test_integration.py | 3 ++- tests/summarizer/test_prompts.py | 2 +- tests/summarizer/test_utils.py | 2 +- 7 files changed, 16 insertions(+), 27 deletions(-) rename agent_cli/summarizer/{prompts.py => _prompts.py} (100%) rename agent_cli/summarizer/{utils.py => _utils.py} (100%) diff --git a/agent_cli/summarizer/__init__.py b/agent_cli/summarizer/__init__.py index 09210146c..fc0994c4c 100644 --- a/agent_cli/summarizer/__init__.py +++ b/agent_cli/summarizer/__init__.py @@ -5,29 +5,19 @@ compression ratios) architectures. 
Example: - from agent_cli.summarizer import summarize, SummarizerConfig, determine_level + from agent_cli.summarizer import summarize, SummarizerConfig config = SummarizerConfig( openai_base_url="http://localhost:8000/v1", model="gpt-4", ) result = await summarize(long_document, config) - print(f"Level: {result.level}, Compression: {result.compression_ratio:.1%}") + print(f"Level: {result.level.name}, Compression: {result.compression_ratio:.1%}") """ -from agent_cli.summarizer.adaptive import ( - SummarizationError, - SummarizerConfig, - determine_level, - summarize, - update_rolling_summary, -) -from agent_cli.summarizer.models import ( - HierarchicalSummary, - SummaryLevel, - SummaryResult, -) +from agent_cli.summarizer.adaptive import SummarizationError, SummarizerConfig, summarize +from agent_cli.summarizer.models import HierarchicalSummary, SummaryLevel, SummaryResult __all__ = [ "HierarchicalSummary", @@ -35,7 +25,5 @@ "SummarizerConfig", "SummaryLevel", "SummaryResult", - "determine_level", "summarize", - "update_rolling_summary", ] diff --git a/agent_cli/summarizer/prompts.py b/agent_cli/summarizer/_prompts.py similarity index 100% rename from agent_cli/summarizer/prompts.py rename to agent_cli/summarizer/_prompts.py diff --git a/agent_cli/summarizer/utils.py b/agent_cli/summarizer/_utils.py similarity index 100% rename from agent_cli/summarizer/utils.py rename to agent_cli/summarizer/_utils.py diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 38fa865d0..590dabc55 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -20,13 +20,7 @@ from pydantic_ai.providers.openai import OpenAIProvider from pydantic_ai.settings import ModelSettings -from agent_cli.summarizer.models import ( - ChunkSummary, - HierarchicalSummary, - SummaryLevel, - SummaryResult, -) -from agent_cli.summarizer.prompts import ( +from agent_cli.summarizer._prompts import ( BRIEF_SUMMARY_PROMPT, CHUNK_SUMMARY_PROMPT, 
META_SUMMARY_PROMPT, @@ -35,13 +29,19 @@ format_summaries_for_meta, get_prompt_for_content_type, ) -from agent_cli.summarizer.utils import ( +from agent_cli.summarizer._utils import ( chunk_text, count_tokens, estimate_summary_tokens, middle_truncate, tokens_to_words, ) +from agent_cli.summarizer.models import ( + ChunkSummary, + HierarchicalSummary, + SummaryLevel, + SummaryResult, +) logger = logging.getLogger(__name__) diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py index e58a20f6c..6eeb133ed 100644 --- a/tests/summarizer/test_integration.py +++ b/tests/summarizer/test_integration.py @@ -14,7 +14,8 @@ get_summary_at_level, upsert_hierarchical_summary, ) -from agent_cli.summarizer import SummaryLevel, SummaryResult, determine_level +from agent_cli.summarizer import SummaryLevel, SummaryResult +from agent_cli.summarizer.adaptive import determine_level from agent_cli.summarizer.models import ChunkSummary, HierarchicalSummary if TYPE_CHECKING: diff --git a/tests/summarizer/test_prompts.py b/tests/summarizer/test_prompts.py index e126def22..05937f71a 100644 --- a/tests/summarizer/test_prompts.py +++ b/tests/summarizer/test_prompts.py @@ -2,7 +2,7 @@ from __future__ import annotations -from agent_cli.summarizer.prompts import ( +from agent_cli.summarizer._prompts import ( BRIEF_SUMMARY_PROMPT, CHUNK_SUMMARY_PROMPT, CONVERSATION_SUMMARY_PROMPT, diff --git a/tests/summarizer/test_utils.py b/tests/summarizer/test_utils.py index 458e9b37d..22eb4039e 100644 --- a/tests/summarizer/test_utils.py +++ b/tests/summarizer/test_utils.py @@ -2,7 +2,7 @@ from __future__ import annotations -from agent_cli.summarizer.utils import ( +from agent_cli.summarizer._utils import ( chunk_text, count_tokens, estimate_summary_tokens, From 1de48ddc6f58a73ab336e9123080d9390023a346 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 20:39:01 -0800 Subject: [PATCH 10/37] refactor(memory): wire AdaptiveSummarizer into memory pipeline Replace 
the old rolling summary system with the new hierarchical adaptive summarizer. This simplifies the codebase by removing redundant code paths and using a single, research-backed approach. Changes: - Update extract_and_store_facts_and_summaries() to use summarize_content() and store_adaptive_summary() instead of update_summary()/persist_summary() - Remove old summary functions: update_summary, persist_summary, get_summary_entry - Remove Summary entity and SummaryOutput model (unused) - Add summary_level to L3 metadata for consistency - Update tests to mock new summarizer interface The new system automatically selects summarization level (NONE, BRIEF, STANDARD, DETAILED, HIERARCHICAL) based on content complexity, storing summaries in a L1/L2/L3 hierarchical structure. --- agent_cli/memory/_ingest.py | 80 ++++++---------------------- agent_cli/memory/_persistence.py | 27 +--------- agent_cli/memory/_retrieval.py | 5 +- agent_cli/memory/_store.py | 25 --------- agent_cli/memory/entities.py | 9 ---- agent_cli/memory/models.py | 16 +----- agent_cli/summarizer/models.py | 1 + tests/memory/test_engine.py | 46 +++++++++++----- tests/memory/test_git_integration.py | 14 +++-- tests/memory/test_store.py | 17 ------ 10 files changed, 64 insertions(+), 176 deletions(-) diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py index 53e3f3c3f..b0b472b71 100644 --- a/agent_cli/memory/_ingest.py +++ b/agent_cli/memory/_ingest.py @@ -14,24 +14,21 @@ delete_memory_files, persist_entries, persist_hierarchical_summary, - persist_summary, ) from agent_cli.memory._prompt import ( FACT_INSTRUCTIONS, FACT_SYSTEM_PROMPT, - SUMMARY_PROMPT, UPDATE_MEMORY_PROMPT, ) from agent_cli.memory._retrieval import gather_relevant_existing_memories -from agent_cli.memory._store import delete_entries, get_summary_entry -from agent_cli.memory.entities import Fact, Summary +from agent_cli.memory._store import delete_entries, get_final_summary +from agent_cli.memory.entities import Fact from 
agent_cli.memory.models import ( MemoryAdd, MemoryDecision, MemoryDelete, MemoryIgnore, MemoryUpdate, - SummaryOutput, ) if TYPE_CHECKING: @@ -43,8 +40,6 @@ LOGGER = logging.getLogger(__name__) -_SUMMARY_ROLE = "summary" - def _elapsed_ms(start: float) -> float: """Return elapsed milliseconds since start.""" @@ -283,46 +278,6 @@ def validate_decisions(decisions: list[MemoryDecision]) -> list[MemoryDecision]: return to_add, to_delete, replacement_map -async def update_summary( - *, - prior_summary: str | None, - new_facts: list[str], - openai_base_url: str, - api_key: str | None, - model: str, - max_tokens: int = 256, -) -> str | None: - """Update the conversation summary based on new facts. - - This is the simple Mem0-style rolling summary that incrementally - updates based on new facts. For full content adaptive summarization, - use `summarize_content` instead. - """ - if not new_facts: - return prior_summary - - from pydantic_ai import Agent # noqa: PLC0415 - from pydantic_ai.models.openai import OpenAIChatModel # noqa: PLC0415 - from pydantic_ai.providers.openai import OpenAIProvider # noqa: PLC0415 - from pydantic_ai.settings import ModelSettings # noqa: PLC0415 - - system_prompt = SUMMARY_PROMPT - user_parts: list[str] = [] - if prior_summary: - user_parts.append(f"Previous summary:\n{prior_summary}") - user_parts.append("New facts:\n" + "\n".join(f"- {fact}" for fact in new_facts)) - prompt_text = "\n\n".join(user_parts) - provider = OpenAIProvider(api_key=api_key or "dummy", base_url=openai_base_url) - model_cfg = OpenAIChatModel( - model_name=model, - provider=provider, - settings=ModelSettings(temperature=0.2, max_tokens=max_tokens), - ) - agent = Agent(model=model_cfg, system_prompt=system_prompt, output_type=SummaryOutput) - result = await agent.run(prompt_text) - return result.output.summary or prior_summary - - async def summarize_content( *, content: str, @@ -459,37 +414,34 @@ async def extract_and_store_facts_and_summaries( entries=list(to_add), ) - 
if enable_summarization: - prior_summary_entry = get_summary_entry( - collection, - conversation_id, - role=_SUMMARY_ROLE, - ) + if enable_summarization and facts: + # Get prior summary for context continuity + prior_summary_entry = get_final_summary(collection, conversation_id) prior_summary = prior_summary_entry.content if prior_summary_entry else None + # Summarize the new facts + content_to_summarize = "\n".join(facts) summary_start = perf_counter() - new_summary = await update_summary( + summary_result = await summarize_content( + content=content_to_summarize, prior_summary=prior_summary, - new_facts=facts, + content_type="conversation", openai_base_url=openai_base_url, api_key=api_key, model=model, ) LOGGER.info( - "Summary update completed in %.1f ms (conversation=%s)", + "Summary update completed in %.1f ms (conversation=%s, level=%s)", _elapsed_ms(summary_start), conversation_id, + summary_result.level.name, ) - if new_summary: - summary_obj = Summary( - conversation_id=conversation_id, - content=new_summary, - created_at=datetime.now(UTC), - ) - persist_summary( + if summary_result.summary: + await store_adaptive_summary( collection, memory_root=memory_root, - summary=summary_obj, + conversation_id=conversation_id, + summary_result=summary_result, ) if enable_git_versioning: diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py index e27eb83fe..91585ade8 100644 --- a/agent_cli/memory/_persistence.py +++ b/agent_cli/memory/_persistence.py @@ -24,7 +24,7 @@ upsert_hierarchical_summary, upsert_memories, ) -from agent_cli.memory.entities import Fact, Summary, Turn +from agent_cli.memory.entities import Fact, Turn if TYPE_CHECKING: from pathlib import Path @@ -89,31 +89,6 @@ def persist_entries( upsert_memories(collection, ids=ids, contents=contents, metadatas=metadatas) -def persist_summary( - collection: Collection, - *, - memory_root: Path, - summary: Summary, -) -> None: - """Persist a summary to disk and Chroma.""" - doc_id = 
_safe_identifier(f"{summary.conversation_id}{_SUMMARY_DOC_ID_SUFFIX}-summary") - record = write_memory_file( - memory_root, - conversation_id=summary.conversation_id, - role="summary", - created_at=summary.created_at.isoformat(), - content=summary.content, - summary_kind="summary", - doc_id=doc_id, - ) - upsert_memories( - collection, - ids=[record.id], - contents=[record.content], - metadatas=[record.metadata], - ) - - def delete_memory_files( memory_root: Path, conversation_id: str, diff --git a/agent_cli/memory/_retrieval.py b/agent_cli/memory/_retrieval.py index 283b0afb6..3be059171 100644 --- a/agent_cli/memory/_retrieval.py +++ b/agent_cli/memory/_retrieval.py @@ -8,7 +8,7 @@ from typing import TYPE_CHECKING, Any from agent_cli.core.reranker import OnnxCrossEncoder, predict_relevance -from agent_cli.memory._store import get_summary_entry, query_memories +from agent_cli.memory._store import get_final_summary, query_memories from agent_cli.memory.models import ( ChatRequest, MemoryEntry, @@ -24,7 +24,6 @@ LOGGER = logging.getLogger(__name__) _DEFAULT_MMR_LAMBDA = 0.7 -_SUMMARY_ROLE = "summary" _MIN_MAX_EPSILON = 1e-8 # Avoid division by zero in min-max normalization @@ -212,7 +211,7 @@ def recency_score(meta: MemoryMetadata) -> float: summaries: list[str] = [] if include_summary: - summary_entry = get_summary_entry(collection, conversation_id, role=_SUMMARY_ROLE) + summary_entry = get_final_summary(collection, conversation_id) if summary_entry: summaries.append(f"Conversation summary:\n{summary_entry.content}") diff --git a/agent_cli/memory/_store.py b/agent_cli/memory/_store.py index 4f3755b12..722dcda9e 100644 --- a/agent_cli/memory/_store.py +++ b/agent_cli/memory/_store.py @@ -111,31 +111,6 @@ def query_memories( return records -def get_summary_entry( - collection: Collection, - conversation_id: str, - *, - role: str = "summary", -) -> StoredMemory | None: - """Return the latest summary entry for a conversation, if present.""" - result = collection.get( - 
where={"$and": [{"conversation_id": conversation_id}, {"role": role}]}, - ) - docs = result.get("documents") or [] - metas = result.get("metadatas") or [] - ids = result.get("ids") or [] - - if not docs or not metas or not ids: - return None - - return StoredMemory( - id=ids[0], - content=docs[0], - metadata=MemoryMetadata(**dict(metas[0])), - distance=None, - ) - - def list_conversation_entries( collection: Collection, conversation_id: str, diff --git a/agent_cli/memory/entities.py b/agent_cli/memory/entities.py index 70b16a78c..a352b0bbf 100644 --- a/agent_cli/memory/entities.py +++ b/agent_cli/memory/entities.py @@ -32,12 +32,3 @@ class Fact(BaseModel): source_id: str = Field(..., description="UUID of the Turn this fact was extracted from") created_at: datetime # Facts are always role="memory" implicitly in the storage layer - - -class Summary(BaseModel): - """The rolling summary of a conversation.""" - - conversation_id: str - content: str - created_at: datetime - # Summaries are role="summary" implicitly diff --git a/agent_cli/memory/models.py b/agent_cli/memory/models.py index 6dc689d8f..4eb289c7d 100644 --- a/agent_cli/memory/models.py +++ b/agent_cli/memory/models.py @@ -4,7 +4,7 @@ from typing import Literal -from pydantic import BaseModel, ConfigDict, field_validator +from pydantic import BaseModel, ConfigDict class Message(BaseModel): @@ -70,20 +70,6 @@ class MemoryMetadata(BaseModel): """Name of the SummaryLevel enum used (e.g., 'STANDARD', 'HIERARCHICAL').""" -class SummaryOutput(BaseModel): - """Structured summary returned by the LLM.""" - - summary: str - - @field_validator("summary") - @classmethod - def _not_empty(cls, v: str) -> str: - if not v or not str(v).strip(): - msg = "field must be non-empty" - raise ValueError(msg) - return str(v).strip() - - class StoredMemory(BaseModel): """Memory document as stored in the vector DB.""" diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py index de9bc609a..843d1dfe5 100644 --- 
a/agent_cli/summarizer/models.py +++ b/agent_cli/summarizer/models.py @@ -190,6 +190,7 @@ def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]: "role": "summary", "level": HIERARCHICAL_LEVEL_L3, "is_final": True, + "summary_level": self.level.name, "input_tokens": self.input_tokens, "output_tokens": self.output_tokens, "compression_ratio": self.compression_ratio, diff --git a/tests/memory/test_engine.py b/tests/memory/test_engine.py index 440127acb..12e419de9 100644 --- a/tests/memory/test_engine.py +++ b/tests/memory/test_engine.py @@ -22,8 +22,8 @@ MemoryMetadata, Message, StoredMemory, - SummaryOutput, ) +from agent_cli.summarizer import SummaryLevel, SummaryResult class _DummyReranker: @@ -250,13 +250,13 @@ def fake_query_memories( ) monkeypatch.setattr( _retrieval, - "get_summary_entry", - lambda _collection, _cid, role: StoredMemory( # type: ignore[return-value] - id=f"{role}-id", - content=f"{role} content", + "get_final_summary", + lambda _collection, _cid: StoredMemory( + id="summary-id", + content="summary content", metadata=MemoryMetadata( conversation_id="conv1", - role=role, + role="summary", created_at=now.isoformat(), ), ), @@ -349,11 +349,19 @@ def __init__(self, output: Any) -> None: self.output = output prompt_str = str(prompt_text) - if "New facts:" in prompt_str: - return _Result(SummaryOutput(summary="summary up to 256")) if "Hello, I enjoy biking" in prompt_str: return _Result(["User likes cats.", "User loves biking."]) - return _Result(SummaryOutput(summary="noop")) + return _Result([]) + + async def fake_summarize_content(**_kwargs: Any) -> SummaryResult: + return SummaryResult( + level=SummaryLevel.STANDARD, + summary="summary up to 256", + hierarchical=None, + input_tokens=100, + output_tokens=20, + compression_ratio=0.2, + ) async def fake_reconcile( _collection: Any, @@ -377,6 +385,7 @@ async def fake_reconcile( import pydantic_ai # noqa: PLC0415 monkeypatch.setattr(pydantic_ai.Agent, "run", fake_agent_run) + 
monkeypatch.setattr(_ingest, "summarize_content", fake_summarize_content) # High relevance so they aren't filtered monkeypatch.setattr(_retrieval, "predict_relevance", lambda _model, pairs: [5.0 for _ in pairs]) @@ -568,11 +577,19 @@ def __init__(self, output: Any) -> None: self.output = output prompt_str = str(prompt_text) - if "New facts:" in prompt_str: - return _Result(SummaryOutput(summary="summary text")) if "My cat is Luna" in prompt_str: return _Result(["User has a cat named Luna."]) - return _Result(SummaryOutput(summary="noop")) + return _Result([]) + + async def fake_summarize_content(**_kwargs: Any) -> SummaryResult: + return SummaryResult( + level=SummaryLevel.STANDARD, + summary="summary text", + hierarchical=None, + input_tokens=100, + output_tokens=20, + compression_ratio=0.2, + ) monkeypatch.setattr(engine._streaming, "stream_chat_sse", fake_stream_chat_sse) @@ -598,6 +615,7 @@ async def fake_reconcile( import pydantic_ai # noqa: PLC0415 monkeypatch.setattr(pydantic_ai.Agent, "run", fake_agent_run) + monkeypatch.setattr(_ingest, "summarize_content", fake_summarize_content) response = await engine.process_chat_request( request, @@ -613,5 +631,5 @@ async def fake_reconcile( files = list(tmp_path.glob("entries/**/*.md")) assert len(files) == 4 # user + assistant + fact + 1 summary - assert any("facts" in f.parts for f in files) - assert any(f.parent.name == "summaries" and f.name == "summary.md" for f in files) + assert any("facts" in str(f) for f in files) + assert any("summaries/L3/final.md" in str(f) for f in files) diff --git a/tests/memory/test_git_integration.py b/tests/memory/test_git_integration.py index 7d59f7c0e..db197b023 100644 --- a/tests/memory/test_git_integration.py +++ b/tests/memory/test_git_integration.py @@ -14,6 +14,7 @@ from agent_cli.memory import _ingest from agent_cli.memory.client import MemoryClient from agent_cli.memory.entities import Fact +from agent_cli.summarizer import SummaryLevel, SummaryResult if TYPE_CHECKING: from 
pathlib import Path @@ -63,12 +64,19 @@ async def fake_reconcile( ] return entries, [], {} - async def fake_update_summary(*_args: Any, **_kwargs: Any) -> str: - return "User likes testing." + async def fake_summarize_content(**_kwargs: Any) -> SummaryResult: + return SummaryResult( + level=SummaryLevel.STANDARD, + summary="User likes testing.", + hierarchical=None, + input_tokens=100, + output_tokens=20, + compression_ratio=0.2, + ) monkeypatch.setattr(_ingest, "extract_salient_facts", fake_extract) monkeypatch.setattr(_ingest, "reconcile_facts", fake_reconcile) - monkeypatch.setattr(_ingest, "update_summary", fake_update_summary) + monkeypatch.setattr(_ingest, "summarize_content", fake_summarize_content) # Patch Reranker to avoid loading ONNX model monkeypatch.setattr("agent_cli.memory.client.get_reranker_model", MagicMock()) diff --git a/tests/memory/test_store.py b/tests/memory/test_store.py index 3edd0eeb9..453a21a9a 100644 --- a/tests/memory/test_store.py +++ b/tests/memory/test_store.py @@ -101,23 +101,6 @@ def query(self, **kwargs: Any) -> dict[str, Any]: assert {"role": {"$ne": "summary"}} in clauses -def test_get_summary_entry_returns_entry() -> None: - # ChromaDB's .get() returns flat lists (not nested like .query()) - fake = _FakeCollection( - get_result={ - "documents": ["summary text"], - "metadatas": [ - {"conversation_id": "c1", "role": "summary", "created_at": "now"}, - ], - "ids": ["sum1"], - }, - ) - entry = _store.get_summary_entry(fake, "c1", role="summary") - assert entry is not None - assert entry.id == "sum1" - assert entry.metadata.role == "summary" - - def test_list_conversation_entries_filters_summaries() -> None: # ChromaDB's .get() returns flat lists (not nested like .query()) fake = _FakeCollection( From cd43bb3dd4c4b29235bbbdbfff29cfc23300c495 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 20:42:41 -0800 Subject: [PATCH 11/37] docs: add summarizer spec and update memory docs for hierarchical summaries - Create 
docs/architecture/summarizer.md with comprehensive technical specification for the adaptive summarization system - Update memory.md to reflect new L1/L2/L3 hierarchical summary structure - Document level thresholds, compression ratios, and research basis - Add content-type aware prompts documentation - Document integration with memory system and storage format --- docs/architecture/memory.md | 37 ++- docs/architecture/summarizer.md | 553 ++++++++++++++++++++++++++++++++ 2 files changed, 581 insertions(+), 9 deletions(-) create mode 100644 docs/architecture/summarizer.md diff --git a/docs/architecture/memory.md b/docs/architecture/memory.md index f99637ff3..b42e739a9 100644 --- a/docs/architecture/memory.md +++ b/docs/architecture/memory.md @@ -164,7 +164,13 @@ entries/ assistant/ __.md # Raw assistant responses summaries/ - summary.md # The single rolling summary of the conversation + L1/ + chunk_0.md # Level 1: Individual chunk summaries + chunk_1.md + L2/ + group_0.md # Level 2: Group summaries (groups of ~5 L1s) + L3/ + final.md # Level 3: Final synthesized summary ``` **Deleted Directory Structure (Soft Deletes):** @@ -176,7 +182,7 @@ entries/ facts/ __.md summaries/ - summary.md # Tombstoned summary + L1/, L2/, L3/ # Tombstoned summary levels ``` ### 2.2 File Format @@ -270,10 +276,18 @@ Resolves contradictions using a "Search-Decide-Update" loop with complete enumer * **Updates:** Implemented as delete + add with a fresh ID; tombstones record `replaced_by`. * **Deletes:** Soft-deletes files (moved under `deleted/`) and removes from Chroma. -### 4.4 Summarization -* **Input:** Previous summary (if any) + newly extracted facts. -* **Prompt:** `SUMMARY_PROMPT` (updates the running summary). -* **Persistence:** Writes a single `summaries/summary.md` per conversation (deterministic doc ID). +### 4.4 Summarization (Adaptive Hierarchical) +Uses the `agent_cli.summarizer` module for research-backed adaptive summarization. 
+ +* **Level Selection:** Automatically determines summarization depth based on token count: + * `NONE` (< 100 tokens): No summary needed, facts only. + * `BRIEF` (100-500 tokens): Single-sentence summary (~20% compression). + * `STANDARD` (500-3000 tokens): Paragraph summary (~12% compression). + * `DETAILED` (3000-15000 tokens): Chunked summaries + meta-summary (~7% compression). + * `HIERARCHICAL` (> 15000 tokens): Full L1/L2/L3 tree structure. +* **Input:** Previous L3 summary (if any) + newly extracted facts. +* **Persistence:** Stores summaries in `summaries/L1/`, `L2/`, `L3/` subdirectories with YAML front matter containing compression metrics. +* **See:** `docs/architecture/summarizer.md` for detailed algorithm specification. ### 4.5 Eviction * **Trigger:** If total entries in conversation > `max_entries` (default 500). @@ -303,9 +317,14 @@ To replicate the system behavior, the following prompt strategies are required. * **NONE:** Existing memory is unrelated to new facts, or new fact is an exact duplicate. * **Output constraints:** JSON list containing all memories; each existing memory must have an event; new unrelated facts must be ADDed; no prose or code fences. -### 5.3 Summarization (`SUMMARY_PROMPT`) -* **Goal:** Maintain a concise running summary. -* **Constraints:** Aggregate related facts. Drop transient chit-chat. Focus on durable info. +### 5.3 Summarization (Adaptive Prompts) +The summarizer uses level-specific prompts from `agent_cli.summarizer._prompts`: +* **`BRIEF_PROMPT`:** Single-sentence distillation for short content. +* **`STANDARD_PROMPT`:** Paragraph summary with prior context integration. +* **`CHUNK_PROMPT`:** Individual chunk summarization for hierarchical processing. +* **`META_PROMPT`:** Synthesizes multiple chunk summaries into cohesive narrative. +* **`ROLLING_PROMPT`:** Integrates new facts with existing summary. 
+* **Content-type variants:** `CONVERSATION_PROMPT`, `JOURNAL_PROMPT`, `DOCUMENT_PROMPT` for domain-specific summarization. --- diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md new file mode 100644 index 000000000..940ddddb8 --- /dev/null +++ b/docs/architecture/summarizer.md @@ -0,0 +1,553 @@ +# Agent CLI: Adaptive Summarizer Technical Specification + +This document describes the architectural decisions, design rationale, and technical approach for the `agent-cli` adaptive summarization subsystem. The design is grounded in research from Letta (partial eviction, middle truncation) and Mem0 (rolling summaries, compression ratios). + +## 1. System Overview + +The adaptive summarizer provides **content-aware compression** that scales summarization depth with input complexity. Rather than applying a one-size-fits-all approach, it automatically selects the optimal strategy based on token count. + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ Adaptive Summarization Pipeline │ +├─────────────────────────────────────────────────────────────────────┤ +│ │ +│ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy │ +│ │ +│ ┌─────────────────────────────────────────────────────────────┐ │ +│ │ Level Thresholds: │ │ +│ │ < 100 tokens ──▶ NONE (no summary needed) │ │ +│ │ 100-500 ──▶ BRIEF (single sentence) │ │ +│ │ 500-3000 ──▶ STANDARD (paragraph) │ │ +│ │ 3000-15000 ──▶ DETAILED (chunked + meta) │ │ +│ │ > 15000 ──▶ HIERARCHICAL (L1/L2/L3 tree) │ │ +│ └─────────────────────────────────────────────────────────────┘ │ +│ │ +│ Output: SummaryResult with compression metrics │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +**Design Goals:** + +- **Adaptive compression:** Match summarization depth to content complexity. +- **Research-grounded:** Based on proven approaches from Letta and Mem0. +- **Hierarchical structure:** Preserve detail at multiple granularities. 
+- **Content-type awareness:** Domain-specific prompts for conversations, journals, documents. + +--- + +## 2. Architectural Decisions + +### 2.1 Token-Based Level Selection + +**Decision:** Select summarization strategy based on input token count with fixed thresholds. + +**Rationale:** + +- **Predictable behavior:** Users can anticipate output length based on input size. +- **Optimal compression:** Each level targets a specific compression ratio validated by research. +- **Efficiency:** Avoid over-processing short content or under-processing long content. + +**Implementation:** + +```python +THRESHOLD_NONE = 100 # Below this: no summary needed +THRESHOLD_BRIEF = 500 # 100-500: single sentence (~20% compression) +THRESHOLD_STANDARD = 3000 # 500-3000: paragraph (~12% compression) +THRESHOLD_DETAILED = 15000 # 3000-15000: chunked (~7% compression) +# Above 15000: hierarchical tree structure +``` + +**Trade-off:** Fixed thresholds may not be optimal for all content types, but provide consistent, predictable behavior. + +### 2.2 Hierarchical Summary Structure (L1/L2/L3) + +**Decision:** For long content, build a tree of summaries at three levels of granularity. + +**Rationale:** + +- **Partial eviction:** Inspired by Letta's memory architecture—keep detailed summaries for recent content, compressed summaries for older content. +- **Flexible retrieval:** Different use cases need different detail levels. +- **Progressive compression:** Each level provides ~5x compression over the previous. + +**Implementation:** + +- **L1 (Chunk Summaries):** Individual summaries of ~3000 token chunks with 200 token overlap. +- **L2 (Group Summaries):** Summaries of groups of ~5 L1 summaries. +- **L3 (Final Summary):** Single synthesized summary of all L2 summaries. 
+ +**Storage:** +```text +summaries/ + L1/ + chunk_0.md # Summary of tokens 0-3000 + chunk_1.md # Summary of tokens 2800-5800 (overlap) + L2/ + group_0.md # Synthesis of chunk_0 through chunk_4 + L3/ + final.md # Final narrative summary +``` + +### 2.3 Content-Type Aware Prompts + +**Decision:** Use different prompt templates for different content domains. + +**Rationale:** + +- **Conversations:** Focus on user preferences, decisions, action items. +- **Journals:** Emphasize personal insights, emotional context, growth patterns. +- **Documents:** Prioritize key findings, methodology, conclusions. + +**Implementation:** + +```python +def get_prompt_for_content_type(content_type: str) -> str: + match content_type: + case "conversation": return CONVERSATION_PROMPT + case "journal": return JOURNAL_PROMPT + case "document": return DOCUMENT_PROMPT + case _: return STANDARD_PROMPT +``` + +### 2.4 Prior Summary Integration + +**Decision:** Always provide the previous summary as context when updating. + +**Rationale:** + +- **Continuity:** New summaries should build on existing context, not replace it. +- **Incremental updates:** Avoid re-summarizing all content on every update. +- **Context preservation:** Important information from earlier content persists. + +**Implementation:** + +- The `prior_summary` parameter is passed through the entire pipeline. +- `ROLLING_PROMPT` specifically handles integrating new facts with existing summaries. +- For hierarchical summaries, only the L3 summary is used as prior context. + +### 2.5 Compression Ratio Tracking + +**Decision:** Track and report compression metrics for every summary. + +**Rationale:** + +- **Transparency:** Users can understand how much information was compressed. +- **Quality monitoring:** Unusual ratios may indicate summarization issues. +- **Optimization:** Metrics inform future threshold tuning. 
+ +**Implementation:** + +```python +@dataclass +class SummaryResult: + level: SummaryLevel + summary: str | None + hierarchical: HierarchicalSummary | None + input_tokens: int + output_tokens: int + compression_ratio: float # output/input (lower = more compression) +``` + +--- + +## 3. Data Model + +### 3.1 Summary Levels + +| Level | Token Range | Target Compression | Strategy | +| :--- | :--- | :--- | :--- | +| `NONE` | < 100 | N/A | No summarization | +| `BRIEF` | 100-500 | ~20% | Single sentence | +| `STANDARD` | 500-3000 | ~12% | Paragraph | +| `DETAILED` | 3000-15000 | ~7% | Chunked + meta | +| `HIERARCHICAL` | > 15000 | ~3-5% | L1/L2/L3 tree | + +### 3.2 Hierarchical Summary Structure + +```python +class ChunkSummary(BaseModel): + chunk_index: int # Position in original content + content: str # The summarized text + token_count: int # Tokens in this summary + source_tokens: int # Tokens in source chunk + parent_group: int | None # L2 group this belongs to + +class HierarchicalSummary(BaseModel): + l1_summaries: list[ChunkSummary] # Individual chunk summaries + l2_summaries: list[str] # Group summaries + l3_summary: str # Final synthesis + chunk_size: int = 3000 # Tokens per chunk + chunk_overlap: int = 200 # Overlap between chunks +``` + +### 3.3 Storage Metadata (ChromaDB) + +Summaries are stored with rich metadata for retrieval and management: + +| Field | L1 | L2 | L3 | Description | +| :--- | :---: | :---: | :---: | :--- | +| `id` | ✓ | ✓ | ✓ | `{conversation_id}:summary:L{n}:{index}` | +| `conversation_id` | ✓ | ✓ | ✓ | Scope key | +| `role` | ✓ | ✓ | ✓ | Always `"summary"` | +| `level` | ✓ | ✓ | ✓ | 1, 2, or 3 | +| `chunk_index` | ✓ | | | Position in L1 sequence | +| `group_index` | | ✓ | | Position in L2 sequence | +| `parent_group` | ✓ | | | Which L2 group owns this L1 | +| `is_final` | | | ✓ | Marks the top-level summary | +| `summary_level` | | | ✓ | Name of SummaryLevel enum | +| `input_tokens` | | | ✓ | Original content token count | +| 
`output_tokens` | | | ✓ | Total summary token count | +| `compression_ratio` | | | ✓ | Output/input ratio | +| `created_at` | ✓ | ✓ | ✓ | ISO 8601 timestamp | + +### 3.4 File Format + +Summary files use Markdown with YAML front matter: + +```markdown +--- +id: "journal:summary:L3:final" +conversation_id: "journal" +role: "summary" +level: 3 +is_final: true +summary_level: "STANDARD" +input_tokens: 1500 +output_tokens: 180 +compression_ratio: 0.12 +created_at: "2025-01-15T10:30:00Z" +--- + +The user has been exploring adaptive summarization techniques... +``` + +--- + +## 4. Processing Pipeline + +### 4.1 Main Entry Point + +```python +async def summarize( + content: str, + config: SummarizerConfig, + prior_summary: str | None = None, + content_type: str = "general", +) -> SummaryResult +``` + +### 4.2 Level Selection Flow + +``` +Input Content + │ + ▼ +┌─────────────┐ +│ Count Tokens│ (tiktoken, cl100k_base) +└──────┬──────┘ + │ + ▼ +┌─────────────────────────────────────────┐ +│ determine_level(token_count) -> Level │ +│ │ +│ < 100 ──▶ NONE │ +│ < 500 ──▶ BRIEF │ +│ < 3000 ──▶ STANDARD │ +│ < 15000 ──▶ DETAILED │ +│ else ──▶ HIERARCHICAL │ +└──────┬──────────────────────────────────┘ + │ + ▼ + Execute level-specific strategy +``` + +### 4.3 Strategy Execution by Level + +#### NONE Level +- **Action:** Return immediately with no summary. +- **Output:** `SummaryResult(level=NONE, summary=None, compression_ratio=1.0)` + +#### BRIEF Level +- **Prompt:** `BRIEF_PROMPT` - distill to single sentence. +- **LLM Call:** Single generation with low max_tokens. +- **Output:** One-sentence summary. + +#### STANDARD Level +- **Prompt:** `STANDARD_PROMPT` with optional prior summary context. +- **LLM Call:** Single generation. +- **Output:** Paragraph-length summary. + +#### DETAILED Level +1. **Chunk:** Split content into ~3000 token chunks with 200 token overlap. +2. **Parallel L1:** Generate summary for each chunk using `CHUNK_PROMPT`. +3. 
**Meta-synthesis:** Combine L1 summaries using `META_PROMPT`. +4. **Output:** `HierarchicalSummary` with L1s and L3 (no L2 needed for this size). + +#### HIERARCHICAL Level +1. **Chunk:** Split into ~3000 token chunks with overlap. +2. **Parallel L1:** Generate chunk summaries. +3. **Group:** Organize L1s into groups of ~5. +4. **Parallel L2:** Summarize each group. +5. **L3 Synthesis:** Final meta-summary of all L2s. +6. **Output:** Full `HierarchicalSummary` tree. + +### 4.4 Chunking Algorithm + +```python +def chunk_text( + text: str, + chunk_size: int = 3000, + overlap: int = 200, +) -> list[str]: + """Split text into overlapping chunks on paragraph boundaries.""" +``` + +**Strategy:** + +1. **Paragraph-first:** Try to split on double newlines. +2. **Sentence fallback:** If a paragraph exceeds `chunk_size`, split on sentence boundaries. +3. **Character fallback:** For very long sentences (e.g., code), use character splitting. +4. **Overlap handling:** Each chunk after the first starts with the last `overlap` tokens of the previous chunk. + +### 4.5 Middle Truncation (Utility) + +For contexts where the summary exceeds available space: + +```python +def middle_truncate( + text: str, + token_budget: int, + head_fraction: float = 0.3, + tail_fraction: float = 0.7, +) -> str: + """Keep head and tail, remove middle (least likely to contain key info).""" +``` + +**Rationale:** Research shows that important information clusters at beginnings (introductions, key points) and endings (conclusions, action items). + +--- + +## 5. Prompt Specifications + +### 5.1 Brief Summary (`BRIEF_PROMPT`) + +``` +Distill the following content into a single, comprehensive sentence +that captures the essential meaning: + +{content} + +Summary (one sentence): +``` + +### 5.2 Standard Summary (`STANDARD_PROMPT`) + +``` +Summarize the following content in a concise paragraph. +{prior_context} +Focus on key information, decisions, and actionable insights. 
+ +Content: +{content} + +Summary: +``` + +### 5.3 Chunk Summary (`CHUNK_PROMPT`) + +``` +Summarize this section of a larger document. +Preserve specific details, names, and numbers that may be important. + +Section {chunk_index} of {total_chunks}: +{content} + +Section summary: +``` + +### 5.4 Meta Summary (`META_PROMPT`) + +``` +Synthesize these section summaries into a coherent narrative. +Maintain logical flow and preserve the most important information. + +Section Summaries: +{summaries} + +Synthesized Summary: +``` + +### 5.5 Rolling Summary (`ROLLING_PROMPT`) + +``` +Update the existing summary to incorporate new information. +Preserve important historical context while integrating new facts. + +Existing Summary: +{prior_summary} + +New Information: +{new_facts} + +Updated Summary: +``` + +### 5.6 Content-Type Prompts + +**Conversation:** +``` +Summarize this conversation focusing on: +- User preferences and decisions +- Action items and commitments +- Key topics discussed +``` + +**Journal:** +``` +Summarize this journal entry focusing on: +- Personal insights and reflections +- Emotional context and growth +- Goals and intentions +``` + +**Document:** +``` +Summarize this document focusing on: +- Key findings and conclusions +- Methodology and approach +- Recommendations and implications +``` + +--- + +## 6. 
Integration with Memory System + +### 6.1 Entry Point + +The memory system calls the summarizer via `_ingest.summarize_content()`: + +```python +async def summarize_content( + content: str, + prior_summary: str | None = None, + content_type: str = "general", + *, openai_base_url: str, + api_key: str | None, + model: str, +) -> SummaryResult +``` + +### 6.2 Storage Flow + +``` +summarize_content() + │ + ▼ +SummaryResult + │ + ▼ +store_adaptive_summary() + │ + ├──▶ persist_hierarchical_summary() + │ │ + │ ├──▶ Delete old summaries (L1, L2, L3) + │ ├──▶ Write new summary files + │ └──▶ Upsert to ChromaDB + │ + └──▶ Return stored IDs +``` + +### 6.3 Retrieval Integration + +The memory retrieval system uses `get_final_summary()` to fetch the L3 summary: + +```python +def get_final_summary( + collection: Collection, + conversation_id: str, +) -> StoredMemory | None: + """Retrieve the L3 final summary for injection into prompts.""" +``` + +--- + +## 7. Configuration Reference + +| Parameter | Default | Description | +| :--- | :--- | :--- | +| `openai_base_url` | *required* | Base URL for LLM API | +| `model` | *required* | Model ID for summarization | +| `api_key` | `None` | API key (optional for local models) | +| `chunk_size` | `3000` | Tokens per chunk for hierarchical | +| `chunk_overlap` | `200` | Token overlap between chunks | + +### 7.1 Level Thresholds (Constants) + +| Constant | Value | Description | +| :--- | :--- | :--- | +| `THRESHOLD_NONE` | 100 | Below: no summary | +| `THRESHOLD_BRIEF` | 500 | Below: single sentence | +| `THRESHOLD_STANDARD` | 3000 | Below: paragraph | +| `THRESHOLD_DETAILED` | 15000 | Below: chunked | + +--- + +## 8. 
Error Handling + +### 8.1 Graceful Degradation + +| Error | Fallback | +| :--- | :--- | +| LLM timeout | Return input unchanged with NONE level | +| LLM error | Retry up to 3 times, then return NONE | +| Token counting failure | Estimate based on character count (÷4) | +| Chunking failure | Fall back to character-based splitting | + +### 8.2 Validation + +- **Empty content:** Returns NONE level immediately. +- **Whitespace-only:** Returns NONE level. +- **Invalid compression ratio:** Clamped to [0.0, 1.0]. + +--- + +## 9. Performance Considerations + +### 9.1 Token Counting + +- Uses `tiktoken` with `cl100k_base` encoding (GPT-4 tokenizer). +- Caches tokenizer instance for efficiency. +- Falls back to character-based estimation if tiktoken unavailable. + +### 9.2 Parallel Processing + +For DETAILED and HIERARCHICAL levels: +- L1 chunk summaries can be generated in parallel. +- L2 group summaries can be generated in parallel. +- Only L3 synthesis requires sequential processing. + +### 9.3 Caching + +- Token counts are computed once per content string. +- Prompt templates are loaded once at module import. +- ChromaDB connection is reused across operations. + +--- + +## 10. Comparison with Alternative Approaches + +| Aspect | Adaptive Summarizer | Rolling Summary | Fixed Chunking | +| :--- | :--- | :--- | :--- | +| **Compression** | 3-20% (varies by level) | ~15% fixed | ~10% fixed | +| **Detail preservation** | Hierarchical (L1/L2/L3) | Single level | Single level | +| **Context awareness** | Content-type prompts | Generic | Generic | +| **Efficiency** | Skip short content | Always summarize | Always chunk | +| **Research basis** | Letta + Mem0 | Mem0 only | None | + +--- + +## 11. Future Enhancements + +- **Semantic chunking:** Split on topic boundaries rather than token counts. +- **Incremental L1 updates:** Only re-summarize changed chunks. +- **Quality scoring:** Evaluate summary quality and trigger re-summarization. 
+- **User feedback loop:** Learn preferred compression ratios per user. From 0e9382270c007c4811036c421d4fe733a387b41b Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 21:31:37 -0800 Subject: [PATCH 12/37] Add example script --- agent_cli/summarizer/_utils.py | 4 +- examples/summarizer_demo.py | 483 +++++++++++++++++++++++++++++++++ 2 files changed, 486 insertions(+), 1 deletion(-) create mode 100644 examples/summarizer_demo.py diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py index bc319f5b5..030b5729e 100644 --- a/agent_cli/summarizer/_utils.py +++ b/agent_cli/summarizer/_utils.py @@ -40,7 +40,9 @@ def count_tokens(text: str, model: str = "gpt-4") -> int: if not text: return 0 enc = _get_encoding(model) - return len(enc.encode(text)) + # Disable special token checking - LLM outputs may contain special tokens + # like <|constrain|>, <|endoftext|>, etc. that we want to count normally + return len(enc.encode(text, disallowed_special=())) def chunk_text( diff --git a/examples/summarizer_demo.py b/examples/summarizer_demo.py new file mode 100644 index 000000000..6a542dbdc --- /dev/null +++ b/examples/summarizer_demo.py @@ -0,0 +1,483 @@ +"""Demonstrate the summarizer on texts of varying lengths from the internet. + +This script fetches content of different sizes and shows how the adaptive +summarizer automatically selects the appropriate strategy (BRIEF, STANDARD, +DETAILED, or HIERARCHICAL) based on content length. 
+ +Usage: + python examples/summarizer_demo.py + + # Test specific levels only + python examples/summarizer_demo.py --level brief + python examples/summarizer_demo.py --level standard + python examples/summarizer_demo.py --level detailed + python examples/summarizer_demo.py --level hierarchical + + # Use a different model + python examples/summarizer_demo.py --model "gpt-4o-mini" +""" # noqa: INP001 + +from __future__ import annotations + +import argparse +import asyncio +import json +import os +import re +import textwrap +import traceback +from dataclasses import dataclass + +import httpx + +from agent_cli.summarizer import ( + SummarizerConfig, + SummaryLevel, + SummaryResult, + summarize, +) + +# Defaults for local AI setup (same as aijournal_poc.py) +DEFAULT_BASE_URL = "http://192.168.1.143:9292/v1" +DEFAULT_MODEL = "gpt-oss-high:20b" + + +@dataclass +class TextSample: + """A sample text for testing the summarizer.""" + + name: str + description: str + url: str + expected_level: SummaryLevel + content_type: str = "general" + # If URL fetch fails, use this fallback + fallback_content: str | None = None + + +# Thresholds from adaptive.py: +# NONE: < 100 tokens +# BRIEF: 100-500 tokens +# STANDARD: 500-3000 tokens +# DETAILED: 3000-15000 tokens +# HIERARCHICAL: > 15000 tokens + +# Sample texts of varying lengths to demonstrate different summarization levels +SAMPLES: list[TextSample] = [ + TextSample( + name="Brief - Short News Article", + description="~150-400 tokens - triggers BRIEF level (100-500 token range)", + url="https://httpbin.org/json", # Returns small JSON we'll convert to text + expected_level=SummaryLevel.BRIEF, + fallback_content=""" + Breaking News: Scientists at the Marine Biology Institute have made a + groundbreaking discovery in the Mariana Trench. A new species of deep-sea + fish, dubbed "Pseudoliparis swirei," has been found surviving at depths + exceeding 8,000 meters, making it one of the deepest-living fish ever + documented. 
+ + The research team, led by Dr. Sarah Chen from the University of Washington, + used advanced unmanned submersibles equipped with high-resolution cameras + and collection apparatus. The expedition lasted three months and covered + multiple dive sites across the western Pacific. + + "This discovery fundamentally changes our understanding of life in extreme + environments," Dr. Chen stated in a press conference. "The adaptations + these fish have developed to survive crushing pressures and near-freezing + temperatures are remarkable." + + The fish displays several unique characteristics including translucent skin, + specialized proteins that prevent cellular damage under pressure, and an + unusual metabolism that allows survival with minimal oxygen. Scientists + believe studying these adaptations could lead to breakthroughs in medicine + and materials science. + + The finding has been published in the journal Nature and has already + generated significant interest from the scientific community worldwide. + Further expeditions are planned to study the species in its natural habitat. + """, + ), + TextSample( + name="Standard - Technology Article", + description="~800-2000 tokens - triggers STANDARD level (500-3000 token range)", + url="https://en.wikipedia.org/api/rest_v1/page/summary/Artificial_intelligence", + expected_level=SummaryLevel.STANDARD, + content_type="document", + fallback_content=""" + Artificial intelligence (AI) is the intelligence of machines or software, + as opposed to the intelligence of humans or other animals. It is a field + of computer science that develops and studies intelligent machines. The + field encompasses a wide range of approaches and technologies. + + AI research has been defined as the field of study of intelligent agents, + which refers to any system that perceives its environment and takes actions + that maximize its chances of achieving its goals. 
This definition emphasizes + the practical aspects of building systems that can operate effectively. + + The term "artificial intelligence" has been used to describe machines that + mimic cognitive functions that humans associate with the human mind, such + as learning and problem solving. As machines become increasingly capable, + tasks considered to require "intelligence" are often removed from the + definition of AI, a phenomenon known as the AI effect. + + History of Artificial Intelligence + + The field of AI research was founded at a workshop held on the campus of + Dartmouth College during the summer of 1956. The attendees became the + founders and leaders of AI research. They and their students produced + programs that the press described as astonishing. + + Early AI research in the 1950s explored topics like problem solving and + symbolic methods. In the 1960s, the US Department of Defense took interest + and began training computers to mimic basic human reasoning. DARPA completed + street mapping projects in the 1970s and produced intelligent personal + assistants in 2003, long before Siri, Alexa or Cortana. + + Modern AI Approaches + + Modern AI techniques have become pervasive and include machine learning, + deep learning, natural language processing, computer vision, robotics, + and autonomous systems. These technologies power everything from search + engines to self-driving cars. + + Machine learning is a subset of AI that enables systems to learn and improve + from experience without being explicitly programmed. Deep learning uses + neural networks with many layers to analyze various factors of data. + + Neural networks are computing systems inspired by biological neural networks. + They consist of interconnected nodes that process information using + connectionist approaches to computation. Modern neural networks can have + millions or billions of parameters. 
+ + Applications of AI + + AI applications are transforming industries including healthcare, finance, + transportation, and entertainment. In healthcare, AI helps diagnose diseases + and develop new treatments. In finance, AI powers fraud detection and + algorithmic trading. + + Autonomous vehicles use AI to perceive their environment and make driving + decisions. Virtual assistants use natural language processing to understand + and respond to user queries. Recommendation systems use AI to suggest + content based on user preferences. + + Ethical Considerations + + The field was founded on the assumption that human intelligence can be + so precisely described that a machine can be made to simulate it. This + raised philosophical arguments about the mind and the ethical consequences + of creating artificial beings endowed with human-like intelligence. + + Major concerns include job displacement, algorithmic bias, privacy violations, + and the potential for misuse. Researchers and policymakers are working to + develop frameworks for responsible AI development and deployment. + + The future of AI holds both tremendous promise and significant challenges. + As these systems become more capable, society must grapple with questions + about control, accountability, and the nature of intelligence itself. 
+ """, + ), + TextSample( + name="Detailed - Full Article", + description="~4000-10000 tokens - triggers DETAILED level (3000-15000 token range)", + url="https://en.wikipedia.org/api/rest_v1/page/mobile-html/Machine_learning", + expected_level=SummaryLevel.DETAILED, + content_type="document", + fallback_content=None, # We'll generate synthetic content + ), + TextSample( + name="Hierarchical - Long Document", + description="~16000+ tokens - triggers HIERARCHICAL level (>15000 tokens)", + url="https://www.gutenberg.org/cache/epub/84/pg84.txt", # Frankenstein (truncated) + expected_level=SummaryLevel.HIERARCHICAL, + content_type="document", + fallback_content=None, # We'll generate synthetic content (~16K tokens) + ), +] + + +def generate_synthetic_content(target_tokens: int, topic: str = "technology") -> str: + """Generate synthetic content for testing when URLs fail.""" + # Each paragraph is roughly 50-100 tokens + paragraphs = [ + f"Section on {topic} - Part {{i}}: This section explores various aspects " + f"of {topic} and its implications for modern society. The development of " + f"new technologies continues to reshape how we live and work. Researchers " + f"have made significant progress in understanding the fundamentals.", + f"The history of {topic} spans many decades of innovation. Early pioneers " + f"laid the groundwork for current advancements. Their contributions remain " + f"relevant today as we build upon established foundations.", + f"Current applications of {topic} include healthcare, transportation, and " + f"communication. These sectors have seen dramatic improvements in efficiency " + f"and capability. Future developments promise even greater transformations.", + f"Challenges in {topic} include ethical considerations, resource constraints, " + f"and technical limitations. Addressing these requires collaboration across " + f"disciplines. 
Solutions often emerge from unexpected directions.", + f"The future of {topic} looks promising with continued investment and research. " + f"Emerging trends suggest new possibilities. Stakeholders must prepare for " + f"rapid change while maintaining focus on beneficial outcomes.", + ] + + result = [] + tokens_per_para = 75 # approximate + needed_paragraphs = target_tokens // tokens_per_para + 1 + + for i in range(needed_paragraphs): + para = paragraphs[i % len(paragraphs)].format(i=i + 1) + result.append(para) + + return "\n\n".join(result) + + +async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str: # noqa: PLR0912 + """Fetch content from URL or use fallback.""" + try: + # Add User-Agent header to avoid 403 errors from some sites + headers = { + "User-Agent": "Mozilla/5.0 (compatible; SummarizerDemo/1.0)", + } + response = await client.get( + sample.url, + timeout=30.0, + follow_redirects=True, + headers=headers, + ) + response.raise_for_status() + + content = response.text + + # Handle Wikipedia API JSON responses + if "wikipedia.org/api" in sample.url: + try: + data = json.loads(content) + if "extract" in data: + content = data["extract"] + elif "text" in data: + content = data["text"] + except json.JSONDecodeError: + pass + + # For httpbin JSON, create a readable summary + if "httpbin.org/json" in sample.url: + content = sample.fallback_content or "" + + # Strip HTML tags if present + if "<" in content and ">" in content: + content = re.sub(r"<[^>]+>", " ", content) + content = re.sub(r"\s+", " ", content).strip() + + # Check if content is too short for expected level + min_words_for_level = { + SummaryLevel.BRIEF: 80, # Need ~100 tokens + SummaryLevel.STANDARD: 400, # Need ~500 tokens + SummaryLevel.DETAILED: 2500, # Need ~3000 tokens + SummaryLevel.HIERARCHICAL: 12000, # Need ~15000 tokens + } + min_words = min_words_for_level.get(sample.expected_level, 50) + + if len(content.split()) < min_words: + print(f" 📎 Fetched content too short 
({len(content.split())} words), using fallback") + if sample.fallback_content: + content = sample.fallback_content + else: + target_tokens = { + SummaryLevel.BRIEF: 300, + SummaryLevel.STANDARD: 1500, + SummaryLevel.DETAILED: 8000, + SummaryLevel.HIERARCHICAL: 16000, # Keep manageable for demo + } + content = generate_synthetic_content( + target_tokens.get(sample.expected_level, 1000), + ) + + # For HIERARCHICAL, truncate very long content to keep demo fast + # but ensure we stay above 15000 tokens (~13000 words) + if sample.expected_level == SummaryLevel.HIERARCHICAL: + words = content.split() + # ~16000 tokens ≈ 13500 words (need >15000 tokens for HIERARCHICAL) + if len(words) > 13500: # noqa: PLR2004 + content = " ".join(words[:13500]) + print(" 📎 Truncated to ~13500 words for faster demo") + + return content.strip() + + except Exception as e: + print(f" ⚠️ Failed to fetch URL: {e}") + + if sample.fallback_content: + return sample.fallback_content.strip() + + # Generate synthetic content for the expected level + target_tokens = { + SummaryLevel.BRIEF: 300, + SummaryLevel.STANDARD: 1500, + SummaryLevel.DETAILED: 8000, + SummaryLevel.HIERARCHICAL: 16000, # Keep manageable for demo + } + return generate_synthetic_content(target_tokens.get(sample.expected_level, 1000)) + + +def print_result(sample: TextSample, result: SummaryResult, content: str) -> None: + """Print a formatted summary result.""" + print("\n" + "=" * 70) + print(f"📄 {sample.name}") + print(f" {sample.description}") + print("=" * 70) + + # Input stats + word_count = len(content.split()) + print("\n📊 Input Statistics:") + print(f" Words: {word_count:,}") + print(f" Tokens: {result.input_tokens:,}") + print(f" Content type: {sample.content_type}") + + # Summarization result + level_emoji = { + SummaryLevel.NONE: "⏭️", + SummaryLevel.BRIEF: "📝", + SummaryLevel.STANDARD: "📄", + SummaryLevel.DETAILED: "📚", + SummaryLevel.HIERARCHICAL: "🏗️", + } + print("\n🎯 Summarization Result:") + print(f" Level: 
{level_emoji.get(result.level, '❓')} {result.level.name}") + print(f" Expected: {sample.expected_level.name}") + print(f" Match: {'✅' if result.level == sample.expected_level else '⚠️'}") + print(f" Output tokens: {result.output_tokens:,}") + print(f" Compression: {result.compression_ratio:.1%}") + + # Summary content + if result.summary: + print("\n📝 Summary:") + wrapped = textwrap.fill( + result.summary, + width=68, + initial_indent=" ", + subsequent_indent=" ", + ) + print(wrapped) + + # Hierarchical details if present + if result.hierarchical: + h = result.hierarchical + print("\n🏗️ Hierarchical Structure:") + print(f" L1 chunks: {len(h.l1_summaries)}") + print(f" L2 groups: {len(h.l2_summaries)}") + if h.l2_summaries: + print(f" L2 preview: {h.l2_summaries[0][:100]}...") + print("\n L3 Final Summary:") + wrapped = textwrap.fill( + h.l3_summary, + width=68, + initial_indent=" ", + subsequent_indent=" ", + ) + print(wrapped) + + +async def run_demo( + level_filter: str | None = None, + model: str | None = None, + base_url: str | None = None, +) -> None: + """Run the summarizer demo.""" + # Configuration + actual_base_url = base_url or os.environ.get("OPENAI_BASE_URL", DEFAULT_BASE_URL) + actual_model = model or os.environ.get("OPENAI_MODEL", DEFAULT_MODEL) + api_key = os.environ.get("OPENAI_API_KEY", "not-needed-for-local") + + print("🔧 Configuration:") + print(f" Base URL: {actual_base_url}") + print(f" Model: {actual_model}") + + config = SummarizerConfig( + openai_base_url=actual_base_url, + model=actual_model, + api_key=api_key, + chunk_size=3000, + max_concurrent_chunks=3, + timeout=120.0, # Longer timeout for local models + ) + + # Filter samples if requested + samples = SAMPLES + if level_filter: + level_map = { + "brief": SummaryLevel.BRIEF, + "standard": SummaryLevel.STANDARD, + "detailed": SummaryLevel.DETAILED, + "hierarchical": SummaryLevel.HIERARCHICAL, + } + target_level = level_map.get(level_filter.lower()) + if target_level: + samples = [s for s 
in SAMPLES if s.expected_level == target_level] + print(f"\n🔍 Filtering to {level_filter.upper()} level only") + + async with httpx.AsyncClient() as client: + for sample in samples: + print(f"\n⏳ Processing: {sample.name}...") + + # Fetch content + content = await fetch_content(sample, client) + + try: + # Summarize + result = await summarize( + content=content, + config=config, + content_type=sample.content_type, + ) + + # Display results + print_result(sample, result, content) + + except Exception as e: + print(f"\n❌ Error summarizing {sample.name}: {e}") + + traceback.print_exc() + + print("\n" + "=" * 70) + print("✅ Demo complete!") + print("=" * 70) + + +def main() -> None: + """CLI entry point.""" + parser = argparse.ArgumentParser( + description="Demonstrate adaptive summarization on texts of varying lengths", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(""" + Examples: + python examples/summarizer_demo.py + python examples/summarizer_demo.py --level standard + python examples/summarizer_demo.py --model "llama3.1:8b" --base-url "http://localhost:11434/v1" + """), + ) + + parser.add_argument( + "--level", + choices=["brief", "standard", "detailed", "hierarchical"], + help="Only test a specific summarization level", + ) + parser.add_argument( + "--model", + help=f"Model to use (default: {DEFAULT_MODEL})", + ) + parser.add_argument( + "--base-url", + help=f"OpenAI-compatible API base URL (default: {DEFAULT_BASE_URL})", + ) + + args = parser.parse_args() + + asyncio.run( + run_demo( + level_filter=args.level, + model=args.model, + base_url=args.base_url, + ), + ) + + +if __name__ == "__main__": + main() From 8c3768c3808650e2df452f07d304372f2e21747a Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 21:54:45 -0800 Subject: [PATCH 13/37] refactor(summarizer): YAGNI cleanup and fix prior_context bug Removed unused code: - update_rolling_summary() - never called anywhere - _raw_generate() fallback - errors should 
fail loudly - retry/backoff logic - same reason - parent_group from ChunkSummary - stored but never read - ROLLING_SUMMARY_PROMPT - only used by removed function Kept middle_truncate() - useful for handling very large inputs (e.g., conversations with pasted codebases). Bugfix: - Add {prior_context} to CONVERSATION, JOURNAL, DOCUMENT prompts - Previously prior_summary was silently ignored for non-"general" types - Python's .format() ignores extra kwargs, hiding the bug Updates documentation to reflect fail-fast error handling. --- agent_cli/summarizer/_prompts.py | 20 ++--- agent_cli/summarizer/adaptive.py | 141 ++---------------------------- agent_cli/summarizer/models.py | 5 -- docs/architecture/summarizer.md | 62 ++++++------- tests/summarizer/test_adaptive.py | 141 ++---------------------------- tests/summarizer/test_models.py | 13 --- tests/summarizer/test_prompts.py | 22 ++--- 7 files changed, 51 insertions(+), 353 deletions(-) diff --git a/agent_cli/summarizer/_prompts.py b/agent_cli/summarizer/_prompts.py index 101422b77..f46b39ebf 100644 --- a/agent_cli/summarizer/_prompts.py +++ b/agent_cli/summarizer/_prompts.py @@ -47,20 +47,6 @@ Combined summary (maximum {max_words} words):""".strip() -# Rolling summary update (Mem0-style) -ROLLING_SUMMARY_PROMPT = """Update the running summary with new information. -Integrate new facts seamlessly while keeping the summary concise. -Drop redundant or superseded information. -Preserve durable facts about identity, preferences, and important events. - -Current summary: -{prior_summary} - -New information to integrate: -{new_content} - -Updated summary (maximum {max_words} words):""".strip() - # For conversation-specific summarization CONVERSATION_SUMMARY_PROMPT = """Summarize this conversation from the AI assistant's perspective. 
Focus on: @@ -69,6 +55,8 @@ - Decisions made or conclusions reached - Any commitments or follow-ups mentioned +{prior_context} + Conversation: {content} @@ -82,6 +70,8 @@ - Goals, plans, or intentions stated - People, places, or things that are important +{prior_context} + Entry: {content} @@ -95,6 +85,8 @@ - Important specifications or requirements - Conclusions or recommendations +{prior_context} + Document: {content} diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 590dabc55..989bd86ba 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -13,7 +13,6 @@ import logging from dataclasses import dataclass -import httpx from pydantic import BaseModel from pydantic_ai import Agent from pydantic_ai.models.openai import OpenAIChatModel @@ -24,7 +23,6 @@ BRIEF_SUMMARY_PROMPT, CHUNK_SUMMARY_PROMPT, META_SUMMARY_PROMPT, - ROLLING_SUMMARY_PROMPT, format_prior_context, format_summaries_for_meta, get_prompt_for_content_type, @@ -33,7 +31,6 @@ chunk_text, count_tokens, estimate_summary_tokens, - middle_truncate, tokens_to_words, ) from agent_cli.summarizer.models import ( @@ -59,14 +56,6 @@ # Minimum number of L1 chunks before L2 grouping is applied L2_MIN_CHUNKS = 5 -# Retry settings for summarization failures -MAX_SUMMARIZE_RETRIES = 3 - -# Maximum characters per chunk before applying middle truncation -# This prevents context overflow errors for very large chunks -# (roughly 12K tokens with cl100k_base encoding) -MAX_CHUNK_CHARS = 48000 - class SummaryOutput(BaseModel): """Structured output for summary generation.""" @@ -199,93 +188,32 @@ async def summarize( ) -async def update_rolling_summary( - prior_summary: str | None, - new_facts: list[str], - config: SummarizerConfig, -) -> str: - """Update a rolling summary with new facts (Mem0-style). - - This is optimized for incremental updates where you have discrete - new facts to integrate into an existing summary. 
- - Args: - prior_summary: The existing summary to update. - new_facts: List of new facts to integrate. - config: Summarizer configuration. - - Returns: - Updated summary string. - - """ - if not new_facts: - return prior_summary or "" - - new_content = "\n".join(f"- {fact}" for fact in new_facts) - combined_tokens = count_tokens( - (prior_summary or "") + new_content, - config.model, - ) - - target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD) - max_words = tokens_to_words(target_tokens) - - prompt = ROLLING_SUMMARY_PROMPT.format( - prior_summary=prior_summary or "(No prior summary)", - new_content=new_content, - max_words=max_words, - ) - - return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) - - async def _summarize_single_chunk( chunk: str, chunk_index: int, total_chunks: int, config: SummarizerConfig, - *, - parent_group: int | None = None, ) -> ChunkSummary: """Summarize a single chunk of content. - Uses middle truncation as a fallback for oversized content (Letta-style). - Args: chunk: The text chunk to summarize. chunk_index: Index of this chunk (0-based). total_chunks: Total number of chunks being processed. config: Summarizer configuration. - parent_group: Optional L2 group index for hierarchical summaries. Returns: ChunkSummary with the summarized content. 
""" - # Apply middle truncation if chunk is too large (Letta-style fallback) source_tokens = count_tokens(chunk, config.model) - content_to_summarize = chunk - if len(chunk) > MAX_CHUNK_CHARS: - content_to_summarize, dropped = middle_truncate( - chunk, - MAX_CHUNK_CHARS, - head_frac=0.3, - tail_frac=0.3, - ) - logger.warning( - "Chunk %d truncated: dropped %d chars to fit context window", - chunk_index, - dropped, - ) - - chunk_tokens = count_tokens(content_to_summarize, config.model) - target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD) + target_tokens = estimate_summary_tokens(source_tokens, SummaryLevel.STANDARD) max_words = tokens_to_words(target_tokens) prompt = CHUNK_SUMMARY_PROMPT.format( chunk_index=chunk_index + 1, total_chunks=total_chunks, - content=content_to_summarize, + content=chunk, max_words=max_words, ) @@ -296,8 +224,7 @@ async def _summarize_single_chunk( chunk_index=chunk_index, content=summary, token_count=summary_tokens, - source_tokens=source_tokens, # Report original token count - parent_group=parent_group, + source_tokens=source_tokens, ) @@ -355,7 +282,6 @@ async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: idx, len(chunks), config, - parent_group=None, ) chunk_summaries = await asyncio.gather( @@ -423,14 +349,11 @@ async def _hierarchical_summary( async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: async with semaphore: - # Assign to L2 group (L2_GROUP_SIZE chunks per group) - group_idx = idx // L2_GROUP_SIZE return await _summarize_single_chunk( chunk, idx, len(chunks), config, - parent_group=group_idx, ) l1_summaries = await asyncio.gather( @@ -497,25 +420,19 @@ async def _generate_summary( prompt: str, config: SummarizerConfig, max_tokens: int = 256, - *, - attempt: int = 0, ) -> str: """Generate a summary using the LLM. - Uses PydanticAI for structured output with fallback to raw generation. - Implements exponential backoff retry on failures. 
- Args: prompt: The prompt to send to the LLM. config: Summarizer configuration. max_tokens: Maximum tokens for the response. - attempt: Current retry attempt (for internal recursion). Returns: The generated summary text. Raises: - SummarizationError: If all retries are exhausted. + SummarizationError: If summarization fails. """ provider = OpenAIProvider(api_key=config.api_key, base_url=config.openai_base_url) @@ -539,51 +456,5 @@ async def _generate_summary( result = await agent.run(prompt) return result.output.summary.strip() except Exception as e: - logger.warning("Structured summary failed, trying raw generation: %s", e) - # Fallback to raw HTTP call - try: - return await _raw_generate(prompt, config, max_tokens) - except Exception as raw_err: - if attempt < MAX_SUMMARIZE_RETRIES: - wait_time = 2**attempt # Exponential backoff: 1, 2, 4 seconds - logger.warning( - "Raw generation failed (attempt %d/%d), retrying in %ds: %s", - attempt + 1, - MAX_SUMMARIZE_RETRIES, - wait_time, - raw_err, - ) - await asyncio.sleep(wait_time) - return await _generate_summary( - prompt, - config, - max_tokens, - attempt=attempt + 1, - ) - msg = f"Summarization failed after {MAX_SUMMARIZE_RETRIES} retries" - raise SummarizationError(msg) from raw_err - - -async def _raw_generate(prompt: str, config: SummarizerConfig, max_tokens: int) -> str: - """Fallback raw HTTP generation without structured output.""" - async with httpx.AsyncClient(timeout=config.timeout) as client: - response = await client.post( - f"{config.openai_base_url}/chat/completions", - headers={"Authorization": f"Bearer {config.api_key}"}, - json={ - "model": config.model, - "messages": [ - {"role": "system", "content": "You are a concise summarizer."}, - {"role": "user", "content": prompt}, - ], - "temperature": 0.3, - "max_tokens": max_tokens, - }, - ) - response.raise_for_status() - data = response.json() - - choices = data.get("choices", []) - if choices: - return choices[0].get("message", {}).get("content", 
"").strip() - return "" + msg = f"Summarization failed: {e}" + raise SummarizationError(msg) from e diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py index 843d1dfe5..4f5c51191 100644 --- a/agent_cli/summarizer/models.py +++ b/agent_cli/summarizer/models.py @@ -44,10 +44,6 @@ class ChunkSummary(BaseModel): content: str = Field(..., description="The summarized content of this chunk") token_count: int = Field(..., ge=0, description="Token count of this summary") source_tokens: int = Field(..., ge=0, description="Token count of the source chunk") - parent_group: int | None = Field( - default=None, - description="Index of the L2 group this chunk belongs to", - ) class HierarchicalSummary(BaseModel): @@ -156,7 +152,6 @@ def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]: "role": "summary", "level": HIERARCHICAL_LEVEL_L1, "chunk_index": cs.chunk_index, - "parent_group": cs.parent_group, "token_count": cs.token_count, "created_at": timestamp, }, diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md index 940ddddb8..59f1dbb5e 100644 --- a/docs/architecture/summarizer.md +++ b/docs/architecture/summarizer.md @@ -8,19 +8,19 @@ The adaptive summarizer provides **content-aware compression** that scales summa ``` ┌─────────────────────────────────────────────────────────────────────┐ -│ Adaptive Summarization Pipeline │ +│ Adaptive Summarization Pipeline │ ├─────────────────────────────────────────────────────────────────────┤ │ │ -│ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy │ +│ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy │ │ │ -│ ┌─────────────────────────────────────────────────────────────┐ │ -│ │ Level Thresholds: │ │ -│ │ < 100 tokens ──▶ NONE (no summary needed) │ │ -│ │ 100-500 ──▶ BRIEF (single sentence) │ │ -│ │ 500-3000 ──▶ STANDARD (paragraph) │ │ -│ │ 3000-15000 ──▶ DETAILED (chunked + meta) │ │ -│ │ > 15000 ──▶ HIERARCHICAL (L1/L2/L3 tree) │ │ -│ 
└─────────────────────────────────────────────────────────────┘ │ +│ ┌─────────────────────────────────────────────────────────────┐ │ +│ │ Level Thresholds: │ │ +│ │ < 100 tokens ──▶ NONE (no summary needed) │ │ +│ │ 100-500 ──▶ BRIEF (single sentence) │ │ +│ │ 500-3000 ──▶ STANDARD (paragraph) │ │ +│ │ 3000-15000 ──▶ DETAILED (chunked + meta) │ │ +│ │ > 15000 ──▶ HIERARCHICAL (L1/L2/L3 tree) │ │ +│ └─────────────────────────────────────────────────────────────┘ │ │ │ │ Output: SummaryResult with compression metrics │ └─────────────────────────────────────────────────────────────────────┘ @@ -310,19 +310,19 @@ def chunk_text( ### 4.5 Middle Truncation (Utility) -For contexts where the summary exceeds available space: +For handling very large inputs that could exceed context windows: ```python def middle_truncate( text: str, - token_budget: int, - head_fraction: float = 0.3, - tail_fraction: float = 0.7, -) -> str: + budget_chars: int, + head_frac: float = 0.3, + tail_frac: float = 0.3, +) -> tuple[str, int]: """Keep head and tail, remove middle (least likely to contain key info).""" ``` -**Rationale:** Research shows that important information clusters at beginnings (introductions, key points) and endings (conclusions, action items). +**Rationale:** Research shows that important information clusters at beginnings (introductions, key points) and endings (conclusions, action items). Useful when summarizing very long conversations that may contain pasted codebases. --- @@ -376,22 +376,9 @@ Section Summaries: Synthesized Summary: ``` -### 5.5 Rolling Summary (`ROLLING_PROMPT`) +### 5.5 Content-Type Prompts -``` -Update the existing summary to incorporate new information. -Preserve important historical context while integrating new facts. - -Existing Summary: -{prior_summary} - -New Information: -{new_facts} - -Updated Summary: -``` - -### 5.6 Content-Type Prompts +All content-type prompts include `{prior_context}` for rolling summary continuity. 
**Conversation:** ``` @@ -493,14 +480,15 @@ def get_final_summary( ## 8. Error Handling -### 8.1 Graceful Degradation +### 8.1 Fail-Fast Philosophy + +Errors are propagated rather than hidden behind fallbacks: -| Error | Fallback | +| Error | Behavior | | :--- | :--- | -| LLM timeout | Return input unchanged with NONE level | -| LLM error | Retry up to 3 times, then return NONE | -| Token counting failure | Estimate based on character count (÷4) | -| Chunking failure | Fall back to character-based splitting | +| LLM timeout | Raises `SummarizationError` | +| LLM error | Raises `SummarizationError` | +| Token counting failure | Falls back to `cl100k_base` encoding | ### 8.2 Validation diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py index f5db1486c..ac04bc126 100644 --- a/tests/summarizer/test_adaptive.py +++ b/tests/summarizer/test_adaptive.py @@ -8,13 +8,12 @@ from agent_cli.summarizer.adaptive import ( LEVEL_THRESHOLDS, + SummarizationError, SummarizerConfig, SummaryOutput, _generate_summary, - _raw_generate, determine_level, summarize, - update_rolling_summary, ) from agent_cli.summarizer.models import SummaryLevel, SummaryResult @@ -257,82 +256,6 @@ async def test_hierarchical_level_calls_hierarchical_summary( assert result.level == SummaryLevel.HIERARCHICAL -class TestUpdateRollingSummary: - """Tests for rolling summary updates.""" - - @pytest.fixture - def config(self) -> SummarizerConfig: - """Create a config instance.""" - return SummarizerConfig( - openai_base_url="http://localhost:8000/v1", - model="gpt-4", - ) - - @pytest.mark.asyncio - async def test_empty_facts_returns_prior(self, config: SummarizerConfig) -> None: - """Test that empty facts list returns prior summary.""" - result = await update_rolling_summary( - prior_summary="Existing summary", - new_facts=[], - config=config, - ) - assert result == "Existing summary" - - @pytest.mark.asyncio - async def test_empty_facts_no_prior_returns_empty( - self, - config: 
SummarizerConfig, - ) -> None: - """Test that empty facts with no prior returns empty string.""" - result = await update_rolling_summary( - prior_summary=None, - new_facts=[], - config=config, - ) - assert result == "" - - @pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._generate_summary") - async def test_new_facts_calls_generate( - self, - mock_generate: AsyncMock, - config: SummarizerConfig, - ) -> None: - """Test that new facts trigger summary generation.""" - mock_generate.return_value = "Updated summary with new facts." - - result = await update_rolling_summary( - prior_summary="Old summary", - new_facts=["User likes coffee", "User lives in Amsterdam"], - config=config, - ) - - mock_generate.assert_called_once() - assert result == "Updated summary with new facts." - - @pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._generate_summary") - async def test_facts_formatted_as_list( - self, - mock_generate: AsyncMock, - config: SummarizerConfig, - ) -> None: - """Test that facts are formatted as bullet list in prompt.""" - mock_generate.return_value = "Summary" - - await update_rolling_summary( - prior_summary="Prior", - new_facts=["Fact one", "Fact two"], - config=config, - ) - - # Check the prompt contains formatted facts - call_args = mock_generate.call_args - prompt = call_args[0][0] - assert "- Fact one" in prompt - assert "- Fact two" in prompt - - class TestGenerateSummary: """Tests for _generate_summary function.""" @@ -365,72 +288,18 @@ async def test_generate_summary_with_pydantic_ai( mock_agent.run.assert_called_once_with("Test prompt") @pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._raw_generate") - async def test_fallback_to_raw_generate_on_error( + async def test_raises_summarization_error_on_failure( self, - mock_raw: AsyncMock, config: SummarizerConfig, ) -> None: - """Test fallback to raw HTTP on PydanticAI error.""" - mock_raw.return_value = "Fallback summary" - + """Test that SummarizationError is raised on 
failure.""" with patch("agent_cli.summarizer.adaptive.Agent") as mock_agent_class: mock_agent = MagicMock() mock_agent.run = AsyncMock(side_effect=Exception("API error")) mock_agent_class.return_value = mock_agent - result = await _generate_summary("Test prompt", config, max_tokens=100) - - mock_raw.assert_called_once_with("Test prompt", config, 100) - assert result == "Fallback summary" - - -class TestRawGenerate: - """Tests for _raw_generate fallback function.""" - - @pytest.fixture - def config(self) -> SummarizerConfig: - """Create a config instance.""" - return SummarizerConfig( - openai_base_url="http://localhost:8000/v1", - model="gpt-4", - ) - - @pytest.mark.asyncio - async def test_raw_generate_success(self, config: SummarizerConfig) -> None: - """Test successful raw HTTP generation.""" - mock_response = MagicMock() - mock_response.json.return_value = { - "choices": [{"message": {"content": "Raw generated summary"}}], - } - - with patch("httpx.AsyncClient") as mock_client_class: - mock_client = MagicMock() - mock_client.post = AsyncMock(return_value=mock_response) - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=None) - mock_client_class.return_value = mock_client - - result = await _raw_generate("Test prompt", config, max_tokens=100) - - assert result == "Raw generated summary" - - @pytest.mark.asyncio - async def test_raw_generate_empty_choices(self, config: SummarizerConfig) -> None: - """Test raw generate with empty choices returns empty string.""" - mock_response = MagicMock() - mock_response.json.return_value = {"choices": []} - - with patch("httpx.AsyncClient") as mock_client_class: - mock_client = MagicMock() - mock_client.post = AsyncMock(return_value=mock_response) - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=None) - mock_client_class.return_value = mock_client - - result = await _raw_generate("Test prompt", config, 
max_tokens=100) - - assert result == "" + with pytest.raises(SummarizationError, match="Summarization failed"): + await _generate_summary("Test prompt", config, max_tokens=100) class TestSummaryOutput: diff --git a/tests/summarizer/test_models.py b/tests/summarizer/test_models.py index e27fa18e0..23509d2e3 100644 --- a/tests/summarizer/test_models.py +++ b/tests/summarizer/test_models.py @@ -43,24 +43,11 @@ def test_basic_creation(self) -> None: content="This is a summary of chunk 1.", token_count=10, source_tokens=100, - parent_group=None, ) assert chunk.chunk_index == 0 assert chunk.content == "This is a summary of chunk 1." assert chunk.token_count == 10 assert chunk.source_tokens == 100 - assert chunk.parent_group is None - - def test_with_parent_group(self) -> None: - """Test creating a chunk summary with parent group.""" - chunk = ChunkSummary( - chunk_index=5, - content="Summary text", - token_count=8, - source_tokens=200, - parent_group=1, - ) - assert chunk.parent_group == 1 def test_validation_negative_tokens(self) -> None: """Test that negative token counts fail validation.""" diff --git a/tests/summarizer/test_prompts.py b/tests/summarizer/test_prompts.py index 05937f71a..660229709 100644 --- a/tests/summarizer/test_prompts.py +++ b/tests/summarizer/test_prompts.py @@ -9,7 +9,6 @@ DOCUMENT_SUMMARY_PROMPT, JOURNAL_SUMMARY_PROMPT, META_SUMMARY_PROMPT, - ROLLING_SUMMARY_PROMPT, STANDARD_SUMMARY_PROMPT, format_prior_context, format_summaries_for_meta, @@ -71,26 +70,23 @@ def test_meta_prompt_has_placeholders(self) -> None: assert "Summary 1" in result assert "200" in result - def test_rolling_prompt_has_placeholders(self) -> None: - """Test ROLLING prompt contains required placeholders.""" - assert "{prior_summary}" in ROLLING_SUMMARY_PROMPT - assert "{new_content}" in ROLLING_SUMMARY_PROMPT - assert "{max_words}" in ROLLING_SUMMARY_PROMPT - - def test_conversation_prompt_has_content(self) -> None: - """Test CONVERSATION prompt contains content 
placeholder.""" + def test_conversation_prompt_has_placeholders(self) -> None: + """Test CONVERSATION prompt contains required placeholders.""" assert "{content}" in CONVERSATION_SUMMARY_PROMPT assert "{max_words}" in CONVERSATION_SUMMARY_PROMPT + assert "{prior_context}" in CONVERSATION_SUMMARY_PROMPT - def test_journal_prompt_has_content(self) -> None: - """Test JOURNAL prompt contains content placeholder.""" + def test_journal_prompt_has_placeholders(self) -> None: + """Test JOURNAL prompt contains required placeholders.""" assert "{content}" in JOURNAL_SUMMARY_PROMPT assert "{max_words}" in JOURNAL_SUMMARY_PROMPT + assert "{prior_context}" in JOURNAL_SUMMARY_PROMPT - def test_document_prompt_has_content(self) -> None: - """Test DOCUMENT prompt contains content placeholder.""" + def test_document_prompt_has_placeholders(self) -> None: + """Test DOCUMENT prompt contains required placeholders.""" assert "{content}" in DOCUMENT_SUMMARY_PROMPT assert "{max_words}" in DOCUMENT_SUMMARY_PROMPT + assert "{prior_context}" in DOCUMENT_SUMMARY_PROMPT class TestGetPromptForContentType: From a171aafcd46d9c45cb24a8d704dd437c74000629 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 22:25:41 -0800 Subject: [PATCH 14/37] feat(cli): add summarize command for adaptive hierarchical summarization Expose the full power of the summarizer through a CLI command that: - Follows existing CLI patterns using shared opts module - Supports all LLM providers (ollama, openai, gemini) - Offers content-type prompts (general, conversation, journal, document) - Provides output formats (text, json, full hierarchical) - Includes chunking options and rolling summary support - Reads from file or stdin --- agent_cli/agents/__init__.py | 2 + agent_cli/agents/summarize.py | 435 ++++++++++++++++++++++++++++++++++ agent_cli/cli.py | 1 + 3 files changed, 438 insertions(+) create mode 100644 agent_cli/agents/summarize.py diff --git a/agent_cli/agents/__init__.py b/agent_cli/agents/__init__.py 
index 1ec88de0a..6a0c7838d 100644 --- a/agent_cli/agents/__init__.py +++ b/agent_cli/agents/__init__.py @@ -7,6 +7,7 @@ memory, rag_proxy, speak, + summarize, transcribe, voice_edit, ) @@ -18,6 +19,7 @@ "memory", "rag_proxy", "speak", + "summarize", "transcribe", "voice_edit", ] diff --git a/agent_cli/agents/summarize.py b/agent_cli/agents/summarize.py new file mode 100644 index 000000000..abc8dfc72 --- /dev/null +++ b/agent_cli/agents/summarize.py @@ -0,0 +1,435 @@ +"""Summarize text files or stdin using adaptive hierarchical summarization.""" + +from __future__ import annotations + +import asyncio +import contextlib +import json +import sys +import time +from enum import Enum +from pathlib import Path # noqa: TC003 +from typing import TYPE_CHECKING + +import typer + +from agent_cli import config, opts +from agent_cli.cli import app +from agent_cli.core.utils import ( + console, + create_status, + print_command_line_args, + print_error_message, + print_input_panel, + print_output_panel, + print_with_style, + setup_logging, +) +from agent_cli.summarizer import SummarizationError, SummarizerConfig, summarize +from agent_cli.summarizer._utils import count_tokens + +if TYPE_CHECKING: + from agent_cli.summarizer import SummaryResult + + +class ContentType(str, Enum): + """Content type for specialized summarization prompts.""" + + general = "general" + conversation = "conversation" + journal = "journal" + document = "document" + + +class OutputFormat(str, Enum): + """Output format for the summarization result.""" + + text = "text" + json = "json" + full = "full" + + +def _read_input(file_path: Path | None) -> str | None: + """Read input from file or stdin.""" + if file_path: + if not file_path.exists(): + print_error_message( + f"File not found: {file_path}", + "Please check the file path and try again.", + ) + return None + return file_path.read_text(encoding="utf-8") + + # Read from stdin + if sys.stdin.isatty(): + print_error_message( + "No input provided", + 
"Provide a file path or pipe content via stdin.", + ) + return None + + return sys.stdin.read() + + +def _display_input_preview( + content: str, + token_count: int, + *, + quiet: bool, + max_preview_chars: int = 500, +) -> None: + """Display a preview of the input content.""" + if quiet: + return + + preview = content[:max_preview_chars] + if len(content) > max_preview_chars: + preview += f"\n... [{len(content) - max_preview_chars} more characters]" + + print_input_panel( + preview, + title=f"Input ({token_count:,} tokens)", + ) + + +def _display_result( + result: SummaryResult, + elapsed: float, + output_format: OutputFormat, + *, + quiet: bool, +) -> None: + """Display the summarization result.""" + if output_format == OutputFormat.json: + print(json.dumps(result.model_dump(mode="json"), indent=2)) + return + + if output_format == OutputFormat.full: + _display_full_result(result, elapsed, quiet=quiet) + return + + # Text output - just the summary + if quiet: + if result.summary: + print(result.summary) + elif result.summary: + print_output_panel( + result.summary, + title=f"Summary (Level: {result.level.name})", + subtitle=f"[dim]{result.output_tokens:,} tokens | {result.compression_ratio:.1%} of original | {elapsed:.2f}s[/dim]", + ) + else: + print_with_style( + f"No summary generated (input too short: {result.input_tokens} tokens)", + style="yellow", + ) + + +def _display_full_result( + result: SummaryResult, + elapsed: float, + *, + quiet: bool, +) -> None: + """Display full hierarchical result with all levels.""" + if quiet: + if result.summary: + print(result.summary) + return + + console.print() + console.print("[bold cyan]Summarization Result[/bold cyan]") + console.print(f" Level: [bold]{result.level.name}[/bold]") + console.print(f" Input tokens: [bold]{result.input_tokens:,}[/bold]") + console.print(f" Output tokens: [bold]{result.output_tokens:,}[/bold]") + console.print(f" Compression: [bold]{result.compression_ratio:.1%}[/bold]") + console.print(f" 
Time: [bold]{elapsed:.2f}s[/bold]") + console.print() + + if result.hierarchical: + if result.hierarchical.l1_summaries: + console.print( + f"[bold yellow]L1 Chunk Summaries " + f"({len(result.hierarchical.l1_summaries)} chunks)[/bold yellow]", + ) + for cs in result.hierarchical.l1_summaries: + console.print( + f"\n[dim]--- Chunk {cs.chunk_index + 1} " + f"({cs.source_tokens:,} → {cs.token_count:,} tokens) ---[/dim]", + ) + console.print(cs.content) + + if result.hierarchical.l2_summaries: + console.print( + f"\n[bold yellow]L2 Group Summaries " + f"({len(result.hierarchical.l2_summaries)} groups)[/bold yellow]", + ) + for idx, l2_summary in enumerate(result.hierarchical.l2_summaries): + console.print(f"\n[dim]--- Group {idx + 1} ---[/dim]") + console.print(l2_summary) + + console.print("\n[bold green]L3 Final Summary[/bold green]") + print_output_panel(result.hierarchical.l3_summary, title="Final Summary") + elif result.summary: + print_output_panel( + result.summary, + title=f"Summary ({result.level.name})", + ) + + +def _get_llm_config( + provider_cfg: config.ProviderSelection, + ollama_cfg: config.Ollama, + openai_llm_cfg: config.OpenAILLM, + gemini_llm_cfg: config.GeminiLLM, +) -> tuple[str, str, str | None]: + """Get openai_base_url, model, and api_key from provider config.""" + if provider_cfg.llm_provider == "ollama": + # Ollama uses OpenAI-compatible API at /v1 + base_url = ollama_cfg.llm_ollama_host.rstrip("/") + if not base_url.endswith("/v1"): + base_url = f"{base_url}/v1" + return base_url, ollama_cfg.llm_ollama_model, None + if provider_cfg.llm_provider == "openai": + base_url = openai_llm_cfg.openai_base_url or "https://api.openai.com/v1" + return base_url, openai_llm_cfg.llm_openai_model, openai_llm_cfg.openai_api_key + # gemini + return ( + "https://generativelanguage.googleapis.com/v1beta/openai", + gemini_llm_cfg.llm_gemini_model, + gemini_llm_cfg.gemini_api_key, + ) + + +async def _async_summarize( + content: str, + *, + content_type: 
ContentType, + prior_summary: str | None, + provider_cfg: config.ProviderSelection, + ollama_cfg: config.Ollama, + openai_llm_cfg: config.OpenAILLM, + gemini_llm_cfg: config.GeminiLLM, + general_cfg: config.General, + chunk_size: int, + chunk_overlap: int, + max_concurrent_chunks: int, + output_format: OutputFormat, +) -> None: + """Asynchronous summarization entry point.""" + setup_logging(general_cfg.log_level, general_cfg.log_file, quiet=general_cfg.quiet) + + openai_base_url, model, api_key = _get_llm_config( + provider_cfg, + ollama_cfg, + openai_llm_cfg, + gemini_llm_cfg, + ) + + token_count = count_tokens(content, model) + _display_input_preview(content, token_count, quiet=general_cfg.quiet) + + summarizer_config = SummarizerConfig( + openai_base_url=openai_base_url, + model=model, + api_key=api_key, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + max_concurrent_chunks=max_concurrent_chunks, + ) + + try: + if not general_cfg.quiet: + status = create_status(f"Summarizing with {model}...", "bold yellow") + else: + status = contextlib.nullcontext() + + with status: + start_time = time.monotonic() + result = await summarize( + content, + summarizer_config, + prior_summary=prior_summary, + content_type=content_type.value, + ) + elapsed = time.monotonic() - start_time + + _display_result(result, elapsed, output_format, quiet=general_cfg.quiet) + + except SummarizationError as e: + print_error_message( + str(e), + f"Check that your LLM server is running at {openai_base_url}", + ) + sys.exit(1) + except Exception as e: + print_error_message(str(e), "An unexpected error occurred during summarization.") + sys.exit(1) + + +@app.command("summarize") +def summarize_command( + *, + file_path: Path | None = typer.Argument( # noqa: B008 + None, + help="Path to file to summarize. 
If not provided, reads from stdin.", + ), + # --- Content Options --- + content_type: ContentType = typer.Option( # noqa: B008 + ContentType.general, + "--type", + "-t", + help="Content type for specialized summarization prompts.", + rich_help_panel="Content Options", + ), + prior_summary: str | None = typer.Option( + None, + "--prior-summary", + help="Prior summary to integrate with (for rolling summaries).", + rich_help_panel="Content Options", + ), + prior_summary_file: Path | None = typer.Option( # noqa: B008 + None, + "--prior-summary-file", + help="File containing prior summary to integrate with.", + rich_help_panel="Content Options", + ), + # --- Chunking Options --- + chunk_size: int = typer.Option( + 3000, + "--chunk-size", + help="Target token count per chunk for hierarchical summarization.", + rich_help_panel="Chunking Options", + ), + chunk_overlap: int = typer.Option( + 200, + "--chunk-overlap", + help="Token overlap between chunks for context continuity.", + rich_help_panel="Chunking Options", + ), + max_concurrent_chunks: int = typer.Option( + 5, + "--max-concurrent", + help="Maximum number of chunks to process in parallel.", + rich_help_panel="Chunking Options", + ), + # --- Output Options --- + output_format: OutputFormat = typer.Option( # noqa: B008 + OutputFormat.text, + "--output", + "-o", + help="Output format: 'text' (summary only), 'json' (full result), 'full' (all levels).", + rich_help_panel="Output Options", + ), + # --- Provider Selection --- + llm_provider: str = opts.LLM_PROVIDER, + # --- LLM Configuration --- + # Ollama (local service) + llm_ollama_model: str = opts.LLM_OLLAMA_MODEL, + llm_ollama_host: str = opts.LLM_OLLAMA_HOST, + # OpenAI + llm_openai_model: str = opts.LLM_OPENAI_MODEL, + openai_api_key: str | None = opts.OPENAI_API_KEY, + openai_base_url: str | None = opts.OPENAI_BASE_URL, + # Gemini + llm_gemini_model: str = opts.LLM_GEMINI_MODEL, + gemini_api_key: str | None = opts.GEMINI_API_KEY, + # --- General Options --- + 
log_level: str = opts.LOG_LEVEL, + log_file: str | None = opts.LOG_FILE, + quiet: bool = opts.QUIET, + config_file: str | None = opts.CONFIG_FILE, + print_args: bool = opts.PRINT_ARGS, +) -> None: + """Summarize text using adaptive hierarchical summarization. + + Reads from a file or stdin and produces a summary scaled to the input complexity: + + - NONE (<100 tokens): No summary needed + - BRIEF (100-500): Single sentence + - STANDARD (500-3000): Paragraph + - DETAILED (3000-15000): Chunked with meta-summary + - HIERARCHICAL (>15000): Full L1/L2/L3 tree + + Examples: + # Summarize a file + agent-cli summarize document.txt + + # Summarize with conversation-specific prompts + agent-cli summarize chat.txt --type conversation + + # Pipe content from stdin + cat book.txt | agent-cli summarize + + # Get full hierarchical output + agent-cli summarize large_document.txt --output full + + # Use OpenAI instead of Ollama + agent-cli summarize notes.md --llm-provider openai + + """ + if print_args: + print_command_line_args(locals()) + + # Create config objects following the standard pattern + provider_cfg = config.ProviderSelection( + llm_provider=llm_provider, + asr_provider="wyoming", # Not used, but required by model + tts_provider="wyoming", # Not used, but required by model + ) + ollama_cfg = config.Ollama( + llm_ollama_model=llm_ollama_model, + llm_ollama_host=llm_ollama_host, + ) + openai_llm_cfg = config.OpenAILLM( + llm_openai_model=llm_openai_model, + openai_api_key=openai_api_key, + openai_base_url=openai_base_url, + ) + gemini_llm_cfg = config.GeminiLLM( + llm_gemini_model=llm_gemini_model, + gemini_api_key=gemini_api_key, + ) + general_cfg = config.General( + log_level=log_level, + log_file=log_file, + quiet=quiet, + clipboard=False, # summarize doesn't use clipboard + ) + + # Read content + content = _read_input(file_path) + if content is None: + raise typer.Exit(1) + + if not content.strip(): + print_error_message("Empty input", "The input file or stdin is 
empty.") + raise typer.Exit(1) + + # Handle prior summary from file + actual_prior_summary = prior_summary + if prior_summary_file: + if not prior_summary_file.exists(): + print_error_message( + f"Prior summary file not found: {prior_summary_file}", + "Please check the file path.", + ) + raise typer.Exit(1) + actual_prior_summary = prior_summary_file.read_text(encoding="utf-8") + + asyncio.run( + _async_summarize( + content, + content_type=content_type, + prior_summary=actual_prior_summary, + provider_cfg=provider_cfg, + ollama_cfg=ollama_cfg, + openai_llm_cfg=openai_llm_cfg, + gemini_llm_cfg=gemini_llm_cfg, + general_cfg=general_cfg, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + max_concurrent_chunks=max_concurrent_chunks, + output_format=output_format, + ), + ) diff --git a/agent_cli/cli.py b/agent_cli/cli.py index 981404159..3542f41fc 100644 --- a/agent_cli/cli.py +++ b/agent_cli/cli.py @@ -121,6 +121,7 @@ def set_config_defaults(ctx: typer.Context, config_file: str | None) -> dict[str memory, rag_proxy, speak, + summarize, transcribe, transcribe_live, voice_edit, From 8dff17ff47c60ad9c629a449fac59c1ab9b0fb22 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 22:35:19 -0800 Subject: [PATCH 15/37] refactor(memory): remove dead parent_group field and bundle metadata args - Remove unused parent_group from MemoryMetadata (was never assigned) - Refactor write_memory_file to accept optional MemoryMetadata object instead of 17 individual parameters - Simplify upsert_hierarchical_summary to use MemoryMetadata(**dict) - Rename summary_level to summary_level_name for consistency - Make tiktoken optional in token counting with fallback heuristic --- agent_cli/memory/_files.py | 88 ++++++++++++++-------------- agent_cli/memory/_persistence.py | 35 ++++++----- agent_cli/memory/_store.py | 17 +----- agent_cli/memory/models.py | 2 - agent_cli/summarizer/_utils.py | 17 +++++- agent_cli/summarizer/models.py | 4 +- docs/architecture/summarizer.md | 2 - 
tests/memory/test_store.py | 4 +- tests/summarizer/test_integration.py | 7 +-- tests/summarizer/test_models.py | 4 +- 10 files changed, 82 insertions(+), 98 deletions(-) diff --git a/agent_cli/memory/_files.py b/agent_cli/memory/_files.py index 65fbbc1b2..0bb0a5d94 100644 --- a/agent_cli/memory/_files.py +++ b/agent_cli/memory/_files.py @@ -87,76 +87,74 @@ def soft_delete_memory_file( def write_memory_file( root: Path, *, - conversation_id: str, - role: str, - created_at: str, content: str, - summary_kind: str | None = None, doc_id: str | None = None, + # Either pass pre-built metadata OR individual fields + metadata: MemoryMetadata | None = None, + # Individual fields (used when metadata is None) + conversation_id: str | None = None, + role: str | None = None, + created_at: str | None = None, + summary_kind: str | None = None, source_id: str | None = None, - # Hierarchical summary fields - level: int | None = None, - is_final: bool | None = None, - chunk_index: int | None = None, - parent_group: int | None = None, - group_index: int | None = None, - input_tokens: int | None = None, - output_tokens: int | None = None, - compression_ratio: float | None = None, - summary_level_name: str | None = None, ) -> MemoryFileRecord: - """Render and persist a memory document to disk.""" + """Render and persist a memory document to disk. + + Can be called in two ways: + 1. With pre-built metadata: write_memory_file(root, content=..., metadata=..., doc_id=...) + 2. With individual fields: write_memory_file(root, content=..., conversation_id=..., role=..., ...) 
+ + """ entries_dir, _ = ensure_store_dirs(root) - safe_conversation = _slugify(conversation_id) doc_id = doc_id or str(uuid4()) - safe_ts = _safe_timestamp(created_at) + + # Build or use provided metadata + if metadata is not None: + meta = metadata + else: + if conversation_id is None or role is None or created_at is None: + msg = "Must provide metadata or (conversation_id, role, created_at)" + raise ValueError(msg) + meta = MemoryMetadata( + conversation_id=conversation_id, + role=role, + created_at=created_at, + summary_kind=summary_kind, + source_id=source_id, + ) + + safe_conversation = _slugify(meta.conversation_id) + safe_ts = _safe_timestamp(meta.created_at) # Route by role/category for readability - if summary_kind and level is not None: + if meta.summary_kind and meta.level is not None: # Hierarchical summary file structure - if level == _SUMMARY_LEVEL_L1: + if meta.level == _SUMMARY_LEVEL_L1: subdir = Path("summaries") / "L1" - filename = f"chunk_{chunk_index or 0}.md" - elif level == _SUMMARY_LEVEL_L2: + filename = f"chunk_{meta.chunk_index or 0}.md" + elif meta.level == _SUMMARY_LEVEL_L2: subdir = Path("summaries") / "L2" - filename = f"group_{group_index or 0}.md" + filename = f"group_{meta.group_index or 0}.md" else: # level == _SUMMARY_LEVEL_L3 subdir = Path("summaries") / "L3" filename = "final.md" - elif summary_kind: + elif meta.summary_kind: subdir = Path("summaries") filename = "summary.md" - elif role == "user": + elif meta.role == "user": subdir = Path("turns") / "user" filename = f"{safe_ts}__{doc_id}.md" - elif role == "assistant": + elif meta.role == "assistant": subdir = Path("turns") / "assistant" filename = f"{safe_ts}__{doc_id}.md" - elif role == "memory": + elif meta.role == "memory": subdir = Path("facts") filename = f"{safe_ts}__{doc_id}.md" else: subdir = Path() filename = f"{doc_id}.md" - metadata = MemoryMetadata( - conversation_id=conversation_id, - role=role, - created_at=created_at, - summary_kind=summary_kind, - 
source_id=source_id, - level=level, - is_final=is_final, - chunk_index=chunk_index, - parent_group=parent_group, - group_index=group_index, - input_tokens=input_tokens, - output_tokens=output_tokens, - compression_ratio=compression_ratio, - summary_level_name=summary_level_name, - ) - - front_matter = _render_front_matter(doc_id, metadata) + front_matter = _render_front_matter(doc_id, meta) body = front_matter + "\n" + content.strip() + "\n" file_path = entries_dir / safe_conversation / subdir / filename @@ -164,7 +162,7 @@ def write_memory_file( atomic_write_text(file_path, body) - return MemoryFileRecord(id=doc_id, path=file_path, metadata=metadata, content=content) + return MemoryFileRecord(id=doc_id, path=file_path, metadata=meta, content=content) def load_memory_files(root: Path) -> list[MemoryFileRecord]: diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py index 91585ade8..2af3a2687 100644 --- a/agent_cli/memory/_persistence.py +++ b/agent_cli/memory/_persistence.py @@ -25,13 +25,13 @@ upsert_memories, ) from agent_cli.memory.entities import Fact, Turn +from agent_cli.memory.models import MemoryMetadata if TYPE_CHECKING: from pathlib import Path from chromadb import Collection - from agent_cli.memory.models import MemoryMetadata from agent_cli.summarizer import SummaryResult LOGGER = logging.getLogger(__name__) @@ -212,26 +212,29 @@ def persist_hierarchical_summary( created_at = datetime.now(UTC).isoformat() for entry in entries: - meta = entry["metadata"] + meta_dict = entry["metadata"] + # Build MemoryMetadata from the summary result's metadata dict + metadata = MemoryMetadata( + conversation_id=meta_dict["conversation_id"], + role=meta_dict["role"], + created_at=meta_dict.get("created_at", created_at), + summary_kind="summary", + level=meta_dict.get("level"), + is_final=meta_dict.get("is_final"), + chunk_index=meta_dict.get("chunk_index"), + group_index=meta_dict.get("group_index"), + input_tokens=meta_dict.get("input_tokens"), 
+ output_tokens=meta_dict.get("output_tokens"), + compression_ratio=meta_dict.get("compression_ratio"), + summary_level_name=meta_dict.get("summary_level_name"), + ) record = write_memory_file( memory_root, - conversation_id=meta["conversation_id"], - role=meta["role"], - created_at=meta.get("created_at", created_at), content=entry["content"], - summary_kind="summary", doc_id=entry["id"], - level=meta.get("level"), - is_final=meta.get("is_final"), - chunk_index=meta.get("chunk_index"), - parent_group=meta.get("parent_group"), - group_index=meta.get("group_index"), - input_tokens=meta.get("input_tokens"), - output_tokens=meta.get("output_tokens"), - compression_ratio=meta.get("compression_ratio"), - summary_level_name=meta.get("summary_level"), + metadata=metadata, ) - LOGGER.info("Persisted summary file: %s (level=%s)", record.path, meta.get("level")) + LOGGER.info("Persisted summary file: %s (level=%s)", record.path, meta_dict.get("level")) stored_ids.append(record.id) # Store in ChromaDB diff --git a/agent_cli/memory/_store.py b/agent_cli/memory/_store.py index 722dcda9e..b668a2d3b 100644 --- a/agent_cli/memory/_store.py +++ b/agent_cli/memory/_store.py @@ -176,22 +176,7 @@ def upsert_hierarchical_summary( contents.append(entry["content"]) # Convert the raw metadata dict to MemoryMetadata meta_dict = entry["metadata"] - metadatas.append( - MemoryMetadata( - conversation_id=meta_dict["conversation_id"], - role=meta_dict["role"], - created_at=meta_dict["created_at"], - level=meta_dict.get("level"), - is_final=meta_dict.get("is_final"), - chunk_index=meta_dict.get("chunk_index"), - parent_group=meta_dict.get("parent_group"), - group_index=meta_dict.get("group_index"), - input_tokens=meta_dict.get("input_tokens"), - output_tokens=meta_dict.get("output_tokens"), - compression_ratio=meta_dict.get("compression_ratio"), - summary_level_name=meta_dict.get("summary_level"), - ), - ) + metadatas.append(MemoryMetadata(**meta_dict)) upsert_memories(collection, ids=ids, 
contents=contents, metadatas=metadatas) return ids diff --git a/agent_cli/memory/models.py b/agent_cli/memory/models.py index 4eb289c7d..06266c575 100644 --- a/agent_cli/memory/models.py +++ b/agent_cli/memory/models.py @@ -56,8 +56,6 @@ class MemoryMetadata(BaseModel): """Whether this is the final L3 summary.""" chunk_index: int | None = None """For L1 summaries: index of the source chunk.""" - parent_group: int | None = None - """For L1 summaries: which L2 group this chunk belongs to.""" group_index: int | None = None """For L2 summaries: index of this group.""" input_tokens: int | None = None diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py index 030b5729e..731c55058 100644 --- a/agent_cli/summarizer/_utils.py +++ b/agent_cli/summarizer/_utils.py @@ -13,12 +13,16 @@ @lru_cache(maxsize=4) -def _get_encoding(model: str = "gpt-4") -> tiktoken.Encoding: +def _get_encoding(model: str = "gpt-4") -> tiktoken.Encoding | None: """Get tiktoken encoding for a model, with caching. Falls back to cl100k_base for unknown models (covers most modern LLMs). + Returns None when tiktoken is not installed so callers can use a heuristic. """ - import tiktoken # noqa: PLC0415 + try: + import tiktoken # noqa: PLC0415 + except ModuleNotFoundError: + return None try: return tiktoken.encoding_for_model(model) @@ -27,7 +31,7 @@ def _get_encoding(model: str = "gpt-4") -> tiktoken.Encoding: def count_tokens(text: str, model: str = "gpt-4") -> int: - """Count tokens in text using tiktoken. + """Count tokens in text using tiktoken, with a lightweight fallback. Args: text: The text to count tokens for. @@ -40,11 +44,18 @@ def count_tokens(text: str, model: str = "gpt-4") -> int: if not text: return 0 enc = _get_encoding(model) + if enc is None: + return _estimate_token_count(text) # Disable special token checking - LLM outputs may contain special tokens # like <|constrain|>, <|endoftext|>, etc. 
that we want to count normally return len(enc.encode(text, disallowed_special=())) +def _estimate_token_count(text: str) -> int: + """Very rough token estimate based on character length (~4 chars/token).""" + return max(1, (len(text) + 3) // 4) + + def chunk_text( text: str, chunk_size: int = 3000, diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py index 4f5c51191..ce6da9082 100644 --- a/agent_cli/summarizer/models.py +++ b/agent_cli/summarizer/models.py @@ -185,7 +185,7 @@ def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]: "role": "summary", "level": HIERARCHICAL_LEVEL_L3, "is_final": True, - "summary_level": self.level.name, + "summary_level_name": self.level.name, "input_tokens": self.input_tokens, "output_tokens": self.output_tokens, "compression_ratio": self.compression_ratio, @@ -204,7 +204,7 @@ def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]: "role": "summary", "level": HIERARCHICAL_LEVEL_L3, "is_final": True, - "summary_level": self.level.name, + "summary_level_name": self.level.name, "input_tokens": self.input_tokens, "output_tokens": self.output_tokens, "compression_ratio": self.compression_ratio, diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md index 59f1dbb5e..ec7b769f2 100644 --- a/docs/architecture/summarizer.md +++ b/docs/architecture/summarizer.md @@ -169,7 +169,6 @@ class ChunkSummary(BaseModel): content: str # The summarized text token_count: int # Tokens in this summary source_tokens: int # Tokens in source chunk - parent_group: int | None # L2 group this belongs to class HierarchicalSummary(BaseModel): l1_summaries: list[ChunkSummary] # Individual chunk summaries @@ -191,7 +190,6 @@ Summaries are stored with rich metadata for retrieval and management: | `level` | ✓ | ✓ | ✓ | 1, 2, or 3 | | `chunk_index` | ✓ | | | Position in L1 sequence | | `group_index` | | ✓ | | Position in L2 sequence | -| `parent_group` | ✓ | | | Which L2 group owns 
this L1 | | `is_final` | | | ✓ | Marks the top-level summary | | `summary_level` | | | ✓ | Name of SummaryLevel enum | | `input_tokens` | | | ✓ | Original content token count | diff --git a/tests/memory/test_store.py b/tests/memory/test_store.py index 453a21a9a..0851d9637 100644 --- a/tests/memory/test_store.py +++ b/tests/memory/test_store.py @@ -159,7 +159,7 @@ def test_upsert_hierarchical_summary_simple() -> None: "role": "summary", "level": 3, "is_final": True, - "summary_level": "STANDARD", + "summary_level_name": "STANDARD", "input_tokens": 1000, "output_tokens": 50, "compression_ratio": 0.05, @@ -192,7 +192,6 @@ def test_upsert_hierarchical_summary_with_chunks() -> None: "role": "summary", "level": 1, "chunk_index": 0, - "parent_group": 0, "created_at": "2024-01-01T00:00:00", }, }, @@ -204,7 +203,6 @@ def test_upsert_hierarchical_summary_with_chunks() -> None: "role": "summary", "level": 1, "chunk_index": 1, - "parent_group": 0, "created_at": "2024-01-01T00:00:00", }, }, diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py index 6eeb133ed..5cb97115d 100644 --- a/tests/summarizer/test_integration.py +++ b/tests/summarizer/test_integration.py @@ -115,7 +115,7 @@ def test_standard_summary_produces_single_entry(self) -> None: assert entry["content"] == "A paragraph summary of the content." 
assert entry["metadata"]["level"] == 3 assert entry["metadata"]["is_final"] is True - assert entry["metadata"]["summary_level"] == "STANDARD" + assert entry["metadata"]["summary_level_name"] == "STANDARD" def test_hierarchical_summary_produces_multiple_entries(self) -> None: """Test that HIERARCHICAL level produces L1, L2, L3 entries.""" @@ -125,21 +125,18 @@ def test_hierarchical_summary_produces_multiple_entries(self) -> None: content="Chunk 0", token_count=10, source_tokens=100, - parent_group=0, ), ChunkSummary( chunk_index=1, content="Chunk 1", token_count=10, source_tokens=100, - parent_group=0, ), ChunkSummary( chunk_index=2, content="Chunk 2", token_count=10, source_tokens=100, - parent_group=0, ), ] hierarchical = HierarchicalSummary( @@ -257,14 +254,12 @@ def test_persist_hierarchical_creates_files( content="Chunk 0 content", token_count=10, source_tokens=100, - parent_group=0, ), ChunkSummary( chunk_index=1, content="Chunk 1 content", token_count=10, source_tokens=100, - parent_group=0, ), ] hierarchical = HierarchicalSummary( diff --git a/tests/summarizer/test_models.py b/tests/summarizer/test_models.py index 23509d2e3..d39621119 100644 --- a/tests/summarizer/test_models.py +++ b/tests/summarizer/test_models.py @@ -223,7 +223,7 @@ def test_to_storage_metadata_simple_summary(self) -> None: assert entry["metadata"]["role"] == "summary" assert entry["metadata"]["level"] == 3 assert entry["metadata"]["is_final"] is True - assert entry["metadata"]["summary_level"] == "STANDARD" + assert entry["metadata"]["summary_level_name"] == "STANDARD" def test_to_storage_metadata_hierarchical(self) -> None: """Test storage metadata for hierarchical summary.""" @@ -233,14 +233,12 @@ def test_to_storage_metadata_hierarchical(self) -> None: content="Chunk 0 text", token_count=10, source_tokens=100, - parent_group=0, ), ChunkSummary( chunk_index=1, content="Chunk 1 text", token_count=12, source_tokens=120, - parent_group=0, ), ] hierarchical = HierarchicalSummary( From 
08e9ac5e23388d6857a836ccadb95775458e758a Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 22:52:39 -0800 Subject: [PATCH 16/37] perf: lazy imports for pydantic_ai, sounddevice, and numpy Improve CLI startup time from ~0.51s to ~0.16s (69% faster) by deferring heavy imports until they're actually needed: - pydantic_ai: lazy in memory/_ingest.py, summarizer/adaptive.py, rag/engine.py - sounddevice: lazy in core/audio.py (moved to TYPE_CHECKING + function imports) - numpy: lazy in rag/_retriever.py and services/tts.py Update tests to patch modules directly (e.g., pydantic_ai.Agent) instead of through module attributes that no longer exist at import time. Add scripts/profile_imports.py for measuring import performance. --- agent_cli/summarizer/adaptive.py | 9 +- scripts/profile_imports.py | 141 ++++++++++++++++++++++++++++++ tests/summarizer/test_adaptive.py | 4 +- 3 files changed, 148 insertions(+), 6 deletions(-) create mode 100755 scripts/profile_imports.py diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 989bd86ba..99fa4641a 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -14,10 +14,6 @@ from dataclasses import dataclass from pydantic import BaseModel -from pydantic_ai import Agent -from pydantic_ai.models.openai import OpenAIChatModel -from pydantic_ai.providers.openai import OpenAIProvider -from pydantic_ai.settings import ModelSettings from agent_cli.summarizer._prompts import ( BRIEF_SUMMARY_PROMPT, @@ -435,6 +431,11 @@ async def _generate_summary( SummarizationError: If summarization fails. 
""" + from pydantic_ai import Agent # noqa: PLC0415 + from pydantic_ai.models.openai import OpenAIChatModel # noqa: PLC0415 + from pydantic_ai.providers.openai import OpenAIProvider # noqa: PLC0415 + from pydantic_ai.settings import ModelSettings # noqa: PLC0415 + provider = OpenAIProvider(api_key=config.api_key, base_url=config.openai_base_url) model = OpenAIChatModel( model_name=config.model, diff --git a/scripts/profile_imports.py b/scripts/profile_imports.py new file mode 100755 index 000000000..d70b5b39e --- /dev/null +++ b/scripts/profile_imports.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +"""Profile CLI import times to identify slow imports. + +Usage: + python scripts/profile_imports.py # Basic timing + python scripts/profile_imports.py -v # Verbose (show all imports) + python scripts/profile_imports.py --top 20 # Show top 20 slowest + python scripts/profile_imports.py --cli-only # Just measure CLI startup time + + # Raw importtime output (for detailed analysis): + python -X importtime -c "from agent_cli.cli import app" 2>&1 | sort -t'|' -k2 -n +""" + +from __future__ import annotations + +import argparse +import subprocess +import sys +import time +from pathlib import Path + + +def measure_import_time(module: str, runs: int = 3) -> float: + """Measure average import time for a module.""" + times = [] + for _ in range(runs): + start = time.perf_counter() + result = subprocess.run( + [sys.executable, "-c", f"import {module}"], + check=False, + capture_output=True, + cwd=Path(__file__).parent.parent, + ) + elapsed = time.perf_counter() - start + if result.returncode != 0: + print(f"Error importing {module}: {result.stderr.decode()}") + return -1 + times.append(elapsed) + return sum(times) / len(times) + + +def get_import_breakdown(module: str) -> list[tuple[float, str]]: + """Get detailed import times using -X importtime.""" + result = subprocess.run( + [sys.executable, "-X", "importtime", "-c", f"import {module}"], + check=False, + capture_output=True, + 
text=True, + cwd=Path(__file__).parent.parent, + ) + + imports = [] + for line in result.stderr.splitlines(): + if "|" not in line: + continue + parts = line.split("|") + if len(parts) >= 2: # noqa: PLR2004 + try: + # importtime format: "import time: self [us] | cumulative | name" + cumulative = int(parts[1].strip()) + name = parts[2].strip() if len(parts) > 2 else "unknown" # noqa: PLR2004 + imports.append((cumulative / 1_000_000, name)) # Convert to seconds + except (ValueError, IndexError): + continue + + return sorted(imports, reverse=True) + + +def main() -> None: + """Run import profiling and display results.""" + parser = argparse.ArgumentParser(description="Profile CLI import times") + parser.add_argument("-v", "--verbose", action="store_true", help="Show all imports") + parser.add_argument("--top", type=int, default=15, help="Show top N slowest imports") + parser.add_argument("--runs", type=int, default=3, help="Number of runs for averaging") + parser.add_argument("--cli-only", action="store_true", help="Only measure CLI import time") + args = parser.parse_args() + + if args.cli_only: + avg = measure_import_time("agent_cli.cli", runs=args.runs) + print(f"CLI import time: {avg:.3f}s (avg of {args.runs} runs)") + return + + print("=" * 60) + print("CLI Import Time Profiling") + print("=" * 60) + + # Measure key entry points + modules = [ + ("agent_cli", "Base package"), + ("agent_cli.cli", "CLI app (full)"), + ("agent_cli.memory", "Memory module (chromadb)"), + ("agent_cli.rag", "RAG module"), + ("agent_cli.summarizer", "Summarizer module"), + ("agent_cli.agents.assistant", "Assistant agent"), + ("agent_cli.agents.summarize", "Summarize agent"), + ("pydantic_ai", "pydantic-ai"), + ("openai", "OpenAI SDK"), + ] + + print(f"\n{'Module':<30} {'Time (s)':<12} Description") + print("-" * 60) + + for module, desc in modules: + avg_time = measure_import_time(module, runs=args.runs) + if avg_time >= 0: + bar = "█" * int(avg_time * 20) # Visual bar (1 block = 50ms) + 
print(f"{module:<30} {avg_time:>8.3f}s {desc} {bar}") + + # Detailed breakdown + print(f"\n{'=' * 60}") + print(f"Top {args.top} slowest imports (cumulative time)") + print("=" * 60) + + imports = get_import_breakdown("agent_cli.cli") + + shown = 0 + for cumtime, name in imports: + if shown >= args.top and not args.verbose: + break + # Skip very fast imports unless verbose + if cumtime < 0.001 and not args.verbose: # noqa: PLR2004 + continue + bar = "█" * int(cumtime * 100) # 1 block = 10ms + print(f"{cumtime:>8.3f}s {name:<40} {bar}") + shown += 1 + + # Summary + if imports: + total = imports[0][0] if imports else 0 + print(f"\n{'=' * 60}") + print(f"Total CLI import time: {total:.3f}s") + if total > 0.5: # noqa: PLR2004 + print("⚠️ Import time > 500ms - consider lazy imports") + elif total > 0.3: # noqa: PLR2004 + print("⚡ Import time moderate (300-500ms)") + else: + print("✅ Import time good (< 300ms)") + + +if __name__ == "__main__": + main() diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py index ac04bc126..6acf43171 100644 --- a/tests/summarizer/test_adaptive.py +++ b/tests/summarizer/test_adaptive.py @@ -277,7 +277,7 @@ async def test_generate_summary_with_pydantic_ai( mock_result = MagicMock() mock_result.output = SummaryOutput(summary="Generated summary.") - with patch("agent_cli.summarizer.adaptive.Agent") as mock_agent_class: + with patch("pydantic_ai.Agent") as mock_agent_class: mock_agent = MagicMock() mock_agent.run = AsyncMock(return_value=mock_result) mock_agent_class.return_value = mock_agent @@ -293,7 +293,7 @@ async def test_raises_summarization_error_on_failure( config: SummarizerConfig, ) -> None: """Test that SummarizationError is raised on failure.""" - with patch("agent_cli.summarizer.adaptive.Agent") as mock_agent_class: + with patch("pydantic_ai.Agent") as mock_agent_class: mock_agent = MagicMock() mock_agent.run = AsyncMock(side_effect=Exception("API error")) mock_agent_class.return_value = mock_agent From 
22d82c46ba1ea953b4f380521df3d5de7b7ab1ba Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 23:08:46 -0800 Subject: [PATCH 17/37] refactor: reduce duplication in memory store and summarizer - Extract upsert_summary_entries() to avoid double to_storage_metadata() call - Extract _summarize_chunks() helper for async chunk processing pipeline --- agent_cli/memory/_persistence.py | 6 ++-- agent_cli/memory/_store.py | 40 +++++++++++++++++----- agent_cli/summarizer/adaptive.py | 59 ++++++++++++++++---------------- 3 files changed, 64 insertions(+), 41 deletions(-) diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py index 2af3a2687..1bb2102d4 100644 --- a/agent_cli/memory/_persistence.py +++ b/agent_cli/memory/_persistence.py @@ -21,8 +21,8 @@ delete_entries, delete_summaries, list_conversation_entries, - upsert_hierarchical_summary, upsert_memories, + upsert_summary_entries, ) from agent_cli.memory.entities import Fact, Turn from agent_cli.memory.models import MemoryMetadata @@ -237,8 +237,8 @@ def persist_hierarchical_summary( LOGGER.info("Persisted summary file: %s (level=%s)", record.path, meta_dict.get("level")) stored_ids.append(record.id) - # Store in ChromaDB - upsert_hierarchical_summary(collection, conversation_id, summary_result) + # Store in ChromaDB (reuse the entries we already built) + upsert_summary_entries(collection, entries) return stored_ids diff --git a/agent_cli/memory/_store.py b/agent_cli/memory/_store.py index b668a2d3b..88edb8c5d 100644 --- a/agent_cli/memory/_store.py +++ b/agent_cli/memory/_store.py @@ -144,26 +144,24 @@ def delete_entries(collection: Collection, ids: list[str]) -> None: delete_docs(collection, ids) -def upsert_hierarchical_summary( +def upsert_summary_entries( collection: Collection, - conversation_id: str, - summary_result: Any, + entries: list[dict[str, Any]], ) -> list[str]: - """Store all levels of a hierarchical summary. + """Store pre-built summary entries to ChromaDB. 
- Uses SummaryResult.to_storage_metadata() to generate ChromaDB entries - for L1 (chunk), L2 (group), and L3 (final) summaries. + This is the low-level helper that accepts entries already built by + SummaryResult.to_storage_metadata(). Use this when you already have + the entries (e.g., after writing files) to avoid duplicate serialization. Args: collection: ChromaDB collection. - conversation_id: The conversation this summary belongs to. - summary_result: A SummaryResult from the adaptive summarizer. + entries: List of entry dicts with 'id', 'content', and 'metadata' keys. Returns: List of IDs that were upserted. """ - entries = summary_result.to_storage_metadata(conversation_id) if not entries: return [] @@ -182,6 +180,30 @@ def upsert_hierarchical_summary( return ids +def upsert_hierarchical_summary( + collection: Collection, + conversation_id: str, + summary_result: Any, +) -> list[str]: + """Store all levels of a hierarchical summary. + + Convenience wrapper that calls to_storage_metadata() and then + upsert_summary_entries(). If you already have the entries built, + call upsert_summary_entries() directly to avoid duplicate work. + + Args: + collection: ChromaDB collection. + conversation_id: The conversation this summary belongs to. + summary_result: A SummaryResult from the adaptive summarizer. + + Returns: + List of IDs that were upserted. + + """ + entries = summary_result.to_storage_metadata(conversation_id) + return upsert_summary_entries(collection, entries) + + def get_summary_at_level( collection: Collection, conversation_id: str, diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 99fa4641a..7d24ef760 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -184,6 +184,34 @@ async def summarize( ) +async def _summarize_chunks( + chunks: list[str], + config: SummarizerConfig, +) -> list[ChunkSummary]: + """Summarize multiple chunks with concurrency control. 
+ + This helper centralizes the semaphore/gather pattern used by both + _detailed_summary and _hierarchical_summary. + + Args: + chunks: List of text chunks to summarize. + config: Summarizer configuration (includes max_concurrent_chunks). + + Returns: + List of ChunkSummary objects in the same order as input chunks. + + """ + semaphore = asyncio.Semaphore(config.max_concurrent_chunks) + total = len(chunks) + + async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: + async with semaphore: + return await _summarize_single_chunk(chunk, idx, total, config) + + gen = (summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)) + return list(await asyncio.gather(*gen)) + + async def _summarize_single_chunk( chunk: str, chunk_index: int, @@ -268,21 +296,7 @@ async def _detailed_summary( logger.info("Detailed summary: processing %d chunks", len(chunks)) - # Summarize chunks (with concurrency limit) - semaphore = asyncio.Semaphore(config.max_concurrent_chunks) - - async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: - async with semaphore: - return await _summarize_single_chunk( - chunk, - idx, - len(chunks), - config, - ) - - chunk_summaries = await asyncio.gather( - *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)], - ) + chunk_summaries = await _summarize_chunks(chunks, config) # Generate meta-summary all_summaries = [cs.content for cs in chunk_summaries] @@ -341,20 +355,7 @@ async def _hierarchical_summary( logger.info("Hierarchical summary: processing %d chunks in tree", len(chunks)) # L1: Summarize each chunk - semaphore = asyncio.Semaphore(config.max_concurrent_chunks) - - async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: - async with semaphore: - return await _summarize_single_chunk( - chunk, - idx, - len(chunks), - config, - ) - - l1_summaries = await asyncio.gather( - *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)], - ) + l1_summaries = await _summarize_chunks(chunks, 
config) # L2: Group summaries (if more than L2_MIN_CHUNKS chunks) l2_summaries: list[str] = [] From 32a9ad4f027da533b96ed37924d2ba632728a70f Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 23:22:14 -0800 Subject: [PATCH 18/37] refactor: simplify docstrings and remove unused upsert_hierarchical_summary - Replace verbose Args/Returns docstrings with single-line summaries - Remove upsert_hierarchical_summary (was only used in tests) - Update tests to use upsert_summary_entries directly Net: -102 lines --- agent_cli/memory/_store.py | 39 +-------------------- agent_cli/summarizer/_utils.py | 11 +----- agent_cli/summarizer/adaptive.py | 52 +++------------------------- tests/memory/test_store.py | 28 ++++----------- tests/summarizer/test_integration.py | 8 +++-- 5 files changed, 18 insertions(+), 120 deletions(-) diff --git a/agent_cli/memory/_store.py b/agent_cli/memory/_store.py index 88edb8c5d..36ace5888 100644 --- a/agent_cli/memory/_store.py +++ b/agent_cli/memory/_store.py @@ -148,20 +148,7 @@ def upsert_summary_entries( collection: Collection, entries: list[dict[str, Any]], ) -> list[str]: - """Store pre-built summary entries to ChromaDB. - - This is the low-level helper that accepts entries already built by - SummaryResult.to_storage_metadata(). Use this when you already have - the entries (e.g., after writing files) to avoid duplicate serialization. - - Args: - collection: ChromaDB collection. - entries: List of entry dicts with 'id', 'content', and 'metadata' keys. - - Returns: - List of IDs that were upserted. - - """ + """Store pre-built summary entries (from to_storage_metadata) to ChromaDB.""" if not entries: return [] @@ -180,30 +167,6 @@ def upsert_summary_entries( return ids -def upsert_hierarchical_summary( - collection: Collection, - conversation_id: str, - summary_result: Any, -) -> list[str]: - """Store all levels of a hierarchical summary. - - Convenience wrapper that calls to_storage_metadata() and then - upsert_summary_entries(). 
If you already have the entries built, - call upsert_summary_entries() directly to avoid duplicate work. - - Args: - collection: ChromaDB collection. - conversation_id: The conversation this summary belongs to. - summary_result: A SummaryResult from the adaptive summarizer. - - Returns: - List of IDs that were upserted. - - """ - entries = summary_result.to_storage_metadata(conversation_id) - return upsert_summary_entries(collection, entries) - - def get_summary_at_level( collection: Collection, conversation_id: str, diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py index 731c55058..2c37159fc 100644 --- a/agent_cli/summarizer/_utils.py +++ b/agent_cli/summarizer/_utils.py @@ -31,16 +31,7 @@ def _get_encoding(model: str = "gpt-4") -> tiktoken.Encoding | None: def count_tokens(text: str, model: str = "gpt-4") -> int: - """Count tokens in text using tiktoken, with a lightweight fallback. - - Args: - text: The text to count tokens for. - model: Model name for tokenizer selection. - - Returns: - Number of tokens in the text. - - """ + """Count tokens using tiktoken, falling back to char-based estimate.""" if not text: return 0 enc = _get_encoding(model) diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 7d24ef760..62b9b68cd 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -94,15 +94,7 @@ def __post_init__(self) -> None: def determine_level(token_count: int) -> SummaryLevel: - """Determine the appropriate summary level based on token count. - - Args: - token_count: Number of tokens in the input. - - Returns: - The recommended SummaryLevel. 
- - """ + """Map token count to appropriate SummaryLevel.""" if token_count < LEVEL_THRESHOLDS[SummaryLevel.NONE]: return SummaryLevel.NONE if token_count < LEVEL_THRESHOLDS[SummaryLevel.BRIEF]: @@ -188,19 +180,7 @@ async def _summarize_chunks( chunks: list[str], config: SummarizerConfig, ) -> list[ChunkSummary]: - """Summarize multiple chunks with concurrency control. - - This helper centralizes the semaphore/gather pattern used by both - _detailed_summary and _hierarchical_summary. - - Args: - chunks: List of text chunks to summarize. - config: Summarizer configuration (includes max_concurrent_chunks). - - Returns: - List of ChunkSummary objects in the same order as input chunks. - - """ + """Summarize chunks concurrently with semaphore-controlled parallelism.""" semaphore = asyncio.Semaphore(config.max_concurrent_chunks) total = len(chunks) @@ -218,18 +198,7 @@ async def _summarize_single_chunk( total_chunks: int, config: SummarizerConfig, ) -> ChunkSummary: - """Summarize a single chunk of content. - - Args: - chunk: The text chunk to summarize. - chunk_index: Index of this chunk (0-based). - total_chunks: Total number of chunks being processed. - config: Summarizer configuration. - - Returns: - ChunkSummary with the summarized content. - - """ + """Summarize a single chunk and return its metadata.""" source_tokens = count_tokens(chunk, config.model) target_tokens = estimate_summary_tokens(source_tokens, SummaryLevel.STANDARD) max_words = tokens_to_words(target_tokens) @@ -418,20 +387,7 @@ async def _generate_summary( config: SummarizerConfig, max_tokens: int = 256, ) -> str: - """Generate a summary using the LLM. - - Args: - prompt: The prompt to send to the LLM. - config: Summarizer configuration. - max_tokens: Maximum tokens for the response. - - Returns: - The generated summary text. - - Raises: - SummarizationError: If summarization fails. - - """ + """Call the LLM to generate a summary. 
Raises SummarizationError on failure.""" from pydantic_ai import Agent # noqa: PLC0415 from pydantic_ai.models.openai import OpenAIChatModel # noqa: PLC0415 from pydantic_ai.providers.openai import OpenAIProvider # noqa: PLC0415 diff --git a/tests/memory/test_store.py b/tests/memory/test_store.py index 0851d9637..5e8e33142 100644 --- a/tests/memory/test_store.py +++ b/tests/memory/test_store.py @@ -133,21 +133,10 @@ def test_upsert_and_delete_entries_delegate() -> None: assert fake.deleted == [["x"]] -# --- Hierarchical Summary Tests --- +# --- Summary Entry Tests --- -class _MockSummaryResult: - """Mock SummaryResult for testing without importing the full summarizer module.""" - - def __init__(self, entries: list[dict[str, Any]]) -> None: - self._entries = entries - - def to_storage_metadata(self, _conversation_id: str) -> list[dict[str, Any]]: - # Just return the pre-configured entries (ignores conversation_id) - return self._entries - - -def test_upsert_hierarchical_summary_simple() -> None: +def test_upsert_summary_entries_simple() -> None: """Test upserting a simple (non-hierarchical) summary.""" fake = _FakeCollection() entries = [ @@ -167,9 +156,8 @@ def test_upsert_hierarchical_summary_simple() -> None: }, }, ] - mock_result = _MockSummaryResult(entries) - ids = _store.upsert_hierarchical_summary(fake, "conv-123", mock_result) + ids = _store.upsert_summary_entries(fake, entries) assert ids == ["conv-123:summary:L3:final"] assert len(fake.upserts) == 1 @@ -180,7 +168,7 @@ def test_upsert_hierarchical_summary_simple() -> None: assert upserted_metas[0]["is_final"] is True -def test_upsert_hierarchical_summary_with_chunks() -> None: +def test_upsert_summary_entries_with_chunks() -> None: """Test upserting a hierarchical summary with L1 and L3 entries.""" fake = _FakeCollection() entries = [ @@ -221,9 +209,8 @@ def test_upsert_hierarchical_summary_with_chunks() -> None: }, }, ] - mock_result = _MockSummaryResult(entries) - ids = 
_store.upsert_hierarchical_summary(fake, "conv-456", mock_result) + ids = _store.upsert_summary_entries(fake, entries) assert len(ids) == 3 assert "conv-456:summary:L1:0" in ids @@ -231,12 +218,11 @@ def test_upsert_hierarchical_summary_with_chunks() -> None: assert "conv-456:summary:L3:final" in ids -def test_upsert_hierarchical_summary_empty() -> None: +def test_upsert_summary_entries_empty() -> None: """Test upserting when there are no entries (e.g., NONE level).""" fake = _FakeCollection() - mock_result = _MockSummaryResult([]) - ids = _store.upsert_hierarchical_summary(fake, "conv-789", mock_result) + ids = _store.upsert_summary_entries(fake, []) assert ids == [] assert len(fake.upserts) == 0 diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py index 5cb97115d..d70286592 100644 --- a/tests/summarizer/test_integration.py +++ b/tests/summarizer/test_integration.py @@ -12,7 +12,7 @@ from agent_cli.memory._store import ( get_final_summary, get_summary_at_level, - upsert_hierarchical_summary, + upsert_summary_entries, ) from agent_cli.summarizer import SummaryLevel, SummaryResult from agent_cli.summarizer.adaptive import determine_level @@ -185,7 +185,8 @@ def test_store_simple_summary(self, fake_collection: _FakeCollection) -> None: compression_ratio=0.05, ) - ids = upsert_hierarchical_summary(fake_collection, "conv-123", result) + entries = result.to_storage_metadata("conv-123") + ids = upsert_summary_entries(fake_collection, entries) assert len(ids) == 1 assert "conv-123:summary:L3:final" in ids @@ -225,7 +226,8 @@ def test_store_hierarchical_summary(self, fake_collection: _FakeCollection) -> N compression_ratio=0.02, ) - ids = upsert_hierarchical_summary(fake_collection, "conv-789", result) + entries = result.to_storage_metadata("conv-789") + ids = upsert_summary_entries(fake_collection, entries) assert len(ids) == 3 # 2 L1 + 1 L3 From 6b1b47e53a13015e762abc3b693fbc400da21670 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: 
Wed, 26 Nov 2025 23:32:10 -0800 Subject: [PATCH 19/37] fix(summarizer): strip special tokens from LLM output Some models leak control tokens like <|constrain|>, <|end|>, etc. into their output. Add regex cleanup in _generate_summary(). Also rewrites docs/architecture/summarizer.md to focus on research foundations and design rationale rather than code snippets. --- agent_cli/summarizer/adaptive.py | 6 +- docs/architecture/summarizer.md | 562 ++++++++----------------------- 2 files changed, 141 insertions(+), 427 deletions(-) diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 62b9b68cd..9d17c8d7e 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -11,6 +11,7 @@ import asyncio import logging +import re from dataclasses import dataclass from pydantic import BaseModel @@ -412,7 +413,10 @@ async def _generate_summary( try: result = await agent.run(prompt) - return result.output.summary.strip() + text = result.output.summary.strip() + # Strip special tokens that some models leak (e.g., <|constrain|>, <|end|>) + text = re.sub(r"<\|[^|]+\|>", "", text) + return text.strip() except Exception as e: msg = f"Summarization failed: {e}" raise SummarizationError(msg) from e diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md index ec7b769f2..d69b3b111 100644 --- a/docs/architecture/summarizer.md +++ b/docs/architecture/summarizer.md @@ -1,539 +1,249 @@ # Agent CLI: Adaptive Summarizer Technical Specification -This document describes the architectural decisions, design rationale, and technical approach for the `agent-cli` adaptive summarization subsystem. The design is grounded in research from Letta (partial eviction, middle truncation) and Mem0 (rolling summaries, compression ratios). +This document describes the architectural decisions, design rationale, and technical approach for the `agent-cli` adaptive summarization subsystem. ## 1. 
System Overview The adaptive summarizer provides **content-aware compression** that scales summarization depth with input complexity. Rather than applying a one-size-fits-all approach, it automatically selects the optimal strategy based on token count. ``` -┌─────────────────────────────────────────────────────────────────────┐ -│ Adaptive Summarization Pipeline │ -├─────────────────────────────────────────────────────────────────────┤ -│ │ -│ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy │ -│ │ -│ ┌─────────────────────────────────────────────────────────────┐ │ -│ │ Level Thresholds: │ │ -│ │ < 100 tokens ──▶ NONE (no summary needed) │ │ -│ │ 100-500 ──▶ BRIEF (single sentence) │ │ -│ │ 500-3000 ──▶ STANDARD (paragraph) │ │ -│ │ 3000-15000 ──▶ DETAILED (chunked + meta) │ │ -│ │ > 15000 ──▶ HIERARCHICAL (L1/L2/L3 tree) │ │ -│ └─────────────────────────────────────────────────────────────┘ │ -│ │ -│ Output: SummaryResult with compression metrics │ -└─────────────────────────────────────────────────────────────────────┘ +Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy + │ + ┌───────────────────────────────┼───────────────────────────────┐ + │ │ │ + < 100 tokens 3000-15000 tokens > 15000 tokens + │ │ │ + No summary needed Chunked processing Hierarchical tree + + meta-synthesis (L1/L2/L3) ``` **Design Goals:** - **Adaptive compression:** Match summarization depth to content complexity. - **Research-grounded:** Based on proven approaches from Letta and Mem0. -- **Hierarchical structure:** Preserve detail at multiple granularities. +- **Hierarchical structure:** Preserve detail at multiple granularities for large content. - **Content-type awareness:** Domain-specific prompts for conversations, journals, documents. --- -## 2. Architectural Decisions +## 2.
Research Foundations -### 2.1 Token-Based Level Selection +The summarization approach draws from two research-backed memory systems: -**Decision:** Select summarization strategy based on input token count with fixed thresholds. - -**Rationale:** +### 2.1 Letta (MemGPT) Contributions -- **Predictable behavior:** Users can anticipate output length based on input size. -- **Optimal compression:** Each level targets a specific compression ratio validated by research. -- **Efficiency:** Avoid over-processing short content or under-processing long content. +**Reference:** arXiv:2310.08560 -**Implementation:** +Letta's approach to memory management introduced several techniques adopted here: -```python -THRESHOLD_NONE = 100 # Below this: no summary needed -THRESHOLD_BRIEF = 500 # 100-500: single sentence (~20% compression) -THRESHOLD_STANDARD = 3000 # 500-3000: paragraph (~12% compression) -THRESHOLD_DETAILED = 15000 # 3000-15000: chunked (~7% compression) -# Above 15000: hierarchical tree structure -``` +- **Partial eviction:** Rather than discarding old content entirely, compress it to summaries while keeping recent content detailed. This maps to our hierarchical L1/L2/L3 structure where L1 preserves chunk-level detail and L3 provides high-level synthesis. -**Trade-off:** Fixed thresholds may not be optimal for all content types, but provide consistent, predictable behavior. +- **Middle truncation:** When content must be reduced, preserve the head (introductions, context-setting) and tail (conclusions, recent events) while removing the middle. Research shows important information clusters at boundaries. -### 2.2 Hierarchical Summary Structure (L1/L2/L3) +- **Fire-and-forget background processing:** Summarization runs asynchronously after turn completion, avoiding latency on the critical path. -**Decision:** For long content, build a tree of summaries at three levels of granularity. 
+### 2.2 Mem0 Contributions -**Rationale:** +**Reference:** arXiv:2504.19413 -- **Partial eviction:** Inspired by Letta's memory architecture—keep detailed summaries for recent content, compressed summaries for older content. -- **Flexible retrieval:** Different use cases need different detail levels. -- **Progressive compression:** Each level provides ~5x compression over the previous. - -**Implementation:** - -- **L1 (Chunk Summaries):** Individual summaries of ~3000 token chunks with 200 token overlap. -- **L2 (Group Summaries):** Summaries of groups of ~5 L1 summaries. -- **L3 (Final Summary):** Single synthesized summary of all L2 summaries. - -**Storage:** -```text -summaries/ - L1/ - chunk_0.md # Summary of tokens 0-3000 - chunk_1.md # Summary of tokens 2800-5800 (overlap) - L2/ - group_0.md # Synthesis of chunk_0 through chunk_4 - L3/ - final.md # Final narrative summary -``` +Mem0's memory layer research established compression ratio targets: -### 2.3 Content-Type Aware Prompts +- **90%+ compression:** Long-running conversations can achieve 10:1 or better compression while retaining semantic meaning. Our hierarchical approach targets similar ratios for very long content. -**Decision:** Use different prompt templates for different content domains. +- **Rolling summaries:** New information integrates with existing summaries rather than replacing them. The `prior_summary` parameter throughout our pipeline implements this pattern. -**Rationale:** +- **Two-phase architecture:** Separate extraction (what's important) from storage (how to persist it). We apply this by first generating summaries, then persisting to both files and vector DB. -- **Conversations:** Focus on user preferences, decisions, action items. -- **Journals:** Emphasize personal insights, emotional context, growth patterns. -- **Documents:** Prioritize key findings, methodology, conclusions. 
- -**Implementation:** - -```python -def get_prompt_for_content_type(content_type: str) -> str: - match content_type: - case "conversation": return CONVERSATION_PROMPT - case "journal": return JOURNAL_PROMPT - case "document": return DOCUMENT_PROMPT - case _: return STANDARD_PROMPT -``` - -### 2.4 Prior Summary Integration - -**Decision:** Always provide the previous summary as context when updating. - -**Rationale:** - -- **Continuity:** New summaries should build on existing context, not replace it. -- **Incremental updates:** Avoid re-summarizing all content on every update. -- **Context preservation:** Important information from earlier content persists. - -**Implementation:** +--- -- The `prior_summary` parameter is passed through the entire pipeline. -- `ROLLING_PROMPT` specifically handles integrating new facts with existing summaries. -- For hierarchical summaries, only the L3 summary is used as prior context. +## 3. Architectural Decisions -### 2.5 Compression Ratio Tracking +### 3.1 Token-Based Level Selection -**Decision:** Track and report compression metrics for every summary. +**Decision:** Select summarization strategy based on input token count with fixed thresholds. **Rationale:** -- **Transparency:** Users can understand how much information was compressed. -- **Quality monitoring:** Unusual ratios may indicate summarization issues. -- **Optimization:** Metrics inform future threshold tuning. - -**Implementation:** - -```python -@dataclass -class SummaryResult: - level: SummaryLevel - summary: str | None - hierarchical: HierarchicalSummary | None - input_tokens: int - output_tokens: int - compression_ratio: float # output/input (lower = more compression) -``` - ---- - -## 3. Data Model +- **Predictable behavior:** Users can anticipate output length based on input size. +- **Optimal compression:** Each level targets a specific compression ratio validated by research. 
+- **Efficiency:** Avoid over-processing short content or under-processing long content. -### 3.1 Summary Levels +**Thresholds:** | Level | Token Range | Target Compression | Strategy | | :--- | :--- | :--- | :--- | -| `NONE` | < 100 | N/A | No summarization | -| `BRIEF` | 100-500 | ~20% | Single sentence | -| `STANDARD` | 500-3000 | ~12% | Paragraph | -| `DETAILED` | 3000-15000 | ~7% | Chunked + meta | -| `HIERARCHICAL` | > 15000 | ~3-5% | L1/L2/L3 tree | - -### 3.2 Hierarchical Summary Structure - -```python -class ChunkSummary(BaseModel): - chunk_index: int # Position in original content - content: str # The summarized text - token_count: int # Tokens in this summary - source_tokens: int # Tokens in source chunk - -class HierarchicalSummary(BaseModel): - l1_summaries: list[ChunkSummary] # Individual chunk summaries - l2_summaries: list[str] # Group summaries - l3_summary: str # Final synthesis - chunk_size: int = 3000 # Tokens per chunk - chunk_overlap: int = 200 # Overlap between chunks -``` - -### 3.3 Storage Metadata (ChromaDB) - -Summaries are stored with rich metadata for retrieval and management: +| NONE | < 100 | N/A | No summarization needed | +| BRIEF | 100-500 | ~20% | Single sentence | +| STANDARD | 500-3000 | ~12% | Paragraph | +| DETAILED | 3000-15000 | ~7% | Chunked + meta-synthesis | +| HIERARCHICAL | > 15000 | ~3-5% | L1/L2/L3 tree | -| Field | L1 | L2 | L3 | Description | -| :--- | :---: | :---: | :---: | :--- | -| `id` | ✓ | ✓ | ✓ | `{conversation_id}:summary:L{n}:{index}` | -| `conversation_id` | ✓ | ✓ | ✓ | Scope key | -| `role` | ✓ | ✓ | ✓ | Always `"summary"` | -| `level` | ✓ | ✓ | ✓ | 1, 2, or 3 | -| `chunk_index` | ✓ | | | Position in L1 sequence | -| `group_index` | | ✓ | | Position in L2 sequence | -| `is_final` | | | ✓ | Marks the top-level summary | -| `summary_level` | | | ✓ | Name of SummaryLevel enum | -| `input_tokens` | | | ✓ | Original content token count | -| `output_tokens` | | | ✓ | Total summary token count | -| 
`compression_ratio` | | | ✓ | Output/input ratio | -| `created_at` | ✓ | ✓ | ✓ | ISO 8601 timestamp | +**Trade-off:** Fixed thresholds may not be optimal for all content types, but provide consistent, predictable behavior. Content-type prompts provide domain adaptation within each level. -### 3.4 File Format +### 3.2 Hierarchical Summary Structure (L1/L2/L3) -Summary files use Markdown with YAML front matter: +**Decision:** For long content, build a tree of summaries at three levels of granularity. -```markdown ---- -id: "journal:summary:L3:final" -conversation_id: "journal" -role: "summary" -level: 3 -is_final: true -summary_level: "STANDARD" -input_tokens: 1500 -output_tokens: 180 -compression_ratio: 0.12 -created_at: "2025-01-15T10:30:00Z" ---- +**Rationale:** -The user has been exploring adaptive summarization techniques... -``` +- **Partial eviction:** Inspired by Letta—keep detailed summaries for granular retrieval, compressed summaries for context injection. +- **Flexible retrieval:** Different use cases need different detail levels. RAG queries might want L1 chunks; prompt injection wants L3. +- **Progressive compression:** Each level provides ~5x compression over the previous, achieving high overall compression while preserving structure. ---- +**Structure:** -## 4. Processing Pipeline +- **L1 (Chunk Summaries):** Individual summaries of ~3000 token chunks. Preserves local context and specific details. Chunks overlap by ~200 tokens to maintain continuity across boundaries. +- **L2 (Group Summaries):** Summaries of groups of ~5 L1 summaries. Only generated when content exceeds ~5 chunks. Provides mid-level abstraction. +- **L3 (Final Summary):** Single synthesized summary. Used for prompt injection and as prior context for rolling updates. -### 4.1 Main Entry Point +**Trade-off:** The three-level hierarchy adds complexity but enables efficient retrieval at multiple granularities. 
For content under 15000 tokens, we skip L2 entirely (DETAILED level uses only L1 + L3). -```python -async def summarize( - content: str, - config: SummarizerConfig, - prior_summary: str | None = None, - content_type: str = "general", -) -> SummaryResult -``` +### 3.3 Semantic Boundary Chunking -### 4.2 Level Selection Flow +**Decision:** Split content on semantic boundaries (paragraphs, then sentences) rather than fixed character counts. -``` -Input Content - │ - ▼ -┌─────────────┐ -│ Count Tokens│ (tiktoken, cl100k_base) -└──────┬──────┘ - │ - ▼ -┌─────────────────────────────────────────┐ -│ determine_level(token_count) -> Level │ -│ │ -│ < 100 ──▶ NONE │ -│ < 500 ──▶ BRIEF │ -│ < 3000 ──▶ STANDARD │ -│ < 15000 ──▶ DETAILED │ -│ else ──▶ HIERARCHICAL │ -└──────┬──────────────────────────────────┘ - │ - ▼ - Execute level-specific strategy -``` +**Rationale:** -### 4.3 Strategy Execution by Level - -#### NONE Level -- **Action:** Return immediately with no summary. -- **Output:** `SummaryResult(level=NONE, summary=None, compression_ratio=1.0)` - -#### BRIEF Level -- **Prompt:** `BRIEF_PROMPT` - distill to single sentence. -- **LLM Call:** Single generation with low max_tokens. -- **Output:** One-sentence summary. - -#### STANDARD Level -- **Prompt:** `STANDARD_PROMPT` with optional prior summary context. -- **LLM Call:** Single generation. -- **Output:** Paragraph-length summary. - -#### DETAILED Level -1. **Chunk:** Split content into ~3000 token chunks with 200 token overlap. -2. **Parallel L1:** Generate summary for each chunk using `CHUNK_PROMPT`. -3. **Meta-synthesis:** Combine L1 summaries using `META_PROMPT`. -4. **Output:** `HierarchicalSummary` with L1s and L3 (no L2 needed for this size). - -#### HIERARCHICAL Level -1. **Chunk:** Split into ~3000 token chunks with overlap. -2. **Parallel L1:** Generate chunk summaries. -3. **Group:** Organize L1s into groups of ~5. -4. **Parallel L2:** Summarize each group. -5. 
**L3 Synthesis:** Final meta-summary of all L2s. -6. **Output:** Full `HierarchicalSummary` tree. - -### 4.4 Chunking Algorithm - -```python -def chunk_text( - text: str, - chunk_size: int = 3000, - overlap: int = 200, -) -> list[str]: - """Split text into overlapping chunks on paragraph boundaries.""" -``` +- **Coherence preservation:** Splitting mid-sentence or mid-thought loses context and produces poor summaries. +- **Natural units:** Paragraphs and sentences are natural semantic units that humans use to organize thoughts. +- **Overlap for continuity:** The 200-token overlap ensures concepts spanning chunk boundaries aren't lost. -**Strategy:** +**Fallback chain:** -1. **Paragraph-first:** Try to split on double newlines. -2. **Sentence fallback:** If paragraph exceeds chunk_size, split on sentence boundaries. -3. **Character fallback:** For very long sentences (e.g., code), use character splitting. -4. **Overlap handling:** Each chunk starts with the last `overlap` tokens of the previous. +1. Prefer paragraph boundaries (double newlines) +2. Fall back to sentence boundaries (`.!?` followed by space + capital) +3. Final fallback to character splitting for edge cases (e.g., code blocks without punctuation) -### 4.5 Middle Truncation (Utility) +### 3.4 Content-Type Aware Prompts -For handling very large inputs that could exceed context windows: +**Decision:** Use different prompt templates for different content domains. -```python -def middle_truncate( - text: str, - budget_chars: int, - head_frac: float = 0.3, - tail_frac: float = 0.3, -) -> tuple[str, int]: - """Keep head and tail, remove middle (least likely to contain key info).""" -``` +**Rationale:** -**Rationale:** Research shows that important information clusters at beginnings (introductions, key points) and endings (conclusions, action items). Useful when summarizing very long conversations that may contain pasted codebases. 
+- **Conversations:** Focus on user preferences, decisions, action items—what the user wants and what was agreed. +- **Journals:** Emphasize personal insights, emotional context, growth patterns—the subjective experience. +- **Documents:** Prioritize key findings, methodology, conclusions—the objective content. ---- +A generic summarization prompt loses domain-specific signal. By tailoring prompts, we extract what matters for each use case. -## 5. Prompt Specifications +### 3.5 Prior Summary Integration (Rolling Updates) -### 5.1 Brief Summary (`BRIEF_PROMPT`) +**Decision:** Always provide the previous summary as context when generating updates. -``` -Distill the following content into a single, comprehensive sentence -that captures the essential meaning: +**Rationale:** -{content} +- **Continuity:** New summaries should build on existing context, not start fresh each time. +- **Incremental updates:** Avoid re-summarizing all historical content on every update. +- **Information preservation:** Important information from earlier content persists through the chain of summaries. -Summary (one sentence): -``` +This implements Mem0's "rolling summary" pattern. The L3 summary from the previous run becomes prior context for the next summarization, allowing information to flow forward through time. -### 5.2 Standard Summary (`STANDARD_PROMPT`) +### 3.6 Compression Ratio Tracking -``` -Summarize the following content in a concise paragraph. -{prior_context} -Focus on key information, decisions, and actionable insights. +**Decision:** Track and report compression metrics for every summary. -Content: -{content} +**Rationale:** -Summary: -``` +- **Transparency:** Users can understand how much information was compressed. +- **Quality monitoring:** Unusual ratios (e.g., output longer than input) may indicate summarization issues. +- **Optimization:** Metrics inform future threshold tuning and quality assessment. 
-### 5.3 Chunk Summary (`CHUNK_PROMPT`) +Every `SummaryResult` includes `input_tokens`, `output_tokens`, and `compression_ratio` for observability. -``` -Summarize this section of a larger document. -Preserve specific details, names, and numbers that may be important. +--- -Section {chunk_index} of {total_chunks}: -{content} +## 4. Processing Pipeline -Section summary: -``` +### 4.1 Level Selection -### 5.4 Meta Summary (`META_PROMPT`) +The entry point counts tokens and selects strategy: -``` -Synthesize these section summaries into a coherent narrative. -Maintain logical flow and preserve the most important information. +1. **Token counting:** Uses tiktoken with model-appropriate encoding. Falls back to character-based estimation (~4 chars/token) if tiktoken unavailable. +2. **Threshold comparison:** Maps token count to `SummaryLevel` enum. +3. **Strategy dispatch:** Calls level-specific handler. -Section Summaries: -{summaries} +### 4.2 Brief and Standard Levels -Synthesized Summary: -``` +For short content (< 3000 tokens): -### 5.5 Content-Type Prompts +- Single LLM call with level-appropriate prompt +- Prior summary injected as context if available +- Content-type selection determines prompt variant +- Returns simple `SummaryResult` with no hierarchical structure -All content-type prompts include `{prior_context}` for rolling summary continuity. +### 4.3 Detailed and Hierarchical Levels -**Conversation:** -``` -Summarize this conversation focusing on: -- User preferences and decisions -- Action items and commitments -- Key topics discussed -``` +For longer content: -**Journal:** -``` -Summarize this journal entry focusing on: -- Personal insights and reflections -- Emotional context and growth -- Goals and intentions -``` +1. **Chunking:** Split content into overlapping chunks on semantic boundaries. +2. **Parallel L1 generation:** Summarize each chunk independently. Uses semaphore-controlled concurrency to avoid overwhelming the LLM. +3. 
**L2 grouping (hierarchical only):** Organize L1s into groups of ~5, summarize each group. +4. **L3 synthesis:** Meta-summarize all L2s (or all L1s for DETAILED level) into final summary. -**Document:** -``` -Summarize this document focusing on: -- Key findings and conclusions -- Methodology and approach -- Recommendations and implications -``` +The parallelism at L1 and L2 levels provides significant speedup for long content while maintaining semantic coherence through the hierarchical structure. --- -## 6. Integration with Memory System +## 5. Integration with Memory System -### 6.1 Entry Point +### 5.1 Write Path -The memory system calls the summarizer via `_ingest.summarize_content()`: +The memory system triggers summarization during post-processing: -```python -async def summarize_content( - content: str, - prior_summary: str | None = None, - content_type: str = "general", - openai_base_url: str, - api_key: str | None, - model: str, -) -> SummaryResult -``` +1. Collect content to summarize (extracted facts, conversation turns) +2. Retrieve existing L3 summary as prior context +3. Call summarizer with content + prior summary + content type +4. 
Persist results: delete old summaries, write new files, upsert to ChromaDB -### 6.2 Storage Flow +### 5.2 Read Path -``` -summarize_content() - │ - ▼ -SummaryResult - │ - ▼ -store_adaptive_summary() - │ - ├──▶ persist_hierarchical_summary() - │ │ - │ ├──▶ Delete old summaries (L1, L2, L3) - │ ├──▶ Write new summary files - │ └──▶ Upsert to ChromaDB - │ - └──▶ Return stored IDs -``` +The memory retrieval system uses summaries for context injection: -### 6.3 Retrieval Integration +- Fetches L3 (final) summary for the conversation +- Injects as prefix to retrieved memories in the prompt +- Provides high-level context that individual memory snippets lack -The memory retrieval system uses `get_final_summary()` to fetch the L3 summary: +### 5.3 Storage -```python -def get_final_summary( - collection: Collection, - conversation_id: str, -) -> StoredMemory | None: - """Retrieve the L3 final summary for injection into prompts.""" -``` +Summaries are persisted in two places: + +- **Files:** Markdown with YAML front matter under `summaries/L1/`, `L2/`, `L3/` directories. Human-readable, git-trackable. +- **ChromaDB:** Vector embeddings for semantic search. Metadata includes level, compression metrics, timestamps. --- -## 7. Configuration Reference +## 6. 
Configuration | Parameter | Default | Description | | :--- | :--- | :--- | -| `openai_base_url` | *required* | Base URL for LLM API | -| `model` | *required* | Model ID for summarization | -| `api_key` | `None` | API key (optional for local models) | -| `chunk_size` | `3000` | Tokens per chunk for hierarchical | -| `chunk_overlap` | `200` | Token overlap between chunks | - -### 7.1 Level Thresholds (Constants) +| `chunk_size` | 3000 | Target tokens per chunk | +| `chunk_overlap` | 200 | Overlap between consecutive chunks | +| `max_concurrent_chunks` | 5 | Parallel LLM calls for chunk summarization | -| Constant | Value | Description | -| :--- | :--- | :--- | -| `THRESHOLD_NONE` | 100 | Below: no summary | -| `THRESHOLD_BRIEF` | 500 | Below: single sentence | -| `THRESHOLD_STANDARD` | 3000 | Below: paragraph | -| `THRESHOLD_DETAILED` | 15000 | Below: chunked | +Level thresholds are constants (100, 500, 3000, 15000 tokens) chosen based on empirical testing and Mem0 research on optimal compression ratios. --- -## 8. Error Handling +## 7. Error Handling -### 8.1 Fail-Fast Philosophy +Summarization follows a fail-fast philosophy: -Errors are propagated rather than hidden behind fallbacks: +- **LLM errors:** Propagated as `SummarizationError` rather than silently returning empty results. +- **Empty input:** Returns NONE level immediately (not an error). +- **Encoding errors:** Falls back to character-based token estimation. -| Error | Behavior | -| :--- | :--- | -| LLM timeout | Raises `SummarizationError` | -| LLM error | Raises `SummarizationError` | -| Token counting failure | Falls back to `cl100k_base` encoding | - -### 8.2 Validation - -- **Empty content:** Returns NONE level immediately. -- **Whitespace-only:** Returns NONE level. -- **Invalid compression ratio:** Clamped to [0.0, 1.0]. +The caller (memory system) decides how to handle failures—typically by proceeding without a summary rather than blocking the entire write path. --- -## 9. 
Performance Considerations - -### 9.1 Token Counting +## 8. Comparison with Alternatives -- Uses `tiktoken` with `cl100k_base` encoding (GPT-4 tokenizer). -- Caches tokenizer instance for efficiency. -- Falls back to character-based estimation if tiktoken unavailable. - -### 9.2 Parallel Processing - -For DETAILED and HIERARCHICAL levels: -- L1 chunk summaries can be generated in parallel. -- L2 group summaries can be generated in parallel. -- Only L3 synthesis requires sequential processing. - -### 9.3 Caching - -- Token counts are computed once per content string. -- Prompt templates are loaded once at module import. -- ChromaDB connection is reused across operations. - ---- - -## 10. Comparison with Alternative Approaches - -| Aspect | Adaptive Summarizer | Rolling Summary | Fixed Chunking | +| Aspect | Adaptive Summarizer | Fixed Rolling Summary | No Summarization | | :--- | :--- | :--- | :--- | -| **Compression** | 3-20% (varies by level) | ~15% fixed | ~10% fixed | -| **Detail preservation** | Hierarchical (L1/L2/L3) | Single level | Single level | -| **Context awareness** | Content-type prompts | Generic | Generic | -| **Efficiency** | Skip short content | Always summarize | Always chunk | -| **Research basis** | Letta + Mem0 | Mem0 only | None | - ---- - -## 11. Future Enhancements +| **Compression** | 3-20% (scales with input) | ~15% fixed | 0% | +| **Detail preservation** | Hierarchical (L1/L2/L3) | Single level | Full | +| **Short content** | Skipped (efficient) | Still processed | N/A | +| **Long content** | Tree structure | Single pass | Context overflow | +| **Research basis** | Letta + Mem0 | Mem0 | None | -- **Semantic chunking:** Split on topic boundaries rather than token counts. -- **Incremental L1 updates:** Only re-summarize changed chunks. -- **Quality scoring:** Evaluate summary quality and trigger re-summarization. -- **User feedback loop:** Learn preferred compression ratios per user. 
+The adaptive approach's key advantage is matching effort to content: short content stays untouched, medium content gets lightweight summarization, and long content gets full hierarchical treatment. From 062436f3717070b7ab82a2707c999cf233c5a08e Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 23:39:12 -0800 Subject: [PATCH 20/37] docs: correct Mem0 attribution in summarizer documentation After verifying claims against actual Letta and Mem0 codebases: Letta (verified): - Partial eviction (30%) - `partial_evict_summarizer_percentage` - Middle truncation - `middle_truncate_text()` function - Fire-and-forget - `fire_and_forget()` method - arXiv:2310.08560 Mem0 (corrected): - Two-phase architecture (verified) - fact extraction then memory ops - Removed "90%+ compression" claim - refers to token savings vs full context, not summarization compression ratios - Removed "rolling summaries" attribution - not a Mem0 term - arXiv:2504.19413 Also removes incorrect "based on Mem0 research" from code docstrings where compression ratios were empirically chosen, not research-derived. --- agent_cli/summarizer/_utils.py | 17 +-------- agent_cli/summarizer/adaptive.py | 11 ++---- agent_cli/summarizer/models.py | 6 +--- docs/architecture/summarizer.md | 59 +++++++++----------------------- 4 files changed, 21 insertions(+), 72 deletions(-) diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py index 2c37159fc..1c447f321 100644 --- a/agent_cli/summarizer/_utils.py +++ b/agent_cli/summarizer/_utils.py @@ -223,22 +223,7 @@ def middle_truncate( def estimate_summary_tokens(input_tokens: int, level: int) -> int: - """Estimate target summary tokens based on input size and level. 
- - Compression ratios based on Mem0 research: - - BRIEF: ~20% compression (80% reduction) - - STANDARD: ~12% compression (88% reduction) - - DETAILED: ~7% compression (93% reduction) - - HIERARCHICAL: Capped with diminishing returns - - Args: - input_tokens: Number of tokens in the input. - level: Summary level (1-4). - - Returns: - Target number of tokens for the summary. - - """ + """Estimate target summary tokens based on input size and level.""" if level == SummaryLevel.NONE: return 0 if level == SummaryLevel.BRIEF: diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 9d17c8d7e..4a84ecff9 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -1,8 +1,7 @@ """Adaptive summarization that scales with input complexity. -This module implements research-grounded summarization inspired by: -- Letta: Partial eviction (30%), middle truncation, fire-and-forget background processing -- Mem0: Rolling summaries, 90%+ compression, two-phase architecture +Implements hierarchical summarization inspired by Letta's partial eviction approach +and Mem0's two-phase architecture (extraction then storage). 
Reference: arXiv:2504.19413 (Mem0), arXiv:2310.08560 (MemGPT/Letta) """ @@ -11,7 +10,6 @@ import asyncio import logging -import re from dataclasses import dataclass from pydantic import BaseModel @@ -413,10 +411,7 @@ async def _generate_summary( try: result = await agent.run(prompt) - text = result.output.summary.strip() - # Strip special tokens that some models leak (e.g., <|constrain|>, <|end|>) - text = re.sub(r"<\|[^|]+\|>", "", text) - return text.strip() + return result.output.summary.strip() except Exception as e: msg = f"Summarization failed: {e}" raise SummarizationError(msg) from e diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py index ce6da9082..36407e459 100644 --- a/agent_cli/summarizer/models.py +++ b/agent_cli/summarizer/models.py @@ -15,11 +15,7 @@ class SummaryLevel(IntEnum): - """Summary granularity levels based on input complexity. - - Thresholds are based on Mem0 research showing optimal compression ratios - at different content lengths. Token counts are approximate guidelines. - """ + """Summary granularity levels based on input complexity.""" NONE = 0 """< 100 tokens: No summary needed, facts only.""" diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md index d69b3b111..99318db0f 100644 --- a/docs/architecture/summarizer.md +++ b/docs/architecture/summarizer.md @@ -20,7 +20,7 @@ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy **Design Goals:** - **Adaptive compression:** Match summarization depth to content complexity. -- **Research-grounded:** Based on proven approaches from Letta and Mem0. +- **Research-informed:** Draws techniques from Letta's memory management. - **Hierarchical structure:** Preserve detail at multiple granularities for large content. - **Content-type awareness:** Domain-specific prompts for conversations, journals, documents. @@ -28,31 +28,19 @@ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy ## 2. 
Research Foundations -The summarization approach draws from two research-backed memory systems: - ### 2.1 Letta (MemGPT) Contributions **Reference:** arXiv:2310.08560 -Letta's approach to memory management introduced several techniques adopted here: - -- **Partial eviction:** Rather than discarding old content entirely, compress it to summaries while keeping recent content detailed. This maps to our hierarchical L1/L2/L3 structure where L1 preserves chunk-level detail and L3 provides high-level synthesis. - -- **Middle truncation:** When content must be reduced, preserve the head (introductions, context-setting) and tail (conclusions, recent events) while removing the middle. Research shows important information clusters at boundaries. - -- **Fire-and-forget background processing:** Summarization runs asynchronously after turn completion, avoiding latency on the critical path. +Letta's approach to memory management introduced the **partial eviction** technique adopted here: rather than discarding old content entirely, compress a portion to summaries while keeping recent content detailed. This maps to our hierarchical L1/L2/L3 structure where L1 preserves chunk-level detail and L3 provides high-level synthesis. ### 2.2 Mem0 Contributions **Reference:** arXiv:2504.19413 -Mem0's memory layer research established compression ratio targets: - -- **90%+ compression:** Long-running conversations can achieve 10:1 or better compression while retaining semantic meaning. Our hierarchical approach targets similar ratios for very long content. - -- **Rolling summaries:** New information integrates with existing summaries rather than replacing them. The `prior_summary` parameter throughout our pipeline implements this pattern. +Mem0's memory layer research informed our storage architecture: -- **Two-phase architecture:** Separate extraction (what's important) from storage (how to persist it). 
We apply this by first generating summaries, then persisting to both files and vector DB. +- **Two-phase architecture:** Separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to both files and vector DB. --- @@ -65,18 +53,17 @@ Mem0's memory layer research established compression ratio targets: **Rationale:** - **Predictable behavior:** Users can anticipate output length based on input size. -- **Optimal compression:** Each level targets a specific compression ratio validated by research. - **Efficiency:** Avoid over-processing short content or under-processing long content. **Thresholds:** -| Level | Token Range | Target Compression | Strategy | -| :--- | :--- | :--- | :--- | -| NONE | < 100 | N/A | No summarization needed | -| BRIEF | 100-500 | ~20% | Single sentence | -| STANDARD | 500-3000 | ~12% | Paragraph | -| DETAILED | 3000-15000 | ~7% | Chunked + meta-synthesis | -| HIERARCHICAL | > 15000 | ~3-5% | L1/L2/L3 tree | +| Level | Token Range | Strategy | +| :--- | :--- | :--- | +| NONE | < 100 | No summarization needed | +| BRIEF | 100-500 | Single sentence | +| STANDARD | 500-3000 | Paragraph | +| DETAILED | 3000-15000 | Chunked + meta-synthesis | +| HIERARCHICAL | > 15000 | L1/L2/L3 tree | **Trade-off:** Fixed thresholds may not be optimal for all content types, but provide consistent, predictable behavior. Content-type prompts provide domain adaptation within each level. @@ -88,13 +75,13 @@ Mem0's memory layer research established compression ratio targets: - **Partial eviction:** Inspired by Letta—keep detailed summaries for granular retrieval, compressed summaries for context injection. - **Flexible retrieval:** Different use cases need different detail levels. RAG queries might want L1 chunks; prompt injection wants L3. 
-- **Progressive compression:** Each level provides ~5x compression over the previous, achieving high overall compression while preserving structure. +- **Progressive compression:** Each level compresses the previous, achieving high overall compression while preserving structure. **Structure:** - **L1 (Chunk Summaries):** Individual summaries of ~3000 token chunks. Preserves local context and specific details. Chunks overlap by ~200 tokens to maintain continuity across boundaries. - **L2 (Group Summaries):** Summaries of groups of ~5 L1 summaries. Only generated when content exceeds ~5 chunks. Provides mid-level abstraction. -- **L3 (Final Summary):** Single synthesized summary. Used for prompt injection and as prior context for rolling updates. +- **L3 (Final Summary):** Single synthesized summary. Used for prompt injection and as prior context for incremental updates. **Trade-off:** The three-level hierarchy adds complexity but enables efficient retrieval at multiple granularities. For content under 15000 tokens, we skip L2 entirely (DETAILED level uses only L1 + L3). @@ -126,7 +113,7 @@ Mem0's memory layer research established compression ratio targets: A generic summarization prompt loses domain-specific signal. By tailoring prompts, we extract what matters for each use case. -### 3.5 Prior Summary Integration (Rolling Updates) +### 3.5 Prior Summary Integration **Decision:** Always provide the previous summary as context when generating updates. @@ -136,7 +123,7 @@ A generic summarization prompt loses domain-specific signal. By tailoring prompt - **Incremental updates:** Avoid re-summarizing all historical content on every update. - **Information preservation:** Important information from earlier content persists through the chain of summaries. -This implements Mem0's "rolling summary" pattern. The L3 summary from the previous run becomes prior context for the next summarization, allowing information to flow forward through time. 
+The L3 summary from the previous run becomes prior context for the next summarization, allowing information to flow forward through time. ### 3.6 Compression Ratio Tracking @@ -220,7 +207,7 @@ Summaries are persisted in two places: | `chunk_overlap` | 200 | Overlap between consecutive chunks | | `max_concurrent_chunks` | 5 | Parallel LLM calls for chunk summarization | -Level thresholds are constants (100, 500, 3000, 15000 tokens) chosen based on empirical testing and Mem0 research on optimal compression ratios. +Level thresholds are constants (100, 500, 3000, 15000 tokens) chosen based on empirical testing. --- @@ -233,17 +220,3 @@ Summarization follows a fail-fast philosophy: - **Encoding errors:** Falls back to character-based token estimation. The caller (memory system) decides how to handle failures—typically by proceeding without a summary rather than blocking the entire write path. - ---- - -## 8. Comparison with Alternatives - -| Aspect | Adaptive Summarizer | Fixed Rolling Summary | No Summarization | -| :--- | :--- | :--- | :--- | -| **Compression** | 3-20% (scales with input) | ~15% fixed | 0% | -| **Detail preservation** | Hierarchical (L1/L2/L3) | Single level | Full | -| **Short content** | Skipped (efficient) | Still processed | N/A | -| **Long content** | Tree structure | Single pass | Context overflow | -| **Research basis** | Letta + Mem0 | Mem0 | None | - -The adaptive approach's key advantage is matching effort to content: short content stays untouched, medium content gets lightweight summarization, and long content gets full hierarchical treatment. From 584631f70df03731f0706edcde679705f2f3e3aa Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 23:45:11 -0800 Subject: [PATCH 21/37] fix(memory): summarize raw conversation turns, not extracted facts Previously, the summarizer was summarizing the already-compressed extracted facts, which is redundant. 
Now it summarizes the actual user/assistant messages, which is what makes sense for a conversation summary. --- agent_cli/memory/_ingest.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py index b0b472b71..f2ce90116 100644 --- a/agent_cli/memory/_ingest.py +++ b/agent_cli/memory/_ingest.py @@ -414,13 +414,20 @@ async def extract_and_store_facts_and_summaries( entries=list(to_add), ) - if enable_summarization and facts: - # Get prior summary for context continuity + # Summarize raw conversation turns (not extracted facts) + has_content = user_message or assistant_message + if enable_summarization and has_content: prior_summary_entry = get_final_summary(collection, conversation_id) prior_summary = prior_summary_entry.content if prior_summary_entry else None - # Summarize the new facts - content_to_summarize = "\n".join(facts) + # Build conversation transcript + parts = [] + if user_message: + parts.append(f"User: {user_message}") + if assistant_message: + parts.append(f"Assistant: {assistant_message}") + content_to_summarize = "\n".join(parts) + summary_start = perf_counter() summary_result = await summarize_content( content=content_to_summarize, From bec0384db008a11832ac99940fbe6f34cf24d029 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 00:02:23 -0800 Subject: [PATCH 22/37] docs: clarify research foundations vs original design in summarizer - Document what's actually borrowed from research: - Two-phase architecture from Mem0 (arXiv:2504.19413) - Hierarchical merging concept from BOOOOKSCORE (arXiv:2310.00785) - Clarify what Letta does differently (message count, not tokens) - Acknowledge original/heuristic design choices: - Token thresholds (100/500/3000/15000) are not research-backed - L1/L2/L3 hierarchy structure is original - Chunk size (3000) is larger than BOOOOKSCORE's 2048 - Add future improvements section based on research findings --- 
agent_cli/summarizer/adaptive.py | 14 +++++-- docs/architecture/summarizer.md | 66 ++++++++++++++++++++++++-------- 2 files changed, 62 insertions(+), 18 deletions(-) diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 4a84ecff9..9536c70eb 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -1,9 +1,17 @@ """Adaptive summarization that scales with input complexity. -Implements hierarchical summarization inspired by Letta's partial eviction approach -and Mem0's two-phase architecture (extraction then storage). +Implements hierarchical summarization with multiple compression levels (L1/L2/L3). -Reference: arXiv:2504.19413 (Mem0), arXiv:2310.08560 (MemGPT/Letta) +Research foundations: +- Two-phase architecture (extraction then storage) from Mem0 (arXiv:2504.19413) +- Hierarchical merging concept from BOOOOKSCORE (arXiv:2310.00785) + +Original design (not research-backed): +- Token thresholds (100/500/3000/15000) are heuristic +- L1/L2/L3 hierarchy structure +- Chunk size (3000) - BOOOOKSCORE uses 2048 + +See docs/architecture/summarizer.md for detailed design rationale. """ from __future__ import annotations diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md index 99318db0f..f08ea1a44 100644 --- a/docs/architecture/summarizer.md +++ b/docs/architecture/summarizer.md @@ -20,7 +20,6 @@ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy **Design Goals:** - **Adaptive compression:** Match summarization depth to content complexity. -- **Research-informed:** Draws techniques from Letta's memory management. - **Hierarchical structure:** Preserve detail at multiple granularities for large content. - **Content-type awareness:** Domain-specific prompts for conversations, journals, documents. @@ -28,19 +27,45 @@ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy ## 2. 
Research Foundations -### 2.1 Letta (MemGPT) Contributions +This section documents what techniques are borrowed from research vs. what is original design. + +### 2.1 Borrowed: Two-Phase Architecture (Mem0) + +**Reference:** arXiv:2504.19413 + +Mem0's memory layer research informed our storage architecture with a **two-phase approach**: separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to both files and vector DB. + +### 2.2 Borrowed: Hierarchical Merging Concept (BOOOOKSCORE) + +**Reference:** arXiv:2310.00785 (ICLR 2024) + +BOOOOKSCORE's research on book-length summarization demonstrated two approaches: +- **Hierarchical merging:** Summarize chunks, then merge chunk summaries +- **Incremental updating:** Maintain a running summary updated with each chunk + +Key finding: For smaller context models (like local LLMs), hierarchical merging produces more coherent summaries. This informed our L1/L2/L3 structure. + +BOOOOKSCORE's defaults: chunk size of **2048 tokens**, max summary length of **900 tokens**. + +### 2.3 Not Directly Borrowed: Letta's Approach **Reference:** arXiv:2310.08560 -Letta's approach to memory management introduced the **partial eviction** technique adopted here: rather than discarding old content entirely, compress a portion to summaries while keeping recent content detailed. This maps to our hierarchical L1/L2/L3 structure where L1 preserves chunk-level detail and L3 provides high-level synthesis. 
+Letta (MemGPT) uses a different paradigm focused on **context window management**: +- Message count thresholds (e.g., 10 messages), not token thresholds +- 30% partial eviction when buffer overflows +- Purpose: fit conversation in LLM context window -### 2.2 Mem0 Contributions +Our system has a different purpose (memory compression for storage/retrieval), so while we were inspired by Letta's "partial eviction" concept, our implementation differs significantly. -**Reference:** arXiv:2504.19413 +### 2.4 Original Design (Not Research-Backed) -Mem0's memory layer research informed our storage architecture: +The following aspects are **original design choices without direct research justification**: -- **Two-phase architecture:** Separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to both files and vector DB. +- **Token thresholds (100/500/3000/15000):** These numbers were chosen heuristically, not derived from research. They may benefit from tuning. +- **L1/L2/L3 hierarchy structure:** The three-level design is original. The naming was loosely inspired by aijournal's L1-L4 "context pack" levels, but those serve a different purpose (what to include in LLM context, not summarization levels). +- **Chunk size (3000 tokens):** This is larger than BOOOOKSCORE's research-backed 2048 tokens. Consider reducing. +- **L2 group size (5 chunks):** Chosen heuristically. --- @@ -65,7 +90,7 @@ Mem0's memory layer research informed our storage architecture: | DETAILED | 3000-15000 | Chunked + meta-synthesis | | HIERARCHICAL | > 15000 | L1/L2/L3 tree | -**Trade-off:** Fixed thresholds may not be optimal for all content types, but provide consistent, predictable behavior. Content-type prompts provide domain adaptation within each level. +**Caveat:** These thresholds are heuristic, not research-backed. They should be validated empirically. 
### 3.2 Hierarchical Summary Structure (L1/L2/L3) @@ -73,7 +98,7 @@ Mem0's memory layer research informed our storage architecture: **Rationale:** -- **Partial eviction:** Inspired by Letta—keep detailed summaries for granular retrieval, compressed summaries for context injection. +- **Hierarchical merging:** Research (BOOOOKSCORE) shows this approach works well for smaller context models. - **Flexible retrieval:** Different use cases need different detail levels. RAG queries might want L1 chunks; prompt injection wants L3. - **Progressive compression:** Each level compresses the previous, achieving high overall compression while preserving structure. @@ -177,7 +202,7 @@ The parallelism at L1 and L2 levels provides significant speedup for long conten The memory system triggers summarization during post-processing: -1. Collect content to summarize (extracted facts, conversation turns) +1. Collect raw conversation turns (user message + assistant message) 2. Retrieve existing L3 summary as prior context 3. Call summarizer with content + prior summary + content type 4. Persist results: delete old summaries, write new files, upsert to ChromaDB @@ -201,13 +226,13 @@ Summaries are persisted in two places: ## 6. Configuration -| Parameter | Default | Description | +| Parameter | Default | Research Comparison | | :--- | :--- | :--- | -| `chunk_size` | 3000 | Target tokens per chunk | -| `chunk_overlap` | 200 | Overlap between consecutive chunks | -| `max_concurrent_chunks` | 5 | Parallel LLM calls for chunk summarization | +| `chunk_size` | 3000 | BOOOOKSCORE uses 2048 | +| `chunk_overlap` | 200 | No direct comparison | +| `max_concurrent_chunks` | 5 | Implementation choice | -Level thresholds are constants (100, 500, 3000, 15000 tokens) chosen based on empirical testing. +Level thresholds (100, 500, 3000, 15000 tokens) are heuristic and not derived from published research. 
--- @@ -220,3 +245,14 @@ Summarization follows a fail-fast philosophy: - **Encoding errors:** Falls back to character-based token estimation. The caller (memory system) decides how to handle failures—typically by proceeding without a summary rather than blocking the entire write path. + +--- + +## 8. Future Improvements + +Based on research findings, consider: + +1. **Reduce chunk size to 2048** to align with BOOOOKSCORE's tested defaults +2. **Validate token thresholds empirically** with real-world content +3. **Add incremental updating mode** as alternative to hierarchical merging for larger context models +4. **Benchmark against BOOOOKSCORE metrics** for coherence evaluation From 2a88706db1c45a4239450a4e74f06fb9efbe5f1a Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 08:59:28 -0800 Subject: [PATCH 23/37] refactor(summarizer): simplify to NONE/BRIEF/MAP_REDUCE levels Remove old hierarchical summarization (STANDARD, DETAILED, HIERARCHICAL) in favor of a simpler 3-level system inspired by LangChain's map-reduce: - NONE: Skip summarization for very short content (<100 tokens) - BRIEF: Single-pass summary for short content (100-500 tokens) - MAP_REDUCE: LangChain-style map-reduce for longer content (500+ tokens) Key changes: - Add map_reduce.py with dynamic collapse algorithm - Remove HierarchicalSummary and ChunkSummary classes - Rename summary_level_name to summary_level in metadata - Add collapse_depth field to track map-reduce iterations - Use research-backed defaults (chunk_size=2048, token_max=3000) - Update all tests for simplified API - No backward compatibility - clean break from old implementation --- agent_cli/agents/summarize.py | 44 +-- agent_cli/memory/_files.py | 2 +- agent_cli/memory/_ingest.py | 18 +- agent_cli/memory/_persistence.py | 20 +- agent_cli/memory/models.py | 22 +- agent_cli/summarizer/__init__.py | 14 +- agent_cli/summarizer/_prompts.py | 8 +- agent_cli/summarizer/_utils.py | 11 +- agent_cli/summarizer/adaptive.py | 320 
+++++------------- agent_cli/summarizer/map_reduce.py | 349 +++++++++++++++++++ agent_cli/summarizer/models.py | 202 ++--------- docs/architecture/summarizer.md | 198 ++++++----- examples/summarizer_demo.py | 91 ++--- tests/memory/test_engine.py | 8 +- tests/memory/test_git_integration.py | 3 +- tests/memory/test_store.py | 135 ++------ tests/summarizer/test_adaptive.py | 147 ++++---- tests/summarizer/test_integration.py | 481 +++------------------------ tests/summarizer/test_models.py | 224 ++----------- tests/summarizer/test_utils.py | 36 +- 20 files changed, 880 insertions(+), 1453 deletions(-) create mode 100644 agent_cli/summarizer/map_reduce.py diff --git a/agent_cli/agents/summarize.py b/agent_cli/agents/summarize.py index abc8dfc72..ec516310e 100644 --- a/agent_cli/agents/summarize.py +++ b/agent_cli/agents/summarize.py @@ -1,4 +1,4 @@ -"""Summarize text files or stdin using adaptive hierarchical summarization.""" +"""Summarize text files or stdin using adaptive map-reduce summarization.""" from __future__ import annotations @@ -131,7 +131,7 @@ def _display_full_result( *, quiet: bool, ) -> None: - """Display full hierarchical result with all levels.""" + """Display full result with all metadata.""" if quiet: if result.summary: print(result.summary) @@ -143,34 +143,12 @@ def _display_full_result( console.print(f" Input tokens: [bold]{result.input_tokens:,}[/bold]") console.print(f" Output tokens: [bold]{result.output_tokens:,}[/bold]") console.print(f" Compression: [bold]{result.compression_ratio:.1%}[/bold]") + if result.collapse_depth > 0: + console.print(f" Collapse depth: [bold]{result.collapse_depth}[/bold]") console.print(f" Time: [bold]{elapsed:.2f}s[/bold]") console.print() - if result.hierarchical: - if result.hierarchical.l1_summaries: - console.print( - f"[bold yellow]L1 Chunk Summaries " - f"({len(result.hierarchical.l1_summaries)} chunks)[/bold yellow]", - ) - for cs in result.hierarchical.l1_summaries: - console.print( - f"\n[dim]--- Chunk 
{cs.chunk_index + 1} " - f"({cs.source_tokens:,} → {cs.token_count:,} tokens) ---[/dim]", - ) - console.print(cs.content) - - if result.hierarchical.l2_summaries: - console.print( - f"\n[bold yellow]L2 Group Summaries " - f"({len(result.hierarchical.l2_summaries)} groups)[/bold yellow]", - ) - for idx, l2_summary in enumerate(result.hierarchical.l2_summaries): - console.print(f"\n[dim]--- Group {idx + 1} ---[/dim]") - console.print(l2_summary) - - console.print("\n[bold green]L3 Final Summary[/bold green]") - print_output_panel(result.hierarchical.l3_summary, title="Final Summary") - elif result.summary: + if result.summary: print_output_panel( result.summary, title=f"Summary ({result.level.name})", @@ -296,9 +274,9 @@ def summarize_command( ), # --- Chunking Options --- chunk_size: int = typer.Option( - 3000, + 2048, "--chunk-size", - help="Target token count per chunk for hierarchical summarization.", + help="Target token count per chunk for map-reduce summarization.", rich_help_panel="Chunking Options", ), chunk_overlap: int = typer.Option( @@ -341,15 +319,13 @@ def summarize_command( config_file: str | None = opts.CONFIG_FILE, print_args: bool = opts.PRINT_ARGS, ) -> None: - """Summarize text using adaptive hierarchical summarization. + """Summarize text using adaptive map-reduce summarization. 
Reads from a file or stdin and produces a summary scaled to the input complexity: - NONE (<100 tokens): No summary needed - BRIEF (100-500): Single sentence - - STANDARD (500-3000): Paragraph - - DETAILED (3000-15000): Chunked with meta-summary - - HIERARCHICAL (>15000): Full L1/L2/L3 tree + - MAP_REDUCE (>500): Dynamic collapse until fits token budget Examples: # Summarize a file @@ -361,7 +337,7 @@ def summarize_command( # Pipe content from stdin cat book.txt | agent-cli summarize - # Get full hierarchical output + # Get full output with all metadata agent-cli summarize large_document.txt --output full # Use OpenAI instead of Ollama diff --git a/agent_cli/memory/_files.py b/agent_cli/memory/_files.py index 0bb0a5d94..50b7400cf 100644 --- a/agent_cli/memory/_files.py +++ b/agent_cli/memory/_files.py @@ -23,7 +23,7 @@ _SNAPSHOT_FILENAME = "memory_index.json" _DELETED_DIRNAME = "deleted" -# Summary level constants for hierarchical file structure +# Summary level constants for file structure (kept for backward compatibility) _SUMMARY_LEVEL_L1 = 1 _SUMMARY_LEVEL_L2 = 2 _SUMMARY_LEVEL_L3 = 3 diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py index f2ce90116..e50e2ac45 100644 --- a/agent_cli/memory/_ingest.py +++ b/agent_cli/memory/_ingest.py @@ -13,7 +13,7 @@ from agent_cli.memory._persistence import ( delete_memory_files, persist_entries, - persist_hierarchical_summary, + persist_summary, ) from agent_cli.memory._prompt import ( FACT_INSTRUCTIONS, @@ -290,7 +290,7 @@ async def summarize_content( """Adaptively summarize content based on its length. Automatically selects the appropriate summarization strategy - (NONE, BRIEF, STANDARD, DETAILED, HIERARCHICAL) based on input token count. + (NONE, BRIEF, MAP_REDUCE) based on input token count. Args: content: The content to summarize. 
@@ -326,27 +326,21 @@ async def store_adaptive_summary( conversation_id: str, summary_result: SummaryResult, ) -> list[str]: - """Store an adaptive summary result to files and ChromaDB. + """Store a summary result to files and ChromaDB. - This stores all levels of a hierarchical summary (L1, L2, L3) or - just the final summary for simpler levels. Old summaries are deleted first. - - Files are stored as Markdown with YAML front matter in a hierarchical structure: - - summaries/L1/chunk_{n}.md - L1 chunk summaries - - summaries/L2/group_{n}.md - L2 group summaries - - summaries/L3/final.md - L3 final summary + Old summaries are deleted first, then the new summary is stored. Args: collection: ChromaDB collection. memory_root: Root path for memory files. conversation_id: The conversation this summary belongs to. - summary_result: The result from AdaptiveSummarizer.summarize(). + summary_result: The result from summarize(). Returns: List of IDs that were stored. """ - return persist_hierarchical_summary( + return persist_summary( collection, memory_root=memory_root, conversation_id=conversation_id, diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py index 1bb2102d4..a7e3871e2 100644 --- a/agent_cli/memory/_persistence.py +++ b/agent_cli/memory/_persistence.py @@ -167,19 +167,19 @@ def evict_if_needed( delete_memory_files(memory_root, conversation_id, ids_to_remove) -def persist_hierarchical_summary( +def persist_summary( collection: Collection, *, memory_root: Path, conversation_id: str, summary_result: SummaryResult, ) -> list[str]: - """Persist a hierarchical summary to disk and ChromaDB. + """Persist a summary to disk and ChromaDB. This function: 1. Deletes existing summaries (files and ChromaDB entries) - 2. Writes new summary files to disk in hierarchical structure - 3. Stores entries in ChromaDB + 2. Writes new summary file to disk + 3. Stores entry in ChromaDB Args: collection: ChromaDB collection. 
@@ -219,14 +219,12 @@ def persist_hierarchical_summary( role=meta_dict["role"], created_at=meta_dict.get("created_at", created_at), summary_kind="summary", - level=meta_dict.get("level"), is_final=meta_dict.get("is_final"), - chunk_index=meta_dict.get("chunk_index"), - group_index=meta_dict.get("group_index"), input_tokens=meta_dict.get("input_tokens"), output_tokens=meta_dict.get("output_tokens"), compression_ratio=meta_dict.get("compression_ratio"), - summary_level_name=meta_dict.get("summary_level_name"), + summary_level=meta_dict.get("summary_level"), + collapse_depth=meta_dict.get("collapse_depth"), ) record = write_memory_file( memory_root, @@ -234,7 +232,11 @@ def persist_hierarchical_summary( doc_id=entry["id"], metadata=metadata, ) - LOGGER.info("Persisted summary file: %s (level=%s)", record.path, meta_dict.get("level")) + LOGGER.info( + "Persisted summary file: %s (level=%s)", + record.path, + meta_dict.get("summary_level"), + ) stored_ids.append(record.id) # Store in ChromaDB (reuse the entries we already built) diff --git a/agent_cli/memory/models.py b/agent_cli/memory/models.py index 06266c575..5b8df3855 100644 --- a/agent_cli/memory/models.py +++ b/agent_cli/memory/models.py @@ -49,23 +49,25 @@ class MemoryMetadata(BaseModel): replaced_by: str | None = None source_id: str | None = None - # Hierarchical summary fields (only used when role="summary") + # Summary fields (only used when role="summary") level: int | None = None - """Summary level: 1=chunk, 2=group, 3=final.""" + """Summary level (deprecated, kept for file structure compatibility).""" is_final: bool | None = None - """Whether this is the final L3 summary.""" + """Whether this is the final summary.""" chunk_index: int | None = None - """For L1 summaries: index of the source chunk.""" + """Deprecated: index of the source chunk.""" group_index: int | None = None - """For L2 summaries: index of this group.""" + """Deprecated: index of this group.""" input_tokens: int | None = None - """Number 
of tokens in the original input (L3 only).""" + """Number of tokens in the original input.""" output_tokens: int | None = None - """Number of tokens in the summary output (L3 only).""" + """Number of tokens in the summary output.""" compression_ratio: float | None = None - """Ratio of output to input tokens (L3 only).""" - summary_level_name: str | None = None - """Name of the SummaryLevel enum used (e.g., 'STANDARD', 'HIERARCHICAL').""" + """Ratio of output to input tokens.""" + summary_level: str | None = None + """Name of the SummaryLevel enum used (NONE, BRIEF, or MAP_REDUCE).""" + collapse_depth: int | None = None + """Number of collapse iterations in map-reduce (0 = no collapse needed).""" class StoredMemory(BaseModel): diff --git a/agent_cli/summarizer/__init__.py b/agent_cli/summarizer/__init__.py index fc0994c4c..af977ada1 100644 --- a/agent_cli/summarizer/__init__.py +++ b/agent_cli/summarizer/__init__.py @@ -1,8 +1,13 @@ """Adaptive summarization module for variable-length content. -This module provides research-grounded summarization that scales with input complexity, -inspired by Letta (partial eviction, middle truncation) and Mem0 (rolling summaries, -compression ratios) architectures. +This module provides map-reduce summarization inspired by LangChain's approach: +1. Split content into chunks and summarize each in parallel (map phase) +2. 
Recursively collapse summaries until they fit token_max (reduce phase) + +Research foundations: +- LangChain ReduceDocumentsChain: token_max=3000, recursive collapse +- BOOOOKSCORE (arXiv:2310.00785): chunk_size=2048 optimal +- Two-phase architecture concept from Mem0 (arXiv:2504.19413) Example: from agent_cli.summarizer import summarize, SummarizerConfig @@ -17,10 +22,9 @@ """ from agent_cli.summarizer.adaptive import SummarizationError, SummarizerConfig, summarize -from agent_cli.summarizer.models import HierarchicalSummary, SummaryLevel, SummaryResult +from agent_cli.summarizer.models import SummaryLevel, SummaryResult __all__ = [ - "HierarchicalSummary", "SummarizationError", "SummarizerConfig", "SummaryLevel", diff --git a/agent_cli/summarizer/_prompts.py b/agent_cli/summarizer/_prompts.py index f46b39ebf..1de5fa44f 100644 --- a/agent_cli/summarizer/_prompts.py +++ b/agent_cli/summarizer/_prompts.py @@ -4,7 +4,7 @@ and are optimized for structured, factual output. """ -# Level 1: BRIEF - Single sentence summary +# BRIEF level - Single sentence summary for short content (100-500 tokens) BRIEF_SUMMARY_PROMPT = """Summarize the following in ONE sentence (maximum 20 words). Focus on the single most important point or takeaway. @@ -13,7 +13,7 @@ One-sentence summary:""".strip() -# Level 2: STANDARD - Paragraph summary +# MAP_REDUCE level - Paragraph summary for content-type aware summarization STANDARD_SUMMARY_PROMPT = """Summarize the following content concisely in a short paragraph. Focus on: @@ -28,7 +28,7 @@ Summary (maximum {max_words} words):""".strip() -# Level 3: DETAILED - Used for individual chunks in hierarchical summarization +# CHUNK - Used in map phase of map-reduce summarization CHUNK_SUMMARY_PROMPT = """Summarize this section of a longer document. Capture the main points while preserving important details. 
@@ -37,7 +37,7 @@ Summary of this section (maximum {max_words} words):""".strip() -# Level 4: META - Combine multiple summaries into one +# META - Combine multiple summaries in reduce phase META_SUMMARY_PROMPT = """Synthesize these summaries into a single coherent overview. Identify common themes and key points across all sections. Eliminate redundancy while preserving unique insights. diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py index 1c447f321..8dbfb1ffd 100644 --- a/agent_cli/summarizer/_utils.py +++ b/agent_cli/summarizer/_utils.py @@ -228,15 +228,8 @@ def estimate_summary_tokens(input_tokens: int, level: int) -> int: return 0 if level == SummaryLevel.BRIEF: return min(50, max(20, input_tokens // 5)) - if level == SummaryLevel.STANDARD: - return min(200, max(50, input_tokens // 8)) - if level == SummaryLevel.DETAILED: - return min(500, max(100, input_tokens // 15)) - # HIERARCHICAL - # Base of 1000 tokens plus diminishing returns for additional content - base = 1000 - additional = max(0, (input_tokens - 15000) // 100) - return min(2000, base + additional) + # MAP_REDUCE: ~10% compression with floor/ceiling + return min(500, max(50, input_tokens // 10)) def tokens_to_words(tokens: int) -> int: diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 9536c70eb..39669e97d 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -1,22 +1,23 @@ -"""Adaptive summarization that scales with input complexity. +"""Adaptive summarization using map-reduce with dynamic collapse. -Implements hierarchical summarization with multiple compression levels (L1/L2/L3). +Implements a simple algorithm inspired by LangChain's map-reduce chains: +1. If content is short enough, summarize directly +2. Otherwise, split into chunks and summarize each (map phase) +3. 
Recursively collapse summaries until they fit token_max (reduce phase) Research foundations: -- Two-phase architecture (extraction then storage) from Mem0 (arXiv:2504.19413) -- Hierarchical merging concept from BOOOOKSCORE (arXiv:2310.00785) +- LangChain ReduceDocumentsChain: token_max=3000, recursive collapse +- BOOOOKSCORE (arXiv:2310.00785): chunk_size=2048 optimal +- Two-phase architecture concept from Mem0 (arXiv:2504.19413) -Original design (not research-backed): -- Token thresholds (100/500/3000/15000) are heuristic -- L1/L2/L3 hierarchy structure -- Chunk size (3000) - BOOOOKSCORE uses 2048 +Key insight: No need for predetermined L1/L2/L3 levels. +Dynamic collapse depth based on actual content length. See docs/architecture/summarizer.md for detailed design rationale. """ from __future__ import annotations -import asyncio import logging from dataclasses import dataclass @@ -24,21 +25,20 @@ from agent_cli.summarizer._prompts import ( BRIEF_SUMMARY_PROMPT, - CHUNK_SUMMARY_PROMPT, - META_SUMMARY_PROMPT, format_prior_context, - format_summaries_for_meta, get_prompt_for_content_type, ) from agent_cli.summarizer._utils import ( - chunk_text, count_tokens, estimate_summary_tokens, tokens_to_words, ) +from agent_cli.summarizer.map_reduce import ( + MapReduceConfig, + MapReduceSummarizationError, + map_reduce_summarize, +) from agent_cli.summarizer.models import ( - ChunkSummary, - HierarchicalSummary, SummaryLevel, SummaryResult, ) @@ -46,18 +46,8 @@ logger = logging.getLogger(__name__) # Thresholds for summary levels (in tokens) -LEVEL_THRESHOLDS = { - SummaryLevel.NONE: 100, - SummaryLevel.BRIEF: 500, - SummaryLevel.STANDARD: 3000, - SummaryLevel.DETAILED: 15000, - # HIERARCHICAL is everything above DETAILED -} - -# Number of L1 chunks to group together for L2 summaries -L2_GROUP_SIZE = 5 -# Minimum number of L1 chunks before L2 grouping is applied -L2_MIN_CHUNKS = 5 +THRESHOLD_NONE = 100 # Below this, no summary needed +THRESHOLD_BRIEF = 500 # Below this, just a 
single sentence class SummaryOutput(BaseModel): @@ -88,7 +78,8 @@ class SummarizerConfig: openai_base_url: str model: str api_key: str | None = None - chunk_size: int = 3000 + chunk_size: int = 2048 # BOOOOKSCORE's tested default + token_max: int = 3000 # LangChain's default - when to collapse chunk_overlap: int = 200 max_concurrent_chunks: int = 5 timeout: float = 60.0 @@ -102,15 +93,11 @@ def __post_init__(self) -> None: def determine_level(token_count: int) -> SummaryLevel: """Map token count to appropriate SummaryLevel.""" - if token_count < LEVEL_THRESHOLDS[SummaryLevel.NONE]: + if token_count < THRESHOLD_NONE: return SummaryLevel.NONE - if token_count < LEVEL_THRESHOLDS[SummaryLevel.BRIEF]: + if token_count < THRESHOLD_BRIEF: return SummaryLevel.BRIEF - if token_count < LEVEL_THRESHOLDS[SummaryLevel.STANDARD]: - return SummaryLevel.STANDARD - if token_count < LEVEL_THRESHOLDS[SummaryLevel.DETAILED]: - return SummaryLevel.DETAILED - return SummaryLevel.HIERARCHICAL + return SummaryLevel.MAP_REDUCE async def summarize( @@ -121,6 +108,11 @@ async def summarize( ) -> SummaryResult: """Summarize content with adaptive strategy based on length. + Uses a simple algorithm: + - Very short content (<100 tokens): No summary + - Short content (<500 tokens): Single sentence brief summary + - Everything else: Map-reduce with dynamic collapse + Args: content: The content to summarize. config: Summarizer configuration. 
@@ -135,7 +127,6 @@ async def summarize( return SummaryResult( level=SummaryLevel.NONE, summary=None, - hierarchical=None, input_tokens=0, output_tokens=0, compression_ratio=0.0, @@ -155,7 +146,6 @@ async def summarize( return SummaryResult( level=level, summary=None, - hierarchical=None, input_tokens=input_tokens, output_tokens=0, compression_ratio=0.0, @@ -163,68 +153,22 @@ async def summarize( if level == SummaryLevel.BRIEF: summary = await _brief_summary(content, config) - elif level == SummaryLevel.STANDARD: - summary = await _standard_summary(content, config, prior_summary, content_type) - elif level == SummaryLevel.DETAILED: - return await _detailed_summary(content, input_tokens, config) - else: # HIERARCHICAL - return await _hierarchical_summary(content, input_tokens, config) - - output_tokens = count_tokens(summary, config.model) if summary else 0 - compression_ratio = output_tokens / input_tokens if input_tokens > 0 else 0.0 - - return SummaryResult( - level=level, - summary=summary, - hierarchical=None, - input_tokens=input_tokens, - output_tokens=output_tokens, - compression_ratio=compression_ratio, - ) - - -async def _summarize_chunks( - chunks: list[str], - config: SummarizerConfig, -) -> list[ChunkSummary]: - """Summarize chunks concurrently with semaphore-controlled parallelism.""" - semaphore = asyncio.Semaphore(config.max_concurrent_chunks) - total = len(chunks) - - async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: - async with semaphore: - return await _summarize_single_chunk(chunk, idx, total, config) - - gen = (summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)) - return list(await asyncio.gather(*gen)) - - -async def _summarize_single_chunk( - chunk: str, - chunk_index: int, - total_chunks: int, - config: SummarizerConfig, -) -> ChunkSummary: - """Summarize a single chunk and return its metadata.""" - source_tokens = count_tokens(chunk, config.model) - target_tokens = estimate_summary_tokens(source_tokens, 
SummaryLevel.STANDARD) - max_words = tokens_to_words(target_tokens) - - prompt = CHUNK_SUMMARY_PROMPT.format( - chunk_index=chunk_index + 1, - total_chunks=total_chunks, - content=chunk, - max_words=max_words, - ) - - summary = await _generate_summary(prompt, config, max_tokens=target_tokens + 50) - summary_tokens = count_tokens(summary, config.model) + output_tokens = count_tokens(summary, config.model) if summary else 0 + return SummaryResult( + level=level, + summary=summary, + input_tokens=input_tokens, + output_tokens=output_tokens, + compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + ) - return ChunkSummary( - chunk_index=chunk_index, - content=summary, - token_count=summary_tokens, - source_tokens=source_tokens, + # MAP_REDUCE level + return await _map_reduce_summary( + content, + input_tokens, + config, + prior_summary, + content_type, ) @@ -234,159 +178,77 @@ async def _brief_summary(content: str, config: SummarizerConfig) -> str: return await _generate_summary(prompt, config, max_tokens=50) -async def _standard_summary( +async def _map_reduce_summary( content: str, + input_tokens: int, config: SummarizerConfig, prior_summary: str | None, content_type: str, -) -> str: - """Generate a paragraph summary for standard-length content.""" - input_tokens = count_tokens(content, config.model) - target_tokens = estimate_summary_tokens(input_tokens, SummaryLevel.STANDARD) - max_words = tokens_to_words(target_tokens) - - prompt_template = get_prompt_for_content_type(content_type) - prior_context = format_prior_context(prior_summary) - - prompt = prompt_template.format( - content=content, - prior_context=prior_context, - max_words=max_words, - ) - - return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) - - -async def _detailed_summary( - content: str, - input_tokens: int, - config: SummarizerConfig, ) -> SummaryResult: - """Generate chunked summaries with meta-summary for detailed content.""" - chunks = chunk_text( - 
content, - chunk_size=config.chunk_size, - overlap=config.chunk_overlap, - model=config.model, - ) - - logger.info("Detailed summary: processing %d chunks", len(chunks)) - - chunk_summaries = await _summarize_chunks(chunks, config) - - # Generate meta-summary - all_summaries = [cs.content for cs in chunk_summaries] - meta_target = estimate_summary_tokens(input_tokens, SummaryLevel.DETAILED) - max_words = tokens_to_words(meta_target) - - meta_prompt = META_SUMMARY_PROMPT.format( - summaries=format_summaries_for_meta(all_summaries), - max_words=max_words, - ) - - final_summary = await _generate_summary( - meta_prompt, - config, - max_tokens=meta_target + 100, - ) - output_tokens = count_tokens(final_summary, config.model) + """Use map-reduce with dynamic collapse for longer content.""" + # For content that fits in a single chunk, use content-type aware summary + if input_tokens <= config.token_max: + summary = await _content_aware_summary(content, config, prior_summary, content_type) + output_tokens = count_tokens(summary, config.model) if summary else 0 + return SummaryResult( + level=SummaryLevel.MAP_REDUCE, + summary=summary, + input_tokens=input_tokens, + output_tokens=output_tokens, + compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + collapse_depth=0, + ) - hierarchical = HierarchicalSummary( - l1_summaries=list(chunk_summaries), - l2_summaries=[], # Not used for DETAILED level - l3_summary=final_summary, + # Use map-reduce for multi-chunk content + mr_config = MapReduceConfig( + openai_base_url=config.openai_base_url, + model=config.model, + api_key=config.api_key, chunk_size=config.chunk_size, + token_max=config.token_max, chunk_overlap=config.chunk_overlap, + max_concurrent=config.max_concurrent_chunks, + timeout=config.timeout, ) + try: + result = await map_reduce_summarize(content, mr_config) + except MapReduceSummarizationError as e: + raise SummarizationError(str(e)) from e + return SummaryResult( - 
level=SummaryLevel.DETAILED, - summary=final_summary, - hierarchical=hierarchical, - input_tokens=input_tokens, - output_tokens=output_tokens, - compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + level=SummaryLevel.MAP_REDUCE, + summary=result.summary, + input_tokens=result.input_tokens, + output_tokens=result.output_tokens, + compression_ratio=result.compression_ratio, + collapse_depth=result.collapse_depth, ) -async def _hierarchical_summary( +async def _content_aware_summary( content: str, - input_tokens: int, config: SummarizerConfig, -) -> SummaryResult: - """Build a tree of summaries for very long content. - - Structure: - - L1: Individual chunk summaries - - L2: Group summaries (groups of ~5 L1 summaries) - - L3: Final synthesis - """ - chunks = chunk_text( - content, - chunk_size=config.chunk_size, - overlap=config.chunk_overlap, - model=config.model, + prior_summary: str | None, + content_type: str, +) -> str: + """Generate a content-type aware summary for single-chunk content.""" + target_tokens = estimate_summary_tokens( + count_tokens(content, config.model), + SummaryLevel.MAP_REDUCE, ) + max_words = tokens_to_words(target_tokens) - logger.info("Hierarchical summary: processing %d chunks in tree", len(chunks)) - - # L1: Summarize each chunk - l1_summaries = await _summarize_chunks(chunks, config) - - # L2: Group summaries (if more than L2_MIN_CHUNKS chunks) - l2_summaries: list[str] = [] - if len(l1_summaries) > L2_MIN_CHUNKS: - groups: list[list[str]] = [] - for i in range(0, len(l1_summaries), L2_GROUP_SIZE): - group = [cs.content for cs in l1_summaries[i : i + L2_GROUP_SIZE]] - groups.append(group) - - async def summarize_group(group: list[str]) -> str: - combined_tokens = sum(count_tokens(s, config.model) for s in group) - target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD) - max_words = tokens_to_words(target_tokens) - - prompt = META_SUMMARY_PROMPT.format( - 
summaries=format_summaries_for_meta(group), - max_words=max_words, - ) - return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) - - l2_summaries = await asyncio.gather(*[summarize_group(g) for g in groups]) - - # L3: Final synthesis - summaries_to_synthesize = l2_summaries if l2_summaries else [cs.content for cs in l1_summaries] - final_target = estimate_summary_tokens(input_tokens, SummaryLevel.HIERARCHICAL) - max_words = tokens_to_words(final_target) + prompt_template = get_prompt_for_content_type(content_type) + prior_context = format_prior_context(prior_summary) - final_prompt = META_SUMMARY_PROMPT.format( - summaries=format_summaries_for_meta(summaries_to_synthesize), + prompt = prompt_template.format( + content=content, + prior_context=prior_context, max_words=max_words, ) - final_summary = await _generate_summary( - final_prompt, - config, - max_tokens=final_target + 100, - ) - output_tokens = count_tokens(final_summary, config.model) - - hierarchical = HierarchicalSummary( - l1_summaries=list(l1_summaries), - l2_summaries=list(l2_summaries), - l3_summary=final_summary, - chunk_size=config.chunk_size, - chunk_overlap=config.chunk_overlap, - ) - - return SummaryResult( - level=SummaryLevel.HIERARCHICAL, - summary=final_summary, - hierarchical=hierarchical, - input_tokens=input_tokens, - output_tokens=output_tokens, - compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, - ) + return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) async def _generate_summary( diff --git a/agent_cli/summarizer/map_reduce.py b/agent_cli/summarizer/map_reduce.py new file mode 100644 index 000000000..09d82d09c --- /dev/null +++ b/agent_cli/summarizer/map_reduce.py @@ -0,0 +1,349 @@ +"""Map-reduce summarization inspired by LangChain's approach. + +Simple algorithm: +1. Map: Split content into chunks, summarize each in parallel +2. 
Reduce: If combined summaries exceed token_max, recursively collapse + +Key insight from LangChain: No need for predetermined levels (L1/L2/L3). +Just keep collapsing until content fits. Dynamic depth based on actual content. + +References: +- LangChain ReduceDocumentsChain: token_max=3000, recursive collapse +- BOOOOKSCORE: chunk_size=2048 optimal for summarization + +""" + +from __future__ import annotations + +import asyncio +import logging +from dataclasses import dataclass + +from pydantic import BaseModel + +from agent_cli.summarizer._prompts import ( + CHUNK_SUMMARY_PROMPT, + META_SUMMARY_PROMPT, + format_summaries_for_meta, +) +from agent_cli.summarizer._utils import ( + chunk_text, + count_tokens, + estimate_summary_tokens, + tokens_to_words, +) +from agent_cli.summarizer.models import SummaryLevel + +logger = logging.getLogger(__name__) + + +class SummaryOutput(BaseModel): + """Structured output for summary generation.""" + + summary: str + + +class MapReduceSummarizationError(Exception): + """Raised when map-reduce summarization fails.""" + + +@dataclass +class MapReduceConfig: + """Configuration for map-reduce summarization. + + Attributes: + openai_base_url: Base URL for OpenAI-compatible API. + model: Model name for summarization. + api_key: Optional API key. + chunk_size: Target size for splitting content (tokens). + LangChain uses 3000, BOOOOKSCORE suggests 2048. + token_max: Maximum tokens for combined summaries before collapsing. + When combined summaries exceed this, we recursively reduce. + chunk_overlap: Overlap between chunks for context continuity. + max_concurrent: Maximum parallel summarization calls. + timeout: Timeout for API calls in seconds. + max_collapse_depth: Safety limit on recursive collapse depth. 
+ + """ + + openai_base_url: str + model: str + api_key: str | None = None + chunk_size: int = 2048 # BOOOOKSCORE's tested default + token_max: int = 3000 # LangChain's default + chunk_overlap: int = 200 + max_concurrent: int = 5 + timeout: float = 60.0 + max_collapse_depth: int = 10 # Safety limit + + def __post_init__(self) -> None: + """Normalize the base URL.""" + self.openai_base_url = self.openai_base_url.rstrip("/") + if self.api_key is None: + self.api_key = "not-needed" + + +@dataclass +class MapReduceResult: + """Result of map-reduce summarization. + + Attributes: + summary: The final collapsed summary. + input_tokens: Token count of original content. + output_tokens: Token count of final summary. + compression_ratio: output_tokens / input_tokens. + collapse_depth: How many reduce iterations were needed. + intermediate_summaries: All intermediate summaries (for debugging/storage). + + """ + + summary: str + input_tokens: int + output_tokens: int + compression_ratio: float + collapse_depth: int + intermediate_summaries: list[list[str]] # Each level of collapse + + +async def map_reduce_summarize( + content: str, + config: MapReduceConfig, +) -> MapReduceResult: + """Summarize content using map-reduce with dynamic collapse. + + Algorithm: + 1. If content fits in token_max, summarize directly + 2. Otherwise, split into chunks and summarize each (map phase) + 3. If combined summaries exceed token_max, recursively collapse (reduce phase) + 4. Continue until everything fits in token_max + + Args: + content: The content to summarize. + config: Map-reduce configuration. + + Returns: + MapReduceResult with summary and metadata. 
+ + """ + if not content or not content.strip(): + return MapReduceResult( + summary="", + input_tokens=0, + output_tokens=0, + compression_ratio=0.0, + collapse_depth=0, + intermediate_summaries=[], + ) + + input_tokens = count_tokens(content, config.model) + + # If content already fits, just summarize directly + if input_tokens <= config.token_max: + summary = await _summarize_text(content, config) + output_tokens = count_tokens(summary, config.model) + return MapReduceResult( + summary=summary, + input_tokens=input_tokens, + output_tokens=output_tokens, + compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + collapse_depth=0, + intermediate_summaries=[], + ) + + # Map phase: Split and summarize chunks in parallel + chunks = chunk_text( + content, + chunk_size=config.chunk_size, + overlap=config.chunk_overlap, + model=config.model, + ) + + logger.info("Map phase: processing %d chunks", len(chunks)) + summaries = await _map_summarize(chunks, config) + intermediate_summaries = [summaries.copy()] + + # Reduce phase: Recursively collapse until fits token_max + depth = 0 + while _total_tokens(summaries, config.model) > config.token_max: + depth += 1 + if depth > config.max_collapse_depth: + logger.warning( + "Hit max collapse depth %d, forcing final summary", + config.max_collapse_depth, + ) + break + + logger.info( + "Reduce phase (depth %d): collapsing %d summaries (%d tokens)", + depth, + len(summaries), + _total_tokens(summaries, config.model), + ) + summaries = await _collapse_summaries(summaries, config) + intermediate_summaries.append(summaries.copy()) + + # Final synthesis if we have multiple summaries left + if len(summaries) > 1: + final_summary = await _synthesize(summaries, config) + else: + final_summary = summaries[0] if summaries else "" + + output_tokens = count_tokens(final_summary, config.model) + + return MapReduceResult( + summary=final_summary, + input_tokens=input_tokens, + output_tokens=output_tokens, + 
compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + collapse_depth=depth, + intermediate_summaries=intermediate_summaries, + ) + + +def _total_tokens(texts: list[str], model: str) -> int: + """Count total tokens across all texts.""" + return sum(count_tokens(t, model) for t in texts) + + +async def _map_summarize(chunks: list[str], config: MapReduceConfig) -> list[str]: + """Summarize each chunk in parallel (map phase).""" + semaphore = asyncio.Semaphore(config.max_concurrent) + total = len(chunks) + + async def summarize_chunk(idx: int, chunk: str) -> str: + async with semaphore: + return await _summarize_chunk(chunk, idx, total, config) + + tasks = [summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)] + return list(await asyncio.gather(*tasks)) + + +async def _summarize_chunk( + chunk: str, + chunk_index: int, + total_chunks: int, + config: MapReduceConfig, +) -> str: + """Summarize a single chunk.""" + source_tokens = count_tokens(chunk, config.model) + target_tokens = estimate_summary_tokens(source_tokens, SummaryLevel.MAP_REDUCE) + max_words = tokens_to_words(target_tokens) + + prompt = CHUNK_SUMMARY_PROMPT.format( + chunk_index=chunk_index + 1, + total_chunks=total_chunks, + content=chunk, + max_words=max_words, + ) + + return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) + + +async def _collapse_summaries( + summaries: list[str], + config: MapReduceConfig, +) -> list[str]: + """Collapse summaries by grouping and re-summarizing (reduce phase). + + Groups summaries that together fit within token_max, then summarizes each group. + This is similar to LangChain's split_list_of_docs approach. 
+ """ + if len(summaries) <= 1: + return summaries + + # Group summaries that together fit within token_max + groups: list[list[str]] = [] + current_group: list[str] = [] + current_tokens = 0 + + for summary in summaries: + summary_tokens = count_tokens(summary, config.model) + + # If adding this summary would exceed token_max, start new group + if current_tokens + summary_tokens > config.token_max and current_group: + groups.append(current_group) + current_group = [summary] + current_tokens = summary_tokens + else: + current_group.append(summary) + current_tokens += summary_tokens + + if current_group: + groups.append(current_group) + + # Summarize each group in parallel + semaphore = asyncio.Semaphore(config.max_concurrent) + + async def summarize_group(group: list[str]) -> str: + async with semaphore: + return await _synthesize(group, config) + + tasks = [summarize_group(g) for g in groups] + return list(await asyncio.gather(*tasks)) + + +async def _synthesize(summaries: list[str], config: MapReduceConfig) -> str: + """Synthesize multiple summaries into one.""" + combined_tokens = sum(count_tokens(s, config.model) for s in summaries) + target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.MAP_REDUCE) + max_words = tokens_to_words(target_tokens) + + prompt = META_SUMMARY_PROMPT.format( + summaries=format_summaries_for_meta(summaries), + max_words=max_words, + ) + + return await _generate_summary(prompt, config, max_tokens=target_tokens + 100) + + +async def _summarize_text(text: str, config: MapReduceConfig) -> str: + """Summarize text that fits within token_max.""" + input_tokens = count_tokens(text, config.model) + target_tokens = estimate_summary_tokens(input_tokens, SummaryLevel.MAP_REDUCE) + max_words = tokens_to_words(target_tokens) + + prompt = f"""Summarize the following content in {max_words} words or less. +Focus on the key points and main ideas. 
+ +Content: +{text} + +Summary:""" + + return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) + + +async def _generate_summary( + prompt: str, + config: MapReduceConfig, + max_tokens: int = 256, +) -> str: + """Call the LLM to generate a summary.""" + from pydantic_ai import Agent # noqa: PLC0415 + from pydantic_ai.models.openai import OpenAIChatModel # noqa: PLC0415 + from pydantic_ai.providers.openai import OpenAIProvider # noqa: PLC0415 + from pydantic_ai.settings import ModelSettings # noqa: PLC0415 + + provider = OpenAIProvider(api_key=config.api_key, base_url=config.openai_base_url) + model = OpenAIChatModel( + model_name=config.model, + provider=provider, + settings=ModelSettings( + temperature=0.3, + max_tokens=max_tokens, + ), + ) + + agent = Agent( + model=model, + system_prompt="You are a concise summarizer. Output only the summary, no preamble.", + output_type=SummaryOutput, + retries=2, + ) + + try: + result = await agent.run(prompt) + return result.output.summary.strip() + except Exception as e: + msg = f"Map-reduce summarization failed: {e}" + raise MapReduceSummarizationError(msg) from e diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py index 36407e459..be0d309be 100644 --- a/agent_cli/summarizer/models.py +++ b/agent_cli/summarizer/models.py @@ -1,4 +1,4 @@ -"""Data models for adaptive summarization.""" +"""Data models for map-reduce summarization.""" from __future__ import annotations @@ -8,103 +8,31 @@ from pydantic import BaseModel, Field -# Hierarchical level constants for storage -HIERARCHICAL_LEVEL_L1 = 1 -HIERARCHICAL_LEVEL_L2 = 2 -HIERARCHICAL_LEVEL_L3 = 3 - class SummaryLevel(IntEnum): - """Summary granularity levels based on input complexity.""" + """Summary strategy based on input length.""" NONE = 0 - """< 100 tokens: No summary needed, facts only.""" + """< 100 tokens: No summary needed.""" BRIEF = 1 - """100-500 tokens: Single-sentence summary (~20% compression).""" - - STANDARD = 2 - 
"""500-3000 tokens: Paragraph summary (~12% compression).""" - - DETAILED = 3 - """3000-15000 tokens: Chunked summaries + meta-summary (~7% compression).""" - - HIERARCHICAL = 4 - """> 15000 tokens: Tree of summaries with multiple levels.""" + """100-500 tokens: Single-sentence summary.""" - -class ChunkSummary(BaseModel): - """Summary of a single chunk within a hierarchical summary.""" - - chunk_index: int = Field(..., description="Index of this chunk in the original content") - content: str = Field(..., description="The summarized content of this chunk") - token_count: int = Field(..., ge=0, description="Token count of this summary") - source_tokens: int = Field(..., ge=0, description="Token count of the source chunk") - - -class HierarchicalSummary(BaseModel): - """A hierarchical summary with multiple levels. - - Structure inspired by Letta's partial eviction pattern: - - L1: Individual chunk summaries (parallel processing) - - L2: Group summaries (groups of ~5 L1 summaries) - - L3: Final synthesis (single top-level summary) - """ - - l1_summaries: list[ChunkSummary] = Field( - default_factory=list, - description="Level 1: Individual chunk summaries", - ) - l2_summaries: list[str] = Field( - default_factory=list, - description="Level 2: Group summaries (if > 5 chunks)", - ) - l3_summary: str = Field( - ..., - description="Level 3: Final synthesized summary", - ) - chunk_size: int = Field( - default=3000, - description="Token size used for chunking", - ) - chunk_overlap: int = Field( - default=200, - description="Token overlap between chunks", - ) - - def get_summary_at_level(self, level: int) -> str | list[str]: - """Get summary content at a specific level. - - Args: - level: 1 for chunk summaries, 2 for group summaries, 3 for final. - - Returns: - Summary content at the requested level. 
- - """ - if level == HIERARCHICAL_LEVEL_L1: - return [cs.content for cs in self.l1_summaries] - if level == HIERARCHICAL_LEVEL_L2: - return self.l2_summaries if self.l2_summaries else [self.l3_summary] - return self.l3_summary + MAP_REDUCE = 2 + """> 500 tokens: Map-reduce with dynamic collapse.""" class SummaryResult(BaseModel): - """Result of adaptive summarization. + """Result of summarization. - Contains the summary at the appropriate level for the input complexity, - along with metadata about the compression achieved. + Contains the summary and metadata about the compression achieved. """ - level: SummaryLevel = Field(..., description="The summarization level used") + level: SummaryLevel = Field(..., description="The summarization strategy used") summary: str | None = Field( default=None, description="The final summary text (None for NONE level)", ) - hierarchical: HierarchicalSummary | None = Field( - default=None, - description="Full hierarchical structure (for DETAILED/HIERARCHICAL levels)", - ) input_tokens: int = Field(..., ge=0, description="Token count of the input content") output_tokens: int = Field(..., ge=0, description="Token count of the summary") compression_ratio: float = Field( @@ -113,100 +41,40 @@ class SummaryResult(BaseModel): le=1.0, description="Ratio of output to input tokens (lower = more compression)", ) + collapse_depth: int = Field( + default=0, + ge=0, + description="Number of collapse iterations in map-reduce (0 = no collapse needed)", + ) created_at: datetime = Field( default_factory=lambda: datetime.now(UTC), description="Timestamp when summary was created", ) - @property - def chunk_summaries(self) -> list[str] | None: - """Get L1 chunk summaries if available.""" - if self.hierarchical: - return [cs.content for cs in self.hierarchical.l1_summaries] - return None - def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]: - """Convert to metadata entries for ChromaDB storage. 
+ """Convert to metadata entry for ChromaDB storage. - Returns a list of metadata dicts, one for each summary level stored. + Returns a list with a single metadata dict for the summary. """ - entries: list[dict[str, Any]] = [] + if self.level == SummaryLevel.NONE or not self.summary: + return [] + timestamp = self.created_at.isoformat() - if self.level == SummaryLevel.NONE: - return entries - - # For hierarchical summaries, store each level - if self.hierarchical: - # L1: Individual chunk summaries - entries.extend( - { - "id": f"{conversation_id}:summary:L1:{cs.chunk_index}", - "content": cs.content, - "metadata": { - "conversation_id": conversation_id, - "role": "summary", - "level": HIERARCHICAL_LEVEL_L1, - "chunk_index": cs.chunk_index, - "token_count": cs.token_count, - "created_at": timestamp, - }, - } - for cs in self.hierarchical.l1_summaries - ) - - # L2: Group summaries - entries.extend( - { - "id": f"{conversation_id}:summary:L2:{idx}", - "content": l2_summary, - "metadata": { - "conversation_id": conversation_id, - "role": "summary", - "level": HIERARCHICAL_LEVEL_L2, - "group_index": idx, - "created_at": timestamp, - }, - } - for idx, l2_summary in enumerate(self.hierarchical.l2_summaries) - ) - - # L3: Final summary - entries.append( - { - "id": f"{conversation_id}:summary:L3:final", - "content": self.hierarchical.l3_summary, - "metadata": { - "conversation_id": conversation_id, - "role": "summary", - "level": HIERARCHICAL_LEVEL_L3, - "is_final": True, - "summary_level_name": self.level.name, - "input_tokens": self.input_tokens, - "output_tokens": self.output_tokens, - "compression_ratio": self.compression_ratio, - "created_at": timestamp, - }, - }, - ) - elif self.summary: - # Non-hierarchical: just store the single summary - entries.append( - { - "id": f"{conversation_id}:summary:L3:final", - "content": self.summary, - "metadata": { - "conversation_id": conversation_id, - "role": "summary", - "level": HIERARCHICAL_LEVEL_L3, - "is_final": True, - 
"summary_level_name": self.level.name, - "input_tokens": self.input_tokens, - "output_tokens": self.output_tokens, - "compression_ratio": self.compression_ratio, - "created_at": timestamp, - }, + return [ + { + "id": f"{conversation_id}:summary", + "content": self.summary, + "metadata": { + "conversation_id": conversation_id, + "role": "summary", + "is_final": True, + "summary_level": self.level.name, + "input_tokens": self.input_tokens, + "output_tokens": self.output_tokens, + "compression_ratio": self.compression_ratio, + "collapse_depth": self.collapse_depth, + "created_at": timestamp, }, - ) - - return entries + }, + ] diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md index f08ea1a44..c34540bc1 100644 --- a/docs/architecture/summarizer.md +++ b/docs/architecture/summarizer.md @@ -4,23 +4,23 @@ This document describes the architectural decisions, design rationale, and techn ## 1. System Overview -The adaptive summarizer provides **content-aware compression** that scales summarization depth with input complexity. Rather than applying a one-size-fits-all approach, it automatically selects the optimal strategy based on token count. +The adaptive summarizer provides **content-aware compression** using a map-reduce approach inspired by LangChain's chains. Rather than applying fixed summarization levels, it dynamically collapses content until it fits within a token budget. 
``` -Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy +Input Content ──▶ Token Count ──▶ Strategy Selection │ - ┌───────────────────────────────┼───────────────────────────────┐ - │ │ │ - < 100 tokens 500-15000 tokens > 15000 tokens - │ │ │ - No summary needed Chunked processing Hierarchical tree - + meta-synthesis (L1/L2/L3) + ┌───────────────────────────────┼─────────────────────┐ + │ │ │ + < 100 tokens 100-500 tokens > 500 tokens + │ │ │ + No summary Brief summary Map-Reduce + (single sentence) (dynamic collapse) ``` **Design Goals:** -- **Adaptive compression:** Match summarization depth to content complexity. -- **Hierarchical structure:** Preserve detail at multiple granularities for large content. +- **Simple algorithm:** Map-reduce with dynamic collapse depth based on actual content. +- **Research-grounded defaults:** chunk_size=2048 (BOOOOKSCORE), token_max=3000 (LangChain). - **Content-type awareness:** Domain-specific prompts for conversations, journals, documents. --- @@ -29,25 +29,31 @@ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy This section documents what techniques are borrowed from research vs. what is original design. -### 2.1 Borrowed: Two-Phase Architecture (Mem0) +### 2.1 Borrowed: LangChain Map-Reduce Pattern -**Reference:** arXiv:2504.19413 +**Reference:** LangChain `ReduceDocumentsChain` -Mem0's memory layer research informed our storage architecture with a **two-phase approach**: separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to both files and vector DB. +LangChain's approach to document summarization uses a simple algorithm: +1. **Map phase:** Split content into chunks, summarize each in parallel +2. 
**Reduce phase:** If combined summaries exceed `token_max`, recursively collapse until they fit -### 2.2 Borrowed: Hierarchical Merging Concept (BOOOOKSCORE) +Key insight: No need for predetermined L1/L2/L3 levels. Dynamic depth based on actual content length. LangChain's default `token_max=3000`. + +### 2.2 Borrowed: Chunk Size (BOOOOKSCORE) **Reference:** arXiv:2310.00785 (ICLR 2024) -BOOOOKSCORE's research on book-length summarization demonstrated two approaches: -- **Hierarchical merging:** Summarize chunks, then merge chunk summaries -- **Incremental updating:** Maintain a running summary updated with each chunk +BOOOOKSCORE's research on book-length summarization found optimal chunk sizes. Their defaults: +- Chunk size: **2048 tokens** (we use this) +- Max summary length: **900 tokens** -Key finding: For smaller context models (like local LLMs), hierarchical merging produces more coherent summaries. This informed our L1/L2/L3 structure. +### 2.3 Borrowed: Two-Phase Architecture (Mem0) -BOOOOKSCORE's defaults: chunk size of **2048 tokens**, max summary length of **900 tokens**. +**Reference:** arXiv:2504.19413 -### 2.3 Not Directly Borrowed: Letta's Approach +Mem0's memory layer research informed our storage architecture with a **two-phase approach**: separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to both files and vector DB. + +### 2.4 Not Directly Borrowed: Letta's Approach **Reference:** arXiv:2310.08560 @@ -56,61 +62,79 @@ Letta (MemGPT) uses a different paradigm focused on **context window management* - 30% partial eviction when buffer overflows - Purpose: fit conversation in LLM context window -Our system has a different purpose (memory compression for storage/retrieval), so while we were inspired by Letta's "partial eviction" concept, our implementation differs significantly. 
+Our system has a different purpose (memory compression for storage/retrieval), so our implementation differs significantly. -### 2.4 Original Design (Not Research-Backed) +### 2.5 Original Design (Not Research-Backed) The following aspects are **original design choices without direct research justification**: -- **Token thresholds (100/500/3000/15000):** These numbers were chosen heuristically, not derived from research. They may benefit from tuning. -- **L1/L2/L3 hierarchy structure:** The three-level design is original. The naming was loosely inspired by aijournal's L1-L4 "context pack" levels, but those serve a different purpose (what to include in LLM context, not summarization levels). -- **Chunk size (3000 tokens):** This is larger than BOOOOKSCORE's research-backed 2048 tokens. Consider reducing. -- **L2 group size (5 chunks):** Chosen heuristically. +- **Token thresholds (100/500):** The boundaries between NONE/BRIEF/map-reduce were chosen heuristically. +- **L2 group logic for storage:** Intermediate summaries are stored as "L2" for backward compatibility with the storage layer. +- **Content-type prompts:** Domain-specific prompts are original design. --- ## 3. Architectural Decisions -### 3.1 Token-Based Level Selection +### 3.1 Map-Reduce with Dynamic Collapse -**Decision:** Select summarization strategy based on input token count with fixed thresholds. +**Decision:** Use LangChain-style map-reduce instead of fixed L1/L2/L3 levels. **Rationale:** -- **Predictable behavior:** Users can anticipate output length based on input size. -- **Efficiency:** Avoid over-processing short content or under-processing long content. +- **Simpler algorithm:** No need to distinguish STANDARD/DETAILED/HIERARCHICAL. +- **Dynamic depth:** Collapse depth adapts to actual content length. +- **Research-backed:** LangChain's approach is battle-tested. 
-**Thresholds:** +**Algorithm:** -| Level | Token Range | Strategy | -| :--- | :--- | :--- | -| NONE | < 100 | No summarization needed | -| BRIEF | 100-500 | Single sentence | -| STANDARD | 500-3000 | Paragraph | -| DETAILED | 3000-15000 | Chunked + meta-synthesis | -| HIERARCHICAL | > 15000 | L1/L2/L3 tree | +```python +def map_reduce_summarize(content, token_max=3000): + if tokens(content) <= token_max: + return summarize_directly(content) -**Caveat:** These thresholds are heuristic, not research-backed. They should be validated empirically. + # Map: Split and summarize chunks in parallel + chunks = split_into_chunks(content, chunk_size=2048) + summaries = [summarize(chunk) for chunk in chunks] -### 3.2 Hierarchical Summary Structure (L1/L2/L3) + # Reduce: Recursively collapse until fits + while total_tokens(summaries) > token_max: + groups = group_summaries_by_token_max(summaries, token_max) + summaries = [synthesize(group) for group in groups] -**Decision:** For long content, build a tree of summaries at three levels of granularity. + return final_synthesis(summaries) +``` + +### 3.2 Token-Based Level Selection (Simplified) + +**Decision:** Use three effective levels instead of five. **Rationale:** -- **Hierarchical merging:** Research (BOOOOKSCORE) shows this approach works well for smaller context models. -- **Flexible retrieval:** Different use cases need different detail levels. RAG queries might want L1 chunks; prompt injection wants L3. -- **Progressive compression:** Each level compresses the previous, achieving high overall compression while preserving structure. +- **Simplicity:** Fewer code paths, easier to understand. +- **Dynamic instead of fixed:** Map-reduce adapts to content, no need for DETAILED vs HIERARCHICAL distinction. -**Structure:** +**Effective Levels:** -- **L1 (Chunk Summaries):** Individual summaries of ~3000 token chunks. Preserves local context and specific details. 
Chunks overlap by ~200 tokens to maintain continuity across boundaries. -- **L2 (Group Summaries):** Summaries of groups of ~5 L1 summaries. Only generated when content exceeds ~5 chunks. Provides mid-level abstraction. -- **L3 (Final Summary):** Single synthesized summary. Used for prompt injection and as prior context for incremental updates. +| Level | Token Range | Strategy | +| :--- | :--- | :--- | +| NONE | < 100 | No summarization needed | +| BRIEF | 100-500 | Single sentence | +| MAP_REDUCE | > 500 | Dynamic collapse until fits token_max | + +**Backward Compatibility:** The output still reports STANDARD, DETAILED, or HIERARCHICAL based on collapse depth for storage compatibility. -**Trade-off:** The three-level hierarchy adds complexity but enables efficient retrieval at multiple granularities. For content under 15000 tokens, we skip L2 entirely (DETAILED level uses only L1 + L3). +### 3.3 Research-Backed Defaults + +**Decision:** Use values from published research. + +| Parameter | Value | Source | +| :--- | :--- | :--- | +| `chunk_size` | 2048 | BOOOOKSCORE | +| `token_max` | 3000 | LangChain | +| `chunk_overlap` | 200 | Original | -### 3.3 Semantic Boundary Chunking +### 3.4 Semantic Boundary Chunking **Decision:** Split content on semantic boundaries (paragraphs, then sentences) rather than fixed character counts. @@ -126,7 +150,7 @@ The following aspects are **original design choices without direct research just 2. Fall back to sentence boundaries (`.!?` followed by space + capital) 3. Final fallback to character splitting for edge cases (e.g., code blocks without punctuation) -### 3.4 Content-Type Aware Prompts +### 3.5 Content-Type Aware Prompts **Decision:** Use different prompt templates for different content domains. @@ -138,7 +162,7 @@ The following aspects are **original design choices without direct research just A generic summarization prompt loses domain-specific signal. By tailoring prompts, we extract what matters for each use case. 
-### 3.5 Prior Summary Integration +### 3.6 Prior Summary Integration **Decision:** Always provide the previous summary as context when generating updates. @@ -150,7 +174,7 @@ A generic summarization prompt loses domain-specific signal. By tailoring prompt The L3 summary from the previous run becomes prior context for the next summarization, allowing information to flow forward through time. -### 3.6 Compression Ratio Tracking +### 3.7 Compression Ratio Tracking **Decision:** Track and report compression metrics for every summary. @@ -171,28 +195,26 @@ Every `SummaryResult` includes `input_tokens`, `output_tokens`, and `compression The entry point counts tokens and selects strategy: 1. **Token counting:** Uses tiktoken with model-appropriate encoding. Falls back to character-based estimation (~4 chars/token) if tiktoken unavailable. -2. **Threshold comparison:** Maps token count to `SummaryLevel` enum. -3. **Strategy dispatch:** Calls level-specific handler. +2. **Threshold comparison:** Determines if NONE, BRIEF, or map-reduce. +3. **Strategy dispatch:** Calls appropriate handler. -### 4.2 Brief and Standard Levels +### 4.2 Brief Level -For short content (< 3000 tokens): +For short content (100-500 tokens): -- Single LLM call with level-appropriate prompt -- Prior summary injected as context if available -- Content-type selection determines prompt variant +- Single LLM call with brief prompt - Returns simple `SummaryResult` with no hierarchical structure -### 4.3 Detailed and Hierarchical Levels +### 4.3 Map-Reduce Level -For longer content: +For longer content (> 500 tokens): -1. **Chunking:** Split content into overlapping chunks on semantic boundaries. -2. **Parallel L1 generation:** Summarize each chunk independently. Uses semaphore-controlled concurrency to avoid overwhelming the LLM. -3. **L2 grouping (hierarchical only):** Organize L1s into groups of ~5, summarize each group. -4. 
**L3 synthesis:** Meta-summarize all L2s (or all L1s for DETAILED level) into final summary. +1. **Check single-chunk:** If content fits in token_max, use content-type aware summary directly. +2. **Map phase:** Split content into overlapping chunks, summarize each in parallel. +3. **Reduce phase:** If combined summaries exceed token_max, group and re-summarize recursively. +4. **Final synthesis:** Combine remaining summaries into final output. -The parallelism at L1 and L2 levels provides significant speedup for long content while maintaining semantic coherence through the hierarchical structure. +The parallelism in the map phase provides significant speedup for long content while maintaining semantic coherence through the collapse process. --- @@ -222,17 +244,22 @@ Summaries are persisted in two places: - **Files:** Markdown with YAML front matter under `summaries/L1/`, `L2/`, `L3/` directories. Human-readable, git-trackable. - **ChromaDB:** Vector embeddings for semantic search. Metadata includes level, compression metrics, timestamps. +For backward compatibility, the dynamic collapse levels are mapped to L1/L2/L3 structure: +- First collapse level → L1 (chunk summaries) +- Intermediate levels → L2 (grouped summaries) +- Final output → L3 (synthesis) + --- ## 6. Configuration -| Parameter | Default | Research Comparison | +| Parameter | Default | Source | | :--- | :--- | :--- | -| `chunk_size` | 3000 | BOOOOKSCORE uses 2048 | -| `chunk_overlap` | 200 | No direct comparison | -| `max_concurrent_chunks` | 5 | Implementation choice | - -Level thresholds (100, 500, 3000, 15000 tokens) are heuristic and not derived from published research. 
+| `chunk_size` | 2048 | BOOOOKSCORE | +| `token_max` | 3000 | LangChain | +| `chunk_overlap` | 200 | Original | +| `max_concurrent` | 5 | Implementation choice | +| `max_collapse_depth` | 10 | Safety limit | --- @@ -240,19 +267,30 @@ Level thresholds (100, 500, 3000, 15000 tokens) are heuristic and not derived fr Summarization follows a fail-fast philosophy: -- **LLM errors:** Propagated as `SummarizationError` rather than silently returning empty results. +- **LLM errors:** Propagated as `SummarizationError` or `MapReduceSummarizationError` rather than silently returning empty results. - **Empty input:** Returns NONE level immediately (not an error). - **Encoding errors:** Falls back to character-based token estimation. +- **Max depth exceeded:** Warning logged, forces final synthesis even if over token_max. The caller (memory system) decides how to handle failures—typically by proceeding without a summary rather than blocking the entire write path. --- -## 8. Future Improvements +## 8. Comparison: Old vs New Approach + +| Aspect | Old Approach | New Approach | +| :--- | :--- | :--- | +| Levels | 5 fixed (NONE/BRIEF/STANDARD/DETAILED/HIERARCHICAL) | 3 effective (NONE/BRIEF/MAP_REDUCE) | +| Hierarchy | Fixed L1/L2/L3 structure | Dynamic collapse depth | +| Chunk size | 3000 tokens | 2048 tokens (BOOOOKSCORE) | +| token_max | N/A (fixed levels) | 3000 (LangChain) | +| Complexity | Multiple code paths | Single map-reduce algorithm | +| Research basis | Heuristic | LangChain + BOOOOKSCORE | + +--- -Based on research findings, consider: +## 9. Future Improvements -1. **Reduce chunk size to 2048** to align with BOOOOKSCORE's tested defaults -2. **Validate token thresholds empirically** with real-world content -3. **Add incremental updating mode** as alternative to hierarchical merging for larger context models -4. **Benchmark against BOOOOKSCORE metrics** for coherence evaluation +1. **Benchmark against BOOOOKSCORE metrics** for coherence evaluation +2. 
**Add incremental updating mode** as alternative to hierarchical merging for larger context models +3. **Tune token thresholds empirically** with real-world content diff --git a/examples/summarizer_demo.py b/examples/summarizer_demo.py index 6a542dbdc..70d434dda 100644 --- a/examples/summarizer_demo.py +++ b/examples/summarizer_demo.py @@ -1,17 +1,15 @@ """Demonstrate the summarizer on texts of varying lengths from the internet. This script fetches content of different sizes and shows how the adaptive -summarizer automatically selects the appropriate strategy (BRIEF, STANDARD, -DETAILED, or HIERARCHICAL) based on content length. +summarizer automatically selects the appropriate strategy (BRIEF or MAP_REDUCE) +based on content length. Usage: python examples/summarizer_demo.py # Test specific levels only python examples/summarizer_demo.py --level brief - python examples/summarizer_demo.py --level standard - python examples/summarizer_demo.py --level detailed - python examples/summarizer_demo.py --level hierarchical + python examples/summarizer_demo.py --level map_reduce # Use a different model python examples/summarizer_demo.py --model "gpt-4o-mini" @@ -58,9 +56,7 @@ class TextSample: # Thresholds from adaptive.py: # NONE: < 100 tokens # BRIEF: 100-500 tokens -# STANDARD: 500-3000 tokens -# DETAILED: 3000-15000 tokens -# HIERARCHICAL: > 15000 tokens +# MAP_REDUCE: >= 500 tokens # Sample texts of varying lengths to demonstrate different summarization levels SAMPLES: list[TextSample] = [ @@ -98,10 +94,10 @@ class TextSample: """, ), TextSample( - name="Standard - Technology Article", - description="~800-2000 tokens - triggers STANDARD level (500-3000 token range)", + name="Map-Reduce - Technology Article", + description="~800-2000 tokens - triggers MAP_REDUCE level (>=500 tokens)", url="https://en.wikipedia.org/api/rest_v1/page/summary/Artificial_intelligence", - expected_level=SummaryLevel.STANDARD, + expected_level=SummaryLevel.MAP_REDUCE, content_type="document", 
fallback_content=""" Artificial intelligence (AI) is the intelligence of machines or software, @@ -178,18 +174,18 @@ class TextSample: """, ), TextSample( - name="Detailed - Full Article", - description="~4000-10000 tokens - triggers DETAILED level (3000-15000 token range)", + name="Map-Reduce - Full Article", + description="~4000-10000 tokens - triggers MAP_REDUCE with chunking", url="https://en.wikipedia.org/api/rest_v1/page/mobile-html/Machine_learning", - expected_level=SummaryLevel.DETAILED, + expected_level=SummaryLevel.MAP_REDUCE, content_type="document", fallback_content=None, # We'll generate synthetic content ), TextSample( - name="Hierarchical - Long Document", - description="~16000+ tokens - triggers HIERARCHICAL level (>15000 tokens)", + name="Map-Reduce - Long Document", + description="~16000+ tokens - triggers MAP_REDUCE with multiple collapse iterations", url="https://www.gutenberg.org/cache/epub/84/pg84.txt", # Frankenstein (truncated) - expected_level=SummaryLevel.HIERARCHICAL, + expected_level=SummaryLevel.MAP_REDUCE, content_type="document", fallback_content=None, # We'll generate synthetic content (~16K tokens) ), @@ -229,7 +225,7 @@ def generate_synthetic_content(target_tokens: int, topic: str = "technology") -> return "\n\n".join(result) -async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str: # noqa: PLR0912 +async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str: """Fetch content from URL or use fallback.""" try: # Add User-Agent header to avoid 403 errors from some sites @@ -269,9 +265,7 @@ async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str: # Check if content is too short for expected level min_words_for_level = { SummaryLevel.BRIEF: 80, # Need ~100 tokens - SummaryLevel.STANDARD: 400, # Need ~500 tokens - SummaryLevel.DETAILED: 2500, # Need ~3000 tokens - SummaryLevel.HIERARCHICAL: 12000, # Need ~15000 tokens + SummaryLevel.MAP_REDUCE: 400, # Need ~500 tokens } 
min_words = min_words_for_level.get(sample.expected_level, 50) @@ -282,22 +276,17 @@ async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str: else: target_tokens = { SummaryLevel.BRIEF: 300, - SummaryLevel.STANDARD: 1500, - SummaryLevel.DETAILED: 8000, - SummaryLevel.HIERARCHICAL: 16000, # Keep manageable for demo + SummaryLevel.MAP_REDUCE: 1500, } content = generate_synthetic_content( target_tokens.get(sample.expected_level, 1000), ) - # For HIERARCHICAL, truncate very long content to keep demo fast - # but ensure we stay above 15000 tokens (~13000 words) - if sample.expected_level == SummaryLevel.HIERARCHICAL: - words = content.split() - # ~16000 tokens ≈ 13500 words (need >15000 tokens for HIERARCHICAL) - if len(words) > 13500: # noqa: PLR2004 - content = " ".join(words[:13500]) - print(" 📎 Truncated to ~13500 words for faster demo") + # For very long content, truncate to keep demo fast + words = content.split() + if len(words) > 13500: # noqa: PLR2004 + content = " ".join(words[:13500]) + print(" 📎 Truncated to ~13500 words for faster demo") return content.strip() @@ -310,9 +299,7 @@ async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str: # Generate synthetic content for the expected level target_tokens = { SummaryLevel.BRIEF: 300, - SummaryLevel.STANDARD: 1500, - SummaryLevel.DETAILED: 8000, - SummaryLevel.HIERARCHICAL: 16000, # Keep manageable for demo + SummaryLevel.MAP_REDUCE: 1500, } return generate_synthetic_content(target_tokens.get(sample.expected_level, 1000)) @@ -335,9 +322,7 @@ def print_result(sample: TextSample, result: SummaryResult, content: str) -> Non level_emoji = { SummaryLevel.NONE: "⏭️", SummaryLevel.BRIEF: "📝", - SummaryLevel.STANDARD: "📄", - SummaryLevel.DETAILED: "📚", - SummaryLevel.HIERARCHICAL: "🏗️", + SummaryLevel.MAP_REDUCE: "🔄", } print("\n🎯 Summarization Result:") print(f" Level: {level_emoji.get(result.level, '❓')} {result.level.name}") @@ -345,6 +330,8 @@ def print_result(sample: 
TextSample, result: SummaryResult, content: str) -> Non print(f" Match: {'✅' if result.level == sample.expected_level else '⚠️'}") print(f" Output tokens: {result.output_tokens:,}") print(f" Compression: {result.compression_ratio:.1%}") + if result.collapse_depth > 0: + print(f" Collapse depth: {result.collapse_depth}") # Summary content if result.summary: @@ -357,23 +344,6 @@ def print_result(sample: TextSample, result: SummaryResult, content: str) -> Non ) print(wrapped) - # Hierarchical details if present - if result.hierarchical: - h = result.hierarchical - print("\n🏗️ Hierarchical Structure:") - print(f" L1 chunks: {len(h.l1_summaries)}") - print(f" L2 groups: {len(h.l2_summaries)}") - if h.l2_summaries: - print(f" L2 preview: {h.l2_summaries[0][:100]}...") - print("\n L3 Final Summary:") - wrapped = textwrap.fill( - h.l3_summary, - width=68, - initial_indent=" ", - subsequent_indent=" ", - ) - print(wrapped) - async def run_demo( level_filter: str | None = None, @@ -394,7 +364,7 @@ async def run_demo( openai_base_url=actual_base_url, model=actual_model, api_key=api_key, - chunk_size=3000, + chunk_size=2048, # BOOOOKSCORE default max_concurrent_chunks=3, timeout=120.0, # Longer timeout for local models ) @@ -404,9 +374,7 @@ async def run_demo( if level_filter: level_map = { "brief": SummaryLevel.BRIEF, - "standard": SummaryLevel.STANDARD, - "detailed": SummaryLevel.DETAILED, - "hierarchical": SummaryLevel.HIERARCHICAL, + "map_reduce": SummaryLevel.MAP_REDUCE, } target_level = level_map.get(level_filter.lower()) if target_level: @@ -449,14 +417,15 @@ def main() -> None: epilog=textwrap.dedent(""" Examples: python examples/summarizer_demo.py - python examples/summarizer_demo.py --level standard + python examples/summarizer_demo.py --level brief + python examples/summarizer_demo.py --level map_reduce python examples/summarizer_demo.py --model "llama3.1:8b" --base-url "http://localhost:11434/v1" """), ) parser.add_argument( "--level", - choices=["brief", 
"standard", "detailed", "hierarchical"], + choices=["brief", "map_reduce"], help="Only test a specific summarization level", ) parser.add_argument( diff --git a/tests/memory/test_engine.py b/tests/memory/test_engine.py index 12e419de9..44d0a031c 100644 --- a/tests/memory/test_engine.py +++ b/tests/memory/test_engine.py @@ -355,9 +355,8 @@ def __init__(self, output: Any) -> None: async def fake_summarize_content(**_kwargs: Any) -> SummaryResult: return SummaryResult( - level=SummaryLevel.STANDARD, + level=SummaryLevel.MAP_REDUCE, summary="summary up to 256", - hierarchical=None, input_tokens=100, output_tokens=20, compression_ratio=0.2, @@ -583,9 +582,8 @@ def __init__(self, output: Any) -> None: async def fake_summarize_content(**_kwargs: Any) -> SummaryResult: return SummaryResult( - level=SummaryLevel.STANDARD, + level=SummaryLevel.MAP_REDUCE, summary="summary text", - hierarchical=None, input_tokens=100, output_tokens=20, compression_ratio=0.2, @@ -632,4 +630,4 @@ async def fake_reconcile( files = list(tmp_path.glob("entries/**/*.md")) assert len(files) == 4 # user + assistant + fact + 1 summary assert any("facts" in str(f) for f in files) - assert any("summaries/L3/final.md" in str(f) for f in files) + assert any("summaries" in str(f) for f in files) diff --git a/tests/memory/test_git_integration.py b/tests/memory/test_git_integration.py index db197b023..86040d7a1 100644 --- a/tests/memory/test_git_integration.py +++ b/tests/memory/test_git_integration.py @@ -66,9 +66,8 @@ async def fake_reconcile( async def fake_summarize_content(**_kwargs: Any) -> SummaryResult: return SummaryResult( - level=SummaryLevel.STANDARD, + level=SummaryLevel.MAP_REDUCE, summary="User likes testing.", - hierarchical=None, input_tokens=100, output_tokens=20, compression_ratio=0.2, diff --git a/tests/memory/test_store.py b/tests/memory/test_store.py index 5e8e33142..29dbe2e55 100644 --- a/tests/memory/test_store.py +++ b/tests/memory/test_store.py @@ -137,21 +137,21 @@ def 
test_upsert_and_delete_entries_delegate() -> None: def test_upsert_summary_entries_simple() -> None: - """Test upserting a simple (non-hierarchical) summary.""" + """Test upserting a summary.""" fake = _FakeCollection() entries = [ { - "id": "conv-123:summary:L3:final", - "content": "A standard paragraph summary.", + "id": "conv-123:summary", + "content": "A paragraph summary.", "metadata": { "conversation_id": "conv-123", "role": "summary", - "level": 3, "is_final": True, - "summary_level_name": "STANDARD", + "summary_level": "MAP_REDUCE", "input_tokens": 1000, "output_tokens": 50, "compression_ratio": 0.05, + "collapse_depth": 0, "created_at": "2024-01-01T00:00:00", }, }, @@ -159,52 +159,30 @@ def test_upsert_summary_entries_simple() -> None: ids = _store.upsert_summary_entries(fake, entries) - assert ids == ["conv-123:summary:L3:final"] + assert ids == ["conv-123:summary"] assert len(fake.upserts) == 1 upserted_ids, upserted_docs, upserted_metas = fake.upserts[0] - assert upserted_ids == ["conv-123:summary:L3:final"] - assert upserted_docs == ["A standard paragraph summary."] - assert upserted_metas[0]["level"] == 3 + assert upserted_ids == ["conv-123:summary"] + assert upserted_docs == ["A paragraph summary."] assert upserted_metas[0]["is_final"] is True -def test_upsert_summary_entries_with_chunks() -> None: - """Test upserting a hierarchical summary with L1 and L3 entries.""" +def test_upsert_summary_entries_with_collapse_depth() -> None: + """Test upserting a summary with collapse depth metadata.""" fake = _FakeCollection() entries = [ { - "id": "conv-456:summary:L1:0", - "content": "Chunk 0 summary", - "metadata": { - "conversation_id": "conv-456", - "role": "summary", - "level": 1, - "chunk_index": 0, - "created_at": "2024-01-01T00:00:00", - }, - }, - { - "id": "conv-456:summary:L1:1", - "content": "Chunk 1 summary", - "metadata": { - "conversation_id": "conv-456", - "role": "summary", - "level": 1, - "chunk_index": 1, - "created_at": 
"2024-01-01T00:00:00", - }, - }, - { - "id": "conv-456:summary:L3:final", + "id": "conv-456:summary", "content": "Final synthesis", "metadata": { "conversation_id": "conv-456", "role": "summary", - "level": 3, "is_final": True, + "summary_level": "MAP_REDUCE", "input_tokens": 5000, "output_tokens": 100, "compression_ratio": 0.02, + "collapse_depth": 2, "created_at": "2024-01-01T00:00:00", }, }, @@ -212,10 +190,9 @@ def test_upsert_summary_entries_with_chunks() -> None: ids = _store.upsert_summary_entries(fake, entries) - assert len(ids) == 3 - assert "conv-456:summary:L1:0" in ids - assert "conv-456:summary:L1:1" in ids - assert "conv-456:summary:L3:final" in ids + assert len(ids) == 1 + assert ids[0] == "conv-456:summary" + assert fake.upserts[0][2][0]["collapse_depth"] == 2 def test_upsert_summary_entries_empty() -> None: @@ -228,41 +205,8 @@ def test_upsert_summary_entries_empty() -> None: assert len(fake.upserts) == 0 -def test_get_summary_at_level() -> None: - """Test retrieving summaries at a specific level.""" - fake = _FakeCollection( - get_result={ - "documents": ["Chunk 0", "Chunk 1"], - "metadatas": [ - { - "conversation_id": "c1", - "role": "summary", - "level": 1, - "chunk_index": 0, - "created_at": "now", - }, - { - "conversation_id": "c1", - "role": "summary", - "level": 1, - "chunk_index": 1, - "created_at": "now", - }, - ], - "ids": ["c1:summary:L1:0", "c1:summary:L1:1"], - }, - ) - - records = _store.get_summary_at_level(fake, "c1", level=1) - - assert len(records) == 2 - assert records[0].metadata.level == 1 - assert records[0].metadata.chunk_index == 0 - assert records[1].metadata.chunk_index == 1 - - -def test_get_final_summary_returns_final() -> None: - """Test getting the L3 final summary.""" +def test_get_final_summary_returns_summary() -> None: + """Test getting the final summary for a conversation.""" fake = _FakeCollection( get_result={ "documents": ["The final summary"], @@ -270,12 +214,13 @@ def test_get_final_summary_returns_final() -> 
None: { "conversation_id": "c1", "role": "summary", - "level": 3, "is_final": True, + "summary_level": "MAP_REDUCE", + "collapse_depth": 1, "created_at": "now", }, ], - "ids": ["c1:summary:L3:final"], + "ids": ["c1:summary"], }, ) @@ -295,42 +240,28 @@ def test_get_final_summary_returns_none_when_missing() -> None: assert result is None -def test_delete_summaries_all_levels() -> None: - """Test deleting all summary levels for a conversation.""" +def test_delete_summaries() -> None: + """Test deleting summaries for a conversation.""" fake = _FakeCollection( get_result={ - "documents": ["L1", "L3"], + "documents": ["The summary"], "metadatas": [ - {"conversation_id": "c1", "role": "summary", "level": 1, "created_at": "now"}, - {"conversation_id": "c1", "role": "summary", "level": 3, "created_at": "now"}, + { + "conversation_id": "c1", + "role": "summary", + "summary_level": "MAP_REDUCE", + "created_at": "now", + }, ], - "ids": ["c1:summary:L1:0", "c1:summary:L3:final"], + "ids": ["c1:summary"], }, ) deleted_count = _store.delete_summaries(fake, "c1") - assert deleted_count == 2 - assert len(fake.deleted) == 1 - assert set(fake.deleted[0]) == {"c1:summary:L1:0", "c1:summary:L3:final"} - - -def test_delete_summaries_specific_levels() -> None: - """Test deleting only specific summary levels.""" - fake = _FakeCollection( - get_result={ - "documents": ["L1 chunk"], - "metadatas": [ - {"conversation_id": "c1", "role": "summary", "level": 1, "created_at": "now"}, - ], - "ids": ["c1:summary:L1:0"], - }, - ) - - deleted_count = _store.delete_summaries(fake, "c1", levels=[1]) - assert deleted_count == 1 - assert fake.deleted[0] == ["c1:summary:L1:0"] + assert len(fake.deleted) == 1 + assert fake.deleted[0] == ["c1:summary"] def test_delete_summaries_no_entries() -> None: diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py index 6acf43171..a64a72a16 100644 --- a/tests/summarizer/test_adaptive.py +++ b/tests/summarizer/test_adaptive.py @@ -7,7 +7,8 
@@ import pytest from agent_cli.summarizer.adaptive import ( - LEVEL_THRESHOLDS, + THRESHOLD_BRIEF, + THRESHOLD_NONE, SummarizationError, SummarizerConfig, SummaryOutput, @@ -63,9 +64,31 @@ def test_trailing_slash_stripped(self) -> None: ) assert config.openai_base_url == "http://localhost:8000/v1" + def test_default_chunk_size_is_booookscore(self) -> None: + """Test that default chunk_size follows BOOOOKSCORE recommendation.""" + config = SummarizerConfig( + openai_base_url="http://localhost:8000/v1", + model="gpt-4", + ) + assert config.chunk_size == 2048 # BOOOOKSCORE's tested default + + def test_default_token_max_is_langchain(self) -> None: + """Test that default token_max follows LangChain's default.""" + config = SummarizerConfig( + openai_base_url="http://localhost:8000/v1", + model="gpt-4", + ) + assert config.token_max == 3000 # LangChain's default + class TestDetermineLevel: - """Tests for level determination based on token count.""" + """Tests for level determination based on token count. 
+ + The simplified approach has 3 levels: + - NONE: Very short content (< 100 tokens) + - BRIEF: Short content (100-500 tokens) + - MAP_REDUCE: Everything else (uses map-reduce) + """ def test_none_level_threshold(self) -> None: """Test NONE level for very short content.""" @@ -78,30 +101,17 @@ def test_brief_level_threshold(self) -> None: assert determine_level(300) == SummaryLevel.BRIEF assert determine_level(499) == SummaryLevel.BRIEF - def test_standard_level_threshold(self) -> None: - """Test STANDARD level for medium content.""" - assert determine_level(500) == SummaryLevel.STANDARD - assert determine_level(1500) == SummaryLevel.STANDARD - assert determine_level(2999) == SummaryLevel.STANDARD - - def test_detailed_level_threshold(self) -> None: - """Test DETAILED level for longer content.""" - assert determine_level(3000) == SummaryLevel.DETAILED - assert determine_level(8000) == SummaryLevel.DETAILED - assert determine_level(14999) == SummaryLevel.DETAILED - - def test_hierarchical_level_threshold(self) -> None: - """Test HIERARCHICAL level for very long content.""" - assert determine_level(15000) == SummaryLevel.HIERARCHICAL - assert determine_level(50000) == SummaryLevel.HIERARCHICAL - assert determine_level(100000) == SummaryLevel.HIERARCHICAL + def test_map_reduce_level_for_longer_content(self) -> None: + """Test that content >= 500 tokens uses MAP_REDUCE.""" + assert determine_level(500) == SummaryLevel.MAP_REDUCE + assert determine_level(1500) == SummaryLevel.MAP_REDUCE + assert determine_level(5000) == SummaryLevel.MAP_REDUCE + assert determine_level(20000) == SummaryLevel.MAP_REDUCE def test_thresholds_match_constants(self) -> None: """Verify thresholds match the module constants.""" - assert LEVEL_THRESHOLDS[SummaryLevel.NONE] == 100 - assert LEVEL_THRESHOLDS[SummaryLevel.BRIEF] == 500 - assert LEVEL_THRESHOLDS[SummaryLevel.STANDARD] == 3000 - assert LEVEL_THRESHOLDS[SummaryLevel.DETAILED] == 15000 + assert THRESHOLD_NONE == 100 + assert 
THRESHOLD_BRIEF == 500 class TestSummarize: @@ -168,92 +178,81 @@ async def test_brief_level_calls_brief_summary( assert result.summary == "Brief summary." @pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._standard_summary") - async def test_standard_level_calls_standard_summary( + @patch("agent_cli.summarizer.adaptive._map_reduce_summary") + async def test_longer_content_uses_map_reduce( self, - mock_standard: AsyncMock, + mock_map_reduce: AsyncMock, config: SummarizerConfig, ) -> None: - """Test that STANDARD level content calls _standard_summary.""" - mock_standard.return_value = "Standard summary paragraph." + """Test that content >= 500 tokens uses map-reduce.""" + mock_result = SummaryResult( + level=SummaryLevel.MAP_REDUCE, + summary="Map-reduce summary.", + input_tokens=800, + output_tokens=100, + compression_ratio=0.125, + ) + mock_map_reduce.return_value = mock_result - # Create content that's ~500-3000 tokens + # Create content that's ~500+ tokens content = "This is a test sentence with more words. " * 100 # ~800 tokens result = await summarize(content, config, content_type="general") - mock_standard.assert_called_once_with(content, config, None, "general") - assert result.level == SummaryLevel.STANDARD - assert result.summary == "Standard summary paragraph." + mock_map_reduce.assert_called_once() + assert result.summary == "Map-reduce summary." @pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._standard_summary") - async def test_prior_summary_passed_to_standard( + @patch("agent_cli.summarizer.adaptive._map_reduce_summary") + async def test_prior_summary_passed_to_map_reduce( self, - mock_standard: AsyncMock, + mock_map_reduce: AsyncMock, config: SummarizerConfig, ) -> None: - """Test that prior_summary is passed to _standard_summary.""" - mock_standard.return_value = "Updated summary." - - content = "This is a test sentence with more words. " * 100 - prior = "Previous context summary." 
- - await summarize(content, config, prior_summary=prior) - - mock_standard.assert_called_once_with(content, config, prior, "general") - - @pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._detailed_summary") - async def test_detailed_level_calls_detailed_summary( - self, - mock_detailed: AsyncMock, - config: SummarizerConfig, - ) -> None: - """Test that DETAILED level content calls _detailed_summary.""" + """Test that prior_summary is passed to _map_reduce_summary.""" mock_result = SummaryResult( - level=SummaryLevel.DETAILED, - summary="Detailed summary.", - hierarchical=None, - input_tokens=5000, + level=SummaryLevel.MAP_REDUCE, + summary="Updated summary.", + input_tokens=800, output_tokens=100, - compression_ratio=0.02, + compression_ratio=0.125, ) - mock_detailed.return_value = mock_result + mock_map_reduce.return_value = mock_result - # Create content that's ~3000-15000 tokens - content = "Word " * 5000 # ~5000 tokens + content = "This is a test sentence with more words. " * 100 + prior = "Previous context summary." 
- result = await summarize(content, config) + await summarize(content, config, prior_summary=prior) - assert mock_detailed.called - assert result.level == SummaryLevel.DETAILED + # Verify prior_summary was passed + call_args = mock_map_reduce.call_args + assert call_args[0][3] == prior # prior_summary is 4th positional arg @pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._hierarchical_summary") - async def test_hierarchical_level_calls_hierarchical_summary( + @patch("agent_cli.summarizer.adaptive._map_reduce_summary") + async def test_very_long_content_uses_map_reduce( self, - mock_hierarchical: AsyncMock, + mock_map_reduce: AsyncMock, config: SummarizerConfig, ) -> None: - """Test that HIERARCHICAL level content calls _hierarchical_summary.""" + """Test that very long content uses map-reduce.""" mock_result = SummaryResult( - level=SummaryLevel.HIERARCHICAL, - summary="Hierarchical summary.", - hierarchical=None, + level=SummaryLevel.MAP_REDUCE, + summary="Long content summary.", input_tokens=20000, output_tokens=500, compression_ratio=0.025, + collapse_depth=2, ) - mock_hierarchical.return_value = mock_result + mock_map_reduce.return_value = mock_result # Create content that's > 15000 tokens content = "Word " * 20000 result = await summarize(content, config) - assert mock_hierarchical.called - assert result.level == SummaryLevel.HIERARCHICAL + assert mock_map_reduce.called + assert result.level == SummaryLevel.MAP_REDUCE class TestGenerateSummary: diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py index d70286592..f11fcff8b 100644 --- a/tests/summarizer/test_integration.py +++ b/tests/summarizer/test_integration.py @@ -1,457 +1,68 @@ -"""Integration tests for the summarizer with memory system.""" +"""Integration tests for summarizer with storage layer.""" from __future__ import annotations -from typing import TYPE_CHECKING, Any -from unittest.mock import patch - -import pytest - -from agent_cli.memory._ingest 
import summarize_content -from agent_cli.memory._persistence import persist_hierarchical_summary -from agent_cli.memory._store import ( - get_final_summary, - get_summary_at_level, - upsert_summary_entries, -) -from agent_cli.summarizer import SummaryLevel, SummaryResult from agent_cli.summarizer.adaptive import determine_level -from agent_cli.summarizer.models import ChunkSummary, HierarchicalSummary - -if TYPE_CHECKING: - from pathlib import Path - - -class _FakeCollection: - """Minimal Chroma-like collection for testing.""" - - def __init__(self) -> None: - self._store: dict[str, tuple[str, dict[str, Any]]] = {} - - def upsert( - self, - *, - ids: list[str], - documents: list[str], - metadatas: list[dict[str, Any]], - ) -> None: - for doc_id, doc, meta in zip(ids, documents, metadatas, strict=False): - self._store[doc_id] = (doc, meta) - - def get( - self, - *, - where: dict[str, Any] | None = None, - include: list[str] | None = None, # noqa: ARG002 - ) -> dict[str, Any]: - if where is None: - return {"documents": [], "metadatas": [], "ids": []} - - results: list[tuple[str, tuple[str, dict[str, Any]]]] = [] - for doc_id, (doc, meta) in self._store.items(): - # Check all conditions in $and clause - conditions = where.get("$and", [where]) - match = True - for clause in conditions: - for k, v in clause.items(): - if k == "$and": - continue - if isinstance(v, dict): - if "$in" in v and meta.get(k) not in v["$in"]: - match = False - if "$ne" in v and meta.get(k) == v["$ne"]: - match = False - elif meta.get(k) != v: - match = False - if match: - results.append((doc_id, (doc, meta))) - - docs = [doc for _, (doc, _) in results] - metas = [meta for _, (_, meta) in results] - ids = [doc_id for doc_id, _ in results] - return {"documents": docs, "metadatas": metas, "ids": ids} - - def delete( - self, - ids: list[str] | None = None, - where: dict[str, Any] | None = None, # noqa: ARG002 - ) -> None: - if ids: - for doc_id in ids: - self._store.pop(doc_id, None) - - 
-@pytest.fixture -def fake_collection() -> _FakeCollection: - """Create a fake ChromaDB collection.""" - return _FakeCollection() - - -@pytest.fixture -def memory_root(tmp_path: Path) -> Path: - """Create a temporary memory root directory.""" - return tmp_path / "memory" - - -class TestSummaryResultStorageMetadata: - """Test SummaryResult.to_storage_metadata for various levels.""" - - def test_standard_summary_produces_single_entry(self) -> None: - """Test that STANDARD level produces a single L3 entry.""" - result = SummaryResult( - level=SummaryLevel.STANDARD, - summary="A paragraph summary of the content.", - hierarchical=None, - input_tokens=1000, - output_tokens=50, - compression_ratio=0.05, - ) - - entries = result.to_storage_metadata("conv-123") - - assert len(entries) == 1 - entry = entries[0] - assert entry["id"] == "conv-123:summary:L3:final" - assert entry["content"] == "A paragraph summary of the content." - assert entry["metadata"]["level"] == 3 - assert entry["metadata"]["is_final"] is True - assert entry["metadata"]["summary_level_name"] == "STANDARD" +from agent_cli.summarizer.models import SummaryLevel, SummaryResult - def test_hierarchical_summary_produces_multiple_entries(self) -> None: - """Test that HIERARCHICAL level produces L1, L2, L3 entries.""" - l1_summaries = [ - ChunkSummary( - chunk_index=0, - content="Chunk 0", - token_count=10, - source_tokens=100, - ), - ChunkSummary( - chunk_index=1, - content="Chunk 1", - token_count=10, - source_tokens=100, - ), - ChunkSummary( - chunk_index=2, - content="Chunk 2", - token_count=10, - source_tokens=100, - ), - ] - hierarchical = HierarchicalSummary( - l1_summaries=l1_summaries, - l2_summaries=["Group 0 summary"], - l3_summary="Final hierarchical synthesis.", - ) - result = SummaryResult( - level=SummaryLevel.HIERARCHICAL, - summary="Final hierarchical synthesis.", - hierarchical=hierarchical, - input_tokens=20000, - output_tokens=200, - compression_ratio=0.01, - ) - entries = 
result.to_storage_metadata("conv-456") - - # Should have 3 L1 + 1 L2 + 1 L3 = 5 entries - assert len(entries) == 5 - - # Check L1 entries - l1_entries = [e for e in entries if e["metadata"]["level"] == 1] - assert len(l1_entries) == 3 - - # Check L2 entries - l2_entries = [e for e in entries if e["metadata"]["level"] == 2] - assert len(l2_entries) == 1 - - # Check L3 entry - l3_entries = [e for e in entries if e["metadata"]["level"] == 3] - assert len(l3_entries) == 1 +class TestDetermineLevel: + """Tests for determine_level function with various content sizes.""" + def test_short_content_is_brief(self) -> None: + """Test that 100-500 token content uses BRIEF.""" + level = determine_level(200) + assert level == SummaryLevel.BRIEF -class TestHierarchicalSummaryStorage: - """Test storing hierarchical summaries to ChromaDB.""" + def test_medium_content_is_map_reduce(self) -> None: + """Test that 500+ token content uses MAP_REDUCE.""" + level = determine_level(1000) + assert level == SummaryLevel.MAP_REDUCE - def test_store_simple_summary(self, fake_collection: _FakeCollection) -> None: - """Test storing a simple (non-hierarchical) summary.""" - result = SummaryResult( - level=SummaryLevel.STANDARD, - summary="A standard summary.", - hierarchical=None, - input_tokens=1000, - output_tokens=50, - compression_ratio=0.05, - ) + def test_long_content_is_map_reduce(self) -> None: + """Test that 3000+ token content uses MAP_REDUCE.""" + level = determine_level(5000) + assert level == SummaryLevel.MAP_REDUCE - entries = result.to_storage_metadata("conv-123") - ids = upsert_summary_entries(fake_collection, entries) + def test_very_long_content_is_map_reduce(self) -> None: + """Test that content over 15000 tokens still uses MAP_REDUCE.""" + level = determine_level(20000) + assert level == SummaryLevel.MAP_REDUCE - assert len(ids) == 1 - assert "conv-123:summary:L3:final" in ids - # Verify retrieval - stored = get_final_summary(fake_collection, "conv-123") - assert stored is not 
None - assert stored.content == "A standard summary." +class TestSummaryResultStorage: + """Tests for SummaryResult storage metadata generation.""" - def test_store_hierarchical_summary(self, fake_collection: _FakeCollection) -> None: - """Test storing a hierarchical summary with all levels.""" - l1_summaries = [ - ChunkSummary( - chunk_index=0, - content="Chunk 0 summary", - token_count=10, - source_tokens=100, - ), - ChunkSummary( - chunk_index=1, - content="Chunk 1 summary", - token_count=10, - source_tokens=100, - ), - ] - hierarchical = HierarchicalSummary( - l1_summaries=l1_summaries, - l2_summaries=[], - l3_summary="Final summary", - ) + def test_to_storage_metadata_creates_entry(self) -> None: + """Test that to_storage_metadata creates a valid entry.""" result = SummaryResult( - level=SummaryLevel.DETAILED, - summary="Final summary", - hierarchical=hierarchical, + level=SummaryLevel.MAP_REDUCE, + summary="A comprehensive summary.", input_tokens=5000, output_tokens=100, compression_ratio=0.02, + collapse_depth=1, ) + entries = result.to_storage_metadata("test-conversation") - entries = result.to_storage_metadata("conv-789") - ids = upsert_summary_entries(fake_collection, entries) - - assert len(ids) == 3 # 2 L1 + 1 L3 - - # Verify L1 retrieval - l1_stored = get_summary_at_level(fake_collection, "conv-789", level=1) - assert len(l1_stored) == 2 - - # Verify L3 retrieval - final = get_final_summary(fake_collection, "conv-789") - assert final is not None - assert final.content == "Final summary" - - -class TestFilePersistence: - """Test hierarchical summary file persistence.""" - - def test_persist_hierarchical_creates_files( - self, - fake_collection: _FakeCollection, - memory_root: Path, - ) -> None: - """Test that persist_hierarchical_summary creates correct file structure.""" - l1_summaries = [ - ChunkSummary( - chunk_index=0, - content="Chunk 0 content", - token_count=10, - source_tokens=100, - ), - ChunkSummary( - chunk_index=1, - content="Chunk 1 
content", - token_count=10, - source_tokens=100, - ), - ] - hierarchical = HierarchicalSummary( - l1_summaries=l1_summaries, - l2_summaries=["Group 0 summary"], - l3_summary="Final synthesis", - ) - result = SummaryResult( - level=SummaryLevel.HIERARCHICAL, - summary="Final synthesis", - hierarchical=hierarchical, - input_tokens=20000, - output_tokens=200, - compression_ratio=0.01, - ) - - ids = persist_hierarchical_summary( - fake_collection, - memory_root=memory_root, - conversation_id="test-conv", - summary_result=result, - ) - - assert len(ids) == 4 # 2 L1 + 1 L2 + 1 L3 - - # Check file structure (note: _slugify converts - to - not _) - entries_dir = memory_root / "entries" / "test-conv" - l1_dir = entries_dir / "summaries" / "L1" - l2_dir = entries_dir / "summaries" / "L2" - l3_dir = entries_dir / "summaries" / "L3" - - assert l1_dir.exists() - assert l2_dir.exists() - assert l3_dir.exists() - - # Check L1 files - l1_files = list(l1_dir.glob("*.md")) - assert len(l1_files) == 2 - - # Check L2 files - l2_files = list(l2_dir.glob("*.md")) - assert len(l2_files) == 1 - - # Check L3 files - l3_files = list(l3_dir.glob("*.md")) - assert len(l3_files) == 1 - assert (l3_dir / "final.md").exists() + assert len(entries) == 1 + entry = entries[0] + assert entry["id"] == "test-conversation:summary" + assert entry["content"] == "A comprehensive summary." 
+ assert entry["metadata"]["conversation_id"] == "test-conversation" + assert entry["metadata"]["role"] == "summary" + assert entry["metadata"]["is_final"] is True + assert entry["metadata"]["summary_level"] == "MAP_REDUCE" + assert entry["metadata"]["collapse_depth"] == 1 - def test_persist_simple_summary_creates_l3_file( - self, - fake_collection: _FakeCollection, - memory_root: Path, - ) -> None: - """Test that a simple summary creates just L3/final.md.""" + def test_none_level_returns_empty(self) -> None: + """Test that NONE level produces no storage entries.""" result = SummaryResult( - level=SummaryLevel.STANDARD, - summary="A standard paragraph summary.", - hierarchical=None, - input_tokens=1000, - output_tokens=50, - compression_ratio=0.05, - ) - - ids = persist_hierarchical_summary( - fake_collection, - memory_root=memory_root, - conversation_id="simple-conv", - summary_result=result, - ) - - assert len(ids) == 1 - - # Check file exists (note: _slugify converts - to - not _) - entries_dir = memory_root / "entries" / "simple-conv" - l3_file = entries_dir / "summaries" / "L3" / "final.md" - assert l3_file.exists() - - # Check content has YAML front matter - content = l3_file.read_text(encoding="utf-8") - assert "---" in content - assert "level: 3" in content - assert "A standard paragraph summary." 
in content - - def test_persist_deletes_old_summaries( - self, - fake_collection: _FakeCollection, - memory_root: Path, - ) -> None: - """Test that persisting new summary deletes old summary files.""" - # Create first summary - result1 = SummaryResult( - level=SummaryLevel.STANDARD, - summary="First summary.", - hierarchical=None, - input_tokens=1000, - output_tokens=50, - compression_ratio=0.05, - ) - - persist_hierarchical_summary( - fake_collection, - memory_root=memory_root, - conversation_id="conv", - summary_result=result1, + level=SummaryLevel.NONE, + summary=None, + input_tokens=50, + output_tokens=0, + compression_ratio=0.0, ) - - entries_dir = memory_root / "entries" / "conv" - first_file = entries_dir / "summaries" / "L3" / "final.md" - assert first_file.exists() - assert "First summary." in first_file.read_text() - - # Create second summary (should replace first) - result2 = SummaryResult( - level=SummaryLevel.STANDARD, - summary="Second summary.", - hierarchical=None, - input_tokens=1000, - output_tokens=50, - compression_ratio=0.05, - ) - - persist_hierarchical_summary( - fake_collection, - memory_root=memory_root, - conversation_id="conv", - summary_result=result2, - ) - - # First summary should be moved to deleted - assert first_file.exists() - assert "Second summary." 
in first_file.read_text() - - # Old summary should be in deleted folder - deleted_dir = memory_root / "entries" / "deleted" / "conv" / "summaries" - assert deleted_dir.exists() - - -class TestDetermineLevelFunction: - """Test that determine_level correctly determines summary levels.""" - - def test_very_short_content_is_none(self) -> None: - """Test that content under 100 tokens gets NONE level.""" - level = determine_level(50) - assert level == SummaryLevel.NONE - - def test_short_content_is_brief(self) -> None: - """Test that 100-500 token content gets BRIEF level.""" - level = determine_level(300) - assert level == SummaryLevel.BRIEF - - def test_medium_content_is_standard(self) -> None: - """Test that 500-3000 token content gets STANDARD level.""" - level = determine_level(1500) - assert level == SummaryLevel.STANDARD - - def test_long_content_is_detailed(self) -> None: - """Test that 3000-15000 token content gets DETAILED level.""" - level = determine_level(8000) - assert level == SummaryLevel.DETAILED - - def test_very_long_content_is_hierarchical(self) -> None: - """Test that content over 15000 tokens gets HIERARCHICAL level.""" - level = determine_level(25000) - assert level == SummaryLevel.HIERARCHICAL - - -class TestSummarizeContentFunction: - """Test the summarize_content function from _ingest.""" - - @pytest.mark.asyncio - async def test_summarize_content_creates_result(self) -> None: - """Test that summarize_content returns a valid SummaryResult.""" - # Patch at source since _ingest imports inside the function - with patch("agent_cli.summarizer.summarize") as mock_summarize: - mock_result = SummaryResult( - level=SummaryLevel.STANDARD, - summary="Mocked summary.", - hierarchical=None, - input_tokens=1000, - output_tokens=50, - compression_ratio=0.05, - ) - mock_summarize.return_value = mock_result - - result = await summarize_content( - content="Some content to summarize " * 100, - openai_base_url="http://localhost:8000/v1", - api_key=None, - 
model="test-model", - ) - - assert result.level == SummaryLevel.STANDARD - assert result.summary == "Mocked summary." + entries = result.to_storage_metadata("test-conversation") + assert entries == [] diff --git a/tests/summarizer/test_models.py b/tests/summarizer/test_models.py index d39621119..c5b04f703 100644 --- a/tests/summarizer/test_models.py +++ b/tests/summarizer/test_models.py @@ -7,8 +7,6 @@ import pytest from agent_cli.summarizer.models import ( - ChunkSummary, - HierarchicalSummary, SummaryLevel, SummaryResult, ) @@ -21,122 +19,12 @@ def test_level_values(self) -> None: """Test that levels have correct integer values.""" assert SummaryLevel.NONE == 0 assert SummaryLevel.BRIEF == 1 - assert SummaryLevel.STANDARD == 2 - assert SummaryLevel.DETAILED == 3 - assert SummaryLevel.HIERARCHICAL == 4 + assert SummaryLevel.MAP_REDUCE == 2 def test_level_ordering(self) -> None: """Test that levels can be compared.""" assert SummaryLevel.NONE < SummaryLevel.BRIEF - assert SummaryLevel.BRIEF < SummaryLevel.STANDARD - assert SummaryLevel.STANDARD < SummaryLevel.DETAILED - assert SummaryLevel.DETAILED < SummaryLevel.HIERARCHICAL - - -class TestChunkSummary: - """Tests for ChunkSummary model.""" - - def test_basic_creation(self) -> None: - """Test creating a chunk summary.""" - chunk = ChunkSummary( - chunk_index=0, - content="This is a summary of chunk 1.", - token_count=10, - source_tokens=100, - ) - assert chunk.chunk_index == 0 - assert chunk.content == "This is a summary of chunk 1." 
- assert chunk.token_count == 10 - assert chunk.source_tokens == 100 - - def test_validation_negative_tokens(self) -> None: - """Test that negative token counts fail validation.""" - with pytest.raises(ValueError, match="greater than or equal to 0"): - ChunkSummary( - chunk_index=0, - content="Test", - token_count=-1, - source_tokens=100, - ) - - -class TestHierarchicalSummary: - """Tests for HierarchicalSummary model.""" - - def test_basic_creation(self) -> None: - """Test creating a hierarchical summary.""" - l1 = [ - ChunkSummary( - chunk_index=0, - content="Chunk 1 summary", - token_count=10, - source_tokens=100, - ), - ChunkSummary( - chunk_index=1, - content="Chunk 2 summary", - token_count=12, - source_tokens=120, - ), - ] - hs = HierarchicalSummary( - l1_summaries=l1, - l2_summaries=["Group summary"], - l3_summary="Final summary of all content.", - ) - assert len(hs.l1_summaries) == 2 - assert len(hs.l2_summaries) == 1 - assert hs.l3_summary == "Final summary of all content." - - def test_default_chunk_settings(self) -> None: - """Test default chunk size and overlap.""" - hs = HierarchicalSummary( - l1_summaries=[], - l2_summaries=[], - l3_summary="Final", - ) - assert hs.chunk_size == 3000 - assert hs.chunk_overlap == 200 - - def test_get_summary_at_level_1(self) -> None: - """Test getting L1 summaries.""" - l1 = [ - ChunkSummary(chunk_index=0, content="C1", token_count=5, source_tokens=50), - ChunkSummary(chunk_index=1, content="C2", token_count=5, source_tokens=50), - ] - hs = HierarchicalSummary(l1_summaries=l1, l2_summaries=[], l3_summary="Final") - result = hs.get_summary_at_level(1) - assert result == ["C1", "C2"] - - def test_get_summary_at_level_2_with_l2(self) -> None: - """Test getting L2 summaries when available.""" - hs = HierarchicalSummary( - l1_summaries=[], - l2_summaries=["Group A", "Group B"], - l3_summary="Final", - ) - result = hs.get_summary_at_level(2) - assert result == ["Group A", "Group B"] - - def 
test_get_summary_at_level_2_fallback(self) -> None: - """Test getting L2 falls back to L3 when no L2 summaries.""" - hs = HierarchicalSummary( - l1_summaries=[], - l2_summaries=[], - l3_summary="Final summary", - ) - result = hs.get_summary_at_level(2) - assert result == ["Final summary"] - - def test_get_summary_at_level_3(self) -> None: - """Test getting L3 summary.""" - hs = HierarchicalSummary( - l1_summaries=[], - l2_summaries=["Group"], - l3_summary="The final summary", - ) - result = hs.get_summary_at_level(3) - assert result == "The final summary" + assert SummaryLevel.BRIEF < SummaryLevel.MAP_REDUCE class TestSummaryResult: @@ -147,56 +35,46 @@ def test_none_level_result(self) -> None: result = SummaryResult( level=SummaryLevel.NONE, summary=None, - hierarchical=None, input_tokens=50, output_tokens=0, compression_ratio=0.0, ) assert result.level == SummaryLevel.NONE assert result.summary is None - assert result.chunk_summaries is None + assert result.collapse_depth == 0 def test_brief_level_result(self) -> None: """Test result for brief summary.""" result = SummaryResult( level=SummaryLevel.BRIEF, summary="A brief one-sentence summary.", - hierarchical=None, input_tokens=200, output_tokens=10, compression_ratio=0.05, ) assert result.level == SummaryLevel.BRIEF assert result.summary == "A brief one-sentence summary." 
- assert result.chunk_summaries is None + assert result.collapse_depth == 0 - def test_hierarchical_result_with_chunk_summaries(self) -> None: - """Test hierarchical result exposes chunk summaries.""" - l1 = [ - ChunkSummary(chunk_index=0, content="Chunk 1", token_count=10, source_tokens=100), - ChunkSummary(chunk_index=1, content="Chunk 2", token_count=10, source_tokens=100), - ] - hierarchical = HierarchicalSummary( - l1_summaries=l1, - l2_summaries=[], - l3_summary="Final", - ) + def test_map_reduce_result(self) -> None: + """Test result for map-reduce summary.""" result = SummaryResult( - level=SummaryLevel.DETAILED, - summary="Final", - hierarchical=hierarchical, + level=SummaryLevel.MAP_REDUCE, + summary="A comprehensive summary.", input_tokens=5000, output_tokens=100, compression_ratio=0.02, + collapse_depth=2, ) - assert result.chunk_summaries == ["Chunk 1", "Chunk 2"] + assert result.level == SummaryLevel.MAP_REDUCE + assert result.summary == "A comprehensive summary." + assert result.collapse_depth == 2 def test_to_storage_metadata_none_level(self) -> None: """Test that NONE level produces no storage entries.""" result = SummaryResult( level=SummaryLevel.NONE, summary=None, - hierarchical=None, input_tokens=50, output_tokens=0, compression_ratio=0.0, @@ -205,77 +83,44 @@ def test_to_storage_metadata_none_level(self) -> None: assert entries == [] def test_to_storage_metadata_simple_summary(self) -> None: - """Test storage metadata for simple (non-hierarchical) summary.""" + """Test storage metadata for a summary.""" result = SummaryResult( - level=SummaryLevel.STANDARD, - summary="A standard paragraph summary.", - hierarchical=None, - input_tokens=1000, - output_tokens=50, + level=SummaryLevel.BRIEF, + summary="A brief summary.", + input_tokens=200, + output_tokens=10, compression_ratio=0.05, ) entries = result.to_storage_metadata("conv-456") assert len(entries) == 1 entry = entries[0] - assert entry["id"] == "conv-456:summary:L3:final" - assert 
entry["content"] == "A standard paragraph summary." + assert entry["id"] == "conv-456:summary" + assert entry["content"] == "A brief summary." assert entry["metadata"]["conversation_id"] == "conv-456" assert entry["metadata"]["role"] == "summary" - assert entry["metadata"]["level"] == 3 assert entry["metadata"]["is_final"] is True - assert entry["metadata"]["summary_level_name"] == "STANDARD" + assert entry["metadata"]["summary_level"] == "BRIEF" - def test_to_storage_metadata_hierarchical(self) -> None: - """Test storage metadata for hierarchical summary.""" - l1 = [ - ChunkSummary( - chunk_index=0, - content="Chunk 0 text", - token_count=10, - source_tokens=100, - ), - ChunkSummary( - chunk_index=1, - content="Chunk 1 text", - token_count=12, - source_tokens=120, - ), - ] - hierarchical = HierarchicalSummary( - l1_summaries=l1, - l2_summaries=["Group 0 summary"], - l3_summary="Final synthesis", - ) + def test_to_storage_metadata_map_reduce(self) -> None: + """Test storage metadata for map-reduce summary.""" result = SummaryResult( - level=SummaryLevel.HIERARCHICAL, - summary="Final synthesis", - hierarchical=hierarchical, + level=SummaryLevel.MAP_REDUCE, + summary="Final synthesis of content.", input_tokens=20000, output_tokens=200, compression_ratio=0.01, + collapse_depth=3, ) entries = result.to_storage_metadata("conv-789") - # Should have 2 L1 + 1 L2 + 1 L3 = 4 entries - assert len(entries) == 4 - - # Check L1 entries - l1_entries = [e for e in entries if e["metadata"]["level"] == 1] - assert len(l1_entries) == 2 - assert l1_entries[0]["id"] == "conv-789:summary:L1:0" - assert l1_entries[0]["metadata"]["chunk_index"] == 0 - - # Check L2 entry - l2_entries = [e for e in entries if e["metadata"]["level"] == 2] - assert len(l2_entries) == 1 - assert l2_entries[0]["id"] == "conv-789:summary:L2:0" - assert l2_entries[0]["content"] == "Group 0 summary" - - # Check L3 entry - l3_entries = [e for e in entries if e["metadata"]["level"] == 3] - assert len(l3_entries) == 
1 - assert l3_entries[0]["id"] == "conv-789:summary:L3:final" - assert l3_entries[0]["metadata"]["is_final"] is True + # Should have 1 entry (the final summary) + assert len(entries) == 1 + entry = entries[0] + assert entry["id"] == "conv-789:summary" + assert entry["content"] == "Final synthesis of content." + assert entry["metadata"]["summary_level"] == "MAP_REDUCE" + assert entry["metadata"]["collapse_depth"] == 3 + assert entry["metadata"]["is_final"] is True def test_compression_ratio_bounds(self) -> None: """Test compression ratio validation.""" @@ -283,7 +128,6 @@ def test_compression_ratio_bounds(self) -> None: result = SummaryResult( level=SummaryLevel.BRIEF, summary="Test", - hierarchical=None, input_tokens=100, output_tokens=10, compression_ratio=0.1, @@ -295,7 +139,6 @@ def test_compression_ratio_bounds(self) -> None: SummaryResult( level=SummaryLevel.BRIEF, summary="Test", - hierarchical=None, input_tokens=100, output_tokens=10, compression_ratio=1.5, @@ -307,7 +150,6 @@ def test_created_at_default(self) -> None: result = SummaryResult( level=SummaryLevel.BRIEF, summary="Test", - hierarchical=None, input_tokens=100, output_tokens=10, compression_ratio=0.1, diff --git a/tests/summarizer/test_utils.py b/tests/summarizer/test_utils.py index 22eb4039e..2621b158e 100644 --- a/tests/summarizer/test_utils.py +++ b/tests/summarizer/test_utils.py @@ -140,37 +140,27 @@ def test_none_level(self) -> None: def test_brief_level(self) -> None: """Test level 1 (BRIEF) compression.""" - # BRIEF: ~20% compression, capped at 50 + # BRIEF: ~20% compression, capped at 50, minimum 20 result = estimate_summary_tokens(100, level=1) assert result >= 20 # minimum of 20 assert result <= 50 # capped at 50 - def test_standard_level(self) -> None: - """Test level 2 (STANDARD) compression.""" - # STANDARD: ~12% compression, capped at 200 + def test_map_reduce_level(self) -> None: + """Test level 2 (MAP_REDUCE) compression.""" + # MAP_REDUCE: ~10% compression, capped at 500, minimum 
50 result = estimate_summary_tokens(1000, level=2) assert result >= 50 # minimum of 50 - assert result <= 200 # capped at 200 - - def test_detailed_level(self) -> None: - """Test level 3 (DETAILED) compression.""" - # DETAILED: ~7% compression, capped at 500 - result = estimate_summary_tokens(10000, level=3) - assert result >= 100 # minimum of 100 assert result <= 500 # capped at 500 - def test_hierarchical_level(self) -> None: - """Test level 4 (HIERARCHICAL) compression.""" - # HIERARCHICAL: base of 1000 + diminishing returns - result = estimate_summary_tokens(50000, level=4) - assert result >= 1000 # base minimum - assert result <= 2000 # capped at 2000 - - def test_hierarchical_small_input(self) -> None: - """Test HIERARCHICAL with smaller input.""" - # Even with small input, should return base - result = estimate_summary_tokens(5000, level=4) - assert result == 1000 # just the base, no additional + def test_map_reduce_large_input(self) -> None: + """Test MAP_REDUCE with large input hits cap.""" + result = estimate_summary_tokens(50000, level=2) + assert result == 500 # capped at 500 + + def test_map_reduce_small_input(self) -> None: + """Test MAP_REDUCE with small input uses floor.""" + result = estimate_summary_tokens(100, level=2) + assert result == 50 # floor of 50 class TestTokensToWords: From 0fce8aa3db38fa125a4fb8752378322a565cb961 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 09:16:57 -0800 Subject: [PATCH 24/37] refactor(summarizer): consolidate shared code to reduce duplication Address review feedback: 1. DRY: Move SummaryOutput, SummarizationError, SummarizerConfig, and generate_summary to _utils.py - eliminates duplicate code between adaptive.py and map_reduce.py 2. Config consolidation: Remove MapReduceConfig, use SummarizerConfig throughout. map_reduce.py now accepts SummarizerConfig directly. 3. 
Document redundant check: The token_max check in map_reduce_summarize is kept as a safety guard for direct calls, with clear documentation explaining it's normally handled by adaptive.py. --- agent_cli/summarizer/_utils.py | 93 +++++++++++++++++++++ agent_cli/summarizer/adaptive.py | 109 ++++-------------------- agent_cli/summarizer/map_reduce.py | 129 +++++++---------------------- tests/summarizer/test_adaptive.py | 16 ++-- 4 files changed, 146 insertions(+), 201 deletions(-) diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py index 8dbfb1ffd..078e21edc 100644 --- a/agent_cli/summarizer/_utils.py +++ b/agent_cli/summarizer/_utils.py @@ -3,15 +3,108 @@ from __future__ import annotations import re +from dataclasses import dataclass from functools import lru_cache from typing import TYPE_CHECKING +from pydantic import BaseModel + from agent_cli.summarizer.models import SummaryLevel if TYPE_CHECKING: import tiktoken +class SummaryOutput(BaseModel): + """Structured output for summary generation.""" + + summary: str + + +class SummarizationError(Exception): + """Raised when summarization fails after all retries.""" + + +@dataclass +class SummarizerConfig: + """Configuration for summarization operations. 
+ + Example: + config = SummarizerConfig( + openai_base_url="http://localhost:8000/v1", + model="llama3.1:8b", + ) + result = await summarize(long_document, config) + print(f"Level: {result.level.name}") + print(f"Compression: {result.compression_ratio:.1%}") + + """ + + openai_base_url: str + model: str + api_key: str | None = None + chunk_size: int = 2048 # BOOOOKSCORE's tested default + token_max: int = 3000 # LangChain's default - when to collapse + chunk_overlap: int = 200 + max_concurrent_chunks: int = 5 + timeout: float = 60.0 + + def __post_init__(self) -> None: + """Normalize the base URL.""" + self.openai_base_url = self.openai_base_url.rstrip("/") + if self.api_key is None: + self.api_key = "not-needed" + + +async def generate_summary( + prompt: str, + config: SummarizerConfig, + max_tokens: int = 256, +) -> str: + """Call the LLM to generate a summary. + + Args: + prompt: The prompt to send to the LLM. + config: Summarizer configuration. + max_tokens: Maximum tokens for the response. + + Returns: + The generated summary text. + + Raises: + SummarizationError: If the LLM call fails. + + """ + from pydantic_ai import Agent # noqa: PLC0415 + from pydantic_ai.models.openai import OpenAIChatModel # noqa: PLC0415 + from pydantic_ai.providers.openai import OpenAIProvider # noqa: PLC0415 + from pydantic_ai.settings import ModelSettings # noqa: PLC0415 + + provider = OpenAIProvider(api_key=config.api_key, base_url=config.openai_base_url) + model = OpenAIChatModel( + model_name=config.model, + provider=provider, + settings=ModelSettings( + temperature=0.3, + max_tokens=max_tokens, + ), + ) + + agent = Agent( + model=model, + system_prompt="You are a concise summarizer. 
Output only the summary, no preamble.", + output_type=SummaryOutput, + retries=2, + ) + + try: + result = await agent.run(prompt) + return result.output.summary.strip() + except Exception as e: + msg = f"Summarization failed: {e}" + raise SummarizationError(msg) from e + + @lru_cache(maxsize=4) def _get_encoding(model: str = "gpt-4") -> tiktoken.Encoding | None: """Get tiktoken encoding for a model, with caching. diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 39669e97d..b03a84e6c 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -19,9 +19,6 @@ from __future__ import annotations import logging -from dataclasses import dataclass - -from pydantic import BaseModel from agent_cli.summarizer._prompts import ( BRIEF_SUMMARY_PROMPT, @@ -29,12 +26,14 @@ get_prompt_for_content_type, ) from agent_cli.summarizer._utils import ( + SummarizationError, + SummarizerConfig, count_tokens, estimate_summary_tokens, + generate_summary, tokens_to_words, ) from agent_cli.summarizer.map_reduce import ( - MapReduceConfig, MapReduceSummarizationError, map_reduce_summarize, ) @@ -49,46 +48,15 @@ THRESHOLD_NONE = 100 # Below this, no summary needed THRESHOLD_BRIEF = 500 # Below this, just a single sentence - -class SummaryOutput(BaseModel): - """Structured output for summary generation.""" - - summary: str - - -class SummarizationError(Exception): - """Raised when summarization fails after all retries.""" - - -@dataclass -class SummarizerConfig: - """Configuration for summarization operations. 
- - Example: - config = SummarizerConfig( - openai_base_url="http://localhost:8000/v1", - model="llama3.1:8b", - ) - result = await summarize(long_document, config) - print(f"Level: {result.level.name}") - print(f"Compression: {result.compression_ratio:.1%}") - - """ - - openai_base_url: str - model: str - api_key: str | None = None - chunk_size: int = 2048 # BOOOOKSCORE's tested default - token_max: int = 3000 # LangChain's default - when to collapse - chunk_overlap: int = 200 - max_concurrent_chunks: int = 5 - timeout: float = 60.0 - - def __post_init__(self) -> None: - """Normalize the base URL.""" - self.openai_base_url = self.openai_base_url.rstrip("/") - if self.api_key is None: - self.api_key = "not-needed" +# Re-export for backwards compatibility +__all__ = [ + "THRESHOLD_BRIEF", + "THRESHOLD_NONE", + "SummarizationError", + "SummarizerConfig", + "determine_level", + "summarize", +] def determine_level(token_count: int) -> SummaryLevel: @@ -175,7 +143,7 @@ async def summarize( async def _brief_summary(content: str, config: SummarizerConfig) -> str: """Generate a single-sentence summary for brief content.""" prompt = BRIEF_SUMMARY_PROMPT.format(content=content) - return await _generate_summary(prompt, config, max_tokens=50) + return await generate_summary(prompt, config, max_tokens=50) async def _map_reduce_summary( @@ -200,19 +168,8 @@ async def _map_reduce_summary( ) # Use map-reduce for multi-chunk content - mr_config = MapReduceConfig( - openai_base_url=config.openai_base_url, - model=config.model, - api_key=config.api_key, - chunk_size=config.chunk_size, - token_max=config.token_max, - chunk_overlap=config.chunk_overlap, - max_concurrent=config.max_concurrent_chunks, - timeout=config.timeout, - ) - try: - result = await map_reduce_summarize(content, mr_config) + result = await map_reduce_summarize(content, config) except MapReduceSummarizationError as e: raise SummarizationError(str(e)) from e @@ -248,40 +205,4 @@ async def _content_aware_summary( 
max_words=max_words, ) - return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) - - -async def _generate_summary( - prompt: str, - config: SummarizerConfig, - max_tokens: int = 256, -) -> str: - """Call the LLM to generate a summary. Raises SummarizationError on failure.""" - from pydantic_ai import Agent # noqa: PLC0415 - from pydantic_ai.models.openai import OpenAIChatModel # noqa: PLC0415 - from pydantic_ai.providers.openai import OpenAIProvider # noqa: PLC0415 - from pydantic_ai.settings import ModelSettings # noqa: PLC0415 - - provider = OpenAIProvider(api_key=config.api_key, base_url=config.openai_base_url) - model = OpenAIChatModel( - model_name=config.model, - provider=provider, - settings=ModelSettings( - temperature=0.3, - max_tokens=max_tokens, - ), - ) - - agent = Agent( - model=model, - system_prompt="You are a concise summarizer. Output only the summary, no preamble.", - output_type=SummaryOutput, - retries=2, - ) - - try: - result = await agent.run(prompt) - return result.output.summary.strip() - except Exception as e: - msg = f"Summarization failed: {e}" - raise SummarizationError(msg) from e + return await generate_summary(prompt, config, max_tokens=target_tokens + 50) diff --git a/agent_cli/summarizer/map_reduce.py b/agent_cli/summarizer/map_reduce.py index 09d82d09c..76365e2d3 100644 --- a/agent_cli/summarizer/map_reduce.py +++ b/agent_cli/summarizer/map_reduce.py @@ -19,17 +19,18 @@ import logging from dataclasses import dataclass -from pydantic import BaseModel - from agent_cli.summarizer._prompts import ( CHUNK_SUMMARY_PROMPT, META_SUMMARY_PROMPT, format_summaries_for_meta, ) from agent_cli.summarizer._utils import ( + SummarizationError, + SummarizerConfig, chunk_text, count_tokens, estimate_summary_tokens, + generate_summary, tokens_to_words, ) from agent_cli.summarizer.models import SummaryLevel @@ -37,52 +38,10 @@ logger = logging.getLogger(__name__) -class SummaryOutput(BaseModel): - """Structured output for summary 
generation.""" - - summary: str - - -class MapReduceSummarizationError(Exception): +class MapReduceSummarizationError(SummarizationError): """Raised when map-reduce summarization fails.""" -@dataclass -class MapReduceConfig: - """Configuration for map-reduce summarization. - - Attributes: - openai_base_url: Base URL for OpenAI-compatible API. - model: Model name for summarization. - api_key: Optional API key. - chunk_size: Target size for splitting content (tokens). - LangChain uses 3000, BOOOOKSCORE suggests 2048. - token_max: Maximum tokens for combined summaries before collapsing. - When combined summaries exceed this, we recursively reduce. - chunk_overlap: Overlap between chunks for context continuity. - max_concurrent: Maximum parallel summarization calls. - timeout: Timeout for API calls in seconds. - max_collapse_depth: Safety limit on recursive collapse depth. - - """ - - openai_base_url: str - model: str - api_key: str | None = None - chunk_size: int = 2048 # BOOOOKSCORE's tested default - token_max: int = 3000 # LangChain's default - chunk_overlap: int = 200 - max_concurrent: int = 5 - timeout: float = 60.0 - max_collapse_depth: int = 10 # Safety limit - - def __post_init__(self) -> None: - """Normalize the base URL.""" - self.openai_base_url = self.openai_base_url.rstrip("/") - if self.api_key is None: - self.api_key = "not-needed" - - @dataclass class MapReduceResult: """Result of map-reduce summarization. @@ -107,19 +66,24 @@ class MapReduceResult: async def map_reduce_summarize( content: str, - config: MapReduceConfig, + config: SummarizerConfig, + max_collapse_depth: int = 10, ) -> MapReduceResult: """Summarize content using map-reduce with dynamic collapse. Algorithm: - 1. If content fits in token_max, summarize directly - 2. Otherwise, split into chunks and summarize each (map phase) - 3. If combined summaries exceed token_max, recursively collapse (reduce phase) - 4. Continue until everything fits in token_max + 1. 
Split into chunks and summarize each (map phase) + 2. If combined summaries exceed token_max, recursively collapse (reduce phase) + 3. Continue until everything fits in token_max + + Note: This function assumes content exceeds token_max. The caller (adaptive.py) + handles the case where content fits in a single chunk. The check below is a + safety guard for direct calls to this function. Args: content: The content to summarize. - config: Map-reduce configuration. + config: Summarizer configuration. + max_collapse_depth: Safety limit on recursive collapse depth. Returns: MapReduceResult with summary and metadata. @@ -137,7 +101,8 @@ async def map_reduce_summarize( input_tokens = count_tokens(content, config.model) - # If content already fits, just summarize directly + # Safety guard: if content fits in token_max, summarize directly. + # Normally handled by adaptive.py, but kept for direct calls to this function. if input_tokens <= config.token_max: summary = await _summarize_text(content, config) output_tokens = count_tokens(summary, config.model) @@ -166,10 +131,10 @@ async def map_reduce_summarize( depth = 0 while _total_tokens(summaries, config.model) > config.token_max: depth += 1 - if depth > config.max_collapse_depth: + if depth > max_collapse_depth: logger.warning( "Hit max collapse depth %d, forcing final summary", - config.max_collapse_depth, + max_collapse_depth, ) break @@ -205,9 +170,9 @@ def _total_tokens(texts: list[str], model: str) -> int: return sum(count_tokens(t, model) for t in texts) -async def _map_summarize(chunks: list[str], config: MapReduceConfig) -> list[str]: +async def _map_summarize(chunks: list[str], config: SummarizerConfig) -> list[str]: """Summarize each chunk in parallel (map phase).""" - semaphore = asyncio.Semaphore(config.max_concurrent) + semaphore = asyncio.Semaphore(config.max_concurrent_chunks) total = len(chunks) async def summarize_chunk(idx: int, chunk: str) -> str: @@ -222,7 +187,7 @@ async def _summarize_chunk( chunk: 
str, chunk_index: int, total_chunks: int, - config: MapReduceConfig, + config: SummarizerConfig, ) -> str: """Summarize a single chunk.""" source_tokens = count_tokens(chunk, config.model) @@ -236,12 +201,12 @@ async def _summarize_chunk( max_words=max_words, ) - return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) + return await generate_summary(prompt, config, max_tokens=target_tokens + 50) async def _collapse_summaries( summaries: list[str], - config: MapReduceConfig, + config: SummarizerConfig, ) -> list[str]: """Collapse summaries by grouping and re-summarizing (reduce phase). @@ -272,7 +237,7 @@ async def _collapse_summaries( groups.append(current_group) # Summarize each group in parallel - semaphore = asyncio.Semaphore(config.max_concurrent) + semaphore = asyncio.Semaphore(config.max_concurrent_chunks) async def summarize_group(group: list[str]) -> str: async with semaphore: @@ -282,7 +247,7 @@ async def summarize_group(group: list[str]) -> str: return list(await asyncio.gather(*tasks)) -async def _synthesize(summaries: list[str], config: MapReduceConfig) -> str: +async def _synthesize(summaries: list[str], config: SummarizerConfig) -> str: """Synthesize multiple summaries into one.""" combined_tokens = sum(count_tokens(s, config.model) for s in summaries) target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.MAP_REDUCE) @@ -293,10 +258,10 @@ async def _synthesize(summaries: list[str], config: MapReduceConfig) -> str: max_words=max_words, ) - return await _generate_summary(prompt, config, max_tokens=target_tokens + 100) + return await generate_summary(prompt, config, max_tokens=target_tokens + 100) -async def _summarize_text(text: str, config: MapReduceConfig) -> str: +async def _summarize_text(text: str, config: SummarizerConfig) -> str: """Summarize text that fits within token_max.""" input_tokens = count_tokens(text, config.model) target_tokens = estimate_summary_tokens(input_tokens, SummaryLevel.MAP_REDUCE) @@ 
-310,40 +275,4 @@ async def _summarize_text(text: str, config: MapReduceConfig) -> str: Summary:""" - return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) - - -async def _generate_summary( - prompt: str, - config: MapReduceConfig, - max_tokens: int = 256, -) -> str: - """Call the LLM to generate a summary.""" - from pydantic_ai import Agent # noqa: PLC0415 - from pydantic_ai.models.openai import OpenAIChatModel # noqa: PLC0415 - from pydantic_ai.providers.openai import OpenAIProvider # noqa: PLC0415 - from pydantic_ai.settings import ModelSettings # noqa: PLC0415 - - provider = OpenAIProvider(api_key=config.api_key, base_url=config.openai_base_url) - model = OpenAIChatModel( - model_name=config.model, - provider=provider, - settings=ModelSettings( - temperature=0.3, - max_tokens=max_tokens, - ), - ) - - agent = Agent( - model=model, - system_prompt="You are a concise summarizer. Output only the summary, no preamble.", - output_type=SummaryOutput, - retries=2, - ) - - try: - result = await agent.run(prompt) - return result.output.summary.strip() - except Exception as e: - msg = f"Map-reduce summarization failed: {e}" - raise MapReduceSummarizationError(msg) from e + return await generate_summary(prompt, config, max_tokens=target_tokens + 50) diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py index a64a72a16..202a55921 100644 --- a/tests/summarizer/test_adaptive.py +++ b/tests/summarizer/test_adaptive.py @@ -6,13 +6,15 @@ import pytest -from agent_cli.summarizer.adaptive import ( - THRESHOLD_BRIEF, - THRESHOLD_NONE, +from agent_cli.summarizer._utils import ( SummarizationError, SummarizerConfig, SummaryOutput, - _generate_summary, + generate_summary, +) +from agent_cli.summarizer.adaptive import ( + THRESHOLD_BRIEF, + THRESHOLD_NONE, determine_level, summarize, ) @@ -256,7 +258,7 @@ async def test_very_long_content_uses_map_reduce( class TestGenerateSummary: - """Tests for _generate_summary function.""" + 
"""Tests for generate_summary function.""" @pytest.fixture def config(self) -> SummarizerConfig: @@ -281,7 +283,7 @@ async def test_generate_summary_with_pydantic_ai( mock_agent.run = AsyncMock(return_value=mock_result) mock_agent_class.return_value = mock_agent - result = await _generate_summary("Test prompt", config, max_tokens=100) + result = await generate_summary("Test prompt", config, max_tokens=100) assert result == "Generated summary." mock_agent.run.assert_called_once_with("Test prompt") @@ -298,7 +300,7 @@ async def test_raises_summarization_error_on_failure( mock_agent_class.return_value = mock_agent with pytest.raises(SummarizationError, match="Summarization failed"): - await _generate_summary("Test prompt", config, max_tokens=100) + await generate_summary("Test prompt", config, max_tokens=100) class TestSummaryOutput: From 38cce558a15f7ef0d4af7b401d28c3b2aa1dd6b9 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 09:23:57 -0800 Subject: [PATCH 25/37] refactor(summarizer): remove redundant _summarize_text and safety guard - Remove _summarize_text function with hardcoded prompt (use centralized prompts in _prompts.py via adaptive.py instead) - Remove redundant token_max safety guard from map_reduce_summarize - Update docstring to clarify function is designed for content exceeding token_max, directing users to adaptive.summarize() for proper routing --- agent_cli/summarizer/map_reduce.py | 37 +++--------------------------- 1 file changed, 3 insertions(+), 34 deletions(-) diff --git a/agent_cli/summarizer/map_reduce.py b/agent_cli/summarizer/map_reduce.py index 76365e2d3..93aaabd8c 100644 --- a/agent_cli/summarizer/map_reduce.py +++ b/agent_cli/summarizer/map_reduce.py @@ -76,9 +76,9 @@ async def map_reduce_summarize( 2. If combined summaries exceed token_max, recursively collapse (reduce phase) 3. Continue until everything fits in token_max - Note: This function assumes content exceeds token_max. 
The caller (adaptive.py) - handles the case where content fits in a single chunk. The check below is a - safety guard for direct calls to this function. + Note: This function is designed for content that exceeds token_max. For shorter + content, use the main `summarize()` function in adaptive.py which selects the + appropriate strategy (NONE, BRIEF, or MAP_REDUCE with content-aware prompts). Args: content: The content to summarize. @@ -101,20 +101,6 @@ async def map_reduce_summarize( input_tokens = count_tokens(content, config.model) - # Safety guard: if content fits in token_max, summarize directly. - # Normally handled by adaptive.py, but kept for direct calls to this function. - if input_tokens <= config.token_max: - summary = await _summarize_text(content, config) - output_tokens = count_tokens(summary, config.model) - return MapReduceResult( - summary=summary, - input_tokens=input_tokens, - output_tokens=output_tokens, - compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, - collapse_depth=0, - intermediate_summaries=[], - ) - # Map phase: Split and summarize chunks in parallel chunks = chunk_text( content, @@ -259,20 +245,3 @@ async def _synthesize(summaries: list[str], config: SummarizerConfig) -> str: ) return await generate_summary(prompt, config, max_tokens=target_tokens + 100) - - -async def _summarize_text(text: str, config: SummarizerConfig) -> str: - """Summarize text that fits within token_max.""" - input_tokens = count_tokens(text, config.model) - target_tokens = estimate_summary_tokens(input_tokens, SummaryLevel.MAP_REDUCE) - max_words = tokens_to_words(target_tokens) - - prompt = f"""Summarize the following content in {max_words} words or less. -Focus on the key points and main ideas. 
- -Content: -{text} - -Summary:""" - - return await generate_summary(prompt, config, max_tokens=target_tokens + 50) From c38e305ae886320577e09ff22f9ec42dbda1192b Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 09:34:11 -0800 Subject: [PATCH 26/37] refactor(summarizer): remove redundant exception re-wrapping MapReduceSummarizationError already inherits from SummarizationError, so catching and re-raising serves no purpose. --- agent_cli/summarizer/adaptive.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index b03a84e6c..c5ba092eb 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -33,10 +33,7 @@ generate_summary, tokens_to_words, ) -from agent_cli.summarizer.map_reduce import ( - MapReduceSummarizationError, - map_reduce_summarize, -) +from agent_cli.summarizer.map_reduce import map_reduce_summarize from agent_cli.summarizer.models import ( SummaryLevel, SummaryResult, @@ -168,10 +165,7 @@ async def _map_reduce_summary( ) # Use map-reduce for multi-chunk content - try: - result = await map_reduce_summarize(content, config) - except MapReduceSummarizationError as e: - raise SummarizationError(str(e)) from e + result = await map_reduce_summarize(content, config) return SummaryResult( level=SummaryLevel.MAP_REDUCE, From 349942b95a23506b078e4d74d432f7d66f4bc0e1 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 09:35:21 -0800 Subject: [PATCH 27/37] refactor(summarizer): remove defensive guards for impossible conditions - Remove empty content check in map_reduce_summarize (caller validates) - Remove 'if summary else 0' guards (generate_summary never returns None) - Remove 'if input_tokens > 0' guards (input is guaranteed non-empty) - Remove 'if summaries else ""' guard (summaries always has content) --- agent_cli/summarizer/adaptive.py | 8 ++++---- agent_cli/summarizer/map_reduce.py | 14 ++------------ 2 
files changed, 6 insertions(+), 16 deletions(-) diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index c5ba092eb..640c52e60 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -118,13 +118,13 @@ async def summarize( if level == SummaryLevel.BRIEF: summary = await _brief_summary(content, config) - output_tokens = count_tokens(summary, config.model) if summary else 0 + output_tokens = count_tokens(summary, config.model) return SummaryResult( level=level, summary=summary, input_tokens=input_tokens, output_tokens=output_tokens, - compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + compression_ratio=output_tokens / input_tokens, ) # MAP_REDUCE level @@ -154,13 +154,13 @@ async def _map_reduce_summary( # For content that fits in a single chunk, use content-type aware summary if input_tokens <= config.token_max: summary = await _content_aware_summary(content, config, prior_summary, content_type) - output_tokens = count_tokens(summary, config.model) if summary else 0 + output_tokens = count_tokens(summary, config.model) return SummaryResult( level=SummaryLevel.MAP_REDUCE, summary=summary, input_tokens=input_tokens, output_tokens=output_tokens, - compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + compression_ratio=output_tokens / input_tokens, collapse_depth=0, ) diff --git a/agent_cli/summarizer/map_reduce.py b/agent_cli/summarizer/map_reduce.py index 93aaabd8c..07332c1cf 100644 --- a/agent_cli/summarizer/map_reduce.py +++ b/agent_cli/summarizer/map_reduce.py @@ -89,16 +89,6 @@ async def map_reduce_summarize( MapReduceResult with summary and metadata. 
""" - if not content or not content.strip(): - return MapReduceResult( - summary="", - input_tokens=0, - output_tokens=0, - compression_ratio=0.0, - collapse_depth=0, - intermediate_summaries=[], - ) - input_tokens = count_tokens(content, config.model) # Map phase: Split and summarize chunks in parallel @@ -137,7 +127,7 @@ async def map_reduce_summarize( if len(summaries) > 1: final_summary = await _synthesize(summaries, config) else: - final_summary = summaries[0] if summaries else "" + final_summary = summaries[0] output_tokens = count_tokens(final_summary, config.model) @@ -145,7 +135,7 @@ async def map_reduce_summarize( summary=final_summary, input_tokens=input_tokens, output_tokens=output_tokens, - compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + compression_ratio=output_tokens / input_tokens, collapse_depth=depth, intermediate_summaries=intermediate_summaries, ) From aef0e9cc02deb98a8ff14581a2912724441e24d2 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 09:59:24 -0800 Subject: [PATCH 28/37] feat(scripts): add summarizer comparison script with needle-in-haystack test Compares old L1-L4 hierarchical vs new adaptive map-reduce approach: - Shows which level each system would use - Runs new summarizer and measures fact preservation - Uses specific 'needle' facts embedded in test content --- scripts/compare_summarizers.py | 402 +++++++++++++++++++++++++++++++++ 1 file changed, 402 insertions(+) create mode 100644 scripts/compare_summarizers.py diff --git a/scripts/compare_summarizers.py b/scripts/compare_summarizers.py new file mode 100644 index 000000000..15265cb0e --- /dev/null +++ b/scripts/compare_summarizers.py @@ -0,0 +1,402 @@ +"""Compare old (L1-L4 hierarchical) vs new (adaptive map-reduce) summarizer. + +This script: +1. Shows what level each system would use for test content +2. Runs the NEW summarizer to produce actual summaries +3. Evaluates summary quality using needle-in-haystack questions +4. 
Uses LLM-as-judge for quality assessment + +Usage: + python scripts/compare_summarizers.py + python scripts/compare_summarizers.py --model "gpt-4o-mini" --base-url "https://api.openai.com/v1" +""" + +from __future__ import annotations + +import argparse +import asyncio +import os +import textwrap +from dataclasses import dataclass, field + +from agent_cli.summarizer import SummarizerConfig, summarize +from agent_cli.summarizer._utils import count_tokens + +# Old system thresholds +OLD_THRESHOLD_NONE = 100 +OLD_THRESHOLD_BRIEF = 500 +OLD_THRESHOLD_STANDARD = 3000 +OLD_THRESHOLD_DETAILED = 15000 + +# New system thresholds +NEW_THRESHOLD_NONE = 100 +NEW_THRESHOLD_BRIEF = 500 + +# Evaluation threshold +FACT_PRESERVATION_THRESHOLD = 0.5 + +# Test content at different sizes with embedded "needles" (specific facts) +TEST_CASES = [ + { + "name": "Brief Range (~300 tokens)", + "description": "Tests the 100-500 token range where OLD=BRIEF, NEW=BRIEF", + "content": """ + The artificial intelligence revolution is transforming every industry. + Machine learning algorithms now power recommendation systems, fraud detection, + and autonomous vehicles. Deep learning, a subset of machine learning, uses + neural networks with multiple layers to analyze complex patterns in data. + + Major tech companies are investing billions in AI research. Google's DeepMind + created AlphaGo, which defeated world champion Lee Sedol in March 2016 in + the ancient game of Go. OpenAI developed GPT models that can generate + human-like text. These advances raise both excitement and concerns about + the future of work and society. + + Researchers are working on making AI systems more transparent and aligned with + human values. The field of AI safety, pioneered by researchers like Stuart + Russell at UC Berkeley, aims to ensure that advanced AI systems remain + beneficial and under human control. 
+ """, + "needles": [ + ("Who did AlphaGo defeat?", "Lee Sedol"), + ("When did AlphaGo win?", "March 2016"), + ("Who pioneered AI safety?", "Stuart Russell"), + ("Where does Stuart Russell work?", "UC Berkeley"), + ], + }, + { + "name": "Standard/MapReduce Range (~900 tokens)", + "description": "Tests 500-3000 range where OLD=STANDARD, NEW=MAP_REDUCE", + "content": """ + Climate change represents one of the most pressing challenges facing humanity. + The Earth's average temperature has risen approximately 1.1 degrees Celsius since + the pre-industrial era, primarily due to human activities that release greenhouse + gases. Carbon dioxide from burning fossil fuels accounts for 76% of emissions. + + The Intergovernmental Panel on Climate Change (IPCC), led by chair Hoesung Lee, + has warned that limiting warming to 1.5 degrees Celsius is crucial. The 2021 + report involved 234 authors from 66 countries analyzing over 14,000 scientific + papers. Their conclusion: human influence has warmed the climate at a rate + unprecedented in at least the last 2,000 years. + + Renewable energy offers hope. Solar panel costs dropped 89% between 2010 and 2020, + making solar competitive with fossil fuels. China leads with 306 gigawatts of + installed solar capacity. Wind energy has grown exponentially, with Denmark + generating 47% of its electricity from wind in 2019. + + Electric vehicles are gaining ground. Tesla delivered 936,172 vehicles in 2021, + while traditional automakers race to electrify. Norway leads adoption, with + electric vehicles representing 65% of new car sales in 2021. Battery costs + have fallen 89% since 2010, from $1,100 to $132 per kilowatt-hour. + + Carbon capture remains expensive at $250-$600 per ton of CO2. The Orca plant + in Iceland, opened in September 2021, captures just 4,000 tons annually. + Critics note this equals emissions from about 870 cars. More radical approaches + like solar radiation management could cool the planet but carry unknown risks. 
+ + The Paris Agreement, signed by 196 parties in December 2015, aims to limit + warming to well below 2 degrees. Countries submit Nationally Determined + Contributions (NDCs) outlining their emission reduction plans. However, + current pledges put the world on track for 2.7 degrees of warming by 2100. + + Individual actions matter but systemic change is essential. Agriculture accounts + for 10-12% of global emissions. Beef production generates 60 kg of CO2 equivalent + per kilogram of meat. A plant-based diet could reduce food emissions by up to 73%. + """, + "needles": [ + ("Who chairs the IPCC?", "Hoesung Lee"), + ("How many authors contributed to the 2021 IPCC report?", "234"), + ("What percent of Denmark's electricity comes from wind?", "47%"), + ("When did the Orca plant open?", "September 2021"), + ("How many vehicles did Tesla deliver in 2021?", "936,172"), + ("What percent of Norway's new cars are electric?", "65%"), + ("When was the Paris Agreement signed?", "December 2015"), + ("How much CO2 does beef production generate per kg?", "60 kg"), + ], + }, + { + "name": "Detailed/MapReduce Range (~1800 tokens)", + "description": "Tests larger content where OLD=DETAILED (chunks+meta), NEW=MAP_REDUCE", + "content": """ + The history of computing spans centuries of human innovation, from ancient + calculating devices to quantum computers. Understanding this evolution reveals + how incremental advances compound into revolutionary change. + + Ancient Foundations (2400 BCE - 1600 CE) + + The abacus emerged independently in multiple civilizations. Chinese merchants + used the suanpan as early as 2400 BCE for arithmetic. The Roman abacus used + grooved beads, while the Japanese soroban featured a distinctive 1:4 bead + arrangement still used today. + + Mechanical Calculation (1600-1900) + + In 1642, nineteen-year-old Blaise Pascal invented the Pascaline to help his + tax-collector father. This brass rectangular box could add and subtract using + interlocking gears. 
Only 50 were built, and 9 survive in museums today. + + Gottfried Wilhelm Leibniz improved Pascal's design in 1694, creating the + Stepped Reckoner capable of multiplication and division. He also invented + binary arithmetic, writing "Explication de l'Arithmétique Binaire" in 1703, + laying groundwork for digital computing. + + Charles Babbage designed the Analytical Engine from 1833-1871, incorporating + a mill (processor), store (memory), and punch card input. Ada Lovelace wrote + detailed notes including what's considered the first algorithm - for computing + Bernoulli numbers. The engine was never completed; Babbage died in 1871. + + Electronic Era (1900-1970) + + Alan Turing published "On Computable Numbers" in 1936, defining the theoretical + Turing machine. During WWII, he led the team at Bletchley Park that cracked + the Enigma code, shortening the war by an estimated two years. + + ENIAC, completed February 14, 1946, at the University of Pennsylvania, was + the first general-purpose electronic computer. It weighed 30 tons, consumed + 150 kilowatts, and contained 17,468 vacuum tubes. Programming required + physically rewiring the machine, taking days for each new problem. + + The transistor, invented December 23, 1947, at Bell Labs by John Bardeen, + Walter Brattain, and William Shockley, revolutionized electronics. They + shared the 1956 Nobel Prize in Physics. By 1954, the TRADIC computer used + 800 transistors instead of vacuum tubes. + + Jack Kilby demonstrated the first integrated circuit on September 12, 1958, + at Texas Instruments. Robert Noyce independently developed a superior silicon + version at Fairchild. Kilby won the 2000 Nobel Prize; Noyce had died in 1990. + + Personal Computing (1970-2000) + + Intel's 4004, released November 15, 1971, was the first commercial microprocessor. + Designed by Federico Faggin, it contained 2,300 transistors running at 740 kHz. + The 8080 (1974) powered the Altair 8800, sparking the PC revolution. 
+ + Steve Wozniak built the Apple I in 1976 in his garage. The Apple II (1977) + featured color graphics and cost $1,298. IBM entered with the PC on August 12, + 1981, using Microsoft's MS-DOS. By 1984, Apple's Macintosh introduced the GUI + to mainstream users at $2,495. + + Tim Berners-Lee invented the World Wide Web at CERN in 1989, proposing it + on March 12. The first website went live December 20, 1990. By 1995, the + internet had 16 million users; by 2000, 361 million. + + Modern Era (2000-Present) + + Moore's Law, predicting transistor doubling every two years, has held since + Gordon Moore's 1965 observation. Intel's 2021 Alder Lake processors contain + 10+ billion transistors on chips measuring 215 mm². + + Steve Jobs unveiled the iPhone on January 9, 2007. It sold 1.4 million units + in its first year. Smartphones now exceed 6.6 billion globally, containing + more power than 1990s supercomputers. + + Google claimed quantum supremacy October 23, 2019, with Sycamore completing + a calculation in 200 seconds that would take 10,000 years classically. + IBM disputed this, but the quantum era has clearly begun. 
+ """, + "needles": [ + ("How old was Pascal when he invented the Pascaline?", "19"), + ("When did Leibniz write about binary arithmetic?", "1703"), + ("How many vacuum tubes did ENIAC contain?", "17,468"), + ("When was the transistor invented?", "December 23, 1947"), + ("When did Jack Kilby demonstrate the integrated circuit?", "September 12, 1958"), + ("How many transistors did the Intel 4004 have?", "2,300"), + ("When did the first website go live?", "December 20, 1990"), + ("When did Jobs unveil the iPhone?", "January 9, 2007"), + ("When did Google claim quantum supremacy?", "October 23, 2019"), + ], + }, +] + + +def get_old_level(tokens: int) -> tuple[str, str]: + """Determine what level the OLD (L1-L4) summarizer would use.""" + if tokens < OLD_THRESHOLD_NONE: + return "NONE", "No summary needed" + if tokens < OLD_THRESHOLD_BRIEF: + return "BRIEF", "Single sentence (~20% compression)" + if tokens < OLD_THRESHOLD_STANDARD: + return "STANDARD", "Paragraph with content-aware prompts (~12%)" + if tokens < OLD_THRESHOLD_DETAILED: + return "DETAILED", "Chunked L1 summaries + meta L3 (~7%)" + return "HIERARCHICAL", "Full L1/L2/L3 tree structure" + + +def get_new_level(tokens: int) -> tuple[str, str]: + """Determine what level the NEW (adaptive) summarizer would use.""" + if tokens < NEW_THRESHOLD_NONE: + return "NONE", "No summary needed" + if tokens < NEW_THRESHOLD_BRIEF: + return "BRIEF", "Single sentence" + return "MAP_REDUCE", "Dynamic collapse based on content" + + +@dataclass +class TestResult: + """Result of testing one content sample.""" + + name: str + tokens: int + old_level: str + old_description: str + new_level: str + new_description: str + new_summary: str | None = None + needles_found: int = 0 + total_needles: int = 0 + needle_details: list[tuple[str, str, bool]] = field(default_factory=list) + + +async def run_test(test_case: dict, config: dict) -> TestResult: + """Run a single test case.""" + content = test_case["content"].strip() + tokens = 
count_tokens(content, config["model"]) + + old_level, old_desc = get_old_level(tokens) + new_level, new_desc = get_new_level(tokens) + + # Run new summarizer + cfg = SummarizerConfig( + openai_base_url=config["base_url"], + model=config["model"], + api_key=config.get("api_key", "not-needed"), + ) + + result = await summarize(content, cfg, content_type="document") + + # Check needles in summary + needle_details = [] + needles_found = 0 + + if result.summary: + summary_lower = result.summary.lower() + for question, answer in test_case["needles"]: + # Check if the key fact is preserved + found = answer.lower() in summary_lower + needle_details.append((question, answer, found)) + if found: + needles_found += 1 + + return TestResult( + name=test_case["name"], + tokens=tokens, + old_level=old_level, + old_description=old_desc, + new_level=new_level, + new_description=new_desc, + new_summary=result.summary, + needles_found=needles_found, + total_needles=len(test_case["needles"]), + needle_details=needle_details, + ) + + +def print_result(result: TestResult) -> None: + """Print a test result.""" + print(f"\n{'=' * 70}") + print(f"{result.name}") + print(f"{'=' * 70}") + print(f"Input tokens: {result.tokens}") + print() + print("Level comparison:") + print(f" OLD: {result.old_level:12} - {result.old_description}") + print(f" NEW: {result.new_level:12} - {result.new_description}") + print() + + if result.new_summary: + print("New summary:") + wrapped = textwrap.fill( + result.new_summary, + width=68, + initial_indent=" ", + subsequent_indent=" ", + ) + print(wrapped) + print() + + print( + f"Needle-in-haystack test: {result.needles_found}/{result.total_needles} facts preserved", + ) + for question, answer, found in result.needle_details: + status = "[OK]" if found else "[MISSING]" + print(f" {status} {question} -> {answer}") + else: + print("No summary produced (NONE level)") + + +async def main() -> None: + """Run all tests.""" + parser = 
argparse.ArgumentParser(description="Compare summarizer versions") + parser.add_argument("--model", default=os.environ.get("OPENAI_MODEL", "gpt-oss-high:20b")) + parser.add_argument( + "--base-url", + default=os.environ.get("OPENAI_BASE_URL", "http://192.168.1.143:9292/v1"), + ) + parser.add_argument("--api-key", default=os.environ.get("OPENAI_API_KEY", "not-needed")) + args = parser.parse_args() + + config = { + "model": args.model, + "base_url": args.base_url, + "api_key": args.api_key, + } + + print("=" * 70) + print("SUMMARIZER COMPARISON: OLD (L1-L4) vs NEW (Adaptive Map-Reduce)") + print("=" * 70) + print(f"Model: {config['model']}") + print(f"Base URL: {config['base_url']}") + + results = [] + for test in TEST_CASES: + print(f"\nRunning: {test['name']}...") + result = await run_test(test, config) + results.append(result) + print_result(result) + + # Summary + print("\n" + "=" * 70) + print("SUMMARY") + print("=" * 70) + + total_needles = sum(r.total_needles for r in results) + found_needles = sum(r.needles_found for r in results) + + print( + f"\nOverall fact preservation: {found_needles}/{total_needles} ({100 * found_needles / total_needles:.1f}%)", + ) + print() + + print("Key differences:") + print(""" +OLD System (5 levels): + - NONE (<100), BRIEF (100-500), STANDARD (500-3000), + DETAILED (3000-15000), HIERARCHICAL (>15000) + - Fixed boundaries, L1/L2/L3 tree for large content + - Stored intermediate summaries at each level + - Chunk size: 3000 tokens + +NEW System (3 levels): + - NONE (<100), BRIEF (100-500), MAP_REDUCE (>=500) + - Dynamic collapse depth based on content + - Content-type aware prompts + - Chunk size: 2048 tokens (BOOOOKSCORE research) + - Only stores final summary + +Trade-offs: + + Simpler (3 levels vs 5) + + Research-backed parameters + + Content-aware prompts + - No intermediate level access + - All >=500 token content treated the same +""") + + print("Verdict: ", end="") + if found_needles / total_needles >= 
FACT_PRESERVATION_THRESHOLD: + print("NEW system preserves facts adequately") + else: + print("NEW system may lose important details - further tuning needed") + + +if __name__ == "__main__": + asyncio.run(main()) From 86500c5b585d3d18eec31130d147c7d10230fcb7 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 10:05:07 -0800 Subject: [PATCH 29/37] docs(summarizer): update architecture doc to reflect current implementation - Remove references to old L1-L4/STANDARD/DETAILED/HIERARCHICAL levels - Remove HierarchicalSummary and ChunkSummary (no longer exist) - Update storage format to show single summary entry - Add new section on limitations and trade-offs - Simplify error handling section - Add data models section with current code --- docs/architecture/summarizer.md | 231 ++++++++++++++++++-------------- 1 file changed, 128 insertions(+), 103 deletions(-) diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md index c34540bc1..43caf336d 100644 --- a/docs/architecture/summarizer.md +++ b/docs/architecture/summarizer.md @@ -37,7 +37,7 @@ LangChain's approach to document summarization uses a simple algorithm: 1. **Map phase:** Split content into chunks, summarize each in parallel 2. **Reduce phase:** If combined summaries exceed `token_max`, recursively collapse until they fit -Key insight: No need for predetermined L1/L2/L3 levels. Dynamic depth based on actual content length. LangChain's default `token_max=3000`. +Key insight: No need for predetermined levels. Dynamic depth based on actual content length. LangChain's default `token_max=3000`. ### 2.2 Borrowed: Chunk Size (BOOOOKSCORE) @@ -51,25 +51,13 @@ BOOOOKSCORE's research on book-length summarization found optimal chunk sizes. T **Reference:** arXiv:2504.19413 -Mem0's memory layer research informed our storage architecture with a **two-phase approach**: separate extraction (identifying what's important) from storage (how to persist it). 
We apply this by first generating summaries via LLM, then persisting results to both files and vector DB. +Mem0's memory layer research informed our storage architecture with a **two-phase approach**: separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to storage. -### 2.4 Not Directly Borrowed: Letta's Approach - -**Reference:** arXiv:2310.08560 - -Letta (MemGPT) uses a different paradigm focused on **context window management**: -- Message count thresholds (e.g., 10 messages), not token thresholds -- 30% partial eviction when buffer overflows -- Purpose: fit conversation in LLM context window - -Our system has a different purpose (memory compression for storage/retrieval), so our implementation differs significantly. - -### 2.5 Original Design (Not Research-Backed) +### 2.4 Original Design (Not Research-Backed) The following aspects are **original design choices without direct research justification**: -- **Token thresholds (100/500):** The boundaries between NONE/BRIEF/map-reduce were chosen heuristically. -- **L2 group logic for storage:** The intermediate summaries stored as "L2" is for backward compatibility with the storage layer. +- **Token thresholds (100/500):** The boundaries between NONE/BRIEF/MAP_REDUCE were chosen heuristically. - **Content-type prompts:** Domain-specific prompts are original design. --- @@ -78,51 +66,39 @@ The following aspects are **original design choices without direct research just ### 3.1 Map-Reduce with Dynamic Collapse -**Decision:** Use LangChain-style map-reduce instead of fixed L1/L2/L3 levels. +**Decision:** Use LangChain-style map-reduce instead of fixed hierarchy. **Rationale:** -- **Simpler algorithm:** No need to distinguish STANDARD/DETAILED/HIERARCHICAL. +- **Simpler algorithm:** Single code path handles all content sizes. - **Dynamic depth:** Collapse depth adapts to actual content length. 
- **Research-backed:** LangChain's approach is battle-tested. **Algorithm:** ```python -def map_reduce_summarize(content, token_max=3000): - if tokens(content) <= token_max: - return summarize_directly(content) - +async def map_reduce_summarize(content, config): # Map: Split and summarize chunks in parallel - chunks = split_into_chunks(content, chunk_size=2048) - summaries = [summarize(chunk) for chunk in chunks] + chunks = chunk_text(content, chunk_size=2048) + summaries = await parallel_summarize(chunks) - # Reduce: Recursively collapse until fits - while total_tokens(summaries) > token_max: - groups = group_summaries_by_token_max(summaries, token_max) - summaries = [synthesize(group) for group in groups] + # Reduce: Recursively collapse until fits token_max + while total_tokens(summaries) > config.token_max: + groups = group_by_token_limit(summaries, config.token_max) + summaries = await parallel_synthesize(groups) return final_synthesis(summaries) ``` -### 3.2 Token-Based Level Selection (Simplified) - -**Decision:** Use three effective levels instead of five. +### 3.2 Three-Level Strategy -**Rationale:** - -- **Simplicity:** Fewer code paths, easier to understand. -- **Dynamic instead of fixed:** Map-reduce adapts to content, no need for DETAILED vs HIERARCHICAL distinction. - -**Effective Levels:** +**Decision:** Use three levels based on token count. | Level | Token Range | Strategy | | :--- | :--- | :--- | | NONE | < 100 | No summarization needed | | BRIEF | 100-500 | Single sentence | -| MAP_REDUCE | > 500 | Dynamic collapse until fits token_max | - -**Backward Compatibility:** The output still reports STANDARD, DETAILED, or HIERARCHICAL based on collapse depth for storage compatibility. 
+| MAP_REDUCE | >= 500 | Dynamic collapse until fits token_max | ### 3.3 Research-Backed Defaults @@ -140,15 +116,15 @@ def map_reduce_summarize(content, token_max=3000): **Rationale:** -- **Coherence preservation:** Splitting mid-sentence or mid-thought loses context and produces poor summaries. -- **Natural units:** Paragraphs and sentences are natural semantic units that humans use to organize thoughts. +- **Coherence preservation:** Splitting mid-sentence loses context. +- **Natural units:** Paragraphs and sentences are natural semantic units. - **Overlap for continuity:** The 200-token overlap ensures concepts spanning chunk boundaries aren't lost. **Fallback chain:** 1. Prefer paragraph boundaries (double newlines) 2. Fall back to sentence boundaries (`.!?` followed by space + capital) -3. Final fallback to character splitting for edge cases (e.g., code blocks without punctuation) +3. Final fallback to word-based splitting ### 3.5 Content-Type Aware Prompts @@ -156,35 +132,27 @@ def map_reduce_summarize(content, token_max=3000): **Rationale:** -- **Conversations:** Focus on user preferences, decisions, action items—what the user wants and what was agreed. -- **Journals:** Emphasize personal insights, emotional context, growth patterns—the subjective experience. -- **Documents:** Prioritize key findings, methodology, conclusions—the objective content. +- **Conversations:** Focus on user preferences, decisions, action items. +- **Journals:** Emphasize personal insights, emotional context, growth patterns. +- **Documents:** Prioritize key findings, methodology, conclusions. -A generic summarization prompt loses domain-specific signal. By tailoring prompts, we extract what matters for each use case. +A generic summarization prompt loses domain-specific signal. ### 3.6 Prior Summary Integration -**Decision:** Always provide the previous summary as context when generating updates. +**Decision:** Provide the previous summary as context when generating updates. 
**Rationale:** -- **Continuity:** New summaries should build on existing context, not start fresh each time. -- **Incremental updates:** Avoid re-summarizing all historical content on every update. -- **Information preservation:** Important information from earlier content persists through the chain of summaries. - -The L3 summary from the previous run becomes prior context for the next summarization, allowing information to flow forward through time. +- **Continuity:** New summaries build on existing context. +- **Incremental updates:** Avoid re-summarizing all historical content. +- **Information preservation:** Important information persists through the chain. ### 3.7 Compression Ratio Tracking **Decision:** Track and report compression metrics for every summary. -**Rationale:** - -- **Transparency:** Users can understand how much information was compressed. -- **Quality monitoring:** Unusual ratios (e.g., output longer than input) may indicate summarization issues. -- **Optimization:** Metrics inform future threshold tuning and quality assessment. - -Every `SummaryResult` includes `input_tokens`, `output_tokens`, and `compression_ratio` for observability. +Every `SummaryResult` includes `input_tokens`, `output_tokens`, `compression_ratio`, and `collapse_depth` for observability. --- @@ -192,10 +160,10 @@ Every `SummaryResult` includes `input_tokens`, `output_tokens`, and `compression ### 4.1 Level Selection -The entry point counts tokens and selects strategy: +The entry point (`summarize()`) counts tokens and selects strategy: 1. **Token counting:** Uses tiktoken with model-appropriate encoding. Falls back to character-based estimation (~4 chars/token) if tiktoken unavailable. -2. **Threshold comparison:** Determines if NONE, BRIEF, or map-reduce. +2. **Threshold comparison:** Determines NONE, BRIEF, or MAP_REDUCE. 3. **Strategy dispatch:** Calls appropriate handler. 
### 4.2 Brief Level @@ -203,55 +171,118 @@ The entry point counts tokens and selects strategy: For short content (100-500 tokens): - Single LLM call with brief prompt -- Returns simple `SummaryResult` with no hierarchical structure +- Returns `SummaryResult` with single-sentence summary ### 4.3 Map-Reduce Level -For longer content (> 500 tokens): +For longer content (>= 500 tokens): 1. **Check single-chunk:** If content fits in token_max, use content-type aware summary directly. 2. **Map phase:** Split content into overlapping chunks, summarize each in parallel. 3. **Reduce phase:** If combined summaries exceed token_max, group and re-summarize recursively. 4. **Final synthesis:** Combine remaining summaries into final output. -The parallelism in the map phase provides significant speedup for long content while maintaining semantic coherence through the collapse process. +The `collapse_depth` field in the result indicates how many reduce iterations were needed. --- -## 5. Integration with Memory System +## 5. Data Models + +### 5.1 SummaryLevel + +```python +class SummaryLevel(IntEnum): + NONE = 0 # < 100 tokens + BRIEF = 1 # 100-500 tokens + MAP_REDUCE = 2 # >= 500 tokens +``` + +### 5.2 SummaryResult -### 5.1 Write Path +```python +class SummaryResult(BaseModel): + level: SummaryLevel + summary: str | None + input_tokens: int + output_tokens: int + compression_ratio: float + collapse_depth: int # 0 = no collapse needed + created_at: datetime +``` + +### 5.3 SummarizerConfig + +```python +@dataclass +class SummarizerConfig: + openai_base_url: str + model: str + api_key: str | None = None + chunk_size: int = 2048 # BOOOOKSCORE + token_max: int = 3000 # LangChain + chunk_overlap: int = 200 + max_concurrent_chunks: int = 5 + timeout: float = 60.0 +``` + +--- + +## 6. Integration with Memory System + +### 6.1 Write Path The memory system triggers summarization during post-processing: -1. Collect raw conversation turns (user message + assistant message) -2. 
Retrieve existing L3 summary as prior context +1. Collect raw conversation turns +2. Retrieve existing summary as prior context 3. Call summarizer with content + prior summary + content type -4. Persist results: delete old summaries, write new files, upsert to ChromaDB +4. Persist result to storage -### 5.2 Read Path +### 6.2 Read Path The memory retrieval system uses summaries for context injection: -- Fetches L3 (final) summary for the conversation -- Injects as prefix to retrieved memories in the prompt -- Provides high-level context that individual memory snippets lack +- Fetches summary for the conversation +- Injects as prefix to retrieved memories +- Provides high-level context that individual snippets lack + +### 6.3 Storage + +Summaries are stored with metadata: -### 5.3 Storage +```python +{ + "id": "{conversation_id}:summary", + "content": summary_text, + "metadata": { + "conversation_id": conversation_id, + "role": "summary", + "summary_level": "MAP_REDUCE", + "input_tokens": 1500, + "output_tokens": 150, + "compression_ratio": 0.1, + "collapse_depth": 1, + "created_at": "2024-01-15T10:30:00Z", + }, +} +``` + +--- + +## 7. Error Handling -Summaries are persisted in two places: +Summarization follows a fail-fast philosophy: -- **Files:** Markdown with YAML front matter under `summaries/L1/`, `L2/`, `L3/` directories. Human-readable, git-trackable. -- **ChromaDB:** Vector embeddings for semantic search. Metadata includes level, compression metrics, timestamps. +- **LLM errors:** Propagated as `SummarizationError` (base class for all summarization errors). +- **Empty input:** Returns NONE level immediately (not an error). +- **Encoding errors:** Falls back to character-based token estimation. +- **Max depth exceeded:** Warning logged, forces final synthesis even if over token_max. 
-For backward compatibility, the dynamic collapse levels are mapped to L1/L2/L3 structure: -- First collapse level → L1 (chunk summaries) -- Intermediate levels → L2 (grouped summaries) -- Final output → L3 (synthesis) +The caller decides how to handle failures—typically by proceeding without a summary rather than blocking the entire operation. --- -## 6. Configuration +## 8. Configuration | Parameter | Default | Source | | :--- | :--- | :--- | @@ -263,34 +294,28 @@ For backward compatibility, the dynamic collapse levels are mapped to L1/L2/L3 s --- -## 7. Error Handling +## 9. Limitations and Trade-offs -Summarization follows a fail-fast philosophy: +### 9.1 Fact Preservation -- **LLM errors:** Propagated as `SummarizationError` or `MapReduceSummarizationError` rather than silently returning empty results. -- **Empty input:** Returns NONE level immediately (not an error). -- **Encoding errors:** Falls back to character-based token estimation. -- **Max depth exceeded:** Warning logged, forces final synthesis even if over token_max. +Summarization is inherently lossy. Specific facts (dates, numbers, names) are often dropped in favor of thematic content. If your use case requires fact retrieval: -The caller (memory system) decides how to handle failures—typically by proceeding without a summary rather than blocking the entire write path. +- Store original content alongside summaries +- Use fact extraction instead of summarization +- Use RAG to retrieve original chunks ---- +### 9.2 No Intermediate Summaries -## 8. Comparison: Old vs New Approach +Unlike hierarchical approaches, map-reduce only stores the final summary. Intermediate chunk summaries are discarded after synthesis. This simplifies storage but removes granular access. 
-| Aspect | Old Approach | New Approach | -| :--- | :--- | :--- | -| Levels | 5 fixed (NONE/BRIEF/STANDARD/DETAILED/HIERARCHICAL) | 3 effective (NONE/BRIEF/MAP_REDUCE) | -| Hierarchy | Fixed L1/L2/L3 structure | Dynamic collapse depth | -| Chunk size | 3000 tokens | 2048 tokens (BOOOOKSCORE) | -| token_max | N/A (fixed levels) | 3000 (LangChain) | -| Complexity | Multiple code paths | Single map-reduce algorithm | -| Research basis | Heuristic | LangChain + BOOOOKSCORE | +### 9.3 Fixed Thresholds + +The 100/500 token thresholds are heuristic. They may need tuning for specific domains or languages. --- -## 9. Future Improvements +## 10. Future Improvements 1. **Benchmark against BOOOOKSCORE metrics** for coherence evaluation -2. **Add incremental updating mode** as alternative to hierarchical merging for larger context models -3. **Tune token thresholds empirically** with real-world content +2. **Tune token thresholds empirically** with real-world content +3. **Add fact extraction mode** for use cases requiring specific detail preservation From 63b755a0bd2b51bf31913934c5bc804712f409fd Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 10:29:15 -0800 Subject: [PATCH 30/37] docs: update memory.md for 3-level summarizer Remove outdated references to 5-level hierarchy (STANDARD, DETAILED, HIERARCHICAL) and L1/L2/L3 storage structure. Update to reflect current 3-level system (NONE, BRIEF, MAP_REDUCE) with single final summary. 
Also fix prompt names to match actual implementation: - BRIEF_SUMMARY_PROMPT, STANDARD_SUMMARY_PROMPT - CHUNK_SUMMARY_PROMPT, META_SUMMARY_PROMPT - Remove non-existent ROLLING_PROMPT --- docs/architecture/memory.md | 38 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/docs/architecture/memory.md b/docs/architecture/memory.md index b42e739a9..361640e9c 100644 --- a/docs/architecture/memory.md +++ b/docs/architecture/memory.md @@ -164,13 +164,7 @@ entries/ assistant/ __.md # Raw assistant responses summaries/ - L1/ - chunk_0.md # Level 1: Individual chunk summaries - chunk_1.md - L2/ - group_0.md # Level 2: Group summaries (groups of ~5 L1s) - L3/ - final.md # Level 3: Final synthesized summary + __summary.md # Single final summary (map-reduce collapses to one) ``` **Deleted Directory Structure (Soft Deletes):** @@ -182,7 +176,7 @@ entries/ facts/ __.md summaries/ - L1/, L2/, L3/ # Tombstoned summary levels + __summary.md # Tombstoned summary ``` ### 2.2 File Format @@ -276,17 +270,16 @@ Resolves contradictions using a "Search-Decide-Update" loop with complete enumer * **Updates:** Implemented as delete + add with a fresh ID; tombstones record `replaced_by`. * **Deletes:** Soft-deletes files (moved under `deleted/`) and removes from Chroma. -### 4.4 Summarization (Adaptive Hierarchical) +### 4.4 Summarization (Adaptive Map-Reduce) Uses the `agent_cli.summarizer` module for research-backed adaptive summarization. -* **Level Selection:** Automatically determines summarization depth based on token count: +* **Level Selection:** Automatically determines summarization strategy based on token count: * `NONE` (< 100 tokens): No summary needed, facts only. - * `BRIEF` (100-500 tokens): Single-sentence summary (~20% compression). - * `STANDARD` (500-3000 tokens): Paragraph summary (~12% compression). - * `DETAILED` (3000-15000 tokens): Chunked summaries + meta-summary (~7% compression). 
- * `HIERARCHICAL` (> 15000 tokens): Full L1/L2/L3 tree structure. -* **Input:** Previous L3 summary (if any) + newly extracted facts. -* **Persistence:** Stores summaries in `summaries/L1/`, `L2/`, `L3/` subdirectories with YAML front matter containing compression metrics. + * `BRIEF` (100-500 tokens): Single-sentence summary. + * `MAP_REDUCE` (>= 500 tokens): Dynamic collapse using map-reduce with content-type aware prompts. +* **Algorithm:** LangChain-inspired map-reduce that recursively collapses until content fits token_max (3000). +* **Input:** Previous summary (if any) + newly extracted facts. +* **Persistence:** Stores single final summary in `summaries/` directory with YAML front matter containing compression metrics. * **See:** `docs/architecture/summarizer.md` for detailed algorithm specification. ### 4.5 Eviction @@ -318,13 +311,12 @@ To replicate the system behavior, the following prompt strategies are required. * **Output constraints:** JSON list containing all memories; each existing memory must have an event; new unrelated facts must be ADDed; no prose or code fences. ### 5.3 Summarization (Adaptive Prompts) -The summarizer uses level-specific prompts from `agent_cli.summarizer._prompts`: -* **`BRIEF_PROMPT`:** Single-sentence distillation for short content. -* **`STANDARD_PROMPT`:** Paragraph summary with prior context integration. -* **`CHUNK_PROMPT`:** Individual chunk summarization for hierarchical processing. -* **`META_PROMPT`:** Synthesizes multiple chunk summaries into cohesive narrative. -* **`ROLLING_PROMPT`:** Integrates new facts with existing summary. -* **Content-type variants:** `CONVERSATION_PROMPT`, `JOURNAL_PROMPT`, `DOCUMENT_PROMPT` for domain-specific summarization. +The summarizer uses prompts from `agent_cli.summarizer._prompts`: +* **`BRIEF_SUMMARY_PROMPT`:** Single-sentence distillation for short content (100-500 tokens). +* **`STANDARD_SUMMARY_PROMPT`:** Paragraph summary with prior context integration (general content). 
+* **`CHUNK_SUMMARY_PROMPT`:** Individual chunk summarization for map phase. +* **`META_SUMMARY_PROMPT`:** Synthesizes multiple chunk summaries in reduce phase. +* **Content-type variants:** `CONVERSATION_SUMMARY_PROMPT`, `JOURNAL_SUMMARY_PROMPT`, `DOCUMENT_SUMMARY_PROMPT` for domain-specific summarization. --- From 88869c257951edacdc039219f4f965ac845d0359 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 10:42:46 -0800 Subject: [PATCH 31/37] refactor(summarizer): rename STANDARD_SUMMARY_PROMPT to GENERAL_SUMMARY_PROMPT The prompt name "STANDARD" was a leftover from the old 5-level system which had a STANDARD SummaryLevel. Since that level no longer exists (now just NONE, BRIEF, MAP_REDUCE), rename to GENERAL_SUMMARY_PROMPT to match its actual purpose as the "general" content type prompt. --- agent_cli/summarizer/_prompts.py | 8 ++++---- docs/architecture/memory.md | 2 +- tests/summarizer/test_prompts.py | 32 ++++++++++++++++---------------- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/agent_cli/summarizer/_prompts.py b/agent_cli/summarizer/_prompts.py index 1de5fa44f..476cb408e 100644 --- a/agent_cli/summarizer/_prompts.py +++ b/agent_cli/summarizer/_prompts.py @@ -13,8 +13,8 @@ One-sentence summary:""".strip() -# MAP_REDUCE level - Paragraph summary for content-type aware summarization -STANDARD_SUMMARY_PROMPT = """Summarize the following content concisely in a short paragraph. +# MAP_REDUCE level - Paragraph summary for general content type +GENERAL_SUMMARY_PROMPT = """Summarize the following content concisely in a short paragraph. 
Focus on: - Key facts, decisions, and outcomes @@ -104,12 +104,12 @@ def get_prompt_for_content_type(content_type: str) -> str: """ prompts = { - "general": STANDARD_SUMMARY_PROMPT, + "general": GENERAL_SUMMARY_PROMPT, "conversation": CONVERSATION_SUMMARY_PROMPT, "journal": JOURNAL_SUMMARY_PROMPT, "document": DOCUMENT_SUMMARY_PROMPT, } - return prompts.get(content_type, STANDARD_SUMMARY_PROMPT) + return prompts.get(content_type, GENERAL_SUMMARY_PROMPT) def format_prior_context(prior_summary: str | None) -> str: diff --git a/docs/architecture/memory.md b/docs/architecture/memory.md index 361640e9c..66331d9f7 100644 --- a/docs/architecture/memory.md +++ b/docs/architecture/memory.md @@ -313,7 +313,7 @@ To replicate the system behavior, the following prompt strategies are required. ### 5.3 Summarization (Adaptive Prompts) The summarizer uses prompts from `agent_cli.summarizer._prompts`: * **`BRIEF_SUMMARY_PROMPT`:** Single-sentence distillation for short content (100-500 tokens). -* **`STANDARD_SUMMARY_PROMPT`:** Paragraph summary with prior context integration (general content). +* **`GENERAL_SUMMARY_PROMPT`:** Paragraph summary with prior context integration (general content). * **`CHUNK_SUMMARY_PROMPT`:** Individual chunk summarization for map phase. * **`META_SUMMARY_PROMPT`:** Synthesizes multiple chunk summaries in reduce phase. * **Content-type variants:** `CONVERSATION_SUMMARY_PROMPT`, `JOURNAL_SUMMARY_PROMPT`, `DOCUMENT_SUMMARY_PROMPT` for domain-specific summarization. 
diff --git a/tests/summarizer/test_prompts.py b/tests/summarizer/test_prompts.py index 660229709..ef05ebad5 100644 --- a/tests/summarizer/test_prompts.py +++ b/tests/summarizer/test_prompts.py @@ -7,9 +7,9 @@ CHUNK_SUMMARY_PROMPT, CONVERSATION_SUMMARY_PROMPT, DOCUMENT_SUMMARY_PROMPT, + GENERAL_SUMMARY_PROMPT, JOURNAL_SUMMARY_PROMPT, META_SUMMARY_PROMPT, - STANDARD_SUMMARY_PROMPT, format_prior_context, format_summaries_for_meta, get_prompt_for_content_type, @@ -26,13 +26,13 @@ def test_brief_prompt_has_content_placeholder(self) -> None: result = BRIEF_SUMMARY_PROMPT.format(content="Test content") assert "Test content" in result - def test_standard_prompt_has_placeholders(self) -> None: - """Test STANDARD prompt contains required placeholders.""" - assert "{content}" in STANDARD_SUMMARY_PROMPT - assert "{prior_context}" in STANDARD_SUMMARY_PROMPT - assert "{max_words}" in STANDARD_SUMMARY_PROMPT + def test_general_prompt_has_placeholders(self) -> None: + """Test GENERAL prompt contains required placeholders.""" + assert "{content}" in GENERAL_SUMMARY_PROMPT + assert "{prior_context}" in GENERAL_SUMMARY_PROMPT + assert "{max_words}" in GENERAL_SUMMARY_PROMPT - result = STANDARD_SUMMARY_PROMPT.format( + result = GENERAL_SUMMARY_PROMPT.format( content="Main content", prior_context="Previous context", max_words=100, @@ -92,10 +92,10 @@ def test_document_prompt_has_placeholders(self) -> None: class TestGetPromptForContentType: """Tests for get_prompt_for_content_type function.""" - def test_general_returns_standard(self) -> None: - """Test general content type returns standard prompt.""" + def test_general_returns_general(self) -> None: + """Test general content type returns general prompt.""" prompt = get_prompt_for_content_type("general") - assert prompt == STANDARD_SUMMARY_PROMPT + assert prompt == GENERAL_SUMMARY_PROMPT def test_conversation_returns_conversation(self) -> None: """Test conversation content type returns conversation prompt.""" @@ -112,15 +112,15 @@ def 
test_document_returns_document(self) -> None: prompt = get_prompt_for_content_type("document") assert prompt == DOCUMENT_SUMMARY_PROMPT - def test_unknown_returns_standard(self) -> None: - """Test unknown content type falls back to standard.""" + def test_unknown_returns_general(self) -> None: + """Test unknown content type falls back to general.""" prompt = get_prompt_for_content_type("unknown_type") - assert prompt == STANDARD_SUMMARY_PROMPT + assert prompt == GENERAL_SUMMARY_PROMPT - def test_empty_returns_standard(self) -> None: - """Test empty string falls back to standard.""" + def test_empty_returns_general(self) -> None: + """Test empty string falls back to general.""" prompt = get_prompt_for_content_type("") - assert prompt == STANDARD_SUMMARY_PROMPT + assert prompt == GENERAL_SUMMARY_PROMPT class TestFormatPriorContext: From df8f05688a0f98d287eb53104acfa64e62dbd697 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 11:12:27 -0800 Subject: [PATCH 32/37] docs: clarify prompt comments to avoid confusion with level names --- agent_cli/summarizer/_prompts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agent_cli/summarizer/_prompts.py b/agent_cli/summarizer/_prompts.py index 476cb408e..de59f9404 100644 --- a/agent_cli/summarizer/_prompts.py +++ b/agent_cli/summarizer/_prompts.py @@ -4,7 +4,7 @@ and are optimized for structured, factual output. """ -# BRIEF level - Single sentence summary for short content (100-500 tokens) +# Single sentence summary for short content (used at BRIEF level, 100-500 tokens) BRIEF_SUMMARY_PROMPT = """Summarize the following in ONE sentence (maximum 20 words). Focus on the single most important point or takeaway. 
@@ -13,7 +13,7 @@ One-sentence summary:""".strip() -# MAP_REDUCE level - Paragraph summary for general content type +# Paragraph summary for "general" content type (default when no specific type provided) GENERAL_SUMMARY_PROMPT = """Summarize the following content concisely in a short paragraph. Focus on: From f550b3fa4293aa6a078c1037746becd80e64bad5 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Fri, 28 Nov 2025 22:49:27 -0800 Subject: [PATCH 33/37] Chunk memories --- agent_cli/rag/client.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/agent_cli/rag/client.py b/agent_cli/rag/client.py index be930ab48..af3438739 100644 --- a/agent_cli/rag/client.py +++ b/agent_cli/rag/client.py @@ -125,8 +125,17 @@ def add( for i in range(len(chunks)) ] - # Upsert to collection - self.collection.upsert(ids=ids, documents=chunks, metadatas=metadatas) + # Upsert to collection in batches to avoid overwhelming the embedding service + batch_size = 10 + for i in range(0, len(ids), batch_size): + batch_ids = ids[i : i + batch_size] + batch_docs = chunks[i : i + batch_size] + batch_metas = metadatas[i : i + batch_size] + self.collection.upsert( + ids=batch_ids, + documents=batch_docs, + metadatas=batch_metas, + ) logger.info("Added doc_id=%s with %d chunks", doc_id, len(chunks)) return doc_id From 4f1d16a529210b1dd73291483578791984d60fad Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Fri, 28 Nov 2025 23:12:38 -0800 Subject: [PATCH 34/37] refactor(summarizer): remove dead code and reorganize models - Remove unused `middle_truncate()` function and its tests - Remove unused `MapReduceSummarizationError` exception class - Move `SummarizerConfig` and `SummarizationError` from _utils.py to models.py This groups all exported types in models.py and keeps _utils.py focused on actual utility functions (token counting, chunking, LLM calls). 
Net: -96 lines --- agent_cli/summarizer/__init__.py | 9 +++- agent_cli/summarizer/_utils.py | 87 +----------------------------- agent_cli/summarizer/adaptive.py | 4 +- agent_cli/summarizer/map_reduce.py | 8 +-- agent_cli/summarizer/models.py | 36 +++++++++++++ tests/summarizer/test_utils.py | 46 ---------------- 6 files changed, 47 insertions(+), 143 deletions(-) diff --git a/agent_cli/summarizer/__init__.py b/agent_cli/summarizer/__init__.py index af977ada1..daf0e2bc6 100644 --- a/agent_cli/summarizer/__init__.py +++ b/agent_cli/summarizer/__init__.py @@ -21,8 +21,13 @@ """ -from agent_cli.summarizer.adaptive import SummarizationError, SummarizerConfig, summarize -from agent_cli.summarizer.models import SummaryLevel, SummaryResult +from agent_cli.summarizer.adaptive import summarize +from agent_cli.summarizer.models import ( + SummarizationError, + SummarizerConfig, + SummaryLevel, + SummaryResult, +) __all__ = [ "SummarizationError", diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py index 078e21edc..23c8dd195 100644 --- a/agent_cli/summarizer/_utils.py +++ b/agent_cli/summarizer/_utils.py @@ -3,13 +3,12 @@ from __future__ import annotations import re -from dataclasses import dataclass from functools import lru_cache from typing import TYPE_CHECKING from pydantic import BaseModel -from agent_cli.summarizer.models import SummaryLevel +from agent_cli.summarizer.models import SummarizationError, SummarizerConfig, SummaryLevel if TYPE_CHECKING: import tiktoken @@ -21,41 +20,6 @@ class SummaryOutput(BaseModel): summary: str -class SummarizationError(Exception): - """Raised when summarization fails after all retries.""" - - -@dataclass -class SummarizerConfig: - """Configuration for summarization operations. 
- - Example: - config = SummarizerConfig( - openai_base_url="http://localhost:8000/v1", - model="llama3.1:8b", - ) - result = await summarize(long_document, config) - print(f"Level: {result.level.name}") - print(f"Compression: {result.compression_ratio:.1%}") - - """ - - openai_base_url: str - model: str - api_key: str | None = None - chunk_size: int = 2048 # BOOOOKSCORE's tested default - token_max: int = 3000 # LangChain's default - when to collapse - chunk_overlap: int = 200 - max_concurrent_chunks: int = 5 - timeout: float = 60.0 - - def __post_init__(self) -> None: - """Normalize the base URL.""" - self.openai_base_url = self.openai_base_url.rstrip("/") - if self.api_key is None: - self.api_key = "not-needed" - - async def generate_summary( prompt: str, config: SummarizerConfig, @@ -266,55 +230,6 @@ def _get_overlap_text(chunks: list[str], target_tokens: int, model: str) -> str: return " ".join(overlap_parts) -def middle_truncate( - text: str, - budget_chars: int, - head_frac: float = 0.3, - tail_frac: float = 0.3, -) -> tuple[str, int]: - """Middle-truncate text to fit within a character budget. - - Keeps the first head_frac and last tail_frac portions, dropping the middle. - This preserves context from both the beginning (often contains setup) and - end (often contains conclusions/recent events). - - Inspired by Letta's `middle_truncate_text` function. - - Args: - text: Text to truncate. - budget_chars: Maximum character count for output. - head_frac: Fraction of budget for the head portion. - tail_frac: Fraction of budget for the tail portion. - - Returns: - Tuple of (truncated_text, dropped_char_count). 
- - """ - if budget_chars <= 0 or len(text) <= budget_chars: - return text, 0 - - head_len = max(0, int(budget_chars * head_frac)) - tail_len = max(0, int(budget_chars * tail_frac)) - - # Ensure head + tail doesn't exceed budget - if head_len + tail_len > budget_chars: - tail_len = max(0, budget_chars - head_len) - - head = text[:head_len] - tail = text[-tail_len:] if tail_len > 0 else "" - dropped = max(0, len(text) - (len(head) + len(tail))) - - marker = f"\n[...{dropped} characters truncated...]\n" - - # If marker would overflow budget, shrink tail - available_for_marker = budget_chars - (len(head) + len(tail)) - if available_for_marker < len(marker): - over = len(marker) - available_for_marker - tail = tail[:-over] if over < len(tail) else "" - - return head + marker + tail, dropped - - def estimate_summary_tokens(input_tokens: int, level: int) -> int: """Estimate target summary tokens based on input size and level.""" if level == SummaryLevel.NONE: diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 640c52e60..f242b662f 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -26,8 +26,6 @@ get_prompt_for_content_type, ) from agent_cli.summarizer._utils import ( - SummarizationError, - SummarizerConfig, count_tokens, estimate_summary_tokens, generate_summary, @@ -35,6 +33,8 @@ ) from agent_cli.summarizer.map_reduce import map_reduce_summarize from agent_cli.summarizer.models import ( + SummarizationError, + SummarizerConfig, SummaryLevel, SummaryResult, ) diff --git a/agent_cli/summarizer/map_reduce.py b/agent_cli/summarizer/map_reduce.py index 07332c1cf..3dd81aa43 100644 --- a/agent_cli/summarizer/map_reduce.py +++ b/agent_cli/summarizer/map_reduce.py @@ -25,23 +25,17 @@ format_summaries_for_meta, ) from agent_cli.summarizer._utils import ( - SummarizationError, - SummarizerConfig, chunk_text, count_tokens, estimate_summary_tokens, generate_summary, tokens_to_words, ) -from 
agent_cli.summarizer.models import SummaryLevel +from agent_cli.summarizer.models import SummarizerConfig, SummaryLevel logger = logging.getLogger(__name__) -class MapReduceSummarizationError(SummarizationError): - """Raised when map-reduce summarization fails.""" - - @dataclass class MapReduceResult: """Result of map-reduce summarization. diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py index be0d309be..14be0c864 100644 --- a/agent_cli/summarizer/models.py +++ b/agent_cli/summarizer/models.py @@ -2,6 +2,7 @@ from __future__ import annotations +from dataclasses import dataclass from datetime import UTC, datetime from enum import IntEnum from typing import Any @@ -9,6 +10,41 @@ from pydantic import BaseModel, Field +class SummarizationError(Exception): + """Raised when summarization fails after all retries.""" + + +@dataclass +class SummarizerConfig: + """Configuration for summarization operations. + + Example: + config = SummarizerConfig( + openai_base_url="http://localhost:8000/v1", + model="llama3.1:8b", + ) + result = await summarize(long_document, config) + print(f"Level: {result.level.name}") + print(f"Compression: {result.compression_ratio:.1%}") + + """ + + openai_base_url: str + model: str + api_key: str | None = None + chunk_size: int = 2048 # BOOOOKSCORE's tested default + token_max: int = 3000 # LangChain's default - when to collapse + chunk_overlap: int = 200 + max_concurrent_chunks: int = 5 + timeout: float = 60.0 + + def __post_init__(self) -> None: + """Normalize the base URL.""" + self.openai_base_url = self.openai_base_url.rstrip("/") + if self.api_key is None: + self.api_key = "not-needed" + + class SummaryLevel(IntEnum): """Summary strategy based on input length.""" diff --git a/tests/summarizer/test_utils.py b/tests/summarizer/test_utils.py index 2621b158e..188a79172 100644 --- a/tests/summarizer/test_utils.py +++ b/tests/summarizer/test_utils.py @@ -6,7 +6,6 @@ chunk_text, count_tokens, estimate_summary_tokens, - 
middle_truncate, tokens_to_words, ) @@ -86,51 +85,6 @@ def test_large_paragraph_sentence_split(self) -> None: assert len(chunks) > 1 -class TestMiddleTruncate: - """Tests for middle_truncate function.""" - - def test_no_truncation_needed(self) -> None: - """Test that short text is not truncated.""" - text = "Short text" - result, dropped = middle_truncate(text, budget_chars=100) - assert result == text - assert dropped == 0 - - def test_basic_truncation(self) -> None: - """Test basic middle truncation.""" - text = "A" * 100 # 100 character string - result, dropped = middle_truncate(text, budget_chars=50) - - # Should have head + marker + tail - assert len(result) <= 50 + 50 # Allow for marker - assert dropped > 0 - assert "[..." in result - assert "truncated...]" in result - - def test_head_tail_fractions(self) -> None: - """Test custom head/tail fractions.""" - text = "AAAAA" + "BBBBB" * 20 + "CCCCC" - result, dropped = middle_truncate(text, budget_chars=30, head_frac=0.5, tail_frac=0.5) - - # Should preserve beginning (A's) and end (C's) - assert result.startswith("A") - assert dropped > 0 - - def test_zero_budget(self) -> None: - """Test with zero budget returns original.""" - text = "Some text" - result, dropped = middle_truncate(text, budget_chars=0) - assert result == text - assert dropped == 0 - - def test_negative_budget(self) -> None: - """Test with negative budget returns original.""" - text = "Some text" - result, dropped = middle_truncate(text, budget_chars=-10) - assert result == text - assert dropped == 0 - - class TestEstimateSummaryTokens: """Tests for estimate_summary_tokens function.""" From 1ed9ff4ba1ae92aac70100c96677eae27fa55b96 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 3 Dec 2025 20:10:28 -0800 Subject: [PATCH 35/37] refactor(summarizer): simplify API with target_tokens/target_ratio parameters Remove SummaryLevel enum and three-level strategy in favor of a simple "fits target? return as-is : map-reduce" approach. 
This reduces complexity while maintaining full functionality. Changes: - Remove SummaryLevel enum (NONE/BRIEF/MAP_REDUCE) - Add target_tokens parameter for absolute token limit - Add target_ratio parameter for relative compression (e.g., 0.2 = 20%) - Simplify estimate_summary_tokens to use ~10% compression ratio - Update memory integration to use compression_ratio in logging - Rewrite examples and tests for new API - Update architecture documentation Net reduction: ~165 lines of code --- agent_cli/agents/summarize.py | 5 +- agent_cli/memory/_ingest.py | 4 +- agent_cli/memory/_persistence.py | 6 +- agent_cli/memory/models.py | 2 +- agent_cli/summarizer/__init__.py | 18 ++- agent_cli/summarizer/_utils.py | 14 +- agent_cli/summarizer/adaptive.py | 138 +++++++------------ agent_cli/summarizer/map_reduce.py | 53 +++++--- agent_cli/summarizer/models.py | 26 +--- docs/architecture/summarizer.md | 195 ++++++++++++++++----------- examples/summarizer_demo.py | 164 ++++++++++------------ tests/memory/test_engine.py | 4 +- tests/memory/test_git_integration.py | 3 +- tests/summarizer/test_adaptive.py | 168 +++++++---------------- tests/summarizer/test_integration.py | 34 +---- tests/summarizer/test_models.py | 60 ++------- tests/summarizer/test_utils.py | 48 +++---- 17 files changed, 388 insertions(+), 554 deletions(-) diff --git a/agent_cli/agents/summarize.py b/agent_cli/agents/summarize.py index ec516310e..ecfd1e053 100644 --- a/agent_cli/agents/summarize.py +++ b/agent_cli/agents/summarize.py @@ -115,7 +115,7 @@ def _display_result( elif result.summary: print_output_panel( result.summary, - title=f"Summary (Level: {result.level.name})", + title="Summary", subtitle=f"[dim]{result.output_tokens:,} tokens | {result.compression_ratio:.1%} of original | {elapsed:.2f}s[/dim]", ) else: @@ -139,7 +139,6 @@ def _display_full_result( console.print() console.print("[bold cyan]Summarization Result[/bold cyan]") - console.print(f" Level: [bold]{result.level.name}[/bold]") 
console.print(f" Input tokens: [bold]{result.input_tokens:,}[/bold]") console.print(f" Output tokens: [bold]{result.output_tokens:,}[/bold]") console.print(f" Compression: [bold]{result.compression_ratio:.1%}[/bold]") @@ -151,7 +150,7 @@ def _display_full_result( if result.summary: print_output_panel( result.summary, - title=f"Summary ({result.level.name})", + title="Summary", ) diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py index e50e2ac45..933d8bf58 100644 --- a/agent_cli/memory/_ingest.py +++ b/agent_cli/memory/_ingest.py @@ -432,10 +432,10 @@ async def extract_and_store_facts_and_summaries( model=model, ) LOGGER.info( - "Summary update completed in %.1f ms (conversation=%s, level=%s)", + "Summary update completed in %.1f ms (conversation=%s, compression=%.1f%%)", _elapsed_ms(summary_start), conversation_id, - summary_result.level.name, + summary_result.compression_ratio * 100, ) if summary_result.summary: await store_adaptive_summary( diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py index a7e3871e2..46ac03631 100644 --- a/agent_cli/memory/_persistence.py +++ b/agent_cli/memory/_persistence.py @@ -191,10 +191,8 @@ def persist_summary( List of IDs that were stored. 
""" - from agent_cli.summarizer import SummaryLevel # noqa: PLC0415 - - # Skip if no summary needed - if summary_result.level == SummaryLevel.NONE: + # Skip if no summary was generated + if not summary_result.summary: return [] # Delete existing summary files diff --git a/agent_cli/memory/models.py b/agent_cli/memory/models.py index 5b8df3855..d52d952ce 100644 --- a/agent_cli/memory/models.py +++ b/agent_cli/memory/models.py @@ -65,7 +65,7 @@ class MemoryMetadata(BaseModel): compression_ratio: float | None = None """Ratio of output to input tokens.""" summary_level: str | None = None - """Name of the SummaryLevel enum used (NONE, BRIEF, or MAP_REDUCE).""" + """Deprecated: previously stored SummaryLevel enum name.""" collapse_depth: int | None = None """Number of collapse iterations in map-reduce (0 = no collapse needed).""" diff --git a/agent_cli/summarizer/__init__.py b/agent_cli/summarizer/__init__.py index daf0e2bc6..7c7603b98 100644 --- a/agent_cli/summarizer/__init__.py +++ b/agent_cli/summarizer/__init__.py @@ -1,13 +1,13 @@ """Adaptive summarization module for variable-length content. This module provides map-reduce summarization inspired by LangChain's approach: -1. Split content into chunks and summarize each in parallel (map phase) -2. Recursively collapse summaries until they fit token_max (reduce phase) +1. If content fits target, return as-is (no LLM call) +2. Otherwise, split into chunks and summarize each in parallel (map phase) +3. 
Recursively collapse summaries until they fit target (reduce phase) Research foundations: - LangChain ReduceDocumentsChain: token_max=3000, recursive collapse - BOOOOKSCORE (arXiv:2310.00785): chunk_size=2048 optimal -- Two-phase architecture concept from Mem0 (arXiv:2504.19413) Example: from agent_cli.summarizer import summarize, SummarizerConfig @@ -16,8 +16,14 @@ openai_base_url="http://localhost:8000/v1", model="gpt-4", ) - result = await summarize(long_document, config) - print(f"Level: {result.level.name}, Compression: {result.compression_ratio:.1%}") + + # Compress to fit 4000 tokens + result = await summarize(long_document, config, target_tokens=4000) + + # Compress to 20% of original size + result = await summarize(long_document, config, target_ratio=0.2) + + print(f"Compression: {result.compression_ratio:.1%}") """ @@ -25,14 +31,12 @@ from agent_cli.summarizer.models import ( SummarizationError, SummarizerConfig, - SummaryLevel, SummaryResult, ) __all__ = [ "SummarizationError", "SummarizerConfig", - "SummaryLevel", "SummaryResult", "summarize", ] diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py index 23c8dd195..64c72b8ff 100644 --- a/agent_cli/summarizer/_utils.py +++ b/agent_cli/summarizer/_utils.py @@ -8,7 +8,7 @@ from pydantic import BaseModel -from agent_cli.summarizer.models import SummarizationError, SummarizerConfig, SummaryLevel +from agent_cli.summarizer.models import SummarizationError, SummarizerConfig if TYPE_CHECKING: import tiktoken @@ -230,13 +230,11 @@ def _get_overlap_text(chunks: list[str], target_tokens: int, model: str) -> str: return " ".join(overlap_parts) -def estimate_summary_tokens(input_tokens: int, level: int) -> int: - """Estimate target summary tokens based on input size and level.""" - if level == SummaryLevel.NONE: - return 0 - if level == SummaryLevel.BRIEF: - return min(50, max(20, input_tokens // 5)) - # MAP_REDUCE: ~10% compression with floor/ceiling +def estimate_summary_tokens(input_tokens: 
int) -> int: + """Estimate target summary tokens based on input size. + + Uses ~10% compression ratio with floor/ceiling bounds. + """ return min(500, max(50, input_tokens // 10)) diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index f242b662f..2a772062a 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -1,17 +1,13 @@ """Adaptive summarization using map-reduce with dynamic collapse. Implements a simple algorithm inspired by LangChain's map-reduce chains: -1. If content is short enough, summarize directly +1. If content fits target, return as-is (no LLM call) 2. Otherwise, split into chunks and summarize each (map phase) -3. Recursively collapse summaries until they fit token_max (reduce phase) +3. Recursively collapse summaries until they fit target (reduce phase) Research foundations: - LangChain ReduceDocumentsChain: token_max=3000, recursive collapse - BOOOOKSCORE (arXiv:2310.00785): chunk_size=2048 optimal -- Two-phase architecture concept from Mem0 (arXiv:2504.19413) - -Key insight: No need for predetermined L1/L2/L3 levels. -Dynamic collapse depth based on actual content length. See docs/architecture/summarizer.md for detailed design rationale. 
""" @@ -21,76 +17,68 @@ import logging from agent_cli.summarizer._prompts import ( - BRIEF_SUMMARY_PROMPT, format_prior_context, get_prompt_for_content_type, ) from agent_cli.summarizer._utils import ( count_tokens, - estimate_summary_tokens, generate_summary, tokens_to_words, ) from agent_cli.summarizer.map_reduce import map_reduce_summarize from agent_cli.summarizer.models import ( - SummarizationError, SummarizerConfig, - SummaryLevel, SummaryResult, ) logger = logging.getLogger(__name__) -# Thresholds for summary levels (in tokens) -THRESHOLD_NONE = 100 # Below this, no summary needed -THRESHOLD_BRIEF = 500 # Below this, just a single sentence - -# Re-export for backwards compatibility __all__ = [ - "THRESHOLD_BRIEF", - "THRESHOLD_NONE", - "SummarizationError", "SummarizerConfig", - "determine_level", "summarize", ] -def determine_level(token_count: int) -> SummaryLevel: - """Map token count to appropriate SummaryLevel.""" - if token_count < THRESHOLD_NONE: - return SummaryLevel.NONE - if token_count < THRESHOLD_BRIEF: - return SummaryLevel.BRIEF - return SummaryLevel.MAP_REDUCE - - async def summarize( content: str, config: SummarizerConfig, + *, + target_tokens: int | None = None, + target_ratio: float | None = None, prior_summary: str | None = None, content_type: str = "general", ) -> SummaryResult: - """Summarize content with adaptive strategy based on length. + """Summarize content to fit within a target token limit. - Uses a simple algorithm: - - Very short content (<100 tokens): No summary - - Short content (<500 tokens): Single sentence brief summary - - Everything else: Map-reduce with dynamic collapse + Simple algorithm: + - If content already fits target, return as-is (no LLM call) + - Otherwise, use map-reduce to compress until it fits Args: content: The content to summarize. config: Summarizer configuration. + target_tokens: Absolute token limit (e.g., 4000). Defaults to config.token_max. 
+ target_ratio: Relative compression ratio (e.g., 0.2 = compress to 20% of input). + Takes precedence over target_tokens if both provided. prior_summary: Optional prior summary for context continuity. content_type: Type of content ("general", "conversation", "journal", "document"). Returns: - SummaryResult with summary and metadata. + SummaryResult with summary and compression metrics. + + Examples: + # Compress to fit 4000 tokens + result = await summarize(huge_doc, config, target_tokens=4000) + + # Compress to 20% of original size + result = await summarize(huge_doc, config, target_ratio=0.2) + + # Use default (config.token_max = 3000) + result = await summarize(huge_doc, config) """ if not content or not content.strip(): return SummaryResult( - level=SummaryLevel.NONE, summary=None, input_tokens=0, output_tokens=0, @@ -98,65 +86,43 @@ async def summarize( ) input_tokens = count_tokens(content, config.model) - level = determine_level(input_tokens) + + # Determine target + if target_ratio is not None: + target = max(1, int(input_tokens * target_ratio)) + elif target_tokens is not None: + target = target_tokens + else: + target = config.token_max logger.info( - "Summarizing %d tokens at level %s (type=%s)", + "Summarizing %d tokens to target %d (type=%s)", input_tokens, - level.name, + target, content_type, ) - if level == SummaryLevel.NONE: + # Already fits? 
Return content as-is (no LLM call) + if input_tokens <= target: return SummaryResult( - level=level, - summary=None, + summary=content, input_tokens=input_tokens, - output_tokens=0, - compression_ratio=0.0, + output_tokens=input_tokens, + compression_ratio=1.0, + collapse_depth=0, ) - if level == SummaryLevel.BRIEF: - summary = await _brief_summary(content, config) - output_tokens = count_tokens(summary, config.model) - return SummaryResult( - level=level, - summary=summary, - input_tokens=input_tokens, - output_tokens=output_tokens, - compression_ratio=output_tokens / input_tokens, + # Content fits in single chunk but exceeds target - use content-aware summary + if input_tokens <= config.chunk_size: + summary = await _content_aware_summary( + content, + config, + target, + prior_summary, + content_type, ) - - # MAP_REDUCE level - return await _map_reduce_summary( - content, - input_tokens, - config, - prior_summary, - content_type, - ) - - -async def _brief_summary(content: str, config: SummarizerConfig) -> str: - """Generate a single-sentence summary for brief content.""" - prompt = BRIEF_SUMMARY_PROMPT.format(content=content) - return await generate_summary(prompt, config, max_tokens=50) - - -async def _map_reduce_summary( - content: str, - input_tokens: int, - config: SummarizerConfig, - prior_summary: str | None, - content_type: str, -) -> SummaryResult: - """Use map-reduce with dynamic collapse for longer content.""" - # For content that fits in a single chunk, use content-type aware summary - if input_tokens <= config.token_max: - summary = await _content_aware_summary(content, config, prior_summary, content_type) output_tokens = count_tokens(summary, config.model) return SummaryResult( - level=SummaryLevel.MAP_REDUCE, summary=summary, input_tokens=input_tokens, output_tokens=output_tokens, @@ -164,11 +130,10 @@ async def _map_reduce_summary( collapse_depth=0, ) - # Use map-reduce for multi-chunk content - result = await map_reduce_summarize(content, config) 
+ # Large content - use map-reduce with dynamic collapse + result = await map_reduce_summarize(content, config, target) return SummaryResult( - level=SummaryLevel.MAP_REDUCE, summary=result.summary, input_tokens=result.input_tokens, output_tokens=result.output_tokens, @@ -180,14 +145,11 @@ async def _map_reduce_summary( async def _content_aware_summary( content: str, config: SummarizerConfig, + target_tokens: int, prior_summary: str | None, content_type: str, ) -> str: """Generate a content-type aware summary for single-chunk content.""" - target_tokens = estimate_summary_tokens( - count_tokens(content, config.model), - SummaryLevel.MAP_REDUCE, - ) max_words = tokens_to_words(target_tokens) prompt_template = get_prompt_for_content_type(content_type) diff --git a/agent_cli/summarizer/map_reduce.py b/agent_cli/summarizer/map_reduce.py index 3dd81aa43..86e8b796a 100644 --- a/agent_cli/summarizer/map_reduce.py +++ b/agent_cli/summarizer/map_reduce.py @@ -2,7 +2,7 @@ Simple algorithm: 1. Map: Split content into chunks, summarize each in parallel -2. Reduce: If combined summaries exceed token_max, recursively collapse +2. Reduce: If combined summaries exceed target, recursively collapse Key insight from LangChain: No need for predetermined levels (L1/L2/L3). Just keep collapsing until content fits. Dynamic depth based on actual content. 
@@ -18,6 +18,7 @@ import asyncio import logging from dataclasses import dataclass +from typing import TYPE_CHECKING from agent_cli.summarizer._prompts import ( CHUNK_SUMMARY_PROMPT, @@ -31,7 +32,9 @@ generate_summary, tokens_to_words, ) -from agent_cli.summarizer.models import SummarizerConfig, SummaryLevel + +if TYPE_CHECKING: + from agent_cli.summarizer.models import SummarizerConfig logger = logging.getLogger(__name__) @@ -61,28 +64,29 @@ class MapReduceResult: async def map_reduce_summarize( content: str, config: SummarizerConfig, + target: int | None = None, max_collapse_depth: int = 10, ) -> MapReduceResult: """Summarize content using map-reduce with dynamic collapse. Algorithm: 1. Split into chunks and summarize each (map phase) - 2. If combined summaries exceed token_max, recursively collapse (reduce phase) - 3. Continue until everything fits in token_max - - Note: This function is designed for content that exceeds token_max. For shorter - content, use the main `summarize()` function in adaptive.py which selects the - appropriate strategy (NONE, BRIEF, or MAP_REDUCE with content-aware prompts). + 2. If combined summaries exceed target, recursively collapse (reduce phase) + 3. Continue until everything fits in target Args: content: The content to summarize. config: Summarizer configuration. + target: Target token count. Defaults to config.token_max. max_collapse_depth: Safety limit on recursive collapse depth. Returns: MapReduceResult with summary and metadata. 
""" + if target is None: + target = config.token_max + input_tokens = count_tokens(content, config.model) # Map phase: Split and summarize chunks in parallel @@ -97,9 +101,9 @@ async def map_reduce_summarize( summaries = await _map_summarize(chunks, config) intermediate_summaries = [summaries.copy()] - # Reduce phase: Recursively collapse until fits token_max + # Reduce phase: Recursively collapse until fits target depth = 0 - while _total_tokens(summaries, config.model) > config.token_max: + while _total_tokens(summaries, config.model) > target: depth += 1 if depth > max_collapse_depth: logger.warning( @@ -109,17 +113,18 @@ async def map_reduce_summarize( break logger.info( - "Reduce phase (depth %d): collapsing %d summaries (%d tokens)", + "Reduce phase (depth %d): collapsing %d summaries (%d tokens) to target %d", depth, len(summaries), _total_tokens(summaries, config.model), + target, ) - summaries = await _collapse_summaries(summaries, config) + summaries = await _collapse_summaries(summaries, config, target) intermediate_summaries.append(summaries.copy()) # Final synthesis if we have multiple summaries left if len(summaries) > 1: - final_summary = await _synthesize(summaries, config) + final_summary = await _synthesize(summaries, config, target) else: final_summary = summaries[0] @@ -161,7 +166,7 @@ async def _summarize_chunk( ) -> str: """Summarize a single chunk.""" source_tokens = count_tokens(chunk, config.model) - target_tokens = estimate_summary_tokens(source_tokens, SummaryLevel.MAP_REDUCE) + target_tokens = estimate_summary_tokens(source_tokens) max_words = tokens_to_words(target_tokens) prompt = CHUNK_SUMMARY_PROMPT.format( @@ -177,16 +182,17 @@ async def _summarize_chunk( async def _collapse_summaries( summaries: list[str], config: SummarizerConfig, + target: int, ) -> list[str]: """Collapse summaries by grouping and re-summarizing (reduce phase). - Groups summaries that together fit within token_max, then summarizes each group. 
+ Groups summaries that together fit within target, then summarizes each group. This is similar to LangChain's split_list_of_docs approach. """ if len(summaries) <= 1: return summaries - # Group summaries that together fit within token_max + # Group summaries that together fit within target groups: list[list[str]] = [] current_group: list[str] = [] current_tokens = 0 @@ -194,8 +200,8 @@ async def _collapse_summaries( for summary in summaries: summary_tokens = count_tokens(summary, config.model) - # If adding this summary would exceed token_max, start new group - if current_tokens + summary_tokens > config.token_max and current_group: + # If adding this summary would exceed target, start new group + if current_tokens + summary_tokens > target and current_group: groups.append(current_group) current_group = [summary] current_tokens = summary_tokens @@ -211,16 +217,21 @@ async def _collapse_summaries( async def summarize_group(group: list[str]) -> str: async with semaphore: - return await _synthesize(group, config) + return await _synthesize(group, config, target) tasks = [summarize_group(g) for g in groups] return list(await asyncio.gather(*tasks)) -async def _synthesize(summaries: list[str], config: SummarizerConfig) -> str: +async def _synthesize( + summaries: list[str], + config: SummarizerConfig, + target: int, +) -> str: """Synthesize multiple summaries into one.""" combined_tokens = sum(count_tokens(s, config.model) for s in summaries) - target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.MAP_REDUCE) + # Aim for target tokens but use estimate if combined is smaller + target_tokens = min(target, estimate_summary_tokens(combined_tokens)) max_words = tokens_to_words(target_tokens) prompt = META_SUMMARY_PROMPT.format( diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py index 14be0c864..65eb42ed5 100644 --- a/agent_cli/summarizer/models.py +++ b/agent_cli/summarizer/models.py @@ -4,7 +4,6 @@ from dataclasses import dataclass 
from datetime import UTC, datetime -from enum import IntEnum from typing import Any from pydantic import BaseModel, Field @@ -24,7 +23,6 @@ class SummarizerConfig: model="llama3.1:8b", ) result = await summarize(long_document, config) - print(f"Level: {result.level.name}") print(f"Compression: {result.compression_ratio:.1%}") """ @@ -33,7 +31,7 @@ class SummarizerConfig: model: str api_key: str | None = None chunk_size: int = 2048 # BOOOOKSCORE's tested default - token_max: int = 3000 # LangChain's default - when to collapse + token_max: int = 3000 # LangChain's default - target size after compression chunk_overlap: int = 200 max_concurrent_chunks: int = 5 timeout: float = 60.0 @@ -45,32 +43,18 @@ def __post_init__(self) -> None: self.api_key = "not-needed" -class SummaryLevel(IntEnum): - """Summary strategy based on input length.""" - - NONE = 0 - """< 100 tokens: No summary needed.""" - - BRIEF = 1 - """100-500 tokens: Single-sentence summary.""" - - MAP_REDUCE = 2 - """> 500 tokens: Map-reduce with dynamic collapse.""" - - class SummaryResult(BaseModel): """Result of summarization. Contains the summary and metadata about the compression achieved. """ - level: SummaryLevel = Field(..., description="The summarization strategy used") summary: str | None = Field( default=None, - description="The final summary text (None for NONE level)", + description="The summary text (None if content already fit target)", ) input_tokens: int = Field(..., ge=0, description="Token count of the input content") - output_tokens: int = Field(..., ge=0, description="Token count of the summary") + output_tokens: int = Field(..., ge=0, description="Token count of the output") compression_ratio: float = Field( ..., ge=0.0, @@ -91,8 +75,9 @@ def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]: """Convert to metadata entry for ChromaDB storage. Returns a list with a single metadata dict for the summary. + Returns empty list if no summary was generated. 
""" - if self.level == SummaryLevel.NONE or not self.summary: + if not self.summary: return [] timestamp = self.created_at.isoformat() @@ -105,7 +90,6 @@ def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]: "conversation_id": conversation_id, "role": "summary", "is_final": True, - "summary_level": self.level.name, "input_tokens": self.input_tokens, "output_tokens": self.output_tokens, "compression_ratio": self.compression_ratio, diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md index 43caf336d..c7476142e 100644 --- a/docs/architecture/summarizer.md +++ b/docs/architecture/summarizer.md @@ -4,22 +4,23 @@ This document describes the architectural decisions, design rationale, and techn ## 1. System Overview -The adaptive summarizer provides **content-aware compression** using a map-reduce approach inspired by LangChain's chains. Rather than applying fixed summarization levels, it dynamically collapses content until it fits within a token budget. +The adaptive summarizer provides **content-aware compression** using a map-reduce approach inspired by LangChain's chains. It compresses content to fit within a specified token budget using a simple algorithm: ``` -Input Content ──▶ Token Count ──▶ Strategy Selection +Input Content ──▶ Token Count ──▶ Compare to Target │ - ┌───────────────────────────────┼─────────────────────┐ - │ │ │ - < 100 tokens 100-500 tokens > 500 tokens - │ │ │ - No summary Brief summary Map-Reduce - (single sentence) (dynamic collapse) + ┌───────────────────────┴───────────────────────┐ + │ │ + Fits target Exceeds target + │ │ + Return as-is Map-Reduce + (no LLM call) (dynamic collapse) ``` **Design Goals:** -- **Simple algorithm:** Map-reduce with dynamic collapse depth based on actual content. +- **Maximum simplicity:** Single entry point with straightforward logic. +- **Flexible targeting:** Specify absolute token count or relative compression ratio. 
- **Research-grounded defaults:** chunk_size=2048 (BOOOOKSCORE), token_max=3000 (LangChain). - **Content-type awareness:** Domain-specific prompts for conversations, journals, documents. @@ -47,59 +48,81 @@ BOOOOKSCORE's research on book-length summarization found optimal chunk sizes. T - Chunk size: **2048 tokens** (we use this) - Max summary length: **900 tokens** -### 2.3 Borrowed: Two-Phase Architecture (Mem0) - -**Reference:** arXiv:2504.19413 - -Mem0's memory layer research informed our storage architecture with a **two-phase approach**: separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to storage. - -### 2.4 Original Design (Not Research-Backed) +### 2.3 Original Design (Not Research-Backed) The following aspects are **original design choices without direct research justification**: -- **Token thresholds (100/500):** The boundaries between NONE/BRIEF/MAP_REDUCE were chosen heuristically. - **Content-type prompts:** Domain-specific prompts are original design. +- **Target ratio parameter:** The option to specify compression as a percentage is a convenience feature. --- ## 3. Architectural Decisions -### 3.1 Map-Reduce with Dynamic Collapse +### 3.1 Simple Target-Based Logic + +**Decision:** Use a simple "fits? return : compress" algorithm. + +**Rationale:** + +- **Minimal complexity:** No level selection logic, threshold management, or multiple code paths. +- **Clear semantics:** If content fits the target, return it unchanged. Otherwise, compress. +- **Flexible targeting:** Users can specify exact token counts or relative ratios. 
+ +**Algorithm:** + +```python +async def summarize( + content: str, + config: SummarizerConfig, + *, + target_tokens: int | None = None, # Absolute limit + target_ratio: float | None = None, # e.g., 0.2 = compress to 20% +) -> SummaryResult: + input_tokens = count_tokens(content) + + # Determine target + if target_ratio is not None: + target = max(1, int(input_tokens * target_ratio)) + elif target_tokens is not None: + target = target_tokens + else: + target = config.token_max # Default: 3000 + + # Already fits? Return as-is (no LLM call) + if input_tokens <= target: + return SummaryResult(summary=content, ...) + + # Compress using map-reduce + return await map_reduce_summarize(content, config, target) +``` -**Decision:** Use LangChain-style map-reduce instead of fixed hierarchy. +### 3.2 Map-Reduce with Dynamic Collapse + +**Decision:** Use LangChain-style map-reduce for all compression. **Rationale:** -- **Simpler algorithm:** Single code path handles all content sizes. +- **Single algorithm:** One code path handles all content sizes. - **Dynamic depth:** Collapse depth adapts to actual content length. - **Research-backed:** LangChain's approach is battle-tested. **Algorithm:** ```python -async def map_reduce_summarize(content, config): +async def map_reduce_summarize(content, config, target): # Map: Split and summarize chunks in parallel chunks = chunk_text(content, chunk_size=2048) summaries = await parallel_summarize(chunks) - # Reduce: Recursively collapse until fits token_max - while total_tokens(summaries) > config.token_max: - groups = group_by_token_limit(summaries, config.token_max) + # Reduce: Recursively collapse until fits target + while total_tokens(summaries) > target: + groups = group_by_token_limit(summaries, target) summaries = await parallel_synthesize(groups) return final_synthesis(summaries) ``` -### 3.2 Three-Level Strategy - -**Decision:** Use three levels based on token count. 
- -| Level | Token Range | Strategy | -| :--- | :--- | :--- | -| NONE | < 100 | No summarization needed | -| BRIEF | 100-500 | Single sentence | -| MAP_REDUCE | >= 500 | Dynamic collapse until fits token_max | - ### 3.3 Research-Backed Defaults **Decision:** Use values from published research. @@ -158,29 +181,29 @@ Every `SummaryResult` includes `input_tokens`, `output_tokens`, `compression_rat ## 4. Processing Pipeline -### 4.1 Level Selection +### 4.1 Entry Point -The entry point (`summarize()`) counts tokens and selects strategy: +The entry point (`summarize()`) implements simple logic: 1. **Token counting:** Uses tiktoken with model-appropriate encoding. Falls back to character-based estimation (~4 chars/token) if tiktoken unavailable. -2. **Threshold comparison:** Determines NONE, BRIEF, or MAP_REDUCE. -3. **Strategy dispatch:** Calls appropriate handler. +2. **Target calculation:** Determines target from `target_tokens`, `target_ratio`, or default `token_max`. +3. **Fit check:** If content fits target, return as-is. +4. **Compression:** If content exceeds target, summarize it in a single content-aware LLM call when it fits within `chunk_size`; otherwise call map-reduce. -### 4.2 Brief Level +### 4.2 Single-Chunk Content -For short content (100-500 tokens): +For content that fits within `chunk_size` but exceeds target: -- Single LLM call with brief prompt -- Returns `SummaryResult` with single-sentence summary +- Single LLM call with content-type aware prompt +- Returns `SummaryResult` with compressed summary -### 4.3 Map-Reduce Level +### 4.3 Multi-Chunk Content -For longer content (>= 500 tokens): +For larger content (> chunk_size tokens): -1. **Check single-chunk:** If content fits in token_max, use content-type aware summary directly. -2. **Map phase:** Split content into overlapping chunks, summarize each in parallel. -3. **Reduce phase:** If combined summaries exceed token_max, group and re-summarize recursively. -4. **Final synthesis:** Combine remaining summaries into final output. +1.
**Map phase:** Split content into overlapping chunks, summarize each in parallel. +2. **Reduce phase:** If combined summaries exceed target, group and re-summarize recursively. +3. **Final synthesis:** Combine remaining summaries into final output. The `collapse_depth` field in the result indicates how many reduce iterations were needed. @@ -188,29 +211,19 @@ The `collapse_depth` field in the result indicates how many reduce iterations we ## 5. Data Models -### 5.1 SummaryLevel - -```python -class SummaryLevel(IntEnum): - NONE = 0 # < 100 tokens - BRIEF = 1 # 100-500 tokens - MAP_REDUCE = 2 # >= 500 tokens -``` - -### 5.2 SummaryResult +### 5.1 SummaryResult ```python class SummaryResult(BaseModel): - level: SummaryLevel - summary: str | None + summary: str | None # None if content was empty input_tokens: int output_tokens: int - compression_ratio: float - collapse_depth: int # 0 = no collapse needed + compression_ratio: float # 0.0-1.0 + collapse_depth: int # 0 = no collapse needed created_at: datetime ``` -### 5.3 SummarizerConfig +### 5.2 SummarizerConfig ```python @dataclass @@ -219,7 +232,7 @@ class SummarizerConfig: model: str api_key: str | None = None chunk_size: int = 2048 # BOOOOKSCORE - token_max: int = 3000 # LangChain + token_max: int = 3000 # LangChain (default target) chunk_overlap: int = 200 max_concurrent_chunks: int = 5 timeout: float = 60.0 @@ -257,12 +270,12 @@ Summaries are stored with metadata: "metadata": { "conversation_id": conversation_id, "role": "summary", - "summary_level": "MAP_REDUCE", "input_tokens": 1500, "output_tokens": 150, "compression_ratio": 0.1, "collapse_depth": 1, "created_at": "2024-01-15T10:30:00Z", + "is_final": True, }, } ``` @@ -274,9 +287,9 @@ Summaries are stored with metadata: Summarization follows a fail-fast philosophy: - **LLM errors:** Propagated as `SummarizationError` (base class for all summarization errors). -- **Empty input:** Returns NONE level immediately (not an error). 
+- **Empty input:** Returns result with `summary=None` immediately (not an error). - **Encoding errors:** Falls back to character-based token estimation. -- **Max depth exceeded:** Warning logged, forces final synthesis even if over token_max. +- **Max depth exceeded:** Warning logged, forces final synthesis even if over target. The caller decides how to handle failures—typically by proceeding without a summary rather than blocking the entire operation. @@ -294,9 +307,41 @@ The caller decides how to handle failures—typically by proceeding without a su --- -## 9. Limitations and Trade-offs +## 9. Usage Examples + +### Basic Usage + +```python +from agent_cli.summarizer import SummarizerConfig, summarize + +config = SummarizerConfig( + openai_base_url="http://localhost:11434/v1", + model="llama3.1:8b", +) + +# Default: compress to fit 3000 tokens +result = await summarize(content, config) + +# Compress to specific token count +result = await summarize(content, config, target_tokens=500) -### 9.1 Fact Preservation +# Compress to 20% of original size +result = await summarize(content, config, target_ratio=0.2) + +# With content type for better prompts +result = await summarize( + content, + config, + target_tokens=500, + content_type="conversation", +) +``` + +--- + +## 10. Limitations and Trade-offs + +### 10.1 Fact Preservation Summarization is inherently lossy. Specific facts (dates, numbers, names) are often dropped in favor of thematic content. If your use case requires fact retrieval: @@ -304,18 +349,14 @@ Summarization is inherently lossy. Specific facts (dates, numbers, names) are of - Use fact extraction instead of summarization - Use RAG to retrieve original chunks -### 9.2 No Intermediate Summaries +### 10.2 No Intermediate Summaries Unlike hierarchical approaches, map-reduce only stores the final summary. Intermediate chunk summaries are discarded after synthesis. This simplifies storage but removes granular access. 
-### 9.3 Fixed Thresholds - -The 100/500 token thresholds are heuristic. They may need tuning for specific domains or languages. - --- -## 10. Future Improvements +## 11. Future Improvements 1. **Benchmark against BOOOOKSCORE metrics** for coherence evaluation -2. **Tune token thresholds empirically** with real-world content -3. **Add fact extraction mode** for use cases requiring specific detail preservation +2. **Add fact extraction mode** for use cases requiring specific detail preservation +3. **Streaming support** for real-time summarization feedback diff --git a/examples/summarizer_demo.py b/examples/summarizer_demo.py index 70d434dda..feebc5f20 100644 --- a/examples/summarizer_demo.py +++ b/examples/summarizer_demo.py @@ -1,15 +1,16 @@ -"""Demonstrate the summarizer on texts of varying lengths from the internet. +"""Demonstrate the simplified summarizer on texts of varying lengths. This script fetches content of different sizes and shows how the adaptive -summarizer automatically selects the appropriate strategy (BRIEF or MAP_REDUCE) -based on content length. +summarizer compresses content to fit different target token counts or ratios. 
Usage: python examples/summarizer_demo.py - # Test specific levels only - python examples/summarizer_demo.py --level brief - python examples/summarizer_demo.py --level map_reduce + # Test with specific target ratio + python examples/summarizer_demo.py --target-ratio 0.2 + + # Test with specific target token count + python examples/summarizer_demo.py --target-tokens 500 # Use a different model python examples/summarizer_demo.py --model "gpt-4o-mini" @@ -30,12 +31,11 @@ from agent_cli.summarizer import ( SummarizerConfig, - SummaryLevel, SummaryResult, summarize, ) -# Defaults for local AI setup (same as aijournal_poc.py) +# Defaults for local AI setup DEFAULT_BASE_URL = "http://192.168.1.143:9292/v1" DEFAULT_MODEL = "gpt-oss-high:20b" @@ -47,24 +47,17 @@ class TextSample: name: str description: str url: str - expected_level: SummaryLevel content_type: str = "general" # If URL fetch fails, use this fallback fallback_content: str | None = None -# Thresholds from adaptive.py: -# NONE: < 100 tokens -# BRIEF: 100-500 tokens -# MAP_REDUCE: >= 500 tokens - -# Sample texts of varying lengths to demonstrate different summarization levels +# Sample texts of varying lengths to demonstrate summarization SAMPLES: list[TextSample] = [ TextSample( - name="Brief - Short News Article", - description="~150-400 tokens - triggers BRIEF level (100-500 token range)", + name="Short News Article", + description="~150-400 tokens - demonstrates small content handling", url="https://httpbin.org/json", # Returns small JSON we'll convert to text - expected_level=SummaryLevel.BRIEF, fallback_content=""" Breaking News: Scientists at the Marine Biology Institute have made a groundbreaking discovery in the Mariana Trench. 
A new species of deep-sea @@ -94,10 +87,9 @@ class TextSample: """, ), TextSample( - name="Map-Reduce - Technology Article", - description="~800-2000 tokens - triggers MAP_REDUCE level (>=500 tokens)", + name="Technology Article", + description="~800-2000 tokens - demonstrates medium content", url="https://en.wikipedia.org/api/rest_v1/page/summary/Artificial_intelligence", - expected_level=SummaryLevel.MAP_REDUCE, content_type="document", fallback_content=""" Artificial intelligence (AI) is the intelligence of machines or software, @@ -174,21 +166,12 @@ class TextSample: """, ), TextSample( - name="Map-Reduce - Full Article", - description="~4000-10000 tokens - triggers MAP_REDUCE with chunking", + name="Full Article", + description="~4000-10000 tokens - demonstrates large content with chunking", url="https://en.wikipedia.org/api/rest_v1/page/mobile-html/Machine_learning", - expected_level=SummaryLevel.MAP_REDUCE, content_type="document", fallback_content=None, # We'll generate synthetic content ), - TextSample( - name="Map-Reduce - Long Document", - description="~16000+ tokens - triggers MAP_REDUCE with multiple collapse iterations", - url="https://www.gutenberg.org/cache/epub/84/pg84.txt", # Frankenstein (truncated) - expected_level=SummaryLevel.MAP_REDUCE, - content_type="document", - fallback_content=None, # We'll generate synthetic content (~16K tokens) - ), ] @@ -262,25 +245,11 @@ async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str: content = re.sub(r"<[^>]+>", " ", content) content = re.sub(r"\s+", " ", content).strip() - # Check if content is too short for expected level - min_words_for_level = { - SummaryLevel.BRIEF: 80, # Need ~100 tokens - SummaryLevel.MAP_REDUCE: 400, # Need ~500 tokens - } - min_words = min_words_for_level.get(sample.expected_level, 50) - + # Check if content is too short + min_words = 80 if len(content.split()) < min_words: print(f" 📎 Fetched content too short ({len(content.split())} words), using fallback") 
- if sample.fallback_content: - content = sample.fallback_content - else: - target_tokens = { - SummaryLevel.BRIEF: 300, - SummaryLevel.MAP_REDUCE: 1500, - } - content = generate_synthetic_content( - target_tokens.get(sample.expected_level, 1000), - ) + content = sample.fallback_content or generate_synthetic_content(1500) # For very long content, truncate to keep demo fast words = content.split() @@ -296,15 +265,17 @@ async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str: if sample.fallback_content: return sample.fallback_content.strip() - # Generate synthetic content for the expected level - target_tokens = { - SummaryLevel.BRIEF: 300, - SummaryLevel.MAP_REDUCE: 1500, - } - return generate_synthetic_content(target_tokens.get(sample.expected_level, 1000)) + # Generate synthetic content + return generate_synthetic_content(1500) -def print_result(sample: TextSample, result: SummaryResult, content: str) -> None: +def print_result( + sample: TextSample, + result: SummaryResult, + content: str, + target_tokens: int | None, + target_ratio: float | None, +) -> None: """Print a formatted summary result.""" print("\n" + "=" * 70) print(f"📄 {sample.name}") @@ -318,23 +289,30 @@ def print_result(sample: TextSample, result: SummaryResult, content: str) -> Non print(f" Tokens: {result.input_tokens:,}") print(f" Content type: {sample.content_type}") - # Summarization result - level_emoji = { - SummaryLevel.NONE: "⏭️", - SummaryLevel.BRIEF: "📝", - SummaryLevel.MAP_REDUCE: "🔄", - } - print("\n🎯 Summarization Result:") - print(f" Level: {level_emoji.get(result.level, '❓')} {result.level.name}") - print(f" Expected: {sample.expected_level.name}") - print(f" Match: {'✅' if result.level == sample.expected_level else '⚠️'}") + # Target info + print("\n🎯 Target:") + if target_ratio is not None: + print(f" Ratio: {target_ratio:.0%} of input") + print(f" Calculated target: ~{int(result.input_tokens * target_ratio):,} tokens") + elif target_tokens is not None: + 
print(f" Tokens: {target_tokens:,}") + else: + print(" Default: 3000 tokens (LangChain default)") + + # Result info + print("\n📝 Result:") + if result.summary == content: + print(" Status: ⏭️ Content already fits target (returned as-is)") + elif result.collapse_depth > 0: + print(f" Status: 🔄 Map-reduce summarization (collapse depth: {result.collapse_depth})") + else: + print(" Status: 📝 Single-pass summarization") + print(f" Output tokens: {result.output_tokens:,}") print(f" Compression: {result.compression_ratio:.1%}") - if result.collapse_depth > 0: - print(f" Collapse depth: {result.collapse_depth}") # Summary content - if result.summary: + if result.summary and result.summary != content: print("\n📝 Summary:") wrapped = textwrap.fill( result.summary, @@ -342,11 +320,15 @@ def print_result(sample: TextSample, result: SummaryResult, content: str) -> Non initial_indent=" ", subsequent_indent=" ", ) + # Only show the first 600 chars of the wrapped summary + if len(wrapped) > 600: # noqa: PLR2004 + wrapped = wrapped[:600] + "..."
print(wrapped) async def run_demo( - level_filter: str | None = None, + target_tokens: int | None = None, + target_ratio: float | None = None, model: str | None = None, base_url: str | None = None, ) -> None: @@ -369,39 +351,28 @@ async def run_demo( timeout=120.0, # Longer timeout for local models ) - # Filter samples if requested - samples = SAMPLES - if level_filter: - level_map = { - "brief": SummaryLevel.BRIEF, - "map_reduce": SummaryLevel.MAP_REDUCE, - } - target_level = level_map.get(level_filter.lower()) - if target_level: - samples = [s for s in SAMPLES if s.expected_level == target_level] - print(f"\n🔍 Filtering to {level_filter.upper()} level only") - async with httpx.AsyncClient() as client: - for sample in samples: + for sample in SAMPLES: print(f"\n⏳ Processing: {sample.name}...") # Fetch content content = await fetch_content(sample, client) try: - # Summarize + # Summarize with specified target result = await summarize( content=content, config=config, + target_tokens=target_tokens, + target_ratio=target_ratio, content_type=sample.content_type, ) # Display results - print_result(sample, result, content) + print_result(sample, result, content, target_tokens, target_ratio) except Exception as e: print(f"\n❌ Error summarizing {sample.name}: {e}") - traceback.print_exc() print("\n" + "=" * 70) @@ -417,16 +388,21 @@ def main() -> None: epilog=textwrap.dedent(""" Examples: python examples/summarizer_demo.py - python examples/summarizer_demo.py --level brief - python examples/summarizer_demo.py --level map_reduce + python examples/summarizer_demo.py --target-ratio 0.2 + python examples/summarizer_demo.py --target-tokens 500 python examples/summarizer_demo.py --model "llama3.1:8b" --base-url "http://localhost:11434/v1" """), ) parser.add_argument( - "--level", - choices=["brief", "map_reduce"], - help="Only test a specific summarization level", + "--target-ratio", + type=float, + help="Target ratio for compression (e.g., 0.2 = compress to 20%%)", + ) + 
parser.add_argument( + "--target-tokens", + type=int, + help="Target token count for summary", ) parser.add_argument( "--model", @@ -439,9 +415,13 @@ def main() -> None: args = parser.parse_args() + if args.target_ratio is not None and args.target_tokens is not None: + parser.error("Cannot specify both --target-ratio and --target-tokens") + asyncio.run( run_demo( - level_filter=args.level, + target_tokens=args.target_tokens, + target_ratio=args.target_ratio, model=args.model, base_url=args.base_url, ), diff --git a/tests/memory/test_engine.py b/tests/memory/test_engine.py index 44d0a031c..fc341b7df 100644 --- a/tests/memory/test_engine.py +++ b/tests/memory/test_engine.py @@ -23,7 +23,7 @@ Message, StoredMemory, ) -from agent_cli.summarizer import SummaryLevel, SummaryResult +from agent_cli.summarizer import SummaryResult class _DummyReranker: @@ -355,7 +355,6 @@ def __init__(self, output: Any) -> None: async def fake_summarize_content(**_kwargs: Any) -> SummaryResult: return SummaryResult( - level=SummaryLevel.MAP_REDUCE, summary="summary up to 256", input_tokens=100, output_tokens=20, @@ -582,7 +581,6 @@ def __init__(self, output: Any) -> None: async def fake_summarize_content(**_kwargs: Any) -> SummaryResult: return SummaryResult( - level=SummaryLevel.MAP_REDUCE, summary="summary text", input_tokens=100, output_tokens=20, diff --git a/tests/memory/test_git_integration.py b/tests/memory/test_git_integration.py index 86040d7a1..64130990c 100644 --- a/tests/memory/test_git_integration.py +++ b/tests/memory/test_git_integration.py @@ -14,7 +14,7 @@ from agent_cli.memory import _ingest from agent_cli.memory.client import MemoryClient from agent_cli.memory.entities import Fact -from agent_cli.summarizer import SummaryLevel, SummaryResult +from agent_cli.summarizer import SummaryResult if TYPE_CHECKING: from pathlib import Path @@ -66,7 +66,6 @@ async def fake_reconcile( async def fake_summarize_content(**_kwargs: Any) -> SummaryResult: return SummaryResult( - 
level=SummaryLevel.MAP_REDUCE, summary="User likes testing.", input_tokens=100, output_tokens=20, diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py index 202a55921..b7ce45e82 100644 --- a/tests/summarizer/test_adaptive.py +++ b/tests/summarizer/test_adaptive.py @@ -12,13 +12,8 @@ SummaryOutput, generate_summary, ) -from agent_cli.summarizer.adaptive import ( - THRESHOLD_BRIEF, - THRESHOLD_NONE, - determine_level, - summarize, -) -from agent_cli.summarizer.models import SummaryLevel, SummaryResult +from agent_cli.summarizer.adaptive import summarize +from agent_cli.summarizer.map_reduce import MapReduceResult class TestSummarizerConfig: @@ -83,39 +78,6 @@ def test_default_token_max_is_langchain(self) -> None: assert config.token_max == 3000 # LangChain's default -class TestDetermineLevel: - """Tests for level determination based on token count. - - The simplified approach has 3 levels: - - NONE: Very short content (< 100 tokens) - - BRIEF: Short content (100-500 tokens) - - MAP_REDUCE: Everything else (uses map-reduce) - """ - - def test_none_level_threshold(self) -> None: - """Test NONE level for very short content.""" - assert determine_level(50) == SummaryLevel.NONE - assert determine_level(99) == SummaryLevel.NONE - - def test_brief_level_threshold(self) -> None: - """Test BRIEF level for short content.""" - assert determine_level(100) == SummaryLevel.BRIEF - assert determine_level(300) == SummaryLevel.BRIEF - assert determine_level(499) == SummaryLevel.BRIEF - - def test_map_reduce_level_for_longer_content(self) -> None: - """Test that content >= 500 tokens uses MAP_REDUCE.""" - assert determine_level(500) == SummaryLevel.MAP_REDUCE - assert determine_level(1500) == SummaryLevel.MAP_REDUCE - assert determine_level(5000) == SummaryLevel.MAP_REDUCE - assert determine_level(20000) == SummaryLevel.MAP_REDUCE - - def test_thresholds_match_constants(self) -> None: - """Verify thresholds match the module constants.""" - assert 
THRESHOLD_NONE == 100 - assert THRESHOLD_BRIEF == 500 - - class TestSummarize: """Tests for main summarize function.""" @@ -128,133 +90,101 @@ def config(self) -> SummarizerConfig: ) @pytest.mark.asyncio - async def test_empty_content_returns_none_level( + async def test_empty_content_returns_no_summary( self, config: SummarizerConfig, ) -> None: - """Test that empty content returns NONE level result.""" + """Test that empty content returns result with no summary.""" result = await summarize("", config) - assert result.level == SummaryLevel.NONE assert result.summary is None assert result.input_tokens == 0 assert result.output_tokens == 0 @pytest.mark.asyncio - async def test_whitespace_only_returns_none_level( + async def test_whitespace_only_returns_no_summary( self, config: SummarizerConfig, ) -> None: - """Test that whitespace-only content returns NONE level result.""" + """Test that whitespace-only content returns result with no summary.""" result = await summarize(" \n\n ", config) - assert result.level == SummaryLevel.NONE assert result.summary is None @pytest.mark.asyncio - async def test_very_short_content_no_summary( + async def test_short_content_returns_as_is( self, config: SummarizerConfig, ) -> None: - """Test that very short content gets NONE level (no summary).""" - # Less than 100 tokens + """Test that short content is returned as-is (no LLM call).""" + # Less than default token_max (3000) result = await summarize("Hello world", config) - assert result.level == SummaryLevel.NONE - assert result.summary is None + assert result.summary == "Hello world" + assert result.compression_ratio == 1.0 # No compression @pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._brief_summary") - async def test_brief_level_calls_brief_summary( + async def test_target_tokens_respected( self, - mock_brief: AsyncMock, config: SummarizerConfig, ) -> None: - """Test that BRIEF level content calls _brief_summary.""" - mock_brief.return_value = "Brief summary." 
- - # Create content that's ~100-500 tokens - content = "This is a test sentence. " * 30 # ~150 tokens - - result = await summarize(content, config) - - mock_brief.assert_called_once_with(content, config) - assert result.level == SummaryLevel.BRIEF - assert result.summary == "Brief summary." + """Test that content fitting target_tokens is returned as-is.""" + content = "Short content" + result = await summarize(content, config, target_tokens=1000) + assert result.summary == content + assert result.compression_ratio == 1.0 @pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._map_reduce_summary") - async def test_longer_content_uses_map_reduce( + async def test_target_ratio_calculates_target( self, - mock_map_reduce: AsyncMock, config: SummarizerConfig, ) -> None: - """Test that content >= 500 tokens uses map-reduce.""" - mock_result = SummaryResult( - level=SummaryLevel.MAP_REDUCE, - summary="Map-reduce summary.", - input_tokens=800, - output_tokens=100, - compression_ratio=0.125, - ) - mock_map_reduce.return_value = mock_result - - # Create content that's ~500+ tokens - content = "This is a test sentence with more words. " * 100 # ~800 tokens - - result = await summarize(content, config, content_type="general") - - mock_map_reduce.assert_called_once() - assert result.summary == "Map-reduce summary." 
+ """Test that target_ratio calculates correct target.""" + # Short content that fits even with 10% target + content = "Hello" + result = await summarize(content, config, target_ratio=0.1) + # Content is so short it fits in 10% target + assert result.summary == content @pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._map_reduce_summary") - async def test_prior_summary_passed_to_map_reduce( + @patch("agent_cli.summarizer.adaptive._content_aware_summary") + async def test_content_exceeding_target_gets_summarized( self, - mock_map_reduce: AsyncMock, + mock_summary: AsyncMock, config: SummarizerConfig, ) -> None: - """Test that prior_summary is passed to _map_reduce_summary.""" - mock_result = SummaryResult( - level=SummaryLevel.MAP_REDUCE, - summary="Updated summary.", - input_tokens=800, - output_tokens=100, - compression_ratio=0.125, - ) - mock_map_reduce.return_value = mock_result + """Test that content exceeding target gets summarized.""" + mock_summary.return_value = "Summarized content." - content = "This is a test sentence with more words. " * 100 - prior = "Previous context summary." + # Create content that's ~500 tokens (exceeds target of 100) + content = "This is a test sentence. " * 100 - await summarize(content, config, prior_summary=prior) + result = await summarize(content, config, target_tokens=100) - # Verify prior_summary was passed - call_args = mock_map_reduce.call_args - assert call_args[0][3] == prior # prior_summary is 4th positional arg + mock_summary.assert_called_once() + assert result.summary == "Summarized content." 
@pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._map_reduce_summary") - async def test_very_long_content_uses_map_reduce( + @patch("agent_cli.summarizer.adaptive.map_reduce_summarize") + async def test_large_content_uses_map_reduce( self, mock_map_reduce: AsyncMock, config: SummarizerConfig, ) -> None: - """Test that very long content uses map-reduce.""" - mock_result = SummaryResult( - level=SummaryLevel.MAP_REDUCE, - summary="Long content summary.", - input_tokens=20000, - output_tokens=500, - compression_ratio=0.025, - collapse_depth=2, + """Test that content exceeding chunk_size uses map-reduce.""" + mock_map_reduce.return_value = MapReduceResult( + summary="Map-reduce summary.", + input_tokens=5000, + output_tokens=100, + compression_ratio=0.02, + collapse_depth=1, + intermediate_summaries=[["chunk1", "chunk2"]], ) - mock_map_reduce.return_value = mock_result - # Create content that's > 15000 tokens - content = "Word " * 20000 + # Create content larger than chunk_size (2048) + content = "Word " * 3000 # ~3000 tokens - result = await summarize(content, config) + result = await summarize(content, config, target_tokens=500) - assert mock_map_reduce.called - assert result.level == SummaryLevel.MAP_REDUCE + mock_map_reduce.assert_called_once() + assert result.summary == "Map-reduce summary." 
class TestGenerateSummary: diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py index f11fcff8b..867815ce9 100644 --- a/tests/summarizer/test_integration.py +++ b/tests/summarizer/test_integration.py @@ -2,32 +2,7 @@ from __future__ import annotations -from agent_cli.summarizer.adaptive import determine_level -from agent_cli.summarizer.models import SummaryLevel, SummaryResult - - -class TestDetermineLevel: - """Tests for determine_level function with various content sizes.""" - - def test_short_content_is_brief(self) -> None: - """Test that 100-500 token content uses BRIEF.""" - level = determine_level(200) - assert level == SummaryLevel.BRIEF - - def test_medium_content_is_map_reduce(self) -> None: - """Test that 500+ token content uses MAP_REDUCE.""" - level = determine_level(1000) - assert level == SummaryLevel.MAP_REDUCE - - def test_long_content_is_map_reduce(self) -> None: - """Test that 3000+ token content uses MAP_REDUCE.""" - level = determine_level(5000) - assert level == SummaryLevel.MAP_REDUCE - - def test_very_long_content_is_map_reduce(self) -> None: - """Test that content over 15000 tokens still uses MAP_REDUCE.""" - level = determine_level(20000) - assert level == SummaryLevel.MAP_REDUCE +from agent_cli.summarizer.models import SummaryResult class TestSummaryResultStorage: @@ -36,7 +11,6 @@ class TestSummaryResultStorage: def test_to_storage_metadata_creates_entry(self) -> None: """Test that to_storage_metadata creates a valid entry.""" result = SummaryResult( - level=SummaryLevel.MAP_REDUCE, summary="A comprehensive summary.", input_tokens=5000, output_tokens=100, @@ -52,13 +26,11 @@ def test_to_storage_metadata_creates_entry(self) -> None: assert entry["metadata"]["conversation_id"] == "test-conversation" assert entry["metadata"]["role"] == "summary" assert entry["metadata"]["is_final"] is True - assert entry["metadata"]["summary_level"] == "MAP_REDUCE" assert entry["metadata"]["collapse_depth"] == 1 - def 
test_none_level_returns_empty(self) -> None: - """Test that NONE level produces no storage entries.""" + def test_no_summary_returns_empty(self) -> None: + """Test that no summary produces no storage entries.""" result = SummaryResult( - level=SummaryLevel.NONE, summary=None, input_tokens=50, output_tokens=0, diff --git a/tests/summarizer/test_models.py b/tests/summarizer/test_models.py index c5b04f703..05d5625f4 100644 --- a/tests/summarizer/test_models.py +++ b/tests/summarizer/test_models.py @@ -7,73 +7,39 @@ import pytest from agent_cli.summarizer.models import ( - SummaryLevel, SummaryResult, ) -class TestSummaryLevel: - """Tests for SummaryLevel enum.""" - - def test_level_values(self) -> None: - """Test that levels have correct integer values.""" - assert SummaryLevel.NONE == 0 - assert SummaryLevel.BRIEF == 1 - assert SummaryLevel.MAP_REDUCE == 2 - - def test_level_ordering(self) -> None: - """Test that levels can be compared.""" - assert SummaryLevel.NONE < SummaryLevel.BRIEF - assert SummaryLevel.BRIEF < SummaryLevel.MAP_REDUCE - - class TestSummaryResult: """Tests for SummaryResult model.""" - def test_none_level_result(self) -> None: - """Test result for content that needs no summary.""" + def test_result_with_no_summary(self) -> None: + """Test result when content already fits target.""" result = SummaryResult( - level=SummaryLevel.NONE, summary=None, input_tokens=50, output_tokens=0, compression_ratio=0.0, ) - assert result.level == SummaryLevel.NONE assert result.summary is None assert result.collapse_depth == 0 - def test_brief_level_result(self) -> None: - """Test result for brief summary.""" - result = SummaryResult( - level=SummaryLevel.BRIEF, - summary="A brief one-sentence summary.", - input_tokens=200, - output_tokens=10, - compression_ratio=0.05, - ) - assert result.level == SummaryLevel.BRIEF - assert result.summary == "A brief one-sentence summary." 
- assert result.collapse_depth == 0 - - def test_map_reduce_result(self) -> None: - """Test result for map-reduce summary.""" + def test_result_with_summary(self) -> None: + """Test result with a generated summary.""" result = SummaryResult( - level=SummaryLevel.MAP_REDUCE, summary="A comprehensive summary.", input_tokens=5000, output_tokens=100, compression_ratio=0.02, collapse_depth=2, ) - assert result.level == SummaryLevel.MAP_REDUCE assert result.summary == "A comprehensive summary." assert result.collapse_depth == 2 - def test_to_storage_metadata_none_level(self) -> None: - """Test that NONE level produces no storage entries.""" + def test_to_storage_metadata_no_summary(self) -> None: + """Test that no summary produces no storage entries.""" result = SummaryResult( - level=SummaryLevel.NONE, summary=None, input_tokens=50, output_tokens=0, @@ -82,10 +48,9 @@ def test_to_storage_metadata_none_level(self) -> None: entries = result.to_storage_metadata("conv-123") assert entries == [] - def test_to_storage_metadata_simple_summary(self) -> None: + def test_to_storage_metadata_with_summary(self) -> None: """Test storage metadata for a summary.""" result = SummaryResult( - level=SummaryLevel.BRIEF, summary="A brief summary.", input_tokens=200, output_tokens=10, @@ -99,12 +64,10 @@ def test_to_storage_metadata_simple_summary(self) -> None: assert entry["metadata"]["conversation_id"] == "conv-456" assert entry["metadata"]["role"] == "summary" assert entry["metadata"]["is_final"] is True - assert entry["metadata"]["summary_level"] == "BRIEF" - def test_to_storage_metadata_map_reduce(self) -> None: - """Test storage metadata for map-reduce summary.""" + def test_to_storage_metadata_with_collapse_depth(self) -> None: + """Test storage metadata includes collapse depth.""" result = SummaryResult( - level=SummaryLevel.MAP_REDUCE, summary="Final synthesis of content.", input_tokens=20000, output_tokens=200, @@ -113,12 +76,10 @@ def test_to_storage_metadata_map_reduce(self) -> 
None: ) entries = result.to_storage_metadata("conv-789") - # Should have 1 entry (the final summary) assert len(entries) == 1 entry = entries[0] assert entry["id"] == "conv-789:summary" assert entry["content"] == "Final synthesis of content." - assert entry["metadata"]["summary_level"] == "MAP_REDUCE" assert entry["metadata"]["collapse_depth"] == 3 assert entry["metadata"]["is_final"] is True @@ -126,7 +87,6 @@ def test_compression_ratio_bounds(self) -> None: """Test compression ratio validation.""" # Valid ratio result = SummaryResult( - level=SummaryLevel.BRIEF, summary="Test", input_tokens=100, output_tokens=10, @@ -137,7 +97,6 @@ def test_compression_ratio_bounds(self) -> None: # Ratio must be between 0 and 1 with pytest.raises(ValueError, match="less than or equal to 1"): SummaryResult( - level=SummaryLevel.BRIEF, summary="Test", input_tokens=100, output_tokens=10, @@ -148,7 +107,6 @@ def test_created_at_default(self) -> None: """Test that created_at is automatically set.""" before = datetime.now(UTC) result = SummaryResult( - level=SummaryLevel.BRIEF, summary="Test", input_tokens=100, output_tokens=10, diff --git a/tests/summarizer/test_utils.py b/tests/summarizer/test_utils.py index 188a79172..89a441719 100644 --- a/tests/summarizer/test_utils.py +++ b/tests/summarizer/test_utils.py @@ -88,32 +88,32 @@ def test_large_paragraph_sentence_split(self) -> None: class TestEstimateSummaryTokens: """Tests for estimate_summary_tokens function.""" - def test_none_level(self) -> None: - """Test level 0 (NONE) returns 0.""" - assert estimate_summary_tokens(1000, level=0) == 0 - - def test_brief_level(self) -> None: - """Test level 1 (BRIEF) compression.""" - # BRIEF: ~20% compression, capped at 50, minimum 20 - result = estimate_summary_tokens(100, level=1) - assert result >= 20 # minimum of 20 - assert result <= 50 # capped at 50 - - def test_map_reduce_level(self) -> None: - """Test level 2 (MAP_REDUCE) compression.""" - # MAP_REDUCE: ~10% compression, capped at 500, 
minimum 50 - result = estimate_summary_tokens(1000, level=2) - assert result >= 50 # minimum of 50 - assert result <= 500 # capped at 500 - - def test_map_reduce_large_input(self) -> None: - """Test MAP_REDUCE with large input hits cap.""" - result = estimate_summary_tokens(50000, level=2) + def test_typical_input(self) -> None: + """Test typical input uses ~10% compression.""" + # ~10% compression, capped at 500, minimum 50 + result = estimate_summary_tokens(1000) + assert result == 100 # 1000 // 10 = 100 + + def test_medium_input(self) -> None: + """Test medium input stays within bounds.""" + result = estimate_summary_tokens(2000) + assert result == 200 # 2000 // 10 = 200 + assert result >= 50 # above floor + assert result <= 500 # below ceiling + + def test_large_input_hits_cap(self) -> None: + """Test large input hits 500 token cap.""" + result = estimate_summary_tokens(50000) assert result == 500 # capped at 500 - def test_map_reduce_small_input(self) -> None: - """Test MAP_REDUCE with small input uses floor.""" - result = estimate_summary_tokens(100, level=2) + def test_small_input_uses_floor(self) -> None: + """Test small input uses 50 token floor.""" + result = estimate_summary_tokens(100) + assert result == 50 # floor of 50 (100 // 10 = 10, but min is 50) + + def test_very_small_input(self) -> None: + """Test very small input still uses floor.""" + result = estimate_summary_tokens(10) assert result == 50 # floor of 50 From 1ca62668f0799be7a1fbe227b4eed4e64dd20e09 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 3 Dec 2025 20:16:20 -0800 Subject: [PATCH 36/37] chore(summarizer): remove dead code - Remove unused BRIEF_SUMMARY_PROMPT (brief level was removed) - Remove unused timeout field from SummarizerConfig - Update tests and examples accordingly --- agent_cli/summarizer/_prompts.py | 9 --------- agent_cli/summarizer/models.py | 1 - examples/summarizer_demo.py | 1 - tests/summarizer/test_adaptive.py | 2 -- tests/summarizer/test_prompts.py | 8 -------- 
5 files changed, 21 deletions(-) diff --git a/agent_cli/summarizer/_prompts.py b/agent_cli/summarizer/_prompts.py index de59f9404..e49fd417d 100644 --- a/agent_cli/summarizer/_prompts.py +++ b/agent_cli/summarizer/_prompts.py @@ -4,15 +4,6 @@ and are optimized for structured, factual output. """ -# Single sentence summary for short content (used at BRIEF level, 100-500 tokens) -BRIEF_SUMMARY_PROMPT = """Summarize the following in ONE sentence (maximum 20 words). -Focus on the single most important point or takeaway. - -Content: -{content} - -One-sentence summary:""".strip() - # Paragraph summary for "general" content type (default when no specific type provided) GENERAL_SUMMARY_PROMPT = """Summarize the following content concisely in a short paragraph. diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py index 65eb42ed5..721201da3 100644 --- a/agent_cli/summarizer/models.py +++ b/agent_cli/summarizer/models.py @@ -34,7 +34,6 @@ class SummarizerConfig: token_max: int = 3000 # LangChain's default - target size after compression chunk_overlap: int = 200 max_concurrent_chunks: int = 5 - timeout: float = 60.0 def __post_init__(self) -> None: """Normalize the base URL.""" diff --git a/examples/summarizer_demo.py b/examples/summarizer_demo.py index feebc5f20..f5d593a17 100644 --- a/examples/summarizer_demo.py +++ b/examples/summarizer_demo.py @@ -348,7 +348,6 @@ async def run_demo( api_key=api_key, chunk_size=2048, # BOOOOKSCORE default max_concurrent_chunks=3, - timeout=120.0, # Longer timeout for local models ) async with httpx.AsyncClient() as client: diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py index b7ce45e82..1fbf3d7ba 100644 --- a/tests/summarizer/test_adaptive.py +++ b/tests/summarizer/test_adaptive.py @@ -46,12 +46,10 @@ def test_init_with_custom_settings(self) -> None: chunk_size=5000, chunk_overlap=300, max_concurrent_chunks=10, - timeout=120.0, ) assert config.chunk_size == 5000 assert 
config.chunk_overlap == 300 assert config.max_concurrent_chunks == 10 - assert config.timeout == 120.0 def test_trailing_slash_stripped(self) -> None: """Test that trailing slash is stripped from base URL.""" diff --git a/tests/summarizer/test_prompts.py b/tests/summarizer/test_prompts.py index ef05ebad5..825fe077c 100644 --- a/tests/summarizer/test_prompts.py +++ b/tests/summarizer/test_prompts.py @@ -3,7 +3,6 @@ from __future__ import annotations from agent_cli.summarizer._prompts import ( - BRIEF_SUMMARY_PROMPT, CHUNK_SUMMARY_PROMPT, CONVERSATION_SUMMARY_PROMPT, DOCUMENT_SUMMARY_PROMPT, @@ -19,13 +18,6 @@ class TestPromptTemplates: """Tests for prompt template structure.""" - def test_brief_prompt_has_content_placeholder(self) -> None: - """Test BRIEF prompt contains content placeholder.""" - assert "{content}" in BRIEF_SUMMARY_PROMPT - # Test it can be formatted - result = BRIEF_SUMMARY_PROMPT.format(content="Test content") - assert "Test content" in result - def test_general_prompt_has_placeholders(self) -> None: """Test GENERAL prompt contains required placeholders.""" assert "{content}" in GENERAL_SUMMARY_PROMPT From f02c584b2d981d6455511d1226dbe50e071399b0 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 22 Apr 2026 14:34:48 -0700 Subject: [PATCH 37/37] fix(summarizer): persist final summaries as L3 entries --- agent_cli/_requirements/memory.txt | 4 +++- agent_cli/core/chroma.py | 1 + agent_cli/docs_gen.py | 11 +++++++++-- agent_cli/memory/_persistence.py | 19 ++----------------- agent_cli/memory/models.py | 2 +- agent_cli/summarizer/models.py | 2 ++ tests/memory/test_store.py | 17 ++++++++++++++++- tests/summarizer/test_integration.py | 2 ++ tests/summarizer/test_models.py | 14 ++++++++++++++ tests/test_api_integration.py | 17 +++++++++-------- uv.lock | 8 ++++++++ 11 files changed, 67 insertions(+), 30 deletions(-) diff --git a/agent_cli/_requirements/memory.txt b/agent_cli/_requirements/memory.txt index 9a4b89d7f..309c83c88 100644 --- 
a/agent_cli/_requirements/memory.txt +++ b/agent_cli/_requirements/memory.txt @@ -333,7 +333,9 @@ tenacity==9.1.2 # chromadb # google-genai tiktoken==0.12.0 - # via pydantic-ai-slim + # via + # agent-cli + # pydantic-ai-slim tokenizers==0.22.2 # via # chromadb diff --git a/agent_cli/core/chroma.py b/agent_cli/core/chroma.py index 0cc639cb7..22455fa65 100644 --- a/agent_cli/core/chroma.py +++ b/agent_cli/core/chroma.py @@ -3,6 +3,7 @@ from __future__ import annotations from typing import TYPE_CHECKING, Any + from agent_cli.constants import DEFAULT_OPENAI_EMBEDDING_MODEL if TYPE_CHECKING: diff --git a/agent_cli/docs_gen.py b/agent_cli/docs_gen.py index 49002f1a4..000448c23 100644 --- a/agent_cli/docs_gen.py +++ b/agent_cli/docs_gen.py @@ -16,6 +16,7 @@ from __future__ import annotations +from functools import cache from typing import Any, get_origin import click @@ -59,10 +60,16 @@ def _format_default(default: Any) -> str: return str(default) +@cache +def _get_root_click_app() -> click.Command: + """Build the Click app once for documentation introspection.""" + return get_command(app) + + def _get_click_command(command_path: str) -> click.Command | None: """Get a Click command from a path like 'transcribe' or 'memory.proxy'.""" parts = command_path.split(".") - click_app = get_command(app) + click_app = _get_root_click_app() cmd: click.Command | click.Group = click_app for part in parts: @@ -209,7 +216,7 @@ def _options_by_panel( def _list_commands() -> list[str]: """List all available commands including subcommands.""" - click_app = get_command(app) + click_app = _get_root_click_app() commands = [] def _walk(cmd: click.Command | click.Group, prefix: str = "") -> None: diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py index 46ac03631..862f753e0 100644 --- a/agent_cli/memory/_persistence.py +++ b/agent_cli/memory/_persistence.py @@ -4,7 +4,6 @@ import logging import shutil -from datetime import UTC, datetime from typing import 
TYPE_CHECKING from agent_cli.memory._files import ( @@ -207,23 +206,9 @@ def persist_summary( return [] stored_ids: list[str] = [] - created_at = datetime.now(UTC).isoformat() for entry in entries: - meta_dict = entry["metadata"] - # Build MemoryMetadata from the summary result's metadata dict - metadata = MemoryMetadata( - conversation_id=meta_dict["conversation_id"], - role=meta_dict["role"], - created_at=meta_dict.get("created_at", created_at), - summary_kind="summary", - is_final=meta_dict.get("is_final"), - input_tokens=meta_dict.get("input_tokens"), - output_tokens=meta_dict.get("output_tokens"), - compression_ratio=meta_dict.get("compression_ratio"), - summary_level=meta_dict.get("summary_level"), - collapse_depth=meta_dict.get("collapse_depth"), - ) + metadata = MemoryMetadata(**entry["metadata"]) record = write_memory_file( memory_root, content=entry["content"], @@ -233,7 +218,7 @@ def persist_summary( LOGGER.info( "Persisted summary file: %s (level=%s)", record.path, - meta_dict.get("summary_level"), + metadata.level, ) stored_ids.append(record.id) diff --git a/agent_cli/memory/models.py b/agent_cli/memory/models.py index d52d952ce..e2463ffe9 100644 --- a/agent_cli/memory/models.py +++ b/agent_cli/memory/models.py @@ -51,7 +51,7 @@ class MemoryMetadata(BaseModel): # Summary fields (only used when role="summary") level: int | None = None - """Summary level (deprecated, kept for file structure compatibility).""" + """Summary level used for hierarchical summary retrieval and file layout.""" is_final: bool | None = None """Whether this is the final summary.""" chunk_index: int | None = None diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py index 721201da3..99354e696 100644 --- a/agent_cli/summarizer/models.py +++ b/agent_cli/summarizer/models.py @@ -88,6 +88,8 @@ def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]: "metadata": { "conversation_id": conversation_id, "role": "summary", + "summary_kind": 
"summary", + "level": 3, "is_final": True, "input_tokens": self.input_tokens, "output_tokens": self.output_tokens, diff --git a/tests/memory/test_store.py b/tests/memory/test_store.py index 29dbe2e55..d7e19f80d 100644 --- a/tests/memory/test_store.py +++ b/tests/memory/test_store.py @@ -17,12 +17,14 @@ def __init__( self.query_result = query_result or {} self.get_result = get_result or {} self.deleted: list[list[str]] = [] + self.get_calls: list[dict[str, Any]] = [] self.upserts: list[tuple[list[str], list[str], list[dict[str, Any]]]] = [] def query(self, **_kwargs: Any) -> dict[str, Any]: return self.query_result - def get(self, **_kwargs: Any) -> dict[str, Any]: + def get(self, **kwargs: Any) -> dict[str, Any]: + self.get_calls.append(kwargs) return self.get_result def delete(self, ids: list[str]) -> None: @@ -146,6 +148,8 @@ def test_upsert_summary_entries_simple() -> None: "metadata": { "conversation_id": "conv-123", "role": "summary", + "summary_kind": "summary", + "level": 3, "is_final": True, "summary_level": "MAP_REDUCE", "input_tokens": 1000, @@ -164,6 +168,8 @@ def test_upsert_summary_entries_simple() -> None: upserted_ids, upserted_docs, upserted_metas = fake.upserts[0] assert upserted_ids == ["conv-123:summary"] assert upserted_docs == ["A paragraph summary."] + assert upserted_metas[0]["summary_kind"] == "summary" + assert upserted_metas[0]["level"] == 3 assert upserted_metas[0]["is_final"] is True @@ -177,6 +183,8 @@ def test_upsert_summary_entries_with_collapse_depth() -> None: "metadata": { "conversation_id": "conv-456", "role": "summary", + "summary_kind": "summary", + "level": 3, "is_final": True, "summary_level": "MAP_REDUCE", "input_tokens": 5000, @@ -214,6 +222,8 @@ def test_get_final_summary_returns_summary() -> None: { "conversation_id": "c1", "role": "summary", + "summary_kind": "summary", + "level": 3, "is_final": True, "summary_level": "MAP_REDUCE", "collapse_depth": 1, @@ -228,6 +238,11 @@ def test_get_final_summary_returns_summary() -> 
None: assert result is not None assert result.content == "The final summary" + assert fake.get_calls[0]["where"]["$and"] == [ + {"conversation_id": "c1"}, + {"role": "summary"}, + {"level": 3}, + ] assert result.metadata.is_final is True diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py index 867815ce9..7301d1071 100644 --- a/tests/summarizer/test_integration.py +++ b/tests/summarizer/test_integration.py @@ -25,6 +25,8 @@ def test_to_storage_metadata_creates_entry(self) -> None: assert entry["content"] == "A comprehensive summary." assert entry["metadata"]["conversation_id"] == "test-conversation" assert entry["metadata"]["role"] == "summary" + assert entry["metadata"]["summary_kind"] == "summary" + assert entry["metadata"]["level"] == 3 assert entry["metadata"]["is_final"] is True assert entry["metadata"]["collapse_depth"] == 1 diff --git a/tests/summarizer/test_models.py b/tests/summarizer/test_models.py index 05d5625f4..98db88d39 100644 --- a/tests/summarizer/test_models.py +++ b/tests/summarizer/test_models.py @@ -65,6 +65,20 @@ def test_to_storage_metadata_with_summary(self) -> None: assert entry["metadata"]["role"] == "summary" assert entry["metadata"]["is_final"] is True + def test_to_storage_metadata_marks_final_summary_for_memory_layer(self) -> None: + """Test storage metadata includes the fields memory retrieval expects.""" + result = SummaryResult( + summary="A brief summary.", + input_tokens=200, + output_tokens=10, + compression_ratio=0.05, + ) + entries = result.to_storage_metadata("conv-456") + + entry = entries[0] + assert entry["metadata"]["summary_kind"] == "summary" + assert entry["metadata"]["level"] == 3 + def test_to_storage_metadata_with_collapse_depth(self) -> None: """Test storage metadata includes collapse depth.""" result = SummaryResult( diff --git a/tests/test_api_integration.py b/tests/test_api_integration.py index 858944029..163c74a4b 100644 --- a/tests/test_api_integration.py +++ 
b/tests/test_api_integration.py @@ -160,10 +160,15 @@ def test_api_configuration_handling(monkeypatch: MonkeyPatch) -> None: assert True # Config is created during request -def test_temp_file_cleanup(client: TestClient) -> None: +def test_temp_file_cleanup( + client: TestClient, + monkeypatch: MonkeyPatch, + tmp_path: Path, +) -> None: """Test that temporary files are cleaned up after processing.""" + monkeypatch.setattr(tempfile, "tempdir", str(tmp_path), raising=False) temp_dir = Path(tempfile.gettempdir()) - temp_files_before = set(temp_dir.iterdir()) + wav_files_before = set(temp_dir.glob("*.wav")) with patch("agent_cli.server.proxy.api._transcribe_with_provider") as mock_transcribe: mock_transcribe.return_value = "test" @@ -183,12 +188,8 @@ def test_temp_file_cleanup(client: TestClient) -> None: # Give a moment for cleanup time.sleep(0.1) - temp_files_after = set(temp_dir.iterdir()) - new_files = temp_files_after - temp_files_before - - # No new WAV files should remain - wav_files = [f for f in new_files if f.name.endswith(".wav")] - assert len(wav_files) == 0 + wav_files_after = set(temp_dir.glob("*.wav")) + assert wav_files_after - wav_files_before == set() @pytest.mark.asyncio diff --git a/uv.lock b/uv.lock index 52ecaf5e2..34de4fdf3 100644 --- a/uv.lock +++ b/uv.lock @@ -53,6 +53,7 @@ dev = [ { name = "pytest-mock" }, { name = "pytest-timeout" }, { name = "ruff" }, + { name = "tiktoken" }, { name = "versioningit" }, { name = "zensical" }, ] @@ -79,6 +80,7 @@ memory = [ { name = "openai" }, { name = "pydantic-ai-slim", extra = ["google", "openai"] }, { name = "pyyaml" }, + { name = "tiktoken" }, { name = "transformers" }, { name = "watchfiles" }, ] @@ -113,6 +115,7 @@ test = [ { name = "pytest-cov" }, { name = "pytest-mock" }, { name = "pytest-timeout" }, + { name = "tiktoken" }, ] vad = [ { name = "onnxruntime" }, @@ -150,6 +153,7 @@ dev = [ { name = "pytest-mock" }, { name = "pytest-timeout" }, { name = "ruff" }, + { name = "tiktoken" }, { name = 
"versioningit" }, { name = "zensical" }, ] @@ -218,6 +222,9 @@ requires-dist = [ { name = "setproctitle" }, { name = "sounddevice", marker = "extra == 'audio'", specifier = ">=0.4.6" }, { name = "soundfile", marker = "extra == 'kokoro'", specifier = ">=0.12.0" }, + { name = "tiktoken", marker = "extra == 'dev'", specifier = ">=0.5.0" }, + { name = "tiktoken", marker = "extra == 'memory'", specifier = ">=0.5.0" }, + { name = "tiktoken", marker = "extra == 'test'", specifier = ">=0.5.0" }, { name = "torch", marker = "extra == 'whisper-transformers'", specifier = ">=2.0.0" }, { name = "transformers", marker = "extra == 'kokoro'", specifier = ">=4.40.0" }, { name = "transformers", marker = "extra == 'memory'", specifier = ">=4.30.0" }, @@ -250,6 +257,7 @@ dev = [ { name = "pytest-mock" }, { name = "pytest-timeout" }, { name = "ruff" }, + { name = "tiktoken", specifier = ">=0.5.0" }, { name = "versioningit" }, { name = "zensical" }, ]