From ca905ca0ac79c69f297c7d811e76563d6f276718 Mon Sep 17 00:00:00 2001 From: leored Date: Wed, 18 Feb 2026 23:34:05 +0100 Subject: [PATCH 1/2] Implement paragraph-level chunking per BR-EMBEDDING-001 The HierarchicalChunker was treating entire sections as single chunks regardless of paragraph count. This adds SPLIT_BY_PARAGRAPHS strategy that splits on blank lines, merges short paragraphs (<20 words), and falls back to size-based splitting for oversized paragraphs. Also fixes an infinite loop in _chunk_text when overlap sent current_pos backwards on the last chunk. Co-Authored-By: Claude Opus 4.6 --- src/kb_engine/smart/chunking/hierarchical.py | 89 ++++++++ src/kb_engine/smart/schemas/entity.py | 2 +- src/kb_engine/smart/types.py | 1 + .../chunking/test_hierarchical_chunker.py | 216 ++++++++++++++++++ 4 files changed, 307 insertions(+), 1 deletion(-) create mode 100644 tests/unit/chunking/test_hierarchical_chunker.py diff --git a/src/kb_engine/smart/chunking/hierarchical.py b/src/kb_engine/smart/chunking/hierarchical.py index 4248403..4bfd1b5 100644 --- a/src/kb_engine/smart/chunking/hierarchical.py +++ b/src/kb_engine/smart/chunking/hierarchical.py @@ -111,6 +111,19 @@ async def chunk( sequence += 1 section_log.debug("chunker.section.keep_intact", content_length=len(section.content)) + elif strategy == ChunkingStrategy.SPLIT_BY_PARAGRAPHS: + text_chunks = self._split_by_paragraphs( + text=section.content, + context=context, + doc_id=doc_id, + doc_kind=parsed.kind, + section_name=section.name, + start_sequence=sequence, + ) + chunks.extend(text_chunks) + sequence += len(text_chunks) + section_log.debug("chunker.section.paragraphs", count=len(text_chunks)) + elif strategy == ChunkingStrategy.SPLIT_BY_ITEMS: items = self._extract_list_items(section.content) for item in items: @@ -270,10 +283,86 @@ def _chunk_text( chunks.append(chunk) seq += 1 + if end_pos >= len(text): + break current_pos = end_pos - self.chunk_overlap return chunks + MIN_PARAGRAPH_WORDS = 20 + + 
def _split_by_paragraphs( + self, + text: str, + context: HierarchicalContext, + doc_id: str, + doc_kind, + section_name: str, + start_sequence: int, + ) -> list[ContextualizedChunk]: + """Split text into chunks by paragraph per BR-EMBEDDING-001. + + Rules: + 1. Each paragraph (separated by blank lines) produces one chunk. + 2. Paragraphs with < 20 words are merged with the next paragraph. + 3. If a merged paragraph exceeds max_chunk_size, fall back to + size-based splitting for that paragraph. + """ + raw_paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()] + + if not raw_paragraphs: + return [] + + # Merge short paragraphs (< 20 words) with the next one + merged: list[str] = [] + buffer = "" + for para in raw_paragraphs: + if buffer: + buffer = f"{buffer}\n\n{para}" + else: + buffer = para + + if len(buffer.split()) >= self.MIN_PARAGRAPH_WORDS: + merged.append(buffer) + buffer = "" + + # Flush remaining buffer + if buffer: + if merged: + merged[-1] = f"{merged[-1]}\n\n{buffer}" + else: + merged.append(buffer) + + # Create chunks, falling back to size-based splitting for large paragraphs + chunks: list[ContextualizedChunk] = [] + seq = start_sequence + for para in merged: + if len(para) > self.max_chunk_size: + sub_chunks = self._chunk_text( + text=para, + context=context, + doc_id=doc_id, + doc_kind=doc_kind, + section_name=section_name, + start_sequence=seq, + ) + chunks.extend(sub_chunks) + seq += len(sub_chunks) + else: + chunk = self._create_chunk( + content=para, + context=context, + chunk_type="paragraph", + doc_id=doc_id, + doc_kind=doc_kind, + section_name=section_name, + sequence=seq, + ) + chunks.append(chunk) + seq += 1 + + return chunks + def _extract_non_table_text(self, section: ParsedSection) -> str: """Extract text content that's not part of tables.""" content = section.content diff --git a/src/kb_engine/smart/schemas/entity.py b/src/kb_engine/smart/schemas/entity.py index 305466e..eef4844 100644 --- 
a/src/kb_engine/smart/schemas/entity.py +++ b/src/kb_engine/smart/schemas/entity.py @@ -35,7 +35,7 @@ aliases=["Description"], required=True, content_expectation=ContentExpectation.TEXT, - chunking_strategy=ChunkingStrategy.KEEP_INTACT, + chunking_strategy=ChunkingStrategy.SPLIT_BY_PARAGRAPHS, description="Entity description", ), SectionDefinition( diff --git a/src/kb_engine/smart/types.py b/src/kb_engine/smart/types.py index a2d4c29..3faf8b1 100644 --- a/src/kb_engine/smart/types.py +++ b/src/kb_engine/smart/types.py @@ -35,6 +35,7 @@ class ChunkingStrategy(str, Enum): KEEP_INTACT = "keep_intact" TABLE_ROWS = "table_rows" SPLIT_BY_ITEMS = "split_by_items" + SPLIT_BY_PARAGRAPHS = "split_by_paragraphs" class ContentExpectation(str, Enum): diff --git a/tests/unit/chunking/test_hierarchical_chunker.py b/tests/unit/chunking/test_hierarchical_chunker.py new file mode 100644 index 0000000..d29c173 --- /dev/null +++ b/tests/unit/chunking/test_hierarchical_chunker.py @@ -0,0 +1,216 @@ +"""Tests for HierarchicalChunker paragraph splitting (BR-EMBEDDING-001).""" + +import pytest + +from kb_engine.smart.chunking import HierarchicalChunker, MockSummaryService +from kb_engine.smart.types import ( + ChunkingStrategy, + ContentExpectation, + KDDDocumentKind, + ParsedDocument, + ParsedSection, + SectionDefinition, + TemplateSchema, +) + +# Reusable paragraphs with >= 20 words each +P_PEDIDO = ( + "Representa un pedido de compra realizado por un Usuario registrado " + "en la plataforma de comercio electrónico del sistema principal de ventas." +) # 21 words +P_CICLO = ( + "El pedido tiene un ciclo de vida completo que va desde borrador hasta " + "entregado, pasando por confirmado, en preparación y finalmente enviado." +) # 22 words +P_LINEAS = ( + "Cada pedido contiene una o más líneas con productos seleccionados " + "del catálogo vigente, incluyendo cantidades y precios unitarios actualizados al momento." 
+) # 21 words + + +def _make_schema( + chunking_strategy: ChunkingStrategy = ChunkingStrategy.SPLIT_BY_PARAGRAPHS, + section_name: str = "Descripción", +) -> TemplateSchema: + return TemplateSchema( + kind=KDDDocumentKind.ENTITY, + required_sections=[ + SectionDefinition( + name=section_name, + required=True, + content_expectation=ContentExpectation.TEXT, + chunking_strategy=chunking_strategy, + ), + ], + ) + + +def _make_parsed( + content: str, + section_name: str = "Descripción", + doc_id: str = "TEST-001", +) -> ParsedDocument: + return ParsedDocument( + kind=KDDDocumentKind.ENTITY, + frontmatter={"id": doc_id}, + title="TestEntity", + sections=[ + ParsedSection( + name=section_name, + level=2, + content=content, + ), + ], + tables=[], + code_blocks=[], + cross_references=[], + validation_errors=[], + raw_content=content, + ) + + +def _make_chunker(max_chunk_size: int = 1024) -> HierarchicalChunker: + return HierarchicalChunker( + summary_service=MockSummaryService(), + max_chunk_size=max_chunk_size, + ) + + +@pytest.mark.unit +class TestSplitByParagraphs: + """Tests for SPLIT_BY_PARAGRAPHS strategy per BR-EMBEDDING-001.""" + + @pytest.mark.asyncio + async def test_two_paragraphs_produce_two_chunks(self): + """Each paragraph (>= 20 words) produces an independent chunk.""" + text = f"{P_PEDIDO}\n\n{P_CICLO}" + chunker = _make_chunker() + chunks = await chunker.chunk(_make_parsed(text), _make_schema()) + + assert len(chunks) == 2 + assert chunks[0].chunk_type == "paragraph" + assert chunks[1].chunk_type == "paragraph" + assert "pedido de compra" in chunks[0].content + assert "ciclo de vida" in chunks[1].content + + @pytest.mark.asyncio + async def test_short_paragraph_merged_with_next(self): + """Paragraphs with < 20 words are merged with the next one.""" + short = "Párrafo corto introductorio." 
+ text = f"{short}\n\n{P_PEDIDO}" + chunker = _make_chunker() + chunks = await chunker.chunk(_make_parsed(text), _make_schema()) + + assert len(chunks) == 1 + assert "Párrafo corto" in chunks[0].content + assert "pedido de compra" in chunks[0].content + + @pytest.mark.asyncio + async def test_multiple_short_paragraphs_merged_until_threshold(self): + """Multiple short paragraphs accumulate until >= 20 words.""" + p1 = "Primera frase corta." + p2 = "Segunda frase corta." + p3 = "Tercera frase corta." + text = f"{p1}\n\n{p2}\n\n{p3}\n\n{P_PEDIDO}" + chunker = _make_chunker() + chunks = await chunker.chunk(_make_parsed(text), _make_schema()) + + assert len(chunks) == 1 + assert "Primera frase" in chunks[0].content + assert "pedido de compra" in chunks[0].content + + @pytest.mark.asyncio + async def test_trailing_short_paragraph_appended_to_last(self): + """A trailing short paragraph is appended to the last chunk.""" + trailing = "Nota final breve." + text = f"{P_PEDIDO}\n\n{trailing}" + chunker = _make_chunker() + chunks = await chunker.chunk(_make_parsed(text), _make_schema()) + + assert len(chunks) == 1 + assert "Nota final breve." 
in chunks[0].content + assert "pedido de compra" in chunks[0].content + + @pytest.mark.asyncio + async def test_single_paragraph_produces_one_chunk(self): + """A single paragraph produces exactly one chunk.""" + chunker = _make_chunker() + chunks = await chunker.chunk(_make_parsed(P_PEDIDO), _make_schema()) + + assert len(chunks) == 1 + assert chunks[0].chunk_type == "paragraph" + + @pytest.mark.asyncio + async def test_empty_content_produces_no_chunks(self): + """Empty content produces zero chunks.""" + chunker = _make_chunker() + chunks = await chunker.chunk(_make_parsed(""), _make_schema()) + + assert len(chunks) == 0 + + @pytest.mark.asyncio + async def test_large_paragraph_falls_back_to_size_split(self): + """A paragraph exceeding max_chunk_size is split by size.""" + large = " ".join(f"word{i}" for i in range(300)) + chunker = _make_chunker(max_chunk_size=200) + chunks = await chunker.chunk(_make_parsed(large), _make_schema()) + + assert len(chunks) > 1 + for chunk in chunks: + assert chunk.chunk_type == "text" + + @pytest.mark.asyncio + async def test_chunk_ids_are_sequential(self): + """Chunk IDs follow the doc_id#sequence pattern.""" + text = f"{P_PEDIDO}\n\n{P_CICLO}" + chunker = _make_chunker() + chunks = await chunker.chunk( + _make_parsed(text, doc_id="ENT-001"), _make_schema() + ) + + assert chunks[0].id == "ENT-001#0" + assert chunks[1].id == "ENT-001#1" + + @pytest.mark.asyncio + async def test_three_paragraphs_produce_three_chunks(self): + """Three substantial paragraphs produce three chunks (BR-EMBEDDING-001 example).""" + text = f"{P_PEDIDO}\n\n{P_CICLO}\n\n{P_LINEAS}" + chunker = _make_chunker() + chunks = await chunker.chunk(_make_parsed(text), _make_schema()) + + assert len(chunks) == 3 + assert "pedido de compra" in chunks[0].content + assert "ciclo de vida" in chunks[1].content + assert "líneas con productos" in chunks[2].content + + @pytest.mark.asyncio + async def test_contextualized_content_includes_prefix(self): + """Each chunk's 
contextualized_content includes the hierarchical prefix.""" + chunker = _make_chunker() + chunks = await chunker.chunk(_make_parsed(P_PEDIDO), _make_schema()) + + assert len(chunks) == 1 + assert chunks[0].contextualized_content != chunks[0].content + assert chunks[0].content in chunks[0].contextualized_content + + @pytest.mark.asyncio + async def test_whitespace_only_paragraphs_ignored(self): + """Blank lines between paragraphs don't produce empty chunks.""" + text = f"{P_PEDIDO}\n\n \n\n{P_CICLO}" + chunker = _make_chunker() + chunks = await chunker.chunk(_make_parsed(text), _make_schema()) + + assert len(chunks) == 2 + + +@pytest.mark.unit +class TestEntitySchemaUsesParagraphSplitting: + """Verify entity schema uses SPLIT_BY_PARAGRAPHS for Descripción.""" + + def test_descripcion_strategy(self): + from kb_engine.smart.schemas.entity import ENTITY_SCHEMA + + descripcion = next( + s for s in ENTITY_SCHEMA.required_sections if s.name == "Descripción" + ) + assert descripcion.chunking_strategy == ChunkingStrategy.SPLIT_BY_PARAGRAPHS From c03380a1d77d7eed933e3a31c39d36e3535914cc Mon Sep 17 00:00:00 2001 From: leored Date: Fri, 20 Feb 2026 18:56:43 +0100 Subject: [PATCH 2/2] Migrate from Python to TypeScript/Bun MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the entire Python stack (src/kb_engine/, src/kdd/, tests/, docker/, migrations/) with a single TypeScript/Bun package. The new implementation covers indexing (16 extractors, paragraph chunking, embeddings via @huggingface/transformers), 6 query types (hybrid, graph, impact, semantic, coverage, violations), CLI (citty, 7 subcommands), and MCP server (@modelcontextprotocol/sdk, 7 tools). 
Key changes: - Runtime: Python 3.11+ → Bun/TypeScript - Graph: FalkorDB/NetworkX → graphology (in-memory) - Vectors: ChromaDB/HNSWLib → brute-force cosine (in-memory) - Embeddings: sentence-transformers → @huggingface/transformers - Storage: SQLite/PostgreSQL → .kdd-index/ JSON artifacts - CLI: Click → citty - MCP: FastMCP (Python) → @modelcontextprotocol/sdk (TS) - Docs: README, architecture doc, CI workflow rewritten; ADRs superseded Co-Authored-By: Claude Opus 4.6 --- .env.example | 71 -- .github/workflows/cd.yml | 59 -- .github/workflows/ci.yml | 113 +-- .gitignore | 106 +-- .pre-commit-config.yaml | 28 - Makefile | 138 +--- README.md | 361 +++----- bench/compare.ts | 114 +++ bun.lock | 403 +++++++++ docker/Dockerfile | 48 -- docker/Dockerfile.dev | 24 - docker/docker-compose.yml | 63 -- docs/architecture/kdd-engine.md | 325 +++----- docs/architecture/smart-ingestion-pipeline.md | 259 ------ ...ository-pattern-for-storage-abstraction.md | 3 +- ...ADR-0002-kdd-semantic-chunking-strategy.md | 3 +- .../ADR-0003-entity-extraction-pipeline.md | 3 +- .../ADR-0004-mcp-server-agent-integration.md | 3 +- docs/design/adr/README.md | 11 +- docs/design/challenges/README.md | 4 + docs/design/requirements.md | 349 -------- migrations/alembic.ini | 79 -- migrations/env.py | 87 -- migrations/versions/.gitkeep | 0 package.json | 27 + pyproject.toml | 155 ---- src/application/chunking.ts | 138 ++++ src/application/commands/index-document.ts | 129 +++ src/application/extractors/base.ts | 208 +++++ src/application/extractors/kinds/adr.ts | 32 + .../extractors/kinds/business-policy.ts | 53 ++ .../extractors/kinds/business-rule.ts | 55 ++ src/application/extractors/kinds/command.ts | 55 ++ .../extractors/kinds/cross-policy.ts | 53 ++ src/application/extractors/kinds/entity.ts | 116 +++ src/application/extractors/kinds/event.ts | 34 + src/application/extractors/kinds/glossary.ts | 32 + src/application/extractors/kinds/objective.ts | 32 + src/application/extractors/kinds/prd.ts | 36 + 
src/application/extractors/kinds/process.ts | 32 + src/application/extractors/kinds/query.ts | 34 + .../extractors/kinds/requirement.ts | 32 + .../extractors/kinds/ui-component.ts | 32 + src/application/extractors/kinds/ui-view.ts | 36 + src/application/extractors/kinds/use-case.ts | 92 +++ src/application/extractors/registry.ts | 63 ++ src/application/queries/coverage-query.ts | 96 +++ src/application/queries/graph-query.ts | 101 +++ src/application/queries/hybrid-search.ts | 207 +++++ src/application/queries/impact-query.ts | 139 ++++ src/application/queries/semantic-query.ts | 91 +++ src/application/queries/violations-query.ts | 71 ++ src/cli.ts | 318 +++++++ src/container.ts | 48 ++ src/domain/rules.ts | 117 +++ src/domain/types.ts | 212 +++++ src/infra/artifact-loader.ts | 52 ++ src/infra/artifact-writer.ts | 115 +++ src/infra/embedding-model.ts | 38 + src/infra/graph-store.ts | 226 +++++ src/infra/markdown-parser.ts | 87 ++ src/infra/vector-store.ts | 60 ++ src/infra/wiki-links.ts | 47 ++ src/kb_engine/__init__.py | 3 - src/kb_engine/api/__init__.py | 5 - src/kb_engine/api/dependencies.py | 97 --- src/kb_engine/api/main.py | 84 -- src/kb_engine/api/middleware/__init__.py | 1 - src/kb_engine/api/middleware/auth.py | 20 - src/kb_engine/api/middleware/logging.py | 46 -- src/kb_engine/api/routers/__init__.py | 1 - src/kb_engine/api/routers/admin.py | 72 -- src/kb_engine/api/routers/curation.py | 108 --- src/kb_engine/api/routers/health.py | 78 -- src/kb_engine/api/routers/indexing.py | 231 ------ src/kb_engine/api/routers/retrieval.py | 55 -- src/kb_engine/chunking/__init__.py | 15 - src/kb_engine/chunking/base.py | 100 --- src/kb_engine/chunking/config.py | 41 - src/kb_engine/chunking/factory.py | 87 -- src/kb_engine/chunking/parsers.py | 225 ----- src/kb_engine/chunking/strategies/__init__.py | 15 - src/kb_engine/chunking/strategies/default.py | 54 -- src/kb_engine/chunking/strategies/entity.py | 80 -- src/kb_engine/chunking/strategies/process.py | 140 ---- 
src/kb_engine/chunking/strategies/rule.py | 108 --- src/kb_engine/chunking/strategies/use_case.py | 107 --- src/kb_engine/chunking/types.py | 5 - src/kb_engine/cli.py | 620 -------------- src/kb_engine/config/__init__.py | 5 - src/kb_engine/config/logging.py | 46 -- src/kb_engine/config/settings.py | 149 ---- src/kb_engine/core/__init__.py | 51 -- src/kb_engine/core/exceptions.py | 64 -- src/kb_engine/core/interfaces/__init__.py | 17 - src/kb_engine/core/interfaces/chunkers.py | 49 -- src/kb_engine/core/interfaces/extractors.py | 73 -- src/kb_engine/core/interfaces/repositories.py | 153 ---- src/kb_engine/core/models/__init__.py | 29 - src/kb_engine/core/models/document.py | 144 ---- src/kb_engine/core/models/embedding.py | 42 - src/kb_engine/core/models/graph.py | 111 --- src/kb_engine/core/models/repository.py | 35 - src/kb_engine/core/models/search.py | 103 --- src/kb_engine/embedding/__init__.py | 11 - src/kb_engine/embedding/base.py | 76 -- src/kb_engine/embedding/config.py | 32 - src/kb_engine/embedding/factory.py | 32 - src/kb_engine/embedding/providers/__init__.py | 1 - src/kb_engine/embedding/providers/local.py | 81 -- src/kb_engine/embedding/providers/openai.py | 44 - src/kb_engine/extraction/__init__.py | 14 - src/kb_engine/extraction/config.py | 49 -- .../extraction/extractors/__init__.py | 13 - src/kb_engine/extraction/extractors/base.py | 36 - .../extraction/extractors/frontmatter.py | 102 --- src/kb_engine/extraction/extractors/llm.py | 89 -- .../extraction/extractors/pattern.py | 130 --- src/kb_engine/extraction/factory.py | 35 - src/kb_engine/extraction/models.py | 42 - src/kb_engine/extraction/pipeline.py | 122 --- src/kb_engine/extraction/strategies.py | 151 ---- src/kb_engine/git/__init__.py | 6 - src/kb_engine/git/scanner.py | 161 ---- src/kb_engine/git/url_resolver.py | 78 -- src/kb_engine/mcp_server.py | 358 -------- src/kb_engine/pipelines/__init__.py | 6 - .../pipelines/indexation/__init__.py | 5 - .../pipelines/indexation/pipeline.py | 330 
-------- src/kb_engine/pipelines/inference/__init__.py | 5 - src/kb_engine/pipelines/inference/pipeline.py | 307 ------- src/kb_engine/py.typed | 0 src/kb_engine/repositories/__init__.py | 5 - src/kb_engine/repositories/factory.py | 161 ---- src/kb_engine/repositories/graph/__init__.py | 1 - src/kb_engine/repositories/graph/neo4j.py | 89 -- src/kb_engine/repositories/graph/sqlite.py | 393 --------- .../repositories/traceability/__init__.py | 1 - .../repositories/traceability/postgres.py | 97 --- .../repositories/traceability/sqlite.py | 332 -------- src/kb_engine/repositories/vector/__init__.py | 1 - src/kb_engine/repositories/vector/chroma.py | 184 ----- src/kb_engine/repositories/vector/qdrant.py | 69 -- src/kb_engine/services/__init__.py | 9 - src/kb_engine/services/indexing.py | 89 -- src/kb_engine/services/retrieval.py | 34 - src/kb_engine/smart/__init__.py | 47 -- src/kb_engine/smart/chunking/__init__.py | 11 - src/kb_engine/smart/chunking/hierarchical.py | 415 ---------- src/kb_engine/smart/chunking/summarizer.py | 123 --- src/kb_engine/smart/extraction/__init__.py | 7 - src/kb_engine/smart/extraction/entity.py | 251 ------ src/kb_engine/smart/parsers/__init__.py | 9 - src/kb_engine/smart/parsers/detector.py | 109 --- src/kb_engine/smart/parsers/entity.py | 427 ---------- src/kb_engine/smart/pipelines/__init__.py | 7 - src/kb_engine/smart/pipelines/entity.py | 273 ------- src/kb_engine/smart/schemas/__init__.py | 7 - src/kb_engine/smart/schemas/entity.py | 87 -- src/kb_engine/smart/stores/__init__.py | 7 - src/kb_engine/smart/stores/falkordb_graph.py | 773 ------------------ src/kb_engine/smart/types.py | 268 ------ src/kb_engine/utils/__init__.py | 13 - src/kb_engine/utils/hashing.py | 11 - src/kb_engine/utils/markdown.py | 137 ---- src/kb_engine/utils/tokenization.py | 38 - src/kdd/__init__.py | 3 - src/kdd/api/__init__.py | 0 src/kdd/api/cli.py | 438 ---------- src/kdd/api/server.py | 409 --------- src/kdd/application/__init__.py | 1 - 
src/kdd/application/chunking.py | 177 ---- src/kdd/application/commands/__init__.py | 0 .../application/commands/enrich_with_agent.py | 122 --- .../application/commands/index_document.py | 196 ----- .../application/commands/index_incremental.py | 186 ----- src/kdd/application/commands/merge_index.py | 223 ----- src/kdd/application/commands/sync_index.py | 66 -- src/kdd/application/extractors/__init__.py | 1 - src/kdd/application/extractors/base.py | 217 ----- .../application/extractors/kinds/__init__.py | 1 - src/kdd/application/extractors/kinds/adr.py | 63 -- .../extractors/kinds/business_policy.py | 89 -- .../extractors/kinds/business_rule.py | 96 --- .../application/extractors/kinds/command.py | 89 -- .../extractors/kinds/cross_policy.py | 92 --- .../application/extractors/kinds/entity.py | 196 ----- src/kdd/application/extractors/kinds/event.py | 69 -- .../application/extractors/kinds/objective.py | 64 -- src/kdd/application/extractors/kinds/prd.py | 82 -- .../application/extractors/kinds/process.py | 66 -- src/kdd/application/extractors/kinds/query.py | 64 -- .../extractors/kinds/requirement.py | 64 -- .../extractors/kinds/ui_component.py | 61 -- .../application/extractors/kinds/ui_view.py | 69 -- .../application/extractors/kinds/use_case.py | 137 ---- src/kdd/application/extractors/registry.py | 68 -- src/kdd/application/queries/__init__.py | 0 src/kdd/application/queries/index_loader.py | 69 -- .../application/queries/retrieve_coverage.py | 132 --- src/kdd/application/queries/retrieve_graph.py | 119 --- .../application/queries/retrieve_hybrid.py | 238 ------ .../application/queries/retrieve_impact.py | 154 ---- .../application/queries/retrieve_semantic.py | 119 --- .../queries/retrieve_violations.py | 84 -- src/kdd/container.py | 119 --- src/kdd/domain/__init__.py | 1 - src/kdd/domain/entities.py | 201 ----- src/kdd/domain/enums.py | 122 --- src/kdd/domain/events.py | 189 ----- src/kdd/domain/ports.py | 134 --- src/kdd/domain/rules.py | 229 ------ 
src/kdd/infrastructure/__init__.py | 1 - src/kdd/infrastructure/agent/__init__.py | 0 src/kdd/infrastructure/agent/claude_cli.py | 127 --- src/kdd/infrastructure/artifact/__init__.py | 1 - src/kdd/infrastructure/artifact/filesystem.py | 236 ------ src/kdd/infrastructure/embedding/__init__.py | 0 .../embedding/sentence_transformer.py | 30 - src/kdd/infrastructure/events/__init__.py | 1 - src/kdd/infrastructure/events/bus.py | 26 - src/kdd/infrastructure/git/__init__.py | 1 - src/kdd/infrastructure/git/diff.py | 146 ---- src/kdd/infrastructure/graph/__init__.py | 0 .../infrastructure/graph/networkx_store.py | 261 ------ src/kdd/infrastructure/parsing/__init__.py | 1 - src/kdd/infrastructure/parsing/hashing.py | 13 - src/kdd/infrastructure/parsing/markdown.py | 117 --- .../infrastructure/parsing/tokenization.py | 23 - src/kdd/infrastructure/parsing/wiki_links.py | 69 -- src/kdd/infrastructure/vector/__init__.py | 0 .../infrastructure/vector/hnswlib_store.py | 103 --- src/mcp.ts | 271 ++++++ tests/__init__.py | 1 - tests/api/__init__.py | 1 - tests/api/test_health.py | 78 -- tests/conftest.py | 86 -- tests/factories.py | 76 -- tests/fixtures/.gitkeep | 0 tests/fixtures/entities/Usuario.md | 90 -- tests/integration/__init__.py | 1 - tests/integration/pipelines/__init__.py | 1 - tests/integration/repositories/__init__.py | 1 - tests/integration/test_smart_pipeline.py | 550 ------------- tests/unit/__init__.py | 1 - tests/unit/chunking/__init__.py | 1 - tests/unit/chunking/test_chunking.py | 230 ------ .../chunking/test_hierarchical_chunker.py | 216 ----- tests/unit/chunking/test_parsers.py | 169 ---- tests/unit/config/test_settings.py | 58 -- tests/unit/core/__init__.py | 1 - tests/unit/core/test_models.py | 258 ------ tests/unit/extraction/__init__.py | 1 - tests/unit/extraction/test_extraction.py | 184 ----- tests/unit/git/__init__.py | 0 tests/unit/git/test_scanner.py | 129 --- tests/unit/git/test_url_resolver.py | 78 -- tests/unit/repositories/__init__.py | 0 
tests/unit/repositories/test_sqlite_graph.py | 130 --- .../repositories/test_sqlite_traceability.py | 163 ---- tests/unit/test_cli_graph.py | 410 ---------- tests/unit/test_mcp_server.py | 369 --------- tests/unit/utils/__init__.py | 0 tests/unit/utils/test_markdown.py | 146 ---- tests/v2/__init__.py | 0 tests/v2/api/__init__.py | 0 tests/v2/api/test_cli.py | 129 --- tests/v2/api/test_server.py | 178 ---- tests/v2/application/__init__.py | 0 tests/v2/application/commands/__init__.py | 0 .../commands/test_index_document.py | 191 ----- .../commands/test_index_incremental.py | 227 ----- .../application/commands/test_merge_index.py | 267 ------ .../application/commands/test_sync_index.py | 274 ------- tests/v2/application/extractors/__init__.py | 0 tests/v2/application/extractors/conftest.py | 88 -- .../v2/application/extractors/test_command.py | 60 -- .../v2/application/extractors/test_entity.py | 114 --- tests/v2/application/extractors/test_event.py | 54 -- tests/v2/application/extractors/test_prd.py | 46 -- tests/v2/application/extractors/test_query.py | 50 -- .../application/extractors/test_registry.py | 42 - .../extractors/test_requirement.py | 46 -- tests/v2/application/extractors/test_rule.py | 56 -- .../extractors/test_synthetic_kinds.py | 497 ----------- .../application/extractors/test_use_case.py | 82 -- tests/v2/application/queries/__init__.py | 0 tests/v2/application/queries/conftest.py | 145 ---- .../application/queries/test_index_loader.py | 145 ---- .../queries/test_retrieve_coverage.py | 61 -- .../queries/test_retrieve_graph.py | 80 -- .../queries/test_retrieve_hybrid.py | 157 ---- .../queries/test_retrieve_impact.py | 66 -- .../queries/test_retrieve_violations.py | 54 -- tests/v2/application/test_chunking.py | 129 --- tests/v2/conftest.py | 1 - tests/v2/domain/__init__.py | 0 tests/v2/domain/test_entities.py | 284 ------- tests/v2/domain/test_enums.py | 119 --- tests/v2/domain/test_events.py | 210 ----- tests/v2/domain/test_rules.py | 324 -------- 
tests/v2/infrastructure/__init__.py | 0 tests/v2/infrastructure/test_artifact.py | 231 ------ tests/v2/infrastructure/test_claude_cli.py | 156 ---- tests/v2/infrastructure/test_event_bus.py | 69 -- tests/v2/infrastructure/test_git_diff.py | 144 ---- tests/v2/infrastructure/test_hnswlib_store.py | 93 --- tests/v2/infrastructure/test_markdown.py | 136 --- .../v2/infrastructure/test_networkx_store.py | 239 ------ tests/v2/infrastructure/test_wiki_links.py | 81 -- tests/v2/test_container.py | 56 -- tsconfig.json | 22 + 312 files changed, 4478 insertions(+), 27525 deletions(-) delete mode 100644 .env.example delete mode 100644 .github/workflows/cd.yml delete mode 100644 .pre-commit-config.yaml create mode 100644 bench/compare.ts create mode 100644 bun.lock delete mode 100644 docker/Dockerfile delete mode 100644 docker/Dockerfile.dev delete mode 100644 docker/docker-compose.yml delete mode 100644 docs/architecture/smart-ingestion-pipeline.md delete mode 100644 docs/design/requirements.md delete mode 100644 migrations/alembic.ini delete mode 100644 migrations/env.py delete mode 100644 migrations/versions/.gitkeep create mode 100644 package.json delete mode 100644 pyproject.toml create mode 100644 src/application/chunking.ts create mode 100644 src/application/commands/index-document.ts create mode 100644 src/application/extractors/base.ts create mode 100644 src/application/extractors/kinds/adr.ts create mode 100644 src/application/extractors/kinds/business-policy.ts create mode 100644 src/application/extractors/kinds/business-rule.ts create mode 100644 src/application/extractors/kinds/command.ts create mode 100644 src/application/extractors/kinds/cross-policy.ts create mode 100644 src/application/extractors/kinds/entity.ts create mode 100644 src/application/extractors/kinds/event.ts create mode 100644 src/application/extractors/kinds/glossary.ts create mode 100644 src/application/extractors/kinds/objective.ts create mode 100644 src/application/extractors/kinds/prd.ts 
create mode 100644 src/application/extractors/kinds/process.ts create mode 100644 src/application/extractors/kinds/query.ts create mode 100644 src/application/extractors/kinds/requirement.ts create mode 100644 src/application/extractors/kinds/ui-component.ts create mode 100644 src/application/extractors/kinds/ui-view.ts create mode 100644 src/application/extractors/kinds/use-case.ts create mode 100644 src/application/extractors/registry.ts create mode 100644 src/application/queries/coverage-query.ts create mode 100644 src/application/queries/graph-query.ts create mode 100644 src/application/queries/hybrid-search.ts create mode 100644 src/application/queries/impact-query.ts create mode 100644 src/application/queries/semantic-query.ts create mode 100644 src/application/queries/violations-query.ts create mode 100644 src/cli.ts create mode 100644 src/container.ts create mode 100644 src/domain/rules.ts create mode 100644 src/domain/types.ts create mode 100644 src/infra/artifact-loader.ts create mode 100644 src/infra/artifact-writer.ts create mode 100644 src/infra/embedding-model.ts create mode 100644 src/infra/graph-store.ts create mode 100644 src/infra/markdown-parser.ts create mode 100644 src/infra/vector-store.ts create mode 100644 src/infra/wiki-links.ts delete mode 100644 src/kb_engine/__init__.py delete mode 100644 src/kb_engine/api/__init__.py delete mode 100644 src/kb_engine/api/dependencies.py delete mode 100644 src/kb_engine/api/main.py delete mode 100644 src/kb_engine/api/middleware/__init__.py delete mode 100644 src/kb_engine/api/middleware/auth.py delete mode 100644 src/kb_engine/api/middleware/logging.py delete mode 100644 src/kb_engine/api/routers/__init__.py delete mode 100644 src/kb_engine/api/routers/admin.py delete mode 100644 src/kb_engine/api/routers/curation.py delete mode 100644 src/kb_engine/api/routers/health.py delete mode 100644 src/kb_engine/api/routers/indexing.py delete mode 100644 src/kb_engine/api/routers/retrieval.py delete mode 100644 
src/kb_engine/chunking/__init__.py delete mode 100644 src/kb_engine/chunking/base.py delete mode 100644 src/kb_engine/chunking/config.py delete mode 100644 src/kb_engine/chunking/factory.py delete mode 100644 src/kb_engine/chunking/parsers.py delete mode 100644 src/kb_engine/chunking/strategies/__init__.py delete mode 100644 src/kb_engine/chunking/strategies/default.py delete mode 100644 src/kb_engine/chunking/strategies/entity.py delete mode 100644 src/kb_engine/chunking/strategies/process.py delete mode 100644 src/kb_engine/chunking/strategies/rule.py delete mode 100644 src/kb_engine/chunking/strategies/use_case.py delete mode 100644 src/kb_engine/chunking/types.py delete mode 100644 src/kb_engine/cli.py delete mode 100644 src/kb_engine/config/__init__.py delete mode 100644 src/kb_engine/config/logging.py delete mode 100644 src/kb_engine/config/settings.py delete mode 100644 src/kb_engine/core/__init__.py delete mode 100644 src/kb_engine/core/exceptions.py delete mode 100644 src/kb_engine/core/interfaces/__init__.py delete mode 100644 src/kb_engine/core/interfaces/chunkers.py delete mode 100644 src/kb_engine/core/interfaces/extractors.py delete mode 100644 src/kb_engine/core/interfaces/repositories.py delete mode 100644 src/kb_engine/core/models/__init__.py delete mode 100644 src/kb_engine/core/models/document.py delete mode 100644 src/kb_engine/core/models/embedding.py delete mode 100644 src/kb_engine/core/models/graph.py delete mode 100644 src/kb_engine/core/models/repository.py delete mode 100644 src/kb_engine/core/models/search.py delete mode 100644 src/kb_engine/embedding/__init__.py delete mode 100644 src/kb_engine/embedding/base.py delete mode 100644 src/kb_engine/embedding/config.py delete mode 100644 src/kb_engine/embedding/factory.py delete mode 100644 src/kb_engine/embedding/providers/__init__.py delete mode 100644 src/kb_engine/embedding/providers/local.py delete mode 100644 src/kb_engine/embedding/providers/openai.py delete mode 100644 
src/kb_engine/extraction/__init__.py delete mode 100644 src/kb_engine/extraction/config.py delete mode 100644 src/kb_engine/extraction/extractors/__init__.py delete mode 100644 src/kb_engine/extraction/extractors/base.py delete mode 100644 src/kb_engine/extraction/extractors/frontmatter.py delete mode 100644 src/kb_engine/extraction/extractors/llm.py delete mode 100644 src/kb_engine/extraction/extractors/pattern.py delete mode 100644 src/kb_engine/extraction/factory.py delete mode 100644 src/kb_engine/extraction/models.py delete mode 100644 src/kb_engine/extraction/pipeline.py delete mode 100644 src/kb_engine/extraction/strategies.py delete mode 100644 src/kb_engine/git/__init__.py delete mode 100644 src/kb_engine/git/scanner.py delete mode 100644 src/kb_engine/git/url_resolver.py delete mode 100644 src/kb_engine/mcp_server.py delete mode 100644 src/kb_engine/pipelines/__init__.py delete mode 100644 src/kb_engine/pipelines/indexation/__init__.py delete mode 100644 src/kb_engine/pipelines/indexation/pipeline.py delete mode 100644 src/kb_engine/pipelines/inference/__init__.py delete mode 100644 src/kb_engine/pipelines/inference/pipeline.py delete mode 100644 src/kb_engine/py.typed delete mode 100644 src/kb_engine/repositories/__init__.py delete mode 100644 src/kb_engine/repositories/factory.py delete mode 100644 src/kb_engine/repositories/graph/__init__.py delete mode 100644 src/kb_engine/repositories/graph/neo4j.py delete mode 100644 src/kb_engine/repositories/graph/sqlite.py delete mode 100644 src/kb_engine/repositories/traceability/__init__.py delete mode 100644 src/kb_engine/repositories/traceability/postgres.py delete mode 100644 src/kb_engine/repositories/traceability/sqlite.py delete mode 100644 src/kb_engine/repositories/vector/__init__.py delete mode 100644 src/kb_engine/repositories/vector/chroma.py delete mode 100644 src/kb_engine/repositories/vector/qdrant.py delete mode 100644 src/kb_engine/services/__init__.py delete mode 100644 
src/kb_engine/services/indexing.py delete mode 100644 src/kb_engine/services/retrieval.py delete mode 100644 src/kb_engine/smart/__init__.py delete mode 100644 src/kb_engine/smart/chunking/__init__.py delete mode 100644 src/kb_engine/smart/chunking/hierarchical.py delete mode 100644 src/kb_engine/smart/chunking/summarizer.py delete mode 100644 src/kb_engine/smart/extraction/__init__.py delete mode 100644 src/kb_engine/smart/extraction/entity.py delete mode 100644 src/kb_engine/smart/parsers/__init__.py delete mode 100644 src/kb_engine/smart/parsers/detector.py delete mode 100644 src/kb_engine/smart/parsers/entity.py delete mode 100644 src/kb_engine/smart/pipelines/__init__.py delete mode 100644 src/kb_engine/smart/pipelines/entity.py delete mode 100644 src/kb_engine/smart/schemas/__init__.py delete mode 100644 src/kb_engine/smart/schemas/entity.py delete mode 100644 src/kb_engine/smart/stores/__init__.py delete mode 100644 src/kb_engine/smart/stores/falkordb_graph.py delete mode 100644 src/kb_engine/smart/types.py delete mode 100644 src/kb_engine/utils/__init__.py delete mode 100644 src/kb_engine/utils/hashing.py delete mode 100644 src/kb_engine/utils/markdown.py delete mode 100644 src/kb_engine/utils/tokenization.py delete mode 100644 src/kdd/__init__.py delete mode 100644 src/kdd/api/__init__.py delete mode 100644 src/kdd/api/cli.py delete mode 100644 src/kdd/api/server.py delete mode 100644 src/kdd/application/__init__.py delete mode 100644 src/kdd/application/chunking.py delete mode 100644 src/kdd/application/commands/__init__.py delete mode 100644 src/kdd/application/commands/enrich_with_agent.py delete mode 100644 src/kdd/application/commands/index_document.py delete mode 100644 src/kdd/application/commands/index_incremental.py delete mode 100644 src/kdd/application/commands/merge_index.py delete mode 100644 src/kdd/application/commands/sync_index.py delete mode 100644 src/kdd/application/extractors/__init__.py delete mode 100644 
src/kdd/application/extractors/base.py delete mode 100644 src/kdd/application/extractors/kinds/__init__.py delete mode 100644 src/kdd/application/extractors/kinds/adr.py delete mode 100644 src/kdd/application/extractors/kinds/business_policy.py delete mode 100644 src/kdd/application/extractors/kinds/business_rule.py delete mode 100644 src/kdd/application/extractors/kinds/command.py delete mode 100644 src/kdd/application/extractors/kinds/cross_policy.py delete mode 100644 src/kdd/application/extractors/kinds/entity.py delete mode 100644 src/kdd/application/extractors/kinds/event.py delete mode 100644 src/kdd/application/extractors/kinds/objective.py delete mode 100644 src/kdd/application/extractors/kinds/prd.py delete mode 100644 src/kdd/application/extractors/kinds/process.py delete mode 100644 src/kdd/application/extractors/kinds/query.py delete mode 100644 src/kdd/application/extractors/kinds/requirement.py delete mode 100644 src/kdd/application/extractors/kinds/ui_component.py delete mode 100644 src/kdd/application/extractors/kinds/ui_view.py delete mode 100644 src/kdd/application/extractors/kinds/use_case.py delete mode 100644 src/kdd/application/extractors/registry.py delete mode 100644 src/kdd/application/queries/__init__.py delete mode 100644 src/kdd/application/queries/index_loader.py delete mode 100644 src/kdd/application/queries/retrieve_coverage.py delete mode 100644 src/kdd/application/queries/retrieve_graph.py delete mode 100644 src/kdd/application/queries/retrieve_hybrid.py delete mode 100644 src/kdd/application/queries/retrieve_impact.py delete mode 100644 src/kdd/application/queries/retrieve_semantic.py delete mode 100644 src/kdd/application/queries/retrieve_violations.py delete mode 100644 src/kdd/container.py delete mode 100644 src/kdd/domain/__init__.py delete mode 100644 src/kdd/domain/entities.py delete mode 100644 src/kdd/domain/enums.py delete mode 100644 src/kdd/domain/events.py delete mode 100644 src/kdd/domain/ports.py delete mode 100644 
src/kdd/domain/rules.py delete mode 100644 src/kdd/infrastructure/__init__.py delete mode 100644 src/kdd/infrastructure/agent/__init__.py delete mode 100644 src/kdd/infrastructure/agent/claude_cli.py delete mode 100644 src/kdd/infrastructure/artifact/__init__.py delete mode 100644 src/kdd/infrastructure/artifact/filesystem.py delete mode 100644 src/kdd/infrastructure/embedding/__init__.py delete mode 100644 src/kdd/infrastructure/embedding/sentence_transformer.py delete mode 100644 src/kdd/infrastructure/events/__init__.py delete mode 100644 src/kdd/infrastructure/events/bus.py delete mode 100644 src/kdd/infrastructure/git/__init__.py delete mode 100644 src/kdd/infrastructure/git/diff.py delete mode 100644 src/kdd/infrastructure/graph/__init__.py delete mode 100644 src/kdd/infrastructure/graph/networkx_store.py delete mode 100644 src/kdd/infrastructure/parsing/__init__.py delete mode 100644 src/kdd/infrastructure/parsing/hashing.py delete mode 100644 src/kdd/infrastructure/parsing/markdown.py delete mode 100644 src/kdd/infrastructure/parsing/tokenization.py delete mode 100644 src/kdd/infrastructure/parsing/wiki_links.py delete mode 100644 src/kdd/infrastructure/vector/__init__.py delete mode 100644 src/kdd/infrastructure/vector/hnswlib_store.py create mode 100644 src/mcp.ts delete mode 100644 tests/__init__.py delete mode 100644 tests/api/__init__.py delete mode 100644 tests/api/test_health.py delete mode 100644 tests/conftest.py delete mode 100644 tests/factories.py delete mode 100644 tests/fixtures/.gitkeep delete mode 100644 tests/fixtures/entities/Usuario.md delete mode 100644 tests/integration/__init__.py delete mode 100644 tests/integration/pipelines/__init__.py delete mode 100644 tests/integration/repositories/__init__.py delete mode 100644 tests/integration/test_smart_pipeline.py delete mode 100644 tests/unit/__init__.py delete mode 100644 tests/unit/chunking/__init__.py delete mode 100644 tests/unit/chunking/test_chunking.py delete mode 100644 
tests/unit/chunking/test_hierarchical_chunker.py delete mode 100644 tests/unit/chunking/test_parsers.py delete mode 100644 tests/unit/config/test_settings.py delete mode 100644 tests/unit/core/__init__.py delete mode 100644 tests/unit/core/test_models.py delete mode 100644 tests/unit/extraction/__init__.py delete mode 100644 tests/unit/extraction/test_extraction.py delete mode 100644 tests/unit/git/__init__.py delete mode 100644 tests/unit/git/test_scanner.py delete mode 100644 tests/unit/git/test_url_resolver.py delete mode 100644 tests/unit/repositories/__init__.py delete mode 100644 tests/unit/repositories/test_sqlite_graph.py delete mode 100644 tests/unit/repositories/test_sqlite_traceability.py delete mode 100644 tests/unit/test_cli_graph.py delete mode 100644 tests/unit/test_mcp_server.py delete mode 100644 tests/unit/utils/__init__.py delete mode 100644 tests/unit/utils/test_markdown.py delete mode 100644 tests/v2/__init__.py delete mode 100644 tests/v2/api/__init__.py delete mode 100644 tests/v2/api/test_cli.py delete mode 100644 tests/v2/api/test_server.py delete mode 100644 tests/v2/application/__init__.py delete mode 100644 tests/v2/application/commands/__init__.py delete mode 100644 tests/v2/application/commands/test_index_document.py delete mode 100644 tests/v2/application/commands/test_index_incremental.py delete mode 100644 tests/v2/application/commands/test_merge_index.py delete mode 100644 tests/v2/application/commands/test_sync_index.py delete mode 100644 tests/v2/application/extractors/__init__.py delete mode 100644 tests/v2/application/extractors/conftest.py delete mode 100644 tests/v2/application/extractors/test_command.py delete mode 100644 tests/v2/application/extractors/test_entity.py delete mode 100644 tests/v2/application/extractors/test_event.py delete mode 100644 tests/v2/application/extractors/test_prd.py delete mode 100644 tests/v2/application/extractors/test_query.py delete mode 100644 tests/v2/application/extractors/test_registry.py 
delete mode 100644 tests/v2/application/extractors/test_requirement.py delete mode 100644 tests/v2/application/extractors/test_rule.py delete mode 100644 tests/v2/application/extractors/test_synthetic_kinds.py delete mode 100644 tests/v2/application/extractors/test_use_case.py delete mode 100644 tests/v2/application/queries/__init__.py delete mode 100644 tests/v2/application/queries/conftest.py delete mode 100644 tests/v2/application/queries/test_index_loader.py delete mode 100644 tests/v2/application/queries/test_retrieve_coverage.py delete mode 100644 tests/v2/application/queries/test_retrieve_graph.py delete mode 100644 tests/v2/application/queries/test_retrieve_hybrid.py delete mode 100644 tests/v2/application/queries/test_retrieve_impact.py delete mode 100644 tests/v2/application/queries/test_retrieve_violations.py delete mode 100644 tests/v2/application/test_chunking.py delete mode 100644 tests/v2/conftest.py delete mode 100644 tests/v2/domain/__init__.py delete mode 100644 tests/v2/domain/test_entities.py delete mode 100644 tests/v2/domain/test_enums.py delete mode 100644 tests/v2/domain/test_events.py delete mode 100644 tests/v2/domain/test_rules.py delete mode 100644 tests/v2/infrastructure/__init__.py delete mode 100644 tests/v2/infrastructure/test_artifact.py delete mode 100644 tests/v2/infrastructure/test_claude_cli.py delete mode 100644 tests/v2/infrastructure/test_event_bus.py delete mode 100644 tests/v2/infrastructure/test_git_diff.py delete mode 100644 tests/v2/infrastructure/test_hnswlib_store.py delete mode 100644 tests/v2/infrastructure/test_markdown.py delete mode 100644 tests/v2/infrastructure/test_networkx_store.py delete mode 100644 tests/v2/infrastructure/test_wiki_links.py delete mode 100644 tests/v2/test_container.py create mode 100644 tsconfig.json diff --git a/.env.example b/.env.example deleted file mode 100644 index c6dabc6..0000000 --- a/.env.example +++ /dev/null @@ -1,71 +0,0 @@ -# 
============================================================================= -# KB-Engine Environment Configuration -# ============================================================================= -# Copy this file to .env and fill in the values - -# ----------------------------------------------------------------------------- -# General -# ----------------------------------------------------------------------------- -ENVIRONMENT=development -DEBUG=true -LOG_LEVEL=INFO - -# ----------------------------------------------------------------------------- -# API Configuration -# ----------------------------------------------------------------------------- -API_HOST=0.0.0.0 -API_PORT=8000 -API_WORKERS=1 - -# ----------------------------------------------------------------------------- -# PostgreSQL (Traceability Store) -# ----------------------------------------------------------------------------- -POSTGRES_HOST=localhost -POSTGRES_PORT=5432 -POSTGRES_USER=kb_engine -POSTGRES_PASSWORD=changeme -POSTGRES_DB=kb_engine -DATABASE_URL=postgresql+asyncpg://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB} - -# ----------------------------------------------------------------------------- -# Qdrant (Vector Store) -# ----------------------------------------------------------------------------- -QDRANT_HOST=localhost -QDRANT_PORT=6333 -QDRANT_GRPC_PORT=6334 -QDRANT_API_KEY= -QDRANT_COLLECTION=kb_engine_embeddings - -# ----------------------------------------------------------------------------- -# Neo4j (Graph Store) -# ----------------------------------------------------------------------------- -NEO4J_URI=bolt://localhost:7687 -NEO4J_USER=neo4j -NEO4J_PASSWORD=changeme - -# ----------------------------------------------------------------------------- -# Embeddings -# ----------------------------------------------------------------------------- -EMBEDDING_PROVIDER=local -LOCAL_EMBEDDING_MODEL=all-mpnet-base-v2 - -# 
----------------------------------------------------------------------------- -# OpenAI (optional, for embedding_provider=openai) -# ----------------------------------------------------------------------------- -OPENAI_API_KEY=sk-your-key-here -OPENAI_EMBEDDING_MODEL=text-embedding-3-small -OPENAI_CHAT_MODEL=gpt-4-turbo-preview - -# ----------------------------------------------------------------------------- -# Chunking Configuration (ADR-0002) -# ----------------------------------------------------------------------------- -CHUNK_SIZE_MIN=100 -CHUNK_SIZE_TARGET=512 -CHUNK_SIZE_MAX=1024 -CHUNK_OVERLAP=50 - -# ----------------------------------------------------------------------------- -# Extraction Configuration (ADR-0003) -# ----------------------------------------------------------------------------- -EXTRACTION_USE_LLM=true -EXTRACTION_CONFIDENCE_THRESHOLD=0.7 diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml deleted file mode 100644 index 6167bb2..0000000 --- a/.github/workflows/cd.yml +++ /dev/null @@ -1,59 +0,0 @@ -name: CD - -on: - push: - tags: - - "v*" - -jobs: - build-and-push: - runs-on: ubuntu-latest - permissions: - contents: read - packages: write - - steps: - - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata - id: meta - uses: docker/metadata-action@v5 - with: - images: ghcr.io/${{ github.repository }} - tags: | - type=semver,pattern={{version}} - type=semver,pattern={{major}}.{{minor}} - type=sha - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: . 
- file: ./docker/Dockerfile - push: true - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max - - deploy: - runs-on: ubuntu-latest - needs: build-and-push - environment: production - - steps: - - name: Deploy to production - run: | - echo "Deployment step - configure based on your infrastructure" - # Add deployment commands here - # e.g., kubectl apply, docker compose pull && up, etc. diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a28b91d..9728e87 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,123 +7,34 @@ on: branches: [main, develop] jobs: - lint: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install ruff mypy - - - name: Run ruff check - run: ruff check src tests - - - name: Run ruff format check - run: ruff format --check src tests - type-check: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 + - name: Set up Bun + uses: oven-sh/setup-bun@v2 with: - python-version: "3.11" + bun-version: latest - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -e ".[dev]" + run: bun install - - name: Run mypy - run: mypy src + - name: Type check + run: bunx tsc --noEmit test: runs-on: ubuntu-latest - services: - postgres: - image: postgres:16-alpine - env: - POSTGRES_USER: kb_engine - POSTGRES_PASSWORD: changeme - POSTGRES_DB: kb_engine_test - ports: - - 5432:5432 - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - - steps: - - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - - name: Install dependencies - run: | - python -m 
pip install --upgrade pip - pip install -e ".[dev]" - - - name: Run unit tests - run: pytest -m unit --cov=kb_engine --cov-report=xml - - - name: Upload coverage - uses: codecov/codecov-action@v4 - with: - files: ./coverage.xml - fail_ci_if_error: false - - integration-test: - runs-on: ubuntu-latest - needs: [lint, type-check, test] - services: - postgres: - image: postgres:16-alpine - env: - POSTGRES_USER: kb_engine - POSTGRES_PASSWORD: changeme - POSTGRES_DB: kb_engine_test - ports: - - 5432:5432 - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - - qdrant: - image: qdrant/qdrant:latest - ports: - - 6333:6333 - steps: - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 + - name: Set up Bun + uses: oven-sh/setup-bun@v2 with: - python-version: "3.11" + bun-version: latest - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -e ".[dev]" + run: bun install - - name: Run integration tests - run: pytest -m integration - env: - DATABASE_URL: postgresql+asyncpg://kb_engine:changeme@localhost:5432/kb_engine_test - QDRANT_HOST: localhost - QDRANT_PORT: 6333 + - name: Run tests + run: bun test diff --git a/.gitignore b/.gitignore index 48051ba..644a8dd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,63 +1,17 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so +# Dependencies +node_modules/ -# Distribution / packaging -.Python -build/ -develop-eggs/ +# Build dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg - -# PyInstaller -*.manifest -*.spec -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ +# Index artifacts +.kdd-index/ -# Translations -*.mo -*.pot +# 
Embedding model cache +.models/ -# Environments +# Environment .env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ # IDE .idea/ @@ -66,42 +20,20 @@ venv.bak/ *.swo *~ -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# ruff -.ruff_cache/ - -# Jupyter Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# Local development -*.local -*.log -logs/ - -# Database -*.db -*.sqlite3 - -# Docker -docker-compose.override.yml - # OS .DS_Store Thumbs.db -# Project specific -.kdd-index/ -/data/ -/uploads/ -/exports/ -*.bak +# Logs +*.log + +# Python (legacy) +.venv/ +*.db +*.db.settings +__pycache__/ +.pytest_cache/ +*.pyc -# obsidian +# Obsidian .obsidian/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index 731011b..0000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,28 +0,0 @@ -repos: - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 - hooks: - - id: trailing-whitespace - - id: end-of-file-fixer - - id: check-yaml - - id: check-added-large-files - args: ['--maxkb=1000'] - - id: check-merge-conflict - - id: detect-private-key - - - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.2.0 - hooks: - - id: ruff - args: [--fix, --exit-non-zero-on-fix] - - id: ruff-format - - - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.8.0 - hooks: - - id: mypy - additional_dependencies: - - pydantic>=2.5.0 - - types-python-dateutil - args: [--ignore-missing-imports] - exclude: ^tests/ diff --git a/Makefile b/Makefile index 5235504..44cc622 100644 --- a/Makefile +++ b/Makefile @@ -1,131 +1,33 @@ -.PHONY: help install install-dev lint format type-check test test-unit test-integration test-api run dev clean docker-up docker-down migrate +.PHONY: help install index search test typecheck mcp clean -# Default target help: - @echo "KB-Engine Development Commands" + @echo "KDD Toolkit (TypeScript/Bun)" @echo "" - @echo "Setup:" - @echo " make install Install production dependencies" - @echo " make 
install-dev Install development dependencies" - @echo "" - @echo "Code Quality:" - @echo " make lint Run linter (ruff)" - @echo " make format Format code (ruff)" - @echo " make type-check Run type checker (mypy)" - @echo "" - @echo "Testing:" - @echo " make test Run all tests" - @echo " make test-unit Run unit tests only" - @echo " make test-integration Run integration tests" - @echo " make test-api Run API tests" - @echo " make test-cov Run tests with coverage" - @echo "" - @echo "Running:" - @echo " make run Run the API server" - @echo " make dev Run with auto-reload" - @echo "" - @echo "Docker:" - @echo " make docker-up Start Docker services" - @echo " make docker-down Stop Docker services" - @echo "" - @echo "Database:" - @echo " make migrate Run database migrations" - @echo " make migrate-create Create new migration" - @echo "" - @echo "Cleanup:" - @echo " make clean Remove build artifacts" - -# ============================================================================= -# Setup -# ============================================================================= + @echo " make install Install dependencies" + @echo " make index Index specs/ into .kdd-index/" + @echo " make search q=.. Hybrid search" + @echo " make test Run tests" + @echo " make typecheck Type-check with tsc" + @echo " make mcp Start MCP server" + @echo " make clean Remove node_modules and .kdd-index" install: - pip install -e . 
- -install-dev: - pip install -e ".[dev]" - pre-commit install - -# ============================================================================= -# Code Quality -# ============================================================================= + bun install -lint: - ruff check src tests - ruff format --check src tests +index: + bun run src/cli.ts index specs/ -format: - ruff check --fix src tests - ruff format src tests - -type-check: - mypy src - -# ============================================================================= -# Testing -# ============================================================================= +search: + bun run src/cli.ts search --index-path .kdd-index "$(q)" test: - pytest - -test-unit: - pytest -m unit tests/unit - -test-integration: - pytest -m integration tests/integration - -test-api: - pytest -m api tests/api - -test-cov: - pytest --cov=kb_engine --cov-report=html --cov-report=term - -# ============================================================================= -# Running -# ============================================================================= - -run: - uvicorn kb_engine.api.main:app --host 0.0.0.0 --port 8000 - -dev: - uvicorn kb_engine.api.main:app --host 0.0.0.0 --port 8000 --reload - -# ============================================================================= -# Docker -# ============================================================================= - -docker-up: - docker compose -f docker/docker-compose.yml up -d - -docker-down: - docker compose -f docker/docker-compose.yml down - -docker-logs: - docker compose -f docker/docker-compose.yml logs -f - -# ============================================================================= -# Database -# ============================================================================= - -migrate: - alembic upgrade head + bun test -migrate-create: - @read -p "Migration message: " msg; \ - alembic revision --autogenerate -m "$$msg" +typecheck: + bunx tsc --noEmit -# 
============================================================================= -# Cleanup -# ============================================================================= +mcp: + bun run src/mcp.ts clean: - rm -rf build/ - rm -rf dist/ - rm -rf *.egg-info/ - rm -rf .pytest_cache/ - rm -rf .mypy_cache/ - rm -rf .ruff_cache/ - rm -rf htmlcov/ - rm -rf .coverage - find . -type d -name __pycache__ -exec rm -rf {} + - find . -type f -name "*.pyc" -delete + rm -rf node_modules .kdd-index diff --git a/README.md b/README.md index 77193a3..781e71b 100644 --- a/README.md +++ b/README.md @@ -1,293 +1,192 @@ -# KB-Engine +# KDD Toolkit -Sistema de retrieval de conocimiento para agentes de IA. Indexa documentación estructurada (KDD) y devuelve **referencias** a documentos relevantes, no contenido. +Motor de indexación y retrieval para especificaciones KDD (Knowledge-Driven Development). Indexa artefactos de dominio y ofrece búsqueda híbrida (semántica + grafo + lexical) para agentes de IA. ## Concepto -KB-Engine actúa como un "bibliotecario": cuando un agente pregunta algo, responde con URLs y anclas a los documentos relevantes (`file://path/to/doc.md#seccion`), permitiendo que el agente decida qué leer. +KDD Toolkit actúa como un "bibliotecario": cuando un agente pregunta algo, responde con nodos del grafo de conocimiento y scores de relevancia, permitiendo que el agente decida qué documentos leer. 
``` -┌─────────────┐ query ┌─────────────┐ referencias ┌─────────────┐ -│ Agente │ ─────────────▶ │ KB-Engine │ ──────────────────▶ │ Agente lee │ -│ IA │ │ (retrieval) │ │ documentos │ -└─────────────┘ └─────────────┘ └─────────────┘ +┌─────────────┐ query ┌─────────────┐ scored nodes ┌─────────────┐ +│ Agente │ ─────────────▶ │ KDD Toolkit │ ──────────────────▶ │ Agente lee │ +│ IA │ │ (retrieval) │ │ specs/*.md │ +└─────────────┘ └─────────────┘ └─────────────┘ ``` -## Arquitectura +## Stack -### Dual Stack +| Componente | Tecnología | +|------------|------------| +| **Runtime** | Bun (TypeScript) | +| **Grafo** | graphology (in-memory, cargado de `.kdd-index/`) | +| **Vectores** | Brute-force cosine similarity (in-memory) | +| **Embeddings** | `all-mpnet-base-v2` (768 dims) via `@huggingface/transformers` | +| **CLI** | citty | +| **MCP** | `@modelcontextprotocol/sdk` | -| Componente | Local (P2P) | Servidor | -|------------|-------------|----------| -| **Trazabilidad** | SQLite | PostgreSQL | -| **Vectores** | ChromaDB | Qdrant | -| **Grafos** | FalkorDBLite | Neo4j | -| **Embeddings** | sentence-transformers | OpenAI | - -### Modelo Distribuido - -``` -┌──────────────────┐ ┌──────────────────┐ ┌──────────────────┐ -│ Desarrollador 1 │ │ Desarrollador 2 │ │ Desarrollador N │ -│ (indexa local) │ │ (indexa local) │ │ (indexa local) │ -└────────┬─────────┘ └────────┬─────────┘ └────────┬─────────┘ - │ │ │ - └────────────────────────┼────────────────────────┘ - ▼ - ┌──────────────────┐ - │ Servidor Central │ - │ (merge + search) │ - └──────────────────┘ -``` - -Cada desarrollador indexa localmente con embeddings deterministas. El servidor central hace merge y ofrece búsqueda unificada. 
- -## Características - -- **Chunking semántico KDD**: Estrategias específicas para entidades, casos de uso, reglas, procesos -- **Soporte ES/EN**: Detecta patrones en español e inglés -- **Grafo de conocimiento**: Entidades, conceptos, eventos y sus relaciones (FalkorDB/Neo4j) -- **Smart Ingestion**: Pipeline inteligente con detección de tipo de documento -- **CLI**: Interfaz principal via `kb` command +Sin bases de datos. Todo se persiste como ficheros JSON en `.kdd-index/`. ## Quick Start ### Requisitos -- Python 3.11+ (recomendado 3.12) -- (Opcional) Docker para modo servidor +- [Bun](https://bun.sh/) v1.1+ -### Instalación rápida +### Instalación ```bash -pip install git+https://github.com/knowledge-driven-dev/kb-engine.git -``` - -Para actualizar a la última versión: - -```bash -pip install --upgrade --force-reinstall git+https://github.com/knowledge-driven-dev/kb-engine.git -``` - -### Instalación para desarrollo - -```bash -# Clonar git clone https://github.com/knowledge-driven-dev/kb-engine.git cd kb-engine - -# Crear entorno virtual con Python 3.12 -python3.12 -m venv .venv -source .venv/bin/activate - -# Instalar dependencias (modo editable + herramientas de dev) -pip install -e ".[dev]" - -# Verificar instalación -pytest tests/ -v +bun install ``` -El primer `kb index` descargará el modelo de embeddings (`paraphrase-multilingual-MiniLM-L12-v2`, ~120MB). -Los datos locales se almacenan en `~/.kb-engine/` (SQLite, ChromaDB, FalkorDB). 
- -### Instalación (Modo Servidor) +### Indexar ```bash -# Instalar con dependencias de servidor -pip install -e ".[dev,server]" +# Indexar todas las specs (grafo + embeddings) +bun run src/cli.ts index specs/ -# Copiar configuración -cp .env.example .env -# Editar .env con tus credenciales - -# Levantar servicios (PostgreSQL, Qdrant, Neo4j) -docker compose -f docker/docker-compose.yml up -d - -# Ejecutar migraciones -alembic -c migrations/alembic.ini upgrade head +# Solo grafo (sin embeddings, más rápido) +bun run src/cli.ts index specs/ --level L1 ``` -### Instalación MCP (para agentes) - -```bash -# Instalar con dependencias MCP -pip install -e ".[mcp]" - -# Iniciar servidor MCP -kb-mcp -``` +El primer `index` con nivel L2 descargará el modelo de embeddings (`all-mpnet-base-v2`, ~440MB). Los datos se almacenan en `.kdd-index/`. -El servidor MCP expone las herramientas `kdd_search`, `kdd_related` y `kdd_list` para -que agentes de IA consulten la base de conocimiento. - -### Uso (CLI) +### Buscar ```bash -# Indexar documentos -kb index ./docs/domain/ - -# Buscar -kb search "¿cómo se registra un usuario?" - -# Buscar en modo híbrido (vectores + grafo) -kb search "registro de usuario" --mode hybrid +# Búsqueda híbrida (semántica + grafo + lexical) +bun run src/cli.ts search --index-path .kdd-index "impact analysis" -# Ver estado del índice -kb status +# Filtrar por kind +bun run src/cli.ts search --index-path .kdd-index "authentication" --kind entity,command -# Sincronizar incrementalmente (solo archivos cambiados desde un commit) -kb sync --since abc1234 +# Sin embeddings (solo grafo + lexical) +bun run src/cli.ts search --index-path .kdd-index "pedido" --no-embeddings ``` -### Administración del grafo (`kb graph`) - -Comandos para explorar, inspeccionar y administrar el grafo de conocimiento (FalkorDB). -Todos soportan `--json` para salida estructurada. 
+### Explorar ```bash -# Estadísticas del grafo -kb graph stats - -# Listar nodos (opcionalmente filtrar por tipo) -kb graph ls -kb graph ls --type entity - -# Inspeccionar un nodo: vecindario + proveniencia -kb graph inspect entity:User -kb graph inspect entity:User -d 3 # profundidad personalizada +# Traversal del grafo desde un nodo +bun run src/cli.ts graph --index-path .kdd-index "Entity:KDDDocument" -# Verificar alcanzabilidad entre dos nodos -kb graph path entity:User entity:Order -kb graph path entity:User entity:Order --max-depth 3 +# Análisis de impacto (reverse BFS) +bun run src/cli.ts impact --index-path .kdd-index "Entity:KDDDocument" -# Nodos extraídos de un documento -kb graph impact doc-1 +# Búsqueda semántica pura +bun run src/cli.ts semantic --index-path .kdd-index "retrieval query" -# Documentos que contribuyeron a un nodo -kb graph provenance entity:User +# Cobertura de gobernanza +bun run src/cli.ts coverage --index-path .kdd-index "Entity:KDDDocument" -# Consulta Cypher directa -kb graph cypher "MATCH (n) RETURN labels(n)[0] as type, count(n) as cnt" - -# Eliminar un nodo (pide confirmación, -f para omitirla) -kb graph delete entity:Obsolete -kb graph delete entity:Obsolete -f - -# Calidad del grafo -kb graph orphans # entidades stub sin documento primario -kb graph completeness # estado de completitud por entidad -kb graph completeness -s stub +# Violaciones de dependencia entre capas +bun run src/cli.ts violations --index-path .kdd-index ``` -## Estructura del Proyecto +### MCP Server (para agentes) -``` -kb-engine/ -├── src/kb_engine/ -│ ├── core/ # Modelos de dominio e interfaces -│ ├── smart/ # Pipeline de ingesta inteligente (FalkorDB) -│ │ ├── parsers/ # Detectores y parsers KDD -│ │ ├── chunking/ # Chunking jerárquico con contexto -│ │ ├── extraction/ # Extracción de entidades para grafo -│ │ ├── stores/ # FalkorDBGraphStore -│ │ ├── schemas/ # Esquemas de templates KDD -│ │ └── pipelines/ # EntityIngestionPipeline -│ ├── repositories/ 
# Implementaciones de storage -│ ├── chunking/ # Estrategias de chunking clásicas -│ ├── extraction/ # Pipeline de extracción legacy -│ ├── embedding/ # Configuración de embeddings -│ ├── pipelines/ # Pipelines de indexación/retrieval -│ ├── services/ # Lógica de negocio -│ ├── api/ # REST API (FastAPI) -│ ├── cli.py # Comandos CLI (Click) -│ └── mcp_server.py # Servidor MCP para agentes -├── tests/ -│ ├── unit/ -│ └── integration/ -└── docs/design/ # ADRs y documentos de diseño +```bash +bun run src/mcp.ts ``` -## Documentos KDD Soportados +Expone 7 tools MCP: `kdd_search`, `kdd_find_spec`, `kdd_related`, `kdd_impact`, `kdd_read_section`, `kdd_list`, `kdd_stats`. -| Tipo | Descripción | -|------|-------------| -| `entity` | Entidades de dominio (Usuario, Producto, etc.) | -| `use-case` | Casos de uso del sistema | -| `rule` | Reglas de negocio | -| `process` | Procesos y flujos | -| `event` | Eventos de dominio | -| `glossary` | Términos y definiciones | +Variables de entorno opcionales: +- `KDD_INDEX_PATH` — ruta al índice (default: `.kdd-index`) +- `KDD_SPECS_PATH` — ruta a las specs (default: `specs`) -## API - -```bash -# Health check -GET /health - -# Búsqueda (devuelve referencias) -POST /api/v1/retrieval/search -{ - "query": "registro de usuario", - "top_k": 5 -} - -# Indexar documento -POST /api/v1/indexing/documents +## Estructura del Proyecto -# Listar documentos -GET /api/v1/indexing/documents ``` +kb-engine/ +├── specs/ # 52 spec files KDD (sin cambios) +├── src/ +│ ├── domain/ +│ │ ├── types.ts # Enums, interfaces, modelos +│ │ └── rules.ts # BR-DOCUMENT-001, BR-EMBEDDING-001, BR-LAYER-001 +│ ├── application/ +│ │ ├── extractors/ +│ │ │ ├── base.ts # Helpers: makeNodeId, buildWikiLinkEdges, etc. 
+│ │ │ ├── registry.ts # ExtractorRegistry (16 extractors) +│ │ │ └── kinds/ # Un extractor por KDDKind +│ │ ├── commands/ +│ │ │ └── index-document.ts # CMD-001: read → parse → extract → embed → write +│ │ ├── queries/ +│ │ │ ├── hybrid-search.ts # QRY-003: semántica + grafo + lexical +│ │ │ ├── graph-query.ts # QRY-001: BFS traversal +│ │ │ ├── impact-query.ts # QRY-004: reverse BFS +│ │ │ ├── semantic-query.ts # QRY-002: vector puro +│ │ │ ├── coverage-query.ts # QRY-005: gobernanza +│ │ │ └── violations-query.ts # QRY-006: violaciones de capa +│ │ └── chunking.ts # BR-EMBEDDING-001 paragraph chunking +│ ├── infra/ +│ │ ├── artifact-loader.ts # Lee .kdd-index/ +│ │ ├── artifact-writer.ts # Escribe .kdd-index/ +│ │ ├── graph-store.ts # graphology wrapper (BFS, text search) +│ │ ├── vector-store.ts # Brute-force cosine similarity +│ │ ├── embedding-model.ts # @huggingface/transformers wrapper +│ │ ├── markdown-parser.ts # Frontmatter + secciones +│ │ └── wiki-links.ts # [[Target]] extraction +│ ├── container.ts # DI wiring +│ ├── cli.ts # CLI (7 subcommands) +│ └── mcp.ts # MCP server (7 tools) +├── tests/ # bun:test +├── bench/ # Benchmarks +├── docs/ # ADRs y diseño +├── package.json +├── tsconfig.json +└── Makefile +``` + +## 16 KDDKind Types + +Cada kind tiene un extractor dedicado en `src/application/extractors/kinds/`: + +| Kind | Layer | Ejemplo de ID | +|------|-------|---------------| +| `entity` | 01-domain | `Entity:KDDDocument` | +| `event` | 01-domain | `Event:EVT-KDDDocument-Indexed` | +| `business-rule` | 01-domain | `BR:BR-INDEX-001` | +| `business-policy` | 02-behavior | `BP:BP-CREDITO-001` | +| `cross-policy` | 02-behavior | `XP:XP-CREDITOS-001` | +| `command` | 02-behavior | `CMD:CMD-001` | +| `query` | 02-behavior | `QRY:QRY-003` | +| `process` | 02-behavior | `PROC:PROC-001` | +| `use-case` | 02-behavior | `UC:UC-001` | +| `ui-view` | 03-experience | `UIView:UI-Dashboard` | +| `ui-component` | 03-experience | `UIComponent:UI-Button` | +| 
`requirement` | 04-verification | `REQ:REQ-001` | +| `objective` | 00-requirements | `OBJ:OBJ-001` | +| `prd` | 00-requirements | `PRD:PRD-KBEngine` | +| `adr` | 00-requirements | `ADR:ADR-0001` | +| `glossary` | 01-domain | `Glossary:GlossaryName` | + +## Index Levels + +| Nivel | Contenido | Búsqueda | +|-------|-----------|----------| +| **L1** | Grafo de nodos/edges (front-matter + wiki-links) | Grafo + lexical | +| **L2** | L1 + embeddings vectoriales (768 dims) | Híbrida (semántica + grafo + lexical) | ## Tests ```bash -# Todos los tests -pytest tests/ -v - -# Solo unitarios -pytest tests/unit/ -v - -# Solo integración -pytest tests/integration/ -v - -# Con coverage -pytest tests/ --cov=kb_engine +bun test ``` -## Configuración - -Variables de entorno (`.env`). Ver `.env.example` para la lista completa. +## Makefile ```bash -# --- Perfil --- -KB_PROFILE=local # "local" (defecto) o "server" - -# --- Rutas locales (perfil local) --- -SQLITE_PATH=~/.kb-engine/kb.db -CHROMA_PATH=~/.kb-engine/chroma -FALKORDB_PATH=~/.kb-engine/graph.db - -# --- Embeddings --- -EMBEDDING_PROVIDER=local # "local" (sentence-transformers) o "openai" -LOCAL_EMBEDDING_MODEL=paraphrase-multilingual-MiniLM-L12-v2 -OPENAI_API_KEY=sk-... # solo si EMBEDDING_PROVIDER=openai - -# --- Perfil servidor --- -DATABASE_URL=postgresql+asyncpg://user:pass@localhost:5432/kb_engine -QDRANT_HOST=localhost -QDRANT_PORT=6333 -NEO4J_URI=bolt://localhost:7687 -NEO4J_PASSWORD=changeme +make install # bun install +make index # Indexar specs/ +make search q=.. 
# Búsqueda híbrida +make test # bun test +make typecheck # tsc --noEmit +make mcp # Iniciar MCP server +make clean # Limpiar node_modules y .kdd-index ``` -## Roadmap - -- [x] Stack local con SQLite + ChromaDB -- [x] Smart ingestion pipeline con FalkorDB -- [x] CLI completo (`kb index/search/sync/status/graph`) -- [x] Integración MCP para agentes -- [ ] Sincronización P2P con servidor - ## Licencia MIT diff --git a/bench/compare.ts b/bench/compare.ts new file mode 100644 index 0000000..d94a488 --- /dev/null +++ b/bench/compare.ts @@ -0,0 +1,114 @@ +/** + * Benchmark — measures kdd-ts performance. + */ + +import { resolve } from "node:path"; +import { createContainer } from "../src/container.ts"; +import { hybridSearch } from "../src/application/queries/hybrid-search.ts"; +import { graphQuery } from "../src/application/queries/graph-query.ts"; +import { impactQuery } from "../src/application/queries/impact-query.ts"; + +const INDEX_PATH = resolve(import.meta.dir, "../.kdd-index"); +const QUERIES = [ + "documento KDD", + "indexación incremental", + "embedding modelo", + "grafo nodos edges", + "business rule validación", +]; + +interface BenchResult { + label: string; + ms: number; +} + +const results: BenchResult[] = []; + +function record(label: string, ms: number) { + results.push({ label, ms }); + console.log(` ${label}: ${ms.toFixed(1)}ms`); +} + +console.log("\n=== KDD-TS Benchmark ===\n"); + +console.log("1. Index load (graph only, no embeddings):"); +let t0 = performance.now(); +const containerLight = await createContainer(INDEX_PATH, { skipEmbeddings: true }); +record("index_load_graph_only", performance.now() - t0); +console.log(` Nodes: ${containerLight.graphStore.nodeCount()}, Edges: ${containerLight.graphStore.edgeCount()}`); + +console.log("\n2. Index load (with embeddings vectors):"); +t0 = performance.now(); +const containerFull = await createContainer(INDEX_PATH); +record("index_load_with_embeddings", performance.now() - t0); + +console.log("\n3. 
Graph query latency:"); +t0 = performance.now(); +graphQuery({ rootNode: "Entity:KDDDocument", depth: 2 }, containerFull.graphStore); +record("graph_query_cold", performance.now() - t0); + +const graphTimes: number[] = []; +for (let i = 0; i < 100; i++) { + const t = performance.now(); + graphQuery({ rootNode: "Entity:KDDDocument", depth: 2 }, containerFull.graphStore); + graphTimes.push(performance.now() - t); +} +record("graph_query_warm_avg_100", graphTimes.reduce((a, b) => a + b, 0) / graphTimes.length); + +console.log("\n4. Impact query latency:"); +t0 = performance.now(); +impactQuery({ nodeId: "Entity:KDDDocument", depth: 3 }, containerFull.graphStore); +record("impact_query_cold", performance.now() - t0); + +const impactTimes: number[] = []; +for (let i = 0; i < 100; i++) { + const t = performance.now(); + impactQuery({ nodeId: "Entity:KDDDocument", depth: 3 }, containerFull.graphStore); + impactTimes.push(performance.now() - t); +} +record("impact_query_warm_avg_100", impactTimes.reduce((a, b) => a + b, 0) / impactTimes.length); + +console.log("\n5. Hybrid search latency:"); +t0 = performance.now(); +await hybridSearch( + { queryText: QUERIES[0]!, minScore: 0.1, limit: 10 }, + containerFull.graphStore, + containerFull.vectorStore, + containerFull.encodeFn, +); +record("hybrid_search_cold_first_encode", performance.now() - t0); + +const hybridTimes: number[] = []; +for (const q of QUERIES) { + const t = performance.now(); + await hybridSearch( + { queryText: q, minScore: 0.1, limit: 10 }, + containerFull.graphStore, + containerFull.vectorStore, + containerFull.encodeFn, + ); + hybridTimes.push(performance.now() - t); +} +record("hybrid_search_warm_avg_5", hybridTimes.reduce((a, b) => a + b, 0) / hybridTimes.length); + +console.log("\n6. 
Lexical-only search:"); +const lexTimes: number[] = []; +for (const q of QUERIES) { + const t = performance.now(); + await hybridSearch( + { queryText: q, minScore: 0.01, limit: 10 }, + containerFull.graphStore, + null, + null, + ); + lexTimes.push(performance.now() - t); +} +record("lexical_search_avg_5", lexTimes.reduce((a, b) => a + b, 0) / lexTimes.length); + +console.log("\n\n=== Summary ===\n"); +console.log("| Metric | Time (ms) |"); +console.log("|--------|-----------|"); +for (const r of results) { + console.log(`| ${r.label} | ${r.ms.toFixed(1)} |`); +} +console.log("\nDone."); diff --git a/bun.lock b/bun.lock new file mode 100644 index 0000000..15c966f --- /dev/null +++ b/bun.lock @@ -0,0 +1,403 @@ +{ + "lockfileVersion": 1, + "configVersion": 1, + "workspaces": { + "": { + "name": "kdd", + "dependencies": { + "@huggingface/transformers": "^3.8.1", + "@modelcontextprotocol/sdk": "^1.0.0", + "citty": "^0.2.1", + "graphology": "^0.26.0", + "graphology-traversal": "^0.3.1", + "graphology-types": "^0.24.8", + "gray-matter": "^4.0.3", + }, + "devDependencies": { + "@types/bun": "latest", + }, + "peerDependencies": { + "typescript": "^5", + }, + }, + }, + "packages": { + "@emnapi/runtime": ["@emnapi/runtime@1.8.1", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-mehfKSMWjjNol8659Z8KxEMrdSJDDot5SXMq00dM8BN4o+CLNXQ0xH2V7EchNHV4RmbZLmmPdEaXZc5H2FXmDg=="], + + "@hono/node-server": ["@hono/node-server@1.19.9", "", { "peerDependencies": { "hono": "^4" } }, "sha512-vHL6w3ecZsky+8P5MD+eFfaGTyCeOHUIFYMGpQGbrBTSmNNoxv0if69rEZ5giu36weC5saFuznL411gRX7bJDw=="], + + "@huggingface/jinja": ["@huggingface/jinja@0.5.5", "", {}, "sha512-xRlzazC+QZwr6z4ixEqYHo9fgwhTZ3xNSdljlKfUFGZSdlvt166DljRELFUfFytlYOYvo3vTisA/AFOuOAzFQQ=="], + + "@huggingface/transformers": ["@huggingface/transformers@3.8.1", "", { "dependencies": { "@huggingface/jinja": "^0.5.3", "onnxruntime-node": "1.21.0", "onnxruntime-web": "1.22.0-dev.20250409-89f8206ba4", "sharp": "^0.34.1" } }, 
"sha512-tsTk4zVjImqdqjS8/AOZg2yNLd1z9S5v+7oUPpXaasDRwEDhB+xnglK1k5cad26lL5/ZIaeREgWWy0bs9y9pPA=="], + + "@img/colour": ["@img/colour@1.0.0", "", {}, "sha512-A5P/LfWGFSl6nsckYtjw9da+19jB8hkJ6ACTGcDfEJ0aE+l2n2El7dsVM7UVHZQ9s2lmYMWlrS21YLy2IR1LUw=="], + + "@img/sharp-darwin-arm64": ["@img/sharp-darwin-arm64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-darwin-arm64": "1.2.4" }, "os": "darwin", "cpu": "arm64" }, "sha512-imtQ3WMJXbMY4fxb/Ndp6HBTNVtWCUI0WdobyheGf5+ad6xX8VIDO8u2xE4qc/fr08CKG/7dDseFtn6M6g/r3w=="], + + "@img/sharp-darwin-x64": ["@img/sharp-darwin-x64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-darwin-x64": "1.2.4" }, "os": "darwin", "cpu": "x64" }, "sha512-YNEFAF/4KQ/PeW0N+r+aVVsoIY0/qxxikF2SWdp+NRkmMB7y9LBZAVqQ4yhGCm/H3H270OSykqmQMKLBhBJDEw=="], + + "@img/sharp-libvips-darwin-arm64": ["@img/sharp-libvips-darwin-arm64@1.2.4", "", { "os": "darwin", "cpu": "arm64" }, "sha512-zqjjo7RatFfFoP0MkQ51jfuFZBnVE2pRiaydKJ1G/rHZvnsrHAOcQALIi9sA5co5xenQdTugCvtb1cuf78Vf4g=="], + + "@img/sharp-libvips-darwin-x64": ["@img/sharp-libvips-darwin-x64@1.2.4", "", { "os": "darwin", "cpu": "x64" }, "sha512-1IOd5xfVhlGwX+zXv2N93k0yMONvUlANylbJw1eTah8K/Jtpi15KC+WSiaX/nBmbm2HxRM1gZ0nSdjSsrZbGKg=="], + + "@img/sharp-libvips-linux-arm": ["@img/sharp-libvips-linux-arm@1.2.4", "", { "os": "linux", "cpu": "arm" }, "sha512-bFI7xcKFELdiNCVov8e44Ia4u2byA+l3XtsAj+Q8tfCwO6BQ8iDojYdvoPMqsKDkuoOo+X6HZA0s0q11ANMQ8A=="], + + "@img/sharp-libvips-linux-arm64": ["@img/sharp-libvips-linux-arm64@1.2.4", "", { "os": "linux", "cpu": "arm64" }, "sha512-excjX8DfsIcJ10x1Kzr4RcWe1edC9PquDRRPx3YVCvQv+U5p7Yin2s32ftzikXojb1PIFc/9Mt28/y+iRklkrw=="], + + "@img/sharp-libvips-linux-ppc64": ["@img/sharp-libvips-linux-ppc64@1.2.4", "", { "os": "linux", "cpu": "ppc64" }, "sha512-FMuvGijLDYG6lW+b/UvyilUWu5Ayu+3r2d1S8notiGCIyYU/76eig1UfMmkZ7vwgOrzKzlQbFSuQfgm7GYUPpA=="], + + "@img/sharp-libvips-linux-riscv64": ["@img/sharp-libvips-linux-riscv64@1.2.4", "", { "os": "linux", "cpu": "none" 
}, "sha512-oVDbcR4zUC0ce82teubSm+x6ETixtKZBh/qbREIOcI3cULzDyb18Sr/Wcyx7NRQeQzOiHTNbZFF1UwPS2scyGA=="], + + "@img/sharp-libvips-linux-s390x": ["@img/sharp-libvips-linux-s390x@1.2.4", "", { "os": "linux", "cpu": "s390x" }, "sha512-qmp9VrzgPgMoGZyPvrQHqk02uyjA0/QrTO26Tqk6l4ZV0MPWIW6LTkqOIov+J1yEu7MbFQaDpwdwJKhbJvuRxQ=="], + + "@img/sharp-libvips-linux-x64": ["@img/sharp-libvips-linux-x64@1.2.4", "", { "os": "linux", "cpu": "x64" }, "sha512-tJxiiLsmHc9Ax1bz3oaOYBURTXGIRDODBqhveVHonrHJ9/+k89qbLl0bcJns+e4t4rvaNBxaEZsFtSfAdquPrw=="], + + "@img/sharp-libvips-linuxmusl-arm64": ["@img/sharp-libvips-linuxmusl-arm64@1.2.4", "", { "os": "linux", "cpu": "arm64" }, "sha512-FVQHuwx1IIuNow9QAbYUzJ+En8KcVm9Lk5+uGUQJHaZmMECZmOlix9HnH7n1TRkXMS0pGxIJokIVB9SuqZGGXw=="], + + "@img/sharp-libvips-linuxmusl-x64": ["@img/sharp-libvips-linuxmusl-x64@1.2.4", "", { "os": "linux", "cpu": "x64" }, "sha512-+LpyBk7L44ZIXwz/VYfglaX/okxezESc6UxDSoyo2Ks6Jxc4Y7sGjpgU9s4PMgqgjj1gZCylTieNamqA1MF7Dg=="], + + "@img/sharp-linux-arm": ["@img/sharp-linux-arm@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-arm": "1.2.4" }, "os": "linux", "cpu": "arm" }, "sha512-9dLqsvwtg1uuXBGZKsxem9595+ujv0sJ6Vi8wcTANSFpwV/GONat5eCkzQo/1O6zRIkh0m/8+5BjrRr7jDUSZw=="], + + "@img/sharp-linux-arm64": ["@img/sharp-linux-arm64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-arm64": "1.2.4" }, "os": "linux", "cpu": "arm64" }, "sha512-bKQzaJRY/bkPOXyKx5EVup7qkaojECG6NLYswgktOZjaXecSAeCWiZwwiFf3/Y+O1HrauiE3FVsGxFg8c24rZg=="], + + "@img/sharp-linux-ppc64": ["@img/sharp-linux-ppc64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-ppc64": "1.2.4" }, "os": "linux", "cpu": "ppc64" }, "sha512-7zznwNaqW6YtsfrGGDA6BRkISKAAE1Jo0QdpNYXNMHu2+0dTrPflTLNkpc8l7MUP5M16ZJcUvysVWWrMefZquA=="], + + "@img/sharp-linux-riscv64": ["@img/sharp-linux-riscv64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-riscv64": "1.2.4" }, "os": "linux", "cpu": "none" }, 
"sha512-51gJuLPTKa7piYPaVs8GmByo7/U7/7TZOq+cnXJIHZKavIRHAP77e3N2HEl3dgiqdD/w0yUfiJnII77PuDDFdw=="], + + "@img/sharp-linux-s390x": ["@img/sharp-linux-s390x@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-s390x": "1.2.4" }, "os": "linux", "cpu": "s390x" }, "sha512-nQtCk0PdKfho3eC5MrbQoigJ2gd1CgddUMkabUj+rBevs8tZ2cULOx46E7oyX+04WGfABgIwmMC0VqieTiR4jg=="], + + "@img/sharp-linux-x64": ["@img/sharp-linux-x64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-x64": "1.2.4" }, "os": "linux", "cpu": "x64" }, "sha512-MEzd8HPKxVxVenwAa+JRPwEC7QFjoPWuS5NZnBt6B3pu7EG2Ge0id1oLHZpPJdn3OQK+BQDiw9zStiHBTJQQQQ=="], + + "@img/sharp-linuxmusl-arm64": ["@img/sharp-linuxmusl-arm64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linuxmusl-arm64": "1.2.4" }, "os": "linux", "cpu": "arm64" }, "sha512-fprJR6GtRsMt6Kyfq44IsChVZeGN97gTD331weR1ex1c1rypDEABN6Tm2xa1wE6lYb5DdEnk03NZPqA7Id21yg=="], + + "@img/sharp-linuxmusl-x64": ["@img/sharp-linuxmusl-x64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linuxmusl-x64": "1.2.4" }, "os": "linux", "cpu": "x64" }, "sha512-Jg8wNT1MUzIvhBFxViqrEhWDGzqymo3sV7z7ZsaWbZNDLXRJZoRGrjulp60YYtV4wfY8VIKcWidjojlLcWrd8Q=="], + + "@img/sharp-wasm32": ["@img/sharp-wasm32@0.34.5", "", { "dependencies": { "@emnapi/runtime": "^1.7.0" }, "cpu": "none" }, "sha512-OdWTEiVkY2PHwqkbBI8frFxQQFekHaSSkUIJkwzclWZe64O1X4UlUjqqqLaPbUpMOQk6FBu/HtlGXNblIs0huw=="], + + "@img/sharp-win32-arm64": ["@img/sharp-win32-arm64@0.34.5", "", { "os": "win32", "cpu": "arm64" }, "sha512-WQ3AgWCWYSb2yt+IG8mnC6Jdk9Whs7O0gxphblsLvdhSpSTtmu69ZG1Gkb6NuvxsNACwiPV6cNSZNzt0KPsw7g=="], + + "@img/sharp-win32-ia32": ["@img/sharp-win32-ia32@0.34.5", "", { "os": "win32", "cpu": "ia32" }, "sha512-FV9m/7NmeCmSHDD5j4+4pNI8Cp3aW+JvLoXcTUo0IqyjSfAZJ8dIUmijx1qaJsIiU+Hosw6xM5KijAWRJCSgNg=="], + + "@img/sharp-win32-x64": ["@img/sharp-win32-x64@0.34.5", "", { "os": "win32", "cpu": "x64" }, 
"sha512-+29YMsqY2/9eFEiW93eqWnuLcWcufowXewwSNIT6UwZdUUCrM3oFjMWH/Z6/TMmb4hlFenmfAVbpWeup2jryCw=="], + + "@isaacs/fs-minipass": ["@isaacs/fs-minipass@4.0.1", "", { "dependencies": { "minipass": "^7.0.4" } }, "sha512-wgm9Ehl2jpeqP3zw/7mo3kRHFp5MEDhqAdwy1fTGkHAwnkGOVsgpvQhL8B5n1qlb01jV3n/bI0ZfZp5lWA1k4w=="], + + "@modelcontextprotocol/sdk": ["@modelcontextprotocol/sdk@1.26.0", "", { "dependencies": { "@hono/node-server": "^1.19.9", "ajv": "^8.17.1", "ajv-formats": "^3.0.1", "content-type": "^1.0.5", "cors": "^2.8.5", "cross-spawn": "^7.0.5", "eventsource": "^3.0.2", "eventsource-parser": "^3.0.0", "express": "^5.2.1", "express-rate-limit": "^8.2.1", "hono": "^4.11.4", "jose": "^6.1.3", "json-schema-typed": "^8.0.2", "pkce-challenge": "^5.0.0", "raw-body": "^3.0.0", "zod": "^3.25 || ^4.0", "zod-to-json-schema": "^3.25.1" }, "peerDependencies": { "@cfworker/json-schema": "^4.1.1" }, "optionalPeers": ["@cfworker/json-schema"] }, "sha512-Y5RmPncpiDtTXDbLKswIJzTqu2hyBKxTNsgKqKclDbhIgg1wgtf1fRuvxgTnRfcnxtvvgbIEcqUOzZrJ6iSReg=="], + + "@protobufjs/aspromise": ["@protobufjs/aspromise@1.1.2", "", {}, "sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ=="], + + "@protobufjs/base64": ["@protobufjs/base64@1.1.2", "", {}, "sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg=="], + + "@protobufjs/codegen": ["@protobufjs/codegen@2.0.4", "", {}, "sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg=="], + + "@protobufjs/eventemitter": ["@protobufjs/eventemitter@1.1.0", "", {}, "sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q=="], + + "@protobufjs/fetch": ["@protobufjs/fetch@1.1.0", "", { "dependencies": { "@protobufjs/aspromise": "^1.1.1", "@protobufjs/inquire": "^1.1.0" } }, "sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ=="], + + "@protobufjs/float": 
["@protobufjs/float@1.0.2", "", {}, "sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ=="], + + "@protobufjs/inquire": ["@protobufjs/inquire@1.1.0", "", {}, "sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q=="], + + "@protobufjs/path": ["@protobufjs/path@1.1.2", "", {}, "sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA=="], + + "@protobufjs/pool": ["@protobufjs/pool@1.1.0", "", {}, "sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw=="], + + "@protobufjs/utf8": ["@protobufjs/utf8@1.1.0", "", {}, "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw=="], + + "@types/bun": ["@types/bun@1.3.9", "", { "dependencies": { "bun-types": "1.3.9" } }, "sha512-KQ571yULOdWJiMH+RIWIOZ7B2RXQGpL1YQrBtLIV3FqDcCu6FsbFUBwhdKUlCKUpS3PJDsHlJ1QKlpxoVR+xtw=="], + + "@types/node": ["@types/node@25.3.0", "", { "dependencies": { "undici-types": "~7.18.0" } }, "sha512-4K3bqJpXpqfg2XKGK9bpDTc6xO/xoUP/RBWS7AtRMug6zZFaRekiLzjVtAoZMquxoAbzBvy5nxQ7veS5eYzf8A=="], + + "accepts": ["accepts@2.0.0", "", { "dependencies": { "mime-types": "^3.0.0", "negotiator": "^1.0.0" } }, "sha512-5cvg6CtKwfgdmVqY1WIiXKc3Q1bkRqGLi+2W/6ao+6Y7gu/RCwRuAhGEzh5B4KlszSuTLgZYuqFqo5bImjNKng=="], + + "ajv": ["ajv@8.18.0", "", { "dependencies": { "fast-deep-equal": "^3.1.3", "fast-uri": "^3.0.1", "json-schema-traverse": "^1.0.0", "require-from-string": "^2.0.2" } }, "sha512-PlXPeEWMXMZ7sPYOHqmDyCJzcfNrUr3fGNKtezX14ykXOEIvyK81d+qydx89KY5O71FKMPaQ2vBfBFI5NHR63A=="], + + "ajv-formats": ["ajv-formats@3.0.1", "", { "dependencies": { "ajv": "^8.0.0" } }, "sha512-8iUql50EUR+uUcdRQ3HDqa6EVyo3docL8g5WJ3FNcWmu62IbkGUue/pEyLBW8VGKKucTPgqeks4fIU1DA4yowQ=="], + + "argparse": ["argparse@1.0.10", "", { "dependencies": { "sprintf-js": "~1.0.2" } }, 
"sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg=="], + + "body-parser": ["body-parser@2.2.2", "", { "dependencies": { "bytes": "^3.1.2", "content-type": "^1.0.5", "debug": "^4.4.3", "http-errors": "^2.0.0", "iconv-lite": "^0.7.0", "on-finished": "^2.4.1", "qs": "^6.14.1", "raw-body": "^3.0.1", "type-is": "^2.0.1" } }, "sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA=="], + + "boolean": ["boolean@3.2.0", "", {}, "sha512-d0II/GO9uf9lfUHH2BQsjxzRJZBdsjgsBiW4BvhWk/3qoKwQFjIDVN19PfX8F2D/r9PCMTtLWjYVCFrpeYUzsw=="], + + "bun-types": ["bun-types@1.3.9", "", { "dependencies": { "@types/node": "*" } }, "sha512-+UBWWOakIP4Tswh0Bt0QD0alpTY8cb5hvgiYeWCMet9YukHbzuruIEeXC2D7nMJPB12kbh8C7XJykSexEqGKJg=="], + + "bytes": ["bytes@3.1.2", "", {}, "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg=="], + + "call-bind-apply-helpers": ["call-bind-apply-helpers@1.0.2", "", { "dependencies": { "es-errors": "^1.3.0", "function-bind": "^1.1.2" } }, "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ=="], + + "call-bound": ["call-bound@1.0.4", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.2", "get-intrinsic": "^1.3.0" } }, "sha512-+ys997U96po4Kx/ABpBCqhA9EuxJaQWDQg7295H4hBphv3IZg0boBKuwYpt4YXp6MZ5AmZQnU/tyMTlRpaSejg=="], + + "chownr": ["chownr@3.0.0", "", {}, "sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g=="], + + "citty": ["citty@0.2.1", "", {}, "sha512-kEV95lFBhQgtogAPlQfJJ0WGVSokvLr/UEoFPiKKOXF7pl98HfUVUD0ejsuTCld/9xH9vogSywZ5KqHzXrZpqg=="], + + "content-disposition": ["content-disposition@1.0.1", "", {}, "sha512-oIXISMynqSqm241k6kcQ5UwttDILMK4BiurCfGEREw6+X9jkkpEe5T9FZaApyLGGOnFuyMWZpdolTXMtvEJ08Q=="], + + "content-type": ["content-type@1.0.5", "", {}, "sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA=="], + + 
"cookie": ["cookie@0.7.2", "", {}, "sha512-yki5XnKuf750l50uGTllt6kKILY4nQ1eNIQatoXEByZ5dWgnKqbnqmTrBE5B4N7lrMJKQ2ytWMiTO2o0v6Ew/w=="], + + "cookie-signature": ["cookie-signature@1.2.2", "", {}, "sha512-D76uU73ulSXrD1UXF4KE2TMxVVwhsnCgfAyTg9k8P6KGZjlXKrOLe4dJQKI3Bxi5wjesZoFXJWElNWBjPZMbhg=="], + + "cors": ["cors@2.8.6", "", { "dependencies": { "object-assign": "^4", "vary": "^1" } }, "sha512-tJtZBBHA6vjIAaF6EnIaq6laBBP9aq/Y3ouVJjEfoHbRBcHBAHYcMh/w8LDrk2PvIMMq8gmopa5D4V8RmbrxGw=="], + + "cross-spawn": ["cross-spawn@7.0.6", "", { "dependencies": { "path-key": "^3.1.0", "shebang-command": "^2.0.0", "which": "^2.0.1" } }, "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA=="], + + "debug": ["debug@4.4.3", "", { "dependencies": { "ms": "^2.1.3" } }, "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA=="], + + "define-data-property": ["define-data-property@1.1.4", "", { "dependencies": { "es-define-property": "^1.0.0", "es-errors": "^1.3.0", "gopd": "^1.0.1" } }, "sha512-rBMvIzlpA8v6E+SJZoo++HAYqsLrkg7MSfIinMPFhmkorw7X+dOXVJQs+QT69zGkzMyfDnIMN2Wid1+NbL3T+A=="], + + "define-properties": ["define-properties@1.2.1", "", { "dependencies": { "define-data-property": "^1.0.1", "has-property-descriptors": "^1.0.0", "object-keys": "^1.1.1" } }, "sha512-8QmQKqEASLd5nx0U1B1okLElbUuuttJ/AnYmRXbbbGDWh6uS208EjD4Xqq/I9wK7u0v6O08XhTWnt5XtEbR6Dg=="], + + "depd": ["depd@2.0.0", "", {}, "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw=="], + + "detect-libc": ["detect-libc@2.1.2", "", {}, "sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ=="], + + "detect-node": ["detect-node@2.1.0", "", {}, "sha512-T0NIuQpnTvFDATNuHN5roPwSBG83rFsuO+MXXH9/3N1eFbn4wcPjttvjMLEPWJ0RGUYgQE7cGgS3tNxbqCGM7g=="], + + "dunder-proto": ["dunder-proto@1.0.1", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.1", "es-errors": "^1.3.0", 
"gopd": "^1.2.0" } }, "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A=="], + + "ee-first": ["ee-first@1.1.1", "", {}, "sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow=="], + + "encodeurl": ["encodeurl@2.0.0", "", {}, "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg=="], + + "es-define-property": ["es-define-property@1.0.1", "", {}, "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g=="], + + "es-errors": ["es-errors@1.3.0", "", {}, "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw=="], + + "es-object-atoms": ["es-object-atoms@1.1.1", "", { "dependencies": { "es-errors": "^1.3.0" } }, "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA=="], + + "es6-error": ["es6-error@4.1.1", "", {}, "sha512-Um/+FxMr9CISWh0bi5Zv0iOD+4cFh5qLeks1qhAopKVAJw3drgKbKySikp7wGhDL0HPeaja0P5ULZrxLkniUVg=="], + + "escape-html": ["escape-html@1.0.3", "", {}, "sha512-NiSupZ4OeuGwr68lGIeym/ksIZMJodUGOSCZ/FSnTxcrekbvqrgdUxlJOMpijaKZVjAJrWrGs/6Jy8OMuyj9ow=="], + + "escape-string-regexp": ["escape-string-regexp@4.0.0", "", {}, "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA=="], + + "esprima": ["esprima@4.0.1", "", { "bin": { "esparse": "./bin/esparse.js", "esvalidate": "./bin/esvalidate.js" } }, "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A=="], + + "etag": ["etag@1.8.1", "", {}, "sha512-aIL5Fx7mawVa300al2BnEE4iNvo1qETxLrPI/o05L7z6go7fCw1J6EQmbK4FmJ2AS7kgVF/KEZWufBfdClMcPg=="], + + "events": ["events@3.3.0", "", {}, "sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q=="], + + "eventsource": ["eventsource@3.0.7", "", { "dependencies": { "eventsource-parser": "^3.0.1" } }, 
"sha512-CRT1WTyuQoD771GW56XEZFQ/ZoSfWid1alKGDYMmkt2yl8UXrVR4pspqWNEcqKvVIzg6PAltWjxcSSPrboA4iA=="], + + "eventsource-parser": ["eventsource-parser@3.0.6", "", {}, "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg=="], + + "express": ["express@5.2.1", "", { "dependencies": { "accepts": "^2.0.0", "body-parser": "^2.2.1", "content-disposition": "^1.0.0", "content-type": "^1.0.5", "cookie": "^0.7.1", "cookie-signature": "^1.2.1", "debug": "^4.4.0", "depd": "^2.0.0", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "etag": "^1.8.1", "finalhandler": "^2.1.0", "fresh": "^2.0.0", "http-errors": "^2.0.0", "merge-descriptors": "^2.0.0", "mime-types": "^3.0.0", "on-finished": "^2.4.1", "once": "^1.4.0", "parseurl": "^1.3.3", "proxy-addr": "^2.0.7", "qs": "^6.14.0", "range-parser": "^1.2.1", "router": "^2.2.0", "send": "^1.1.0", "serve-static": "^2.2.0", "statuses": "^2.0.1", "type-is": "^2.0.1", "vary": "^1.1.2" } }, "sha512-hIS4idWWai69NezIdRt2xFVofaF4j+6INOpJlVOLDO8zXGpUVEVzIYk12UUi2JzjEzWL3IOAxcTubgz9Po0yXw=="], + + "express-rate-limit": ["express-rate-limit@8.2.1", "", { "dependencies": { "ip-address": "10.0.1" }, "peerDependencies": { "express": ">= 4.11" } }, "sha512-PCZEIEIxqwhzw4KF0n7QF4QqruVTcF73O5kFKUnGOyjbCCgizBBiFaYpd/fnBLUMPw/BWw9OsiN7GgrNYr7j6g=="], + + "extend-shallow": ["extend-shallow@2.0.1", "", { "dependencies": { "is-extendable": "^0.1.0" } }, "sha512-zCnTtlxNoAiDc3gqY2aYAWFx7XWWiasuF2K8Me5WbN8otHKTUKBwjPtNpRs/rbUZm7KxWAaNj7P1a/p52GbVug=="], + + "fast-deep-equal": ["fast-deep-equal@3.1.3", "", {}, "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q=="], + + "fast-uri": ["fast-uri@3.1.0", "", {}, "sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA=="], + + "finalhandler": ["finalhandler@2.1.1", "", { "dependencies": { "debug": "^4.4.0", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "on-finished": "^2.4.1", "parseurl": "^1.3.3", 
"statuses": "^2.0.1" } }, "sha512-S8KoZgRZN+a5rNwqTxlZZePjT/4cnm0ROV70LedRHZ0p8u9fRID0hJUZQpkKLzro8LfmC8sx23bY6tVNxv8pQA=="], + + "flatbuffers": ["flatbuffers@25.9.23", "", {}, "sha512-MI1qs7Lo4Syw0EOzUl0xjs2lsoeqFku44KpngfIduHBYvzm8h2+7K8YMQh1JtVVVrUvhLpNwqVi4DERegUJhPQ=="], + + "forwarded": ["forwarded@0.2.0", "", {}, "sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow=="], + + "fresh": ["fresh@2.0.0", "", {}, "sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A=="], + + "function-bind": ["function-bind@1.1.2", "", {}, "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA=="], + + "get-intrinsic": ["get-intrinsic@1.3.0", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.2", "es-define-property": "^1.0.1", "es-errors": "^1.3.0", "es-object-atoms": "^1.1.1", "function-bind": "^1.1.2", "get-proto": "^1.0.1", "gopd": "^1.2.0", "has-symbols": "^1.1.0", "hasown": "^2.0.2", "math-intrinsics": "^1.1.0" } }, "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ=="], + + "get-proto": ["get-proto@1.0.1", "", { "dependencies": { "dunder-proto": "^1.0.1", "es-object-atoms": "^1.0.0" } }, "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g=="], + + "global-agent": ["global-agent@3.0.0", "", { "dependencies": { "boolean": "^3.0.1", "es6-error": "^4.1.1", "matcher": "^3.0.0", "roarr": "^2.15.3", "semver": "^7.3.2", "serialize-error": "^7.0.1" } }, "sha512-PT6XReJ+D07JvGoxQMkT6qji/jVNfX/h364XHZOWeRzy64sSFr+xJ5OX7LI3b4MPQzdL4H8Y8M0xzPpsVMwA8Q=="], + + "globalthis": ["globalthis@1.0.4", "", { "dependencies": { "define-properties": "^1.2.1", "gopd": "^1.0.1" } }, "sha512-DpLKbNU4WylpxJykQujfCcwYWiV/Jhm50Goo0wrVILAv5jOr9d+H+UR3PhSCD2rCCEIg0uc+G+muBTwD54JhDQ=="], + + "gopd": ["gopd@1.2.0", "", {}, 
"sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg=="], + + "graphology": ["graphology@0.26.0", "", { "dependencies": { "events": "^3.3.0" }, "peerDependencies": { "graphology-types": ">=0.24.0" } }, "sha512-8SSImzgUUYC89Z042s+0r/vMibY7GX/Emz4LDO5e7jYXhuoWfHISPFJYjpRLUSJGq6UQ6xlenvX1p/hJdfXuXg=="], + + "graphology-indices": ["graphology-indices@0.17.0", "", { "dependencies": { "graphology-utils": "^2.4.2", "mnemonist": "^0.39.0" }, "peerDependencies": { "graphology-types": ">=0.20.0" } }, "sha512-A7RXuKQvdqSWOpn7ZVQo4S33O0vCfPBnUSf7FwE0zNCasqwZVUaCXePuWo5HBpWw68KJcwObZDHpFk6HKH6MYQ=="], + + "graphology-traversal": ["graphology-traversal@0.3.1", "", { "dependencies": { "graphology-indices": "^0.17.0", "graphology-utils": "^2.0.0" }, "peerDependencies": { "graphology-types": ">=0.20.0" } }, "sha512-lGLrLKEDKtNgAKgHVhVftKf3cb/nuWwuVPQZHXRnN90JWn0RSjco/s+NB2ARSlMapEMlbnPgv6j++427yTnU3Q=="], + + "graphology-types": ["graphology-types@0.24.8", "", {}, "sha512-hDRKYXa8TsoZHjgEaysSRyPdT6uB78Ci8WnjgbStlQysz7xR52PInxNsmnB7IBOM1BhikxkNyCVEFgmPKnpx3Q=="], + + "graphology-utils": ["graphology-utils@2.5.2", "", { "peerDependencies": { "graphology-types": ">=0.23.0" } }, "sha512-ckHg8MXrXJkOARk56ZaSCM1g1Wihe2d6iTmz1enGOz4W/l831MBCKSayeFQfowgF8wd+PQ4rlch/56Vs/VZLDQ=="], + + "gray-matter": ["gray-matter@4.0.3", "", { "dependencies": { "js-yaml": "^3.13.1", "kind-of": "^6.0.2", "section-matter": "^1.0.0", "strip-bom-string": "^1.0.0" } }, "sha512-5v6yZd4JK3eMI3FqqCouswVqwugaA9r4dNZB1wwcmrD02QkV5H0y7XBQW8QwQqEaZY1pM9aqORSORhJRdNK44Q=="], + + "guid-typescript": ["guid-typescript@1.0.9", "", {}, "sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ=="], + + "has-property-descriptors": ["has-property-descriptors@1.0.2", "", { "dependencies": { "es-define-property": "^1.0.0" } }, "sha512-55JNKuIW+vq4Ke1BjOTjM2YctQIvCT7GFzHwmfZPGo5wnrgkid0YQtnAleFSqumZm4az3n2BS+erby5ipJdgrg=="], + + "has-symbols": 
["has-symbols@1.1.0", "", {}, "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ=="], + + "hasown": ["hasown@2.0.2", "", { "dependencies": { "function-bind": "^1.1.2" } }, "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ=="], + + "hono": ["hono@4.12.0", "", {}, "sha512-NekXntS5M94pUfiVZ8oXXK/kkri+5WpX2/Ik+LVsl+uvw+soj4roXIsPqO+XsWrAw20mOzaXOZf3Q7PfB9A/IA=="], + + "http-errors": ["http-errors@2.0.1", "", { "dependencies": { "depd": "~2.0.0", "inherits": "~2.0.4", "setprototypeof": "~1.2.0", "statuses": "~2.0.2", "toidentifier": "~1.0.1" } }, "sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ=="], + + "iconv-lite": ["iconv-lite@0.7.2", "", { "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" } }, "sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw=="], + + "inherits": ["inherits@2.0.4", "", {}, "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ=="], + + "ip-address": ["ip-address@10.0.1", "", {}, "sha512-NWv9YLW4PoW2B7xtzaS3NCot75m6nK7Icdv0o3lfMceJVRfSoQwqD4wEH5rLwoKJwUiZ/rfpiVBhnaF0FK4HoA=="], + + "ipaddr.js": ["ipaddr.js@1.9.1", "", {}, "sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g=="], + + "is-extendable": ["is-extendable@0.1.1", "", {}, "sha512-5BMULNob1vgFX6EjQw5izWDxrecWK9AM72rugNr0TFldMOi0fj6Jk+zeKIt0xGj4cEfQIJth4w3OKWOJ4f+AFw=="], + + "is-promise": ["is-promise@4.0.0", "", {}, "sha512-hvpoI6korhJMnej285dSg6nu1+e6uxs7zG3BYAm5byqDsgJNWwxzM6z6iZiAgQR4TJ30JmBTOwqZUw3WlyH3AQ=="], + + "isexe": ["isexe@2.0.0", "", {}, "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw=="], + + "jose": ["jose@6.1.3", "", {}, "sha512-0TpaTfihd4QMNwrz/ob2Bp7X04yuxJkjRGi4aKmOqwhov54i6u79oCv7T+C7lo70MKH6BesI3vscD1yb/yzKXQ=="], + + "js-yaml": ["js-yaml@3.14.2", "", { 
"dependencies": { "argparse": "^1.0.7", "esprima": "^4.0.0" }, "bin": { "js-yaml": "bin/js-yaml.js" } }, "sha512-PMSmkqxr106Xa156c2M265Z+FTrPl+oxd/rgOQy2tijQeK5TxQ43psO1ZCwhVOSdnn+RzkzlRz/eY4BgJBYVpg=="], + + "json-schema-traverse": ["json-schema-traverse@1.0.0", "", {}, "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="], + + "json-schema-typed": ["json-schema-typed@8.0.2", "", {}, "sha512-fQhoXdcvc3V28x7C7BMs4P5+kNlgUURe2jmUT1T//oBRMDrqy1QPelJimwZGo7Hg9VPV3EQV5Bnq4hbFy2vetA=="], + + "json-stringify-safe": ["json-stringify-safe@5.0.1", "", {}, "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA=="], + + "kind-of": ["kind-of@6.0.3", "", {}, "sha512-dcS1ul+9tmeD95T+x28/ehLgd9mENa3LsvDTtzm3vyBEO7RPptvAD+t44WVXaUjTBRcrpFeFlC8WCruUR456hw=="], + + "long": ["long@5.3.2", "", {}, "sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA=="], + + "matcher": ["matcher@3.0.0", "", { "dependencies": { "escape-string-regexp": "^4.0.0" } }, "sha512-OkeDaAZ/bQCxeFAozM55PKcKU0yJMPGifLwV4Qgjitu+5MoAfSQN4lsLJeXZ1b8w0x+/Emda6MZgXS1jvsapng=="], + + "math-intrinsics": ["math-intrinsics@1.1.0", "", {}, "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g=="], + + "media-typer": ["media-typer@1.1.0", "", {}, "sha512-aisnrDP4GNe06UcKFnV5bfMNPBUw4jsLGaWwWfnH3v02GnBuXX2MCVn5RbrWo0j3pczUilYblq7fQ7Nw2t5XKw=="], + + "merge-descriptors": ["merge-descriptors@2.0.0", "", {}, "sha512-Snk314V5ayFLhp3fkUREub6WtjBfPdCPY1Ln8/8munuLuiYhsABgBVWsozAG+MWMbVEvcdcpbi9R7ww22l9Q3g=="], + + "mime-db": ["mime-db@1.54.0", "", {}, "sha512-aU5EJuIN2WDemCcAp2vFBfp/m4EAhWJnUNSSw0ixs7/kXbd6Pg64EmwJkNdFhB8aWt1sH2CTXrLxo/iAGV3oPQ=="], + + "mime-types": ["mime-types@3.0.2", "", { "dependencies": { "mime-db": "^1.54.0" } }, "sha512-Lbgzdk0h4juoQ9fCKXW4by0UJqj+nOOrI9MJ1sSj4nI8aI2eo1qmvQEie4VD1glsS250n15LsWsYtCugiStS5A=="], + + "minipass": 
["minipass@7.1.3", "", {}, "sha512-tEBHqDnIoM/1rXME1zgka9g6Q2lcoCkxHLuc7ODJ5BxbP5d4c2Z5cGgtXAku59200Cx7diuHTOYfSBD8n6mm8A=="], + + "minizlib": ["minizlib@3.1.0", "", { "dependencies": { "minipass": "^7.1.2" } }, "sha512-KZxYo1BUkWD2TVFLr0MQoM8vUUigWD3LlD83a/75BqC+4qE0Hb1Vo5v1FgcfaNXvfXzr+5EhQ6ing/CaBijTlw=="], + + "mnemonist": ["mnemonist@0.39.8", "", { "dependencies": { "obliterator": "^2.0.1" } }, "sha512-vyWo2K3fjrUw8YeeZ1zF0fy6Mu59RHokURlld8ymdUPjMlD9EC9ov1/YPqTgqRvUN9nTr3Gqfz29LYAmu0PHPQ=="], + + "ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="], + + "negotiator": ["negotiator@1.0.0", "", {}, "sha512-8Ofs/AUQh8MaEcrlq5xOX0CQ9ypTF5dl78mjlMNfOK08fzpgTHQRQPBxcPlEtIw0yRpws+Zo/3r+5WRby7u3Gg=="], + + "object-assign": ["object-assign@4.1.1", "", {}, "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg=="], + + "object-inspect": ["object-inspect@1.13.4", "", {}, "sha512-W67iLl4J2EXEGTbfeHCffrjDfitvLANg0UlX3wFUUSTx92KXRFegMHUVgSqE+wvhAbi4WqjGg9czysTV2Epbew=="], + + "object-keys": ["object-keys@1.1.1", "", {}, "sha512-NuAESUOUMrlIXOfHKzD6bpPu3tYt3xvjNdRIQ+FeT0lNb4K8WR70CaDxhuNguS2XG+GjkyMwOzsN5ZktImfhLA=="], + + "obliterator": ["obliterator@2.0.5", "", {}, "sha512-42CPE9AhahZRsMNslczq0ctAEtqk8Eka26QofnqC346BZdHDySk3LWka23LI7ULIw11NmltpiLagIq8gBozxTw=="], + + "on-finished": ["on-finished@2.4.1", "", { "dependencies": { "ee-first": "1.1.1" } }, "sha512-oVlzkg3ENAhCk2zdv7IJwd/QUD4z2RxRwpkcGY8psCVcCYZNq4wYnVWALHM+brtuJjePWiYF/ClmuDr8Ch5+kg=="], + + "once": ["once@1.4.0", "", { "dependencies": { "wrappy": "1" } }, "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w=="], + + "onnxruntime-common": ["onnxruntime-common@1.21.0", "", {}, "sha512-Q632iLLrtCAVOTO65dh2+mNbQir/QNTVBG3h/QdZBpns7mZ0RYbLRBgGABPbpU9351AgYy7SJf1WaeVwMrBFPQ=="], + + "onnxruntime-node": ["onnxruntime-node@1.21.0", "", { "dependencies": { 
"global-agent": "^3.0.0", "onnxruntime-common": "1.21.0", "tar": "^7.0.1" }, "os": [ "linux", "win32", "darwin", ] }, "sha512-NeaCX6WW2L8cRCSqy3bInlo5ojjQqu2fD3D+9W5qb5irwxhEyWKXeH2vZ8W9r6VxaMPUan+4/7NDwZMtouZxEw=="], + + "onnxruntime-web": ["onnxruntime-web@1.22.0-dev.20250409-89f8206ba4", "", { "dependencies": { "flatbuffers": "^25.1.24", "guid-typescript": "^1.0.9", "long": "^5.2.3", "onnxruntime-common": "1.22.0-dev.20250409-89f8206ba4", "platform": "^1.3.6", "protobufjs": "^7.2.4" } }, "sha512-0uS76OPgH0hWCPrFKlL8kYVV7ckM7t/36HfbgoFw6Nd0CZVVbQC4PkrR8mBX8LtNUFZO25IQBqV2Hx2ho3FlbQ=="], + + "parseurl": ["parseurl@1.3.3", "", {}, "sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ=="], + + "path-key": ["path-key@3.1.1", "", {}, "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q=="], + + "path-to-regexp": ["path-to-regexp@8.3.0", "", {}, "sha512-7jdwVIRtsP8MYpdXSwOS0YdD0Du+qOoF/AEPIt88PcCFrZCzx41oxku1jD88hZBwbNUIEfpqvuhjFaMAqMTWnA=="], + + "pkce-challenge": ["pkce-challenge@5.0.1", "", {}, "sha512-wQ0b/W4Fr01qtpHlqSqspcj3EhBvimsdh0KlHhH8HRZnMsEa0ea2fTULOXOS9ccQr3om+GcGRk4e+isrZWV8qQ=="], + + "platform": ["platform@1.3.6", "", {}, "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg=="], + + "protobufjs": ["protobufjs@7.5.4", "", { "dependencies": { "@protobufjs/aspromise": "^1.1.2", "@protobufjs/base64": "^1.1.2", "@protobufjs/codegen": "^2.0.4", "@protobufjs/eventemitter": "^1.1.0", "@protobufjs/fetch": "^1.1.0", "@protobufjs/float": "^1.0.2", "@protobufjs/inquire": "^1.1.0", "@protobufjs/path": "^1.1.2", "@protobufjs/pool": "^1.1.0", "@protobufjs/utf8": "^1.1.0", "@types/node": ">=13.7.0", "long": "^5.0.0" } }, "sha512-CvexbZtbov6jW2eXAvLukXjXUW1TzFaivC46BpWc/3BpcCysb5Vffu+B3XHMm8lVEuy2Mm4XGex8hBSg1yapPg=="], + + "proxy-addr": ["proxy-addr@2.0.7", "", { "dependencies": { "forwarded": "0.2.0", "ipaddr.js": "1.9.1" } }, 
"sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg=="], + + "qs": ["qs@6.15.0", "", { "dependencies": { "side-channel": "^1.1.0" } }, "sha512-mAZTtNCeetKMH+pSjrb76NAM8V9a05I9aBZOHztWy/UqcJdQYNsf59vrRKWnojAT9Y+GbIvoTBC++CPHqpDBhQ=="], + + "range-parser": ["range-parser@1.2.1", "", {}, "sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg=="], + + "raw-body": ["raw-body@3.0.2", "", { "dependencies": { "bytes": "~3.1.2", "http-errors": "~2.0.1", "iconv-lite": "~0.7.0", "unpipe": "~1.0.0" } }, "sha512-K5zQjDllxWkf7Z5xJdV0/B0WTNqx6vxG70zJE4N0kBs4LovmEYWJzQGxC9bS9RAKu3bgM40lrd5zoLJ12MQ5BA=="], + + "require-from-string": ["require-from-string@2.0.2", "", {}, "sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw=="], + + "roarr": ["roarr@2.15.4", "", { "dependencies": { "boolean": "^3.0.1", "detect-node": "^2.0.4", "globalthis": "^1.0.1", "json-stringify-safe": "^5.0.1", "semver-compare": "^1.0.0", "sprintf-js": "^1.1.2" } }, "sha512-CHhPh+UNHD2GTXNYhPWLnU8ONHdI+5DI+4EYIAOaiD63rHeYlZvyh8P+in5999TTSFgUYuKUAjzRI4mdh/p+2A=="], + + "router": ["router@2.2.0", "", { "dependencies": { "debug": "^4.4.0", "depd": "^2.0.0", "is-promise": "^4.0.0", "parseurl": "^1.3.3", "path-to-regexp": "^8.0.0" } }, "sha512-nLTrUKm2UyiL7rlhapu/Zl45FwNgkZGaCpZbIHajDYgwlJCOzLSk+cIPAnsEqV955GjILJnKbdQC1nVPz+gAYQ=="], + + "safer-buffer": ["safer-buffer@2.1.2", "", {}, "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="], + + "section-matter": ["section-matter@1.0.0", "", { "dependencies": { "extend-shallow": "^2.0.1", "kind-of": "^6.0.0" } }, "sha512-vfD3pmTzGpufjScBh50YHKzEu2lxBWhVEHsNGoEXmCmn2hKGfeNLYMzCJpe8cD7gqX7TJluOVpBkAequ6dgMmA=="], + + "semver": ["semver@7.7.4", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA=="], + + 
"semver-compare": ["semver-compare@1.0.0", "", {}, "sha512-YM3/ITh2MJ5MtzaM429anh+x2jiLVjqILF4m4oyQB18W7Ggea7BfqdH/wGMK7dDiMghv/6WG7znWMwUDzJiXow=="], + + "send": ["send@1.2.1", "", { "dependencies": { "debug": "^4.4.3", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "etag": "^1.8.1", "fresh": "^2.0.0", "http-errors": "^2.0.1", "mime-types": "^3.0.2", "ms": "^2.1.3", "on-finished": "^2.4.1", "range-parser": "^1.2.1", "statuses": "^2.0.2" } }, "sha512-1gnZf7DFcoIcajTjTwjwuDjzuz4PPcY2StKPlsGAQ1+YH20IRVrBaXSWmdjowTJ6u8Rc01PoYOGHXfP1mYcZNQ=="], + + "serialize-error": ["serialize-error@7.0.1", "", { "dependencies": { "type-fest": "^0.13.1" } }, "sha512-8I8TjW5KMOKsZQTvoxjuSIa7foAwPWGOts+6o7sgjz41/qMD9VQHEDxi6PBvK2l0MXUmqZyNpUK+T2tQaaElvw=="], + + "serve-static": ["serve-static@2.2.1", "", { "dependencies": { "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "parseurl": "^1.3.3", "send": "^1.2.0" } }, "sha512-xRXBn0pPqQTVQiC8wyQrKs2MOlX24zQ0POGaj0kultvoOCstBQM5yvOhAVSUwOMjQtTvsPWoNCHfPGwaaQJhTw=="], + + "setprototypeof": ["setprototypeof@1.2.0", "", {}, "sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw=="], + + "sharp": ["sharp@0.34.5", "", { "dependencies": { "@img/colour": "^1.0.0", "detect-libc": "^2.1.2", "semver": "^7.7.3" }, "optionalDependencies": { "@img/sharp-darwin-arm64": "0.34.5", "@img/sharp-darwin-x64": "0.34.5", "@img/sharp-libvips-darwin-arm64": "1.2.4", "@img/sharp-libvips-darwin-x64": "1.2.4", "@img/sharp-libvips-linux-arm": "1.2.4", "@img/sharp-libvips-linux-arm64": "1.2.4", "@img/sharp-libvips-linux-ppc64": "1.2.4", "@img/sharp-libvips-linux-riscv64": "1.2.4", "@img/sharp-libvips-linux-s390x": "1.2.4", "@img/sharp-libvips-linux-x64": "1.2.4", "@img/sharp-libvips-linuxmusl-arm64": "1.2.4", "@img/sharp-libvips-linuxmusl-x64": "1.2.4", "@img/sharp-linux-arm": "0.34.5", "@img/sharp-linux-arm64": "0.34.5", "@img/sharp-linux-ppc64": "0.34.5", "@img/sharp-linux-riscv64": "0.34.5", "@img/sharp-linux-s390x": 
"0.34.5", "@img/sharp-linux-x64": "0.34.5", "@img/sharp-linuxmusl-arm64": "0.34.5", "@img/sharp-linuxmusl-x64": "0.34.5", "@img/sharp-wasm32": "0.34.5", "@img/sharp-win32-arm64": "0.34.5", "@img/sharp-win32-ia32": "0.34.5", "@img/sharp-win32-x64": "0.34.5" } }, "sha512-Ou9I5Ft9WNcCbXrU9cMgPBcCK8LiwLqcbywW3t4oDV37n1pzpuNLsYiAV8eODnjbtQlSDwZ2cUEeQz4E54Hltg=="], + + "shebang-command": ["shebang-command@2.0.0", "", { "dependencies": { "shebang-regex": "^3.0.0" } }, "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA=="], + + "shebang-regex": ["shebang-regex@3.0.0", "", {}, "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A=="], + + "side-channel": ["side-channel@1.1.0", "", { "dependencies": { "es-errors": "^1.3.0", "object-inspect": "^1.13.3", "side-channel-list": "^1.0.0", "side-channel-map": "^1.0.1", "side-channel-weakmap": "^1.0.2" } }, "sha512-ZX99e6tRweoUXqR+VBrslhda51Nh5MTQwou5tnUDgbtyM0dBgmhEDtWGP/xbKn6hqfPRHujUNwz5fy/wbbhnpw=="], + + "side-channel-list": ["side-channel-list@1.0.0", "", { "dependencies": { "es-errors": "^1.3.0", "object-inspect": "^1.13.3" } }, "sha512-FCLHtRD/gnpCiCHEiJLOwdmFP+wzCmDEkc9y7NsYxeF4u7Btsn1ZuwgwJGxImImHicJArLP4R0yX4c2KCrMrTA=="], + + "side-channel-map": ["side-channel-map@1.0.1", "", { "dependencies": { "call-bound": "^1.0.2", "es-errors": "^1.3.0", "get-intrinsic": "^1.2.5", "object-inspect": "^1.13.3" } }, "sha512-VCjCNfgMsby3tTdo02nbjtM/ewra6jPHmpThenkTYh8pG9ucZ/1P8So4u4FGBek/BjpOVsDCMoLA/iuBKIFXRA=="], + + "side-channel-weakmap": ["side-channel-weakmap@1.0.2", "", { "dependencies": { "call-bound": "^1.0.2", "es-errors": "^1.3.0", "get-intrinsic": "^1.2.5", "object-inspect": "^1.13.3", "side-channel-map": "^1.0.1" } }, "sha512-WPS/HvHQTYnHisLo9McqBHOJk2FkHO/tlpvldyrnem4aeQp4hai3gythswg6p01oSoTl58rcpiFAjF2br2Ak2A=="], + + "sprintf-js": ["sprintf-js@1.0.3", "", {}, 
"sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g=="], + + "statuses": ["statuses@2.0.2", "", {}, "sha512-DvEy55V3DB7uknRo+4iOGT5fP1slR8wQohVdknigZPMpMstaKJQWhwiYBACJE3Ul2pTnATihhBYnRhZQHGBiRw=="], + + "strip-bom-string": ["strip-bom-string@1.0.0", "", {}, "sha512-uCC2VHvQRYu+lMh4My/sFNmF2klFymLX1wHJeXnbEJERpV/ZsVuonzerjfrGpIGF7LBVa1O7i9kjiWvJiFck8g=="], + + "tar": ["tar@7.5.9", "", { "dependencies": { "@isaacs/fs-minipass": "^4.0.0", "chownr": "^3.0.0", "minipass": "^7.1.2", "minizlib": "^3.1.0", "yallist": "^5.0.0" } }, "sha512-BTLcK0xsDh2+PUe9F6c2TlRp4zOOBMTkoQHQIWSIzI0R7KG46uEwq4OPk2W7bZcprBMsuaeFsqwYr7pjh6CuHg=="], + + "toidentifier": ["toidentifier@1.0.1", "", {}, "sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA=="], + + "tslib": ["tslib@2.8.1", "", {}, "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w=="], + + "type-fest": ["type-fest@0.13.1", "", {}, "sha512-34R7HTnG0XIJcBSn5XhDd7nNFPRcXYRZrBB2O2jdKqYODldSzBAqzsWoZYYvduky73toYS/ESqxPvkDf/F0XMg=="], + + "type-is": ["type-is@2.0.1", "", { "dependencies": { "content-type": "^1.0.5", "media-typer": "^1.1.0", "mime-types": "^3.0.0" } }, "sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw=="], + + "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="], + + "undici-types": ["undici-types@7.18.2", "", {}, "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w=="], + + "unpipe": ["unpipe@1.0.0", "", {}, "sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ=="], + + "vary": ["vary@1.1.2", "", {}, "sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg=="], + + "which": 
["which@2.0.2", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "node-which": "./bin/node-which" } }, "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA=="], + + "wrappy": ["wrappy@1.0.2", "", {}, "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ=="], + + "yallist": ["yallist@5.0.0", "", {}, "sha512-YgvUTfwqyc7UXVMrB+SImsVYSmTS8X/tSrtdNZMImM+n7+QTriRXyXim0mBrTXNeqzVF0KWGgHPeiyViFFrNDw=="], + + "zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="], + + "zod-to-json-schema": ["zod-to-json-schema@3.25.1", "", { "peerDependencies": { "zod": "^3.25 || ^4" } }, "sha512-pM/SU9d3YAggzi6MtR4h7ruuQlqKtad8e9S0fmxcMi+ueAK5Korys/aWcV9LIIHTVbj01NdzxcnXSN+O74ZIVA=="], + + "onnxruntime-web/onnxruntime-common": ["onnxruntime-common@1.22.0-dev.20250409-89f8206ba4", "", {}, "sha512-vDJMkfCfb0b1A836rgHj+ORuZf4B4+cc2bASQtpeoJLueuFc5DuYwjIZUBrSvx/fO5IrLjLz+oTrB3pcGlhovQ=="], + + "roarr/sprintf-js": ["sprintf-js@1.1.3", "", {}, "sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA=="], + } +} diff --git a/docker/Dockerfile b/docker/Dockerfile deleted file mode 100644 index 75ed626..0000000 --- a/docker/Dockerfile +++ /dev/null @@ -1,48 +0,0 @@ -# Build stage -FROM python:3.11-slim as builder - -WORKDIR /app - -# Install build dependencies -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - && rm -rf /var/lib/apt/lists/* - -# Install Python dependencies -COPY pyproject.toml ./ -RUN pip install --no-cache-dir build && \ - pip wheel --no-cache-dir --wheel-dir /app/wheels -e . 
- -# Runtime stage -FROM python:3.11-slim - -WORKDIR /app - -# Install runtime dependencies -RUN apt-get update && apt-get install -y --no-install-recommends \ - curl \ - && rm -rf /var/lib/apt/lists/* - -# Copy wheels and install -COPY --from=builder /app/wheels /wheels -RUN pip install --no-cache-dir /wheels/* && \ - rm -rf /wheels - -# Copy application code -COPY src/ ./src/ -COPY migrations/ ./migrations/ - -# Create non-root user -RUN useradd --create-home --shell /bin/bash appuser && \ - chown -R appuser:appuser /app -USER appuser - -# Expose port -EXPOSE 8000 - -# Health check -HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ - CMD curl -f http://localhost:8000/health || exit 1 - -# Run the application -CMD ["uvicorn", "kb_engine.api.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev deleted file mode 100644 index 29286b5..0000000 --- a/docker/Dockerfile.dev +++ /dev/null @@ -1,24 +0,0 @@ -# Development Dockerfile with hot reload -FROM python:3.11-slim - -WORKDIR /app - -# Install system dependencies -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - curl \ - git \ - && rm -rf /var/lib/apt/lists/* - -# Install Python dependencies -COPY pyproject.toml ./ -RUN pip install --no-cache-dir -e ".[dev]" - -# Copy application code (will be overridden by volume mount) -COPY . . 
- -# Expose port -EXPOSE 8000 - -# Run with hot reload -CMD ["uvicorn", "kb_engine.api.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml deleted file mode 100644 index 88cf34b..0000000 --- a/docker/docker-compose.yml +++ /dev/null @@ -1,63 +0,0 @@ -version: "3.9" - -services: - # PostgreSQL - Traceability Store - postgres: - image: postgres:16-alpine - container_name: kb-engine-postgres - environment: - POSTGRES_USER: kb_engine - POSTGRES_PASSWORD: changeme - POSTGRES_DB: kb_engine - ports: - - "5432:5432" - volumes: - - postgres_data:/var/lib/postgresql/data - healthcheck: - test: ["CMD-SHELL", "pg_isready -U kb_engine -d kb_engine"] - interval: 10s - timeout: 5s - retries: 5 - - # Qdrant - Vector Store - qdrant: - image: qdrant/qdrant:latest - container_name: kb-engine-qdrant - ports: - - "6333:6333" - - "6334:6334" - volumes: - - qdrant_data:/qdrant/storage - environment: - QDRANT__SERVICE__GRPC_PORT: 6334 - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:6333/health"] - interval: 10s - timeout: 5s - retries: 5 - - # Neo4j - Graph Store - neo4j: - image: neo4j:5-community - container_name: kb-engine-neo4j - environment: - NEO4J_AUTH: neo4j/changeme - NEO4J_PLUGINS: '["apoc"]' - NEO4J_dbms_security_procedures_unrestricted: apoc.* - ports: - - "7474:7474" # HTTP - - "7687:7687" # Bolt - volumes: - - neo4j_data:/data - - neo4j_logs:/logs - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:7474"] - interval: 10s - timeout: 5s - retries: 5 - -volumes: - postgres_data: - qdrant_data: - neo4j_data: - neo4j_logs: diff --git a/docs/architecture/kdd-engine.md b/docs/architecture/kdd-engine.md index f1e279f..1213757 100644 --- a/docs/architecture/kdd-engine.md +++ b/docs/architecture/kdd-engine.md @@ -1,89 +1,75 @@ -# KDD Engine (v2) +# KDD Engine (TypeScript/Bun) -Motor de retrieval para especificaciones KDD. 
Indexa artefactos de dominio (entidades, eventos, reglas, comandos, queries, casos de uso) y ofrece busqueda hibrida (semantica + grafo + lexical) para agentes de IA. +Motor de retrieval para especificaciones KDD. Indexa artefactos de dominio (entidades, eventos, reglas, comandos, queries, casos de uso) y ofrece búsqueda híbrida (semántica + grafo + lexical) para agentes de IA. -> **Paquete**: `src/kdd/` | **CLI**: `kdd` | **Entrada**: `kdd.api.cli:cli` +> **Paquete**: `src/` | **CLI**: `bun run src/cli.ts` | **MCP**: `bun run src/mcp.ts` --- ## Arquitectura -Hexagonal (Ports & Adapters) con CQRS (Commands/Queries separados): +Módulos funcionales con inyección de dependencias via `container.ts`: ``` -src/kdd/ -├── domain/ # Entidades, enums, reglas puras, ports (Protocol) +src/ +├── domain/ # Tipos, enums, reglas puras (sin I/O) ├── application/ # Commands (write) + Queries (read) + Extractors -├── infrastructure/ # Adapters: filesystem, networkx, hnswlib, git, events -├── api/ # Entry points: CLI (Click) + Server (FastAPI) -└── container.py # DI container — ensambla todo +├── infra/ # Adapters: filesystem, graphology, vector store, embeddings +├── cli.ts # Entry point CLI (citty) +├── mcp.ts # Entry point MCP server +└── container.ts # DI — ensambla stores desde .kdd-index/ ``` ### Flujo de dependencias ``` -api/ ──▶ application/ ──▶ domain/ - │ - ▼ - infrastructure/ (implementa domain/ports.py) +cli.ts / mcp.ts ──▶ application/ ──▶ domain/ + │ + ▼ + infra/ (implementa stores en memoria) ``` -El dominio no importa nada de infrastructure. Los adapters implementan los `Protocol` definidos en `domain/ports.py`. +El dominio es puro (sin I/O). Los módulos de infra cargan `.kdd-index/` a stores en memoria. 
-### Ports (interfaces) +### Stores en memoria -| Port | Responsabilidad | Adapter por defecto | -|------|----------------|---------------------| -| `ArtifactStore` | Leer/escribir `.kdd-index/` | `FilesystemArtifactStore` | -| `GraphStore` | Grafo en memoria para queries | `NetworkXGraphStore` | -| `VectorStore` | Indice vectorial para semantic search | `HNSWLibVectorStore` | -| `EmbeddingModel` | Generar embeddings desde texto | `SentenceTransformerModel` | -| `EventBus` | Pub/sub de domain events | `InMemoryEventBus` | -| `AgentClient` | Enrichment L3 via agente IA | (pendiente) | -| `Transport` | Push/pull de artifacts a remoto | (pendiente) | +| Store | Implementación | Responsabilidad | +|-------|---------------|-----------------| +| `GraphStore` | graphology (directed multigraph) | Nodos, edges, BFS, reverse BFS, text search | +| `VectorStore` | Brute-force Float64Array cosine | Búsqueda semántica por similitud | +| `ArtifactWriter` | Bun.write() + JSON | Escritura de `.kdd-index/` artifacts | +| `ArtifactLoader` | Bun.file() + Bun.Glob | Lectura de `.kdd-index/` artifacts | +| Embedding model | `@huggingface/transformers` | Genera embeddings `all-mpnet-base-v2` (768 dims) | --- ## Index Levels (Capacidad Progresiva) -El engine detecta automaticamente el nivel disponible segun las dependencias instaladas: - -``` -L1 ─────────────────────────────────────────────────────── Siempre disponible - Grafo de nodos/edges extraidos de front-matter + wiki-links - NetworkX en memoria, sin embeddings - Busqueda: grafo + lexical - -L2 ─────────────────────────────────────── sentence-transformers + hnswlib - Todo L1 + embeddings vectoriales (384 dims, all-MiniLM-L6-v2) - HNSWLib en memoria - Busqueda: hibrida (semantica + grafo + lexical) - -L3 ──────────────────────────────────────────────── API de agente IA (TBD) - Todo L2 + enrichment con agente - Analisis de impacto semantico ``` - -**Instalacion por nivel:** - -```bash -pip install -e ".[kdd]" # L1 -pip install -e 
".[kdd,kdd-l2]" # L2 +L1 ─────────────────────────────────────────── Siempre disponible + Grafo de nodos/edges extraídos de front-matter + wiki-links + graphology en memoria + Búsqueda: grafo + lexical + +L2 ────────────────────────── @huggingface/transformers + Todo L1 + embeddings vectoriales (768 dims, all-mpnet-base-v2) + Brute-force cosine en memoria + Búsqueda: híbrida (semántica + grafo + lexical) ``` --- ## Artifact Store (`.kdd-index/`) -El indice se persiste como ficheros JSON en disco. No requiere base de datos: +El índice se persiste como ficheros JSON en disco. No requiere base de datos: ``` .kdd-index/ ├── manifest.json # IndexManifest: version, stats, git_commit, level ├── nodes/ │ ├── entity/ -│ │ ├── User.json # GraphNode serializado -│ │ └── Order.json +│ │ ├── KDDDocument.json # GraphNode serializado +│ │ └── GraphEdge.json │ ├── command/ │ │ └── CMD-001.json │ └── ... # Un directorio por KDDKind @@ -91,11 +77,11 @@ El indice se persiste como ficheros JSON en disco. No requiere base de datos: │ └── edges.jsonl # GraphEdge stream (append-only JSONL) └── embeddings/ ├── entity/ - │ └── User.json # Lista de Embedding objects + │ └── KDDDocument.json # Lista de Embedding objects └── ... ``` -Al arrancar un query, el `IndexLoader` lee los artifacts de disco y los carga en los stores en memoria (NetworkX + HNSWLib). +Al arrancar un query, `createContainer()` lee los artifacts de disco y los carga en los stores en memoria (graphology + VectorStore). --- @@ -103,83 +89,55 @@ Al arrancar un query, el `IndexLoader` lee los artifacts de disco y los carga en ### CMD-001 — IndexDocument -Procesa un unico fichero de spec: - -1. Lee fichero, extrae front-matter -2. Routea documento via `BR-DOCUMENT-001` (kind + validacion de ubicacion) -3. Extrae `GraphNode` + `GraphEdge[]` con extractor especifico del kind -4. Valida dependencias de capa (`BR-LAYER-001`) -5. (L2+) Chunking semantico + embeddings (`BR-EMBEDDING-001`) -6. Escribe artifacts en `ArtifactStore` -7. 
Emite domain events (`DocumentDetected` → `DocumentParsed` → `DocumentIndexed`) - -### CMD-002 — IndexIncremental - -Usa `git diff` para indexar solo cambios: - -- **Sin manifest previo** → full reindex de todos los `**/*.md` -- **Con manifest** → diff contra `git_commit` del manifest: - - Ficheros nuevos → index via CMD-001 - - Ficheros modificados → delete artifacts + re-index - - Ficheros eliminados → cascade delete artifacts - -### CMD-004 — MergeIndex - -Combina multiples `.kdd-index/` de diferentes desarrolladores: +Procesa un único fichero de spec: -- Estrategia de conflictos: `last_write_wins` (por defecto) o `fail_on_conflict` -- Resolucion: el nodo con `indexed_at` mas reciente gana (`BR-MERGE-001`) -- Delete-wins: si un nodo esta ausente en cualquier indice, se elimina - -### CMD-005 — SyncIndex (pendiente) - -Push/pull de `.kdd-index/` a remoto via `Transport` port. +1. Lee fichero, extrae front-matter (`gray-matter`) +2. Routea documento via `routeDocument()` (kind + validación de ubicación) +3. Parsea secciones Markdown (`parseMarkdownSections()`) +4. Extrae `GraphNode` + `GraphEdge[]` con extractor específico del kind +5. Valida dependencias de capa (`isLayerViolation()`) +6. (L2) Chunking por párrafos + embeddings (`chunkDocument()`) +7. Escribe artifacts en `ArtifactWriter` --- ## CQRS: Queries -### QRY-003 — RetrieveHybrid (query principal) +### QRY-003 — HybridSearch (query principal) -Busqueda hibrida con fusion de scores. Es el query por defecto para agentes. +Búsqueda híbrida con fusión de scores. Es el query por defecto para agentes. **Fases:** -1. **Semantic** (L2+): encode query → busqueda vectorial en HNSWLib +1. **Semantic** (L2): encode query → brute-force cosine similarity 2. **Lexical**: text search sobre campos indexados en GraphStore 3. **Graph expansion**: BFS desde nodos encontrados, profundidad configurable -4. 
**Fusion scoring**: ponderacion `semantic(0.6) + graph(0.3) + lexical(0.1)` + bonus multi-source - -**Degradacion elegante:** sin embeddings (L1) solo usa grafo + lexical con warning. +4. **Fusion scoring**: ponderación `semantic(0.6) + graph(0.3) + lexical(0.1)` + bonus multi-source -### QRY-001 — RetrieveGraph +**Degradación elegante:** sin embeddings (L1) solo usa grafo + lexical con warning. -Traversal puro del grafo desde un nodo raiz, con profundidad y filtro de edge types. +### QRY-001 — GraphQuery -### QRY-002 — RetrieveSemantic +Traversal puro del grafo desde un nodo raíz, con profundidad y filtro de kinds. -Busqueda puramente vectorial (solo L2+). +### QRY-002 — SemanticQuery -### QRY-004 — RetrieveImpact +Búsqueda puramente vectorial (solo L2). -Analisis de impacto: dado un nodo, encuentra todos los nodos afectados directa y transitivamente. +### QRY-004 — ImpactQuery -- Sigue edges **incoming** (quien depende de este nodo) -- Identifica BDD scenarios a re-ejecutar (edges `VALIDATES`) -- Retorna cadenas de dependencia completas +Análisis de impacto: dado un nodo, encuentra todos los nodos afectados (reverse BFS por edges incoming). -### QRY-005 — RetrieveCoverage +### QRY-005 — CoverageQuery -Validacion de gobernanza: verifica que artefactos relacionados requeridos existan. +Validación de gobernanza: verifica que artefactos relacionados requeridos existan. +Ejemplo: una Entity debería tener Events, BusinessRules y UseCases asociados. -Ejemplo: una Entity deberia tener Events, BusinessRules y UseCases asociados. 
- -### QRY-006 — RetrieveViolations +### QRY-006 — ViolationsQuery Detecta violaciones de dependencia entre capas (`BR-LAYER-001`): - - Capa inferior no debe referenciar capa superior -- `00-requirements` esta exenta -- Retorna: lista de violaciones, tasa de violacion, total de edges analizados +- `00-requirements` está exenta +- Retorna: lista de violaciones, tasa de violación, total de edges analizados --- @@ -187,156 +145,117 @@ Detecta violaciones de dependencia entre capas (`BR-LAYER-001`): ### KDDDocument -Representacion parseada de un fichero de spec. Contiene: +Representación parseada de un fichero de spec. Contiene: - `id`, `kind`, `layer`, `source_path`, `source_hash` -- `front_matter` (dict), `sections` (list[Section]), `wiki_links` +- `front_matter` (Record), `sections` (Section[]), `wiki_links` ### GraphNode Nodo del grafo, producido al indexar un KDDDocument: -- ID: `"{Kind}:{DocumentId}"` (ej. `"Entity:Pedido"`, `"Command:CMD-001"`) -- `indexed_fields`: campos extraidos por el extractor especifico del kind +- ID: `"{Prefix}:{DocumentId}"` (ej. `"Entity:KDDDocument"`, `"CMD:CMD-001"`) +- `indexed_fields`: campos extraídos por el extractor específico del kind ### GraphEdge -Relacion tipada y dirigida entre nodos: -- **Structural** (SCREAMING_SNAKE): `WIKI_LINK`, `ENTITY_RULE`, `UC_EXECUTES_CMD`, `EMITS`, etc. -- **Business** (snake_case): definidos libremente por autores de specs +Relación tipada y dirigida entre nodos: +- 17 edge types: `WIKI_LINK`, `ENTITY_RULE`, `UC_EXECUTES_CMD`, `UC_APPLIES_RULE`, `EMITS`, etc. ### Embedding -Vector semantico generado desde un chunk de texto: -- ID: `"{document_id}:{section_path}:{chunk_index}"` -- Modelo por defecto: `all-MiniLM-L6-v2` (384 dimensiones) +Vector semántico generado desde un chunk de texto: +- ID: `"{document_id}:{section_heading}:{chunk_index}"` +- Modelo: `all-mpnet-base-v2` (768 dimensiones) ### IndexManifest -Metadatos del indice en `manifest.json`: version, nivel, stats, git commit, dominios. 
+Metadatos del índice en `manifest.json`: version, nivel, stats, git commit, dominios. --- -## 15 KDDKind Types +## 16 KDDKind Types Cada kind tiene un extractor dedicado en `application/extractors/kinds/`: | Kind | Layer | Ejemplo de ID | |------|-------|---------------| -| `entity` | 01-domain | `Entity:Pedido` | -| `event` | 01-domain | `Event:EVT-Pedido-Created` | -| `business-rule` | 01-domain | `BusinessRule:BR-PEDIDO-001` | -| `business-policy` | 02-behavior | `BusinessPolicy:BP-CREDITO-001` | -| `cross-policy` | 02-behavior | `CrossPolicy:XP-CREDITOS-001` | -| `command` | 02-behavior | `Command:CMD-001` | -| `query` | 02-behavior | `Query:QRY-003` | -| `process` | 02-behavior | `Process:PROC-001` | -| `use-case` | 02-behavior | `UseCase:UC-001` | +| `entity` | 01-domain | `Entity:KDDDocument` | +| `event` | 01-domain | `Event:EVT-KDDDocument-Indexed` | +| `business-rule` | 01-domain | `BR:BR-INDEX-001` | +| `business-policy` | 02-behavior | `BP:BP-CREDITO-001` | +| `cross-policy` | 02-behavior | `XP:XP-CREDITOS-001` | +| `command` | 02-behavior | `CMD:CMD-001` | +| `query` | 02-behavior | `QRY:QRY-003` | +| `process` | 02-behavior | `PROC:PROC-001` | +| `use-case` | 02-behavior | `UC:UC-001` | | `ui-view` | 03-experience | `UIView:UI-Dashboard` | | `ui-component` | 03-experience | `UIComponent:UI-Button` | -| `requirement` | 04-verification | `Requirement:REQ-001` | -| `objective` | 00-requirements | `Objective:OBJ-001` | +| `requirement` | 04-verification | `REQ:REQ-001` | +| `objective` | 00-requirements | `OBJ:OBJ-001` | | `prd` | 00-requirements | `PRD:PRD-KBEngine` | | `adr` | 00-requirements | `ADR:ADR-0001` | +| `glossary` | 01-domain | `Glossary:GlossaryName` | --- -## CLI (`kdd`) +## CLI ```bash -# Indexar specs (incremental por defecto) -kdd index ./specs/ -kdd index ./specs/ --full # forzar reindex completo -kdd index ./specs/ --domain core # multi-domain - -# Buscar (hibrido: semantica + grafo + lexical) -kdd search "registro de usuario" -kdd 
search "pedido" --kind entity --kind command -kdd search "autenticacion" --limit 5 --min-score 0.7 -kdd search "..." --no-graph # solo semantica + lexical -kdd search "..." --json-output # salida JSON +# Indexar specs (full reindex) +bun run src/cli.ts index specs/ +bun run src/cli.ts index specs/ --level L1 # solo grafo +bun run src/cli.ts index specs/ --domain core # multi-domain + +# Buscar (híbrido: semántica + grafo + lexical) +bun run src/cli.ts search --index-path .kdd-index "registro de usuario" +bun run src/cli.ts search --index-path .kdd-index "pedido" --kind entity,command +bun run src/cli.ts search --index-path .kdd-index "auth" --min-score 0.5 -n 5 + +# Búsqueda semántica pura +bun run src/cli.ts semantic --index-path .kdd-index "retrieval query" # Explorar grafo -kdd graph Entity:Pedido # traversal desde nodo -kdd graph Entity:Pedido -d 3 # profundidad 3 +bun run src/cli.ts graph --index-path .kdd-index Entity:KDDDocument +bun run src/cli.ts graph --index-path .kdd-index Entity:KDDDocument --depth 3 -# Analisis de impacto -kdd impact Entity:Pedido # que se rompe si cambio Pedido -kdd impact Entity:Pedido -d 5 # profundidad mayor +# Análisis de impacto +bun run src/cli.ts impact --index-path .kdd-index Entity:KDDDocument # Cobertura de gobernanza -kdd coverage Entity:Pedido # tiene events, rules, UCs? 
+bun run src/cli.ts coverage --index-path .kdd-index Entity:KDDDocument # Violaciones de capa -kdd violations # edges que violan BR-LAYER-001 - -# Merge de indices -kdd merge ./dev1/.kdd-index ./dev2/.kdd-index -o ./merged/.kdd-index - -# Estado del indice -kdd status +bun run src/cli.ts violations --index-path .kdd-index ``` --- -## Domain Events +## MCP Server -El pipeline emite eventos inmutables (frozen dataclasses) durante el ciclo de vida: +7 tools expuestos via `@modelcontextprotocol/sdk`: -| Evento | Cuando | -|--------|--------| -| `DocumentDetected` | Fichero con front-matter valido encontrado | -| `DocumentParsed` | Documento parseado por su extractor | -| `DocumentIndexed` | Pipeline de indexacion completado | -| `DocumentStale` | Documento modificado en disco vs indice | -| `DocumentDeleted` | Documento eliminado del filesystem | -| `MergeRequested` | Merge de indices solicitado | -| `MergeCompleted` | Merge completado exitosamente | -| `QueryReceived` | Query de retrieval recibido | -| `QueryCompleted` | Query resuelto exitosamente | -| `QueryFailed` | Query fallido (validacion o resolucion) | +| Tool | Implementación | +|------|---------------| +| `kdd_search` | `hybridSearch()` — búsqueda con filtros | +| `kdd_find_spec` | `hybridSearch()` con limit=5 (convenience) | +| `kdd_related` | `graphQuery()` — BFS desde nodo | +| `kdd_impact` | `impactQuery()` — reverse BFS | +| `kdd_read_section` | `Bun.file()` — lee .md + anchor | +| `kdd_list` | graph store iteration — filtra por kind/domain | +| `kdd_stats` | manifest stats + counts | --- ## Business Rules (funciones puras) -Implementadas en `domain/rules.py`, sin I/O ni side-effects: +Implementadas en `domain/rules.ts`, sin I/O ni side-effects: -| Regla | Funcion | Descripcion | +| Regla | Función | Descripción | |-------|---------|-------------| -| BR-DOCUMENT-001 | `route_document()` | Determina KDDKind desde front-matter | -| BR-EMBEDDING-001 | `embeddable_sections()` | Secciones embeddables por 
kind | -| BR-INDEX-001 | `detect_index_level()` | Nivel de indice segun recursos | -| BR-LAYER-001 | `is_layer_violation()` | Valida dependencias entre capas | -| BR-MERGE-001 | `resolve_node_conflict()` | Resolucion de conflictos last-write-wins | +| BR-DOCUMENT-001 | `routeDocument()` | Determina KDDKind desde front-matter | +| BR-EMBEDDING-001 | `embeddableSections()` | Secciones embeddables por kind | +| BR-INDEX-001 | `detectIndexLevel()` | Nivel de índice según recursos | +| BR-LAYER-001 | `isLayerViolation()` | Valida dependencias entre capas | --- -## Diferencias con v1 (`kb-engine`) - -| Aspecto | v1 (`kb` / `src/kb_engine/`) | v2 (`kdd` / `src/kdd/`) | -|---------|------------------------------|-------------------------| -| **Arquitectura** | Services + Pipelines | Hexagonal CQRS + Ports/Adapters | -| **Storage** | SQLite + ChromaDB + FalkorDB | Filesystem artifacts (`.kdd-index/`) | -| **Grafo** | FalkorDBLite / Neo4j | NetworkX (in-memory, cargado de artifacts) | -| **Vectores** | ChromaDB / Qdrant | HNSWLib (in-memory) | -| **Embeddings** | `paraphrase-multilingual-MiniLM-L12-v2` | `all-MiniLM-L6-v2` | -| **DB requerida** | Si (SQLite minimo) | No (solo ficheros JSON) | -| **Capacidad** | Fija (todo o nada) | Progresiva (L1 → L2 → L3) | -| **Artifact types** | 6 (entity, use-case, rule, process, event, glossary) | 15 KDDKinds | -| **Extractors** | Genericos | 1 dedicado por kind | -| **CLI** | `kb index/search/sync/status/graph` | `kdd index/search/graph/impact/coverage/violations/merge/status` | -| **Queries** | Vector search + graph opcional | 6 queries especializados + hybrid fusion | -| **Specs** | No | 52 specs KDD trazables | -| **Domain events** | No | Si (10 event types) | -| **DI** | Manual / settings | Container con auto-deteccion | - -### Coexistencia - -Ambos paquetes coexisten en `pyproject.toml`: - -```toml -[project.scripts] -kb = "kb_engine.cli:cli" # v1 -kdd = "kdd.api.cli:cli" # v2 -``` - -v1 sigue siendo funcional. 
v2 es la direccion arquitectonica futura. +*Última actualización: Febrero 2026 (migración a TypeScript/Bun)* diff --git a/docs/architecture/smart-ingestion-pipeline.md b/docs/architecture/smart-ingestion-pipeline.md deleted file mode 100644 index 137a107..0000000 --- a/docs/architecture/smart-ingestion-pipeline.md +++ /dev/null @@ -1,259 +0,0 @@ -# Pipeline de Indexación - Arquitectura - -## Resumen - -Este documento describe la arquitectura del pipeline de indexación implementado en `IndexationPipeline`. El pipeline es responsable de procesar documentos y almacenarlos en los tres repositorios del sistema (trazabilidad, vectorial y grafos). - ---- - -## 1. Visión General - -``` -┌─────────────────────────────────────────────────────────────────────────┐ -│ INDEXATION PIPELINE │ -│ │ -│ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌─────────────────┐ │ -│ │ Document │──▶│ Frontmatter│──▶│ Chunking │──▶│ Section Anchor │ │ -│ │ Input │ │ Extraction │ │ (per-type)│ │ Computation │ │ -│ └───────────┘ └───────────┘ └───────────┘ └─────────────────┘ │ -│ │ │ -│ ┌───────────────────────────────────────┘ │ -│ ▼ │ -│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────────┐ │ -│ │ Embedding │──▶│ Entity │──▶│ Storage Layer │ │ -│ │ Generation │ │ Extraction │ │ │ │ -│ │ │ │ (optional) │ │ ┌────────────────┐ │ │ -│ └─────────────────┘ └─────────────────┘ │ │ Traceability │ │ │ -│ │ │ (SQLite/PG) │ │ │ -│ │ ├────────────────┤ │ │ -│ │ │ Vector Store │ │ │ -│ │ │ (Chroma/Qdrant)│ │ │ -│ │ ├────────────────┤ │ │ -│ │ │ Graph Store │ │ │ -│ │ │ (SQLite/Neo4j) │ │ │ -│ │ └────────────────┘ │ │ -│ └─────────────────────┘ │ -└─────────────────────────────────────────────────────────────────────────┘ -``` - ---- - -## 2. Fases del Pipeline - -### 2.1 Preparación del Documento - -**Clase**: `IndexationPipeline._build_document()` - -1. Lee el contenido del archivo -2. Determina el tipo de archivo por extensión → `FileTypeConfig` -3. 
Si es markdown, extrae frontmatter (YAML/TOML) -4. Construye el modelo `Document` con metadata, info git y parser asignado - -**Formatos soportados** (configurables por `FileTypeConfig`): - -| Extensión | Parser | MIME Type | -|-----------|--------|-----------| -| `.md` | markdown | text/markdown | -| `.json` | json | application/json | -| `.yaml`, `.yml` | yaml | text/yaml | -| `.rst` | rst | text/x-rst | -| Otros | plaintext | text/plain | - -### 2.2 Chunking Semántico (ADR-0002) - -**Clase**: `ChunkerFactory` → estrategias especializadas - -El chunking usa estrategias específicas por tipo de documento KDD, seleccionadas según el campo `kind` del frontmatter: - -| Estrategia | Tipo de Documento | ChunkType | -|------------|-------------------|-----------| -| `EntityChunkingStrategy` | entity | `ENTITY` | -| `UseCaseChunkingStrategy` | use_case | `USE_CASE` | -| `RuleChunkingStrategy` | rule | `RULE` | -| `ProcessChunkingStrategy` | process | `PROCESS` | -| `DefaultChunkingStrategy` | (fallback) | `DEFAULT` | - -**Configuración** (`ChunkingConfig`): - -```python -min_chunk_size: 100 # tokens mínimos -target_chunk_size: 512 # tokens objetivo -max_chunk_size: 1024 # tokens máximo -overlap_size: 50 # overlap entre chunks -preserve_sentences: True -respect_headings: True -include_heading_context: True -``` - -Cada chunk incluye: -- `heading_path`: Jerarquía de headings (ej: `["Documento", "Atributos"]`) -- `section_anchor`: Anchor URL calculado del heading path -- `chunk_type`: Tipo semántico del chunk - -### 2.3 Generación de Embeddings - -**Clase**: `EmbeddingProviderFactory` → `LocalEmbeddingProvider` | `OpenAIEmbeddingProvider` - -| Proveedor | Modelo por defecto | Dimensiones | -|-----------|-------------------|-------------| -| `local` | all-MiniLM-L6-v2 (sentence-transformers) | 384 | -| `openai` | text-embedding-3-small | 1536 | - -Los embeddings se generan por chunk y se almacenan en el Vector Store. 
- -### 2.4 Extracción de Entidades (ADR-0003) — Opcional - -**Clase**: `ExtractionPipelineFactory` → extractores - -Solo se ejecuta si `graph_store != "none"`. Pipeline multi-estrategia: - -1. **FrontmatterExtractor** — Extrae nodos del YAML frontmatter (confidence=1.0) -2. **PatternExtractor** — Detecta patrones en contenido (confidence=0.8-0.9): - - Wiki links: `[[Entity]]` - - IDs KDD: `UC-*`, `RUL-*`, `PRC-*`, `EVT-*`, etc. - - Patrones de actores, sistemas, entidades en texto -3. **LLMExtractor** — Extracción semántica con OpenAI (confidence=0.7) — desactivado por defecto - -**Deduplicación**: Por ID de nodo y por tupla (source, target, type) para edges. - -### 2.5 Almacenamiento - -El pipeline almacena en los tres repositorios de forma secuencial: - -``` -1. Document → Traceability Store -2. Chunks → Traceability Store -3. Embeddings → Vector Store -4. Nodes/Edges → Graph Store (si disponible) -5. Status=INDEXED → Traceability Store (actualización) -``` - -Si cualquier paso falla, el documento se marca como `FAILED`. - ---- - -## 3. Operaciones del Pipeline - -### 3.1 Indexación de Documento Individual - -```python -pipeline.index_document(document) → Document -``` - -Ejecuta el pipeline completo para un documento. - -### 3.2 Indexación de Repositorio - -```python -pipeline.index_repository(repo_config) → list[Document] -``` - -1. Escanea archivos del repo git que coincidan con patrones -2. Para cada archivo: construye `Document` → `index_document()` -3. Registra commit SHA y remote URL - -### 3.3 Sincronización Incremental - -```python -pipeline.sync_repository(repo_config, since_commit) → dict -``` - -1. Obtiene archivos cambiados y eliminados desde el commit dado -2. Elimina documentos de archivos borrados (cascade) -3. Reindexar archivos modificados (compara `content_hash` para saltar sin cambios) -4. 
Retorna: `{commit, indexed, deleted, skipped}` - -### 3.4 Reindexación - -```python -pipeline.reindex_document(document) → Document -``` - -Elimina datos derivados (vector, graph, chunks) y re-ejecuta `index_document()`. - -### 3.5 Eliminación - -```python -pipeline.delete_document(document) → bool -``` - -Eliminación en cascada: vector → graph → chunks → document. - ---- - -## 4. Pipeline de Retrieval - -**Clase**: `RetrievalPipeline` - -Retorna `DocumentReference` con URLs en lugar de contenido raw: - -``` -Query (texto) - ↓ [Embed query] - ↓ [Vector search → chunk_ids + scores] - ↓ [Fetch chunks + documents desde traceability] - ↓ [Resolve URL: file:// o https://#anchor] - ↓ [Return DocumentReferences] -RetrievalResponse -``` - -**Modos soportados**: -- `VECTOR`: Búsqueda por similitud de embeddings -- `GRAPH`: Traversal del grafo (placeholder) -- `HYBRID`: Combina ambos con Reciprocal Rank Fusion (RRF, k=60) - ---- - -## 5. Diagrama de Secuencia - -``` -┌────────┐ ┌─────────────┐ ┌──────────┐ ┌───────────┐ ┌───────────┐ ┌──────────┐ -│ Client │ │ Pipeline │ │ Chunker │ │ Embedding │ │ Extractor │ │ Stores │ -└───┬────┘ └──────┬──────┘ └────┬─────┘ └─────┬─────┘ └─────┬─────┘ └────┬─────┘ - │ │ │ │ │ │ - │ index_doc() │ │ │ │ │ - │────────────▶│ │ │ │ │ - │ │ save_doc │ │ │ │ - │ │─────────────│─────────────│─────────────│────────────▶│ - │ │ │ │ │ │ - │ │ chunk() │ │ │ │ - │ │────────────▶│ │ │ │ - │ │ chunks │ │ │ │ - │ │◀────────────│ │ │ │ - │ │ │ │ │ │ - │ │ save_chunks │ │ │ │ - │ │─────────────│─────────────│─────────────│────────────▶│ - │ │ │ │ │ │ - │ │ embed_chunks│ │ │ │ - │ │─────────────│────────────▶│ │ │ - │ │ embeddings │ │ │ │ - │ │◀────────────│─────────────│ │ │ - │ │ │ │ │ │ - │ │ upsert_embeddings │ │ │ - │ │─────────────│─────────────│─────────────│────────────▶│ - │ │ │ │ │ │ - │ │ extract_doc │ │ │ │ - │ │─────────────│─────────────│────────────▶│ │ - │ │ nodes │ │ │ │ - │ │◀────────────│─────────────│─────────────│ │ - │ │ │ │ │ │ - │ │ 
create_nodes│ │ │ │ - │ │─────────────│─────────────│─────────────│────────────▶│ - │ │ │ │ │ │ - │ document │ │ │ │ │ - │◀────────────│ │ │ │ │ -``` - ---- - -## 6. Evolución Futura - -Los siguientes Design Challenges pueden evolucionar este pipeline: - -- **DC-002**: Estrategias de retrieval avanzadas (graph traversal, hybrid) -- **DC-009**: Actualización incremental a nivel de chunk (diff hash) -- **DC-011**: Ciclo de vida del contenido (dev → staging → pro → deprecated) - ---- - -*Última actualización: Febrero 2026 (alineado con v0.2.0)* diff --git a/docs/design/adr/ADR-0001-repository-pattern-for-storage-abstraction.md b/docs/design/adr/ADR-0001-repository-pattern-for-storage-abstraction.md index ac7f790..d3627bb 100644 --- a/docs/design/adr/ADR-0001-repository-pattern-for-storage-abstraction.md +++ b/docs/design/adr/ADR-0001-repository-pattern-for-storage-abstraction.md @@ -2,7 +2,8 @@ --- id: ADR-0001 -status: accepted +status: superseded +superseded_note: "Migración a TypeScript/Bun (Feb 2026). El nuevo stack usa stores in-memory (graphology + brute-force cosine) cargados de ficheros JSON (.kdd-index/). No hay Repository Pattern — los stores se inyectan directamente via container.ts." date: 2025-01-16 deciders: [leopoldo, claude] consulted: [] diff --git a/docs/design/adr/ADR-0002-kdd-semantic-chunking-strategy.md b/docs/design/adr/ADR-0002-kdd-semantic-chunking-strategy.md index 843b597..b8714a8 100644 --- a/docs/design/adr/ADR-0002-kdd-semantic-chunking-strategy.md +++ b/docs/design/adr/ADR-0002-kdd-semantic-chunking-strategy.md @@ -2,7 +2,8 @@ --- id: ADR-0002 -status: accepted +status: superseded +superseded_note: "Migración a TypeScript/Bun (Feb 2026). El chunking ahora es paragraph-level genérico (src/application/chunking.ts) con selección de secciones embeddables por kind via embeddableSections(). No hay estrategias por tipo — un solo chunkDocument() para todos los kinds." 
date: 2025-01-16 deciders: [leopoldo, claude] consulted: [] diff --git a/docs/design/adr/ADR-0003-entity-extraction-pipeline.md b/docs/design/adr/ADR-0003-entity-extraction-pipeline.md index 57375e2..0b08d01 100644 --- a/docs/design/adr/ADR-0003-entity-extraction-pipeline.md +++ b/docs/design/adr/ADR-0003-entity-extraction-pipeline.md @@ -2,7 +2,8 @@ --- id: ADR-0003 -status: accepted +status: superseded +superseded_note: "Migración a TypeScript/Bun (Feb 2026). Reemplazado por 16 extractores dedicados por KDDKind (src/application/extractors/kinds/). Cada extractor entiende la estructura de su tipo. No hay pipeline multi-estrategia — un extractor por kind, registrado en ExtractorRegistry." date: 2025-01-16 deciders: [leopoldo, claude] consulted: [] diff --git a/docs/design/adr/ADR-0004-mcp-server-agent-integration.md b/docs/design/adr/ADR-0004-mcp-server-agent-integration.md index 571438f..69b35cb 100644 --- a/docs/design/adr/ADR-0004-mcp-server-agent-integration.md +++ b/docs/design/adr/ADR-0004-mcp-server-agent-integration.md @@ -2,7 +2,8 @@ --- id: ADR-0004 -status: accepted +status: superseded +superseded_note: "Migración a TypeScript/Bun (Feb 2026). MCP server reimplementado en src/mcp.ts usando @modelcontextprotocol/sdk (TS). 7 tools (kdd_search, kdd_find_spec, kdd_related, kdd_impact, kdd_read_section, kdd_list, kdd_stats). Sin CLI fallback ni FastMCP." 
date: 2025-02-07 deciders: [leopoldo, claude] consulted: [] diff --git a/docs/design/adr/README.md b/docs/design/adr/README.md index fb4f510..0eef053 100644 --- a/docs/design/adr/README.md +++ b/docs/design/adr/README.md @@ -6,10 +6,13 @@ | ID | Título | Estado | Fecha | DC Relacionado | |----|--------|--------|-------|----------------| -| [ADR-0001](./ADR-0001-repository-pattern-for-storage-abstraction.md) | Repository Pattern para Abstracción de Almacenamiento | `accepted` | 2025-01-16 | DC-010 | -| [ADR-0002](./ADR-0002-kdd-semantic-chunking-strategy.md) | Estrategia de Chunking Semántico por Tipo KDD | `accepted` | 2025-01-16 | DC-004 | -| [ADR-0003](./ADR-0003-entity-extraction-pipeline.md) | Pipeline de Extracción de Entidades Multi-estrategia | `accepted` | 2025-01-16 | DC-003 | -| [ADR-0004](./ADR-0004-mcp-server-agent-integration.md) | MCP Server para Integración con Agentes de IA | `accepted` | 2025-02-07 | DC-012 | +| [ADR-0001](./ADR-0001-repository-pattern-for-storage-abstraction.md) | Repository Pattern para Abstracción de Almacenamiento | `superseded` | 2025-01-16 | DC-010 | +| [ADR-0002](./ADR-0002-kdd-semantic-chunking-strategy.md) | Estrategia de Chunking Semántico por Tipo KDD | `superseded` | 2025-01-16 | DC-004 | +| [ADR-0003](./ADR-0003-entity-extraction-pipeline.md) | Pipeline de Extracción de Entidades Multi-estrategia | `superseded` | 2025-01-16 | DC-003 | +| [ADR-0004](./ADR-0004-mcp-server-agent-integration.md) | MCP Server para Integración con Agentes de IA | `superseded` | 2025-02-07 | DC-012 | + +> **Nota**: Los ADRs 0001-0004 fueron superseded por la migración a TypeScript/Bun (Feb 2026). +> La arquitectura actual está documentada en [docs/architecture/kdd-engine.md](../../architecture/kdd-engine.md). 
## Estados diff --git a/docs/design/challenges/README.md b/docs/design/challenges/README.md index d800a5a..810d14a 100644 --- a/docs/design/challenges/README.md +++ b/docs/design/challenges/README.md @@ -17,6 +17,10 @@ | [DC-009](./DC-009-incremental-updates.md) | Actualización Incremental | `open` | Media | - | | [DC-010](./DC-010-engine-abstraction.md) | Abstracción de Motores | `decided` | Alta | [ADR-0001](../adr/ADR-0001-repository-pattern-for-storage-abstraction.md) | | [DC-011](./DC-011-content-lifecycle.md) | Ciclo de Vida del Contenido | `open` | Alta | - | +| [DC-012](./DC-012-agent-tool-integration.md) | Integración con Agentes (MCP) | `decided` | Alta | [ADR-0004](../adr/ADR-0004-mcp-server-agent-integration.md) | + +> **Nota**: En Feb 2026 se migró de Python a TypeScript/Bun. Los DCs `decided` tienen ADRs ahora marcados como `superseded`. +> La arquitectura actual: [docs/architecture/kdd-engine.md](../../architecture/kdd-engine.md). ## Estados diff --git a/docs/design/requirements.md b/docs/design/requirements.md deleted file mode 100644 index f24cd3d..0000000 --- a/docs/design/requirements.md +++ /dev/null @@ -1,349 +0,0 @@ -# Requisitos - Sistema de Retrieval de Conocimiento - -## 1. Contexto y Alcance - -| Aspecto | Descripción | -|---------|-------------| -| **Dominio** | Desarrollo de código | -| **Fuentes** | Documentación (Markdown, JSON, YAML, RST) + código fuente | -| **Consumidor** | Sistemas de desarrollo vía MCP (fuera de alcance inicial) | -| **Foco actual** | Backend | - -## 2. 
Arquitectura General - -### 2.1 Motores de Almacenamiento - -| Motor | Propósito | Perfil Local | Perfil Server | -|-------|-----------|--------------|---------------| -| **Trazabilidad** | Lineage, metadatos, relaciones documento→chunk→embedding→nodo | SQLite | PostgreSQL | -| **Vectorial** | Búsqueda semántica (embeddings) | ChromaDB | Qdrant | -| **Grafos** | Modelo de conocimiento (entidades KDD) — opcional | SQLite | Neo4j | - -> **Decisión**: Se mantienen las 3 BBDD separadas (ver ADR-0001). El almacenamiento de grafos puede desactivarse con `graph_store="none"`. - -### 2.2 Separación de Procesos - -El sistema separa dos procesos principales que comparten las bases de datos: - -- **Indexación**: Ingesta, procesamiento y almacenamiento de conocimiento -- **Retrieval**: Búsqueda y recuperación de referencias a documentos - -### 2.3 Stack Tecnológico - -| Componente | Tecnología | -|------------|------------| -| **Backend** | Python 3.11+ | -| **Framework API** | FastAPI | -| **Abstracciones** | Repository Pattern con Factory (ADR-0001) | -| **Cloud** | Agnóstico | - -### 2.4 Perfiles de Configuración - -| Perfil | Trazabilidad | Vectorial | Grafos | Embeddings | -|--------|-------------|-----------|--------|------------| -| **local** (desarrollo) | SQLite | ChromaDB | SQLite | sentence-transformers (all-MiniLM-L6-v2) | -| **server** (producción) | PostgreSQL | Qdrant | Neo4j | OpenAI (text-embedding-3-small) | - -El diseño es agnóstico en bases de datos gracias al Repository Pattern, con abstracciones sobre implementaciones concretas. - -## 3. Modelo de Grafos - -### 3.1 Capas (Desarrollo Incremental) - -| Fase | Capa | Contenido | -|------|------|-----------| -| 1 (actual) | Funcional | Entidades KDD: PRD, Entity, Rule, UseCase, Process, Event, etc. 
| -| 2 (futuro) | Física | Tablas, servicios, pantallas | - -### 3.2 Características - -- **Granularidad**: Alta (mucho detalle) -- **Origen de entidades**: Extraídas de documentación KDD -- **Metodología base**: Knowledge-Driven Development (KDD) - ver `docs/design/kdd.md` - -### 3.3 Tipos de Nodos (Implementados) - -Definidos en `kb_engine.core.models.graph.NodeType`: - -| Categoría | Tipo de Nodo | Descripción | -|-----------|--------------|-------------| -| **Dominio** | `ENTITY` | Entidades y value objects del dominio | -| **Dominio** | `RULE` | Reglas de negocio | -| **Comportamiento** | `USE_CASE` | Casos de uso | -| **Comportamiento** | `PROCESS` | Procesos/flujos | -| **Actores** | `ACTOR` | Actores del sistema (usuarios, roles) | -| **Actores** | `SYSTEM` | Sistemas o servicios | -| **General** | `CONCEPT` | Conceptos genéricos | -| **Estructural** | `DOCUMENT` | Referencia a documento fuente | -| **Estructural** | `CHUNK` | Referencia a chunk fuente | - -### 3.4 Tipos de Relaciones - -Definidos en `kb_engine.core.models.graph.EdgeType`: - -| Relación | Categoría | Descripción | -|----------|-----------|-------------| -| `CONTAINS` | Estructural | Contención jerárquica | -| `PART_OF` | Estructural | Pertenencia | -| `REFERENCES` | Estructural | Referencia genérica | -| `IMPLEMENTS` | Dominio | Implementación | -| `DEPENDS_ON` | Dominio | Dependencia | -| `RELATED_TO` | Dominio | Relación genérica | -| `TRIGGERS` | Dominio | Disparo de acción | -| `USES` | Dominio | Uso/consumo | -| `PRODUCES` | Dominio | Producción | -| `PERFORMS` | Actor | Actor ejecuta acción | -| `OWNS` | Actor | Propiedad | -| `SIMILAR_TO` | Semántico | Similitud semántica | -| `CONTRADICTS` | Semántico | Contradicción | -| `EXTENDS` | Semántico | Extensión | - -### 3.5 Propiedades Comunes de Nodos - -Basadas en el front-matter KDD: - -```yaml -id: string # Identificador único (ej: UC-Checkout@v1) -kind: string # Tipo de artefacto -status: enum # draft | proposed | approved | 
deprecated -aliases: string[] # Nombres alternativos -tags: string[] # Etiquetas para clasificación -domain: string # Dominio/proyecto al que pertenece -source_file: string # Archivo origen en el repositorio -``` - -## 4. Modelo de Trazabilidad - -### 4.1 Propósito - -Trazabilidad total entre todas las piezas del sistema para: -- Saber todo lo inferido a partir de un documento (nodos, relaciones, embeddings) -- Actualización precisa a nivel de chunk cuando cambia un documento -- Reindexación selectiva o completa de documentos -- Borrado en cascada cuando se elimina un documento - -### 4.2 Estado de Procesamiento de Documentos - -Los documentos tienen un estado de procesamiento (`DocumentStatus`): - -| Estado | Descripción | -|--------|-------------| -| `PENDING` | Documento registrado, pendiente de procesamiento | -| `PROCESSING` | Pipeline de indexación en curso | -| `INDEXED` | Indexación completada exitosamente | -| `FAILED` | Error durante indexación | -| `ARCHIVED` | Archivado / fuera de uso | - -> **Nota**: El ciclo de vida del contenido (dev → staging → pro → deprecated) está pendiente de implementación (ver DC-011). Actualmente el sistema gestiona solo el estado de procesamiento. - -### 4.3 Entidades de Trazabilidad (Implementadas) - -Basadas en los modelos Pydantic en `kb_engine.core.models`: - -``` -┌──────────────────┐ -│ Document │ -│──────────────────│ -│ id (uuid) │ -│ external_id │ ← repo_name:relative_path -│ title │ -│ content │ -│ source_path │ -│ mime_type │ ← text/markdown, application/json, etc. 
-│ domain │ ← proyecto/dominio -│ tags │ ← etiquetas del frontmatter -│ metadata (dict) │ ← info del frontmatter + _parser -│ status │ ← PENDING | PROCESSING | INDEXED | FAILED | ARCHIVED -│ content_hash │ ← SHA256 para detectar cambios -│ ──── Git ─────── │ -│ repo_name │ ← nombre del repositorio -│ relative_path │ ← ruta relativa en el repo -│ git_commit │ ← SHA del commit indexado -│ git_remote_url │ ← URL del remote -│ ──── Timestamps ─│ -│ created_at │ -│ updated_at │ -└────────┬─────────┘ - │ 1:N - ▼ -┌──────────────────┐ -│ Chunk │ -│──────────────────│ -│ id (uuid) │ -│ document_id (fk) │ -│ content │ -│ chunk_type │ ← ENTITY | USE_CASE | RULE | PROCESS | DEFAULT -│ heading_path │ ← jerarquía de headings [H1, H2, H3] -│ section_anchor │ ← anchor calculado del heading_path -│ start_offset │ -│ end_offset │ -│ metadata (dict) │ -│ content_hash │ -└────────┬─────────┘ - │ 1:1 - ▼ -┌──────────────────┐ -│ Embedding │ -│──────────────────│ -│ id (uuid) │ -│ chunk_id (fk) │ -│ vector │ ← list[float] -│ model │ ← modelo usado (all-MiniLM-L6-v2 o text-embedding-3-small) -│ dimensions │ -│ metadata (dict) │ -└──────────────────┘ - -┌──────────────────┐ ┌──────────────────┐ -│ Node │ │ Edge │ -│──────────────────│ │──────────────────│ -│ id (uuid) │ │ id (uuid) │ -│ external_id │ │ source_id (fk) │ -│ name │ │ target_id (fk) │ -│ node_type │ │ edge_type │ -│ description │ │ name │ -│ source_doc_id │ │ properties │ -│ source_chunk_id │ │ weight │ -│ properties │ │ source_doc_id │ -│ confidence │ │ source_chunk_id │ -│ extraction_method│ │ confidence │ -│ created_at │ │ extraction_method│ -│ updated_at │ │ created_at │ -└──────────────────┘ └──────────────────┘ -``` - -### 4.4 Operaciones de Trazabilidad - -| Operación | Descripción | -|-----------|-------------| -| **Crear documento** | Inserta Document + genera Chunks + Embeddings + Nodos/Edges | -| **Actualizar documento** | Compara hash, actualiza solo chunks modificados, propaga cambios | -| **Eliminar documento** 
| Borra en cascada: Document → Chunks → Embeddings → Nodos → Edges | -| **Consultar lineage** | Dado un documento, obtener todos sus derivados | -| **Consultar origen** | Dado un nodo/embedding, obtener documento y chunk origen | - -## 5. Pipeline de Indexación - -### 5.1 Características Generales - -- Pipelines específicos por tipo de documento KDD -- Configurables por código (Python) -- Chunking preciso (estructura KDD conocida) - -### 5.2 Tipos de Documentos y Pipelines - -Basados en la estructura KDD: - -| Fuente | Tipo de Documento | Pipeline | -|--------|-------------------|----------| -| `/specs/vision/` | PRD | Pipeline PRD | -| `/specs/domain/entities/` | Entity | Pipeline Entity | -| `/specs/domain/events/` | Event | Pipeline Event | -| `/specs/domain/rules/` | Rule | Pipeline Rule | -| `/specs/behavior/use-cases/` | UseCase | Pipeline UseCase | -| `/specs/behavior/processes/` | Process | Pipeline Process | -| `/specs/behavior/stories/` | Story | Pipeline Story | -| `/specs/interfaces/api/` | OpenAPI | Pipeline API | -| `/specs/interfaces/async/` | AsyncAPI | Pipeline AsyncAPI | -| `/specs/examples/` | Scenario, Gherkin | Pipeline SBE | -| `/specs/quality/` | NFR | Pipeline NFR | -| `/specs/architecture/adr/` | ADR | Pipeline ADR | - -### 5.3 Pasos del Pipeline - -``` -┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ -│ Ingesta │───▶│ Parsing │───▶│ Chunking │───▶│ Embedding │ -│ (fuentes) │ │ (front-matter│ │ (específico │ │ (vectores) │ -└─────────────┘ │ + contenido)│ │ por tipo) │ └─────────────┘ - └─────────────┘ └─────────────┘ │ - ▼ - ┌─────────────┐ - │ Vector │ - │ DB │ - └─────────────┘ - │ - ▼ - ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ - │ Extracción │───▶│ Validación │───▶│ Graph │ - │ Entidades │ │ (humana) │ │ DB │ - │ + Relaciones│ │ │ │ │ - └─────────────┘ └─────────────┘ └─────────────┘ -``` - -### 5.4 Extracción de Entidades (ADR-0003) - -Pipeline multi-estrategia implementado en `kb_engine.extraction`: - -- 
**FrontmatterExtractor**: Extrae del YAML frontmatter (confidence=1.0) -- **PatternExtractor**: Detecta patrones en contenido: wiki links `[[Entity]]`, IDs KDD `UC-*`, `RUL-*` (confidence=0.8-0.9) -- **LLMExtractor**: Extracción semántica con OpenAI (confidence=0.7) — **opcional**, desactivado por defecto -- **Deduplicación** automática por ID y (source, target, type) -- El grafo es **opcional**: si `graph_store="none"`, la extracción se omite - -## 6. Pipeline de Retrieval - -### 6.1 Arquitectura - -El `RetrievalPipeline` retorna `DocumentReference` con URLs (file:// o https://#anchor) en lugar de contenido raw. Esto permite a agentes externos leer los documentos fuente directamente. - -### 6.2 Modos de Retrieval (Implementados) - -| Modo | Descripción | Estado | -|------|-------------|--------| -| `VECTOR` | Búsqueda semántica por similitud de embeddings | Implementado | -| `GRAPH` | Búsqueda por traversal del grafo | Placeholder | -| `HYBRID` | Combina vector + graph con Reciprocal Rank Fusion | Implementado (merge) | - -### 6.3 Requisitos de Rendimiento - -- **Latencia**: Muy baja (prioridad) - -## 7. Seguridad (By Design) - -### 7.1 RBAC - -| Aspecto | Detalle | -|---------|---------| -| **Nivel de control** | Documento + Proyecto/Dominio | -| **Origen de roles** | Externo (IdP) | -| **Aplicación** | Sobre grafo Y vector | - -### 7.2 Multi-tenancy - -- Un despliegue por dominio/proyecto -- No se comparte entre clientes (aislamiento total) - -## 8. Interfaces - -### 8.1 UI de Curación - -- Interfaz propia para validación humana -- Gestión de entidades y grafos -- Herramientas para Knowledge Manager (futuro) - -### 8.2 API - -Por definir - -## 9. Volumetría - -| Métrica | Valor | -|---------|-------| -| Documentos por aplicación | ~1500 (escalable a aplicaciones grandes) | - -## 10. 
Requisitos Pendientes de Definir - -- [x] ~~Tipos de nodos y relaciones del grafo~~ (implementados en `graph.py`) -- [x] ~~Tipos específicos de documentos y sus pipelines~~ (basado en KDD) -- [x] ~~Repository Pattern para abstracción de almacenamiento~~ (ADR-0001) -- [x] ~~Estrategia de chunking semántico~~ (ADR-0002) -- [x] ~~Pipeline de extracción multi-estrategia~~ (ADR-0003) -- [ ] Ciclo de vida del contenido (DC-011) -- [ ] Modelo de API del backend (DC-006) -- [ ] Papers de referencia para estrategia de retrieval por defecto (DC-002) -- [ ] Diseño de UI de curación (DC-007) -- [ ] Modelo de integración con IdP (DC-005) - ---- - -*Documento en evolución - Última actualización: Febrero 2026 (alineado con v0.2.0)* diff --git a/migrations/alembic.ini b/migrations/alembic.ini deleted file mode 100644 index de2e1a3..0000000 --- a/migrations/alembic.ini +++ /dev/null @@ -1,79 +0,0 @@ -[alembic] -# path to migration scripts -script_location = %(here)s - -# template used to generate migration file names -file_template = %%(year)d%%(month).2d%%(day).2d_%%(hour).2d%%(minute).2d_%%(rev)s_%%(slug)s - -# sys.path path, will be prepended to sys.path if present -prepend_sys_path = . 
- -# timezone to use when rendering the date within the migration file -timezone = UTC - -# max length of characters to apply to the "slug" field -truncate_slug_length = 40 - -# set to 'true' to run the environment during -# the 'revision' command, regardless of autogenerate -revision_environment = false - -# set to 'true' to allow .pyc and .pyo files without -# having the source .py files present -sourceless = false - -# version path separator; default is OS-specific -version_path_separator = os - -# output encoding used when revision files are written -output_encoding = utf-8 - -# SQLAlchemy URL - will be overridden by env.py -sqlalchemy.url = driver://user:pass@localhost/dbname - - -[post_write_hooks] -# post_write_hooks defines scripts or Python functions that are run -# on newly generated revision scripts. See the documentation for further -# detail and examples - -# format using "black" -hooks = black -black.type = console_scripts -black.entrypoint = black -black.options = -q - - -[loggers] -keys = root,sqlalchemy,alembic - -[handlers] -keys = console - -[formatters] -keys = generic - -[logger_root] -level = WARN -handlers = console -qualname = - -[logger_sqlalchemy] -level = WARN -handlers = -qualname = sqlalchemy.engine - -[logger_alembic] -level = INFO -handlers = -qualname = alembic - -[handler_console] -class = StreamHandler -args = (sys.stderr,) -level = NOTSET -formatter = generic - -[formatter_generic] -format = %(levelname)-5.5s [%(name)s] %(message)s -datefmt = %H:%M:%S diff --git a/migrations/env.py b/migrations/env.py deleted file mode 100644 index 53d3bc5..0000000 --- a/migrations/env.py +++ /dev/null @@ -1,87 +0,0 @@ -"""Alembic environment configuration.""" - -import asyncio -from logging.config import fileConfig - -from alembic import context -from sqlalchemy import pool -from sqlalchemy.engine import Connection -from sqlalchemy.ext.asyncio import async_engine_from_config - -from kb_engine.config import get_settings - -# Alembic Config object 
-config = context.config - -# Interpret the config file for Python logging -if config.config_file_name is not None: - fileConfig(config.config_file_name) - -# Add your model's MetaData object here for 'autogenerate' support -# from kb_engine.repositories.traceability.models import Base -# target_metadata = Base.metadata -target_metadata = None - -# Get database URL from settings -settings = get_settings() -config.set_main_option("sqlalchemy.url", settings.database_url or "") - - -def run_migrations_offline() -> None: - """Run migrations in 'offline' mode. - - This configures the context with just a URL - and not an Engine, though an Engine is acceptable - here as well. By skipping the Engine creation - we don't even need a DBAPI to be available. - - Calls to context.execute() here emit the given string to the - script output. - """ - url = config.get_main_option("sqlalchemy.url") - context.configure( - url=url, - target_metadata=target_metadata, - literal_binds=True, - dialect_opts={"paramstyle": "named"}, - ) - - with context.begin_transaction(): - context.run_migrations() - - -def do_run_migrations(connection: Connection) -> None: - """Run migrations with the given connection.""" - context.configure(connection=connection, target_metadata=target_metadata) - - with context.begin_transaction(): - context.run_migrations() - - -async def run_async_migrations() -> None: - """Run migrations in async mode.""" - connectable = async_engine_from_config( - config.get_section(config.config_ini_section, {}), - prefix="sqlalchemy.", - poolclass=pool.NullPool, - ) - - async with connectable.connect() as connection: - await connection.run_sync(do_run_migrations) - - await connectable.dispose() - - -def run_migrations_online() -> None: - """Run migrations in 'online' mode. - - In this scenario we need to create an Engine - and associate a connection with the context. 
- """ - asyncio.run(run_async_migrations()) - - -if context.is_offline_mode(): - run_migrations_offline() -else: - run_migrations_online() diff --git a/migrations/versions/.gitkeep b/migrations/versions/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/package.json b/package.json new file mode 100644 index 0000000..97dac22 --- /dev/null +++ b/package.json @@ -0,0 +1,27 @@ +{ + "name": "kdd", + "version": "1.0.0", + "module": "src/cli.ts", + "type": "module", + "private": true, + "scripts": { + "start": "bun run src/cli.ts", + "bench": "bun run bench/compare.ts", + "test": "bun test" + }, + "devDependencies": { + "@types/bun": "latest" + }, + "peerDependencies": { + "typescript": "^5" + }, + "dependencies": { + "@huggingface/transformers": "^3.8.1", + "@modelcontextprotocol/sdk": "^1.0.0", + "citty": "^0.2.1", + "graphology": "^0.26.0", + "graphology-traversal": "^0.3.1", + "graphology-types": "^0.24.8", + "gray-matter": "^4.0.3" + } +} diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index ba25cba..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,155 +0,0 @@ -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[project] -name = "kb-engine" -version = "0.2.0" -description = "Intelligent document retrieval system" -readme = "README.md" -license = "MIT" -requires-python = ">=3.11" -authors = [ - { name = "Babel", email = "dev@babel.es" }, -] -classifiers = [ - "Development Status :: 3 - Alpha", - "Intended Audience :: Developers", - "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", -] -dependencies = [ - "fastapi>=0.109.0", - "uvicorn[standard]>=0.27.0", - "pydantic>=2.5.0", - "pydantic-settings>=2.1.0", - "python-frontmatter>=1.0.0", - "httpx>=0.26.0", - "structlog>=24.1.0", - # Local profile (always available) - "aiosqlite>=0.19.0", - "chromadb>=0.4.0", - 
"sentence-transformers>=2.3.0", - "falkordblite>=0.4.0", - "pandas>=2.0.0", - "click>=8.0", -] - -[project.optional-dependencies] -server = [ - "asyncpg>=0.29.0", - "sqlalchemy[asyncio]>=2.0", - "alembic>=1.13.0", - "qdrant-client>=1.7.0", - "neo4j>=5.16.0", - "openai>=1.10.0", -] -mcp = [ - "mcp>=1.0.0", -] -kdd = [ - "pydantic>=2.5.0", - "python-frontmatter>=1.0.0", - "click>=8.0", - "networkx>=3.0", -] -kdd-l2 = [ - "sentence-transformers>=2.3.0", - "hnswlib>=0.8.0", -] -kdd-server = [ - "fastapi>=0.109.0", - "uvicorn[standard]>=0.27.0", -] -dev = [ - "pytest>=8.0.0", - "pytest-asyncio>=0.23.0", - "pytest-cov>=4.1.0", - "ruff>=0.2.0", - "mypy>=1.8.0", - "pre-commit>=3.6.0", - "factory-boy>=3.3.0", - "httpx>=0.26.0", -] - -[project.scripts] -kb-engine = "kb_engine.api.main:run" -kb = "kb_engine.cli:cli" -kb-mcp = "kb_engine.mcp_server:main" -kdd = "kdd.api.cli:cli" - -[tool.hatch.build.targets.wheel] -packages = ["src/kb_engine", "src/kdd"] - -[tool.pytest.ini_options] -testpaths = ["tests"] -asyncio_mode = "auto" -addopts = "-v --tb=short" -markers = [ - "unit: Unit tests", - "integration: Integration tests", - "api: API tests", -] - -[tool.ruff] -target-version = "py311" -line-length = 100 -src = ["src", "tests"] - -[tool.ruff.lint] -select = [ - "E", # pycodestyle errors - "W", # pycodestyle warnings - "F", # Pyflakes - "I", # isort - "B", # flake8-bugbear - "C4", # flake8-comprehensions - "UP", # pyupgrade - "ARG", # flake8-unused-arguments - "SIM", # flake8-simplify -] -ignore = [ - "E501", # line too long (handled by formatter) - "B008", # do not perform function calls in argument defaults - "B904", # raise without from inside except -] - -[tool.ruff.lint.isort] -known-first-party = ["kb_engine", "kdd"] - -[tool.mypy] -python_version = "3.11" -strict = true -warn_return_any = true -warn_unused_ignores = true -disallow_untyped_defs = true -plugins = ["pydantic.mypy"] - -[[tool.mypy.overrides]] -module = [ - "neo4j.*", - "qdrant_client.*", - "frontmatter.*", 
- "chromadb.*", - "sentence_transformers.*", - "falkordb.*", - "redislite.*", - "mcp.*", - "hnswlib.*", - "networkx.*", -] -ignore_missing_imports = true - -[tool.coverage.run] -source = ["src/kb_engine", "src/kdd"] -branch = true - -[tool.coverage.report] -exclude_lines = [ - "pragma: no cover", - "def __repr__", - "raise NotImplementedError", - "if TYPE_CHECKING:", -] diff --git a/src/application/chunking.ts b/src/application/chunking.ts new file mode 100644 index 0000000..eaaef78 --- /dev/null +++ b/src/application/chunking.ts @@ -0,0 +1,138 @@ +/** + * Hierarchical chunking for embedding generation (BR-EMBEDDING-001). + */ + +import type { Chunk, KDDDocument, KDDKind } from "../domain/types.ts"; +import { embeddableSections } from "../domain/rules.ts"; + +export function chunkDocument( + document: KDDDocument, + maxChunkChars = 1500, + overlapChars = 200, +): Chunk[] { + const allowed = embeddableSections(document.kind); + if (allowed.size === 0) return []; + + const identity = buildIdentity(document); + const chunks: Chunk[] = []; + let chunkIdx = 0; + + for (const section of document.sections) { + if (!allowed.has(section.heading.toLowerCase())) continue; + if (!section.content.trim()) continue; + + const paragraphs = splitParagraphs(section.content, maxChunkChars, overlapChars); + + for (const [offset, text] of paragraphs) { + const context = `${identity}\nSection: ${section.heading}\n\n${text}`; + chunks.push({ + chunk_id: `${document.id}:chunk-${chunkIdx}`, + document_id: document.id, + section_heading: section.heading, + content: text, + context_text: context, + char_offset: offset, + }); + chunkIdx++; + } + } + + return chunks; +} + +function buildIdentity(document: KDDDocument): string { + const parts = [ + `Document: ${document.id}`, + `Kind: ${document.kind}`, + `Layer: ${document.layer}`, + ]; + const title = document.front_matter.title; + if (title) parts.push(`Title: ${title}`); + return parts.join("\n"); +} + +function splitParagraphs( + content: 
string, + maxChars: number, + overlap: number, +): [number, string][] { + const paragraphs = content.split("\n\n"); + const results: [number, string][] = []; + let currentParts: string[] = []; + let currentLen = 0; + let currentOffset = 0; + let charPos = 0; + + for (const rawPara of paragraphs) { + const para = rawPara.trim(); + if (!para) { + charPos += 2; + continue; + } + + const paraLen = para.length; + + if (currentLen + paraLen + 2 > maxChars && currentParts.length > 0) { + results.push([currentOffset, currentParts.join("\n\n")]); + if (overlap > 0 && currentParts.length > 0) { + const last = currentParts[currentParts.length - 1]!; + if (last.length <= overlap) { + currentParts = [last]; + currentLen = last.length; + currentOffset = charPos - last.length - 2; + } else { + currentParts = []; + currentLen = 0; + currentOffset = charPos; + } + } else { + currentParts = []; + currentLen = 0; + currentOffset = charPos; + } + } + + if (paraLen > maxChars && currentParts.length === 0) { + const sentences = splitSentences(para); + const sentBuf: string[] = []; + let sentLen = 0; + let sentOffset = charPos; + + for (const sent of sentences) { + if (sentLen + sent.length + 1 > maxChars && sentBuf.length > 0) { + results.push([sentOffset, sentBuf.join(" ")]); + sentBuf.length = 0; + sentLen = 0; + sentOffset = charPos; + } + sentBuf.push(sent); + sentLen += sent.length + 1; + } + + if (sentBuf.length > 0) { + currentParts = sentBuf; + currentLen = sentLen; + currentOffset = sentOffset; + } + } else { + if (currentParts.length === 0) currentOffset = charPos; + currentParts.push(para); + currentLen += paraLen + 2; + } + + charPos += paraLen + 2; + } + + if (currentParts.length > 0) { + results.push([currentOffset, currentParts.join("\n\n")]); + } + + return results; +} + +function splitSentences(text: string): string[] { + return text + .split(/(?<=\.)\s+/) + .map((s) => s.trim()) + .filter(Boolean); +} diff --git a/src/application/commands/index-document.ts 
/**
 * CMD-001 — IndexDocument command.
 *
 * Processes a single KDD spec file through the full indexing pipeline:
 * read → front-matter routing → node/edge extraction → artifact writing →
 * (optionally, for L2/L3 index levels) chunking + embedding.
 */

import { basename, relative } from "node:path";
import { createHash } from "node:crypto";
import type { Embedding, IndexResult, KDDDocument, KDDLayer } from "../../domain/types.ts";
import { IndexLevel } from "../../domain/types.ts";
import { detectLayer, routeDocument } from "../../domain/rules.ts";
import { extractFrontmatter, parseMarkdownSections } from "../../infra/markdown-parser.ts";
import { extractWikiLinkTargets } from "../../infra/wiki-links.ts";
import { chunkDocument } from "../chunking.ts";
import type { ExtractorRegistry } from "../extractors/registry.ts";
import type { ArtifactWriter } from "../../infra/artifact-writer.ts";

/**
 * Index one spec file and write its graph/embedding artifacts.
 *
 * Returns an unsuccessful `IndexResult` with `skipped_reason` (rather than
 * throwing) for recoverable conditions: unreadable file, missing `kind`,
 * or no registered extractor for the kind.
 *
 * @param filePath Absolute path of the markdown spec to index.
 * @param opts     Pipeline collaborators and tuning knobs; `encodeFn` is only
 *                 invoked when `indexLevel` is L2/L3.
 */
export async function indexDocument(
  filePath: string,
  opts: {
    specsRoot: string;
    registry: ExtractorRegistry;
    artifactWriter: ArtifactWriter;
    // NOTE(review): the generic on Promise was lost in transcription; usage
    // below (vectors[i] stored as an embedding vector) suggests
    // Promise<number[][]> — confirm against the original source.
    encodeFn?: ((texts: string[]) => Promise<number[][]>) | null;
    modelName?: string;
    modelDimensions?: number;
    indexLevel?: string;
    domain?: string | null;
  },
): Promise<IndexResult> {
  const {
    specsRoot,
    registry,
    artifactWriter,
    encodeFn,
    modelName,
    modelDimensions,
    indexLevel = IndexLevel.L1,
    domain = null,
  } = opts;

  // 1. Read file (Bun runtime API; failure is reported, not thrown)
  const file = Bun.file(filePath);
  let content: string;
  try {
    content = await file.text();
  } catch (e) {
    return { success: false, edge_count: 0, embedding_count: 0, skipped_reason: `File error: ${e}` };
  }

  // 2. Extract front-matter and route the document to a kind
  const [frontMatter, body] = extractFrontmatter(content);
  const relativePath = relative(specsRoot, filePath);
  const route = routeDocument(frontMatter, relativePath);

  if (!route.kind) {
    return { success: false, edge_count: 0, embedding_count: 0, skipped_reason: "No valid kind in front-matter" };
  }

  // 3. Find the kind-specific extractor
  const extractor = registry.get(route.kind);
  if (!extractor) {
    return { success: false, edge_count: 0, embedding_count: 0, skipped_reason: `No extractor for kind '${route.kind}'` };
  }

  // 4. Build the in-memory KDDDocument.
  // NOTE(review): `??` only falls back to basename when front_matter.id is
  // null/undefined — an empty-string id is kept as-is; confirm that is intended.
  const sections = parseMarkdownSections(body);
  const wikiLinks = extractWikiLinkTargets(body);
  const layer: KDDLayer = detectLayer(relativePath) ?? "01-domain";
  const docId = (frontMatter.id as string) ?? basename(filePath, ".md");
  const sourceHash = createHash("sha256").update(content).digest("hex");

  const document: KDDDocument = {
    id: docId,
    kind: route.kind,
    source_path: relativePath,
    source_hash: sourceHash,
    layer,
    front_matter: frontMatter,
    sections,
    wiki_links: wikiLinks,
    domain,
  };

  // 5. Extract graph node + edges
  const node = extractor.extractNode(document);
  const edges = extractor.extractEdges(document);

  // 6. Write artifacts (node always; edges only when present)
  await artifactWriter.writeNode(node);
  if (edges.length > 0) {
    await artifactWriter.appendEdges(edges);
  }

  // 7. Optional L2/L3: chunk the document and persist embeddings
  let embeddingCount = 0;
  if ((indexLevel === IndexLevel.L2 || indexLevel === IndexLevel.L3) && encodeFn) {
    const chunks = chunkDocument(document);
    if (chunks.length > 0) {
      const texts = chunks.map((c) => c.context_text);
      const vectors = await encodeFn(texts);
      const now = new Date().toISOString();
      const embeddings: Embedding[] = chunks.map((chunk, i) => ({
        id: chunk.chunk_id,
        document_id: docId,
        document_kind: route.kind!,
        section_path: chunk.section_heading,
        chunk_index: i,
        raw_text: chunk.content,
        context_text: chunk.context_text,
        vector: vectors[i]!,
        model: modelName ?? "unknown",
        dimensions: modelDimensions ?? vectors[i]!.length,
        // Hash of the RAW chunk text (not the contextualized text), so
        // re-embedding is skippable when only context formatting changes.
        text_hash: createHash("sha256").update(chunk.content).digest("hex"),
        generated_at: now,
      }));
      await artifactWriter.writeEmbeddings(embeddings);
      embeddingCount = embeddings.length;
    }
  }

  return {
    success: true,
    node_id: node.id,
    edge_count: edges.length,
    embedding_count: embeddingCount,
    warning: route.warning ?? undefined,
  };
}

// ── src/application/extractors/base.ts ──────────────────────────────
/**
 * Base extractor protocol and shared helpers.
 */

import type { GraphEdge, GraphNode, KDDDocument, KDDKind, KDDLayer, Section } from "../../domain/types.ts";
import { KIND_PREFIX, KDDLayer as Layers, LAYER_NUMERIC } from "../../domain/types.ts";
import { isLayerViolation } from "../../domain/rules.ts";
import { extractWikiLinks, type WikiLink } from "../../infra/wiki-links.ts";

/** Per-kind extractor: turns a parsed KDD document into a graph node and edges. */
export interface Extractor {
  kind: KDDKind;
  extractNode(document: KDDDocument): GraphNode;
  extractEdges(document: KDDDocument): GraphEdge[];
}
/** Build the canonical node id `PREFIX:documentId` for a document kind. */
export function makeNodeId(kind: KDDKind, documentId: string): string {
  // Fall back to the uppercased kind when no prefix is registered.
  const prefix = KIND_PREFIX[kind] ?? kind.toUpperCase();
  return `${prefix}:${documentId}`;
}

/** Find the first section whose heading matches any given name (case-insensitive). */
export function findSection(sections: Section[], ...names: string[]): Section | null {
  const targets = new Set(names.map((n) => n.toLowerCase()));
  for (const s of sections) {
    if (targets.has(s.heading.toLowerCase())) return s;
  }
  return null;
}

/** Find ALL sections whose heading matches any given name (case-insensitive). */
export function findSections(sections: Section[], ...names: string[]): Section[] {
  const targets = new Set(names.map((n) => n.toLowerCase()));
  return sections.filter((s) => targets.has(s.heading.toLowerCase()));
}

/**
 * Return a matched section's content together with its child subsections.
 *
 * Children are the sections that follow the match until the first section at
 * the same or a shallower heading level; each child is rendered with a `###`
 * heading marker. Returns null when no section matches or all parts are empty.
 */
export function findSectionWithChildren(
  sections: Section[],
  ...names: string[]
): string | null {
  const targets = new Set(names.map((n) => n.toLowerCase()));
  let parentIdx: number | null = null;
  let parentLevel = 0;

  for (let i = 0; i < sections.length; i++) {
    if (targets.has(sections[i]!.heading.toLowerCase())) {
      parentIdx = i;
      parentLevel = sections[i]!.level;
      break;
    }
  }

  if (parentIdx === null) return null;

  const parts: string[] = [];
  const parent = sections[parentIdx]!;
  if (parent.content.trim()) parts.push(parent.content);

  for (let i = parentIdx + 1; i < sections.length; i++) {
    const s = sections[i]!;
    if (s.level <= parentLevel) break; // left the parent's subtree
    parts.push(`### ${s.heading}\n\n${s.content}`);
  }

  return parts.length > 0 ? parts.join("\n\n") : null;
}

/**
 * Map a wiki-link target to a graph node id by its KDD id prefix.
 * Targets with no recognized prefix are treated as entity names.
 * NOTE(review): despite the `string | null` signature, this never returns
 * null — the Entity fallback always applies; confirm callers rely on that.
 */
export function resolveWikiLinkToNodeId(link: WikiLink): string | null {
  const t = link.target;
  const prefixMap: [string, string][] = [
    ["EVT-", "Event"],
    ["BR-", "BR"],
    ["BP-", "BP"],
    ["XP-", "XP"],
    ["CMD-", "CMD"],
    ["QRY-", "QRY"],
    ["UC-", "UC"],
    ["PROC-", "PROC"],
    ["REQ-", "REQ"],
    ["OBJ-", "OBJ"],
    ["ADR-", "ADR"],
    ["PRD-", "PRD"],
    ["UI-", "UIView"],
  ];
  for (const [prefix, nodePrefix] of prefixMap) {
    if (t.startsWith(prefix)) return `${nodePrefix}:${t}`;
  }
  return `Entity:${t}`;
}

/**
 * Build deduplicated WIKI_LINK edges for every wiki link in the document.
 *
 * Edges are deduplicated on (from, to); each edge is flagged with
 * `layer_violation` when the destination's inferred layer breaks the
 * dependency rules (`isLayerViolation`). All wiki-link edges are bidirectional.
 */
export function buildWikiLinkEdges(
  document: KDDDocument,
  fromNodeId: string,
  fromLayer: KDDLayer,
): GraphEdge[] {
  const edges: GraphEdge[] = [];
  const seen = new Set<string>();

  const fullContent = document.sections.map((s) => s.content).join("\n");
  const links = extractWikiLinks(fullContent);

  for (const link of links) {
    const toNodeId = resolveWikiLinkToNodeId(link);
    if (!toNodeId) continue;
    const key = `${fromNodeId}|${toNodeId}`;
    if (seen.has(key)) continue;
    seen.add(key);

    // Layer check is best-effort: skipped when the destination layer is unknown.
    const destLayer = guessLayerFromNodeId(toNodeId);
    let violation = false;
    if (destLayer) violation = isLayerViolation(fromLayer, destLayer);

    const metadata: Record<string, unknown> = {};
    if (link.domain) metadata.domain = link.domain;
    if (link.alias) metadata.display_alias = link.alias;

    edges.push({
      from_node: fromNodeId,
      to_node: toNodeId,
      edge_type: "WIKI_LINK",
      source_file: document.source_path,
      extraction_method: "wiki_link",
      metadata,
      layer_violation: violation,
      bidirectional: true,
    });
  }

  return edges;
}

/** Infer the KDD layer of a node from its id prefix; null when unknown. */
function guessLayerFromNodeId(nodeId: string): KDDLayer | null {
  const prefix = nodeId.includes(":") ? nodeId.split(":")[0]! : "";
  const layerMap: Record<string, KDDLayer> = {
    Entity: Layers.DOMAIN,
    Event: Layers.DOMAIN,
    BR: Layers.DOMAIN,
    BP: Layers.BEHAVIOR,
    XP: Layers.BEHAVIOR,
    CMD: Layers.BEHAVIOR,
    QRY: Layers.BEHAVIOR,
    PROC: Layers.BEHAVIOR,
    UC: Layers.BEHAVIOR,
    UIView: Layers.EXPERIENCE,
    UIComp: Layers.EXPERIENCE,
    REQ: Layers.VERIFICATION,
    OBJ: Layers.REQUIREMENTS,
    PRD: Layers.REQUIREMENTS,
    ADR: Layers.REQUIREMENTS,
    GLOSS: Layers.DOMAIN,
  };
  return layerMap[prefix] ?? null;
}

// ── Shared table/list parsing helpers ───────────────────────────────

/**
 * Parse a markdown table into one record per body row, keyed by header cell.
 * Assumes row 2 is the `|---|` separator (skipped); rows with fewer cells
 * than headers are dropped; backticks are stripped from header names.
 */
export function parseTableRows(content: string): Record<string, string>[] {
  const lines = content
    .trim()
    .split("\n")
    .map((l) => l.trim())
    .filter((l) => l.startsWith("|"));

  if (lines.length < 2) return [];

  const headers = lines[0]!
    .replace(/^\||\|$/g, "")
    .split("|")
    .map((h) => h.trim().replace(/`/g, ""));

  const rows: Record<string, string>[] = [];
  for (const line of lines.slice(2)) {
    const cells = line
      .replace(/^\||\|$/g, "")
      .split("|")
      .map((c) => c.trim());
    if (cells.length >= headers.length) {
      const row: Record<string, string> = {};
      headers.forEach((h, i) => (row[h] = cells[i]!));
      rows.push(row);
    }
  }
  return rows;
}

/** Extract the text of `-` / `*` bullet items from markdown content. */
export function parseListItems(content: string): string[] {
  return content
    .split("\n")
    .map((l) => l.trim())
    .filter((l) => l.startsWith("- ") || l.startsWith("* "))
    .map((l) => l.slice(2).trim());
}

/** Drop duplicate edges, keyed on (from, to, type); first occurrence wins. */
export function deduplicateEdges(edges: GraphEdge[]): GraphEdge[] {
  const seen = new Set<string>();
  return edges.filter((e) => {
    const key = `${e.from_node}|${e.to_node}|${e.edge_type}`;
    if (seen.has(key)) return false;
    seen.add(key);
    return true;
  });
}
/** Check if a wiki-link target looks like an entity (not a prefixed spec). */
export function isEntityTarget(target: string): boolean {
  const specPrefixes = [
    "EVT-", "BR-", "BP-", "XP-", "CMD-", "QRY-",
    "UC-", "PROC-", "REQ-", "OBJ-", "ADR-", "PRD-", "UI-",
  ];
  return !specPrefixes.some((p) => target.startsWith(p));
}

// ── src/application/extractors/kinds/adr.ts ─────────────────────────
import { KDDKind, type GraphEdge, type GraphNode, type KDDDocument } from "../../../domain/types.ts";
import { buildWikiLinkEdges, deduplicateEdges, findSection, makeNodeId, type Extractor } from "../base.ts";

/** Extractor for Architecture Decision Records (context/decision/consequences). */
export class ADRExtractor implements Extractor {
  kind: KDDKind = KDDKind.ADR;

  /** Index the ADR's three canonical sections (Spanish or English headings). */
  extractNode(document: KDDDocument): GraphNode {
    const nodeId = makeNodeId(KDDKind.ADR, document.id);
    const fields: Record<string, unknown> = {};

    const context = findSection(document.sections, "Contexto", "Context");
    if (context) fields.context = context.content;
    const decision = findSection(document.sections, "Decisión", "Decision");
    if (decision) fields.decision = decision.content;
    const consequences = findSection(document.sections, "Consecuencias", "Consequences");
    if (consequences) fields.consequences = consequences.content;

    return {
      id: nodeId, kind: KDDKind.ADR, source_file: document.source_path,
      source_hash: document.source_hash, layer: document.layer,
      status: String(document.front_matter.status ?? "draft"),
      aliases: (document.front_matter.aliases as string[]) ?? [],
      domain: document.domain, indexed_fields: fields,
      indexed_at: new Date().toISOString(),
    };
  }

  /** ADRs only emit plain wiki-link edges. */
  extractEdges(document: KDDDocument): GraphEdge[] {
    const nodeId = makeNodeId(KDDKind.ADR, document.id);
    return deduplicateEdges(buildWikiLinkEdges(document, nodeId, document.layer));
  }
}

// ── src/application/extractors/kinds/business-policy.ts ─────────────
import { KDDKind, type GraphEdge, type GraphNode, type KDDDocument } from "../../../domain/types.ts";
import { buildWikiLinkEdges, deduplicateEdges, findSection, isEntityTarget, makeNodeId, resolveWikiLinkToNodeId, type Extractor } from "../base.ts";
import { extractWikiLinks } from "../../../infra/wiki-links.ts";

/** Extractor for Business Policies (BP-*). */
export class BusinessPolicyExtractor implements Extractor {
  kind: KDDKind = KDDKind.BUSINESS_POLICY;

  /** Index declaration, applicability, parameters and violation sections. */
  extractNode(document: KDDDocument): GraphNode {
    const nodeId = makeNodeId(KDDKind.BUSINESS_POLICY, document.id);
    const fields: Record<string, unknown> = {};

    const decl = findSection(document.sections, "Declaración", "Declaration");
    if (decl) fields.declaration = decl.content;
    const when = findSection(document.sections, "Cuándo Aplica", "When Applies");
    if (when) fields.when_applies = when.content;
    const params = findSection(document.sections, "Parámetros", "Parameters");
    if (params) fields.parameters = params.content;
    const violation = findSection(document.sections, "Qué pasa si se incumple", "Violation", "What Happens on Violation");
    if (violation) fields.violation = violation.content;

    return {
      id: nodeId, kind: KDDKind.BUSINESS_POLICY, source_file: document.source_path,
      source_hash: document.source_hash, layer: document.layer,
      status: String(document.front_matter.status ?? "draft"),
      aliases: (document.front_matter.aliases as string[]) ?? [],
      domain: document.domain, indexed_fields: fields,
      indexed_at: new Date().toISOString(),
    };
  }

  /** Wiki-link edges plus ENTITY_RULE edges for entities cited in the declaration. */
  extractEdges(document: KDDDocument): GraphEdge[] {
    const nodeId = makeNodeId(KDDKind.BUSINESS_POLICY, document.id);
    const edges: GraphEdge[] = [...buildWikiLinkEdges(document, nodeId, document.layer)];

    const decl = findSection(document.sections, "Declaración", "Declaration");
    if (decl) {
      for (const link of extractWikiLinks(decl.content)) {
        if (isEntityTarget(link.target)) {
          const toNode = resolveWikiLinkToNodeId(link);
          if (toNode) {
            edges.push({
              from_node: nodeId, to_node: toNode, edge_type: "ENTITY_RULE",
              source_file: document.source_path, extraction_method: "wiki_link",
              metadata: {}, layer_violation: false, bidirectional: false,
            });
          }
        }
      }
    }

    return deduplicateEdges(edges);
  }
}

// ── src/application/extractors/kinds/business-rule.ts ───────────────
import { KDDKind, type GraphEdge, type GraphNode, type KDDDocument } from "../../../domain/types.ts";
import { buildWikiLinkEdges, deduplicateEdges, findSection, isEntityTarget, makeNodeId, resolveWikiLinkToNodeId, type Extractor } from "../base.ts";
import { extractWikiLinks } from "../../../infra/wiki-links.ts";

/** Extractor for Business Rules (BR-*). */
export class BusinessRuleExtractor implements Extractor {
  kind: KDDKind = KDDKind.BUSINESS_RULE;

  /** Index declaration, rationale, violation and example sections. */
  extractNode(document: KDDDocument): GraphNode {
    const nodeId = makeNodeId(KDDKind.BUSINESS_RULE, document.id);
    const fields: Record<string, unknown> = {};

    const decl = findSection(document.sections, "Declaración", "Declaration");
    if (decl) fields.declaration = decl.content;
    const when = findSection(document.sections, "Cuándo aplica", "When Applies");
    if (when) fields.when_applies = when.content;
    const why = findSection(document.sections, "Por qué existe", "Why it exists");
    if (why) fields.why_exists = why.content;
    const violation = findSection(document.sections, "Qué pasa si se incumple", "Violation", "What happens if violated");
    if (violation) fields.violation = violation.content;
    const examples = findSection(document.sections, "Ejemplos", "Examples");
    if (examples) fields.examples = examples.content;

    return {
      id: nodeId, kind: KDDKind.BUSINESS_RULE, source_file: document.source_path,
      source_hash: document.source_hash, layer: document.layer,
      status: String(document.front_matter.status ?? "draft"),
      aliases: (document.front_matter.aliases as string[]) ?? [],
      domain: document.domain, indexed_fields: fields,
      indexed_at: new Date().toISOString(),
    };
  }

  /** Wiki-link edges plus ENTITY_RULE edges for entities cited in the declaration. */
  extractEdges(document: KDDDocument): GraphEdge[] {
    const nodeId = makeNodeId(KDDKind.BUSINESS_RULE, document.id);
    const edges: GraphEdge[] = [...buildWikiLinkEdges(document, nodeId, document.layer)];

    const decl = findSection(document.sections, "Declaración", "Declaration");
    if (decl) {
      for (const link of extractWikiLinks(decl.content)) {
        if (isEntityTarget(link.target)) {
          const toNode = resolveWikiLinkToNodeId(link);
          if (toNode) {
            edges.push({
              from_node: nodeId, to_node: toNode, edge_type: "ENTITY_RULE",
              source_file: document.source_path, extraction_method: "wiki_link",
              metadata: {}, layer_violation: false, bidirectional: false,
            });
          }
        }
      }
    }

    return deduplicateEdges(edges);
  }
}

// ── src/application/extractors/kinds/command.ts ─────────────────────
import { KDDKind, type GraphEdge, type GraphNode, type KDDDocument } from "../../../domain/types.ts";
import { buildWikiLinkEdges, deduplicateEdges, findSection, makeNodeId, parseTableRows, resolveWikiLinkToNodeId, type Extractor } from "../base.ts";
import { extractWikiLinks } from "../../../infra/wiki-links.ts";
class CommandExtractor implements Extractor { + kind: KDDKind = KDDKind.COMMAND; + + extractNode(document: KDDDocument): GraphNode { + const nodeId = makeNodeId(KDDKind.COMMAND, document.id); + const fields: Record = {}; + + const purpose = findSection(document.sections, "Purpose", "Propósito"); + if (purpose) fields.purpose = purpose.content; + const input = findSection(document.sections, "Input", "Entrada"); + if (input) fields.input_params = parseTableRows(input.content); + const pre = findSection(document.sections, "Preconditions", "Precondiciones"); + if (pre) fields.preconditions = pre.content; + const post = findSection(document.sections, "Postconditions", "Postcondiciones"); + if (post) fields.postconditions = post.content; + const errors = findSection(document.sections, "Possible Errors", "Errores Posibles"); + if (errors) fields.errors = parseTableRows(errors.content); + + return { + id: nodeId, kind: KDDKind.COMMAND, source_file: document.source_path, + source_hash: document.source_hash, layer: document.layer, + status: String(document.front_matter.status ?? "draft"), + aliases: (document.front_matter.aliases as string[]) ?? 
[], + domain: document.domain, indexed_fields: fields, + indexed_at: new Date().toISOString(), + }; + } + + extractEdges(document: KDDDocument): GraphEdge[] { + const nodeId = makeNodeId(KDDKind.COMMAND, document.id); + const edges: GraphEdge[] = [...buildWikiLinkEdges(document, nodeId, document.layer)]; + + const post = findSection(document.sections, "Postconditions", "Postcondiciones"); + if (post) { + for (const link of extractWikiLinks(post.content)) { + if (link.target.startsWith("EVT-")) { + const toNode = resolveWikiLinkToNodeId(link); + if (toNode) { + edges.push({ + from_node: nodeId, to_node: toNode, edge_type: "EMITS", + source_file: document.source_path, extraction_method: "wiki_link", + metadata: {}, layer_violation: false, bidirectional: false, + }); + } + } + } + } + + return deduplicateEdges(edges); + } +} diff --git a/src/application/extractors/kinds/cross-policy.ts b/src/application/extractors/kinds/cross-policy.ts new file mode 100644 index 0000000..e895ff3 --- /dev/null +++ b/src/application/extractors/kinds/cross-policy.ts @@ -0,0 +1,53 @@ +import { KDDKind, type GraphEdge, type GraphNode, type KDDDocument } from "../../../domain/types.ts"; +import { buildWikiLinkEdges, deduplicateEdges, findSection, isEntityTarget, makeNodeId, resolveWikiLinkToNodeId, type Extractor } from "../base.ts"; +import { extractWikiLinks } from "../../../infra/wiki-links.ts"; + +export class CrossPolicyExtractor implements Extractor { + kind: KDDKind = KDDKind.CROSS_POLICY; + + extractNode(document: KDDDocument): GraphNode { + const nodeId = makeNodeId(KDDKind.CROSS_POLICY, document.id); + const fields: Record = {}; + + const purpose = findSection(document.sections, "Propósito", "Purpose"); + if (purpose) fields.purpose = purpose.content; + const decl = findSection(document.sections, "Declaración", "Declaration"); + if (decl) fields.declaration = decl.content; + const formal = findSection(document.sections, "Formalización EARS", "EARS Formalization"); + if (formal) 
fields.formalization_ears = formal.content; + const behavior = findSection(document.sections, "Comportamiento Estándar", "Standard Behavior"); + if (behavior) fields.standard_behavior = behavior.content; + + return { + id: nodeId, kind: KDDKind.CROSS_POLICY, source_file: document.source_path, + source_hash: document.source_hash, layer: document.layer, + status: String(document.front_matter.status ?? "draft"), + aliases: (document.front_matter.aliases as string[]) ?? [], + domain: document.domain, indexed_fields: fields, + indexed_at: new Date().toISOString(), + }; + } + + extractEdges(document: KDDDocument): GraphEdge[] { + const nodeId = makeNodeId(KDDKind.CROSS_POLICY, document.id); + const edges: GraphEdge[] = [...buildWikiLinkEdges(document, nodeId, document.layer)]; + + const decl = findSection(document.sections, "Declaración", "Declaration"); + if (decl) { + for (const link of extractWikiLinks(decl.content)) { + if (isEntityTarget(link.target)) { + const toNode = resolveWikiLinkToNodeId(link); + if (toNode) { + edges.push({ + from_node: nodeId, to_node: toNode, edge_type: "ENTITY_RULE", + source_file: document.source_path, extraction_method: "wiki_link", + metadata: {}, layer_violation: false, bidirectional: false, + }); + } + } + } + } + + return deduplicateEdges(edges); + } +} diff --git a/src/application/extractors/kinds/entity.ts b/src/application/extractors/kinds/entity.ts new file mode 100644 index 0000000..cf106e5 --- /dev/null +++ b/src/application/extractors/kinds/entity.ts @@ -0,0 +1,116 @@ +/** + * Entity extractor — kind: entity + */ + +import { KDDKind, type GraphEdge, type GraphNode, type KDDDocument } from "../../../domain/types.ts"; +import { + buildWikiLinkEdges, + deduplicateEdges, + findSection, + makeNodeId, + parseListItems, + parseTableRows, + resolveWikiLinkToNodeId, + type Extractor, +} from "../base.ts"; +import { extractWikiLinks } from "../../../infra/wiki-links.ts"; + +export class EntityExtractor implements Extractor { + kind: 
KDDKind = KDDKind.ENTITY; + + extractNode(document: KDDDocument): GraphNode { + const nodeId = makeNodeId(KDDKind.ENTITY, document.id); + const fields: Record = {}; + + const desc = findSection(document.sections, "Descripción", "Description"); + if (desc) fields.description = desc.content; + + const attr = findSection(document.sections, "Atributos", "Attributes"); + if (attr) fields.attributes = parseTableRows(attr.content); + + const rel = findSection(document.sections, "Relaciones", "Relations", "Relationships"); + if (rel) fields.relations = parseTableRows(rel.content); + + const inv = findSection(document.sections, "Invariantes", "Invariants", "Constraints"); + if (inv) fields.invariants = parseListItems(inv.content); + + const sm = findSection(document.sections, "Ciclo de Vida", "Lifecycle", "State Machine"); + if (sm) fields.state_machine = sm.content; + + return { + id: nodeId, + kind: KDDKind.ENTITY, + source_file: document.source_path, + source_hash: document.source_hash, + layer: document.layer, + status: String(document.front_matter.status ?? "draft"), + aliases: (document.front_matter.aliases as string[]) ?? [], + domain: document.domain, + indexed_fields: fields, + indexed_at: new Date().toISOString(), + }; + } + + extractEdges(document: KDDDocument): GraphEdge[] { + const nodeId = makeNodeId(KDDKind.ENTITY, document.id); + const edges: GraphEdge[] = []; + + edges.push(...buildWikiLinkEdges(document, nodeId, document.layer)); + + // DOMAIN_RELATION from relations table + const rel = findSection(document.sections, "Relaciones", "Relations", "Relationships"); + if (rel) { + const rows = parseTableRows(rel.content); + for (const row of rows) { + let target: string | null = null; + for (const val of Object.values(row)) { + const links = extractWikiLinks(val); + if (links.length > 0) { + target = resolveWikiLinkToNodeId(links[0]!); + break; + } + } + if (!target) continue; + const relName = Object.values(row)[0] ?? 
""; + const cardinality = row["Cardinalidad"] ?? row["Cardinality"] ?? ""; + edges.push({ + from_node: nodeId, + to_node: target, + edge_type: "DOMAIN_RELATION", + source_file: document.source_path, + extraction_method: "section_content", + metadata: { relation: relName, cardinality }, + layer_violation: false, + bidirectional: false, + }); + } + } + + // EMITS from lifecycle events + for (const section of document.sections) { + const h = section.heading.toLowerCase(); + if (h === "eventos del ciclo de vida" || h === "lifecycle events") { + const links = extractWikiLinks(section.content); + for (const link of links) { + if (link.target.startsWith("EVT-")) { + const toNode = resolveWikiLinkToNodeId(link); + if (toNode) { + edges.push({ + from_node: nodeId, + to_node: toNode, + edge_type: "EMITS", + source_file: document.source_path, + extraction_method: "wiki_link", + metadata: {}, + layer_violation: false, + bidirectional: false, + }); + } + } + } + } + } + + return deduplicateEdges(edges); + } +} diff --git a/src/application/extractors/kinds/event.ts b/src/application/extractors/kinds/event.ts new file mode 100644 index 0000000..046ddf0 --- /dev/null +++ b/src/application/extractors/kinds/event.ts @@ -0,0 +1,34 @@ +import { KDDKind, type GraphEdge, type GraphNode, type KDDDocument } from "../../../domain/types.ts"; +import { buildWikiLinkEdges, deduplicateEdges, findSection, makeNodeId, parseTableRows, type Extractor } from "../base.ts"; + +export class EventExtractor implements Extractor { + kind: KDDKind = KDDKind.EVENT; + + extractNode(document: KDDDocument): GraphNode { + const nodeId = makeNodeId(KDDKind.EVENT, document.id); + const fields: Record = {}; + + const desc = findSection(document.sections, "Descripción", "Description"); + if (desc) fields.description = desc.content; + const payload = findSection(document.sections, "Payload"); + if (payload) fields.payload = parseTableRows(payload.content); + const producer = findSection(document.sections, 
"Productor", "Producer"); + if (producer) fields.producer = producer.content; + const consumers = findSection(document.sections, "Consumidores", "Consumers"); + if (consumers) fields.consumers = consumers.content; + + return { + id: nodeId, kind: KDDKind.EVENT, source_file: document.source_path, + source_hash: document.source_hash, layer: document.layer, + status: String(document.front_matter.status ?? "draft"), + aliases: (document.front_matter.aliases as string[]) ?? [], + domain: document.domain, indexed_fields: fields, + indexed_at: new Date().toISOString(), + }; + } + + extractEdges(document: KDDDocument): GraphEdge[] { + const nodeId = makeNodeId(KDDKind.EVENT, document.id); + return deduplicateEdges(buildWikiLinkEdges(document, nodeId, document.layer)); + } +} diff --git a/src/application/extractors/kinds/glossary.ts b/src/application/extractors/kinds/glossary.ts new file mode 100644 index 0000000..b5c405f --- /dev/null +++ b/src/application/extractors/kinds/glossary.ts @@ -0,0 +1,32 @@ +import { KDDKind, type GraphEdge, type GraphNode, type KDDDocument } from "../../../domain/types.ts"; +import { buildWikiLinkEdges, deduplicateEdges, findSection, makeNodeId, type Extractor } from "../base.ts"; + +export class GlossaryExtractor implements Extractor { + kind: KDDKind = KDDKind.GLOSSARY; + + extractNode(document: KDDDocument): GraphNode { + const nodeId = makeNodeId(KDDKind.GLOSSARY, document.id); + const fields: Record = {}; + + const definition = findSection(document.sections, "Definición", "Definition"); + if (definition) fields.definition = definition.content; + const context = findSection(document.sections, "Contexto", "Context"); + if (context) fields.context = context.content; + const related = findSection(document.sections, "Términos Relacionados", "Related Terms"); + if (related) fields.related_terms = related.content; + + return { + id: nodeId, kind: KDDKind.GLOSSARY, source_file: document.source_path, + source_hash: document.source_hash, layer: 
document.layer, + status: String(document.front_matter.status ?? "draft"), + aliases: (document.front_matter.aliases as string[]) ?? [], + domain: document.domain, indexed_fields: fields, + indexed_at: new Date().toISOString(), + }; + } + + extractEdges(document: KDDDocument): GraphEdge[] { + const nodeId = makeNodeId(KDDKind.GLOSSARY, document.id); + return deduplicateEdges(buildWikiLinkEdges(document, nodeId, document.layer)); + } +} diff --git a/src/application/extractors/kinds/objective.ts b/src/application/extractors/kinds/objective.ts new file mode 100644 index 0000000..6aee4b0 --- /dev/null +++ b/src/application/extractors/kinds/objective.ts @@ -0,0 +1,32 @@ +import { KDDKind, type GraphEdge, type GraphNode, type KDDDocument } from "../../../domain/types.ts"; +import { buildWikiLinkEdges, deduplicateEdges, findSection, makeNodeId, type Extractor } from "../base.ts"; + +export class ObjectiveExtractor implements Extractor { + kind: KDDKind = KDDKind.OBJECTIVE; + + extractNode(document: KDDDocument): GraphNode { + const nodeId = makeNodeId(KDDKind.OBJECTIVE, document.id); + const fields: Record = {}; + + const actor = findSection(document.sections, "Actor", "Actors"); + if (actor) fields.actor = actor.content; + const objective = findSection(document.sections, "Objetivo", "Objective"); + if (objective) fields.objective = objective.content; + const criteria = findSection(document.sections, "Criterios de éxito", "Success Criteria"); + if (criteria) fields.success_criteria = criteria.content; + + return { + id: nodeId, kind: KDDKind.OBJECTIVE, source_file: document.source_path, + source_hash: document.source_hash, layer: document.layer, + status: String(document.front_matter.status ?? "draft"), + aliases: (document.front_matter.aliases as string[]) ?? 
[], + domain: document.domain, indexed_fields: fields, + indexed_at: new Date().toISOString(), + }; + } + + extractEdges(document: KDDDocument): GraphEdge[] { + const nodeId = makeNodeId(KDDKind.OBJECTIVE, document.id); + return deduplicateEdges(buildWikiLinkEdges(document, nodeId, document.layer)); + } +} diff --git a/src/application/extractors/kinds/prd.ts b/src/application/extractors/kinds/prd.ts new file mode 100644 index 0000000..6bb20d6 --- /dev/null +++ b/src/application/extractors/kinds/prd.ts @@ -0,0 +1,36 @@ +import { KDDKind, type GraphEdge, type GraphNode, type KDDDocument } from "../../../domain/types.ts"; +import { buildWikiLinkEdges, deduplicateEdges, findSection, findSectionWithChildren, makeNodeId, type Extractor } from "../base.ts"; + +export class PRDExtractor implements Extractor { + kind: KDDKind = KDDKind.PRD; + + extractNode(document: KDDDocument): GraphNode { + const nodeId = makeNodeId(KDDKind.PRD, document.id); + const fields: Record = {}; + + const problem = findSection(document.sections, "Problema / Oportunidad", "Problem / Opportunity", "Problema", "Problem"); + if (problem) fields.problem = problem.content; + const scope = findSectionWithChildren(document.sections, "Alcance", "Scope"); + if (scope) fields.scope = scope; + const users = findSectionWithChildren(document.sections, "Usuarios y Jobs-to-be-done", "Users and Jobs-to-be-done"); + if (users) fields.users = users; + const metrics = findSection(document.sections, "Métricas de éxito y telemetría", "Success Metrics"); + if (metrics) fields.metrics = metrics.content; + const deps = findSection(document.sections, "Dependencias", "Dependencies"); + if (deps) fields.dependencies = deps.content; + + return { + id: nodeId, kind: KDDKind.PRD, source_file: document.source_path, + source_hash: document.source_hash, layer: document.layer, + status: String(document.front_matter.status ?? "draft"), + aliases: (document.front_matter.aliases as string[]) ?? 
[], + domain: document.domain, indexed_fields: fields, + indexed_at: new Date().toISOString(), + }; + } + + extractEdges(document: KDDDocument): GraphEdge[] { + const nodeId = makeNodeId(KDDKind.PRD, document.id); + return deduplicateEdges(buildWikiLinkEdges(document, nodeId, document.layer)); + } +} diff --git a/src/application/extractors/kinds/process.ts b/src/application/extractors/kinds/process.ts new file mode 100644 index 0000000..4cf9985 --- /dev/null +++ b/src/application/extractors/kinds/process.ts @@ -0,0 +1,32 @@ +import { KDDKind, type GraphEdge, type GraphNode, type KDDDocument } from "../../../domain/types.ts"; +import { buildWikiLinkEdges, deduplicateEdges, findSection, findSectionWithChildren, makeNodeId, type Extractor } from "../base.ts"; + +export class ProcessExtractor implements Extractor { + kind: KDDKind = KDDKind.PROCESS; + + extractNode(document: KDDDocument): GraphNode { + const nodeId = makeNodeId(KDDKind.PROCESS, document.id); + const fields: Record = {}; + + const participants = findSection(document.sections, "Participantes", "Participants"); + if (participants) fields.participants = participants.content; + const steps = findSectionWithChildren(document.sections, "Pasos", "Steps"); + if (steps) fields.steps = steps; + const diagram = findSection(document.sections, "Diagrama", "Diagram"); + if (diagram) fields.mermaid_flow = diagram.content; + + return { + id: nodeId, kind: KDDKind.PROCESS, source_file: document.source_path, + source_hash: document.source_hash, layer: document.layer, + status: String(document.front_matter.status ?? "draft"), + aliases: (document.front_matter.aliases as string[]) ?? 
[], + domain: document.domain, indexed_fields: fields, + indexed_at: new Date().toISOString(), + }; + } + + extractEdges(document: KDDDocument): GraphEdge[] { + const nodeId = makeNodeId(KDDKind.PROCESS, document.id); + return deduplicateEdges(buildWikiLinkEdges(document, nodeId, document.layer)); + } +} diff --git a/src/application/extractors/kinds/query.ts b/src/application/extractors/kinds/query.ts new file mode 100644 index 0000000..c397af5 --- /dev/null +++ b/src/application/extractors/kinds/query.ts @@ -0,0 +1,34 @@ +import { KDDKind, type GraphEdge, type GraphNode, type KDDDocument } from "../../../domain/types.ts"; +import { buildWikiLinkEdges, deduplicateEdges, findSection, makeNodeId, parseTableRows, type Extractor } from "../base.ts"; + +export class QueryExtractor implements Extractor { + kind: KDDKind = KDDKind.QUERY; + + extractNode(document: KDDDocument): GraphNode { + const nodeId = makeNodeId(KDDKind.QUERY, document.id); + const fields: Record = {}; + + const purpose = findSection(document.sections, "Purpose", "Propósito"); + if (purpose) fields.purpose = purpose.content; + const input = findSection(document.sections, "Input", "Entrada"); + if (input) fields.input_params = parseTableRows(input.content); + const output = findSection(document.sections, "Output", "Salida"); + if (output) fields.output_structure = output.content; + const errors = findSection(document.sections, "Possible Errors", "Errores Posibles"); + if (errors) fields.errors = parseTableRows(errors.content); + + return { + id: nodeId, kind: KDDKind.QUERY, source_file: document.source_path, + source_hash: document.source_hash, layer: document.layer, + status: String(document.front_matter.status ?? "draft"), + aliases: (document.front_matter.aliases as string[]) ?? 
[], + domain: document.domain, indexed_fields: fields, + indexed_at: new Date().toISOString(), + }; + } + + extractEdges(document: KDDDocument): GraphEdge[] { + const nodeId = makeNodeId(KDDKind.QUERY, document.id); + return deduplicateEdges(buildWikiLinkEdges(document, nodeId, document.layer)); + } +} diff --git a/src/application/extractors/kinds/requirement.ts b/src/application/extractors/kinds/requirement.ts new file mode 100644 index 0000000..82bee88 --- /dev/null +++ b/src/application/extractors/kinds/requirement.ts @@ -0,0 +1,32 @@ +import { KDDKind, type GraphEdge, type GraphNode, type KDDDocument } from "../../../domain/types.ts"; +import { buildWikiLinkEdges, deduplicateEdges, findSection, makeNodeId, type Extractor } from "../base.ts"; + +export class RequirementExtractor implements Extractor { + kind: KDDKind = KDDKind.REQUIREMENT; + + extractNode(document: KDDDocument): GraphNode { + const nodeId = makeNodeId(KDDKind.REQUIREMENT, document.id); + const fields: Record = {}; + + const desc = findSection(document.sections, "Descripción", "Description"); + if (desc) fields.description = desc.content; + const criteria = findSection(document.sections, "Criterios de Aceptación", "Acceptance Criteria"); + if (criteria) fields.acceptance_criteria = criteria.content; + const trace = findSection(document.sections, "Trazabilidad", "Traceability"); + if (trace) fields.traceability = trace.content; + + return { + id: nodeId, kind: KDDKind.REQUIREMENT, source_file: document.source_path, + source_hash: document.source_hash, layer: document.layer, + status: String(document.front_matter.status ?? "draft"), + aliases: (document.front_matter.aliases as string[]) ?? 
[], + domain: document.domain, indexed_fields: fields, + indexed_at: new Date().toISOString(), + }; + } + + extractEdges(document: KDDDocument): GraphEdge[] { + const nodeId = makeNodeId(KDDKind.REQUIREMENT, document.id); + return deduplicateEdges(buildWikiLinkEdges(document, nodeId, document.layer)); + } +} diff --git a/src/application/extractors/kinds/ui-component.ts b/src/application/extractors/kinds/ui-component.ts new file mode 100644 index 0000000..8e3a96e --- /dev/null +++ b/src/application/extractors/kinds/ui-component.ts @@ -0,0 +1,32 @@ +import { KDDKind, type GraphEdge, type GraphNode, type KDDDocument } from "../../../domain/types.ts"; +import { buildWikiLinkEdges, deduplicateEdges, findSection, makeNodeId, type Extractor } from "../base.ts"; + +export class UIComponentExtractor implements Extractor { + kind: KDDKind = KDDKind.UI_COMPONENT; + + extractNode(document: KDDDocument): GraphNode { + const nodeId = makeNodeId(KDDKind.UI_COMPONENT, document.id); + const fields: Record = {}; + + const desc = findSection(document.sections, "Descripción", "Description"); + if (desc) fields.description = desc.content; + const entities = findSection(document.sections, "Entidades", "Entities"); + if (entities) fields.entities = entities.content; + const useCases = findSection(document.sections, "Casos de Uso", "Use Cases"); + if (useCases) fields.use_cases = useCases.content; + + return { + id: nodeId, kind: KDDKind.UI_COMPONENT, source_file: document.source_path, + source_hash: document.source_hash, layer: document.layer, + status: String(document.front_matter.status ?? "draft"), + aliases: (document.front_matter.aliases as string[]) ?? 
[], + domain: document.domain, indexed_fields: fields, + indexed_at: new Date().toISOString(), + }; + } + + extractEdges(document: KDDDocument): GraphEdge[] { + const nodeId = makeNodeId(KDDKind.UI_COMPONENT, document.id); + return deduplicateEdges(buildWikiLinkEdges(document, nodeId, document.layer)); + } +} diff --git a/src/application/extractors/kinds/ui-view.ts b/src/application/extractors/kinds/ui-view.ts new file mode 100644 index 0000000..d381e40 --- /dev/null +++ b/src/application/extractors/kinds/ui-view.ts @@ -0,0 +1,36 @@ +import { KDDKind, type GraphEdge, type GraphNode, type KDDDocument } from "../../../domain/types.ts"; +import { buildWikiLinkEdges, deduplicateEdges, findSection, makeNodeId, type Extractor } from "../base.ts"; + +export class UIViewExtractor implements Extractor { + kind: KDDKind = KDDKind.UI_VIEW; + + extractNode(document: KDDDocument): GraphNode { + const nodeId = makeNodeId(KDDKind.UI_VIEW, document.id); + const fields: Record = {}; + + const desc = findSection(document.sections, "Descripción", "Description"); + if (desc) fields.description = desc.content; + const layout = findSection(document.sections, "Layout", "Diseño"); + if (layout) fields.layout = layout.content; + const components = findSection(document.sections, "Componentes", "Components"); + if (components) fields.components = components.content; + const states = findSection(document.sections, "Estados", "States"); + if (states) fields.states = states.content; + const behavior = findSection(document.sections, "Comportamiento", "Behavior"); + if (behavior) fields.behavior = behavior.content; + + return { + id: nodeId, kind: KDDKind.UI_VIEW, source_file: document.source_path, + source_hash: document.source_hash, layer: document.layer, + status: String(document.front_matter.status ?? "draft"), + aliases: (document.front_matter.aliases as string[]) ?? 
[], + domain: document.domain, indexed_fields: fields, + indexed_at: new Date().toISOString(), + }; + } + + extractEdges(document: KDDDocument): GraphEdge[] { + const nodeId = makeNodeId(KDDKind.UI_VIEW, document.id); + return deduplicateEdges(buildWikiLinkEdges(document, nodeId, document.layer)); + } +} diff --git a/src/application/extractors/kinds/use-case.ts b/src/application/extractors/kinds/use-case.ts new file mode 100644 index 0000000..096206c --- /dev/null +++ b/src/application/extractors/kinds/use-case.ts @@ -0,0 +1,92 @@ +import { KDDKind, type GraphEdge, type GraphNode, type KDDDocument } from "../../../domain/types.ts"; +import { buildWikiLinkEdges, deduplicateEdges, findSection, findSectionWithChildren, makeNodeId, resolveWikiLinkToNodeId, type Extractor } from "../base.ts"; +import { extractWikiLinks } from "../../../infra/wiki-links.ts"; + +export class UseCaseExtractor implements Extractor { + kind: KDDKind = KDDKind.USE_CASE; + + extractNode(document: KDDDocument): GraphNode { + const nodeId = makeNodeId(KDDKind.USE_CASE, document.id); + const fields: Record = {}; + + const desc = findSection(document.sections, "Descripción", "Description"); + if (desc) fields.description = desc.content; + const actors = findSection(document.sections, "Actores", "Actors"); + if (actors) fields.actors = actors.content; + const pre = findSection(document.sections, "Precondiciones", "Preconditions"); + if (pre) fields.preconditions = pre.content; + const flow = findSection(document.sections, "Flujo Principal", "Main Flow"); + if (flow) fields.main_flow = flow.content; + const alt = findSectionWithChildren(document.sections, "Flujos Alternativos", "Alternative Flows"); + if (alt) fields.alternatives = alt; + const exc = findSectionWithChildren(document.sections, "Excepciones", "Exceptions"); + if (exc) fields.exceptions = exc; + const post = findSection(document.sections, "Postcondiciones", "Postconditions"); + if (post) fields.postconditions = post.content; + + return 
{ + id: nodeId, kind: KDDKind.USE_CASE, source_file: document.source_path, + source_hash: document.source_hash, layer: document.layer, + status: String(document.front_matter.status ?? "draft"), + aliases: (document.front_matter.aliases as string[]) ?? [], + domain: document.domain, indexed_fields: fields, + indexed_at: new Date().toISOString(), + }; + } + + extractEdges(document: KDDDocument): GraphEdge[] { + const nodeId = makeNodeId(KDDKind.USE_CASE, document.id); + const edges: GraphEdge[] = [...buildWikiLinkEdges(document, nodeId, document.layer)]; + + // UC_APPLIES_RULE + const rules = findSection(document.sections, "Reglas Aplicadas", "Applied Rules", "Rules Applied"); + if (rules) { + for (const link of extractWikiLinks(rules.content)) { + if (link.target.startsWith("BR-") || link.target.startsWith("BP-") || link.target.startsWith("XP-")) { + const toNode = resolveWikiLinkToNodeId(link); + if (toNode) { + edges.push({ + from_node: nodeId, to_node: toNode, edge_type: "UC_APPLIES_RULE", + source_file: document.source_path, extraction_method: "wiki_link", + metadata: {}, layer_violation: false, bidirectional: false, + }); + } + } + } + } + + // UC_EXECUTES_CMD + const cmds = findSection(document.sections, "Comandos Ejecutados", "Commands Executed"); + if (cmds) { + for (const link of extractWikiLinks(cmds.content)) { + if (link.target.startsWith("CMD-")) { + const toNode = resolveWikiLinkToNodeId(link); + if (toNode) { + edges.push({ + from_node: nodeId, to_node: toNode, edge_type: "UC_EXECUTES_CMD", + source_file: document.source_path, extraction_method: "wiki_link", + metadata: {}, layer_violation: false, bidirectional: false, + }); + } + } + } + } + + // UC_STORY from OBJ-* links anywhere + const fullContent = document.sections.map((s) => s.content).join("\n"); + for (const link of extractWikiLinks(fullContent)) { + if (link.target.startsWith("OBJ-")) { + const toNode = resolveWikiLinkToNodeId(link); + if (toNode) { + edges.push({ + from_node: nodeId, 
to_node: toNode, edge_type: "UC_STORY", + source_file: document.source_path, extraction_method: "wiki_link", + metadata: {}, layer_violation: false, bidirectional: false, + }); + } + } + } + + return deduplicateEdges(edges); + } +} diff --git a/src/application/extractors/registry.ts b/src/application/extractors/registry.ts new file mode 100644 index 0000000..1ea20c6 --- /dev/null +++ b/src/application/extractors/registry.ts @@ -0,0 +1,63 @@ +/** + * Extractor registry — maps KDDKind to extractor instances. + */ + +import type { KDDKind } from "../../domain/types.ts"; +import type { Extractor } from "./base.ts"; +import { EntityExtractor } from "./kinds/entity.ts"; +import { EventExtractor } from "./kinds/event.ts"; +import { BusinessRuleExtractor } from "./kinds/business-rule.ts"; +import { BusinessPolicyExtractor } from "./kinds/business-policy.ts"; +import { CrossPolicyExtractor } from "./kinds/cross-policy.ts"; +import { CommandExtractor } from "./kinds/command.ts"; +import { QueryExtractor } from "./kinds/query.ts"; +import { ProcessExtractor } from "./kinds/process.ts"; +import { UseCaseExtractor } from "./kinds/use-case.ts"; +import { UIViewExtractor } from "./kinds/ui-view.ts"; +import { UIComponentExtractor } from "./kinds/ui-component.ts"; +import { RequirementExtractor } from "./kinds/requirement.ts"; +import { ObjectiveExtractor } from "./kinds/objective.ts"; +import { PRDExtractor } from "./kinds/prd.ts"; +import { ADRExtractor } from "./kinds/adr.ts"; +import { GlossaryExtractor } from "./kinds/glossary.ts"; + +export class ExtractorRegistry { + private extractors = new Map(); + + register(extractor: Extractor): void { + this.extractors.set(extractor.kind, extractor); + } + + get(kind: KDDKind): Extractor | undefined { + return this.extractors.get(kind); + } + + get registeredKinds(): Set { + return new Set(this.extractors.keys()); + } + + get size(): number { + return this.extractors.size; + } +} + +export function createDefaultRegistry(): 
ExtractorRegistry { + const registry = new ExtractorRegistry(); + registry.register(new EntityExtractor()); + registry.register(new EventExtractor()); + registry.register(new BusinessRuleExtractor()); + registry.register(new BusinessPolicyExtractor()); + registry.register(new CrossPolicyExtractor()); + registry.register(new CommandExtractor()); + registry.register(new QueryExtractor()); + registry.register(new ProcessExtractor()); + registry.register(new UseCaseExtractor()); + registry.register(new UIViewExtractor()); + registry.register(new UIComponentExtractor()); + registry.register(new RequirementExtractor()); + registry.register(new ObjectiveExtractor()); + registry.register(new PRDExtractor()); + registry.register(new ADRExtractor()); + registry.register(new GlossaryExtractor()); + return registry; +} diff --git a/src/application/queries/coverage-query.ts b/src/application/queries/coverage-query.ts new file mode 100644 index 0000000..e169471 --- /dev/null +++ b/src/application/queries/coverage-query.ts @@ -0,0 +1,96 @@ +/** + * QRY-005 — Governance coverage analysis. 
+ */ + +import { EdgeType, KDDKind, type CoverageCategory, type GraphNode } from "../../domain/types.ts"; +import type { GraphStore } from "../../infra/graph-store.ts"; + +export interface CoverageQueryInput { + nodeId: string; +} + +export interface CoverageQueryResult { + analyzedNode: GraphNode | undefined; + categories: CoverageCategory[]; + present: number; + missing: number; + coveragePercent: number; +} + +type CoverageRule = [name: string, description: string, edgeType: string]; + +const COVERAGE_RULES: Partial> = { + [KDDKind.ENTITY]: [ + ["events", "Domain events emitted by this entity", EdgeType.EMITS], + ["business_rules", "Business rules for this entity", EdgeType.ENTITY_RULE], + ["use_cases", "Use cases involving this entity", EdgeType.WIKI_LINK], + ], + [KDDKind.COMMAND]: [ + ["events", "Events emitted by this command", EdgeType.EMITS], + ["use_cases", "Use cases that execute this command", EdgeType.UC_EXECUTES_CMD], + ], + [KDDKind.USE_CASE]: [ + ["commands", "Commands executed by this use case", EdgeType.UC_EXECUTES_CMD], + ["rules", "Business rules applied", EdgeType.UC_APPLIES_RULE], + ["requirements", "Requirements tracing to this UC", EdgeType.REQ_TRACES_TO], + ], + [KDDKind.BUSINESS_RULE]: [ + ["entity", "Entity this rule validates", EdgeType.ENTITY_RULE], + ["use_cases", "Use cases that apply this rule", EdgeType.UC_APPLIES_RULE], + ], + [KDDKind.REQUIREMENT]: [ + ["traces", "Artifacts this requirement traces to", EdgeType.REQ_TRACES_TO], + ], +}; + +export function coverageQuery( + input: CoverageQueryInput, + graphStore: GraphStore, +): CoverageQueryResult { + const { nodeId } = input; + + if (!graphStore.hasNode(nodeId)) { + throw new Error(`NODE_NOT_FOUND: ${nodeId}`); + } + + const node = graphStore.getNode(nodeId); + if (!node) throw new Error(`NODE_NOT_FOUND: ${nodeId}`); + + const rules = COVERAGE_RULES[node.kind as KDDKind]; + if (!rules) { + throw new Error(`UNKNOWN_KIND: no coverage rules for kind '${node.kind}'`); + } + + const 
incoming = graphStore.incomingEdges(nodeId); + const outgoing = graphStore.outgoingEdges(nodeId); + const allEdges = [...incoming, ...outgoing]; + + const categories: CoverageCategory[] = []; + let present = 0; + let missing = 0; + + for (const [catName, catDesc, edgeType] of rules) { + const foundIds: string[] = []; + for (const edge of allEdges) { + if (edge.edge_type === edgeType) { + const other = edge.from_node === nodeId ? edge.to_node : edge.from_node; + if (!foundIds.includes(other)) { + foundIds.push(other); + } + } + } + + if (foundIds.length > 0) { + present++; + categories.push({ name: catName, description: catDesc, edge_type: edgeType, status: "covered", found: foundIds }); + } else { + missing++; + categories.push({ name: catName, description: catDesc, edge_type: edgeType, status: "missing", found: [] }); + } + } + + const total = present + missing; + const coveragePercent = total > 0 ? Math.round((present / total) * 1000) / 10 : 0; + + return { analyzedNode: node, categories, present, missing, coveragePercent }; +} diff --git a/src/application/queries/graph-query.ts b/src/application/queries/graph-query.ts new file mode 100644 index 0000000..5f3a473 --- /dev/null +++ b/src/application/queries/graph-query.ts @@ -0,0 +1,101 @@ +/** + * QRY-001 — Graph traversal from a root node. 
+ */ + +import type { GraphEdge, GraphNode, KDDKind, ScoredNode } from "../../domain/types.ts"; +import type { GraphStore } from "../../infra/graph-store.ts"; + +export interface GraphQueryInput { + rootNode: string; + depth?: number; + edgeTypes?: string[]; + includeKinds?: KDDKind[]; + respectLayers?: boolean; +} + +export interface GraphQueryResult { + centerNode: GraphNode | undefined; + relatedNodes: ScoredNode[]; + edges: GraphEdge[]; + totalNodes: number; + totalEdges: number; +} + +export function graphQuery( + input: GraphQueryInput, + graphStore: GraphStore, +): GraphQueryResult { + const { rootNode, depth = 2, edgeTypes, includeKinds, respectLayers = true } = input; + + if (!graphStore.hasNode(rootNode)) { + throw new Error(`NODE_NOT_FOUND: ${rootNode}`); + } + + let [nodes, edges] = graphStore.traverse(rootNode, depth, edgeTypes, respectLayers); + + if (includeKinds) { + const kindSet = new Set(includeKinds); + nodes = nodes.filter((n) => kindSet.has(n.kind)); + } + + const center = graphStore.getNode(rootNode); + + const scored: ScoredNode[] = []; + for (const node of nodes) { + if (node.id === rootNode) continue; + const dist = estimateDistance(node.id, rootNode, edges); + const score = 1.0 / (1.0 + dist); + scored.push({ + node_id: node.id, + score, + snippet: buildSnippet(node), + match_source: "graph", + }); + } + + scored.sort((a, b) => b.score - a.score); + + return { + centerNode: center, + relatedNodes: scored, + edges, + totalNodes: scored.length + (center ? 
1 : 0), + totalEdges: edges.length, + }; +} + +function estimateDistance( + nodeId: string, + rootId: string, + edges: GraphEdge[], +): number { + const adj = new Map>(); + for (const e of edges) { + if (!adj.has(e.from_node)) adj.set(e.from_node, new Set()); + adj.get(e.from_node)!.add(e.to_node); + if (!adj.has(e.to_node)) adj.set(e.to_node, new Set()); + adj.get(e.to_node)!.add(e.from_node); + } + + const visited = new Set([rootId]); + const queue: Array<[string, number]> = [[rootId, 0]]; + + while (queue.length > 0) { + const [current, dist] = queue.shift()!; + if (current === nodeId) return dist; + for (const neighbor of adj.get(current) ?? []) { + if (!visited.has(neighbor)) { + visited.add(neighbor); + queue.push([neighbor, dist + 1]); + } + } + } + + return 999; +} + +function buildSnippet(node: GraphNode): string { + const title = node.indexed_fields.title; + if (title) return `[${node.kind}] ${title}`; + return `[${node.kind}] ${node.id}`; +} diff --git a/src/application/queries/hybrid-search.ts b/src/application/queries/hybrid-search.ts new file mode 100644 index 0000000..a23cd24 --- /dev/null +++ b/src/application/queries/hybrid-search.ts @@ -0,0 +1,207 @@ +/** + * QRY-003 — Hybrid search (semantic + lexical + graph + fusion). 
+ */ + +import { KIND_PREFIX, type GraphEdge, type KDDKind, type KDDLayer, type ScoredNode } from "../../domain/types.ts"; +import type { GraphStore } from "../../infra/graph-store.ts"; +import type { VectorStore } from "../../infra/vector-store.ts"; + +const WEIGHT_SEMANTIC = 0.6; +const WEIGHT_GRAPH = 0.3; +const WEIGHT_LEXICAL = 0.1; +const CHARS_PER_TOKEN = 4; + +export interface HybridSearchInput { + queryText: string; + expandGraph?: boolean; + depth?: number; + includeKinds?: KDDKind[]; + includeLayers?: KDDLayer[]; + respectLayers?: boolean; + minScore?: number; + limit?: number; + maxTokens?: number; +} + +export interface HybridSearchResult { + results: ScoredNode[]; + graphExpansion: GraphEdge[]; + totalResults: number; + totalTokens: number; + warnings: string[]; +} + +export async function hybridSearch( + input: HybridSearchInput, + graphStore: GraphStore, + vectorStore: VectorStore | null, + encodeFn: ((texts: string[]) => Promise) | null, +): Promise { + const { + queryText, + expandGraph = true, + depth = 2, + includeKinds, + includeLayers, + respectLayers = true, + minScore = 0.5, + limit = 10, + maxTokens = 8000, + } = input; + + if (queryText.trim().length < 3) { + throw new Error("QUERY_TOO_SHORT: query_text must be at least 3 characters"); + } + + const warnings: string[] = []; + const scores = new Map>(); + + // Phase 1: Semantic search + if (vectorStore && encodeFn) { + const vectors = await encodeFn([queryText]); + const matches = vectorStore.search(vectors[0]!, limit * 3, minScore * 0.8); + + for (const [embId, score] of matches) { + const nodeId = embIdToNodeId(embId, graphStore); + if (!nodeId) continue; + const existing = scores.get(nodeId) ?? new Map(); + existing.set("semantic", Math.max(existing.get("semantic") ?? 
0, score)); + scores.set(nodeId, existing); + } + } else { + warnings.push("NO_EMBEDDINGS: index is L1, semantic search skipped"); + } + + // Phase 2: Lexical search + const lexicalNodes = graphStore.textSearch(queryText); + for (const node of lexicalNodes) { + if (kindLayerFilter(node, includeKinds, includeLayers)) { + const existing = scores.get(node.id) ?? new Map(); + existing.set("lexical", 0.5); + scores.set(node.id, existing); + } + } + + // Phase 3: Graph expansion + const allGraphEdges: GraphEdge[] = []; + if (expandGraph) { + const seedIds = [...scores.keys()]; + for (const seedId of seedIds) { + if (!graphStore.hasNode(seedId)) continue; + const [nodes, edges] = graphStore.traverse(seedId, depth, undefined, respectLayers); + allGraphEdges.push(...edges); + for (const n of nodes) { + if (n.id === seedId) continue; + if (kindLayerFilter(n, includeKinds, includeLayers)) { + const existing = scores.get(n.id) ?? new Map(); + existing.set("graph", 0.5); + scores.set(n.id, existing); + } + } + } + } + + // Phase 4: Fusion scoring + const fused: ScoredNode[] = []; + for (const [nodeId, sources] of scores) { + const node = graphStore.getNode(nodeId); + if (!node) continue; + if (!kindLayerFilter(node, includeKinds, includeLayers)) continue; + + const score = computeFusionScore(sources); + if (score < minScore) continue; + + fused.push({ + node_id: nodeId, + score, + snippet: buildSnippet(node), + match_source: determineMatchSource(sources), + }); + } + + fused.sort((a, b) => b.score - a.score); + + // Token truncation + const finalResults: ScoredNode[] = []; + let totalTokens = 0; + for (const scored of fused) { + const snippetTokens = countTokens(scored.snippet); + if (totalTokens + snippetTokens > maxTokens && finalResults.length > 0) break; + finalResults.push(scored); + totalTokens += snippetTokens; + if (finalResults.length >= limit) break; + } + + const seen = new Set(); + const uniqueEdges = allGraphEdges.filter((e) => { + const key = 
`${e.from_node}|${e.to_node}|${e.edge_type}`; + if (seen.has(key)) return false; + seen.add(key); + return true; + }); + + return { + results: finalResults, + graphExpansion: uniqueEdges, + totalResults: finalResults.length, + totalTokens, + warnings, + }; +} + +function embIdToNodeId(embId: string, graphStore: GraphStore): string | null { + const docId = embId.includes(":chunk-") + ? embId.split(":chunk-")[0]! + : embId.split(":")[0]!; + + for (const prefix of Object.values(KIND_PREFIX)) { + const candidate = `${prefix}:${docId}`; + if (graphStore.hasNode(candidate)) return candidate; + } + if (graphStore.hasNode(docId)) return docId; + return null; +} + +function kindLayerFilter( + node: { kind: string; layer: string }, + includeKinds?: KDDKind[], + includeLayers?: KDDLayer[], +): boolean { + if (includeKinds && !includeKinds.includes(node.kind as KDDKind)) return false; + if (includeLayers && !includeLayers.includes(node.layer as KDDLayer)) return false; + return true; +} + +function computeFusionScore(sources: Map): number { + const semantic = sources.get("semantic") ?? 0; + const graph = sources.get("graph") ?? 0; + const lexical = sources.get("lexical") ?? 0; + + const sourceCount = [...sources.values()].filter((v) => v > 0).length; + const bonus = sourceCount > 1 ? 0.1 * (sourceCount - 1) : 0; + + const weighted = + semantic * WEIGHT_SEMANTIC + graph * WEIGHT_GRAPH + lexical * WEIGHT_LEXICAL + bonus; + + return Math.min(weighted / (WEIGHT_SEMANTIC + WEIGHT_GRAPH + WEIGHT_LEXICAL + 0.2), 1.0); +} + +function determineMatchSource(sources: Map): string { + const hasSemantic = (sources.get("semantic") ?? 0) > 0; + const hasGraph = (sources.get("graph") ?? 
0) > 0; + + if (hasSemantic && hasGraph) return "fusion"; + if (hasSemantic) return "semantic"; + if (hasGraph) return "graph"; + return "lexical"; +} + +function buildSnippet(node: { kind: string; id: string; indexed_fields: Record }): string { + const title = node.indexed_fields.title; + if (title) return `[${node.kind}] ${title}`; + return `[${node.kind}] ${node.id}`; +} + +function countTokens(text: string): number { + return Math.max(1, Math.floor(text.length / CHARS_PER_TOKEN)); +} diff --git a/src/application/queries/impact-query.ts b/src/application/queries/impact-query.ts new file mode 100644 index 0000000..6e5ad97 --- /dev/null +++ b/src/application/queries/impact-query.ts @@ -0,0 +1,139 @@ +/** + * QRY-004 — Impact analysis (reverse BFS). + */ + +import { EdgeType, type GraphEdge, type GraphNode } from "../../domain/types.ts"; +import type { GraphStore } from "../../infra/graph-store.ts"; + +export interface ImpactQueryInput { + nodeId: string; + changeType?: string; + depth?: number; +} + +export interface AffectedNode { + node_id: string; + kind: string; + edge_type: string; + impact_description: string; +} + +export interface TransitivelyAffected { + node_id: string; + kind: string; + path: string[]; + edge_types: string[]; +} + +export interface ScenarioToRerun { + node_id: string; + scenario_name: string; + reason: string; +} + +export interface ImpactQueryResult { + analyzedNode: GraphNode | undefined; + directlyAffected: AffectedNode[]; + transitivelyAffected: TransitivelyAffected[]; + scenariosToRerun: ScenarioToRerun[]; + totalDirectly: number; + totalTransitively: number; +} + +export function impactQuery( + input: ImpactQueryInput, + graphStore: GraphStore, +): ImpactQueryResult { + const { nodeId, changeType = "modify_attribute", depth = 3 } = input; + + if (!graphStore.hasNode(nodeId)) { + throw new Error(`NODE_NOT_FOUND: ${nodeId}`); + } + + const analyzed = graphStore.getNode(nodeId); + + // Phase 1: Direct dependents (incoming edges) + 
const directEdges = graphStore.incomingEdges(nodeId); + const directlyAffected: AffectedNode[] = []; + const directIds = new Set(); + + for (const edge of directEdges) { + const predNode = graphStore.getNode(edge.from_node); + if (!predNode) continue; + directIds.add(predNode.id); + directlyAffected.push({ + node_id: predNode.id, + kind: predNode.kind, + edge_type: edge.edge_type, + impact_description: describeImpact(edge, changeType), + }); + } + + // Phase 2: Transitive dependents + const transitivelyAffected: TransitivelyAffected[] = []; + if (depth > 1) { + const reverseResults = graphStore.reverseTraverse(nodeId, depth); + for (const [node, pathEdges] of reverseResults) { + if (directIds.has(node.id) || node.id === nodeId) continue; + const pathIds = [nodeId]; + const edgeTypes: string[] = []; + for (const e of pathEdges) { + pathIds.push(e.from_node); + edgeTypes.push(e.edge_type); + } + transitivelyAffected.push({ + node_id: node.id, + kind: node.kind, + path: pathIds, + edge_types: edgeTypes, + }); + } + } + + // Phase 3: Find BDD scenarios + const scenarios: ScenarioToRerun[] = []; + const allAffectedIds = new Set([ + ...directIds, + ...transitivelyAffected.map((t) => t.node_id), + nodeId, + ]); + + for (const edge of graphStore.allEdges()) { + if (edge.edge_type === EdgeType.VALIDATES && allAffectedIds.has(edge.to_node)) { + const featureNode = graphStore.getNode(edge.from_node); + if (featureNode) { + scenarios.push({ + node_id: featureNode.id, + scenario_name: (featureNode.indexed_fields.title as string) ?? 
featureNode.id, + reason: `Validates ${edge.to_node} which is affected`, + }); + } + } + } + + return { + analyzedNode: analyzed, + directlyAffected, + transitivelyAffected, + scenariosToRerun: scenarios, + totalDirectly: directlyAffected.length, + totalTransitively: transitivelyAffected.length, + }; +} + +const IMPACT_DESC: Record = { + ENTITY_RULE: "Business rule validates this entity", + UC_APPLIES_RULE: "Use case applies this rule", + UC_EXECUTES_CMD: "Use case executes this command", + EMITS: "Emits this event", + CONSUMES: "Consumes this event", + WIKI_LINK: "References this artifact", + DOMAIN_RELATION: "Has a domain relationship", + REQ_TRACES_TO: "Requirement traces to this artifact", + VALIDATES: "Validates this artifact via BDD scenarios", +}; + +function describeImpact(edge: GraphEdge, changeType: string): string { + const desc = IMPACT_DESC[edge.edge_type] ?? `Connected via ${edge.edge_type}`; + return `${desc} — change type: ${changeType}`; +} diff --git a/src/application/queries/semantic-query.ts b/src/application/queries/semantic-query.ts new file mode 100644 index 0000000..08ec00a --- /dev/null +++ b/src/application/queries/semantic-query.ts @@ -0,0 +1,91 @@ +/** + * QRY-002 — Semantic search (pure vector, no graph expansion). 
+ */ + +import { KIND_PREFIX, type KDDKind, type KDDLayer, type ScoredNode } from "../../domain/types.ts"; +import type { GraphStore } from "../../infra/graph-store.ts"; +import type { VectorStore } from "../../infra/vector-store.ts"; + +export interface SemanticQueryInput { + queryText: string; + includeKinds?: KDDKind[]; + includeLayers?: KDDLayer[]; + minScore?: number; + limit?: number; +} + +export interface SemanticQueryResult { + results: ScoredNode[]; + totalResults: number; + embeddingModel: string; +} + +export async function semanticQuery( + input: SemanticQueryInput, + vectorStore: VectorStore, + graphStore: GraphStore, + encodeFn: (texts: string[]) => Promise, + modelName: string, +): Promise { + const { + queryText, + includeKinds, + includeLayers, + minScore = 0.7, + limit = 10, + } = input; + + if (queryText.trim().length < 3) { + throw new Error("QUERY_TOO_SHORT: query_text must be at least 3 characters"); + } + + const vectors = await encodeFn([queryText]); + const matches = vectorStore.search(vectors[0]!, limit * 3, minScore); + + const seenNodes = new Set(); + const results: ScoredNode[] = []; + + for (const [embId, score] of matches) { + const docId = embId.includes(":chunk-") + ? embId.split(":chunk-")[0]! 
+ : embId.split(":")[0]!; + + const node = findNodeForDoc(docId, graphStore); + if (!node) continue; + + if (seenNodes.has(node.id)) continue; + seenNodes.add(node.id); + + if (includeKinds && !includeKinds.includes(node.kind as KDDKind)) continue; + if (includeLayers && !includeLayers.includes(node.layer as KDDLayer)) continue; + + results.push({ + node_id: node.id, + score, + snippet: buildSnippet(node), + match_source: "semantic", + }); + + if (results.length >= limit) break; + } + + return { + results, + totalResults: results.length, + embeddingModel: modelName, + }; +} + +function findNodeForDoc(docId: string, graphStore: GraphStore) { + for (const prefix of Object.values(KIND_PREFIX)) { + const node = graphStore.getNode(`${prefix}:${docId}`); + if (node) return node; + } + return graphStore.getNode(docId); +} + +function buildSnippet(node: { kind: string; id: string; indexed_fields: Record }): string { + const title = node.indexed_fields.title; + if (title) return `[${node.kind}] ${title}`; + return `[${node.kind}] ${node.id}`; +} diff --git a/src/application/queries/violations-query.ts b/src/application/queries/violations-query.ts new file mode 100644 index 0000000..12aac10 --- /dev/null +++ b/src/application/queries/violations-query.ts @@ -0,0 +1,71 @@ +/** + * QRY-006 — Layer violation detection. 
+ */ + +import { KDDLayer, type GraphEdge, type KDDKind, type LayerViolation } from "../../domain/types.ts"; +import type { GraphStore } from "../../infra/graph-store.ts"; + +export interface ViolationsQueryInput { + includeKinds?: KDDKind[]; + includeLayers?: KDDLayer[]; +} + +export interface ViolationsQueryResult { + violations: LayerViolation[]; + totalViolations: number; + totalEdgesAnalyzed: number; + violationRate: number; +} + +export function violationsQuery( + input: ViolationsQueryInput, + graphStore: GraphStore, +): ViolationsQueryResult { + const { includeKinds, includeLayers } = input; + + const allEdges = graphStore.allEdges(); + let violationEdges = graphStore.findViolations(); + + if (includeKinds || includeLayers) { + violationEdges = violationEdges.filter((edge) => { + const fromNode = graphStore.getNode(edge.from_node); + const toNode = graphStore.getNode(edge.to_node); + + if (includeKinds) { + const fromMatch = fromNode && includeKinds.includes(fromNode.kind as KDDKind); + const toMatch = toNode && includeKinds.includes(toNode.kind as KDDKind); + if (!fromMatch && !toMatch) return false; + } + + if (includeLayers) { + const fromMatch = fromNode && includeLayers.includes(fromNode.layer as KDDLayer); + const toMatch = toNode && includeLayers.includes(toNode.layer as KDDLayer); + if (!fromMatch && !toMatch) return false; + } + + return true; + }); + } + + const violations: LayerViolation[] = violationEdges.map((edge) => { + const fromNode = graphStore.getNode(edge.from_node); + const toNode = graphStore.getNode(edge.to_node); + return { + from_node: edge.from_node, + to_node: edge.to_node, + from_layer: fromNode?.layer as KDDLayer ?? KDDLayer.DOMAIN, + to_layer: toNode?.layer as KDDLayer ?? KDDLayer.DOMAIN, + edge_type: edge.edge_type, + }; + }); + + const total = allEdges.length; + const rate = total > 0 ? 
Math.round((violations.length / total) * 10000) / 100 : 0; + + return { + violations, + totalViolations: violations.length, + totalEdgesAnalyzed: total, + violationRate: rate, + }; +} diff --git a/src/cli.ts b/src/cli.ts new file mode 100644 index 0000000..2bf6af8 --- /dev/null +++ b/src/cli.ts @@ -0,0 +1,318 @@ +/** + * kdd CLI — TypeScript/Bun implementation. + * + * Subcommands: index, search, graph, impact, semantic, coverage, violations + */ + +import { defineCommand, runMain } from "citty"; +import { resolve } from "node:path"; +import { Glob } from "bun"; +import { createContainer } from "./container.ts"; +import { hybridSearch } from "./application/queries/hybrid-search.ts"; +import { graphQuery } from "./application/queries/graph-query.ts"; +import { impactQuery } from "./application/queries/impact-query.ts"; +import { semanticQuery } from "./application/queries/semantic-query.ts"; +import { coverageQuery } from "./application/queries/coverage-query.ts"; +import { violationsQuery } from "./application/queries/violations-query.ts"; +import { indexDocument } from "./application/commands/index-document.ts"; +import { createDefaultRegistry } from "./application/extractors/registry.ts"; +import { ArtifactWriter } from "./infra/artifact-writer.ts"; +import { createEncoder } from "./infra/embedding-model.ts"; +import { detectIndexLevel } from "./domain/rules.ts"; +import { IndexLevel, type KDDKind, type KDDLayer, type Manifest } from "./domain/types.ts"; + +// ── Index command ─────────────────────────────────────────────────── + +const indexCmd = defineCommand({ + meta: { name: "index", description: "Index KDD specs into .kdd-index/" }, + args: { + specsPath: { type: "positional", description: "Path to specs directory", required: true }, + "index-path": { type: "string", description: "Output .kdd-index/ path", default: ".kdd-index" }, + domain: { type: "string", description: "Domain name" }, + level: { type: "string", description: "Index level: L1 (graph only) 
or L2 (graph + embeddings)", default: "L2" }, + }, + async run({ args }) { + const specsRoot = resolve(args.specsPath); + const indexPath = resolve(args["index-path"]); + const domain = args.domain ?? null; + + const indexLevel = args.level === "L1" ? IndexLevel.L1 : IndexLevel.L2; + + console.log(`Indexing specs from: ${specsRoot}`); + console.log(`Output: ${indexPath}`); + console.log(`Level: ${indexLevel}`); + + const registry = createDefaultRegistry(); + const writer = new ArtifactWriter(indexPath); + + // Clear previous edges + await writer.clearEdges(); + + let encodeFn: ((texts: string[]) => Promise) | null = null; + let modelName: string | undefined; + let modelDimensions: number | undefined; + + if (indexLevel !== IndexLevel.L1) { + modelName = "all-mpnet-base-v2"; + modelDimensions = 768; + console.log(`Loading embedding model: ${modelName}...`); + encodeFn = createEncoder(modelName); + } + + const glob = new Glob("**/*.md"); + const files: string[] = []; + for await (const path of glob.scan({ cwd: specsRoot, absolute: true })) { + files.push(path); + } + files.sort(); + + console.log(`Found ${files.length} markdown files\n`); + + let nodeCount = 0; + let edgeCount = 0; + let embeddingCount = 0; + let skippedCount = 0; + const domains = new Set(); + + for (const filePath of files) { + const result = await indexDocument(filePath, { + specsRoot, + registry, + artifactWriter: writer, + encodeFn, + modelName, + modelDimensions, + indexLevel, + domain, + }); + + if (result.success) { + nodeCount++; + edgeCount += result.edge_count; + embeddingCount += result.embedding_count; + if (domain) domains.add(domain); + const icon = result.warning ? 
"⚠" : "✓"; + console.log(` ${icon} ${result.node_id} (${result.edge_count} edges, ${result.embedding_count} embeddings)`); + if (result.warning) console.log(` Warning: ${result.warning}`); + } else { + skippedCount++; + } + } + + // Write manifest + let gitCommit: string | null = null; + try { + const proc = Bun.spawn(["git", "rev-parse", "HEAD"], { stdout: "pipe" }); + gitCommit = (await new Response(proc.stdout).text()).trim() || null; + } catch { /* not a git repo */ } + + const manifest: Manifest = { + version: "1.0.0", + kdd_version: "1.0.0", + embedding_model: modelName ?? null, + embedding_dimensions: modelDimensions ?? null, + indexed_at: new Date().toISOString(), + indexed_by: "kdd-ts", + structure: "flat", + index_level: indexLevel, + stats: { nodes: nodeCount, edges: edgeCount, embeddings: embeddingCount, enrichments: 0 }, + domains: [...domains], + git_commit: gitCommit, + }; + await writer.writeManifest(manifest); + + console.log(`\nDone: ${nodeCount} nodes, ${edgeCount} edges, ${embeddingCount} embeddings (${skippedCount} skipped)`); + }, +}); + +// ── Search subcommands ────────────────────────────────────────────── + +const searchCmd = defineCommand({ + meta: { name: "search", description: "Hybrid search (semantic + lexical + graph)" }, + args: { + query: { type: "positional", description: "Search query text", required: true }, + "index-path": { type: "string", description: "Path to .kdd-index/", default: ".kdd-index" }, + "min-score": { type: "string", description: "Minimum score threshold", default: "0.3" }, + n: { type: "string", description: "Max results", default: "10" }, + kind: { type: "string", description: "Filter by kind (comma-separated)" }, + "no-embeddings": { type: "boolean", description: "Skip embedding model loading", default: false }, + }, + async run({ args }) { + const indexPath = resolve(args["index-path"]); + const container = await createContainer(indexPath, { + skipEmbeddings: args["no-embeddings"], + }); + + const 
includeKinds = args.kind + ? (args.kind.split(",") as KDDKind[]) + : undefined; + + const result = await hybridSearch( + { + queryText: args.query, + minScore: parseFloat(args["min-score"]), + limit: parseInt(args.n, 10), + includeKinds, + }, + container.graphStore, + container.vectorStore, + container.encodeFn, + ); + + console.log(JSON.stringify(result, null, 2)); + }, +}); + +const graphCmd = defineCommand({ + meta: { name: "graph", description: "Graph traversal from a root node" }, + args: { + root: { type: "positional", description: "Root node ID (e.g. Entity:KDDDocument)", required: true }, + "index-path": { type: "string", description: "Path to .kdd-index/", default: ".kdd-index" }, + depth: { type: "string", description: "Traversal depth", default: "2" }, + kind: { type: "string", description: "Filter by kind (comma-separated)" }, + }, + async run({ args }) { + const indexPath = resolve(args["index-path"]); + const container = await createContainer(indexPath, { skipEmbeddings: true }); + + const includeKinds = args.kind + ? 
(args.kind.split(",") as KDDKind[]) + : undefined; + + const result = graphQuery( + { + rootNode: args.root, + depth: parseInt(args.depth, 10), + includeKinds, + }, + container.graphStore, + ); + + console.log(JSON.stringify(result, null, 2)); + }, +}); + +const impactCmd = defineCommand({ + meta: { name: "impact", description: "Impact analysis (reverse BFS)" }, + args: { + node: { type: "positional", description: "Node ID to analyze", required: true }, + "index-path": { type: "string", description: "Path to .kdd-index/", default: ".kdd-index" }, + depth: { type: "string", description: "Analysis depth", default: "3" }, + }, + async run({ args }) { + const indexPath = resolve(args["index-path"]); + const container = await createContainer(indexPath, { skipEmbeddings: true }); + + const result = impactQuery( + { + nodeId: args.node, + depth: parseInt(args.depth, 10), + }, + container.graphStore, + ); + + console.log(JSON.stringify(result, null, 2)); + }, +}); + +const semanticCmd = defineCommand({ + meta: { name: "semantic", description: "Pure semantic search (vector only)" }, + args: { + query: { type: "positional", description: "Search query text", required: true }, + "index-path": { type: "string", description: "Path to .kdd-index/", default: ".kdd-index" }, + "min-score": { type: "string", description: "Minimum score threshold", default: "0.7" }, + n: { type: "string", description: "Max results", default: "10" }, + kind: { type: "string", description: "Filter by kind (comma-separated)" }, + }, + async run({ args }) { + const indexPath = resolve(args["index-path"]); + const container = await createContainer(indexPath); + + if (!container.vectorStore || !container.encodeFn) { + console.error("Error: No embeddings found in index. Semantic search requires L2+ index."); + process.exit(1); + } + + const includeKinds = args.kind + ? 
(args.kind.split(",") as KDDKind[]) + : undefined; + + const result = await semanticQuery( + { + queryText: args.query, + minScore: parseFloat(args["min-score"]), + limit: parseInt(args.n, 10), + includeKinds, + }, + container.vectorStore, + container.graphStore, + container.encodeFn, + container.modelName ?? "unknown", + ); + + console.log(JSON.stringify(result, null, 2)); + }, +}); + +const coverageCmd = defineCommand({ + meta: { name: "coverage", description: "Governance coverage analysis" }, + args: { + node: { type: "positional", description: "Node ID to analyze (e.g. Entity:KDDDocument)", required: true }, + "index-path": { type: "string", description: "Path to .kdd-index/", default: ".kdd-index" }, + }, + async run({ args }) { + const indexPath = resolve(args["index-path"]); + const container = await createContainer(indexPath, { skipEmbeddings: true }); + + const result = coverageQuery( + { nodeId: args.node }, + container.graphStore, + ); + + console.log(JSON.stringify(result, null, 2)); + }, +}); + +const violationsCmd = defineCommand({ + meta: { name: "violations", description: "Detect layer dependency violations" }, + args: { + "index-path": { type: "string", description: "Path to .kdd-index/", default: ".kdd-index" }, + kind: { type: "string", description: "Filter by kind (comma-separated)" }, + layer: { type: "string", description: "Filter by layer (comma-separated)" }, + }, + async run({ args }) { + const indexPath = resolve(args["index-path"]); + const container = await createContainer(indexPath, { skipEmbeddings: true }); + + const includeKinds = args.kind + ? (args.kind.split(",") as KDDKind[]) + : undefined; + const includeLayers = args.layer + ? 
(args.layer.split(",") as KDDLayer[]) + : undefined; + + const result = violationsQuery( + { includeKinds, includeLayers }, + container.graphStore, + ); + + console.log(JSON.stringify(result, null, 2)); + }, +}); + +// ── Main ──────────────────────────────────────────────────────────── + +const main = defineCommand({ + meta: { name: "kdd", version: "1.0.0", description: "KDD specification toolkit (TypeScript/Bun)" }, + subCommands: { + index: indexCmd, + search: searchCmd, + graph: graphCmd, + impact: impactCmd, + semantic: semanticCmd, + coverage: coverageCmd, + violations: violationsCmd, + }, +}); + +runMain(main); diff --git a/src/container.ts b/src/container.ts new file mode 100644 index 0000000..142daa7 --- /dev/null +++ b/src/container.ts @@ -0,0 +1,48 @@ +/** + * Container — wires up artifact loading and store initialization. + */ + +import { loadAllEmbeddings, loadAllNodes, loadEdges, loadManifest } from "./infra/artifact-loader.ts"; +import { createEncoder } from "./infra/embedding-model.ts"; +import { GraphStore } from "./infra/graph-store.ts"; +import { VectorStore } from "./infra/vector-store.ts"; +import type { Manifest } from "./domain/types.ts"; + +export interface Container { + manifest: Manifest; + graphStore: GraphStore; + vectorStore: VectorStore | null; + encodeFn: ((texts: string[]) => Promise) | null; + modelName: string | null; +} + +export async function createContainer( + indexPath: string, + options: { skipEmbeddings?: boolean } = {}, +): Promise { + const manifest = await loadManifest(indexPath); + + const [nodes, edges] = await Promise.all([ + loadAllNodes(indexPath), + loadEdges(indexPath), + ]); + + const graphStore = new GraphStore(); + graphStore.load(nodes, edges); + + let vectorStore: VectorStore | null = null; + let encodeFn: ((texts: string[]) => Promise) | null = null; + let modelName: string | null = null; + + const hasEmbeddings = manifest.stats.embeddings > 0; + if (hasEmbeddings && !options.skipEmbeddings) { + const 
embeddings = await loadAllEmbeddings(indexPath); + vectorStore = new VectorStore(); + vectorStore.load(embeddings); + + modelName = embeddings[0]?.model ?? manifest.embedding_model ?? null; + encodeFn = createEncoder(modelName ?? undefined); + } + + return { manifest, graphStore, vectorStore, encodeFn, modelName }; +} diff --git a/src/domain/rules.ts b/src/domain/rules.ts new file mode 100644 index 0000000..1e5453f --- /dev/null +++ b/src/domain/rules.ts @@ -0,0 +1,117 @@ +/** + * Business rules as pure functions. + * + * BR-DOCUMENT-001, BR-EMBEDDING-001, BR-INDEX-001, BR-LAYER-001 + */ + +import { IndexLevel, KDDKind, KDDLayer, LAYER_NUMERIC } from "./types.ts"; + +// ── BR-DOCUMENT-001 — Kind Router ──────────────────────────────────── + +export const KIND_LOOKUP: Record = Object.fromEntries( + Object.values(KDDKind).map((k) => [k, k]), +); + +export const KIND_EXPECTED_PATH: Partial> = { + [KDDKind.ENTITY]: "01-domain/entities/", + [KDDKind.EVENT]: "01-domain/events/", + [KDDKind.BUSINESS_RULE]: "01-domain/rules/", + [KDDKind.BUSINESS_POLICY]: "02-behavior/policies/", + [KDDKind.CROSS_POLICY]: "02-behavior/policies/", + [KDDKind.COMMAND]: "02-behavior/commands/", + [KDDKind.QUERY]: "02-behavior/queries/", + [KDDKind.PROCESS]: "02-behavior/processes/", + [KDDKind.USE_CASE]: "02-behavior/use-cases/", + [KDDKind.UI_VIEW]: "03-experience/views/", + [KDDKind.UI_COMPONENT]: "03-experience/views/", + [KDDKind.REQUIREMENT]: "04-verification/criteria/", + [KDDKind.OBJECTIVE]: "00-requirements/objectives/", + [KDDKind.PRD]: "00-requirements/", + [KDDKind.ADR]: "00-requirements/decisions/", + [KDDKind.GLOSSARY]: "01-domain/glossary/", +}; + +export interface RouteResult { + kind: KDDKind | null; + warning: string | null; +} + +export function routeDocument( + frontMatter: Record | null, + sourcePath: string, +): RouteResult { + if (!frontMatter) return { kind: null, warning: null }; + + const kindStr = String(frontMatter.kind ?? 
"").toLowerCase().trim(); + if (!kindStr || !(kindStr in KIND_LOOKUP)) return { kind: null, warning: null }; + + const kind = KIND_LOOKUP[kindStr]!; + const expected = KIND_EXPECTED_PATH[kind] ?? ""; + let warning: string | null = null; + if (expected && !sourcePath.includes(expected)) { + warning = `${kind} '${sourcePath}' found outside expected path '${expected}'`; + } + + return { kind, warning }; +} + +// ── BR-EMBEDDING-001 — Embedding Strategy ──────────────────────────── + +export const EMBEDDABLE_SECTIONS: Record> = { + [KDDKind.ENTITY]: new Set(["descripción", "description"]), + [KDDKind.EVENT]: new Set(), + [KDDKind.BUSINESS_RULE]: new Set(["declaración", "declaration", "cuándo aplica", "when applies"]), + [KDDKind.BUSINESS_POLICY]: new Set(["declaración", "declaration"]), + [KDDKind.CROSS_POLICY]: new Set(["propósito", "purpose", "declaración", "declaration"]), + [KDDKind.COMMAND]: new Set(["purpose", "propósito"]), + [KDDKind.QUERY]: new Set(["purpose", "propósito"]), + [KDDKind.PROCESS]: new Set(["participantes", "participants", "pasos", "steps"]), + [KDDKind.USE_CASE]: new Set(["descripción", "description", "flujo principal", "main flow"]), + [KDDKind.UI_VIEW]: new Set(["descripción", "description", "comportamiento", "behavior"]), + [KDDKind.UI_COMPONENT]: new Set(["descripción", "description"]), + [KDDKind.REQUIREMENT]: new Set(["descripción", "description"]), + [KDDKind.OBJECTIVE]: new Set(["objetivo", "objective"]), + [KDDKind.PRD]: new Set(["problema / oportunidad", "problem / opportunity"]), + [KDDKind.ADR]: new Set(["contexto", "context", "decisión", "decision"]), + [KDDKind.GLOSSARY]: new Set(["definición", "definition"]), +}; + +export function embeddableSections(kind: KDDKind): Set { + return EMBEDDABLE_SECTIONS[kind] ?? 
new Set();
+}
+
+// ── BR-INDEX-001 — Index Level detection ────────────────────────────
+
+export function detectIndexLevel(
+  embeddingModelAvailable: boolean,
+  agentApiAvailable: boolean,
+): IndexLevel {
+  if (agentApiAvailable && embeddingModelAvailable) return IndexLevel.L3;
+  if (embeddingModelAvailable) return IndexLevel.L2;
+  return IndexLevel.L1;
+}
+
+// ── BR-LAYER-001 — Layer Validation ─────────────────────────────────
+
+const LAYER_BY_PREFIX: Record<string, KDDLayer> = {
+  "00-requirements": KDDLayer.REQUIREMENTS,
+  "01-domain": KDDLayer.DOMAIN,
+  "02-behavior": KDDLayer.BEHAVIOR,
+  "03-experience": KDDLayer.EXPERIENCE,
+  "04-verification": KDDLayer.VERIFICATION,
+};
+
+export function detectLayer(sourcePath: string): KDDLayer | null {
+  for (const [prefix, layer] of Object.entries(LAYER_BY_PREFIX)) {
+    if (sourcePath.includes(prefix)) return layer;
+  }
+  return null;
+}
+
+export function isLayerViolation(
+  originLayer: KDDLayer,
+  destinationLayer: KDDLayer,
+): boolean {
+  if (originLayer === KDDLayer.REQUIREMENTS) return false;
+  return LAYER_NUMERIC[originLayer] < LAYER_NUMERIC[destinationLayer];
+}
diff --git a/src/domain/types.ts b/src/domain/types.ts
new file mode 100644
index 0000000..87e4d20
--- /dev/null
+++ b/src/domain/types.ts
@@ -0,0 +1,212 @@
+/**
+ * Domain types for KDD.
+ * + * Ported from: src/kdd/domain/enums.py + src/kdd/domain/entities.py + */ + +// ── Enums (as const objects for runtime + type safety) ────────────── + +export const KDDKind = { + ENTITY: "entity", + EVENT: "event", + BUSINESS_RULE: "business-rule", + BUSINESS_POLICY: "business-policy", + CROSS_POLICY: "cross-policy", + COMMAND: "command", + QUERY: "query", + PROCESS: "process", + USE_CASE: "use-case", + UI_VIEW: "ui-view", + UI_COMPONENT: "ui-component", + REQUIREMENT: "requirement", + OBJECTIVE: "objective", + PRD: "prd", + ADR: "adr", + GLOSSARY: "glossary", +} as const; +export type KDDKind = (typeof KDDKind)[keyof typeof KDDKind]; + +export const KDDLayer = { + REQUIREMENTS: "00-requirements", + DOMAIN: "01-domain", + BEHAVIOR: "02-behavior", + EXPERIENCE: "03-experience", + VERIFICATION: "04-verification", +} as const; +export type KDDLayer = (typeof KDDLayer)[keyof typeof KDDLayer]; + +/** Numeric ordering for layer violation checks. */ +export const LAYER_NUMERIC: Record = { + "00-requirements": 0, + "01-domain": 1, + "02-behavior": 2, + "03-experience": 3, + "04-verification": 4, +}; + +export const EdgeType = { + WIKI_LINK: "WIKI_LINK", + DOMAIN_RELATION: "DOMAIN_RELATION", + ENTITY_RULE: "ENTITY_RULE", + ENTITY_POLICY: "ENTITY_POLICY", + EMITS: "EMITS", + CONSUMES: "CONSUMES", + UC_APPLIES_RULE: "UC_APPLIES_RULE", + UC_EXECUTES_CMD: "UC_EXECUTES_CMD", + UC_STORY: "UC_STORY", + VIEW_TRIGGERS_UC: "VIEW_TRIGGERS_UC", + VIEW_USES_COMPONENT: "VIEW_USES_COMPONENT", + COMPONENT_USES_ENTITY: "COMPONENT_USES_ENTITY", + REQ_TRACES_TO: "REQ_TRACES_TO", + VALIDATES: "VALIDATES", + DECIDES_FOR: "DECIDES_FOR", + CROSS_DOMAIN_REF: "CROSS_DOMAIN_REF", + GLOSSARY_DEFINES: "GLOSSARY_DEFINES", +} as const; +export type EdgeType = (typeof EdgeType)[keyof typeof EdgeType]; + +export const IndexLevel = { L1: "L1", L2: "L2", L3: "L3" } as const; +export type IndexLevel = (typeof IndexLevel)[keyof typeof IndexLevel]; + +// ── Kind → Node ID prefix mapping 
────────────────────────────────── + +export const KIND_PREFIX: Record = { + entity: "Entity", + event: "Event", + "business-rule": "BR", + "business-policy": "BP", + "cross-policy": "XP", + command: "CMD", + query: "QRY", + process: "PROC", + "use-case": "UC", + "ui-view": "UIView", + "ui-component": "UIComp", + requirement: "REQ", + objective: "OBJ", + prd: "PRD", + adr: "ADR", + glossary: "GLOSS", +}; + +// ── Data interfaces ───────────────────────────────────────────────── + +export interface GraphNode { + id: string; + kind: KDDKind; + source_file: string; + source_hash: string; + layer: KDDLayer; + status: string; + aliases: string[]; + domain: string | null; + indexed_fields: Record; + indexed_at: string; +} + +export interface GraphEdge { + from_node: string; + to_node: string; + edge_type: string; + source_file: string; + extraction_method: string; + metadata: Record; + layer_violation: boolean; + bidirectional: boolean; +} + +export interface Embedding { + id: string; + document_id: string; + document_kind: KDDKind; + section_path: string; + chunk_index: number; + raw_text: string; + context_text: string; + vector: number[]; + model: string; + dimensions: number; + text_hash: string; + generated_at: string; +} + +export interface Manifest { + version: string; + kdd_version: string; + embedding_model: string | null; + embedding_dimensions: number | null; + indexed_at: string; + indexed_by: string; + structure: string; + index_level: IndexLevel; + stats: { + nodes: number; + edges: number; + embeddings: number; + enrichments: number; + }; + domains: string[]; + git_commit: string | null; +} + +export interface ScoredNode { + node_id: string; + score: number; + snippet: string; + match_source: string; +} + +// ── Document model (for indexing pipeline) ────────────────────────── + +export interface Section { + heading: string; + level: number; + content: string; + path: string; +} + +export interface KDDDocument { + id: string; + kind: KDDKind; + 
source_path: string; + source_hash: string; + layer: KDDLayer; + front_matter: Record; + sections: Section[]; + wiki_links: string[]; + domain: string | null; +} + +export interface Chunk { + chunk_id: string; + document_id: string; + section_heading: string; + content: string; + context_text: string; + char_offset: number; +} + +export interface IndexResult { + success: boolean; + node_id?: string; + edge_count: number; + embedding_count: number; + skipped_reason?: string; + warning?: string; +} + +export interface LayerViolation { + from_node: string; + to_node: string; + from_layer: KDDLayer; + to_layer: KDDLayer; + edge_type: string; +} + +export interface CoverageCategory { + name: string; + description: string; + edge_type: string; + status: "covered" | "missing" | "partial"; + found: string[]; +} diff --git a/src/infra/artifact-loader.ts b/src/infra/artifact-loader.ts new file mode 100644 index 0000000..aec46e8 --- /dev/null +++ b/src/infra/artifact-loader.ts @@ -0,0 +1,52 @@ +/** + * Artifact loader — reads .kdd-index/ artifacts. 
+ */
+
+import { join } from "node:path";
+import { Glob } from "bun";
+import type { Embedding, GraphEdge, GraphNode, Manifest } from "../domain/types.ts";
+
+export async function loadManifest(indexPath: string): Promise<Manifest> {
+  return Bun.file(join(indexPath, "manifest.json")).json();
+}
+
+export async function loadAllNodes(indexPath: string): Promise<GraphNode[]> {
+  const nodesDir = join(indexPath, "nodes");
+  const glob = new Glob("**/*.json");
+  const nodes: GraphNode[] = [];
+
+  for await (const path of glob.scan({ cwd: nodesDir, absolute: true })) {
+    const node: GraphNode = await Bun.file(path).json();
+    nodes.push(node);
+  }
+
+  return nodes;
+}
+
+export async function loadEdges(indexPath: string): Promise<GraphEdge[]> {
+  const edgesFile = join(indexPath, "edges", "edges.jsonl");
+  const text = await Bun.file(edgesFile).text();
+  const edges: GraphEdge[] = [];
+
+  for (const line of text.split("\n")) {
+    const trimmed = line.trim();
+    if (trimmed) {
+      edges.push(JSON.parse(trimmed) as GraphEdge);
+    }
+  }
+
+  return edges;
+}
+
+export async function loadAllEmbeddings(indexPath: string): Promise<Embedding[]> {
+  const embDir = join(indexPath, "embeddings");
+  const glob = new Glob("**/*.json");
+  const embeddings: Embedding[] = [];
+
+  for await (const path of glob.scan({ cwd: embDir, absolute: true })) {
+    const chunks: Embedding[] = await Bun.file(path).json();
+    embeddings.push(...chunks);
+  }
+
+  return embeddings;
+}
diff --git a/src/infra/artifact-writer.ts b/src/infra/artifact-writer.ts
new file mode 100644
index 0000000..5b5de7d
--- /dev/null
+++ b/src/infra/artifact-writer.ts
@@ -0,0 +1,115 @@
+/**
+ * Artifact writer — writes .kdd-index/ artifacts.
+ */ + +import { join } from "node:path"; +import { mkdir } from "node:fs/promises"; +import type { Embedding, GraphEdge, GraphNode, Manifest } from "../domain/types.ts"; + +export class ArtifactWriter { + constructor(private indexPath: string) {} + + async writeManifest(manifest: Manifest): Promise { + await mkdir(this.indexPath, { recursive: true }); + const path = join(this.indexPath, "manifest.json"); + await Bun.write(path, JSON.stringify(manifest, null, 2)); + } + + async writeNode(node: GraphNode): Promise { + const docId = node.id.includes(":") ? node.id.split(":").slice(1).join(":") : node.id; + const dir = join(this.indexPath, "nodes", node.kind); + await mkdir(dir, { recursive: true }); + const path = join(dir, `${docId}.json`); + await Bun.write(path, JSON.stringify(node, null, 2)); + } + + async appendEdges(edges: GraphEdge[]): Promise { + const dir = join(this.indexPath, "edges"); + await mkdir(dir, { recursive: true }); + const path = join(dir, "edges.jsonl"); + const lines = edges.map((e) => JSON.stringify(e)).join("\n") + "\n"; + const file = Bun.file(path); + if (await file.exists()) { + const existing = await file.text(); + await Bun.write(path, existing + lines); + } else { + await Bun.write(path, lines); + } + } + + async writeEmbeddings(embeddings: Embedding[]): Promise { + if (embeddings.length === 0) return; + // Group by (kind, document_id) + const byDoc = new Map(); + for (const emb of embeddings) { + const key = `${emb.document_kind}/${emb.document_id}`; + const list = byDoc.get(key) ?? 
[]; + list.push(emb); + byDoc.set(key, list); + } + + for (const [key, docEmbeddings] of byDoc) { + const [kind, docId] = key.split("/", 2) as [string, string]; + const dir = join(this.indexPath, "embeddings", kind); + await mkdir(dir, { recursive: true }); + const path = join(dir, `${docId}.json`); + await Bun.write(path, JSON.stringify(docEmbeddings, null, 2)); + } + } + + async deleteDocumentArtifacts(documentId: string): Promise { + const { readdir, unlink, rmdir } = await import("node:fs/promises"); + const nodesDir = join(this.indexPath, "nodes"); + + try { + const kinds = await readdir(nodesDir); + for (const kind of kinds) { + const path = join(nodesDir, kind, `${documentId}.json`); + const file = Bun.file(path); + if (await file.exists()) { + const data = await file.json(); + const nodeId = data.id ?? ""; + await unlink(path); + await this.removeEdgesForNode(nodeId); + break; + } + } + } catch { /* nodes dir may not exist */ } + + // Delete embeddings + const embDir = join(this.indexPath, "embeddings"); + try { + const kinds = await readdir(embDir); + for (const kind of kinds) { + const path = join(embDir, kind, `${documentId}.json`); + const file = Bun.file(path); + if (await file.exists()) { + await unlink(path); + } + } + } catch { /* embeddings dir may not exist */ } + } + + async clearEdges(): Promise { + const path = join(this.indexPath, "edges", "edges.jsonl"); + const dir = join(this.indexPath, "edges"); + await mkdir(dir, { recursive: true }); + await Bun.write(path, ""); + } + + private async removeEdgesForNode(nodeId: string): Promise { + const path = join(this.indexPath, "edges", "edges.jsonl"); + const file = Bun.file(path); + if (!(await file.exists())) return; + const text = await file.text(); + const kept = text + .split("\n") + .filter((line) => { + if (!line.trim()) return false; + const data = JSON.parse(line); + return data.from_node !== nodeId && data.to_node !== nodeId; + }) + .join("\n"); + await Bun.write(path, kept + (kept ? 
"\n" : "")); + } +} diff --git a/src/infra/embedding-model.ts b/src/infra/embedding-model.ts new file mode 100644 index 0000000..0f9d670 --- /dev/null +++ b/src/infra/embedding-model.ts @@ -0,0 +1,38 @@ +/** + * Embedding model — @huggingface/transformers wrapper. + */ + +import type { FeatureExtractionPipeline } from "@huggingface/transformers"; + +const MODEL_MAP: Record = { + "all-MiniLM-L6-v2": "Xenova/all-MiniLM-L6-v2", + "all-mpnet-base-v2": "Xenova/all-mpnet-base-v2", + "paraphrase-multilingual-MiniLM-L12-v2": "Xenova/paraphrase-multilingual-MiniLM-L12-v2", +}; + +const DEFAULT_MODEL = "Xenova/all-MiniLM-L6-v2"; + +let pipelineInstance: FeatureExtractionPipeline | null = null; +let currentModelId: string | null = null; + +async function getPipeline(modelId: string): Promise { + if (pipelineInstance && currentModelId === modelId) return pipelineInstance; + const { pipeline } = await import("@huggingface/transformers"); + pipelineInstance = (await pipeline("feature-extraction", modelId, { + dtype: "fp32", + })) as FeatureExtractionPipeline; + currentModelId = modelId; + return pipelineInstance; +} + +export function createEncoder(modelName?: string): (texts: string[]) => Promise { + const modelId = modelName + ? (MODEL_MAP[modelName] ?? `Xenova/${modelName}`) + : DEFAULT_MODEL; + + return async (texts: string[]) => { + const pipe = await getPipeline(modelId); + const output = await pipe(texts, { pooling: "mean", normalize: true }); + return output.tolist() as number[][]; + }; +} diff --git a/src/infra/graph-store.ts b/src/infra/graph-store.ts new file mode 100644 index 0000000..c1c0509 --- /dev/null +++ b/src/infra/graph-store.ts @@ -0,0 +1,226 @@ +/** + * Graph store — graphology wrapper with BFS, reverse traversal, text search. 
+ */ + +import Graph from "graphology"; +import type { GraphEdge, GraphNode } from "../domain/types.ts"; + +export class GraphStore { + private graph = new Graph({ multi: true, type: "directed" }); + private nodes = new Map(); + + load(nodes: GraphNode[], edges: GraphEdge[]): void { + this.graph.clear(); + this.nodes.clear(); + + for (const node of nodes) { + this.graph.addNode(node.id, { data: node }); + this.nodes.set(node.id, node); + } + + for (const edge of edges) { + if (!this.graph.hasNode(edge.from_node)) continue; + if (!this.graph.hasNode(edge.to_node)) continue; + const key = `${edge.from_node}→${edge.to_node}:${edge.edge_type}`; + if (!this.graph.hasEdge(key)) { + this.graph.addEdgeWithKey(key, edge.from_node, edge.to_node, { data: edge }); + } + } + } + + addNode(node: GraphNode): void { + if (this.graph.hasNode(node.id)) { + this.graph.replaceNodeAttributes(node.id, { data: node }); + } else { + this.graph.addNode(node.id, { data: node }); + } + this.nodes.set(node.id, node); + } + + addEdge(edge: GraphEdge): void { + if (!this.graph.hasNode(edge.from_node)) return; + if (!this.graph.hasNode(edge.to_node)) return; + const key = `${edge.from_node}→${edge.to_node}:${edge.edge_type}`; + if (!this.graph.hasEdge(key)) { + this.graph.addEdgeWithKey(key, edge.from_node, edge.to_node, { data: edge }); + } + } + + traverse( + root: string, + depth: number, + edgeTypes?: string[], + respectLayers = true, + ): [GraphNode[], GraphEdge[]] { + if (!this.graph.hasNode(root)) return [[], []]; + + const visited = new Set([root]); + const collectedEdges: GraphEdge[] = []; + const queue: Array<[string, number]> = [[root, 0]]; + + while (queue.length > 0) { + const [current, dist] = queue.shift()!; + if (dist >= depth) continue; + + this.graph.forEachOutEdge(current, (_edgeKey, attrs, _src, target) => { + const edge: GraphEdge = attrs.data; + if (!edgeMatches(edge, edgeTypes, respectLayers)) return; + collectedEdges.push(edge); + if (!visited.has(target)) { + 
visited.add(target); + queue.push([target, dist + 1]); + } + }); + + this.graph.forEachInEdge(current, (_edgeKey, attrs, source) => { + const edge: GraphEdge = attrs.data; + if (!edgeMatches(edge, edgeTypes, respectLayers)) return; + collectedEdges.push(edge); + if (!visited.has(source)) { + visited.add(source); + queue.push([source, dist + 1]); + } + }); + } + + const resultNodes = [...visited] + .map((id) => this.nodes.get(id)) + .filter((n): n is GraphNode => n != null); + + const seen = new Set(); + const uniqueEdges = collectedEdges.filter((e) => { + const key = `${e.from_node}|${e.to_node}|${e.edge_type}`; + if (seen.has(key)) return false; + seen.add(key); + return true; + }); + + return [resultNodes, uniqueEdges]; + } + + reverseTraverse( + root: string, + depth: number, + ): Array<[GraphNode, GraphEdge[]]> { + if (!this.graph.hasNode(root)) return []; + + const results: Array<[GraphNode, GraphEdge[]]> = []; + const visited = new Set([root]); + const queue: Array<[string, number, GraphEdge[]]> = [[root, 0, []]]; + + while (queue.length > 0) { + const [current, dist, path] = queue.shift()!; + if (dist >= depth) continue; + + this.graph.forEachInEdge(current, (_edgeKey, attrs, source) => { + if (visited.has(source)) return; + visited.add(source); + const edge: GraphEdge = attrs.data; + const newPath = [...path, edge]; + const predNode = this.nodes.get(source); + if (predNode) { + results.push([predNode, newPath]); + } + queue.push([source, dist + 1, newPath]); + }); + } + + return results; + } + + textSearch(query: string, fields?: string[]): GraphNode[] { + const queryLower = query.toLowerCase(); + const results: GraphNode[] = []; + + for (const node of this.nodes.values()) { + if (nodeMatchesText(node, queryLower, fields)) { + results.push(node); + } + } + + return results; + } + + getNode(id: string): GraphNode | undefined { + return this.nodes.get(id); + } + + hasNode(id: string): boolean { + return this.nodes.has(id); + } + + incomingEdges(nodeId: 
string): GraphEdge[] { + if (!this.graph.hasNode(nodeId)) return []; + const edges: GraphEdge[] = []; + this.graph.forEachInEdge(nodeId, (_key, attrs) => { + edges.push(attrs.data as GraphEdge); + }); + return edges; + } + + outgoingEdges(nodeId: string): GraphEdge[] { + if (!this.graph.hasNode(nodeId)) return []; + const edges: GraphEdge[] = []; + this.graph.forEachOutEdge(nodeId, (_key, attrs) => { + edges.push(attrs.data as GraphEdge); + }); + return edges; + } + + allEdges(): GraphEdge[] { + const edges: GraphEdge[] = []; + this.graph.forEachEdge((_key, attrs) => { + edges.push(attrs.data as GraphEdge); + }); + return edges; + } + + allNodes(): GraphNode[] { + return [...this.nodes.values()]; + } + + nodeCount(): number { + return this.nodes.size; + } + + edgeCount(): number { + return this.graph.size; + } + + findViolations(): GraphEdge[] { + return this.allEdges().filter((e) => e.layer_violation); + } +} + +function edgeMatches( + edge: GraphEdge, + edgeTypes: string[] | undefined, + respectLayers: boolean, +): boolean { + if (respectLayers && edge.layer_violation) return false; + if (edgeTypes != null && !edgeTypes.includes(edge.edge_type)) return false; + return true; +} + +function nodeMatchesText( + node: GraphNode, + queryLower: string, + fields?: string[], +): boolean { + let searchValues: string[]; + + if (fields) { + searchValues = Object.entries(node.indexed_fields) + .filter(([k, v]) => fields.includes(k) && v != null) + .map(([, v]) => String(v)); + } else { + searchValues = Object.values(node.indexed_fields) + .filter((v) => v != null) + .map((v) => String(v)); + } + + searchValues.push(node.id); + searchValues.push(...node.aliases); + + return searchValues.some((val) => val.toLowerCase().includes(queryLower)); +} diff --git a/src/infra/markdown-parser.ts b/src/infra/markdown-parser.ts new file mode 100644 index 0000000..ae037a9 --- /dev/null +++ b/src/infra/markdown-parser.ts @@ -0,0 +1,87 @@ +/** + * Markdown parsing — frontmatter extraction and 
section parsing. + */ + +import matter from "gray-matter"; +import type { Section } from "../domain/types.ts"; + +export function extractFrontmatter(content: string): [Record, string] { + try { + const { data, content: body } = matter(content); + return [data as Record, body]; + } catch { + return [{}, content]; + } +} + +export function parseMarkdownSections(content: string): Section[] { + const sections: Section[] = []; + const currentHeadings: string[] = []; + const currentLevels: number[] = []; + let currentLines: string[] = []; + + function flush(): void { + const text = currentLines.join("\n").trim(); + if (currentHeadings.length > 0) { + const path = currentHeadings.map(headingToAnchor).join("."); + sections.push({ + heading: currentHeadings[currentHeadings.length - 1]!, + level: currentLevels[currentLevels.length - 1] ?? 1, + content: text, + path, + }); + } + } + + for (const line of content.split("\n")) { + if (line.startsWith("#")) { + flush(); + currentLines = []; + + const level = line.length - line.replace(/^#+/, "").length; + const headingText = line.replace(/^#+\s*/, ""); + + // Maintain hierarchy: pop deeper or equal headings + while (currentLevels.length > 0 && currentLevels[currentLevels.length - 1]! 
>= level) { + currentLevels.pop(); + if (currentHeadings.length > 0) currentHeadings.pop(); + } + + currentHeadings.push(headingText); + currentLevels.push(level); + } else { + currentLines.push(line); + } + } + + flush(); + return sections; +} + +export function headingToAnchor(heading: string): string { + let text = heading.normalize("NFKD").toLowerCase(); + text = text.replace(/[^\w\s-]/g, ""); + text = text.replace(/\s+/g, "-"); + text = text.replace(/^-+|-+$/g, ""); + return text; +} + +export function extractSnippet(content: string, maxLength = 200): string { + let text = content.trim(); + text = text.replace(/^#+\s+/gm, ""); + text = text.replace(/\*\*([^*]+)\*\*/g, "$1"); + text = text.replace(/\*([^*]+)\*/g, "$1"); + text = text.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1"); + text = text.replace(/\s+/g, " ").trim(); + + if (text.length <= maxLength) return text; + + const truncated = text.slice(0, maxLength); + const lastPeriod = truncated.lastIndexOf(". "); + if (lastPeriod > maxLength / 2) return truncated.slice(0, lastPeriod + 1); + + const lastSpace = truncated.lastIndexOf(" "); + if (lastSpace > maxLength / 2) return truncated.slice(0, lastSpace) + "..."; + + return truncated + "..."; +} diff --git a/src/infra/vector-store.ts b/src/infra/vector-store.ts new file mode 100644 index 0000000..fde653a --- /dev/null +++ b/src/infra/vector-store.ts @@ -0,0 +1,60 @@ +/** + * Vector store — brute-force cosine similarity. 
+ */ + +import type { Embedding } from "../domain/types.ts"; + +export class VectorStore { + private ids: string[] = []; + private vectors: Float64Array[] = []; + + load(embeddings: Embedding[]): void { + this.ids = embeddings.map((e) => e.id); + this.vectors = embeddings.map((e) => new Float64Array(e.vector)); + } + + search( + queryVector: number[], + limit: number, + minScore: number, + ): Array<[string, number]> { + if (this.ids.length === 0) return []; + + const qv = new Float64Array(queryVector); + const qNorm = norm(qv); + if (qNorm === 0) return []; + + const scored: Array<[string, number]> = []; + + for (let i = 0; i < this.vectors.length; i++) { + const v = this.vectors[i]!; + const sim = dot(qv, v) / (qNorm * norm(v)); + if (sim >= minScore) { + scored.push([this.ids[i]!, sim]); + } + } + + scored.sort((a, b) => b[1] - a[1]); + return scored.slice(0, limit); + } + + get size(): number { + return this.ids.length; + } +} + +function dot(a: Float64Array, b: Float64Array): number { + let sum = 0; + for (let i = 0; i < a.length; i++) { + sum += a[i]! * b[i]!; + } + return sum; +} + +function norm(a: Float64Array): number { + let sum = 0; + for (let i = 0; i < a.length; i++) { + sum += a[i]! * a[i]!; + } + return Math.sqrt(sum); +} diff --git a/src/infra/wiki-links.ts b/src/infra/wiki-links.ts new file mode 100644 index 0000000..65b7a28 --- /dev/null +++ b/src/infra/wiki-links.ts @@ -0,0 +1,47 @@ +/** + * Wiki-link extraction from markdown content. 
+ * + * Handles: [[Target]], [[domain::Target]], [[Target|Display]] + */ + +const WIKI_LINK_RE = /\[\[([^\]]+)\]\]/g; + +export interface WikiLink { + raw: string; + target: string; + domain: string | null; + alias: string | null; +} + +export function extractWikiLinks(content: string): WikiLink[] { + const results: WikiLink[] = []; + for (const match of content.matchAll(WIKI_LINK_RE)) { + const raw = match[1]!.trim(); + if (!raw) continue; + + let domain: string | null = null; + let alias: string | null = null; + let target = raw; + + // Cross-domain: [[domain::Target]] + if (target.includes("::")) { + const parts = target.split("::", 2); + domain = parts[0]!.trim(); + target = parts[1]!.trim(); + } + + // Display alias: [[Target|Alias]] + if (target.includes("|")) { + const parts = target.split("|", 2); + target = parts[0]!.trim(); + alias = parts[1]!.trim(); + } + + results.push({ raw, target, domain, alias }); + } + return results; +} + +export function extractWikiLinkTargets(content: string): string[] { + return extractWikiLinks(content).map((link) => link.target); +} diff --git a/src/kb_engine/__init__.py b/src/kb_engine/__init__.py deleted file mode 100644 index ce9e1b4..0000000 --- a/src/kb_engine/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""KB-Engine: Intelligent document retrieval system.""" - -__version__ = "0.2.0" diff --git a/src/kb_engine/api/__init__.py b/src/kb_engine/api/__init__.py deleted file mode 100644 index 9c3a8eb..0000000 --- a/src/kb_engine/api/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""FastAPI REST API for KB-Engine.""" - -from kb_engine.api.main import app, create_app - -__all__ = ["app", "create_app"] diff --git a/src/kb_engine/api/dependencies.py b/src/kb_engine/api/dependencies.py deleted file mode 100644 index e775e78..0000000 --- a/src/kb_engine/api/dependencies.py +++ /dev/null @@ -1,97 +0,0 @@ -"""FastAPI dependencies for dependency injection.""" - -from typing import Annotated - -from fastapi import Depends, Request - -from 
kb_engine.config import Settings, get_settings -from kb_engine.services.indexing import IndexingService -from kb_engine.services.retrieval import RetrievalService - - -def get_settings_dep() -> Settings: - """Get application settings.""" - return get_settings() - - -async def get_indexing_service(request: Request) -> IndexingService: - """Get the indexing service from app state.""" - if hasattr(request.app.state, "indexing_service"): - return request.app.state.indexing_service - - # Initialize on first request using the profile-based factory - settings = get_settings() - service = await _create_indexing_service(settings) - request.app.state.indexing_service = service - return service - - -async def get_retrieval_service(request: Request) -> RetrievalService: - """Get the retrieval service from app state.""" - if hasattr(request.app.state, "retrieval_service"): - return request.app.state.retrieval_service - - settings = get_settings() - service = await _create_retrieval_service(settings) - request.app.state.retrieval_service = service - return service - - -async def _create_indexing_service(settings: Settings) -> IndexingService: - """Create the indexing service based on settings profile.""" - from kb_engine.embedding.config import EmbeddingConfig - from kb_engine.pipelines.indexation import IndexationPipeline - from kb_engine.repositories.factory import RepositoryFactory - - factory = RepositoryFactory(settings) - traceability = await factory.get_traceability_repository() - vector = await factory.get_vector_repository() - graph_strategy = await factory.get_graph_strategy() - - embedding_config = EmbeddingConfig( - provider=settings.embedding_provider, - local_model_name=settings.local_embedding_model, - openai_model=settings.openai_embedding_model, - ) - - pipeline = IndexationPipeline( - traceability_repo=traceability, - vector_repo=vector, - graph_strategy=graph_strategy, - embedding_config=embedding_config, - ) - - return IndexingService(pipeline=pipeline) - - 
-async def _create_retrieval_service(settings: Settings) -> RetrievalService: - """Create the retrieval service based on settings profile.""" - from kb_engine.embedding.config import EmbeddingConfig - from kb_engine.pipelines.inference.pipeline import RetrievalPipeline - from kb_engine.repositories.factory import RepositoryFactory - - factory = RepositoryFactory(settings) - traceability = await factory.get_traceability_repository() - vector = await factory.get_vector_repository() - graph = await factory.get_graph_repository() - - embedding_config = EmbeddingConfig( - provider=settings.embedding_provider, - local_model_name=settings.local_embedding_model, - openai_model=settings.openai_embedding_model, - ) - - pipeline = RetrievalPipeline( - traceability_repo=traceability, - vector_repo=vector, - graph_repo=graph, - embedding_config=embedding_config, - ) - - return RetrievalService(pipeline=pipeline) - - -# Type aliases for dependency injection -SettingsDep = Annotated[Settings, Depends(get_settings_dep)] -IndexingServiceDep = Annotated[IndexingService, Depends(get_indexing_service)] -RetrievalServiceDep = Annotated[RetrievalService, Depends(get_retrieval_service)] diff --git a/src/kb_engine/api/main.py b/src/kb_engine/api/main.py deleted file mode 100644 index 282a02e..0000000 --- a/src/kb_engine/api/main.py +++ /dev/null @@ -1,84 +0,0 @@ -"""FastAPI application factory.""" - -from contextlib import asynccontextmanager -from typing import AsyncGenerator - -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware - -from kb_engine import __version__ -from kb_engine.api.routers import admin, curation, health, indexing, retrieval -from kb_engine.config import get_settings -from kb_engine.config.logging import configure_logging - - -@asynccontextmanager -async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: - """Application lifespan manager.""" - settings = get_settings() - configure_logging( - log_level=settings.log_level, - 
json_logs=settings.is_production, - ) - - # Services are lazily initialized on first request via dependencies - - yield - - # Cleanup - if hasattr(app.state, "repo_factory"): - await app.state.repo_factory.close() - - -def create_app() -> FastAPI: - """Create and configure the FastAPI application.""" - settings = get_settings() - - app = FastAPI( - title="KB-Engine", - description="Intelligent document retrieval system", - version=__version__, - docs_url="/docs" if settings.is_development else None, - redoc_url="/redoc" if settings.is_development else None, - lifespan=lifespan, - ) - - # CORS middleware - app.add_middleware( - CORSMiddleware, - allow_origins=["*"] if settings.is_development else [], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], - ) - - # Include routers - app.include_router(health.router, tags=["Health"]) - app.include_router(retrieval.router, prefix="/api/v1", tags=["Retrieval"]) - app.include_router(indexing.router, prefix="/api/v1", tags=["Indexing"]) - app.include_router(curation.router, prefix="/api/v1", tags=["Curation"]) - app.include_router(admin.router, prefix="/api/v1", tags=["Admin"]) - - return app - - -# Create default app instance -app = create_app() - - -def run() -> None: - """Run the application with uvicorn.""" - import uvicorn - - settings = get_settings() - uvicorn.run( - "kb_engine.api.main:app", - host=settings.api_host, - port=settings.api_port, - workers=settings.api_workers, - reload=settings.is_development, - ) - - -if __name__ == "__main__": - run() diff --git a/src/kb_engine/api/middleware/__init__.py b/src/kb_engine/api/middleware/__init__.py deleted file mode 100644 index bce2a04..0000000 --- a/src/kb_engine/api/middleware/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""API middleware.""" diff --git a/src/kb_engine/api/middleware/auth.py b/src/kb_engine/api/middleware/auth.py deleted file mode 100644 index e28b68f..0000000 --- a/src/kb_engine/api/middleware/auth.py +++ /dev/null @@ -1,20 +0,0 @@ 
-"""Authentication middleware.""" - -from fastapi import HTTPException, Security, status -from fastapi.security import APIKeyHeader - -api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False) - - -async def verify_api_key(api_key: str | None = Security(api_key_header)) -> str: - """Verify API key if authentication is enabled. - - TODO: Implement actual API key validation. - """ - # Placeholder - would validate against stored keys - if api_key is None: - # For now, allow unauthenticated access - return "anonymous" - - # TODO: Validate API key - return api_key diff --git a/src/kb_engine/api/middleware/logging.py b/src/kb_engine/api/middleware/logging.py deleted file mode 100644 index 9ff297a..0000000 --- a/src/kb_engine/api/middleware/logging.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Logging middleware.""" - -import time -from typing import Callable - -from fastapi import Request, Response -from starlette.middleware.base import BaseHTTPMiddleware - -from kb_engine.config.logging import get_logger - -logger = get_logger(__name__) - - -class LoggingMiddleware(BaseHTTPMiddleware): - """Middleware for request/response logging.""" - - async def dispatch( - self, request: Request, call_next: Callable[[Request], Response] - ) -> Response: - """Log request and response details.""" - start_time = time.time() - - # Log request - logger.info( - "request_started", - method=request.method, - path=request.url.path, - query=str(request.query_params), - ) - - # Process request - response = await call_next(request) - - # Calculate duration - duration_ms = (time.time() - start_time) * 1000 - - # Log response - logger.info( - "request_completed", - method=request.method, - path=request.url.path, - status_code=response.status_code, - duration_ms=round(duration_ms, 2), - ) - - return response diff --git a/src/kb_engine/api/routers/__init__.py b/src/kb_engine/api/routers/__init__.py deleted file mode 100644 index f7ec5ce..0000000 --- a/src/kb_engine/api/routers/__init__.py +++ 
/dev/null @@ -1 +0,0 @@ -"""API routers.""" diff --git a/src/kb_engine/api/routers/admin.py b/src/kb_engine/api/routers/admin.py deleted file mode 100644 index 1b75afe..0000000 --- a/src/kb_engine/api/routers/admin.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Admin API endpoints.""" - -from fastapi import APIRouter, HTTPException, status -from pydantic import BaseModel - -router = APIRouter(prefix="/admin") - - -class SystemStats(BaseModel): - """System statistics.""" - - documents_count: int - chunks_count: int - embeddings_count: int - nodes_count: int - edges_count: int - - -class StoreInfo(BaseModel): - """Information about a data store.""" - - name: str - status: str - details: dict - - -@router.get("/stats", response_model=SystemStats) -async def get_system_stats() -> dict: - """Get system statistics. - - TODO: Implement with repositories. - """ - raise HTTPException( - status_code=status.HTTP_501_NOT_IMPLEMENTED, - detail="Admin endpoints not yet implemented", - ) - - -@router.get("/stores", response_model=list[StoreInfo]) -async def get_stores_info() -> list: - """Get information about all data stores. - - TODO: Implement with repositories. - """ - raise HTTPException( - status_code=status.HTTP_501_NOT_IMPLEMENTED, - detail="Admin endpoints not yet implemented", - ) - - -@router.post("/reindex-all", status_code=status.HTTP_202_ACCEPTED) -async def reindex_all_documents() -> dict: - """Trigger reindexing of all documents. - - TODO: Implement with background task. - """ - raise HTTPException( - status_code=status.HTTP_501_NOT_IMPLEMENTED, - detail="Admin endpoints not yet implemented", - ) - - -@router.post("/clear-cache", status_code=status.HTTP_204_NO_CONTENT) -async def clear_cache() -> None: - """Clear all caches. - - TODO: Implement cache clearing. 
- """ - raise HTTPException( - status_code=status.HTTP_501_NOT_IMPLEMENTED, - detail="Admin endpoints not yet implemented", - ) diff --git a/src/kb_engine/api/routers/curation.py b/src/kb_engine/api/routers/curation.py deleted file mode 100644 index 204fb45..0000000 --- a/src/kb_engine/api/routers/curation.py +++ /dev/null @@ -1,108 +0,0 @@ -"""Curation API endpoints for manual knowledge management.""" - -from uuid import UUID - -from fastapi import APIRouter, HTTPException, status -from pydantic import BaseModel, Field - -from kb_engine.core.models.graph import EdgeType, NodeType - -router = APIRouter(prefix="/curation") - - -class CreateNodeRequest(BaseModel): - """Request to create a node in the knowledge graph.""" - - name: str = Field(..., min_length=1, max_length=255) - node_type: NodeType - description: str | None = None - properties: dict = Field(default_factory=dict) - - -class CreateEdgeRequest(BaseModel): - """Request to create an edge in the knowledge graph.""" - - source_node_id: UUID - target_node_id: UUID - edge_type: EdgeType - name: str | None = None - properties: dict = Field(default_factory=dict) - - -class NodeResponse(BaseModel): - """Response for node operations.""" - - id: UUID - name: str - node_type: NodeType - description: str | None - - -class EdgeResponse(BaseModel): - """Response for edge operations.""" - - id: UUID - source_id: UUID - target_id: UUID - edge_type: EdgeType - name: str | None - - -@router.post("/nodes", response_model=NodeResponse, status_code=status.HTTP_201_CREATED) -async def create_node(request: CreateNodeRequest) -> dict: - """Manually create a node in the knowledge graph. - - TODO: Implement with graph repository. - """ - raise HTTPException( - status_code=status.HTTP_501_NOT_IMPLEMENTED, - detail="Curation endpoints not yet implemented", - ) - - -@router.get("/nodes/{node_id}", response_model=NodeResponse) -async def get_node(node_id: UUID) -> dict: - """Get a node by ID. - - TODO: Implement with graph repository. 
- """ - raise HTTPException( - status_code=status.HTTP_501_NOT_IMPLEMENTED, - detail="Curation endpoints not yet implemented", - ) - - -@router.delete("/nodes/{node_id}", status_code=status.HTTP_204_NO_CONTENT) -async def delete_node(node_id: UUID) -> None: - """Delete a node. - - TODO: Implement with graph repository. - """ - raise HTTPException( - status_code=status.HTTP_501_NOT_IMPLEMENTED, - detail="Curation endpoints not yet implemented", - ) - - -@router.post("/edges", response_model=EdgeResponse, status_code=status.HTTP_201_CREATED) -async def create_edge(request: CreateEdgeRequest) -> dict: - """Manually create an edge in the knowledge graph. - - TODO: Implement with graph repository. - """ - raise HTTPException( - status_code=status.HTTP_501_NOT_IMPLEMENTED, - detail="Curation endpoints not yet implemented", - ) - - -@router.delete("/edges/{edge_id}", status_code=status.HTTP_204_NO_CONTENT) -async def delete_edge(edge_id: UUID) -> None: - """Delete an edge. - - TODO: Implement with graph repository. 
- """ - raise HTTPException( - status_code=status.HTTP_501_NOT_IMPLEMENTED, - detail="Curation endpoints not yet implemented", - ) diff --git a/src/kb_engine/api/routers/health.py b/src/kb_engine/api/routers/health.py deleted file mode 100644 index f08c06b..0000000 --- a/src/kb_engine/api/routers/health.py +++ /dev/null @@ -1,78 +0,0 @@ -"""Health check endpoints.""" - -from fastapi import APIRouter, HTTPException, status - -from kb_engine.config import get_settings -from kb_engine.repositories.factory import RepositoryFactory - -router = APIRouter() - - -@router.get("/health") -async def health_check() -> dict[str, str]: - """Basic health check endpoint.""" - return {"status": "ok"} - - -@router.get("/health/ready") -async def readiness_check() -> dict[str, str | dict[str, str]]: - """Readiness check - verifies all dependencies are available.""" - settings = get_settings() - checks: dict[str, str] = {} - errors: dict[str, str] = {} - factory = RepositoryFactory(settings) - - try: - try: - traceability = await factory.get_traceability_repository() - await traceability.list_documents(limit=1) - checks["traceability"] = "ok" - except Exception as exc: - errors["traceability"] = exc.__class__.__name__ - - try: - vector = await factory.get_vector_repository() - await vector.get_collection_info() - checks["vector"] = "ok" - except Exception as exc: - errors["vector"] = exc.__class__.__name__ - - graph_store = settings.graph_store.lower() - if graph_store == "none": - checks["graph"] = "skipped" - elif graph_store == "falkordb": - try: - from kb_engine.smart.stores.falkordb_graph import FalkorDBGraphStore - - store = FalkorDBGraphStore(settings.falkordb_path) - store.initialize() - store.close() - checks["graph"] = "ok" - except Exception as exc: - errors["graph"] = exc.__class__.__name__ - else: - try: - graph = await factory.get_graph_repository() - if graph is None: - checks["graph"] = "skipped" - else: - await graph.find_nodes(limit=1) - checks["graph"] = "ok" - 
except Exception as exc: - errors["graph"] = exc.__class__.__name__ - finally: - await factory.close() - - if errors: - raise HTTPException( - status_code=status.HTTP_503_SERVICE_UNAVAILABLE, - detail={"status": "error", "checks": {**checks, **errors}}, - ) - - return {"status": "ok", "checks": checks} - - -@router.get("/health/live") -async def liveness_check() -> dict[str, str]: - """Liveness check - verifies the service is running.""" - return {"status": "ok"} diff --git a/src/kb_engine/api/routers/indexing.py b/src/kb_engine/api/routers/indexing.py deleted file mode 100644 index 0010a14..0000000 --- a/src/kb_engine/api/routers/indexing.py +++ /dev/null @@ -1,231 +0,0 @@ -"""Indexing API endpoints.""" - -from typing import Any -from uuid import UUID - -from fastapi import APIRouter, HTTPException, status -from pydantic import BaseModel, Field - -from kb_engine.api.dependencies import IndexingServiceDep -from kb_engine.core.exceptions import DocumentNotFoundError -from kb_engine.core.models.document import Document, DocumentStatus - -router = APIRouter(prefix="/indexing") - - -# --- Request/Response models --- - -class IndexDocumentRequest(BaseModel): - """Request model for indexing a document.""" - - title: str = Field(..., min_length=1, max_length=500) - content: str = Field(..., min_length=1) - source_path: str | None = Field(default=None, max_length=1000) - external_id: str | None = Field(default=None, max_length=255) - domain: str | None = Field(default=None, max_length=100) - tags: list[str] = Field(default_factory=list) - metadata: dict[str, Any] = Field(default_factory=dict) - - -class DocumentResponse(BaseModel): - """Response model for document operations.""" - - id: UUID - title: str - status: DocumentStatus - source_path: str | None - external_id: str | None - domain: str | None - tags: list[str] - repo_name: str | None = None - relative_path: str | None = None - - class Config: - from_attributes = True - - -class RegisterRepositoryRequest(BaseModel): 
- """Request to register a Git repository for indexing.""" - - name: str = Field(..., min_length=1, max_length=255) - local_path: str = Field(..., min_length=1) - remote_url: str | None = None - branch: str = "main" - include_patterns: list[str] = Field(default_factory=lambda: ["**/*.md"]) - exclude_patterns: list[str] = Field(default_factory=list) - base_url_template: str | None = None - - -class SyncRepositoryRequest(BaseModel): - """Request to sync a repository from a specific commit.""" - - since_commit: str = Field(..., min_length=7, max_length=40) - - -class RepositoryIndexResult(BaseModel): - """Result of a repository indexing operation.""" - - repo_name: str - documents_indexed: int - status: str = "completed" - - -class RepositorySyncResult(BaseModel): - """Result of a repository sync operation.""" - - repo_name: str - commit: str - indexed: int - deleted: int - skipped: int - - -# --- Document endpoints --- - -@router.post("/documents", response_model=DocumentResponse, status_code=status.HTTP_201_CREATED) -async def index_document( - request: IndexDocumentRequest, - service: IndexingServiceDep, -) -> Document: - """Index a new document.""" - return await service.index_document( - title=request.title, - content=request.content, - source_path=request.source_path, - external_id=request.external_id, - domain=request.domain, - tags=request.tags, - metadata=request.metadata, - ) - - -@router.get("/documents/{document_id}", response_model=DocumentResponse) -async def get_document( - document_id: UUID, - service: IndexingServiceDep, -) -> Document: - """Get a document by ID.""" - try: - return await service.get_document(document_id) - except DocumentNotFoundError: - raise HTTPException( - status_code=status.HTTP_404_NOT_FOUND, - detail=f"Document not found: {document_id}", - ) - - -@router.post("/documents/{document_id}/reindex", response_model=DocumentResponse) -async def reindex_document( - document_id: UUID, - service: IndexingServiceDep, -) -> Document: - 
"""Reindex an existing document.""" - try: - return await service.reindex_document(document_id) - except DocumentNotFoundError: - raise HTTPException( - status_code=status.HTTP_404_NOT_FOUND, - detail=f"Document not found: {document_id}", - ) - - -@router.delete("/documents/{document_id}", status_code=status.HTTP_204_NO_CONTENT) -async def delete_document( - document_id: UUID, - service: IndexingServiceDep, -) -> None: - """Delete a document and all its indexed data.""" - try: - await service.delete_document(document_id) - except DocumentNotFoundError: - raise HTTPException( - status_code=status.HTTP_404_NOT_FOUND, - detail=f"Document not found: {document_id}", - ) - - -@router.get("/documents", response_model=list[DocumentResponse]) -async def list_documents( - service: IndexingServiceDep, - limit: int = 100, - offset: int = 0, - domain: str | None = None, -) -> list[Document]: - """List indexed documents.""" - from kb_engine.core.models.search import SearchFilters - - filters = None - if domain: - filters = SearchFilters(domains=[domain]) - - return await service.list_documents( - filters=filters, - limit=limit, - offset=offset, - ) - - -# --- Repository endpoints --- - -@router.post( - "/repositories", - response_model=RepositoryIndexResult, - status_code=status.HTTP_201_CREATED, -) -async def register_and_index_repository( - request: RegisterRepositoryRequest, - service: IndexingServiceDep, -) -> RepositoryIndexResult: - """Register a Git repository and index all matching files.""" - from kb_engine.core.models.repository import RepositoryConfig - - config = RepositoryConfig( - name=request.name, - local_path=request.local_path, - remote_url=request.remote_url, - branch=request.branch, - include_patterns=request.include_patterns, - exclude_patterns=request.exclude_patterns, - base_url_template=request.base_url_template, - ) - - documents = await service.index_repository(config) - return RepositoryIndexResult( - repo_name=request.name, - 
documents_indexed=len(documents), - ) - - -@router.post( - "/repositories/{name}/sync", - response_model=RepositorySyncResult, -) -async def sync_repository( - name: str, - request: SyncRepositoryRequest, - service: IndexingServiceDep, -) -> RepositorySyncResult: - """Incrementally sync a repository (only changed files).""" - from kb_engine.core.models.repository import RepositoryConfig - - # For now, the caller must provide the full config - # In a future version, we'd store repo configs in the DB - raise HTTPException( - status_code=status.HTTP_501_NOT_IMPLEMENTED, - detail="Repository sync via API requires stored repo config. Use the CLI instead.", - ) - - -@router.post( - "/repositories/{name}/reindex", - response_model=RepositoryIndexResult, -) -async def reindex_repository( - name: str, - service: IndexingServiceDep, -) -> RepositoryIndexResult: - """Full reindex of a repository.""" - raise HTTPException( - status_code=status.HTTP_501_NOT_IMPLEMENTED, - detail="Repository reindex via API requires stored repo config. 
Use the CLI instead.", - ) diff --git a/src/kb_engine/api/routers/retrieval.py b/src/kb_engine/api/routers/retrieval.py deleted file mode 100644 index 3e102be..0000000 --- a/src/kb_engine/api/routers/retrieval.py +++ /dev/null @@ -1,55 +0,0 @@ -"""Retrieval API endpoints.""" - -from typing import Annotated - -from fastapi import APIRouter, Depends, Query -from pydantic import BaseModel, Field - -from kb_engine.api.dependencies import RetrievalServiceDep -from kb_engine.core.models.search import RetrievalMode, RetrievalResponse, SearchFilters - -router = APIRouter(prefix="/retrieval") - - -class RetrievalRequest(BaseModel): - """Request model for retrieval endpoint.""" - - query: str = Field(..., min_length=1, max_length=1000, description="Search query") - mode: RetrievalMode = Field( - default=RetrievalMode.VECTOR, description="Retrieval mode" - ) - limit: int = Field(default=10, ge=1, le=100, description="Max results") - score_threshold: float | None = Field( - default=None, ge=0.0, le=1.0, description="Min score threshold" - ) - filters: SearchFilters | None = Field(default=None, description="Search filters") - - -@router.post("/search", response_model=RetrievalResponse) -async def search( - request: RetrievalRequest, - service: RetrievalServiceDep, -) -> RetrievalResponse: - """Search the knowledge base and return document references with URLs.""" - return await service.search( - query=request.query, - mode=request.mode, - filters=request.filters, - limit=request.limit, - score_threshold=request.score_threshold, - ) - - -@router.get("/search", response_model=RetrievalResponse) -async def search_get( - service: RetrievalServiceDep, - query: Annotated[str, Query(min_length=1, max_length=1000)], - mode: RetrievalMode = RetrievalMode.VECTOR, - limit: Annotated[int, Query(ge=1, le=100)] = 10, -) -> RetrievalResponse: - """Search the knowledge base (GET variant for simple queries).""" - return await service.search( - query=query, - mode=mode, - limit=limit, - ) diff 
--git a/src/kb_engine/chunking/__init__.py b/src/kb_engine/chunking/__init__.py deleted file mode 100644 index 17d24ac..0000000 --- a/src/kb_engine/chunking/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -"""Semantic chunking module for KB-Engine (ADR-0002).""" - -from kb_engine.chunking.base import BaseChunkingStrategy -from kb_engine.chunking.config import ChunkingConfig -from kb_engine.chunking.factory import ChunkerFactory -from kb_engine.chunking.parsers import get_parser -from kb_engine.chunking.types import ChunkType - -__all__ = [ - "ChunkingConfig", - "ChunkType", - "BaseChunkingStrategy", - "ChunkerFactory", - "get_parser", -] diff --git a/src/kb_engine/chunking/base.py b/src/kb_engine/chunking/base.py deleted file mode 100644 index 2c3df77..0000000 --- a/src/kb_engine/chunking/base.py +++ /dev/null @@ -1,100 +0,0 @@ -"""Base chunking strategy implementation.""" - -from abc import ABC, abstractmethod - -from kb_engine.chunking.config import ChunkingConfig -from kb_engine.core.interfaces.chunkers import ChunkingStrategy -from kb_engine.core.models.document import Chunk, ChunkType, Document - - -class BaseChunkingStrategy(ChunkingStrategy, ABC): - """Base class for chunking strategies. - - Provides common functionality for all chunking strategies. - """ - - def __init__(self, config: ChunkingConfig | None = None) -> None: - self._config = config or ChunkingConfig() - - @property - @abstractmethod - def chunk_type(self) -> ChunkType: - """The type of chunks this strategy produces.""" - ... - - @abstractmethod - def can_handle(self, document: Document, section_content: str) -> bool: - """Check if this strategy can handle the given content.""" - ... - - @abstractmethod - def chunk( - self, - document: Document, - content: str, - heading_path: list[str] | None = None, - ) -> list[Chunk]: - """Chunk the content into semantic units.""" - ... 
- - def _create_chunk( - self, - document: Document, - content: str, - sequence: int, - heading_path: list[str] | None = None, - start_offset: int | None = None, - end_offset: int | None = None, - ) -> Chunk: - """Create a chunk with standard metadata.""" - return Chunk( - document_id=document.id, - content=content, - chunk_type=self.chunk_type, - sequence=sequence, - heading_path=heading_path or [], - start_offset=start_offset, - end_offset=end_offset, - metadata={ - "domain": document.domain, - "source_path": document.source_path, - }, - ) - - def _split_by_size( - self, - text: str, - max_size: int | None = None, - ) -> list[str]: - """Split text into chunks respecting size limits. - - This is a simple character-based split. Subclasses may - override with token-based splitting. - """ - max_size = max_size or self._config.max_chunk_size - if len(text) <= max_size: - return [text] - - chunks = [] - current_pos = 0 - overlap = self._config.overlap_size - - while current_pos < len(text): - end_pos = min(current_pos + max_size, len(text)) - - if end_pos < len(text) and self._config.preserve_sentences: - # Only accept sentence breaks that leave room to advance past overlap - search_start = current_pos + overlap + 1 - for sep in [". ", ".\n", "! ", "!\n", "? ", "?\n"]: - last_sep = text.rfind(sep, search_start, end_pos) - if last_sep > current_pos: - end_pos = last_sep + len(sep) - break - - chunks.append(text[current_pos:end_pos].strip()) - - if end_pos >= len(text): - break - current_pos = end_pos - overlap - - return [c for c in chunks if c] diff --git a/src/kb_engine/chunking/config.py b/src/kb_engine/chunking/config.py deleted file mode 100644 index c6144a3..0000000 --- a/src/kb_engine/chunking/config.py +++ /dev/null @@ -1,41 +0,0 @@ -"""Chunking configuration.""" - -from pydantic import BaseModel, Field - - -class ChunkingConfig(BaseModel): - """Configuration for the chunking process. 
- - These values follow the recommendations in ADR-0002 for - semantic chunking of KDD documents. - """ - - # Size constraints - min_chunk_size: int = Field(default=100, ge=50, description="Minimum chunk size in tokens") - target_chunk_size: int = Field( - default=512, ge=100, description="Target chunk size in tokens" - ) - max_chunk_size: int = Field(default=1024, ge=200, description="Maximum chunk size in tokens") - overlap_size: int = Field(default=50, ge=0, description="Overlap between chunks in tokens") - - # Behavior - preserve_sentences: bool = Field( - default=True, description="Avoid splitting in the middle of sentences" - ) - respect_headings: bool = Field( - default=True, description="Use markdown headings as chunk boundaries" - ) - include_heading_context: bool = Field( - default=True, description="Include heading hierarchy in chunk metadata" - ) - - # Strategy selection - enable_semantic_chunking: bool = Field( - default=True, description="Enable semantic chunk type detection" - ) - default_strategy: str = Field( - default="default", description="Default chunking strategy to use" - ) - - class Config: - frozen = True diff --git a/src/kb_engine/chunking/factory.py b/src/kb_engine/chunking/factory.py deleted file mode 100644 index 240263f..0000000 --- a/src/kb_engine/chunking/factory.py +++ /dev/null @@ -1,87 +0,0 @@ -"""Factory for creating and managing chunking strategies.""" - -from kb_engine.chunking.base import BaseChunkingStrategy -from kb_engine.chunking.config import ChunkingConfig -from kb_engine.chunking.parsers import get_parser -from kb_engine.chunking.strategies.default import DefaultChunkingStrategy -from kb_engine.chunking.strategies.entity import EntityChunkingStrategy -from kb_engine.chunking.strategies.process import ProcessChunkingStrategy -from kb_engine.chunking.strategies.rule import RuleChunkingStrategy -from kb_engine.chunking.strategies.use_case import UseCaseChunkingStrategy -from kb_engine.core.models.document import Chunk, 
ChunkType, Document - - -class ChunkerFactory: - """Factory for creating and orchestrating chunking strategies. - - The factory maintains a registry of strategies and selects - the appropriate one based on content analysis. - """ - - def __init__(self, config: ChunkingConfig | None = None) -> None: - self._config = config or ChunkingConfig() - self._strategies: list[BaseChunkingStrategy] = [] - self._default_strategy: BaseChunkingStrategy | None = None - self._initialize_strategies() - - def _initialize_strategies(self) -> None: - """Initialize the default set of strategies.""" - self._strategies = [ - EntityChunkingStrategy(self._config), - UseCaseChunkingStrategy(self._config), - RuleChunkingStrategy(self._config), - ProcessChunkingStrategy(self._config), - ] - self._default_strategy = DefaultChunkingStrategy(self._config) - - def register_strategy(self, strategy: BaseChunkingStrategy) -> None: - """Register a custom chunking strategy.""" - self._strategies.append(strategy) - - def get_strategy_for_content( - self, - document: Document, - content: str, - ) -> BaseChunkingStrategy: - """Select the appropriate strategy for the given content. - - Iterates through registered strategies and returns the first - one that can handle the content, or the default strategy. - """ - if self._config.enable_semantic_chunking: - for strategy in self._strategies: - if strategy.can_handle(document, content): - return strategy - - return self._default_strategy or DefaultChunkingStrategy(self._config) - - def chunk_document(self, document: Document, parser: str = "markdown") -> list[Chunk]: - """Chunk an entire document. - - Parses the document structure using the specified parser and - applies appropriate strategies to each section. 
- """ - all_chunks: list[Chunk] = [] - parse_fn = get_parser(parser) - sections = parse_fn(document.content) - - sequence = 0 - for heading_path, content in sections: - strategy = self.get_strategy_for_content(document, content) - chunks = strategy.chunk(document, content, heading_path) - - # Update sequence numbers - for chunk in chunks: - chunk.sequence = sequence - sequence += 1 - - all_chunks.extend(chunks) - - return all_chunks - - def get_available_chunk_types(self) -> list[ChunkType]: - """Get list of chunk types supported by registered strategies.""" - types = [s.chunk_type for s in self._strategies] - if self._default_strategy: - types.append(self._default_strategy.chunk_type) - return list(set(types)) diff --git a/src/kb_engine/chunking/parsers.py b/src/kb_engine/chunking/parsers.py deleted file mode 100644 index 92e6118..0000000 --- a/src/kb_engine/chunking/parsers.py +++ /dev/null @@ -1,225 +0,0 @@ -"""Content parsers for different file formats. - -Each parser converts raw content into a list of (heading_path, section_content) tuples, -which is the format consumed by chunking strategies. -""" - -import json -import re -from collections.abc import Callable - -import yaml - -Sections = list[tuple[list[str], str]] - -_PARSER_REGISTRY: dict[str, Callable[[str], Sections]] = {} - - -def register_parser(name: str) -> Callable: - """Decorator to register a parser function.""" - - def decorator(fn: Callable[[str], Sections]) -> Callable[[str], Sections]: - _PARSER_REGISTRY[name] = fn - return fn - - return decorator - - -def get_parser(name: str) -> Callable[[str], Sections]: - """Get a parser by name. - - Raises ValueError if the parser name is unknown. - """ - if name not in _PARSER_REGISTRY: - raise ValueError( - f"Unknown parser: {name!r}. Available: {sorted(_PARSER_REGISTRY)}" - ) - return _PARSER_REGISTRY[name] - - -@register_parser("markdown") -def parse_markdown(content: str) -> Sections: - """Parse markdown content into sections with heading paths. 
- - Extracted from ChunkerFactory._parse_sections(). - """ - sections: Sections = [] - current_path: list[str] = [] - current_content: list[str] = [] - current_levels: list[int] = [] - - lines = content.split("\n") - - for line in lines: - if line.startswith("#"): - section_text = "\n".join(current_content).strip() - if section_text: - sections.append((list(current_path), section_text)) - current_content = [] - - level = len(line) - len(line.lstrip("#")) - heading_text = line.lstrip("#").strip() - - while current_levels and current_levels[-1] >= level: - current_levels.pop() - if current_path: - current_path.pop() - - current_path.append(heading_text) - current_levels.append(level) - else: - current_content.append(line) - - section_text = "\n".join(current_content).strip() - if section_text: - sections.append((list(current_path), section_text)) - - return sections - - -def _flatten_json(data: object, path: list[str] | None = None, max_depth: int = 3) -> Sections: - """Flatten a JSON/YAML structure into sections. - - Keys become heading_path entries. Leaf values become section content. - Arrays of objects produce one section per element. 
- """ - if path is None: - path = [] - - sections: Sections = [] - - if isinstance(data, dict): - for key, value in data.items(): - current_path = [*path, str(key)] - if len(current_path) >= max_depth or not isinstance(value, (dict, list)): - sections.append((current_path, _value_to_text(value))) - else: - sections.extend(_flatten_json(value, current_path, max_depth)) - elif isinstance(data, list): - if all(isinstance(item, dict) for item in data) and data: - for i, item in enumerate(data): - item_label = _item_label(item, i) - current_path = [*path, item_label] - if len(current_path) >= max_depth: - sections.append((current_path, _value_to_text(item))) - else: - sections.extend(_flatten_json(item, current_path, max_depth)) - else: - sections.append((path, _value_to_text(data))) - else: - sections.append((path, _value_to_text(data))) - - return sections - - -def _item_label(item: dict, index: int) -> str: - """Generate a label for an array item, using a name/id/title field if available.""" - for key in ("name", "id", "title", "key"): - if key in item: - return str(item[key]) - return f"[{index}]" - - -def _value_to_text(value: object) -> str: - """Convert a value to readable text.""" - if isinstance(value, str): - return value - return json.dumps(value, indent=2, ensure_ascii=False) - - -@register_parser("json") -def parse_json(content: str) -> Sections: - """Parse JSON content into sections using key paths as heading paths.""" - try: - data = json.loads(content) - except (json.JSONDecodeError, ValueError): - return [([], content)] - - sections = _flatten_json(data) - return sections if sections else [([], content)] - - -@register_parser("yaml") -def parse_yaml(content: str) -> Sections: - """Parse YAML content into sections. 
Delegates to _flatten_json after loading.""" - try: - data = yaml.safe_load(content) - except yaml.YAMLError: - return [([], content)] - - if data is None: - return [([], content)] - - sections = _flatten_json(data) - return sections if sections else [([], content)] - - -@register_parser("rst") -def parse_rst(content: str) -> Sections: - """Parse reStructuredText content into sections. - - Detects headings by adornment lines (===, ---, ~~~, etc.). - Hierarchy is determined by order of first appearance of each adornment character. - """ - sections: Sections = [] - current_path: list[str] = [] - current_levels: list[int] = [] - current_content: list[str] = [] - - adornment_chars: list[str] = [] # order of first appearance determines hierarchy - adornment_pattern = re.compile(r"^([=\-~`:.'^\"#*+_!])\1{2,}$") - - lines = content.split("\n") - i = 0 - - while i < len(lines): - # Check for heading: line followed by adornment of same length or longer - if ( - i + 1 < len(lines) - and lines[i].strip() - and not adornment_pattern.match(lines[i]) - and adornment_pattern.match(lines[i + 1].rstrip()) - and len(lines[i + 1].rstrip()) >= len(lines[i].rstrip()) - ): - # Save previous section - section_text = "\n".join(current_content).strip() - if section_text: - sections.append((list(current_path), section_text)) - current_content = [] - - heading_text = lines[i].strip() - adornment_char = lines[i + 1].rstrip()[0] - - if adornment_char not in adornment_chars: - adornment_chars.append(adornment_char) - level = adornment_chars.index(adornment_char) + 1 - - while current_levels and current_levels[-1] >= level: - current_levels.pop() - if current_path: - current_path.pop() - - current_path.append(heading_text) - current_levels.append(level) - i += 2 - else: - current_content.append(lines[i]) - i += 1 - - section_text = "\n".join(current_content).strip() - if section_text: - sections.append((list(current_path), section_text)) - - return sections - - -@register_parser("plaintext") 
-def parse_plaintext(content: str) -> Sections: - """Parse plain text by splitting on blank lines (paragraphs).""" - paragraphs = re.split(r"\n\s*\n", content) - sections: Sections = [] - for para in paragraphs: - text = para.strip() - if text: - sections.append(([], text)) - return sections if sections else [([], content)] diff --git a/src/kb_engine/chunking/strategies/__init__.py b/src/kb_engine/chunking/strategies/__init__.py deleted file mode 100644 index 85dd9a9..0000000 --- a/src/kb_engine/chunking/strategies/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -"""Chunking strategy implementations.""" - -from kb_engine.chunking.strategies.default import DefaultChunkingStrategy -from kb_engine.chunking.strategies.entity import EntityChunkingStrategy -from kb_engine.chunking.strategies.process import ProcessChunkingStrategy -from kb_engine.chunking.strategies.rule import RuleChunkingStrategy -from kb_engine.chunking.strategies.use_case import UseCaseChunkingStrategy - -__all__ = [ - "DefaultChunkingStrategy", - "EntityChunkingStrategy", - "UseCaseChunkingStrategy", - "RuleChunkingStrategy", - "ProcessChunkingStrategy", -] diff --git a/src/kb_engine/chunking/strategies/default.py b/src/kb_engine/chunking/strategies/default.py deleted file mode 100644 index 8086876..0000000 --- a/src/kb_engine/chunking/strategies/default.py +++ /dev/null @@ -1,54 +0,0 @@ -"""Default chunking strategy.""" - -from kb_engine.chunking.base import BaseChunkingStrategy -from kb_engine.core.models.document import Chunk, ChunkType, Document - - -class DefaultChunkingStrategy(BaseChunkingStrategy): - """Default chunking strategy for generic content. - - Used when no specialized strategy matches the content. - Applies general-purpose text chunking with overlap. 
- """ - - @property - def chunk_type(self) -> ChunkType: - return ChunkType.DEFAULT - - def can_handle(self, document: Document, section_content: str) -> bool: - """Default strategy can handle any content.""" - return True - - def chunk( - self, - document: Document, - content: str, - heading_path: list[str] | None = None, - ) -> list[Chunk]: - """Chunk content using default size-based splitting.""" - chunks = [] - - if len(content) <= self._config.max_chunk_size: - # Content fits in a single chunk - chunks.append( - self._create_chunk( - document=document, - content=content, - sequence=0, - heading_path=heading_path, - ) - ) - else: - # Split content respecting size limits - text_parts = self._split_by_size(content) - for i, part in enumerate(text_parts): - chunks.append( - self._create_chunk( - document=document, - content=part, - sequence=i, - heading_path=heading_path, - ) - ) - - return chunks diff --git a/src/kb_engine/chunking/strategies/entity.py b/src/kb_engine/chunking/strategies/entity.py deleted file mode 100644 index 5f2ab2a..0000000 --- a/src/kb_engine/chunking/strategies/entity.py +++ /dev/null @@ -1,80 +0,0 @@ -"""Entity chunking strategy.""" - -import re - -from kb_engine.chunking.base import BaseChunkingStrategy -from kb_engine.core.models.document import Chunk, ChunkType, Document - - -class EntityChunkingStrategy(BaseChunkingStrategy): - """Chunking strategy for entity definitions. - - Identifies and extracts chunks that define domain entities, - their attributes, and relationships. 
- """ - - # Patterns that indicate entity definitions - ENTITY_PATTERNS = [ - r"^#+\s*(?:entidad|entity|objeto|object)[\s:]+", - r"(?:se\s+define|is\s+defined\s+as|represents?|describes?)\s+(?:una?|an?|the)\s+\w+", - r"(?:atributos?|attributes?|propiedades?|properties?)[\s:]+", - r"^[-*]\s*\*\*\w+\*\*\s*[:]\s*", # Attribute definitions like "- **name**: description" - ] - - @property - def chunk_type(self) -> ChunkType: - return ChunkType.ENTITY - - def can_handle(self, document: Document, section_content: str) -> bool: - """Check if content appears to define an entity.""" - content_lower = section_content.lower() - - # Check heading path in document metadata or common patterns - for pattern in self.ENTITY_PATTERNS: - if re.search(pattern, content_lower, re.IGNORECASE | re.MULTILINE): - return True - - # Check for attribute list pattern (common in entity definitions) - attribute_lines = re.findall(r"^[-*]\s*\*\*\w+\*\*", section_content, re.MULTILINE) - if len(attribute_lines) >= 3: - return True - - return False - - def chunk( - self, - document: Document, - content: str, - heading_path: list[str] | None = None, - ) -> list[Chunk]: - """Chunk entity content. - - For entities, we try to keep the entire definition together - if possible, including all attributes. 
- """ - chunks = [] - - # Try to keep entity definition intact - if len(content) <= self._config.max_chunk_size: - chunks.append( - self._create_chunk( - document=document, - content=content, - sequence=0, - heading_path=heading_path, - ) - ) - else: - # Split large entity definitions - text_parts = self._split_by_size(content) - for i, part in enumerate(text_parts): - chunks.append( - self._create_chunk( - document=document, - content=part, - sequence=i, - heading_path=heading_path, - ) - ) - - return chunks diff --git a/src/kb_engine/chunking/strategies/process.py b/src/kb_engine/chunking/strategies/process.py deleted file mode 100644 index 1cf3d7e..0000000 --- a/src/kb_engine/chunking/strategies/process.py +++ /dev/null @@ -1,140 +0,0 @@ -"""Process/workflow chunking strategy.""" - -import re - -from kb_engine.chunking.base import BaseChunkingStrategy -from kb_engine.core.models.document import Chunk, ChunkType, Document - - -class ProcessChunkingStrategy(BaseChunkingStrategy): - """Chunking strategy for process/workflow descriptions. - - Identifies and extracts chunks that describe processes, - workflows, procedures, or sequences of steps. 
- """ - - PROCESS_PATTERNS = [ - r"^#+\s*(?:proceso|process|flujo|flow|workflow|procedimiento|procedure)", - r"(?:diagrama\s+de\s+(?:flujo|actividad)|flow\s*chart|activity\s+diagram)", - r"(?:paso(?:s)?|step(?:s)?)\s*(?:\d+|[:])?\s*", - r"(?:secuencia|sequence)\s+(?:de|of)\s+", - r"(?:primero|segundo|tercero|first|second|third|then|después|luego)", - r"(?:\d+\.\s+|\d+\)\s+).*(?:\d+\.\s+|\d+\)\s+)", # Numbered steps - ] - - @property - def chunk_type(self) -> ChunkType: - return ChunkType.PROCESS - - def can_handle(self, document: Document, section_content: str) -> bool: - """Check if content appears to describe a process.""" - content_lower = section_content.lower() - - for pattern in self.PROCESS_PATTERNS: - if re.search(pattern, content_lower, re.IGNORECASE | re.MULTILINE): - return True - - # Check for numbered lists (common in process descriptions) - numbered_items = re.findall(r"^\s*\d+[\.\)]\s+", section_content, re.MULTILINE) - if len(numbered_items) >= 3: - return True - - return False - - def chunk( - self, - document: Document, - content: str, - heading_path: list[str] | None = None, - ) -> list[Chunk]: - """Chunk process content. - - Processes are sequential, so we try to preserve the order - and context of steps while respecting size limits. 
- """ - chunks = [] - - # Try to keep the entire process together if possible - if len(content) <= self._config.max_chunk_size: - chunks.append( - self._create_chunk( - document=document, - content=content, - sequence=0, - heading_path=heading_path, - ) - ) - else: - # Split by logical groups of steps - step_groups = self._group_steps(content) - - for i, group in enumerate(step_groups): - if group.strip(): - # Further split if still too large - if len(group) <= self._config.max_chunk_size: - chunks.append( - self._create_chunk( - document=document, - content=group.strip(), - sequence=len(chunks), - heading_path=heading_path, - ) - ) - else: - text_parts = self._split_by_size(group) - for part in text_parts: - chunks.append( - self._create_chunk( - document=document, - content=part, - sequence=len(chunks), - heading_path=heading_path, - ) - ) - - return chunks - - def _group_steps(self, content: str) -> list[str]: - """Group process steps into logical chunks.""" - # Find numbered steps - step_pattern = r"(?:^|\n)(\s*\d+[\.\)]\s+)" - matches = list(re.finditer(step_pattern, content)) - - if not matches: - # No numbered steps, try bullet points - step_pattern = r"(?:^|\n)(\s*[-*]\s+)" - matches = list(re.finditer(step_pattern, content)) - - if not matches: - return [content] - - # Group steps to fit within target size - groups = [] - current_group_start = 0 - current_group_size = 0 - target_size = self._config.target_chunk_size - - for i, match in enumerate(matches): - step_start = match.start() - - # Determine step content (until next step or end) - if i + 1 < len(matches): - step_end = matches[i + 1].start() - else: - step_end = len(content) - - step_size = step_end - step_start - - if current_group_size + step_size > target_size and current_group_size > 0: - # Start a new group - groups.append(content[current_group_start:step_start].strip()) - current_group_start = step_start - current_group_size = step_size - else: - current_group_size += step_size - - # Add the 
last group - if current_group_start < len(content): - groups.append(content[current_group_start:].strip()) - - return groups diff --git a/src/kb_engine/chunking/strategies/rule.py b/src/kb_engine/chunking/strategies/rule.py deleted file mode 100644 index ab41c97..0000000 --- a/src/kb_engine/chunking/strategies/rule.py +++ /dev/null @@ -1,108 +0,0 @@ -"""Business rule chunking strategy.""" - -import re - -from kb_engine.chunking.base import BaseChunkingStrategy -from kb_engine.core.models.document import Chunk, ChunkType, Document - - -class RuleChunkingStrategy(BaseChunkingStrategy): - """Chunking strategy for business rules. - - Identifies and extracts chunks that define business rules, - constraints, validations, or policies. - """ - - RULE_PATTERNS = [ - r"^#+\s*(?:regla|rule|rn[-_]?\d+|br[-_]?\d+)", - r"(?:regla\s+de\s+negocio|business\s+rule)", - r"(?:restricci[oó]n|constraint|validaci[oó]n|validation)", - r"(?:cuando|when|si|if)\s+.*(?:entonces|then|debe|must|should)", - r"(?:no\s+(?:se\s+)?permite|not\s+allowed|prohibited|forbidden)", - r"(?:obligatorio|mandatory|required|requerido)", - r"(?:pol[ií]tica|policy)\s*[:]\s*", - ] - - @property - def chunk_type(self) -> ChunkType: - return ChunkType.RULE - - def can_handle(self, document: Document, section_content: str) -> bool: - """Check if content appears to define a business rule.""" - content_lower = section_content.lower() - - for pattern in self.RULE_PATTERNS: - if re.search(pattern, content_lower, re.IGNORECASE | re.MULTILINE): - return True - - return False - - def chunk( - self, - document: Document, - content: str, - heading_path: list[str] | None = None, - ) -> list[Chunk]: - """Chunk business rule content. - - Business rules are typically atomic and should be kept intact. - We only split if absolutely necessary due to size. 
- """ - chunks = [] - - # Try to identify individual rules - rules = self._extract_individual_rules(content) - - if len(rules) <= 1: - # Single rule or no clear structure - if len(content) <= self._config.max_chunk_size: - chunks.append( - self._create_chunk( - document=document, - content=content, - sequence=0, - heading_path=heading_path, - ) - ) - else: - text_parts = self._split_by_size(content) - for i, part in enumerate(text_parts): - chunks.append( - self._create_chunk( - document=document, - content=part, - sequence=i, - heading_path=heading_path, - ) - ) - else: - # Multiple rules - chunk each separately - for i, rule in enumerate(rules): - if rule.strip(): - chunks.append( - self._create_chunk( - document=document, - content=rule.strip(), - sequence=i, - heading_path=heading_path, - ) - ) - - return chunks - - def _extract_individual_rules(self, content: str) -> list[str]: - """Extract individual rules from content.""" - # Look for numbered rules or bullet points - rule_pattern = r"(?:^|\n)(?:\d+\.|[-*])\s*(?:RN[-_]?\d+|BR[-_]?\d+|Regla|Rule)?\s*[:\s]" - - parts = re.split(rule_pattern, content, flags=re.IGNORECASE) - if len(parts) > 1: - return [p.strip() for p in parts if p.strip()] - - # Try splitting by "when/if...then" patterns - conditional_pattern = r"(?:^|\n\n)(?:cuando|when|si|if)\s+" - parts = re.split(conditional_pattern, content, flags=re.IGNORECASE) - if len(parts) > 1: - return [p.strip() for p in parts if p.strip()] - - return [content] diff --git a/src/kb_engine/chunking/strategies/use_case.py b/src/kb_engine/chunking/strategies/use_case.py deleted file mode 100644 index b15d2ab..0000000 --- a/src/kb_engine/chunking/strategies/use_case.py +++ /dev/null @@ -1,107 +0,0 @@ -"""Use case chunking strategy.""" - -import re - -from kb_engine.chunking.base import BaseChunkingStrategy -from kb_engine.core.models.document import Chunk, ChunkType, Document - - -class UseCaseChunkingStrategy(BaseChunkingStrategy): - """Chunking strategy for use case 
descriptions. - - Identifies and extracts chunks that describe use cases, - user stories, or functional requirements. - """ - - USE_CASE_PATTERNS = [ - r"^#+\s*(?:caso\s+de\s+uso|use\s+case|cu[-_]?\d+)", - r"^#+\s*(?:historia\s+de\s+usuario|user\s+story|us[-_]?\d+)", - r"(?:como|as\s+a)\s+(?:un|una|an?)\s+\w+.*(?:quiero|want|necesito|need)", - r"(?:actor(?:es)?|actors?)\s*[:]\s*", - r"(?:precondici[oó]n|precondition|postcondici[oó]n|postcondition)", - r"(?:flujo\s+(?:principal|alternativo)|main\s+flow|alternative\s+flow)", - ] - - @property - def chunk_type(self) -> ChunkType: - return ChunkType.USE_CASE - - def can_handle(self, document: Document, section_content: str) -> bool: - """Check if content appears to describe a use case.""" - content_lower = section_content.lower() - - for pattern in self.USE_CASE_PATTERNS: - if re.search(pattern, content_lower, re.IGNORECASE | re.MULTILINE): - return True - - return False - - def chunk( - self, - document: Document, - content: str, - heading_path: list[str] | None = None, - ) -> list[Chunk]: - """Chunk use case content. - - Use cases have a specific structure (actors, preconditions, - flows, postconditions) that we try to preserve. 
- """ - chunks = [] - - # Try to identify use case sections - sections = self._split_use_case_sections(content) - - if len(sections) == 1 or sum(len(s) for s in sections) <= self._config.max_chunk_size: - # Keep entire use case together if small enough - chunks.append( - self._create_chunk( - document=document, - content=content, - sequence=0, - heading_path=heading_path, - ) - ) - else: - # Split by sections - for i, section in enumerate(sections): - if section.strip(): - text_parts = self._split_by_size(section) - for part in text_parts: - chunks.append( - self._create_chunk( - document=document, - content=part, - sequence=len(chunks), - heading_path=heading_path, - ) - ) - - return chunks - - def _split_use_case_sections(self, content: str) -> list[str]: - """Split use case into its constituent sections.""" - section_patterns = [ - r"(?:^|\n)(?:actor(?:es)?|actors?)\s*[:\n]", - r"(?:^|\n)(?:precondici[oó]n(?:es)?|preconditions?)\s*[:\n]", - r"(?:^|\n)(?:flujo\s+principal|main\s+flow)\s*[:\n]", - r"(?:^|\n)(?:flujo(?:s)?\s+alternativo(?:s)?|alternative\s+flow(?:s)?)\s*[:\n]", - r"(?:^|\n)(?:postcondici[oó]n(?:es)?|postconditions?)\s*[:\n]", - ] - - # Find all section boundaries - boundaries = [0] - for pattern in section_patterns: - for match in re.finditer(pattern, content, re.IGNORECASE): - boundaries.append(match.start()) - boundaries.append(len(content)) - boundaries = sorted(set(boundaries)) - - # Extract sections - sections = [] - for i in range(len(boundaries) - 1): - section = content[boundaries[i] : boundaries[i + 1]].strip() - if section: - sections.append(section) - - return sections if sections else [content] diff --git a/src/kb_engine/chunking/types.py b/src/kb_engine/chunking/types.py deleted file mode 100644 index a057d0c..0000000 --- a/src/kb_engine/chunking/types.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Chunk type definitions (re-exported from core for convenience).""" - -from kb_engine.core.models.document import ChunkType - -__all__ = ["ChunkType"] 
diff --git a/src/kb_engine/cli.py b/src/kb_engine/cli.py deleted file mode 100644 index e5a3f89..0000000 --- a/src/kb_engine/cli.py +++ /dev/null @@ -1,620 +0,0 @@ -"""CLI for KB-Engine local mode.""" - -import asyncio -import sys -from pathlib import Path - -import click -import structlog - -from kb_engine.config.logging import configure_logging - -logger = structlog.get_logger(__name__) - - -def run_async(coro): - """Run an async function synchronously.""" - return asyncio.run(coro) - - -async def _create_services(settings=None): - """Create indexing and retrieval services.""" - from kb_engine.config.settings import Settings, get_settings - from kb_engine.embedding.config import EmbeddingConfig - from kb_engine.pipelines.indexation.pipeline import IndexationPipeline - from kb_engine.pipelines.inference.pipeline import RetrievalPipeline - from kb_engine.repositories.factory import RepositoryFactory - from kb_engine.services.indexing import IndexingService - from kb_engine.services.retrieval import RetrievalService - - if settings is None: - settings = get_settings() - - factory = RepositoryFactory(settings) - traceability = await factory.get_traceability_repository() - vector = await factory.get_vector_repository() - graph_strategy = await factory.get_graph_strategy() - graph = await factory.get_graph_repository() - - embedding_config = EmbeddingConfig( - provider=settings.embedding_provider, - local_model_name=settings.local_embedding_model, - openai_model=settings.openai_embedding_model, - ) - - indexing_pipeline = IndexationPipeline( - traceability_repo=traceability, - vector_repo=vector, - graph_strategy=graph_strategy, - embedding_config=embedding_config, - ) - retrieval_pipeline = RetrievalPipeline( - traceability_repo=traceability, - vector_repo=vector, - graph_repo=graph, - embedding_config=embedding_config, - ) - - return ( - IndexingService(pipeline=indexing_pipeline), - RetrievalService(pipeline=retrieval_pipeline), - factory, - ) - - -@click.group() 
-@click.option("--verbose", "-v", is_flag=True, help="Enable verbose logging") -def cli(verbose: bool) -> None: - """KB-Engine: Intelligent document retrieval system.""" - log_level = "DEBUG" if verbose else "INFO" - configure_logging(log_level=log_level) - - -@cli.command() -@click.argument("repo_path", default=".") -@click.option("--name", "-n", help="Repository name (default: directory name)") -@click.option("--pattern", "-p", multiple=True, default=["**/*.md"], help="Include glob patterns") -@click.option("--exclude", "-e", multiple=True, help="Exclude glob patterns") -def index(repo_path: str, name: str | None, pattern: tuple[str, ...], exclude: tuple[str, ...]) -> None: - """Index a Git repository. - - Scans the repository for matching files and indexes them. - """ - repo_path_obj = Path(repo_path).resolve() - if not repo_path_obj.exists(): - click.echo(f"Error: Path does not exist: {repo_path_obj}", err=True) - sys.exit(1) - if repo_path_obj.is_file(): - repo_path_obj = repo_path_obj.parent - - repo_name = name or repo_path_obj.name - - async def _index(): - from kb_engine.core.models.repository import RepositoryConfig - - config = RepositoryConfig( - name=repo_name, - local_path=str(repo_path_obj), - include_patterns=list(pattern), - exclude_patterns=list(exclude), - ) - - indexing_service, _, factory = await _create_services() - try: - click.echo(f"Indexing repository: {repo_name} ({repo_path_obj})") - documents = await indexing_service.index_repository(config) - click.echo(f"Indexed {len(documents)} documents") - for doc in documents: - click.echo(f" - {doc.relative_path or doc.title}") - finally: - await factory.close() - - run_async(_index()) - - -@cli.command() -@click.argument("query") -@click.option("--limit", "-l", default=10, help="Max results") -@click.option("--threshold", "-t", type=float, default=None, help="Min score threshold") -@click.option("--json", "output_json", is_flag=True, help="Output results as JSON") -@click.option("--mode", "-m", 
type=click.Choice(["vector", "graph", "hybrid"]), default="vector", help="Retrieval mode") -@click.option("--status", "-s", multiple=True, help="Include documents with these statuses (default: approved). Can repeat.") -@click.option("--include-all", is_flag=True, help="Include documents of all statuses") -def search(query: str, limit: int, threshold: float | None, output_json: bool, mode: str, status: tuple[str, ...], include_all: bool) -> None: - """Search the knowledge base. - - Returns document references with URLs pointing to exact sections. - By default, only 'approved' documents are searched. Use --status to include - other statuses (draft, proposed, deprecated) or --include-all for everything. - """ - import json - - from kb_engine.core.models.search import RetrievalMode, SearchFilters - - mode_map = { - "vector": RetrievalMode.VECTOR, - "graph": RetrievalMode.GRAPH, - "hybrid": RetrievalMode.HYBRID, - } - - # Build status filters - filters = None - if include_all: - filters = SearchFilters(include_all_statuses=True) - elif status: - filters = SearchFilters(include_statuses=list(status)) - - async def _search(): - _, retrieval_service, factory = await _create_services() - try: - response = await retrieval_service.search( - query=query, - limit=limit, - score_threshold=threshold, - mode=mode_map[mode], - filters=filters, - ) - - if output_json: - # JSON output for agents - output = { - "query": response.query, - "total_count": response.total_count, - "processing_time_ms": response.processing_time_ms, - "references": [ - { - "url": ref.url, - "document_path": ref.document_path, - "title": ref.title, - "section_title": ref.section_title, - "section_anchor": ref.section_anchor, - "score": ref.score, - "snippet": ref.snippet, - "domain": ref.domain, - "tags": ref.tags, - "chunk_type": ref.chunk_type, - "retrieval_mode": ref.retrieval_mode.value, - "kdd_status": ref.kdd_status, - "kdd_version": ref.kdd_version, - "metadata": ref.metadata, - } - for ref in 
response.references - ], - } - click.echo(json.dumps(output, indent=2, ensure_ascii=False)) - return - - # Human-readable output - if not response.references: - click.echo("No results found.") - return - - click.echo(f"Found {response.total_count} results ({response.processing_time_ms:.0f}ms):\n") - for i, ref in enumerate(response.references, 1): - mode_indicator = f"[{ref.retrieval_mode.value}]" if ref.retrieval_mode.value != "vector" else "" - click.echo(f" {i}. [{ref.score:.3f}] {mode_indicator} {ref.url}") - if ref.title: - click.echo(f" Title: {ref.title}") - if ref.section_title: - click.echo(f" Section: {ref.section_title}") - if ref.snippet: - snippet = ref.snippet[:120] + "..." if len(ref.snippet) > 120 else ref.snippet - click.echo(f" {snippet}") - # Show graph relationships if present - if ref.metadata.get("graph_relationships"): - rels = ref.metadata["graph_relationships"] - rel_strs = [f"{r['type']}→{r['related_node']}" for r in rels[:3]] - click.echo(f" Relations: {', '.join(rel_strs)}") - click.echo() - finally: - await factory.close() - - run_async(_search()) - - -@cli.command() -@click.argument("repo_path", default=".") -@click.option("--name", "-n", help="Repository name (default: directory name)") -@click.option("--since", "-s", required=True, help="Commit hash to sync from") -@click.option("--pattern", "-p", multiple=True, default=["**/*.md"], help="Include glob patterns") -def sync(repo_path: str, name: str | None, since: str, pattern: tuple[str, ...]) -> None: - """Sync a repository incrementally. - - Only re-indexes files that changed since the given commit. 
- """ - repo_path_obj = Path(repo_path).resolve() - repo_name = name or repo_path_obj.name - - async def _sync(): - from kb_engine.core.models.repository import RepositoryConfig - - config = RepositoryConfig( - name=repo_name, - local_path=str(repo_path_obj), - include_patterns=list(pattern), - ) - - indexing_service, _, factory = await _create_services() - try: - click.echo(f"Syncing repository: {repo_name} (since {since[:8]}...)") - result = await indexing_service.sync_repository(config, since) - click.echo( - f"Sync complete: {result['indexed']} indexed, " - f"{result['deleted']} deleted, {result['skipped']} unchanged" - ) - click.echo(f"Current commit: {result['commit'][:8]}") - finally: - await factory.close() - - run_async(_sync()) - - -@cli.command() -def status() -> None: - """Show the status of the local index.""" - async def _status(): - from kb_engine.config.settings import get_settings - - settings = get_settings() - _, _, factory = await _create_services(settings) - - try: - traceability = await factory.get_traceability_repository() - vector = await factory.get_vector_repository() - - docs = await traceability.list_documents(limit=1000) - vector_info = await vector.get_collection_info() - - click.echo("KB-Engine Status") - click.echo(f" Profile: {settings.profile}") - click.echo(f" SQLite DB: {settings.sqlite_path}") - click.echo(f" ChromaDB: {settings.chroma_path}") - click.echo(f" Embedding: {settings.embedding_provider} ({settings.local_embedding_model})") - click.echo(f" Documents: {len(docs)}") - click.echo(f" Vectors: {vector_info.get('count', 'N/A')}") - - if docs: - click.echo("\nIndexed documents:") - for doc in docs[:20]: - status_str = doc.status.value - path = doc.relative_path or doc.source_path or doc.title - click.echo(f" [{status_str:>10}] {path}") - if len(docs) > 20: - click.echo(f" ... 
and {len(docs) - 20} more") - finally: - await factory.close() - - run_async(_status()) - - -def _get_graph_store(): - """Create and initialize a FalkorDB graph store from settings.""" - from kb_engine.config.settings import get_settings - from kb_engine.smart.stores.falkordb_graph import FalkorDBGraphStore - - settings = get_settings() - store = FalkorDBGraphStore(settings.falkordb_path) - store.initialize() - return store - - -@cli.group() -def graph() -> None: - """Graph-related commands.""" - pass - - -@graph.command("orphans") -@click.option("--json", "output_json", is_flag=True, help="Output as JSON") -def graph_orphans(output_json: bool) -> None: - """List stub entities without a primary document. - - These are entities referenced by other documents but whose own - document hasn't been indexed yet. - """ - import json - - store = _get_graph_store() - orphans = store.get_orphan_entities() - - if output_json: - click.echo(json.dumps({"orphans": orphans, "count": len(orphans)}, indent=2)) - return - - if not orphans: - click.echo("No orphan entities found. All referenced entities have primary documents.") - return - - click.echo(f"Found {len(orphans)} orphan entities (stubs without primary document):\n") - for entity in orphans: - click.echo(f" - {entity['name']} (confidence: {entity['confidence']:.2f})") - click.echo(f" Referenced by: {', '.join(entity['referenced_by'])}") - - -@graph.command("completeness") -@click.option("--json", "output_json", is_flag=True, help="Output as JSON") -@click.option("--status", "-s", type=click.Choice(["complete", "stub", "orphan"]), help="Filter by status") -def graph_completeness(output_json: bool, status: str | None) -> None: - """Show completeness status for all entities. 
- - Status types: - - complete: Has a primary document - - stub: Only referenced, no primary document yet - - orphan: No provenance edges at all - """ - import json - - store = _get_graph_store() - entities = store.get_entity_completeness() - - if status: - entities = [e for e in entities if e["status"] == status] - - if output_json: - click.echo(json.dumps({"entities": entities, "count": len(entities)}, indent=2)) - return - - if not entities: - click.echo("No entities found.") - return - - # Group by status - by_status = {"complete": [], "stub": [], "orphan": []} - for e in entities: - by_status[e["status"]].append(e) - - click.echo(f"Entity completeness ({len(entities)} total):\n") - - if by_status["complete"]: - click.echo(f" Complete ({len(by_status['complete'])}):") - for e in by_status["complete"][:10]: - docs = ", ".join(e["primary_docs"]) if e["primary_docs"] else "?" - click.echo(f" [OK] {e['name']} <- {docs}") - if len(by_status["complete"]) > 10: - click.echo(f" ... and {len(by_status['complete']) - 10} more") - - if by_status["stub"]: - click.echo(f"\n Stubs ({len(by_status['stub'])}):") - for e in by_status["stub"]: - refs = ", ".join(e["referenced_by"]) if e["referenced_by"] else "?" 
- click.echo(f" [STUB] {e['name']} (referenced by: {refs})") - - if by_status["orphan"]: - click.echo(f"\n Orphans ({len(by_status['orphan'])}):") - for e in by_status["orphan"]: - click.echo(f" [ORPHAN] {e['name']}") - - -@graph.command("stats") -@click.option("--json", "output_json", is_flag=True, help="Output as JSON") -def graph_stats(output_json: bool) -> None: - """Show graph database statistics.""" - import json - - store = _get_graph_store() - stats = store.get_stats() - - if output_json: - click.echo(json.dumps(stats, indent=2)) - return - - click.echo("Graph Statistics:") - click.echo(f" Entities: {stats.get('entity_count', 0)}") - click.echo(f" Concepts: {stats.get('concept_count', 0)}") - click.echo(f" Events: {stats.get('event_count', 0)}") - click.echo(f" Documents: {stats.get('document_count', 0)}") - total = sum(stats.get(f"{t}_count", 0) for t in ["entity", "concept", "event"]) - click.echo(f" Total domain nodes: {total}") - - -@graph.command("ls") -@click.option("--type", "node_type", type=click.Choice(["entity", "concept", "event"]), help="Filter by node type") -@click.option("--json", "output_json", is_flag=True, help="Output as JSON") -def graph_ls(node_type: str | None, output_json: bool) -> None: - """List all domain nodes in the graph.""" - import json - - store = _get_graph_store() - nodes = store.get_all_nodes(node_type) - - if output_json: - click.echo(json.dumps({"nodes": nodes, "count": len(nodes)}, indent=2)) - return - - if not nodes: - click.echo("No nodes found.") - return - - click.echo(f"Found {len(nodes)} nodes:\n") - for node in nodes: - click.echo(f" [{node['label']}] {node['id']} {node['name']}") - - -@graph.command("inspect") -@click.argument("node_id") -@click.option("--depth", "-d", default=2, help="Traversal depth (default: 2)") -@click.option("--json", "output_json", is_flag=True, help="Output as JSON") -def graph_inspect(node_id: str, depth: int, output_json: bool) -> None: - """Inspect a node and its neighborhood. 
- - Shows the node's properties, related nodes, and provenance. - """ - import json - - store = _get_graph_store() - neighborhood = store.get_node_graph(node_id, depth=depth) - provenance = store.get_node_provenance(node_id) - - if output_json: - click.echo(json.dumps({ - "neighborhood": neighborhood, - "provenance": provenance, - }, indent=2)) - return - - click.echo(f"Node: {node_id}") - click.echo(f" Depth: {depth}") - - if neighborhood["nodes"]: - click.echo(f"\n Related nodes ({len(neighborhood['nodes'])}):") - for n in neighborhood["nodes"]: - click.echo(f" [{n['node_type']}] {n['id']} {n['name']}") - else: - click.echo("\n No related nodes.") - - if neighborhood["edge_types"]: - click.echo(f"\n Relationship types: {', '.join(neighborhood['edge_types'])}") - - if provenance: - click.echo(f"\n Provenance ({len(provenance)} documents):") - for p in provenance: - click.echo(f" [{p['role']}] {p['doc_id']} {p['title']}") - else: - click.echo("\n No provenance records.") - - -@graph.command("path") -@click.argument("from_id") -@click.argument("to_id") -@click.option("--max-depth", default=5, help="Maximum traversal depth (default: 5)") -@click.option("--json", "output_json", is_flag=True, help="Output as JSON") -def graph_path(from_id: str, to_id: str, max_depth: int, output_json: bool) -> None: - """Check reachability between two nodes.""" - import json - - store = _get_graph_store() - paths = store.find_path(from_id, to_id, max_depth=max_depth) - - if output_json: - click.echo(json.dumps({ - "from": from_id, - "to": to_id, - "max_depth": max_depth, - "reachable": len(paths) > 0, - "paths": paths, - }, indent=2)) - return - - if paths: - p = paths[0] - click.echo(f"Path found: {p['start_name']} -> {p['end_name']}") - else: - click.echo(f"No path found between {from_id} and {to_id} (max depth: {max_depth}).") - - -@graph.command("impact") -@click.argument("doc_id") -@click.option("--json", "output_json", is_flag=True, help="Output as JSON") -def 
graph_impact(doc_id: str, output_json: bool) -> None: - """Show nodes extracted from a document.""" - import json - - store = _get_graph_store() - nodes = store.get_document_impact(doc_id) - - if output_json: - click.echo(json.dumps({"doc_id": doc_id, "nodes": nodes, "count": len(nodes)}, indent=2)) - return - - if not nodes: - click.echo(f"No nodes found for document: {doc_id}") - return - - click.echo(f"Document {doc_id} impact ({len(nodes)} nodes):\n") - for n in nodes: - conf = f" (confidence: {n['confidence']:.2f})" if n.get("confidence") else "" - click.echo(f" [{n['node_type']}] {n['id']} {n['name']} role={n['role']}{conf}") - - -@graph.command("provenance") -@click.argument("node_id") -@click.option("--json", "output_json", is_flag=True, help="Output as JSON") -def graph_provenance(node_id: str, output_json: bool) -> None: - """Show documents that contributed to a node.""" - import json - - store = _get_graph_store() - provenance = store.get_node_provenance(node_id) - - if output_json: - click.echo(json.dumps({ - "node_id": node_id, - "provenance": provenance, - "count": len(provenance), - }, indent=2)) - return - - if not provenance: - click.echo(f"No provenance records for node: {node_id}") - return - - click.echo(f"Provenance for {node_id} ({len(provenance)} documents):\n") - for p in provenance: - click.echo(f" [{p['role']}] {p['doc_id']} {p['title']}") - if p.get("path"): - click.echo(f" {p['path']}") - - -@graph.command("cypher") -@click.argument("query") -@click.option("--json", "output_json", is_flag=True, help="Output as JSON") -def graph_cypher(query: str, output_json: bool) -> None: - """Execute a raw Cypher query. 
- - Example: kb graph cypher "MATCH (n) RETURN labels(n)[0] as type, count(n) as cnt" - """ - import json - - store = _get_graph_store() - - try: - results = store.execute_cypher(query) - except Exception as e: - if output_json: - click.echo(json.dumps({"error": str(e)}, indent=2)) - else: - click.echo(f"Cypher error: {e}", err=True) - sys.exit(1) - - if output_json: - click.echo(json.dumps({"results": results, "count": len(results)}, indent=2)) - return - - if not results: - click.echo("Query returned no results.") - return - - # Table output - headers = list(results[0].keys()) - click.echo(" ".join(headers)) - click.echo(" ".join("-" * len(h) for h in headers)) - for row in results: - click.echo(" ".join(str(row.get(h, "")) for h in headers)) - - -@graph.command("delete") -@click.argument("node_id") -@click.option("--force", "-f", is_flag=True, help="Skip confirmation prompt") -@click.option("--json", "output_json", is_flag=True, help="Output as JSON") -def graph_delete(node_id: str, force: bool, output_json: bool) -> None: - """Delete a node and all its relationships.""" - import json - - if not force: - click.confirm(f"Delete node '{node_id}' and all its relationships?", abort=True) - - store = _get_graph_store() - deleted = store.delete_node(node_id) - - if output_json: - click.echo(json.dumps({"node_id": node_id, "deleted": deleted}, indent=2)) - return - - if deleted: - click.echo(f"Deleted node: {node_id}") - else: - click.echo(f"Node not found: {node_id}") - - -if __name__ == "__main__": - cli() diff --git a/src/kb_engine/config/__init__.py b/src/kb_engine/config/__init__.py deleted file mode 100644 index 2a3fbf3..0000000 --- a/src/kb_engine/config/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Configuration module for KB-Engine.""" - -from kb_engine.config.settings import Settings, get_settings - -__all__ = ["Settings", "get_settings"] diff --git a/src/kb_engine/config/logging.py b/src/kb_engine/config/logging.py deleted file mode 100644 index 
326fee5..0000000 --- a/src/kb_engine/config/logging.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Logging configuration.""" - -import logging -import sys - -import structlog - - -def configure_logging(log_level: str = "INFO", json_logs: bool = False) -> None: - """Configure structured logging for the application.""" - - # Set base log level - logging.basicConfig( - format="%(message)s", - stream=sys.stdout, - level=getattr(logging, log_level.upper()), - ) - - # Configure structlog - processors = [ - structlog.contextvars.merge_contextvars, - structlog.processors.add_log_level, - structlog.processors.StackInfoRenderer(), - structlog.dev.set_exc_info, - structlog.processors.TimeStamper(fmt="iso"), - ] - - if json_logs: - processors.append(structlog.processors.JSONRenderer()) - else: - processors.append(structlog.dev.ConsoleRenderer()) - - structlog.configure( - processors=processors, - wrapper_class=structlog.make_filtering_bound_logger( - getattr(logging, log_level.upper()) - ), - context_class=dict, - logger_factory=structlog.PrintLoggerFactory(), - cache_logger_on_first_use=True, - ) - - -def get_logger(name: str) -> structlog.BoundLogger: - """Get a structured logger instance.""" - return structlog.get_logger(name) diff --git a/src/kb_engine/config/settings.py b/src/kb_engine/config/settings.py deleted file mode 100644 index f200523..0000000 --- a/src/kb_engine/config/settings.py +++ /dev/null @@ -1,149 +0,0 @@ -"""Application settings using Pydantic Settings.""" - -from functools import lru_cache -from pathlib import Path - -from pydantic_settings import BaseSettings, SettingsConfigDict - - -class Settings(BaseSettings): - """Application settings loaded from environment variables.""" - - model_config = SettingsConfigDict( - env_file=".env", - env_file_encoding="utf-8", - case_sensitive=False, - ) - - # General - environment: str = "development" - debug: bool = True - log_level: str = "INFO" - - # Profile: "local" (SQLite+ChromaDB) or "server" (PostgreSQL+Qdrant+Neo4j) - 
profile: str = "local" - - # API - api_host: str = "0.0.0.0" - api_port: int = 8000 - api_workers: int = 1 - - # --- Traceability store --- - traceability_store: str = "sqlite" # "sqlite" | "postgres" - - # SQLite (local profile) - sqlite_path: str = "~/.kb-engine/kb.db" - - # PostgreSQL (server profile) - postgres_host: str = "localhost" - postgres_port: int = 5432 - postgres_user: str = "kb_engine" - postgres_password: str = "changeme" - postgres_db: str = "kb_engine" - database_url: str | None = None - - # --- Vector store --- - vector_store: str = "chroma" # "chroma" | "qdrant" - - # ChromaDB (local profile) - chroma_path: str = "~/.kb-engine/chroma" - - # Qdrant (server profile) - qdrant_host: str = "localhost" - qdrant_port: int = 6333 - qdrant_grpc_port: int = 6334 - qdrant_api_key: str | None = None - qdrant_collection: str = "kb_engine_embeddings" - - # --- Graph store --- - graph_store: str = "falkordb" # "sqlite" | "neo4j" | "falkordb" | "none" - - # FalkorDB (local profile) - falkordb_path: str = "~/.kb-engine/graph.db" - - # Neo4j (server profile) - neo4j_uri: str = "bolt://localhost:7687" - neo4j_user: str = "neo4j" - neo4j_password: str = "changeme" - - # --- Embeddings (independent of profile) --- - embedding_provider: str = "local" # "local" | "openai" - local_embedding_model: str = "all-mpnet-base-v2" - - # OpenAI - openai_api_key: str | None = None - openai_embedding_model: str = "text-embedding-3-small" - openai_chat_model: str = "gpt-4-turbo-preview" - - # Chunking - chunk_size_min: int = 100 - chunk_size_target: int = 512 - chunk_size_max: int = 1024 - chunk_overlap: int = 50 - - # Extraction - extraction_use_llm: bool = False - extraction_confidence_threshold: float = 0.7 - - def __init__(self, **kwargs) -> None: - super().__init__(**kwargs) - # Build database URL if not provided (server profile) - if self.database_url is None and self.traceability_store == "postgres": - self.database_url = ( - 
f"postgresql+asyncpg://{self.postgres_user}:{self.postgres_password}" - f"@{self.postgres_host}:{self.postgres_port}/{self.postgres_db}" - ) - # Resolve paths - self.sqlite_path = str(Path(self.sqlite_path).expanduser()) - self.chroma_path = str(Path(self.chroma_path).expanduser()) - self.falkordb_path = str(Path(self.falkordb_path).expanduser()) - self._validate_config() - - def _validate_config(self) -> None: - """Validate cross-field configuration.""" - profile = self.profile.lower() - traceability = self.traceability_store.lower() - vector = self.vector_store.lower() - graph = self.graph_store.lower() - - if profile not in {"local", "server"}: - raise ValueError(f"Unknown profile: {self.profile}") - - if profile == "local": - if traceability != "sqlite": - raise ValueError("profile=local requires traceability_store=sqlite") - if vector != "chroma": - raise ValueError("profile=local requires vector_store=chroma") - if graph not in {"sqlite", "falkordb", "none"}: - raise ValueError("profile=local requires graph_store=sqlite|falkordb|none") - elif profile == "server": - if traceability != "postgres": - raise ValueError("profile=server requires traceability_store=postgres") - if vector != "qdrant": - raise ValueError("profile=server requires vector_store=qdrant") - if graph not in {"neo4j", "none"}: - raise ValueError("profile=server requires graph_store=neo4j|none") - - if traceability == "postgres" and not self.database_url: - raise ValueError("database_url is required when traceability_store=postgres") - - if self.embedding_provider.lower() == "openai" and not self.openai_api_key: - raise ValueError("openai_api_key is required when embedding_provider=openai") - - @property - def is_production(self) -> bool: - return self.environment.lower() == "production" - - @property - def is_development(self) -> bool: - return self.environment.lower() == "development" - - @property - def is_local_profile(self) -> bool: - return self.profile.lower() == "local" - - -@lru_cache 
-def get_settings() -> Settings: - """Get cached settings instance.""" - return Settings() diff --git a/src/kb_engine/core/__init__.py b/src/kb_engine/core/__init__.py deleted file mode 100644 index d21340a..0000000 --- a/src/kb_engine/core/__init__.py +++ /dev/null @@ -1,51 +0,0 @@ -"""Core domain models and interfaces for KB-Engine.""" - -from kb_engine.core.exceptions import ( - ChunkingError, - ConfigurationError, - ExtractionError, - KBPodError, - RepositoryError, - ValidationError, -) -from kb_engine.core.models import ( - EXTENSION_DEFAULTS, - Chunk, - Document, - DocumentReference, - Edge, - EdgeType, - Embedding, - FileTypeConfig, - Node, - NodeType, - RetrievalMode, - RetrievalResponse, - RepositoryConfig, - SearchFilters, -) - -__all__ = [ - # Models - "Document", - "Chunk", - "Embedding", - "Node", - "Edge", - "NodeType", - "EdgeType", - "SearchFilters", - "DocumentReference", - "RetrievalResponse", - "RetrievalMode", - "RepositoryConfig", - "FileTypeConfig", - "EXTENSION_DEFAULTS", - # Exceptions - "KBPodError", - "ConfigurationError", - "ValidationError", - "RepositoryError", - "ChunkingError", - "ExtractionError", -] diff --git a/src/kb_engine/core/exceptions.py b/src/kb_engine/core/exceptions.py deleted file mode 100644 index 2d710f1..0000000 --- a/src/kb_engine/core/exceptions.py +++ /dev/null @@ -1,64 +0,0 @@ -"""Custom exceptions for KB-Engine.""" - - -class KBPodError(Exception): - """Base exception for all KB-Engine errors.""" - - def __init__(self, message: str, details: dict | None = None) -> None: - super().__init__(message) - self.message = message - self.details = details or {} - - -class ConfigurationError(KBPodError): - """Raised when there's a configuration problem.""" - - pass - - -class ValidationError(KBPodError): - """Raised when validation fails.""" - - pass - - -class RepositoryError(KBPodError): - """Raised when a repository operation fails.""" - - pass - - -class ChunkingError(KBPodError): - """Raised when chunking fails.""" - - 
pass - - -class ExtractionError(KBPodError): - """Raised when entity extraction fails.""" - - pass - - -class EmbeddingError(KBPodError): - """Raised when embedding generation fails.""" - - pass - - -class PipelineError(KBPodError): - """Raised when a pipeline step fails.""" - - pass - - -class DocumentNotFoundError(KBPodError): - """Raised when a document is not found.""" - - pass - - -class DuplicateDocumentError(KBPodError): - """Raised when attempting to create a duplicate document.""" - - pass diff --git a/src/kb_engine/core/interfaces/__init__.py b/src/kb_engine/core/interfaces/__init__.py deleted file mode 100644 index 8f81052..0000000 --- a/src/kb_engine/core/interfaces/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -"""Core interfaces (protocols) for KB-Engine.""" - -from kb_engine.core.interfaces.chunkers import ChunkingStrategy -from kb_engine.core.interfaces.extractors import EntityExtractor -from kb_engine.core.interfaces.repositories import ( - GraphRepository, - TraceabilityRepository, - VectorRepository, -) - -__all__ = [ - "TraceabilityRepository", - "VectorRepository", - "GraphRepository", - "ChunkingStrategy", - "EntityExtractor", -] diff --git a/src/kb_engine/core/interfaces/chunkers.py b/src/kb_engine/core/interfaces/chunkers.py deleted file mode 100644 index 6e06f01..0000000 --- a/src/kb_engine/core/interfaces/chunkers.py +++ /dev/null @@ -1,49 +0,0 @@ -"""Chunking strategy protocol as defined in ADR-0002.""" - -from typing import Protocol - -from kb_engine.core.models.document import Chunk, ChunkType, Document - - -class ChunkingStrategy(Protocol): - """Protocol for chunking strategies. - - Different content types require different chunking approaches. - Each strategy knows how to extract meaningful chunks from - specific types of content. - """ - - @property - def chunk_type(self) -> ChunkType: - """The type of chunks this strategy produces.""" - ... 
- - def can_handle(self, document: Document, section_content: str) -> bool: - """Check if this strategy can handle the given content. - - Args: - document: The source document. - section_content: The content of a section to potentially chunk. - - Returns: - True if this strategy should handle this content. - """ - ... - - def chunk( - self, - document: Document, - content: str, - heading_path: list[str] | None = None, - ) -> list[Chunk]: - """Chunk the content into semantic units. - - Args: - document: The source document. - content: The content to chunk. - heading_path: The path of headings leading to this content. - - Returns: - List of chunks extracted from the content. - """ - ... diff --git a/src/kb_engine/core/interfaces/extractors.py b/src/kb_engine/core/interfaces/extractors.py deleted file mode 100644 index bc9f07e..0000000 --- a/src/kb_engine/core/interfaces/extractors.py +++ /dev/null @@ -1,73 +0,0 @@ -"""Entity extractor protocol as defined in ADR-0003.""" - -from dataclasses import dataclass -from typing import Protocol - -from kb_engine.core.models.document import Chunk, Document -from kb_engine.core.models.graph import Edge, Node - - -@dataclass -class GraphExtractionResult: - """Result of graph extraction strategy.""" - - nodes_created: int = 0 - edges_created: int = 0 - - -class ExtractionResult: - """Result of entity extraction.""" - - def __init__( - self, - nodes: list[Node] | None = None, - edges: list[Edge] | None = None, - ) -> None: - self.nodes = nodes or [] - self.edges = edges or [] - - -class EntityExtractor(Protocol): - """Protocol for entity extractors. - - Extractors identify entities and relationships from chunks - to build the knowledge graph. - """ - - @property - def name(self) -> str: - """The name of this extractor.""" - ... - - @property - def priority(self) -> int: - """Extraction priority (lower = higher priority).""" - ... 
- - def can_extract(self, chunk: Chunk, document: Document) -> bool: - """Check if this extractor can process the given chunk. - - Args: - chunk: The chunk to potentially extract from. - document: The source document. - - Returns: - True if this extractor should process this chunk. - """ - ... - - async def extract( - self, - chunk: Chunk, - document: Document, - ) -> ExtractionResult: - """Extract entities and relationships from a chunk. - - Args: - chunk: The chunk to extract from. - document: The source document for context. - - Returns: - ExtractionResult containing nodes and edges. - """ - ... diff --git a/src/kb_engine/core/interfaces/repositories.py b/src/kb_engine/core/interfaces/repositories.py deleted file mode 100644 index 1c9fe2b..0000000 --- a/src/kb_engine/core/interfaces/repositories.py +++ /dev/null @@ -1,153 +0,0 @@ -"""Repository protocols as defined in ADR-0001.""" - -from typing import Protocol -from uuid import UUID - -from kb_engine.core.models.document import Chunk, Document -from kb_engine.core.models.embedding import Embedding -from kb_engine.core.models.graph import Edge, Node -from kb_engine.core.models.search import SearchFilters - - -class TraceabilityRepository(Protocol): - """Protocol for the traceability store (PostgreSQL). - - Stores documents, chunks, and their relationships for - full traceability and audit capabilities. - """ - - async def save_document(self, document: Document) -> Document: - """Save a document to the store.""" - ... - - async def get_document(self, document_id: UUID) -> Document | None: - """Get a document by ID.""" - ... - - async def get_document_by_external_id(self, external_id: str) -> Document | None: - """Get a document by external ID.""" - ... - - async def list_documents( - self, - filters: SearchFilters | None = None, - limit: int = 100, - offset: int = 0, - ) -> list[Document]: - """List documents with optional filters.""" - ... 
- - async def update_document(self, document: Document) -> Document: - """Update an existing document.""" - ... - - async def delete_document(self, document_id: UUID) -> bool: - """Delete a document and its chunks.""" - ... - - async def save_chunks(self, chunks: list[Chunk]) -> list[Chunk]: - """Save multiple chunks.""" - ... - - async def get_chunks_by_document(self, document_id: UUID) -> list[Chunk]: - """Get all chunks for a document.""" - ... - - async def get_chunk(self, chunk_id: UUID) -> Chunk | None: - """Get a chunk by ID.""" - ... - - async def delete_chunks_by_document(self, document_id: UUID) -> int: - """Delete all chunks for a document. Returns count deleted.""" - ... - - -class VectorRepository(Protocol): - """Protocol for the vector store (Qdrant/Weaviate/PGVector). - - Stores embeddings and provides similarity search. - """ - - async def upsert_embeddings(self, embeddings: list[Embedding]) -> int: - """Upsert embeddings. Returns count upserted.""" - ... - - async def search( - self, - query_vector: list[float], - limit: int = 10, - filters: SearchFilters | None = None, - score_threshold: float | None = None, - ) -> list[tuple[UUID, float]]: - """Search for similar vectors. Returns (chunk_id, score) pairs.""" - ... - - async def delete_by_document(self, document_id: UUID) -> int: - """Delete all embeddings for a document. Returns count deleted.""" - ... - - async def delete_by_chunk_ids(self, chunk_ids: list[UUID]) -> int: - """Delete embeddings by chunk IDs. Returns count deleted.""" - ... - - async def get_collection_info(self) -> dict[str, int | str]: - """Get information about the collection.""" - ... - - -class GraphRepository(Protocol): - """Protocol for the graph store (Neo4j/Nebula). - - Stores the knowledge graph for entity relationships. - """ - - async def create_node(self, node: Node) -> Node: - """Create a node in the graph.""" - ... - - async def get_node(self, node_id: UUID) -> Node | None: - """Get a node by ID.""" - ... 
- - async def find_nodes( - self, - node_type: str | None = None, - name_pattern: str | None = None, - limit: int = 100, - ) -> list[Node]: - """Find nodes by type or name pattern.""" - ... - - async def create_edge(self, edge: Edge) -> Edge: - """Create an edge between nodes.""" - ... - - async def get_edges( - self, - node_id: UUID, - direction: str = "both", # "in", "out", "both" - edge_type: str | None = None, - ) -> list[Edge]: - """Get edges connected to a node.""" - ... - - async def traverse( - self, - start_node_id: UUID, - max_hops: int = 2, - edge_types: list[str] | None = None, - ) -> list[tuple[Node, Edge, Node]]: - """Traverse the graph from a starting node. Returns (source, edge, target) triples.""" - ... - - async def delete_by_document(self, document_id: UUID) -> int: - """Delete all nodes and edges from a document. Returns count deleted.""" - ... - - async def find_similar_nodes( - self, - node_id: UUID, - limit: int = 10, - ) -> list[tuple[Node, float]]: - """Find similar nodes based on graph structure. Returns (node, similarity) pairs.""" - ... 
diff --git a/src/kb_engine/core/models/__init__.py b/src/kb_engine/core/models/__init__.py deleted file mode 100644 index f331607..0000000 --- a/src/kb_engine/core/models/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Domain models for KB-Engine.""" - -from kb_engine.core.models.document import Chunk, Document -from kb_engine.core.models.embedding import Embedding -from kb_engine.core.models.graph import Edge, EdgeType, Node, NodeType -from kb_engine.core.models.repository import EXTENSION_DEFAULTS, FileTypeConfig, RepositoryConfig -from kb_engine.core.models.search import ( - DocumentReference, - RetrievalMode, - RetrievalResponse, - SearchFilters, -) - -__all__ = [ - "Document", - "Chunk", - "Embedding", - "Node", - "Edge", - "NodeType", - "EdgeType", - "SearchFilters", - "DocumentReference", - "RetrievalResponse", - "RetrievalMode", - "RepositoryConfig", - "FileTypeConfig", - "EXTENSION_DEFAULTS", -] diff --git a/src/kb_engine/core/models/document.py b/src/kb_engine/core/models/document.py deleted file mode 100644 index 0102eca..0000000 --- a/src/kb_engine/core/models/document.py +++ /dev/null @@ -1,144 +0,0 @@ -"""Document and Chunk models.""" - -from datetime import datetime -from enum import Enum -from typing import Any -from uuid import UUID, uuid4 - -from pydantic import BaseModel, Field, field_validator - - -class DocumentStatus(str, Enum): - """Document processing status (internal workflow).""" - - PENDING = "pending" - PROCESSING = "processing" - INDEXED = "indexed" - FAILED = "failed" - ARCHIVED = "archived" - - -class KDDStatus(str, Enum): - """KDD document lifecycle status (from frontmatter). 
- - Controls visibility in search results: - - draft: Work in progress, not ready for review - - proposed: Ready for review, not yet approved - - approved: Official, included in default searches (alias: active) - - deprecated: Obsolete, excluded by default but preserved - """ - - DRAFT = "draft" - PROPOSED = "proposed" - APPROVED = "approved" - DEPRECATED = "deprecated" - - @classmethod - def from_string(cls, value: str | None) -> "KDDStatus": - """Parse status from string, handling aliases.""" - if value is None: - return cls.APPROVED # Default - normalized = value.lower().strip() - # Handle aliases - if normalized == "active": - return cls.APPROVED - try: - return cls(normalized) - except ValueError: - return cls.APPROVED # Default for unknown values - - -class ChunkType(str, Enum): - """Semantic chunk types as defined in ADR-0002.""" - - ENTITY = "entity" - USE_CASE = "use_case" - RULE = "rule" - PROCESS = "process" - DEFAULT = "default" - - -class Document(BaseModel): - """A document in the knowledge base. - - Represents a source document (typically a KDD markdown file) that - contains knowledge to be indexed and retrieved. 
- """ - - id: UUID = Field(default_factory=uuid4) - - @field_validator("id", mode="before") - @classmethod - def _ensure_id(cls, v: Any) -> Any: - return v if v is not None else uuid4() - - external_id: str | None = None - title: str - content: str - source_path: str | None = None - mime_type: str = "text/markdown" - - # Metadata extracted from frontmatter or inferred - metadata: dict[str, Any] = Field(default_factory=dict) - tags: list[str] = Field(default_factory=list) - domain: str | None = None - - # KDD lifecycle fields (from frontmatter) - kdd_status: KDDStatus = KDDStatus.APPROVED - kdd_version: str | None = None - - # Git-aware fields - repo_name: str | None = None - relative_path: str | None = None - git_commit: str | None = None - git_remote_url: str | None = None - - # Processing state - status: DocumentStatus = DocumentStatus.PENDING - content_hash: str | None = None - - # Timestamps - created_at: datetime = Field(default_factory=datetime.utcnow) - updated_at: datetime = Field(default_factory=datetime.utcnow) - indexed_at: datetime | None = None - - class Config: - frozen = False - - -class Chunk(BaseModel): - """A semantic chunk extracted from a document. - - Chunks are the atomic units for embedding and retrieval. - Each chunk has a semantic type that determines how it was - extracted and how it should be processed. 
- """ - - id: UUID = Field(default_factory=uuid4) - document_id: UUID - sequence: int = 0 - - # Content - content: str - chunk_type: ChunkType = ChunkType.DEFAULT - - # Position in source document - start_offset: int | None = None - end_offset: int | None = None - heading_path: list[str] = Field(default_factory=list) - - # Section anchor (computed from heading_path) - section_anchor: str | None = None - - # Metadata - metadata: dict[str, Any] = Field(default_factory=dict) - token_count: int | None = None - - # Embedding reference - embedding_id: UUID | None = None - - # Timestamps - created_at: datetime = Field(default_factory=datetime.utcnow) - - class Config: - frozen = False diff --git a/src/kb_engine/core/models/embedding.py b/src/kb_engine/core/models/embedding.py deleted file mode 100644 index 59779d7..0000000 --- a/src/kb_engine/core/models/embedding.py +++ /dev/null @@ -1,42 +0,0 @@ -"""Embedding model.""" - -from datetime import datetime -from uuid import UUID, uuid4 - -from pydantic import BaseModel, Field - - -class Embedding(BaseModel): - """A vector embedding for a chunk. - - Embeddings are stored in the vector store (Qdrant) and used - for semantic similarity search. 
- """ - - id: UUID = Field(default_factory=uuid4) - chunk_id: UUID - document_id: UUID - - # Vector data - vector: list[float] - model: str - dimensions: int - - # Metadata for filtering - metadata: dict[str, str | int | float | bool] = Field(default_factory=dict) - - # Timestamps - created_at: datetime = Field(default_factory=datetime.utcnow) - - class Config: - frozen = False - - @property - def payload(self) -> dict[str, str | int | float | bool]: - """Get the payload for vector store.""" - return { - "chunk_id": str(self.chunk_id), - "document_id": str(self.document_id), - "model": self.model, - **self.metadata, - } diff --git a/src/kb_engine/core/models/graph.py b/src/kb_engine/core/models/graph.py deleted file mode 100644 index 6a2d8d8..0000000 --- a/src/kb_engine/core/models/graph.py +++ /dev/null @@ -1,111 +0,0 @@ -"""Graph models for knowledge graph representation.""" - -from datetime import datetime -from enum import Enum -from typing import Any -from uuid import UUID, uuid4 - -from pydantic import BaseModel, Field - - -class NodeType(str, Enum): - """Types of nodes in the knowledge graph (ADR-0003).""" - - ENTITY = "entity" - USE_CASE = "use_case" - RULE = "rule" - PROCESS = "process" - ACTOR = "actor" - SYSTEM = "system" - CONCEPT = "concept" - DOCUMENT = "document" - CHUNK = "chunk" - - -class EdgeType(str, Enum): - """Types of edges in the knowledge graph (ADR-0003).""" - - # Structural relationships - CONTAINS = "CONTAINS" - PART_OF = "PART_OF" - REFERENCES = "REFERENCES" - - # Domain relationships - IMPLEMENTS = "IMPLEMENTS" - DEPENDS_ON = "DEPENDS_ON" - RELATED_TO = "RELATED_TO" - TRIGGERS = "TRIGGERS" - USES = "USES" - PRODUCES = "PRODUCES" - - # Actor relationships - PERFORMS = "PERFORMS" - OWNS = "OWNS" - - # Semantic relationships - SIMILAR_TO = "SIMILAR_TO" - CONTRADICTS = "CONTRADICTS" - EXTENDS = "EXTENDS" - - -class Node(BaseModel): - """A node in the knowledge graph. 
- - Nodes represent entities, concepts, or structural elements - extracted from documents. - """ - - id: UUID = Field(default_factory=uuid4) - external_id: str | None = None - name: str - node_type: NodeType - description: str | None = None - - # Source traceability - source_document_id: UUID | None = None - source_chunk_id: UUID | None = None - - # Properties - properties: dict[str, Any] = Field(default_factory=dict) - - # Extraction metadata - confidence: float = 1.0 - extraction_method: str | None = None - - # Timestamps - created_at: datetime = Field(default_factory=datetime.utcnow) - updated_at: datetime = Field(default_factory=datetime.utcnow) - - class Config: - frozen = False - - -class Edge(BaseModel): - """An edge in the knowledge graph. - - Edges represent relationships between nodes. - """ - - id: UUID = Field(default_factory=uuid4) - source_id: UUID - target_id: UUID - edge_type: EdgeType - name: str | None = None - - # Properties - properties: dict[str, Any] = Field(default_factory=dict) - weight: float = 1.0 - - # Source traceability - source_document_id: UUID | None = None - source_chunk_id: UUID | None = None - - # Extraction metadata - confidence: float = 1.0 - extraction_method: str | None = None - - # Timestamps - created_at: datetime = Field(default_factory=datetime.utcnow) - - class Config: - frozen = False diff --git a/src/kb_engine/core/models/repository.py b/src/kb_engine/core/models/repository.py deleted file mode 100644 index d07535c..0000000 --- a/src/kb_engine/core/models/repository.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Repository configuration models.""" - -from pydantic import BaseModel, Field - - -class FileTypeConfig(BaseModel): - """Configuration for how a file type should be parsed and chunked.""" - - parser: str = "markdown" # "markdown" | "json" | "yaml" | "rst" | "plaintext" - mime_type: str = "text/markdown" - - -EXTENSION_DEFAULTS: dict[str, FileTypeConfig] = { - ".md": FileTypeConfig(parser="markdown", 
mime_type="text/markdown"), - ".json": FileTypeConfig(parser="json", mime_type="application/json"), - ".yaml": FileTypeConfig(parser="yaml", mime_type="text/yaml"), - ".yml": FileTypeConfig(parser="yaml", mime_type="text/yaml"), - ".rst": FileTypeConfig(parser="rst", mime_type="text/x-rst"), - ".txt": FileTypeConfig(parser="plaintext", mime_type="text/plain"), -} - - -class RepositoryConfig(BaseModel): - """Configuration for a Git repository to index.""" - - name: str - local_path: str - remote_url: str | None = None - branch: str = "main" - include_patterns: list[str] = Field(default_factory=lambda: ["**/*.md"]) - exclude_patterns: list[str] = Field(default_factory=list) - base_url_template: str | None = None # e.g. "{remote}/blob/{branch}/{path}" - file_type_config: dict[str, FileTypeConfig] = Field( - default_factory=lambda: {".md": FileTypeConfig()} - ) diff --git a/src/kb_engine/core/models/search.py b/src/kb_engine/core/models/search.py deleted file mode 100644 index c9cf65c..0000000 --- a/src/kb_engine/core/models/search.py +++ /dev/null @@ -1,103 +0,0 @@ -"""Search-related models.""" - -from __future__ import annotations - -from datetime import datetime -from enum import Enum -from typing import TYPE_CHECKING, Any -from uuid import UUID - -from pydantic import BaseModel, Field - -if TYPE_CHECKING: - from kb_engine.core.models.document import Chunk - - -class RetrievalMode(str, Enum): - """Retrieval strategy mode.""" - - VECTOR = "vector" - GRAPH = "graph" - HYBRID = "hybrid" - - -class SearchFilters(BaseModel): - """Filters for search queries.""" - - # Document filters - document_ids: list[UUID] | None = None - domains: list[str] | None = None - tags: list[str] | None = None - - # Chunk type filters - chunk_types: list[str] | None = None - - # KDD status filters - # By default, only "approved" documents are included - # Use include_statuses to expand (e.g., ["approved", "proposed"]) - # Use exclude_statuses to explicitly exclude - include_statuses: 
list[str] | None = None # None = ["approved"] by default - exclude_statuses: list[str] | None = None - include_all_statuses: bool = False # Override to include everything - - # Date filters - created_after: datetime | None = None - created_before: datetime | None = None - - # Metadata filters (key-value pairs) - metadata: dict[str, Any] | None = None - - class Config: - frozen = True - - def get_effective_statuses(self) -> list[str] | None: - """Get the list of statuses to include in search. - - Returns None if all statuses should be included. - """ - if self.include_all_statuses: - return None - - if self.include_statuses: - statuses = set(self.include_statuses) - else: - statuses = {"approved"} # Default - - if self.exclude_statuses: - statuses -= set(self.exclude_statuses) - - return list(statuses) if statuses else None - - -class DocumentReference(BaseModel): - """A reference to a document section returned by retrieval. - - Instead of returning raw content, we return URLs pointing to - the exact section so an external agent can read the source directly. 
- """ - - url: str - document_path: str - section_anchor: str | None = None - title: str - section_title: str | None = None - score: float = 0.0 - snippet: str = "" - domain: str | None = None - tags: list[str] = Field(default_factory=list) - chunk_type: str | None = None - metadata: dict[str, Any] = Field(default_factory=dict) - retrieval_mode: RetrievalMode = RetrievalMode.VECTOR - - # KDD lifecycle - kdd_status: str = "approved" - kdd_version: str | None = None - - -class RetrievalResponse(BaseModel): - """Response from a retrieval query.""" - - query: str - references: list[DocumentReference] - total_count: int - processing_time_ms: float | None = None diff --git a/src/kb_engine/embedding/__init__.py b/src/kb_engine/embedding/__init__.py deleted file mode 100644 index 13e33e3..0000000 --- a/src/kb_engine/embedding/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -"""Embedding generation module for KB-Engine.""" - -from kb_engine.embedding.base import EmbeddingProvider -from kb_engine.embedding.config import EmbeddingConfig -from kb_engine.embedding.factory import EmbeddingProviderFactory - -__all__ = [ - "EmbeddingConfig", - "EmbeddingProvider", - "EmbeddingProviderFactory", -] diff --git a/src/kb_engine/embedding/base.py b/src/kb_engine/embedding/base.py deleted file mode 100644 index f5cfd97..0000000 --- a/src/kb_engine/embedding/base.py +++ /dev/null @@ -1,76 +0,0 @@ -"""Base embedding provider.""" - -from abc import ABC, abstractmethod - -from kb_engine.core.models.document import Chunk -from kb_engine.core.models.embedding import Embedding - - -class EmbeddingProvider(ABC): - """Abstract base class for embedding providers.""" - - @property - @abstractmethod - def model_name(self) -> str: - """The name of the embedding model.""" - ... - - @property - @abstractmethod - def dimensions(self) -> int: - """The dimensionality of the embeddings.""" - ... 
- - @abstractmethod - async def embed_text(self, text: str) -> list[float]: - """Generate embedding for a single text.""" - ... - - @abstractmethod - async def embed_texts(self, texts: list[str]) -> list[list[float]]: - """Generate embeddings for multiple texts.""" - ... - - async def embed_chunk(self, chunk: Chunk) -> Embedding: - """Generate embedding for a chunk.""" - vector = await self.embed_text(chunk.content) - metadata = { - "chunk_type": chunk.chunk_type.value, - "kdd_status": chunk.metadata.get("kdd_status", "approved"), - } - if chunk.metadata.get("kdd_version"): - metadata["kdd_version"] = chunk.metadata["kdd_version"] - return Embedding( - chunk_id=chunk.id, - document_id=chunk.document_id, - vector=vector, - model=self.model_name, - dimensions=self.dimensions, - metadata=metadata, - ) - - async def embed_chunks(self, chunks: list[Chunk]) -> list[Embedding]: - """Generate embeddings for multiple chunks.""" - texts = [c.content for c in chunks] - vectors = await self.embed_texts(texts) - - embeddings = [] - for chunk, vector in zip(chunks, vectors, strict=True): - metadata = { - "chunk_type": chunk.chunk_type.value, - "kdd_status": chunk.metadata.get("kdd_status", "approved"), - } - if chunk.metadata.get("kdd_version"): - metadata["kdd_version"] = chunk.metadata["kdd_version"] - embeddings.append( - Embedding( - chunk_id=chunk.id, - document_id=chunk.document_id, - vector=vector, - model=self.model_name, - dimensions=self.dimensions, - metadata=metadata, - ) - ) - - return embeddings diff --git a/src/kb_engine/embedding/config.py b/src/kb_engine/embedding/config.py deleted file mode 100644 index 1576696..0000000 --- a/src/kb_engine/embedding/config.py +++ /dev/null @@ -1,32 +0,0 @@ -"""Embedding configuration.""" - -from pydantic import BaseModel, Field - - -class EmbeddingConfig(BaseModel): - """Configuration for embedding generation.""" - - # Provider selection - provider: str = Field(default="local", description="Embedding provider (openai, local)") 
- - # OpenAI settings - openai_model: str = Field( - default="text-embedding-3-small", description="OpenAI embedding model" - ) - openai_dimensions: int = Field( - default=1536, description="Embedding dimensions for OpenAI" - ) - - # Local model settings - local_model_name: str = Field( - default="all-mpnet-base-v2", description="Sentence-transformers model name" - ) - local_model_path: str | None = Field( - default=None, description="Path to local embedding model (overrides name)" - ) - - # Batch settings - batch_size: int = Field(default=100, ge=1, le=1000, description="Batch size for embedding") - - class Config: - frozen = True diff --git a/src/kb_engine/embedding/factory.py b/src/kb_engine/embedding/factory.py deleted file mode 100644 index a2a747b..0000000 --- a/src/kb_engine/embedding/factory.py +++ /dev/null @@ -1,32 +0,0 @@ -"""Factory for creating embedding providers.""" - -from kb_engine.embedding.base import EmbeddingProvider -from kb_engine.embedding.config import EmbeddingConfig - - -class EmbeddingProviderFactory: - """Factory for creating embedding providers.""" - - def __init__(self, config: EmbeddingConfig | None = None) -> None: - self._config = config or EmbeddingConfig() - - def create_provider(self) -> EmbeddingProvider: - """Create an embedding provider based on configuration.""" - provider = self._config.provider.lower() - - if provider == "openai": - from kb_engine.embedding.providers.openai import OpenAIEmbeddingProvider - - return OpenAIEmbeddingProvider( - model=self._config.openai_model, - dimensions=self._config.openai_dimensions, - ) - elif provider == "local": - from kb_engine.embedding.providers.local import LocalEmbeddingProvider - - return LocalEmbeddingProvider( - model_path=self._config.local_model_path, - model_name=self._config.local_model_name, - ) - else: - raise ValueError(f"Unknown embedding provider: {provider}") diff --git a/src/kb_engine/embedding/providers/__init__.py b/src/kb_engine/embedding/providers/__init__.py 
deleted file mode 100644 index 74d87c8..0000000 --- a/src/kb_engine/embedding/providers/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Embedding provider implementations.""" diff --git a/src/kb_engine/embedding/providers/local.py b/src/kb_engine/embedding/providers/local.py deleted file mode 100644 index 44a550d..0000000 --- a/src/kb_engine/embedding/providers/local.py +++ /dev/null @@ -1,81 +0,0 @@ -"""Local embedding provider using sentence-transformers.""" - -import asyncio - -import structlog - -from kb_engine.embedding.base import EmbeddingProvider - -logger = structlog.get_logger(__name__) - - -class LocalEmbeddingProvider(EmbeddingProvider): - """Local model-based embedding provider. - - Uses sentence-transformers for local embedding generation. - Runs the model in a thread pool to avoid blocking the event loop. - """ - - def __init__( - self, - model_path: str | None = None, - model_name: str = "all-mpnet-base-v2", - ) -> None: - self._model_path = model_path - self._model_name_str = model_name - self._model = None - self._dimensions_cache: int | None = None - - @property - def model_name(self) -> str: - return self._model_name_str - - @property - def dimensions(self) -> int: - if self._dimensions_cache is not None: - return self._dimensions_cache - # Default dimensions for common models - defaults = { - "all-MiniLM-L6-v2": 384, - "all-MiniLM-L12-v2": 384, - "all-mpnet-base-v2": 768, - "paraphrase-multilingual-MiniLM-L12-v2": 384, - } - return defaults.get(self._model_name_str, 384) - - def _ensure_model(self) -> None: - """Load the sentence-transformers model (synchronous).""" - if self._model is None: - from sentence_transformers import SentenceTransformer - - model_id = self._model_path or self._model_name_str - logger.info("Loading embedding model", model=model_id) - self._model = SentenceTransformer(model_id) - self._dimensions_cache = self._model.get_sentence_embedding_dimension() - logger.info( - "Embedding model loaded", - model=model_id, - 
dimensions=self._dimensions_cache, - ) - - async def embed_text(self, text: str) -> list[float]: - """Generate embedding for a single text using local model.""" - loop = asyncio.get_event_loop() - return await loop.run_in_executor(None, self._embed_text_sync, text) - - def _embed_text_sync(self, text: str) -> list[float]: - self._ensure_model() - assert self._model is not None - embedding = self._model.encode(text, normalize_embeddings=True) - return embedding.tolist() - - async def embed_texts(self, texts: list[str]) -> list[list[float]]: - """Generate embeddings for multiple texts using local model.""" - loop = asyncio.get_event_loop() - return await loop.run_in_executor(None, self._embed_texts_sync, texts) - - def _embed_texts_sync(self, texts: list[str]) -> list[list[float]]: - self._ensure_model() - assert self._model is not None - embeddings = self._model.encode(texts, normalize_embeddings=True, show_progress_bar=False) - return [e.tolist() for e in embeddings] diff --git a/src/kb_engine/embedding/providers/openai.py b/src/kb_engine/embedding/providers/openai.py deleted file mode 100644 index 58fb71b..0000000 --- a/src/kb_engine/embedding/providers/openai.py +++ /dev/null @@ -1,44 +0,0 @@ -"""OpenAI embedding provider.""" - -from kb_engine.embedding.base import EmbeddingProvider - - -class OpenAIEmbeddingProvider(EmbeddingProvider): - """OpenAI-based embedding provider.""" - - def __init__( - self, - model: str = "text-embedding-3-small", - dimensions: int = 1536, - api_key: str | None = None, - ) -> None: - self._model = model - self._dimensions = dimensions - self._api_key = api_key - self._client = None - - @property - def model_name(self) -> str: - return self._model - - @property - def dimensions(self) -> int: - return self._dimensions - - async def _ensure_client(self) -> None: - """Ensure OpenAI client is initialized.""" - if self._client is None: - # TODO: Initialize OpenAI async client - pass - - async def embed_text(self, text: str) -> list[float]: 
- """Generate embedding for a single text using OpenAI.""" - await self._ensure_client() - # TODO: Implement OpenAI embedding - raise NotImplementedError("OpenAIEmbeddingProvider.embed_text not implemented") - - async def embed_texts(self, texts: list[str]) -> list[list[float]]: - """Generate embeddings for multiple texts using OpenAI.""" - await self._ensure_client() - # TODO: Implement batch OpenAI embedding - raise NotImplementedError("OpenAIEmbeddingProvider.embed_texts not implemented") diff --git a/src/kb_engine/extraction/__init__.py b/src/kb_engine/extraction/__init__.py deleted file mode 100644 index 3a32e8a..0000000 --- a/src/kb_engine/extraction/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -"""Entity extraction module for KB-Engine (ADR-0003).""" - -from kb_engine.extraction.config import ExtractionConfig -from kb_engine.extraction.factory import ExtractionPipelineFactory -from kb_engine.extraction.models import ExtractedEdge, ExtractedNode -from kb_engine.extraction.pipeline import ExtractionPipeline - -__all__ = [ - "ExtractionConfig", - "ExtractedNode", - "ExtractedEdge", - "ExtractionPipeline", - "ExtractionPipelineFactory", -] diff --git a/src/kb_engine/extraction/config.py b/src/kb_engine/extraction/config.py deleted file mode 100644 index 5f551d3..0000000 --- a/src/kb_engine/extraction/config.py +++ /dev/null @@ -1,49 +0,0 @@ -"""Extraction configuration.""" - -from pydantic import BaseModel, Field - - -class ExtractionConfig(BaseModel): - """Configuration for the entity extraction process. - - These values follow the recommendations in ADR-0003 for - entity and relationship extraction. - """ - - # Extraction behavior - use_llm: bool = Field( - default=False, description="Use LLM for extraction (vs. 
pattern-only)" - ) - confidence_threshold: float = Field( - default=0.7, ge=0.0, le=1.0, description="Minimum confidence for extracted entities" - ) - - # Extractor selection - enable_frontmatter_extraction: bool = Field( - default=True, description="Extract entities from frontmatter" - ) - enable_pattern_extraction: bool = Field( - default=True, description="Extract entities using patterns" - ) - enable_llm_extraction: bool = Field( - default=False, description="Extract entities using LLM" - ) - - # LLM settings - llm_model: str = Field( - default="gpt-4-turbo-preview", description="LLM model to use for extraction" - ) - llm_temperature: float = Field( - default=0.0, ge=0.0, le=1.0, description="LLM temperature for extraction" - ) - - # Deduplication - deduplicate_entities: bool = Field( - default=True, description="Deduplicate extracted entities" - ) - similarity_threshold: float = Field( - default=0.85, ge=0.0, le=1.0, description="Threshold for entity deduplication" - ) - - class Config: - frozen = True diff --git a/src/kb_engine/extraction/extractors/__init__.py b/src/kb_engine/extraction/extractors/__init__.py deleted file mode 100644 index d413932..0000000 --- a/src/kb_engine/extraction/extractors/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -"""Entity extractor implementations.""" - -from kb_engine.extraction.extractors.base import BaseExtractor -from kb_engine.extraction.extractors.frontmatter import FrontmatterExtractor -from kb_engine.extraction.extractors.llm import LLMExtractor -from kb_engine.extraction.extractors.pattern import PatternExtractor - -__all__ = [ - "BaseExtractor", - "FrontmatterExtractor", - "PatternExtractor", - "LLMExtractor", -] diff --git a/src/kb_engine/extraction/extractors/base.py b/src/kb_engine/extraction/extractors/base.py deleted file mode 100644 index 6f7cc7a..0000000 --- a/src/kb_engine/extraction/extractors/base.py +++ /dev/null @@ -1,36 +0,0 @@ -"""Base extractor implementation.""" - -from abc import ABC, abstractmethod - 
-from kb_engine.core.interfaces.extractors import EntityExtractor, ExtractionResult -from kb_engine.core.models.document import Chunk, Document - - -class BaseExtractor(EntityExtractor, ABC): - """Base class for entity extractors.""" - - @property - @abstractmethod - def name(self) -> str: - """The name of this extractor.""" - ... - - @property - @abstractmethod - def priority(self) -> int: - """Extraction priority (lower = higher priority).""" - ... - - @abstractmethod - def can_extract(self, chunk: Chunk, document: Document) -> bool: - """Check if this extractor can process the given chunk.""" - ... - - @abstractmethod - async def extract( - self, - chunk: Chunk, - document: Document, - ) -> ExtractionResult: - """Extract entities and relationships from a chunk.""" - ... diff --git a/src/kb_engine/extraction/extractors/frontmatter.py b/src/kb_engine/extraction/extractors/frontmatter.py deleted file mode 100644 index 81b8f1d..0000000 --- a/src/kb_engine/extraction/extractors/frontmatter.py +++ /dev/null @@ -1,102 +0,0 @@ -"""Frontmatter-based entity extractor.""" - -from kb_engine.core.interfaces.extractors import ExtractionResult -from kb_engine.core.models.document import Chunk, Document -from kb_engine.core.models.graph import EdgeType, NodeType -from kb_engine.extraction.extractors.base import BaseExtractor -from kb_engine.extraction.models import ExtractedEdge, ExtractedNode - - -class FrontmatterExtractor(BaseExtractor): - """Extracts entities from document frontmatter. - - KDD documents often have YAML frontmatter with structured - metadata including tags, categories, and related entities. 
- """ - - @property - def name(self) -> str: - return "frontmatter" - - @property - def priority(self) -> int: - return 10 # High priority (runs first) - - def can_extract(self, chunk: Chunk, document: Document) -> bool: - """Can extract if document has metadata.""" - return bool(document.metadata) - - async def extract( - self, - chunk: Chunk, - document: Document, - ) -> ExtractionResult: - """Extract entities from document frontmatter.""" - nodes: list[ExtractedNode] = [] - edges: list[ExtractedEdge] = [] - - metadata = document.metadata - - # Extract from tags - if "tags" in metadata and isinstance(metadata["tags"], list): - for tag in metadata["tags"]: - nodes.append( - ExtractedNode( - name=str(tag), - node_type=NodeType.CONCEPT, - description=f"Tag: {tag}", - confidence=1.0, - extraction_method=self.name, - ) - ) - - # Extract from domain - if document.domain: - nodes.append( - ExtractedNode( - name=document.domain, - node_type=NodeType.CONCEPT, - description=f"Domain: {document.domain}", - confidence=1.0, - extraction_method=self.name, - ) - ) - - # Create document node - doc_node = ExtractedNode( - name=document.title, - node_type=NodeType.DOCUMENT, - description=f"Document: {document.title}", - properties={"source_path": document.source_path}, - confidence=1.0, - extraction_method=self.name, - ) - nodes.append(doc_node) - - # Create edges from document to tags/domain - for node in nodes: - if node.node_type == NodeType.CONCEPT: - edges.append( - ExtractedEdge( - source_name=document.title, - target_name=node.name, - edge_type=EdgeType.RELATED_TO, - confidence=1.0, - extraction_method=self.name, - ) - ) - - # Extract related documents - if "related" in metadata and isinstance(metadata["related"], list): - for related in metadata["related"]: - edges.append( - ExtractedEdge( - source_name=document.title, - target_name=str(related), - edge_type=EdgeType.REFERENCES, - confidence=0.9, - extraction_method=self.name, - ) - ) - - return 
ExtractionResult(nodes=nodes, edges=edges) # type: ignore diff --git a/src/kb_engine/extraction/extractors/llm.py b/src/kb_engine/extraction/extractors/llm.py deleted file mode 100644 index 9c2916a..0000000 --- a/src/kb_engine/extraction/extractors/llm.py +++ /dev/null @@ -1,89 +0,0 @@ -"""LLM-based entity extractor.""" - -from kb_engine.core.interfaces.extractors import ExtractionResult -from kb_engine.core.models.document import Chunk, Document -from kb_engine.extraction.extractors.base import BaseExtractor -from kb_engine.extraction.models import ExtractedEdge, ExtractedNode - - -class LLMExtractor(BaseExtractor): - """Extracts entities using a Large Language Model. - - Uses structured prompts to extract entities and relationships - that may not be captured by pattern-based extraction. - """ - - def __init__( - self, - model: str = "gpt-4-turbo-preview", - temperature: float = 0.0, - ) -> None: - self._model = model - self._temperature = temperature - self._client = None - - @property - def name(self) -> str: - return "llm" - - @property - def priority(self) -> int: - return 30 # Lower priority (runs last) - - def can_extract(self, chunk: Chunk, document: Document) -> bool: - """Can extract from any chunk with sufficient content.""" - return bool(chunk.content and len(chunk.content) > 50) - - async def _ensure_client(self) -> None: - """Ensure OpenAI client is initialized.""" - if self._client is None: - # TODO: Initialize OpenAI client - pass - - async def extract( - self, - chunk: Chunk, - document: Document, - ) -> ExtractionResult: - """Extract entities using LLM. - - TODO: Implement LLM-based extraction using structured output. 
- """ - await self._ensure_client() - - # Placeholder - actual implementation would call OpenAI API - nodes: list[ExtractedNode] = [] - edges: list[ExtractedEdge] = [] - - # TODO: Implement LLM extraction with prompts like: - # - "Extract all entities (people, systems, concepts) from this text" - # - "Extract all relationships between entities" - # - Use structured output / function calling for reliable parsing - - return ExtractionResult(nodes=nodes, edges=edges) # type: ignore - - def _build_extraction_prompt(self, chunk: Chunk, document: Document) -> str: - """Build the prompt for entity extraction.""" - return f"""Extract entities and relationships from the following text. - -Document: {document.title} -Domain: {document.domain or 'Unknown'} -Chunk Type: {chunk.chunk_type.value} - -Text: -{chunk.content} - -Extract: -1. Entities: actors, systems, concepts, processes, rules -2. Relationships: dependencies, references, implementations - -Return as JSON with format: -{{ - "entities": [ - {{"name": "...", "type": "...", "description": "..."}} - ], - "relationships": [ - {{"source": "...", "target": "...", "type": "...", "description": "..."}} - ] -}} -""" diff --git a/src/kb_engine/extraction/extractors/pattern.py b/src/kb_engine/extraction/extractors/pattern.py deleted file mode 100644 index fbc8187..0000000 --- a/src/kb_engine/extraction/extractors/pattern.py +++ /dev/null @@ -1,130 +0,0 @@ -"""Pattern-based entity extractor.""" - -import re -from typing import Any - -from kb_engine.core.interfaces.extractors import ExtractionResult -from kb_engine.core.models.document import Chunk, ChunkType, Document -from kb_engine.core.models.graph import EdgeType, NodeType -from kb_engine.extraction.extractors.base import BaseExtractor -from kb_engine.extraction.models import ExtractedEdge, ExtractedNode - - -class PatternExtractor(BaseExtractor): - """Extracts entities using regex patterns. 
- - Uses predefined patterns to identify entities, actors, - and relationships from text. - """ - - # Patterns for different entity types - ENTITY_PATTERNS: list[tuple[str, NodeType, float]] = [ - # Actor patterns - (r"(?:actor|usuario|user|cliente|customer|administrador|admin)[\s:]+(\w+(?:\s+\w+)?)", NodeType.ACTOR, 0.8), - # System patterns - (r"(?:sistema|system|servicio|service|módulo|module)[\s:]+(\w+(?:\s+\w+)?)", NodeType.SYSTEM, 0.8), - # Entity patterns - (r"(?:entidad|entity|objeto|object)[\s:]+(\w+(?:\s+\w+)?)", NodeType.ENTITY, 0.8), - # Use case patterns - (r"(?:caso de uso|use case|CU[-_]?\d+)[\s:]+(\w+(?:\s+\w+)*)", NodeType.USE_CASE, 0.85), - # Rule patterns - (r"(?:regla|rule|RN[-_]?\d+|BR[-_]?\d+)[\s:]+(\w+(?:\s+\w+)*)", NodeType.RULE, 0.85), - ] - - # Patterns for relationships - RELATIONSHIP_PATTERNS: list[tuple[str, EdgeType, float]] = [ - # Dependency patterns - (r"(\w+(?:\s+\w+)?)\s+(?:depende de|depends on)\s+(\w+(?:\s+\w+)?)", EdgeType.DEPENDS_ON, 0.75), - # Usage patterns - (r"(\w+(?:\s+\w+)?)\s+(?:usa|uses|utiliza|utilizes)\s+(\w+(?:\s+\w+)?)", EdgeType.USES, 0.75), - # Production patterns - (r"(\w+(?:\s+\w+)?)\s+(?:produce|produces|genera|generates)\s+(\w+(?:\s+\w+)?)", EdgeType.PRODUCES, 0.75), - # Reference patterns - (r"(\w+(?:\s+\w+)?)\s+(?:referencia|references|ver|see)\s+(\w+(?:\s+\w+)?)", EdgeType.REFERENCES, 0.7), - # Implementation patterns - (r"(\w+(?:\s+\w+)?)\s+(?:implementa|implements)\s+(\w+(?:\s+\w+)?)", EdgeType.IMPLEMENTS, 0.8), - ] - - @property - def name(self) -> str: - return "pattern" - - @property - def priority(self) -> int: - return 20 # Medium priority - - def can_extract(self, chunk: Chunk, document: Document) -> bool: - """Can extract from any chunk with content.""" - return bool(chunk.content and len(chunk.content) > 10) - - async def extract( - self, - chunk: Chunk, - document: Document, - ) -> ExtractionResult: - """Extract entities using pattern matching.""" - nodes: list[ExtractedNode] = [] - edges: 
list[ExtractedEdge] = [] - - content = chunk.content - - # Extract entities - for pattern, node_type, base_confidence in self.ENTITY_PATTERNS: - matches = re.finditer(pattern, content, re.IGNORECASE) - for match in matches: - name = match.group(1).strip() - if name and len(name) > 2: - # Adjust confidence based on chunk type alignment - confidence = self._adjust_confidence( - base_confidence, node_type, chunk.chunk_type - ) - nodes.append( - ExtractedNode( - name=name, - node_type=node_type, - confidence=confidence, - extraction_method=self.name, - source_text=match.group(0), - ) - ) - - # Extract relationships - for pattern, edge_type, confidence in self.RELATIONSHIP_PATTERNS: - matches = re.finditer(pattern, content, re.IGNORECASE) - for match in matches: - source = match.group(1).strip() - target = match.group(2).strip() - if source and target and len(source) > 2 and len(target) > 2: - edges.append( - ExtractedEdge( - source_name=source, - target_name=target, - edge_type=edge_type, - confidence=confidence, - extraction_method=self.name, - source_text=match.group(0), - ) - ) - - return ExtractionResult(nodes=nodes, edges=edges) # type: ignore - - def _adjust_confidence( - self, - base_confidence: float, - node_type: NodeType, - chunk_type: ChunkType, - ) -> float: - """Adjust confidence based on alignment between node and chunk types.""" - # Mapping of chunk types to expected node types - type_alignment: dict[ChunkType, set[NodeType]] = { - ChunkType.ENTITY: {NodeType.ENTITY, NodeType.CONCEPT}, - ChunkType.USE_CASE: {NodeType.USE_CASE, NodeType.ACTOR}, - ChunkType.RULE: {NodeType.RULE}, - ChunkType.PROCESS: {NodeType.PROCESS, NodeType.ACTOR, NodeType.SYSTEM}, - } - - expected_types = type_alignment.get(chunk_type, set()) - if node_type in expected_types: - return min(1.0, base_confidence + 0.1) # Boost confidence - - return base_confidence diff --git a/src/kb_engine/extraction/factory.py b/src/kb_engine/extraction/factory.py deleted file mode 100644 index 
ecea6f7..0000000 --- a/src/kb_engine/extraction/factory.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Factory for creating extraction pipelines.""" - -from kb_engine.extraction.config import ExtractionConfig -from kb_engine.extraction.extractors.frontmatter import FrontmatterExtractor -from kb_engine.extraction.extractors.llm import LLMExtractor -from kb_engine.extraction.extractors.pattern import PatternExtractor -from kb_engine.extraction.pipeline import ExtractionPipeline - - -class ExtractionPipelineFactory: - """Factory for creating configured extraction pipelines.""" - - def __init__(self, config: ExtractionConfig | None = None) -> None: - self._config = config or ExtractionConfig() - - def create_pipeline(self) -> ExtractionPipeline: - """Create an extraction pipeline with configured extractors.""" - pipeline = ExtractionPipeline(config=self._config) - - # Register extractors based on configuration - if self._config.enable_frontmatter_extraction: - pipeline.register_extractor(FrontmatterExtractor()) - - if self._config.enable_pattern_extraction: - pipeline.register_extractor(PatternExtractor()) - - if self._config.enable_llm_extraction and self._config.use_llm: - pipeline.register_extractor( - LLMExtractor( - model=self._config.llm_model, - temperature=self._config.llm_temperature, - ) - ) - - return pipeline diff --git a/src/kb_engine/extraction/models.py b/src/kb_engine/extraction/models.py deleted file mode 100644 index 0b8913a..0000000 --- a/src/kb_engine/extraction/models.py +++ /dev/null @@ -1,42 +0,0 @@ -"""Models for extraction results.""" - -from typing import Any - -from pydantic import BaseModel, Field - -from kb_engine.core.models.graph import EdgeType, NodeType - - -class ExtractedNode(BaseModel): - """An entity extracted from content before graph persistence.""" - - name: str - node_type: NodeType - description: str | None = None - properties: dict[str, Any] = Field(default_factory=dict) - - # Extraction metadata - confidence: float = 1.0 - 
extraction_method: str = "unknown" - source_text: str | None = None - - class Config: - frozen = False - - -class ExtractedEdge(BaseModel): - """A relationship extracted from content before graph persistence.""" - - source_name: str - target_name: str - edge_type: EdgeType - name: str | None = None - properties: dict[str, Any] = Field(default_factory=dict) - - # Extraction metadata - confidence: float = 1.0 - extraction_method: str = "unknown" - source_text: str | None = None - - class Config: - frozen = False diff --git a/src/kb_engine/extraction/pipeline.py b/src/kb_engine/extraction/pipeline.py deleted file mode 100644 index 73f1e48..0000000 --- a/src/kb_engine/extraction/pipeline.py +++ /dev/null @@ -1,122 +0,0 @@ -"""Extraction pipeline for orchestrating multiple extractors.""" - -from kb_engine.core.interfaces.extractors import EntityExtractor, ExtractionResult -from kb_engine.core.models.document import Chunk, Document -from kb_engine.extraction.config import ExtractionConfig -from kb_engine.extraction.models import ExtractedEdge, ExtractedNode - - -class ExtractionPipeline: - """Pipeline for running multiple extractors on chunks. - - Orchestrates extraction by running all registered extractors - in priority order, then deduplicating and merging results. - """ - - def __init__( - self, - config: ExtractionConfig | None = None, - extractors: list[EntityExtractor] | None = None, - ) -> None: - self._config = config or ExtractionConfig() - self._extractors: list[EntityExtractor] = extractors or [] - - def register_extractor(self, extractor: EntityExtractor) -> None: - """Register an extractor to the pipeline.""" - self._extractors.append(extractor) - # Sort by priority (lower = higher priority) - self._extractors.sort(key=lambda e: e.priority) - - async def extract( - self, - chunk: Chunk, - document: Document, - ) -> ExtractionResult: - """Run all applicable extractors on a chunk. - - Returns combined and deduplicated extraction results. 
- """ - all_nodes: list[ExtractedNode] = [] - all_edges: list[ExtractedEdge] = [] - - for extractor in self._extractors: - if extractor.can_extract(chunk, document): - result = await extractor.extract(chunk, document) - all_nodes.extend(result.nodes) - all_edges.extend(result.edges) - - # Filter by confidence threshold - filtered_nodes = [ - n for n in all_nodes if n.confidence >= self._config.confidence_threshold - ] - filtered_edges = [ - e for e in all_edges if e.confidence >= self._config.confidence_threshold - ] - - # Deduplicate if enabled - if self._config.deduplicate_entities: - filtered_nodes = self._deduplicate_nodes(filtered_nodes) - filtered_edges = self._deduplicate_edges(filtered_edges) - - return ExtractionResult( - nodes=filtered_nodes, # type: ignore - edges=filtered_edges, # type: ignore - ) - - async def extract_document( - self, - document: Document, - chunks: list[Chunk], - ) -> ExtractionResult: - """Extract entities from all chunks of a document.""" - all_nodes: list[ExtractedNode] = [] - all_edges: list[ExtractedEdge] = [] - - for chunk in chunks: - result = await self.extract(chunk, document) - all_nodes.extend(result.nodes) # type: ignore - all_edges.extend(result.edges) # type: ignore - - # Final deduplication across all chunks - if self._config.deduplicate_entities: - all_nodes = self._deduplicate_nodes(all_nodes) - all_edges = self._deduplicate_edges(all_edges) - - return ExtractionResult( - nodes=all_nodes, # type: ignore - edges=all_edges, # type: ignore - ) - - def _deduplicate_nodes( - self, - nodes: list[ExtractedNode], - ) -> list[ExtractedNode]: - """Deduplicate nodes by name and type. - - When duplicates are found, keeps the one with highest confidence. 
- """ - seen: dict[tuple[str, str], ExtractedNode] = {} - - for node in nodes: - key = (node.name.lower(), node.node_type.value) - if key not in seen or node.confidence > seen[key].confidence: - seen[key] = node - - return list(seen.values()) - - def _deduplicate_edges( - self, - edges: list[ExtractedEdge], - ) -> list[ExtractedEdge]: - """Deduplicate edges by source, target, and type. - - When duplicates are found, keeps the one with highest confidence. - """ - seen: dict[tuple[str, str, str], ExtractedEdge] = {} - - for edge in edges: - key = (edge.source_name.lower(), edge.target_name.lower(), edge.edge_type.value) - if key not in seen or edge.confidence > seen[key].confidence: - seen[key] = edge - - return list(seen.values()) diff --git a/src/kb_engine/extraction/strategies.py b/src/kb_engine/extraction/strategies.py deleted file mode 100644 index 625cf59..0000000 --- a/src/kb_engine/extraction/strategies.py +++ /dev/null @@ -1,151 +0,0 @@ -"""Graph extraction strategies for the indexation pipeline.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, Protocol - -import structlog - -from kb_engine.core.interfaces.extractors import GraphExtractionResult - -if TYPE_CHECKING: - from kb_engine.core.models.document import Chunk, Document - from kb_engine.extraction.pipeline import ExtractionPipeline - from kb_engine.smart.stores.falkordb_graph import FalkorDBGraphStore - -logger = structlog.get_logger(__name__) - - -class GraphExtractionStrategy(Protocol): - """Protocol for graph extraction strategies.""" - - async def extract_and_store( - self, document: Document, chunks: list[Chunk] - ) -> GraphExtractionResult: ... - - async def delete_by_document(self, document_id: str) -> None: ... - - -class SmartGraphExtractionStrategy: - """Graph extraction using FalkorDB with KDD-aware entity parsing. - - Uses DocumentKindDetector + EntityParser + EntityGraphExtractor - to produce a rich knowledge graph from KDD documents. 
- For non-entity documents, creates a basic document node. - """ - - def __init__(self, graph_store: FalkorDBGraphStore) -> None: - self._graph_store = graph_store - - async def extract_and_store( - self, document: Document, chunks: list[Chunk] - ) -> GraphExtractionResult: - from kb_engine.smart.parsers.detector import DocumentKindDetector - from kb_engine.smart.parsers.entity import EntityParser - from kb_engine.smart.extraction.entity import EntityGraphExtractor - from kb_engine.smart.types import KDDDocumentKind - - log = logger.bind(document_id=str(document.id), title=document.title) - - detector = DocumentKindDetector() - detection = detector.detect( - document.content, - filename=document.relative_path or document.source_path, - ) - log.debug( - "smart_strategy.detected", - kind=detection.kind.value, - confidence=detection.confidence, - ) - - if detection.kind == KDDDocumentKind.ENTITY and detection.confidence >= 0.5: - parser = EntityParser() - parsed = parser.parse( - document.content, - filename=document.relative_path, - ) - entity_info = parser.extract_entity_info(parsed) - - # Override source doc id and propagate path for provenance - doc_id = document.relative_path or str(document.id) - parsed.frontmatter["id"] = doc_id - parsed.frontmatter["path"] = document.relative_path or document.source_path or "" - - extractor = EntityGraphExtractor(self._graph_store) - nodes, edges = extractor.extract_and_store(parsed, entity_info) - - log.info( - "smart_strategy.entity_extracted", - entity=entity_info.name, - nodes=nodes, - edges=edges, - ) - return GraphExtractionResult(nodes_created=nodes, edges_created=edges) - - # Non-entity document: create Document node + basic Entity node + EXTRACTED_FROM - doc_id = document.relative_path or str(document.id) - doc_path = document.relative_path or document.source_path or "" - doc_kind = detection.kind.value - - self._graph_store.upsert_document( - doc_id=doc_id, - title=document.title, - path=doc_path, - kind=doc_kind, - 
) - - entity_id = f"doc:{doc_id}" - self._graph_store.upsert_entity( - entity_id=entity_id, - name=document.title, - description=f"Document: {document.title}", - confidence=0.5, - ) - self._graph_store.add_extracted_from(entity_id, "Entity", doc_id, "primary", 0.5) - - log.debug("smart_strategy.basic_node", kind=detection.kind.value) - return GraphExtractionResult(nodes_created=2, edges_created=1) - - async def delete_by_document(self, document_id: str) -> None: - self._graph_store.delete_by_source_doc(document_id) - - -class LegacyGraphExtractionStrategy: - """Graph extraction wrapping the original ExtractionPipeline + GraphRepository. - - Preserves existing SQLite/Neo4j behavior without changes. - """ - - def __init__(self, graph_repo, extraction_pipeline: ExtractionPipeline) -> None: - self._graph = graph_repo - self._extraction_pipeline = extraction_pipeline - - async def extract_and_store( - self, document: Document, chunks: list[Chunk] - ) -> GraphExtractionResult: - from kb_engine.core.models.graph import Node - - extraction_result = await self._extraction_pipeline.extract_document( - document, chunks - ) - nodes_created = 0 - for node_data in extraction_result.nodes: - node = Node( - name=node_data.name, - node_type=node_data.node_type, - description=node_data.description, - source_document_id=document.id, - properties=node_data.properties, - confidence=node_data.confidence, - extraction_method=node_data.extraction_method, - ) - await self._graph.create_node(node) - nodes_created += 1 - - return GraphExtractionResult( - nodes_created=nodes_created, - edges_created=0, - ) - - async def delete_by_document(self, document_id: str) -> None: - await self._graph.delete_by_document(document_id) diff --git a/src/kb_engine/git/__init__.py b/src/kb_engine/git/__init__.py deleted file mode 100644 index de9a105..0000000 --- a/src/kb_engine/git/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -"""Git integration module for KB-Engine.""" - -from kb_engine.git.scanner import 
GitRepoScanner -from kb_engine.git.url_resolver import URLResolver - -__all__ = ["GitRepoScanner", "URLResolver"] diff --git a/src/kb_engine/git/scanner.py b/src/kb_engine/git/scanner.py deleted file mode 100644 index 4a49bf1..0000000 --- a/src/kb_engine/git/scanner.py +++ /dev/null @@ -1,161 +0,0 @@ -"""Git repository scanner using subprocess.""" - -import subprocess -from pathlib import Path - -import structlog - -from kb_engine.core.models.repository import RepositoryConfig - -logger = structlog.get_logger(__name__) - - -class GitRepoScanner: - """Scans a Git repository for indexable files. - - Uses subprocess + git CLI directly (no gitpython dependency). - """ - - def __init__(self, config: RepositoryConfig) -> None: - self._config = config - self._repo_path = Path(config.local_path).resolve() - - @property - def repo_path(self) -> Path: - return self._repo_path - - def _run_git(self, *args: str) -> str: - """Run a git command and return stdout.""" - result = subprocess.run( - ["git", *args], - cwd=self._repo_path, - capture_output=True, - text=True, - check=True, - ) - return result.stdout.strip() - - def is_git_repo(self) -> bool: - """Check if the path is a valid git repository.""" - try: - self._run_git("rev-parse", "--git-dir") - return True - except (subprocess.CalledProcessError, FileNotFoundError, NotADirectoryError): - return False - - def get_current_commit(self) -> str: - """Get the current HEAD commit hash.""" - return self._run_git("rev-parse", "HEAD") - - def get_remote_url(self) -> str | None: - """Get the remote origin URL, if available.""" - try: - url = self._run_git("remote", "get-url", "origin") - return url if url else None - except subprocess.CalledProcessError: - return None - - def get_current_branch(self) -> str: - """Get the current branch name.""" - try: - return self._run_git("rev-parse", "--abbrev-ref", "HEAD") - except subprocess.CalledProcessError: - return "main" - - def scan_files(self) -> list[str]: - """Scan the repo for files 
matching include/exclude patterns. - - Returns relative paths from the repo root. - """ - # Use git ls-files to get tracked files - try: - output = self._run_git("ls-files") - except subprocess.CalledProcessError: - logger.warning("git ls-files failed, falling back to filesystem scan") - return self._scan_filesystem() - - all_files = output.splitlines() if output else [] - return self._filter_files(all_files) - - def _scan_filesystem(self) -> list[str]: - """Fallback: scan filesystem directly.""" - all_files = [] - for path in self._repo_path.rglob("*"): - if path.is_file(): - all_files.append(str(path.relative_to(self._repo_path))) - return self._filter_files(all_files) - - def _filter_files(self, files: list[str]) -> list[str]: - """Filter files by include/exclude patterns.""" - result = [] - for filepath in files: - # Check include patterns - included = any( - self._match_pattern(filepath, pattern) - for pattern in self._config.include_patterns - ) - if not included: - continue - - # Check exclude patterns - excluded = any( - self._match_pattern(filepath, pattern) - for pattern in self._config.exclude_patterns - ) - if excluded: - continue - - result.append(filepath) - - return sorted(result) - - @staticmethod - def _match_pattern(filepath: str, pattern: str) -> bool: - """Match a filepath against a glob pattern. - - Handles ** patterns correctly for both root and nested files. - For example, "**/*.md" matches both "README.md" and "docs/entity.md". - """ - path = Path(filepath) - # PurePath.match doesn't match root files against **/*.ext patterns - # So we also check with the plain extension pattern - if path.match(pattern): - return True - # If pattern starts with **/, also try without the **/ prefix - if pattern.startswith("**/"): - return path.match(pattern[3:]) - return False - - def get_changed_files(self, since_commit: str) -> list[str]: - """Get files changed since a given commit. - - Returns relative paths of changed files that match patterns. 
- """ - try: - output = self._run_git("diff", "--name-only", since_commit, "HEAD") - changed = output.splitlines() if output else [] - except subprocess.CalledProcessError: - logger.warning( - "git diff failed, returning all files", - since_commit=since_commit, - ) - return self.scan_files() - - return self._filter_files(changed) - - def get_deleted_files(self, since_commit: str) -> list[str]: - """Get files deleted since a given commit.""" - try: - output = self._run_git( - "diff", "--name-only", "--diff-filter=D", since_commit, "HEAD" - ) - deleted = output.splitlines() if output else [] - except subprocess.CalledProcessError: - return [] - - return self._filter_files(deleted) - - def read_file(self, relative_path: str) -> str: - """Read a file from the repository.""" - file_path = self._repo_path / relative_path - return file_path.read_text(encoding="utf-8") diff --git a/src/kb_engine/git/url_resolver.py b/src/kb_engine/git/url_resolver.py deleted file mode 100644 index 3bf5894..0000000 --- a/src/kb_engine/git/url_resolver.py +++ /dev/null @@ -1,78 +0,0 @@ -"""URL resolver for document references.""" - -import re -from pathlib import Path - -from kb_engine.core.models.repository import RepositoryConfig - - -class URLResolver: - """Resolves (relative_path, anchor) to full URLs. - - Supports two modes: - - Local: file:///absolute/path/to/doc.md#section - - Remote: https://github.com/org/repo/blob/main/doc.md#section - """ - - def __init__(self, config: RepositoryConfig) -> None: - self._config = config - self._repo_path = Path(config.local_path).resolve() - - def resolve(self, relative_path: str, anchor: str | None = None) -> str: - """Resolve a relative path and optional anchor to a full URL. - - If a remote_url or base_url_template is configured, produces - a remote URL. Otherwise, produces a local file:// URL. 
- """ - if self._config.base_url_template: - return self._resolve_template(relative_path, anchor) - elif self._config.remote_url: - return self._resolve_remote(relative_path, anchor) - else: - return self._resolve_local(relative_path, anchor) - - def _resolve_local(self, relative_path: str, anchor: str | None) -> str: - """Resolve to a local file:// URL.""" - absolute_path = self._repo_path / relative_path - url = f"file://{absolute_path}" - if anchor: - url += f"#{anchor}" - return url - - def _resolve_remote(self, relative_path: str, anchor: str | None) -> str: - """Resolve to a remote URL based on the git remote.""" - remote = self._config.remote_url or "" - base = self._normalize_remote_url(remote) - branch = self._config.branch - url = f"{base}/blob/{branch}/{relative_path}" - if anchor: - url += f"#{anchor}" - return url - - def _resolve_template(self, relative_path: str, anchor: str | None) -> str: - """Resolve using a custom URL template.""" - template = self._config.base_url_template or "" - remote = self._normalize_remote_url(self._config.remote_url or "") - url = template.replace("{remote}", remote) - url = url.replace("{branch}", self._config.branch) - url = url.replace("{path}", relative_path) - if anchor: - url += f"#{anchor}" - return url - - @staticmethod - def _normalize_remote_url(url: str) -> str: - """Normalize a git remote URL to an HTTPS base URL. - - Handles: - - git@github.com:org/repo.git -> https://github.com/org/repo - - https://github.com/org/repo.git -> https://github.com/org/repo - """ - # Strip .git suffix - url = re.sub(r"\.git$", "", url) - # Convert SSH to HTTPS - ssh_match = re.match(r"git@([^:]+):(.+)", url) - if ssh_match: - host, path = ssh_match.groups() - return f"https://{host}/{path}" - return url diff --git a/src/kb_engine/mcp_server.py b/src/kb_engine/mcp_server.py deleted file mode 100644 index 77cbe06..0000000 --- a/src/kb_engine/mcp_server.py +++ /dev/null @@ -1,358 +0,0 @@ -"""MCP Server for kb-engine. 
- -Exposes semantic search tools (kdd_search, kdd_related, kdd_list) -for AI agents via the Model Context Protocol. -""" - -from __future__ import annotations - -import asyncio -import atexit -import json -import signal -import sys -from typing import Any - -import click -from mcp.server.fastmcp import FastMCP - -from kb_engine.core.interfaces.repositories import GraphRepository, TraceabilityRepository -from kb_engine.services.retrieval import RetrievalService - -mcp = FastMCP("kb-engine", instructions="Semantic search over a KDD knowledge base.") - -# --- Lazy-initialized services --- - -_retrieval_service: RetrievalService | None = None -_graph_repo: GraphRepository | None = None -_traceability_repo: TraceabilityRepository | None = None -_factory: Any = None - - -async def _get_services() -> ( - tuple[RetrievalService, GraphRepository | None, TraceabilityRepository] -): - """Lazy-init: create RepositoryFactory + repos + pipelines on first use.""" - global _retrieval_service, _graph_repo, _traceability_repo, _factory - - if _retrieval_service is not None and _traceability_repo is not None: - return _retrieval_service, _graph_repo, _traceability_repo - - from kb_engine.config.settings import get_settings - from kb_engine.embedding.config import EmbeddingConfig - from kb_engine.pipelines.inference.pipeline import RetrievalPipeline - from kb_engine.repositories.factory import RepositoryFactory - - settings = get_settings() - _factory = RepositoryFactory(settings) - - traceability = await _factory.get_traceability_repository() - vector = await _factory.get_vector_repository() - graph = await _factory.get_graph_repository() - - embedding_config = EmbeddingConfig( - provider=settings.embedding_provider, - local_model_name=settings.local_embedding_model, - openai_model=settings.openai_embedding_model, - ) - - retrieval_pipeline = RetrievalPipeline( - traceability_repo=traceability, - vector_repo=vector, - graph_repo=graph, - embedding_config=embedding_config, - ) - - 
_retrieval_service = RetrievalService(pipeline=retrieval_pipeline) - _graph_repo = graph - _traceability_repo = traceability - - return _retrieval_service, _graph_repo, _traceability_repo - - -# --- Cleanup --- - - -def _cleanup() -> None: - """Close the factory on exit.""" - if _factory is not None: - try: - loop = asyncio.get_event_loop() - if loop.is_running(): - loop.create_task(_factory.close()) - else: - loop.run_until_complete(_factory.close()) - except Exception: - pass - - -atexit.register(_cleanup) - - -def _signal_handler(sig: int, frame: Any) -> None: - _cleanup() - sys.exit(0) - - -signal.signal(signal.SIGINT, _signal_handler) -signal.signal(signal.SIGTERM, _signal_handler) - - -# --- MCP Tools --- - - -@mcp.tool() -async def kdd_search( - query: str, - limit: int = 5, - mode: str = "vector", - chunk_types: list[str] | None = None, - domains: list[str] | None = None, - tags: list[str] | None = None, - score_threshold: float | None = None, -) -> str: - """Search the knowledge base semantically. - - Returns document references with URLs pointing to the matching sections. - Use this to find documentation, ADRs, design challenges, or any indexed content. - - Args: - query: Natural language search query. - limit: Maximum number of results (default 5). - mode: Retrieval mode - "vector" (semantic), "graph" (knowledge graph), or "hybrid" (both). - chunk_types: Filter by chunk types (e.g. ["header", "paragraph"]). - domains: Filter by document domains. - tags: Filter by document tags. - score_threshold: Minimum relevance score (0.0-1.0). 
- """ - from kb_engine.core.models.search import RetrievalMode, SearchFilters - - retrieval, _, _ = await _get_services() - - # Map mode string to enum - mode_map = { - "vector": RetrievalMode.VECTOR, - "graph": RetrievalMode.GRAPH, - "hybrid": RetrievalMode.HYBRID, - } - retrieval_mode = mode_map.get(mode.lower(), RetrievalMode.VECTOR) - - filters = None - if chunk_types or domains or tags: - filters = SearchFilters( - chunk_types=chunk_types, - domains=domains, - tags=tags, - ) - - response = await retrieval.search( - query=query, - mode=retrieval_mode, - limit=limit, - filters=filters, - score_threshold=score_threshold, - ) - - results = [] - for ref in response.references: - result = { - "url": ref.url, - "title": ref.title, - "section": ref.section_title, - "score": round(ref.score, 4), - "snippet": ref.snippet[:200] if ref.snippet else "", - "type": ref.chunk_type, - "domain": ref.domain, - "retrieval_mode": ref.retrieval_mode.value, - } - # Include graph metadata if present - if ref.metadata.get("graph_relationships"): - result["graph"] = { - "node_name": ref.metadata.get("graph_node_name"), - "node_type": ref.metadata.get("graph_node_type"), - "relationships": ref.metadata.get("graph_relationships"), - } - results.append(result) - - return json.dumps({ - "query": response.query, - "mode": retrieval_mode.value, - "total": response.total_count, - "results": results, - }) - - -@mcp.tool() -async def kdd_related( - entity: str, - depth: int = 1, - edge_types: list[str] | None = None, - limit: int = 20, -) -> str: - """Find entities related to a given entity in the knowledge graph. - - Traverses the knowledge graph to find connected concepts, entities, and events. - - Args: - entity: Name or pattern of the entity to search for. - depth: How many hops to traverse (default 1). - edge_types: Filter by relationship types (e.g. ["REFERENCES", "CONTAINS"]). - limit: Maximum number of related entities to return. 
- """ - _, graph_repo, traceability = await _get_services() - - if graph_repo is None: - return json.dumps({ - "error": "Graph store is not available. Configure graph_store in settings.", - }) - - nodes = await graph_repo.find_nodes(name_pattern=entity) - if not nodes: - return json.dumps({ - "entity": entity, - "related": [], - "message": f"No entity found matching '{entity}'.", - }) - - start_node = nodes[0] - triples = await graph_repo.traverse( - start_node_id=start_node.id, - max_hops=depth, - edge_types=edge_types, - ) - - related = [] - seen_ids = set() - for source, edge, target in triples[:limit]: - if target.id in seen_ids: - continue - seen_ids.add(target.id) - - doc_url = None - if target.source_document_id and traceability: - doc = await traceability.get_document(target.source_document_id) - if doc: - doc_url = f"file://{doc.source_path}" if doc.source_path else None - - related.append({ - "name": target.name, - "type": target.node_type.value if hasattr(target.node_type, "value") else str(target.node_type), - "relationship": edge.edge_type.value if hasattr(edge.edge_type, "value") else str(edge.edge_type), - "confidence": round(edge.confidence, 4), - "document_url": doc_url, - }) - - return json.dumps({ - "entity": { - "name": start_node.name, - "type": start_node.node_type.value if hasattr(start_node.node_type, "value") else str(start_node.node_type), - }, - "related": related, - }) - - -@mcp.tool() -async def kdd_list( - kind: str | None = None, - domain: str | None = None, - status: str | None = None, - limit: int = 20, -) -> str: - """List indexed documents in the knowledge base. - - Returns a summary of all indexed documents with metadata. - - Args: - kind: Filter by document kind (from KDD metadata, e.g. "adr", "challenge"). - domain: Filter by document domain. - status: Filter by document status (e.g. "indexed", "pending"). - limit: Maximum number of documents to return. 
- """ - from kb_engine.core.models.search import SearchFilters - - _, _, traceability = await _get_services() - - filters = None - if domain: - filters = SearchFilters(domains=[domain]) - - docs = await traceability.list_documents(filters=filters, limit=limit) - - # Apply in-memory filters that the repository doesn't support directly - if kind: - docs = [d for d in docs if d.metadata.get("kind") == kind] - if status: - docs = [d for d in docs if d.status.value == status] - - results = [] - for doc in docs: - chunks = await traceability.get_chunks_by_document(doc.id) - results.append({ - "path": doc.relative_path or doc.source_path, - "title": doc.title, - "kind": doc.metadata.get("kind"), - "domain": doc.domain, - "status": doc.status.value, - "chunks": len(chunks), - "indexed_at": doc.indexed_at.isoformat() if doc.indexed_at else None, - }) - - return json.dumps({"total": len(results), "documents": results}) - - -# --- CLI fallback --- - - -@click.group() -def mcp_cli() -> None: - """KB-Engine MCP Server CLI (for testing).""" - pass - - -@mcp_cli.command("search") -@click.argument("query") -@click.option("--limit", "-l", default=5, help="Max results") -@click.option("--mode", "-m", type=click.Choice(["vector", "graph", "hybrid"]), default="vector", help="Retrieval mode") -def cli_search(query: str, limit: int, mode: str) -> None: - """Search the knowledge base.""" - result = asyncio.run(kdd_search(query=query, limit=limit, mode=mode)) - data = json.loads(result) - click.echo(json.dumps(data, indent=2)) - - -@mcp_cli.command("related") -@click.argument("entity") -@click.option("--depth", "-d", default=1, help="Traversal depth") -def cli_related(entity: str, depth: int) -> None: - """Find related entities.""" - result = asyncio.run(kdd_related(entity=entity, depth=depth)) - data = json.loads(result) - click.echo(json.dumps(data, indent=2)) - - -@mcp_cli.command("list") -@click.option("--kind", "-k", default=None, help="Filter by document kind") 
-@click.option("--domain", "-d", default=None, help="Filter by domain") -@click.option("--status", "-s", default=None, help="Filter by status") -@click.option("--limit", "-l", default=20, help="Max results") -def cli_list(kind: str | None, domain: str | None, status: str | None, limit: int) -> None: - """List indexed documents.""" - result = asyncio.run(kdd_list(kind=kind, domain=domain, status=status, limit=limit)) - data = json.loads(result) - click.echo(json.dumps(data, indent=2)) - - -# --- Entry points --- - - -def main() -> None: - """Entry point for the kb-mcp script.""" - mcp.run() - - -if __name__ == "__main__": - if len(sys.argv) > 1 and sys.argv[1] == "--cli": - sys.argv.pop(1) - mcp_cli() - else: - mcp.run() diff --git a/src/kb_engine/pipelines/__init__.py b/src/kb_engine/pipelines/__init__.py deleted file mode 100644 index 62b8c18..0000000 --- a/src/kb_engine/pipelines/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -"""Processing pipelines for KB-Engine.""" - -from kb_engine.pipelines.indexation import IndexationPipeline -from kb_engine.pipelines.inference import RetrievalPipeline - -__all__ = ["IndexationPipeline", "RetrievalPipeline"] diff --git a/src/kb_engine/pipelines/indexation/__init__.py b/src/kb_engine/pipelines/indexation/__init__.py deleted file mode 100644 index 48aab5d..0000000 --- a/src/kb_engine/pipelines/indexation/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Indexation pipeline for document processing.""" - -from kb_engine.pipelines.indexation.pipeline import IndexationPipeline - -__all__ = ["IndexationPipeline"] diff --git a/src/kb_engine/pipelines/indexation/pipeline.py b/src/kb_engine/pipelines/indexation/pipeline.py deleted file mode 100644 index 9ed7e16..0000000 --- a/src/kb_engine/pipelines/indexation/pipeline.py +++ /dev/null @@ -1,330 +0,0 @@ -"""Main indexation pipeline.""" - -from __future__ import annotations - -from datetime import datetime -from pathlib import Path -from typing import TYPE_CHECKING - -import structlog - -from 
kb_engine.chunking import ChunkerFactory, ChunkingConfig -from kb_engine.core.exceptions import PipelineError -from kb_engine.core.models.document import Document, DocumentStatus, KDDStatus -from kb_engine.core.models.repository import ( - EXTENSION_DEFAULTS, - FileTypeConfig, - RepositoryConfig, -) -from kb_engine.embedding import EmbeddingConfig, EmbeddingProviderFactory -from kb_engine.extraction import ExtractionConfig, ExtractionPipelineFactory -from kb_engine.git.scanner import GitRepoScanner -from kb_engine.git.url_resolver import URLResolver -from kb_engine.utils.hashing import compute_content_hash -from kb_engine.utils.markdown import extract_frontmatter, heading_path_to_anchor - -if TYPE_CHECKING: - from kb_engine.extraction.strategies import GraphExtractionStrategy - -logger = structlog.get_logger(__name__) - - -class IndexationPipeline: - """Pipeline for indexing documents into the knowledge base. - - Orchestrates the full indexation process: - 1. Parse and validate document - 2. Chunk content using semantic strategies - 3. Compute section anchors for each chunk - 4. Generate embeddings - 5. Extract entities and relationships (optional) - 6. 
Store in all repositories - """ - - def __init__( - self, - traceability_repo, - vector_repo, - graph_repo=None, - graph_strategy: GraphExtractionStrategy | None = None, - url_resolver: URLResolver | None = None, - chunking_config: ChunkingConfig | None = None, - embedding_config: EmbeddingConfig | None = None, - extraction_config: ExtractionConfig | None = None, - ) -> None: - self._traceability = traceability_repo - self._vector = vector_repo - self._url_resolver = url_resolver - - # Initialize components - self._chunker = ChunkerFactory(chunking_config) - self._embedding_provider = EmbeddingProviderFactory(embedding_config).create_provider() - self._extraction_pipeline = ExtractionPipelineFactory(extraction_config).create_pipeline() - - # Graph strategy: explicit strategy > legacy wrapper around graph_repo > None - if graph_strategy is not None: - self._graph_strategy = graph_strategy - elif graph_repo is not None: - from kb_engine.extraction.strategies import LegacyGraphExtractionStrategy - - self._graph_strategy = LegacyGraphExtractionStrategy( - graph_repo, self._extraction_pipeline - ) - else: - self._graph_strategy = None - - async def index_document(self, document: Document) -> Document: - """Index a document through the full pipeline.""" - try: - document.status = DocumentStatus.PROCESSING - document.content_hash = compute_content_hash(document.content) - - # 1. Save document to traceability store - logger.debug("Step 1/8: saving document", title=document.title) - document = await self._traceability.save_document(document) - - # 2. Chunk the document - logger.debug("Step 2/8: chunking document", title=document.title) - parser = document.metadata.get("_parser", "markdown") - chunks = self._chunker.chunk_document(document, parser=parser) - - # 3. 
Compute section anchors and propagate KDD status to chunks - logger.debug("Step 3/8: computing anchors", chunks=len(chunks)) - for chunk in chunks: - chunk.section_anchor = heading_path_to_anchor(chunk.heading_path) - # Propagate KDD lifecycle fields to chunk metadata - chunk.metadata["kdd_status"] = document.kdd_status.value - if document.kdd_version: - chunk.metadata["kdd_version"] = document.kdd_version - - # 4. Save chunks to traceability store - logger.debug("Step 4/8: saving chunks", chunks=len(chunks)) - chunks = await self._traceability.save_chunks(chunks) - - # 5. Generate embeddings - logger.debug("Step 5/8: generating embeddings", chunks=len(chunks)) - embeddings = await self._embedding_provider.embed_chunks(chunks) - - # 6. Store embeddings in vector store - logger.debug("Step 6/8: storing embeddings", count=len(embeddings)) - await self._vector.upsert_embeddings(embeddings) - - # 7. Extract entities and store in graph (if strategy available) - if self._graph_strategy is not None: - logger.debug("Step 7/8: extracting entities", title=document.title) - graph_result = await self._graph_strategy.extract_and_store( - document, chunks - ) - logger.debug( - "Step 7/8: entities extracted", - nodes=graph_result.nodes_created, - edges=graph_result.edges_created, - ) - - # 8. 
Update document status - logger.debug("Step 8/8: updating status", title=document.title) - document.status = DocumentStatus.INDEXED - document.indexed_at = datetime.utcnow() - document = await self._traceability.update_document(document) - - logger.info( - "Document indexed", - document_id=str(document.id), - title=document.title, - chunks=len(chunks), - ) - return document - - except Exception as e: - document.status = DocumentStatus.FAILED - try: - await self._traceability.update_document(document) - except Exception: - pass - raise PipelineError( - f"Failed to index document: {e}", - details={"document_id": str(document.id)}, - ) from e - - async def reindex_document(self, document: Document) -> Document: - """Reindex an existing document.""" - await self._vector.delete_by_document(document.id) - if self._graph_strategy is not None: - await self._graph_strategy.delete_by_document(document.relative_path or str(document.id)) - await self._traceability.delete_chunks_by_document(document.id) - return await self.index_document(document) - - async def delete_document(self, document: Document) -> bool: - """Delete a document and all its indexed data.""" - await self._vector.delete_by_document(document.id) - if self._graph_strategy is not None: - await self._graph_strategy.delete_by_document(document.relative_path or str(document.id)) - await self._traceability.delete_chunks_by_document(document.id) - return await self._traceability.delete_document(document.id) - - @staticmethod - def _resolve_file_type_config( - repo_config: RepositoryConfig, relative_path: str - ) -> FileTypeConfig: - """Resolve the FileTypeConfig for a file based on its extension.""" - ext = Path(relative_path).suffix.lower() - if ext in repo_config.file_type_config: - return repo_config.file_type_config[ext] - if ext in EXTENSION_DEFAULTS: - return EXTENSION_DEFAULTS[ext] - return FileTypeConfig(parser="plaintext", mime_type="text/plain") - - def _build_document( - self, - scanner: GitRepoScanner, - 
repo_config: RepositoryConfig, - relative_path: str, - commit: str, - remote_url: str | None, - existing_id=None, - content: str | None = None, - ) -> Document: - """Build a Document from a repository file with file-type-aware parsing.""" - if content is None: - content = scanner.read_file(relative_path) - title = Path(relative_path).stem - ft_config = self._resolve_file_type_config(repo_config, relative_path) - - if ft_config.parser == "markdown": - frontmatter, body = extract_frontmatter(content) - else: - frontmatter = {} - - metadata = {**frontmatter, "_parser": ft_config.parser} - - # Extract KDD lifecycle fields from frontmatter - kdd_status = KDDStatus.from_string(frontmatter.get("status")) - kdd_version = frontmatter.get("version") - - kwargs: dict = dict( - title=frontmatter.get("title", title), - content=content, - source_path=str(scanner.repo_path / relative_path), - external_id=f"{repo_config.name}:{relative_path}", - domain=frontmatter.get("domain"), - tags=frontmatter.get("tags", []), - metadata=metadata, - mime_type=ft_config.mime_type, - repo_name=repo_config.name, - relative_path=relative_path, - git_commit=commit, - git_remote_url=remote_url, - kdd_status=kdd_status, - kdd_version=kdd_version, - ) - if existing_id is not None: - kwargs["id"] = existing_id - return Document(**kwargs) - - async def index_repository(self, repo_config: RepositoryConfig) -> list[Document]: - """Index all matching files from a Git repository.""" - scanner = GitRepoScanner(repo_config) - if not scanner.is_git_repo(): - raise PipelineError(f"Not a git repository: {repo_config.local_path}") - - resolver = self._url_resolver or URLResolver(repo_config) - commit = scanner.get_current_commit() - remote_url = scanner.get_remote_url() - files = scanner.scan_files() - - logger.info( - "Indexing repository", - repo=repo_config.name, - files=len(files), - commit=commit[:8], - ) - - documents = [] - for relative_path in files: - try: - doc = self._build_document( - scanner, 
repo_config, relative_path, commit, remote_url - ) - doc = await self.index_document(doc) - documents.append(doc) - except Exception as e: - logger.error( - "Failed to index file", - path=relative_path, - error=str(e), - ) - - return documents - - async def sync_repository(self, repo_config: RepositoryConfig, since_commit: str) -> dict: - """Incrementally sync a repository since a given commit. - - Returns a summary dict with indexed, deleted, and skipped counts. - """ - scanner = GitRepoScanner(repo_config) - resolver = self._url_resolver or URLResolver(repo_config) - current_commit = scanner.get_current_commit() - remote_url = scanner.get_remote_url() - - changed_files = scanner.get_changed_files(since_commit) - deleted_files = scanner.get_deleted_files(since_commit) - - logger.info( - "Syncing repository", - repo=repo_config.name, - changed=len(changed_files), - deleted=len(deleted_files), - since=since_commit[:8], - ) - - indexed = 0 - skipped = 0 - - # Delete removed files - for relative_path in deleted_files: - external_id = f"{repo_config.name}:{relative_path}" - existing = await self._traceability.get_document_by_external_id(external_id) - if existing: - await self.delete_document(existing) - - # Reindex changed files - for relative_path in changed_files: - try: - content = scanner.read_file(relative_path) - content_hash = compute_content_hash(content) - - external_id = f"{repo_config.name}:{relative_path}" - existing = await self._traceability.get_document_by_external_id(external_id) - - if existing and existing.content_hash == content_hash: - skipped += 1 - continue - - doc = self._build_document( - scanner, - repo_config, - relative_path, - current_commit, - remote_url, - existing_id=existing.id if existing else None, - content=content, - ) - - if existing: - await self.reindex_document(doc) - else: - await self.index_document(doc) - indexed += 1 - except Exception as e: - logger.error( - "Failed to sync file", - path=relative_path, - error=str(e), - ) - 
- return { - "commit": current_commit, - "indexed": indexed, - "deleted": len(deleted_files), - "skipped": skipped, - } diff --git a/src/kb_engine/pipelines/inference/__init__.py b/src/kb_engine/pipelines/inference/__init__.py deleted file mode 100644 index 9126368..0000000 --- a/src/kb_engine/pipelines/inference/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Retrieval pipeline for query processing.""" - -from kb_engine.pipelines.inference.pipeline import RetrievalPipeline - -__all__ = ["RetrievalPipeline"] diff --git a/src/kb_engine/pipelines/inference/pipeline.py b/src/kb_engine/pipelines/inference/pipeline.py deleted file mode 100644 index fec302b..0000000 --- a/src/kb_engine/pipelines/inference/pipeline.py +++ /dev/null @@ -1,307 +0,0 @@ -"""Retrieval pipeline - returns document references with URLs.""" - -import time - -import structlog - -from kb_engine.core.models.search import ( - DocumentReference, - RetrievalMode, - RetrievalResponse, - SearchFilters, -) -from kb_engine.embedding import EmbeddingConfig, EmbeddingProviderFactory -from kb_engine.git.url_resolver import URLResolver -from kb_engine.utils.markdown import extract_snippet, heading_path_to_anchor - -logger = structlog.get_logger(__name__) - - -class RetrievalPipeline: - """Pipeline for processing retrieval queries. - - Returns DocumentReference objects with full URLs (including #anchors) - instead of raw document content. This allows external agents to - read the source documents directly. 
- """ - - def __init__( - self, - traceability_repo, - vector_repo, - graph_repo=None, - url_resolver: URLResolver | None = None, - embedding_config: EmbeddingConfig | None = None, - ) -> None: - self._traceability = traceability_repo - self._vector = vector_repo - self._graph = graph_repo - self._url_resolver = url_resolver - - self._embedding_provider = EmbeddingProviderFactory(embedding_config).create_provider() - - async def search( - self, - query: str, - mode: RetrievalMode = RetrievalMode.VECTOR, - filters: SearchFilters | None = None, - limit: int = 10, - score_threshold: float | None = None, - ) -> RetrievalResponse: - """Execute a retrieval query, returning document references.""" - start_time = time.time() - - references: list[DocumentReference] = [] - - if mode in (RetrievalMode.VECTOR, RetrievalMode.HYBRID): - vector_refs = await self._vector_search( - query, filters, limit, score_threshold - ) - references.extend(vector_refs) - - if mode in (RetrievalMode.GRAPH, RetrievalMode.HYBRID): - graph_refs = await self._graph_search(query, filters, limit) - references.extend(graph_refs) - - # Deduplicate by URL if hybrid - if mode == RetrievalMode.HYBRID: - references = self._deduplicate_references(references, limit) - - # Sort by score descending - references.sort(key=lambda r: r.score, reverse=True) - references = references[:limit] - - processing_time = (time.time() - start_time) * 1000 - - return RetrievalResponse( - query=query, - references=references, - total_count=len(references), - processing_time_ms=processing_time, - ) - - async def _vector_search( - self, - query: str, - filters: SearchFilters | None, - limit: int, - score_threshold: float | None, - ) -> list[DocumentReference]: - """Perform vector similarity search and resolve to references.""" - query_vector = await self._embedding_provider.embed_text(query) - - chunk_scores = await self._vector.search( - query_vector=query_vector, - limit=limit, - filters=filters, - 
score_threshold=score_threshold, - ) - - references = [] - for chunk_id, score in chunk_scores: - chunk = await self._traceability.get_chunk(chunk_id) - if not chunk: - continue - - document = await self._traceability.get_document(chunk.document_id) - if not document: - continue - - # Resolve URL - anchor = chunk.section_anchor or heading_path_to_anchor(chunk.heading_path) - if self._url_resolver and document.relative_path: - url = self._url_resolver.resolve(document.relative_path, anchor) - elif document.source_path: - url = f"file://{document.source_path}" - if anchor: - url += f"#{anchor}" - else: - url = f"doc://{document.id}" - - # Build section title from heading path - section_title = chunk.heading_path[-1] if chunk.heading_path else None - - references.append( - DocumentReference( - url=url, - document_path=document.relative_path or document.source_path or "", - section_anchor=anchor, - title=document.title, - section_title=section_title, - score=score, - snippet=extract_snippet(chunk.content), - domain=document.domain, - tags=document.tags, - chunk_type=chunk.chunk_type.value, - metadata=chunk.metadata, - retrieval_mode=RetrievalMode.VECTOR, - kdd_status=document.kdd_status.value, - kdd_version=document.kdd_version, - ) - ) - - return references - - async def _graph_search( - self, - query: str, - filters: SearchFilters | None, - limit: int, - ) -> list[DocumentReference]: - """Graph-based search: find nodes matching query and return related documents.""" - if not self._graph: - logger.debug("graph_search.skipped", reason="no graph repository") - return [] - - # Find nodes matching the query by name - matching_nodes = await self._graph.find_nodes( - name_pattern=query, - limit=limit * 2, # Get extra to account for filtering - ) - - if not matching_nodes: - return [] - - references: list[DocumentReference] = [] - seen_keys: set[str] = set() - - for node in matching_nodes: - # Get edges for this node to include relationship info - edges = await 
self._graph.get_edges(node.id, direction="both") - - # Build relationship metadata - relationships = [] - for edge in edges[:5]: # Limit to 5 relationships per node - other_node_id = edge.target_id if edge.source_id == node.id else edge.source_id - other_node = await self._graph.get_node(other_node_id) - other_name = other_node.name if other_node else str(other_node_id) - - rel_info = { - "type": edge.edge_type.value, - "direction": "outgoing" if edge.source_id == node.id else "incoming", - "related_node": other_name, - "confidence": edge.confidence, - } - relationships.append(rel_info) - - # Try to resolve document via source_chunk_id first - chunk = None - document = None - anchor = None - section_title = None - - if node.source_chunk_id: - chunk_key = str(node.source_chunk_id) - if chunk_key in seen_keys: - continue - seen_keys.add(chunk_key) - - chunk = await self._traceability.get_chunk(node.source_chunk_id) - if chunk: - document = await self._traceability.get_document(chunk.document_id) - anchor = chunk.section_anchor or heading_path_to_anchor(chunk.heading_path) - section_title = chunk.heading_path[-1] if chunk.heading_path else None - - # Fallback: try to find document by node name (for nodes without chunk_id) - if not document and node.source_document_id: - doc_key = str(node.source_document_id) - if doc_key in seen_keys: - continue - seen_keys.add(doc_key) - document = await self._traceability.get_document(node.source_document_id) - - # Last resort: search for document by title matching node name - if not document: - node_key = f"node:{node.id}" - if node_key in seen_keys: - continue - seen_keys.add(node_key) - - # Search documents by title - docs = await self._traceability.list_documents(limit=100) - for doc in docs: - if doc.title and node.name.lower() in doc.title.lower(): - document = doc - break - - if not document: - continue - - # Resolve URL - if self._url_resolver and document.relative_path: - url = 
self._url_resolver.resolve(document.relative_path, anchor) - elif document.source_path: - url = f"file://{document.source_path}" - if anchor: - url += f"#{anchor}" - else: - url = f"doc://{document.id}" - - # Build metadata with graph info - metadata = dict(chunk.metadata) if chunk and chunk.metadata else {} - metadata["graph_node_name"] = node.name - metadata["graph_node_type"] = node.node_type.value - metadata["graph_relationships"] = relationships - if node.description: - metadata["graph_node_description"] = node.description - - # Score based on node confidence and number of relationships - score = node.confidence * (1 + len(relationships) * 0.1) - - # Build snippet from chunk or node description - if chunk: - snippet = extract_snippet(chunk.content) - chunk_type = chunk.chunk_type.value - else: - snippet = node.description or f"Graph node: {node.name}" - chunk_type = "graph_node" - - references.append( - DocumentReference( - url=url, - document_path=document.relative_path or document.source_path or "", - section_anchor=anchor, - title=document.title, - section_title=section_title, - score=min(score, 1.0), - snippet=snippet, - domain=document.domain, - tags=document.tags, - chunk_type=chunk_type, - metadata=metadata, - retrieval_mode=RetrievalMode.GRAPH, - kdd_status=document.kdd_status.value, - kdd_version=document.kdd_version, - ) - ) - - if len(references) >= limit: - break - - return references - - def _deduplicate_references( - self, - references: list[DocumentReference], - limit: int, - ) -> list[DocumentReference]: - """Deduplicate references using Reciprocal Rank Fusion.""" - url_scores: dict[str, tuple[DocumentReference, float]] = {} - k = 60 # RRF constant - - for rank, ref in enumerate(references): - rrf_score = 1.0 / (k + rank + 1) - if ref.url in url_scores: - existing_ref, existing_score = url_scores[ref.url] - url_scores[ref.url] = (existing_ref, existing_score + rrf_score) - else: - url_scores[ref.url] = (ref, rrf_score) - - merged = [] - for 
ref, rrf_score in url_scores.values(): - ref.score = rrf_score - ref.retrieval_mode = RetrievalMode.HYBRID - merged.append(ref) - - merged.sort(key=lambda r: r.score, reverse=True) - return merged[:limit] diff --git a/src/kb_engine/py.typed b/src/kb_engine/py.typed deleted file mode 100644 index e69de29..0000000 diff --git a/src/kb_engine/repositories/__init__.py b/src/kb_engine/repositories/__init__.py deleted file mode 100644 index a3c51c9..0000000 --- a/src/kb_engine/repositories/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Repository implementations for KB-Engine.""" - -from kb_engine.repositories.factory import RepositoryFactory - -__all__ = ["RepositoryFactory"] diff --git a/src/kb_engine/repositories/factory.py b/src/kb_engine/repositories/factory.py deleted file mode 100644 index d73c801..0000000 --- a/src/kb_engine/repositories/factory.py +++ /dev/null @@ -1,161 +0,0 @@ -"""Repository factory for creating repository instances.""" - -from typing import TYPE_CHECKING - -import structlog - -if TYPE_CHECKING: - from kb_engine.config.settings import Settings - -logger = structlog.get_logger(__name__) - - -class RepositoryFactory: - """Factory for creating repository instances. - - Creates the appropriate repository implementations based on - configuration settings (profile: local vs server). 
- """ - - def __init__(self, settings: "Settings") -> None: - self._settings = settings - self._traceability = None - self._vector = None - self._graph = None - self._graph_strategy = None - - async def get_traceability_repository(self): - """Get or create the traceability repository.""" - if self._traceability is None: - store = self._settings.traceability_store.lower() - - if store == "sqlite": - from kb_engine.repositories.traceability.sqlite import SQLiteRepository - - self._traceability = SQLiteRepository( - db_path=self._settings.sqlite_path, - ) - await self._traceability.initialize() - elif store == "postgres": - from kb_engine.repositories.traceability.postgres import PostgresRepository - - self._traceability = PostgresRepository( - connection_string=self._settings.database_url, - ) - else: - raise ValueError(f"Unknown traceability store: {store}") - - logger.info("Traceability repository created", store=store) - - return self._traceability - - async def get_vector_repository(self): - """Get or create the vector repository.""" - if self._vector is None: - vector_store = self._settings.vector_store.lower() - - if vector_store == "chroma": - from kb_engine.repositories.vector.chroma import ChromaRepository - - self._vector = ChromaRepository( - persist_directory=self._settings.chroma_path, - ) - await self._vector.initialize() - elif vector_store == "qdrant": - from kb_engine.repositories.vector.qdrant import QdrantRepository - - self._vector = QdrantRepository( - host=self._settings.qdrant_host, - port=self._settings.qdrant_port, - api_key=self._settings.qdrant_api_key, - collection_name=self._settings.qdrant_collection, - ) - else: - raise ValueError(f"Unknown vector store: {vector_store}") - - logger.info("Vector repository created", store=vector_store) - - return self._vector - - async def get_graph_repository(self): - """Get or create the graph repository (optional). - - For retrieval, we use SQLiteGraphRepository which stores nodes/edges - in SQLite. 
This works regardless of which extraction strategy is used - (smart/falkordb stores extraction results in both FalkorDB and SQLite). - """ - if self._graph is None: - graph_store = self._settings.graph_store.lower() - - if graph_store == "none": - return None - elif graph_store in ("sqlite", "falkordb"): - from kb_engine.repositories.graph.sqlite import SQLiteGraphRepository - - self._graph = SQLiteGraphRepository( - db_path=self._settings.sqlite_path, - ) - await self._graph.initialize() - elif graph_store == "neo4j": - from kb_engine.repositories.graph.neo4j import Neo4jRepository - - self._graph = Neo4jRepository( - uri=self._settings.neo4j_uri, - user=self._settings.neo4j_user, - password=self._settings.neo4j_password, - ) - else: - raise ValueError(f"Unknown graph store: {graph_store}") - - logger.info("Graph repository created", store=graph_store) - - return self._graph - - async def get_graph_strategy(self): - """Get or create the graph extraction strategy. - - Returns a GraphExtractionStrategy based on settings: - - "falkordb": SmartGraphExtractionStrategy (FalkorDB + KDD-aware extraction) - - "sqlite" | "neo4j": LegacyGraphExtractionStrategy (wraps existing repo) - - "none": None - """ - if self._graph_strategy is not None: - return self._graph_strategy - - graph_store = self._settings.graph_store.lower() - - if graph_store == "none": - return None - elif graph_store == "falkordb": - from kb_engine.smart.stores.falkordb_graph import FalkorDBGraphStore - from kb_engine.extraction.strategies import SmartGraphExtractionStrategy - - store = FalkorDBGraphStore(db_path=self._settings.falkordb_path) - store.initialize() - self._graph_strategy = SmartGraphExtractionStrategy(store) - logger.info("Graph strategy created", strategy="smart", store="falkordb") - elif graph_store in ("sqlite", "neo4j"): - from kb_engine.extraction.strategies import LegacyGraphExtractionStrategy - from kb_engine.extraction import ExtractionConfig, ExtractionPipelineFactory - - graph_repo 
= await self.get_graph_repository() - extraction_pipeline = ExtractionPipelineFactory(ExtractionConfig()).create_pipeline() - self._graph_strategy = LegacyGraphExtractionStrategy( - graph_repo, extraction_pipeline - ) - logger.info("Graph strategy created", strategy="legacy", store=graph_store) - else: - raise ValueError(f"Unknown graph store: {graph_store}") - - return self._graph_strategy - - async def close(self) -> None: - """Close all repository connections.""" - if hasattr(self._traceability, "close"): - await self._traceability.close() - if hasattr(self._graph, "close"): - await self._graph.close() - self._traceability = None - self._vector = None - self._graph = None - self._graph_strategy = None diff --git a/src/kb_engine/repositories/graph/__init__.py b/src/kb_engine/repositories/graph/__init__.py deleted file mode 100644 index 33cb18b..0000000 --- a/src/kb_engine/repositories/graph/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Graph repository implementations.""" diff --git a/src/kb_engine/repositories/graph/neo4j.py b/src/kb_engine/repositories/graph/neo4j.py deleted file mode 100644 index a57c7ab..0000000 --- a/src/kb_engine/repositories/graph/neo4j.py +++ /dev/null @@ -1,89 +0,0 @@ -"""Neo4j implementation of the graph repository.""" - -from uuid import UUID - -from kb_engine.core.interfaces.repositories import GraphRepository -from kb_engine.core.models.graph import Edge, Node - - -class Neo4jRepository(GraphRepository): - """Neo4j implementation for knowledge graph storage. - - Uses the official neo4j Python driver for async operations. 
- """ - - def __init__( - self, - uri: str = "bolt://localhost:7687", - user: str = "neo4j", - password: str = "", - ) -> None: - self._uri = uri - self._user = user - self._password = password - self._driver = None - - async def _ensure_connected(self) -> None: - """Ensure Neo4j driver is connected.""" - if self._driver is None: - # TODO: Initialize neo4j driver - pass - - async def create_node(self, node: Node) -> Node: - """Create a node in Neo4j.""" - await self._ensure_connected() - raise NotImplementedError("Neo4jRepository.create_node not implemented") - - async def get_node(self, node_id: UUID) -> Node | None: - """Get a node by ID from Neo4j.""" - await self._ensure_connected() - raise NotImplementedError("Neo4jRepository.get_node not implemented") - - async def find_nodes( - self, - node_type: str | None = None, - name_pattern: str | None = None, - limit: int = 100, - ) -> list[Node]: - """Find nodes by type or name pattern.""" - await self._ensure_connected() - raise NotImplementedError("Neo4jRepository.find_nodes not implemented") - - async def create_edge(self, edge: Edge) -> Edge: - """Create an edge between nodes in Neo4j.""" - await self._ensure_connected() - raise NotImplementedError("Neo4jRepository.create_edge not implemented") - - async def get_edges( - self, - node_id: UUID, - direction: str = "both", - edge_type: str | None = None, - ) -> list[Edge]: - """Get edges connected to a node.""" - await self._ensure_connected() - raise NotImplementedError("Neo4jRepository.get_edges not implemented") - - async def traverse( - self, - start_node_id: UUID, - max_hops: int = 2, - edge_types: list[str] | None = None, - ) -> list[tuple[Node, Edge, Node]]: - """Traverse the graph from a starting node.""" - await self._ensure_connected() - raise NotImplementedError("Neo4jRepository.traverse not implemented") - - async def delete_by_document(self, document_id: UUID) -> int: - """Delete all nodes and edges from a document.""" - await self._ensure_connected() - 
raise NotImplementedError("Neo4jRepository.delete_by_document not implemented") - - async def find_similar_nodes( - self, - node_id: UUID, - limit: int = 10, - ) -> list[tuple[Node, float]]: - """Find similar nodes based on graph structure.""" - await self._ensure_connected() - raise NotImplementedError("Neo4jRepository.find_similar_nodes not implemented") diff --git a/src/kb_engine/repositories/graph/sqlite.py b/src/kb_engine/repositories/graph/sqlite.py deleted file mode 100644 index 9a6f107..0000000 --- a/src/kb_engine/repositories/graph/sqlite.py +++ /dev/null @@ -1,393 +0,0 @@ -"""SQLite implementation of the graph repository.""" - -import json -from uuid import UUID - -import aiosqlite -import structlog - -from kb_engine.core.models.graph import Edge, EdgeType, Node, NodeType - -logger = structlog.get_logger(__name__) - -CREATE_GRAPH_TABLES_SQL = """ -CREATE TABLE IF NOT EXISTS graph_nodes ( - id TEXT PRIMARY KEY, - external_id TEXT, - name TEXT NOT NULL, - node_type TEXT NOT NULL, - description TEXT, - source_document_id TEXT, - source_chunk_id TEXT, - properties TEXT DEFAULT '{}', - confidence REAL DEFAULT 1.0, - extraction_method TEXT, - created_at TEXT, - updated_at TEXT -); - -CREATE TABLE IF NOT EXISTS graph_edges ( - id TEXT PRIMARY KEY, - source_id TEXT NOT NULL, - target_id TEXT NOT NULL, - edge_type TEXT NOT NULL, - name TEXT, - properties TEXT DEFAULT '{}', - weight REAL DEFAULT 1.0, - source_document_id TEXT, - source_chunk_id TEXT, - confidence REAL DEFAULT 1.0, - extraction_method TEXT, - created_at TEXT, - FOREIGN KEY (source_id) REFERENCES graph_nodes(id) ON DELETE CASCADE, - FOREIGN KEY (target_id) REFERENCES graph_nodes(id) ON DELETE CASCADE -); - -CREATE INDEX IF NOT EXISTS idx_graph_nodes_name ON graph_nodes(name); -CREATE INDEX IF NOT EXISTS idx_graph_nodes_type ON graph_nodes(node_type); -CREATE INDEX IF NOT EXISTS idx_graph_nodes_document ON graph_nodes(source_document_id); -CREATE INDEX IF NOT EXISTS idx_graph_edges_source ON 
graph_edges(source_id); -CREATE INDEX IF NOT EXISTS idx_graph_edges_target ON graph_edges(target_id); -CREATE INDEX IF NOT EXISTS idx_graph_edges_document ON graph_edges(source_document_id); -""" - - -class SQLiteGraphRepository: - """SQLite implementation for knowledge graph storage. - - Stores nodes and edges in the same SQLite database as traceability. - Uses recursive CTEs for graph traversal. - """ - - def __init__(self, db_path: str) -> None: - self._db_path = db_path - self._db: aiosqlite.Connection | None = None - - async def initialize(self) -> None: - """Initialize graph tables in the database.""" - from pathlib import Path - Path(self._db_path).parent.mkdir(parents=True, exist_ok=True) - self._db = await aiosqlite.connect(self._db_path) - self._db.row_factory = aiosqlite.Row - await self._db.executescript(CREATE_GRAPH_TABLES_SQL) - await self._db.commit() - logger.info("SQLite graph repository initialized", db_path=self._db_path) - - async def _ensure_connected(self) -> aiosqlite.Connection: - if self._db is None: - await self.initialize() - assert self._db is not None - return self._db - - async def close(self) -> None: - if self._db: - await self._db.close() - self._db = None - - # --- Node operations --- - - async def create_node(self, node: Node) -> Node: - db = await self._ensure_connected() - await db.execute( - """INSERT OR REPLACE INTO graph_nodes - (id, external_id, name, node_type, description, - source_document_id, source_chunk_id, properties, - confidence, extraction_method, created_at, updated_at) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", - ( - str(node.id), - node.external_id, - node.name, - node.node_type.value, - node.description, - str(node.source_document_id) if node.source_document_id else None, - str(node.source_chunk_id) if node.source_chunk_id else None, - json.dumps(node.properties), - node.confidence, - node.extraction_method, - node.created_at.isoformat(), - node.updated_at.isoformat(), - ), - ) - await db.commit() - 
return node - - async def get_node(self, node_id: UUID) -> Node | None: - db = await self._ensure_connected() - cursor = await db.execute( - "SELECT * FROM graph_nodes WHERE id = ?", (str(node_id),) - ) - row = await cursor.fetchone() - if row is None: - return None - return self._row_to_node(row) - - async def find_nodes( - self, - node_type: str | None = None, - name_pattern: str | None = None, - limit: int = 100, - ) -> list[Node]: - db = await self._ensure_connected() - query = "SELECT * FROM graph_nodes" - params: list = [] - conditions: list[str] = [] - - if node_type: - conditions.append("node_type = ?") - params.append(node_type) - if name_pattern: - conditions.append("name LIKE ?") - params.append(f"%{name_pattern}%") - - if conditions: - query += " WHERE " + " AND ".join(conditions) - query += " LIMIT ?" - params.append(limit) - - cursor = await db.execute(query, params) - rows = await cursor.fetchall() - return [self._row_to_node(row) for row in rows] - - # --- Edge operations --- - - async def create_edge(self, edge: Edge) -> Edge: - db = await self._ensure_connected() - await db.execute( - """INSERT OR REPLACE INTO graph_edges - (id, source_id, target_id, edge_type, name, properties, - weight, source_document_id, source_chunk_id, - confidence, extraction_method, created_at) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", - ( - str(edge.id), - str(edge.source_id), - str(edge.target_id), - edge.edge_type.value, - edge.name, - json.dumps(edge.properties), - edge.weight, - str(edge.source_document_id) if edge.source_document_id else None, - str(edge.source_chunk_id) if edge.source_chunk_id else None, - edge.confidence, - edge.extraction_method, - edge.created_at.isoformat(), - ), - ) - await db.commit() - return edge - - async def get_edges( - self, - node_id: UUID, - direction: str = "both", - edge_type: str | None = None, - ) -> list[Edge]: - db = await self._ensure_connected() - node_str = str(node_id) - - if direction == "out": - query = "SELECT * 
FROM graph_edges WHERE source_id = ?" - params: list = [node_str] - elif direction == "in": - query = "SELECT * FROM graph_edges WHERE target_id = ?" - params = [node_str] - else: - query = "SELECT * FROM graph_edges WHERE source_id = ? OR target_id = ?" - params = [node_str, node_str] - - if edge_type: - query += " AND edge_type = ?" - params.append(edge_type) - - cursor = await db.execute(query, params) - rows = await cursor.fetchall() - return [self._row_to_edge(row) for row in rows] - - async def traverse( - self, - start_node_id: UUID, - max_hops: int = 2, - edge_types: list[str] | None = None, - ) -> list[tuple[Node, Edge, Node]]: - """Traverse the graph using recursive CTEs.""" - db = await self._ensure_connected() - - edge_filter = "" - params: list = [str(start_node_id), max_hops] - if edge_types: - placeholders = ",".join("?" * len(edge_types)) - edge_filter = f"AND e.edge_type IN ({placeholders})" - params = [str(start_node_id)] + edge_types + [max_hops] + edge_types - - query = f""" - WITH RECURSIVE path AS ( - SELECT e.id as edge_id, e.source_id, e.target_id, 1 as depth - FROM graph_edges e - WHERE e.source_id = ? {edge_filter} - UNION ALL - SELECT e.id as edge_id, e.source_id, e.target_id, p.depth + 1 - FROM graph_edges e - JOIN path p ON e.source_id = p.target_id - WHERE p.depth < ? 
{edge_filter} - ) - SELECT DISTINCT - p.edge_id, - sn.id as source_node_id, sn.name as source_name, - sn.node_type as source_type, sn.description as source_desc, - sn.properties as source_props, sn.confidence as source_confidence, - sn.extraction_method as source_extraction, - sn.source_document_id as source_doc_id, - sn.source_chunk_id as source_chunk_id, - sn.created_at as source_created, sn.updated_at as source_updated, - sn.external_id as source_external_id, - tn.id as target_node_id, tn.name as target_name, - tn.node_type as target_type, tn.description as target_desc, - tn.properties as target_props, tn.confidence as target_confidence, - tn.extraction_method as target_extraction, - tn.source_document_id as target_doc_id, - tn.source_chunk_id as target_chunk_id, - tn.created_at as target_created, tn.updated_at as target_updated, - tn.external_id as target_external_id, - e.edge_type, e.name as edge_name, e.properties as edge_props, - e.weight, e.confidence as edge_confidence, - e.extraction_method as edge_extraction, - e.source_document_id as edge_doc_id, - e.source_chunk_id as edge_chunk_id, - e.created_at as edge_created - FROM path p - JOIN graph_edges e ON e.id = p.edge_id - JOIN graph_nodes sn ON sn.id = p.source_id - JOIN graph_nodes tn ON tn.id = p.target_id - """ - - cursor = await db.execute(query, params) - rows = await cursor.fetchall() - - results: list[tuple[Node, Edge, Node]] = [] - for row in rows: - source = Node( - id=UUID(row["source_node_id"]), - external_id=row["source_external_id"], - name=row["source_name"], - node_type=NodeType(row["source_type"]), - description=row["source_desc"], - source_document_id=UUID(row["source_doc_id"]) if row["source_doc_id"] else None, - source_chunk_id=UUID(row["source_chunk_id"]) if row["source_chunk_id"] else None, - properties=json.loads(row["source_props"]) if row["source_props"] else {}, - confidence=row["source_confidence"], - extraction_method=row["source_extraction"], - ) - target = Node( - 
id=UUID(row["target_node_id"]), - external_id=row["target_external_id"], - name=row["target_name"], - node_type=NodeType(row["target_type"]), - description=row["target_desc"], - source_document_id=UUID(row["target_doc_id"]) if row["target_doc_id"] else None, - source_chunk_id=UUID(row["target_chunk_id"]) if row["target_chunk_id"] else None, - properties=json.loads(row["target_props"]) if row["target_props"] else {}, - confidence=row["target_confidence"], - extraction_method=row["target_extraction"], - ) - edge = Edge( - id=UUID(row["edge_id"]), - source_id=UUID(row["source_node_id"]), - target_id=UUID(row["target_node_id"]), - edge_type=EdgeType(row["edge_type"]), - name=row["edge_name"], - properties=json.loads(row["edge_props"]) if row["edge_props"] else {}, - weight=row["weight"], - source_document_id=UUID(row["edge_doc_id"]) if row["edge_doc_id"] else None, - source_chunk_id=UUID(row["edge_chunk_id"]) if row["edge_chunk_id"] else None, - confidence=row["edge_confidence"], - extraction_method=row["edge_extraction"], - ) - results.append((source, edge, target)) - - return results - - async def delete_by_document(self, document_id: UUID) -> int: - db = await self._ensure_connected() - doc_str = str(document_id) - cursor = await db.execute( - "DELETE FROM graph_edges WHERE source_document_id = ?", (doc_str,) - ) - edge_count = cursor.rowcount - cursor = await db.execute( - "DELETE FROM graph_nodes WHERE source_document_id = ?", (doc_str,) - ) - node_count = cursor.rowcount - await db.commit() - return edge_count + node_count - - async def find_similar_nodes( - self, - node_id: UUID, - limit: int = 10, - ) -> list[tuple[Node, float]]: - """Find similar nodes by shared edges.""" - db = await self._ensure_connected() - node_str = str(node_id) - cursor = await db.execute( - """SELECT n.*, COUNT(DISTINCT e2.id) as shared_edges - FROM graph_nodes n - JOIN graph_edges e2 ON (e2.source_id = n.id OR e2.target_id = n.id) - WHERE e2.target_id IN ( - SELECT target_id FROM 
graph_edges WHERE source_id = ? - ) OR e2.source_id IN ( - SELECT source_id FROM graph_edges WHERE target_id = ? - ) - AND n.id != ? - GROUP BY n.id - ORDER BY shared_edges DESC - LIMIT ?""", - (node_str, node_str, node_str, limit), - ) - rows = await cursor.fetchall() - results = [] - for row in rows: - node = self._row_to_node(row) - similarity = float(row["shared_edges"]) / 10.0 # Normalize - results.append((node, min(similarity, 1.0))) - return results - - # --- Row mapping --- - - @staticmethod - def _row_to_node(row: aiosqlite.Row) -> Node: - from datetime import datetime - - return Node( - id=UUID(row["id"]), - external_id=row["external_id"], - name=row["name"], - node_type=NodeType(row["node_type"]), - description=row["description"], - source_document_id=UUID(row["source_document_id"]) if row["source_document_id"] else None, - source_chunk_id=UUID(row["source_chunk_id"]) if row["source_chunk_id"] else None, - properties=json.loads(row["properties"]) if row["properties"] else {}, - confidence=row["confidence"], - extraction_method=row["extraction_method"], - created_at=datetime.fromisoformat(row["created_at"]) if row["created_at"] else datetime.utcnow(), - updated_at=datetime.fromisoformat(row["updated_at"]) if row["updated_at"] else datetime.utcnow(), - ) - - @staticmethod - def _row_to_edge(row: aiosqlite.Row) -> Edge: - from datetime import datetime - - return Edge( - id=UUID(row["id"]), - source_id=UUID(row["source_id"]), - target_id=UUID(row["target_id"]), - edge_type=EdgeType(row["edge_type"]), - name=row["name"], - properties=json.loads(row["properties"]) if row["properties"] else {}, - weight=row["weight"], - source_document_id=UUID(row["source_document_id"]) if row["source_document_id"] else None, - source_chunk_id=UUID(row["source_chunk_id"]) if row["source_chunk_id"] else None, - confidence=row["confidence"], - extraction_method=row["extraction_method"], - created_at=datetime.fromisoformat(row["created_at"]) if row["created_at"] else 
datetime.utcnow(), - ) diff --git a/src/kb_engine/repositories/traceability/__init__.py b/src/kb_engine/repositories/traceability/__init__.py deleted file mode 100644 index 20a57c7..0000000 --- a/src/kb_engine/repositories/traceability/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Traceability repository implementations.""" diff --git a/src/kb_engine/repositories/traceability/postgres.py b/src/kb_engine/repositories/traceability/postgres.py deleted file mode 100644 index af9abe4..0000000 --- a/src/kb_engine/repositories/traceability/postgres.py +++ /dev/null @@ -1,97 +0,0 @@ -"""PostgreSQL implementation of the traceability repository.""" - -from uuid import UUID - -from kb_engine.core.interfaces.repositories import TraceabilityRepository -from kb_engine.core.models.document import Chunk, Document -from kb_engine.core.models.search import SearchFilters - - -class PostgresRepository(TraceabilityRepository): - """PostgreSQL implementation for document and chunk storage. - - This implementation uses SQLAlchemy with asyncpg for - async PostgreSQL operations. 
- """ - - def __init__(self, connection_string: str) -> None: - self._connection_string = connection_string - self._engine = None - self._session_factory = None - - async def _ensure_connected(self) -> None: - """Ensure database connection is established.""" - if self._engine is None: - # TODO: Initialize SQLAlchemy async engine - pass - - async def save_document(self, document: Document) -> Document: - """Save a document to the store.""" - await self._ensure_connected() - # TODO: Implement document save - raise NotImplementedError("PostgresRepository.save_document not implemented") - - async def get_document(self, document_id: UUID) -> Document | None: - """Get a document by ID.""" - await self._ensure_connected() - # TODO: Implement document retrieval - raise NotImplementedError("PostgresRepository.get_document not implemented") - - async def get_document_by_external_id(self, external_id: str) -> Document | None: - """Get a document by external ID.""" - await self._ensure_connected() - # TODO: Implement - raise NotImplementedError( - "PostgresRepository.get_document_by_external_id not implemented" - ) - - async def list_documents( - self, - filters: SearchFilters | None = None, - limit: int = 100, - offset: int = 0, - ) -> list[Document]: - """List documents with optional filters.""" - await self._ensure_connected() - # TODO: Implement document listing with filters - raise NotImplementedError("PostgresRepository.list_documents not implemented") - - async def update_document(self, document: Document) -> Document: - """Update an existing document.""" - await self._ensure_connected() - # TODO: Implement document update - raise NotImplementedError("PostgresRepository.update_document not implemented") - - async def delete_document(self, document_id: UUID) -> bool: - """Delete a document and its chunks.""" - await self._ensure_connected() - # TODO: Implement document deletion - raise NotImplementedError("PostgresRepository.delete_document not implemented") - - async 
def save_chunks(self, chunks: list[Chunk]) -> list[Chunk]: - """Save multiple chunks.""" - await self._ensure_connected() - # TODO: Implement chunk save - raise NotImplementedError("PostgresRepository.save_chunks not implemented") - - async def get_chunks_by_document(self, document_id: UUID) -> list[Chunk]: - """Get all chunks for a document.""" - await self._ensure_connected() - # TODO: Implement chunk retrieval - raise NotImplementedError( - "PostgresRepository.get_chunks_by_document not implemented" - ) - - async def get_chunk(self, chunk_id: UUID) -> Chunk | None: - """Get a chunk by ID.""" - await self._ensure_connected() - # TODO: Implement - raise NotImplementedError("PostgresRepository.get_chunk not implemented") - - async def delete_chunks_by_document(self, document_id: UUID) -> int: - """Delete all chunks for a document.""" - await self._ensure_connected() - # TODO: Implement chunk deletion - raise NotImplementedError( - "PostgresRepository.delete_chunks_by_document not implemented" - ) diff --git a/src/kb_engine/repositories/traceability/sqlite.py b/src/kb_engine/repositories/traceability/sqlite.py deleted file mode 100644 index 4bd0c0e..0000000 --- a/src/kb_engine/repositories/traceability/sqlite.py +++ /dev/null @@ -1,332 +0,0 @@ -"""SQLite implementation of the traceability repository.""" - -import json -from pathlib import Path -from uuid import UUID - -import aiosqlite -import structlog - -from kb_engine.core.models.document import Chunk, ChunkType, Document, DocumentStatus -from kb_engine.core.models.search import SearchFilters - -logger = structlog.get_logger(__name__) - -CREATE_TABLES_SQL = """ -CREATE TABLE IF NOT EXISTS documents ( - id TEXT PRIMARY KEY, - external_id TEXT UNIQUE, - title TEXT NOT NULL, - content TEXT NOT NULL, - source_path TEXT, - mime_type TEXT DEFAULT 'text/markdown', - metadata TEXT DEFAULT '{}', - tags TEXT DEFAULT '[]', - domain TEXT, - repo_name TEXT, - relative_path TEXT, - git_commit TEXT, - git_remote_url TEXT, - 
status TEXT DEFAULT 'pending', - content_hash TEXT, - created_at TEXT, - updated_at TEXT, - indexed_at TEXT -); - -CREATE TABLE IF NOT EXISTS chunks ( - id TEXT PRIMARY KEY, - document_id TEXT NOT NULL, - sequence INTEGER DEFAULT 0, - content TEXT NOT NULL, - chunk_type TEXT DEFAULT 'default', - start_offset INTEGER, - end_offset INTEGER, - heading_path TEXT DEFAULT '[]', - section_anchor TEXT, - metadata TEXT DEFAULT '{}', - token_count INTEGER, - embedding_id TEXT, - created_at TEXT, - FOREIGN KEY (document_id) REFERENCES documents(id) ON DELETE CASCADE -); - -CREATE INDEX IF NOT EXISTS idx_chunks_document_id ON chunks(document_id); -CREATE INDEX IF NOT EXISTS idx_documents_external_id ON documents(external_id); -CREATE INDEX IF NOT EXISTS idx_documents_domain ON documents(domain); -CREATE INDEX IF NOT EXISTS idx_documents_repo_name ON documents(repo_name); -CREATE INDEX IF NOT EXISTS idx_documents_relative_path ON documents(relative_path); -""" - - -class SQLiteRepository: - """SQLite implementation for document and chunk storage. - - Uses aiosqlite for async SQLite operations. Stores both - traceability data and graph data in the same database file. 
- """ - - def __init__(self, db_path: str) -> None: - self._db_path = db_path - self._db: aiosqlite.Connection | None = None - - async def initialize(self) -> None: - """Initialize the database and create tables.""" - Path(self._db_path).parent.mkdir(parents=True, exist_ok=True) - self._db = await aiosqlite.connect(self._db_path) - self._db.row_factory = aiosqlite.Row - await self._db.executescript(CREATE_TABLES_SQL) - await self._db.commit() - logger.info("SQLite traceability repository initialized", db_path=self._db_path) - - async def _ensure_connected(self) -> aiosqlite.Connection: - if self._db is None: - await self.initialize() - assert self._db is not None - return self._db - - async def close(self) -> None: - if self._db: - await self._db.close() - self._db = None - - # --- Document operations --- - - async def save_document(self, document: Document) -> Document: - db = await self._ensure_connected() - await db.execute( - """INSERT OR REPLACE INTO documents - (id, external_id, title, content, source_path, mime_type, - metadata, tags, domain, repo_name, relative_path, - git_commit, git_remote_url, status, content_hash, - created_at, updated_at, indexed_at) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", - ( - str(document.id), - document.external_id, - document.title, - document.content, - document.source_path, - document.mime_type, - json.dumps(document.metadata), - json.dumps(document.tags), - document.domain, - document.repo_name, - document.relative_path, - document.git_commit, - document.git_remote_url, - document.status.value, - document.content_hash, - document.created_at.isoformat(), - document.updated_at.isoformat(), - document.indexed_at.isoformat() if document.indexed_at else None, - ), - ) - await db.commit() - return document - - async def get_document(self, document_id: UUID) -> Document | None: - db = await self._ensure_connected() - cursor = await db.execute( - "SELECT * FROM documents WHERE id = ?", (str(document_id),) - 
) - row = await cursor.fetchone() - if row is None: - return None - return self._row_to_document(row) - - async def get_document_by_external_id(self, external_id: str) -> Document | None: - db = await self._ensure_connected() - cursor = await db.execute( - "SELECT * FROM documents WHERE external_id = ?", (external_id,) - ) - row = await cursor.fetchone() - if row is None: - return None - return self._row_to_document(row) - - async def get_document_by_relative_path( - self, repo_name: str, relative_path: str - ) -> Document | None: - db = await self._ensure_connected() - cursor = await db.execute( - "SELECT * FROM documents WHERE repo_name = ? AND relative_path = ?", - (repo_name, relative_path), - ) - row = await cursor.fetchone() - if row is None: - return None - return self._row_to_document(row) - - async def list_documents( - self, - filters: SearchFilters | None = None, - limit: int = 100, - offset: int = 0, - ) -> list[Document]: - db = await self._ensure_connected() - query = "SELECT * FROM documents" - params: list = [] - conditions: list[str] = [] - - if filters: - if filters.domains: - placeholders = ",".join("?" * len(filters.domains)) - conditions.append(f"domain IN ({placeholders})") - params.extend(filters.domains) - if filters.document_ids: - placeholders = ",".join("?" * len(filters.document_ids)) - conditions.append(f"id IN ({placeholders})") - params.extend(str(d) for d in filters.document_ids) - - if conditions: - query += " WHERE " + " AND ".join(conditions) - - query += " ORDER BY updated_at DESC LIMIT ? OFFSET ?" 
- params.extend([limit, offset]) - - cursor = await db.execute(query, params) - rows = await cursor.fetchall() - return [self._row_to_document(row) for row in rows] - - async def update_document(self, document: Document) -> Document: - db = await self._ensure_connected() - await db.execute( - """UPDATE documents SET - title=?, content=?, source_path=?, metadata=?, tags=?, - domain=?, repo_name=?, relative_path=?, git_commit=?, - git_remote_url=?, status=?, content_hash=?, - updated_at=?, indexed_at=? - WHERE id=?""", - ( - document.title, - document.content, - document.source_path, - json.dumps(document.metadata), - json.dumps(document.tags), - document.domain, - document.repo_name, - document.relative_path, - document.git_commit, - document.git_remote_url, - document.status.value, - document.content_hash, - document.updated_at.isoformat(), - document.indexed_at.isoformat() if document.indexed_at else None, - str(document.id), - ), - ) - await db.commit() - return document - - async def delete_document(self, document_id: UUID) -> bool: - db = await self._ensure_connected() - cursor = await db.execute( - "DELETE FROM documents WHERE id = ?", (str(document_id),) - ) - await db.commit() - return cursor.rowcount > 0 - - # --- Chunk operations --- - - async def save_chunks(self, chunks: list[Chunk]) -> list[Chunk]: - db = await self._ensure_connected() - for chunk in chunks: - await db.execute( - """INSERT OR REPLACE INTO chunks - (id, document_id, sequence, content, chunk_type, - start_offset, end_offset, heading_path, section_anchor, - metadata, token_count, embedding_id, created_at) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", - ( - str(chunk.id), - str(chunk.document_id), - chunk.sequence, - chunk.content, - chunk.chunk_type.value, - chunk.start_offset, - chunk.end_offset, - json.dumps(chunk.heading_path), - chunk.section_anchor, - json.dumps(chunk.metadata), - chunk.token_count, - str(chunk.embedding_id) if chunk.embedding_id else None, - 
chunk.created_at.isoformat(), - ), - ) - await db.commit() - return chunks - - async def get_chunks_by_document(self, document_id: UUID) -> list[Chunk]: - db = await self._ensure_connected() - cursor = await db.execute( - "SELECT * FROM chunks WHERE document_id = ? ORDER BY sequence", - (str(document_id),), - ) - rows = await cursor.fetchall() - return [self._row_to_chunk(row) for row in rows] - - async def get_chunk(self, chunk_id: UUID) -> Chunk | None: - db = await self._ensure_connected() - cursor = await db.execute( - "SELECT * FROM chunks WHERE id = ?", (str(chunk_id),) - ) - row = await cursor.fetchone() - if row is None: - return None - return self._row_to_chunk(row) - - async def delete_chunks_by_document(self, document_id: UUID) -> int: - db = await self._ensure_connected() - cursor = await db.execute( - "DELETE FROM chunks WHERE document_id = ?", (str(document_id),) - ) - await db.commit() - return cursor.rowcount - - # --- Row mapping --- - - @staticmethod - def _row_to_document(row: aiosqlite.Row) -> Document: - from datetime import datetime - - return Document( - id=UUID(row["id"]), - external_id=row["external_id"], - title=row["title"], - content=row["content"], - source_path=row["source_path"], - mime_type=row["mime_type"] or "text/markdown", - metadata=json.loads(row["metadata"]) if row["metadata"] else {}, - tags=json.loads(row["tags"]) if row["tags"] else [], - domain=row["domain"], - repo_name=row["repo_name"], - relative_path=row["relative_path"], - git_commit=row["git_commit"], - git_remote_url=row["git_remote_url"], - status=DocumentStatus(row["status"]), - content_hash=row["content_hash"], - created_at=datetime.fromisoformat(row["created_at"]) if row["created_at"] else datetime.utcnow(), - updated_at=datetime.fromisoformat(row["updated_at"]) if row["updated_at"] else datetime.utcnow(), - indexed_at=datetime.fromisoformat(row["indexed_at"]) if row["indexed_at"] else None, - ) - - @staticmethod - def _row_to_chunk(row: aiosqlite.Row) -> Chunk: 
- from datetime import datetime - - return Chunk( - id=UUID(row["id"]), - document_id=UUID(row["document_id"]), - sequence=row["sequence"], - content=row["content"], - chunk_type=ChunkType(row["chunk_type"]), - start_offset=row["start_offset"], - end_offset=row["end_offset"], - heading_path=json.loads(row["heading_path"]) if row["heading_path"] else [], - section_anchor=row["section_anchor"], - metadata=json.loads(row["metadata"]) if row["metadata"] else {}, - token_count=row["token_count"], - embedding_id=UUID(row["embedding_id"]) if row["embedding_id"] else None, - created_at=datetime.fromisoformat(row["created_at"]) if row["created_at"] else datetime.utcnow(), - ) diff --git a/src/kb_engine/repositories/vector/__init__.py b/src/kb_engine/repositories/vector/__init__.py deleted file mode 100644 index 63f930e..0000000 --- a/src/kb_engine/repositories/vector/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Vector repository implementations.""" diff --git a/src/kb_engine/repositories/vector/chroma.py b/src/kb_engine/repositories/vector/chroma.py deleted file mode 100644 index d8568f7..0000000 --- a/src/kb_engine/repositories/vector/chroma.py +++ /dev/null @@ -1,184 +0,0 @@ -"""ChromaDB implementation of the vector repository.""" - -from pathlib import Path -from uuid import UUID - -import structlog - -from kb_engine.core.models.embedding import Embedding -from kb_engine.core.models.search import SearchFilters - -logger = structlog.get_logger(__name__) - - -class ChromaRepository: - """ChromaDB embedded implementation for vector storage and search. - - Uses ChromaDB in embedded mode (in-process, persistent on disk). 
- """ - - def __init__( - self, - persist_directory: str, - collection_name: str = "kb_engine_embeddings", - ) -> None: - self._persist_directory = persist_directory - self._collection_name = collection_name - self._client = None - self._collection = None - - async def initialize(self) -> None: - """Initialize ChromaDB client and collection.""" - import chromadb - from chromadb.config import Settings - - Path(self._persist_directory).mkdir(parents=True, exist_ok=True) - self._client = chromadb.PersistentClient( - path=self._persist_directory, - settings=Settings(anonymized_telemetry=False), - ) - self._collection = self._client.get_or_create_collection( - name=self._collection_name, - metadata={"hnsw:space": "cosine"}, - ) - logger.info( - "ChromaDB repository initialized", - path=self._persist_directory, - collection=self._collection_name, - ) - - def _ensure_collection(self): - if self._collection is None: - import chromadb - from chromadb.config import Settings - - Path(self._persist_directory).mkdir(parents=True, exist_ok=True) - self._client = chromadb.PersistentClient( - path=self._persist_directory, - settings=Settings(anonymized_telemetry=False), - ) - self._collection = self._client.get_or_create_collection( - name=self._collection_name, - metadata={"hnsw:space": "cosine"}, - ) - return self._collection - - async def upsert_embeddings(self, embeddings: list[Embedding]) -> int: - collection = self._ensure_collection() - if not embeddings: - return 0 - - ids = [str(e.chunk_id) for e in embeddings] - vectors = [e.vector for e in embeddings] - metadatas = [ - { - "chunk_id": str(e.chunk_id), - "document_id": str(e.document_id), - "model": e.model, - **{k: str(v) for k, v in e.metadata.items()}, - } - for e in embeddings - ] - - collection.upsert( - ids=ids, - embeddings=vectors, - metadatas=metadatas, - ) - return len(embeddings) - - async def search( - self, - query_vector: list[float], - limit: int = 10, - filters: SearchFilters | None = None, - 
score_threshold: float | None = None, - ) -> list[tuple[UUID, float]]: - collection = self._ensure_collection() - - where_conditions = [] - - # Apply KDD status filter (default: only approved) - if filters: - effective_statuses = filters.get_effective_statuses() - else: - effective_statuses = ["approved"] # Default - - if effective_statuses: - where_conditions.append( - {"kdd_status": {"$in": effective_statuses}} - ) - - if filters: - if filters.document_ids: - where_conditions.append( - {"document_id": {"$in": [str(d) for d in filters.document_ids]}} - ) - if filters.chunk_types: - where_conditions.append( - {"chunk_type": {"$in": filters.chunk_types}} - ) - if filters.domains: - where_conditions.append( - {"domain": {"$in": filters.domains}} - ) - - if len(where_conditions) == 1: - where_filter = where_conditions[0] - elif len(where_conditions) > 1: - where_filter = {"$and": where_conditions} - else: - where_filter = None - - results = collection.query( - query_embeddings=[query_vector], - n_results=limit, - where=where_filter, - include=["distances"], - ) - - chunk_scores: list[tuple[UUID, float]] = [] - if results["ids"] and results["distances"]: - for chunk_id_str, distance in zip( - results["ids"][0], results["distances"][0], strict=True - ): - # ChromaDB returns cosine distance; convert to similarity score - score = 1.0 - distance - if score_threshold is not None and score < score_threshold: - continue - chunk_scores.append((UUID(chunk_id_str), score)) - - return chunk_scores - - async def delete_by_document(self, document_id: UUID) -> int: - collection = self._ensure_collection() - try: - # Get all embeddings for this document - results = collection.get( - where={"document_id": str(document_id)}, - include=[], - ) - if results["ids"]: - collection.delete(ids=results["ids"]) - return len(results["ids"]) - except Exception: - logger.warning("Failed to delete by document", document_id=str(document_id)) - return 0 - - async def delete_by_chunk_ids(self, 
chunk_ids: list[UUID]) -> int: - collection = self._ensure_collection() - ids = [str(cid) for cid in chunk_ids] - try: - collection.delete(ids=ids) - return len(ids) - except Exception: - logger.warning("Failed to delete by chunk IDs") - return 0 - - async def get_collection_info(self) -> dict[str, int | str]: - collection = self._ensure_collection() - return { - "name": self._collection_name, - "count": collection.count(), - "persist_directory": self._persist_directory, - } diff --git a/src/kb_engine/repositories/vector/qdrant.py b/src/kb_engine/repositories/vector/qdrant.py deleted file mode 100644 index a3f3d4e..0000000 --- a/src/kb_engine/repositories/vector/qdrant.py +++ /dev/null @@ -1,69 +0,0 @@ -"""Qdrant implementation of the vector repository.""" - -from uuid import UUID - -from kb_engine.core.interfaces.repositories import VectorRepository -from kb_engine.core.models.embedding import Embedding -from kb_engine.core.models.search import SearchFilters - - -class QdrantRepository(VectorRepository): - """Qdrant implementation for vector storage and search. - - Uses the qdrant-client for async operations. 
- """ - - def __init__( - self, - host: str = "localhost", - port: int = 6333, - api_key: str | None = None, - collection_name: str = "kb_engine_embeddings", - ) -> None: - self._host = host - self._port = port - self._api_key = api_key - self._collection_name = collection_name - self._client = None - - async def _ensure_connected(self) -> None: - """Ensure Qdrant client is connected.""" - if self._client is None: - # TODO: Initialize qdrant-client - pass - - async def upsert_embeddings(self, embeddings: list[Embedding]) -> int: - """Upsert embeddings into Qdrant.""" - await self._ensure_connected() - # TODO: Implement embedding upsert - raise NotImplementedError("QdrantRepository.upsert_embeddings not implemented") - - async def search( - self, - query_vector: list[float], - limit: int = 10, - filters: SearchFilters | None = None, - score_threshold: float | None = None, - ) -> list[tuple[UUID, float]]: - """Search for similar vectors in Qdrant.""" - await self._ensure_connected() - # TODO: Implement vector search - raise NotImplementedError("QdrantRepository.search not implemented") - - async def delete_by_document(self, document_id: UUID) -> int: - """Delete all embeddings for a document.""" - await self._ensure_connected() - # TODO: Implement deletion by document - raise NotImplementedError("QdrantRepository.delete_by_document not implemented") - - async def delete_by_chunk_ids(self, chunk_ids: list[UUID]) -> int: - """Delete embeddings by chunk IDs.""" - await self._ensure_connected() - # TODO: Implement deletion by chunk IDs - raise NotImplementedError("QdrantRepository.delete_by_chunk_ids not implemented") - - async def get_collection_info(self) -> dict[str, int | str]: - """Get information about the Qdrant collection.""" - await self._ensure_connected() - # TODO: Implement collection info retrieval - raise NotImplementedError("QdrantRepository.get_collection_info not implemented") diff --git a/src/kb_engine/services/__init__.py 
b/src/kb_engine/services/__init__.py deleted file mode 100644 index ad1d054..0000000 --- a/src/kb_engine/services/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -"""Business logic services for KB-Engine.""" - -from kb_engine.services.indexing import IndexingService -from kb_engine.services.retrieval import RetrievalService - -__all__ = [ - "IndexingService", - "RetrievalService", -] diff --git a/src/kb_engine/services/indexing.py b/src/kb_engine/services/indexing.py deleted file mode 100644 index 93254b7..0000000 --- a/src/kb_engine/services/indexing.py +++ /dev/null @@ -1,89 +0,0 @@ -"""Indexing service.""" - -from uuid import UUID - -from kb_engine.core.exceptions import DocumentNotFoundError -from kb_engine.core.models.document import Document -from kb_engine.core.models.repository import RepositoryConfig -from kb_engine.core.models.search import SearchFilters -from kb_engine.pipelines.indexation import IndexationPipeline - - -class IndexingService: - """Service for document indexing operations.""" - - def __init__(self, pipeline: IndexationPipeline) -> None: - self._pipeline = pipeline - - async def index_document( - self, - title: str, - content: str, - source_path: str | None = None, - external_id: str | None = None, - domain: str | None = None, - tags: list[str] | None = None, - metadata: dict | None = None, - ) -> Document: - """Index a new document.""" - document = Document( - title=title, - content=content, - source_path=source_path, - external_id=external_id, - domain=domain, - tags=tags or [], - metadata=metadata or {}, - ) - return await self._pipeline.index_document(document) - - async def reindex_document(self, document_id: UUID) -> Document: - """Reindex an existing document.""" - document = await self._pipeline._traceability.get_document(document_id) - if not document: - raise DocumentNotFoundError( - f"Document not found: {document_id}", - details={"document_id": str(document_id)}, - ) - return await self._pipeline.reindex_document(document) - - async 
def delete_document(self, document_id: UUID) -> bool: - """Delete a document.""" - document = await self._pipeline._traceability.get_document(document_id) - if not document: - raise DocumentNotFoundError( - f"Document not found: {document_id}", - details={"document_id": str(document_id)}, - ) - return await self._pipeline.delete_document(document) - - async def get_document(self, document_id: UUID) -> Document: - """Get a document by ID.""" - document = await self._pipeline._traceability.get_document(document_id) - if not document: - raise DocumentNotFoundError( - f"Document not found: {document_id}", - details={"document_id": str(document_id)}, - ) - return document - - async def list_documents( - self, - filters: SearchFilters | None = None, - limit: int = 100, - offset: int = 0, - ) -> list[Document]: - """List documents with optional filters.""" - return await self._pipeline._traceability.list_documents( - filters=filters, - limit=limit, - offset=offset, - ) - - async def index_repository(self, repo_config: RepositoryConfig) -> list[Document]: - """Index all matching files from a Git repository.""" - return await self._pipeline.index_repository(repo_config) - - async def sync_repository(self, repo_config: RepositoryConfig, since_commit: str) -> dict: - """Incrementally sync a repository.""" - return await self._pipeline.sync_repository(repo_config, since_commit) diff --git a/src/kb_engine/services/retrieval.py b/src/kb_engine/services/retrieval.py deleted file mode 100644 index 9939af7..0000000 --- a/src/kb_engine/services/retrieval.py +++ /dev/null @@ -1,34 +0,0 @@ -"""Retrieval service.""" - -from kb_engine.core.models.search import RetrievalMode, RetrievalResponse, SearchFilters -from kb_engine.pipelines.inference.pipeline import RetrievalPipeline - - -class RetrievalService: - """Service for document retrieval operations. - - Returns DocumentReference objects with URLs instead of raw content. 
- """ - - def __init__(self, pipeline: RetrievalPipeline) -> None: - self._pipeline = pipeline - - async def search( - self, - query: str, - mode: RetrievalMode | str = RetrievalMode.VECTOR, - filters: SearchFilters | None = None, - limit: int = 10, - score_threshold: float | None = None, - ) -> RetrievalResponse: - """Execute a retrieval query.""" - if isinstance(mode, str): - mode = RetrievalMode(mode.lower()) - - return await self._pipeline.search( - query=query, - mode=mode, - filters=filters, - limit=limit, - score_threshold=score_threshold, - ) diff --git a/src/kb_engine/smart/__init__.py b/src/kb_engine/smart/__init__.py deleted file mode 100644 index ce110b0..0000000 --- a/src/kb_engine/smart/__init__.py +++ /dev/null @@ -1,47 +0,0 @@ -"""Smart Ingestion Pipeline for KDD documents. - -This module provides intelligent document processing with: -- Template-aware parsing based on document kind -- Hierarchical chunking with LLM-generated summaries -- FalkorDB graph store for knowledge graph -- ChromaDB for vector embeddings (TODO) -- SQLite for metadata and traceability (TODO) -""" - -from kb_engine.smart.chunking import HierarchicalChunker, LLMSummaryService, MockSummaryService -from kb_engine.smart.extraction import EntityGraphExtractor -from kb_engine.smart.parsers import DocumentKindDetector, EntityParser -from kb_engine.smart.pipelines import EntityIngestionPipeline -from kb_engine.smart.schemas import ENTITY_SCHEMA -from kb_engine.smart.stores import FalkorDBGraphStore -from kb_engine.smart.types import ( - ContextualizedChunk, - ExtractedEntityInfo, - IngestionResult, - KDDDocumentKind, - ParsedDocument, -) - -__all__ = [ - # Types - "KDDDocumentKind", - "ParsedDocument", - "ContextualizedChunk", - "ExtractedEntityInfo", - "IngestionResult", - # Schemas - "ENTITY_SCHEMA", - # Parsers - "DocumentKindDetector", - "EntityParser", - # Chunking - "HierarchicalChunker", - "LLMSummaryService", - "MockSummaryService", - # Stores - "FalkorDBGraphStore", - # 
Extraction - "EntityGraphExtractor", - # Pipelines - "EntityIngestionPipeline", -] diff --git a/src/kb_engine/smart/chunking/__init__.py b/src/kb_engine/smart/chunking/__init__.py deleted file mode 100644 index 854c6d8..0000000 --- a/src/kb_engine/smart/chunking/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -"""Chunking strategies for KDD documents.""" - -from kb_engine.smart.chunking.hierarchical import HierarchicalChunker -from kb_engine.smart.chunking.summarizer import LLMSummaryService, MockSummaryService, SummaryService - -__all__ = [ - "HierarchicalChunker", - "SummaryService", - "LLMSummaryService", - "MockSummaryService", -] diff --git a/src/kb_engine/smart/chunking/hierarchical.py b/src/kb_engine/smart/chunking/hierarchical.py deleted file mode 100644 index 4bfd1b5..0000000 --- a/src/kb_engine/smart/chunking/hierarchical.py +++ /dev/null @@ -1,415 +0,0 @@ -"""Hierarchical chunker with context-aware chunks.""" - -from uuid import uuid4 - -import structlog - -from kb_engine.smart.chunking.summarizer import SummaryService -from kb_engine.smart.types import ( - ChunkingStrategy, - ContentExpectation, - ContextualizedChunk, - HierarchicalContext, - ParsedDocument, - ParsedSection, - ParsedTable, - TemplateSchema, -) - -logger = structlog.get_logger(__name__) - - -class HierarchicalChunker: - """Generates contextualized chunks with hierarchical summaries.""" - - def __init__( - self, - summary_service: SummaryService, - max_chunk_size: int = 1024, - chunk_overlap: int = 50, - ) -> None: - """Initialize the hierarchical chunker.""" - self.summary_service = summary_service - self.max_chunk_size = max_chunk_size - self.chunk_overlap = chunk_overlap - - async def chunk( - self, - parsed: ParsedDocument, - schema: TemplateSchema, - ) -> list[ContextualizedChunk]: - """Generate contextualized chunks from parsed document.""" - log = logger.bind(doc_title=parsed.title) - log.debug("chunker.start", sections=len(parsed.sections)) - - chunks: list[ContextualizedChunk] = [] - 
sequence = 0 - - # Generate document summary - log.debug("chunker.summarize_doc.start") - doc_summary = await self.summary_service.summarize_document(parsed) - log.debug("chunker.summarize_doc.complete", summary_length=len(doc_summary)) - - doc_id = parsed.frontmatter.get("id", str(uuid4())[:8]) - - for section in parsed.sections: - section_log = log.bind(section=section.name) - section_log.debug("chunker.section.start") - - section_summary = await self.summary_service.summarize_section(section, doc_summary) - section_log.debug("chunker.section.summary", summary_length=len(section_summary)) - - context = HierarchicalContext( - document_summary=doc_summary, - section_summaries=[section_summary], - heading_path=[parsed.title, section.name], - ) - - strategy = self._get_strategy(section, schema) - section_log.debug("chunker.section.strategy", strategy=strategy.value) - - if strategy == ChunkingStrategy.TABLE_ROWS and section.tables: - for table in section.tables: - table_chunks = self._chunk_table_rows( - table=table, - context=context, - doc_id=doc_id, - doc_kind=parsed.kind, - section_name=section.name, - start_sequence=sequence, - ) - chunks.extend(table_chunks) - sequence += len(table_chunks) - section_log.debug("chunker.section.table_rows", rows=len(table_chunks)) - - non_table_text = self._extract_non_table_text(section) - if non_table_text.strip(): - text_chunks = self._chunk_text( - text=non_table_text, - context=context, - doc_id=doc_id, - doc_kind=parsed.kind, - section_name=section.name, - start_sequence=sequence, - ) - chunks.extend(text_chunks) - sequence += len(text_chunks) - - elif strategy == ChunkingStrategy.KEEP_INTACT: - chunk = self._create_chunk( - content=section.content, - context=context, - chunk_type="section", - doc_id=doc_id, - doc_kind=parsed.kind, - section_name=section.name, - sequence=sequence, - start_offset=section.start_offset, - end_offset=section.end_offset, - ) - chunks.append(chunk) - sequence += 1 - 
section_log.debug("chunker.section.keep_intact", content_length=len(section.content)) - - elif strategy == ChunkingStrategy.SPLIT_BY_PARAGRAPHS: - text_chunks = self._split_by_paragraphs( - text=section.content, - context=context, - doc_id=doc_id, - doc_kind=parsed.kind, - section_name=section.name, - start_sequence=sequence, - ) - chunks.extend(text_chunks) - sequence += len(text_chunks) - section_log.debug("chunker.section.paragraphs", count=len(text_chunks)) - - elif strategy == ChunkingStrategy.SPLIT_BY_ITEMS: - items = self._extract_list_items(section.content) - for item in items: - chunk = self._create_chunk( - content=item, - context=context, - chunk_type="list_item", - doc_id=doc_id, - doc_kind=parsed.kind, - section_name=section.name, - sequence=sequence, - ) - chunks.append(chunk) - sequence += 1 - section_log.debug("chunker.section.list_items", count=len(items)) - - else: # DEFAULT - text_chunks = self._chunk_text( - text=section.content, - context=context, - doc_id=doc_id, - doc_kind=parsed.kind, - section_name=section.name, - start_sequence=sequence, - ) - chunks.extend(text_chunks) - sequence += len(text_chunks) - section_log.debug("chunker.section.text_chunks", count=len(text_chunks)) - - section_log.debug("chunker.section.complete", total_chunks=sequence) - - log.debug("chunker.complete", total_chunks=len(chunks)) - return chunks - - def _get_strategy(self, section: ParsedSection, schema: TemplateSchema) -> ChunkingStrategy: - """Determine chunking strategy for a section.""" - section_lower = section.name.lower() - - for sec_def in schema.required_sections + schema.optional_sections: - names = [sec_def.name.lower()] + [a.lower() for a in sec_def.aliases] - if section_lower in names: - return sec_def.chunking_strategy - - if section.tables: - return ChunkingStrategy.TABLE_ROWS - - if section.content_type in ( - ContentExpectation.MERMAID_STATE, - ContentExpectation.MERMAID_FLOW, - ContentExpectation.YAML, - ContentExpectation.JSON, - ): - return 
# HierarchicalChunker methods (the enclosing class header is outside this view).

def _chunk_table_rows(
    self,
    table: ParsedTable,
    context: HierarchicalContext,
    doc_id: str,
    doc_kind,
    section_name: str,
    start_sequence: int,
) -> list[ContextualizedChunk]:
    """Generate one chunk per table row.

    Each row gets its own context whose section summaries are extended with
    the table header line, so the embedded chunk carries column names.
    """
    chunks: list[ContextualizedChunk] = []
    header_str = " | ".join(table.headers)

    for i, row in enumerate(table.rows):
        row_content = self._format_table_row(table.headers, row)

        row_context = HierarchicalContext(
            document_summary=context.document_summary,
            section_summaries=context.section_summaries + [f"Tabla: {header_str}"],
            heading_path=context.heading_path,
        )

        # Structured row data only when the row is well-formed (same arity as headers).
        row_data = dict(zip(table.headers, row)) if len(row) == len(table.headers) else None

        chunk = self._create_chunk(
            content=row_content,
            context=row_context,
            chunk_type="table_row",
            doc_id=doc_id,
            doc_kind=doc_kind,
            section_name=section_name,
            sequence=start_sequence + i,
            table_headers=table.headers,
            row_index=i,
            row_data=row_data,
        )
        chunks.append(chunk)

    return chunks

def _format_table_row(self, headers: list[str], row: list[str]) -> str:
    """Format a table row as readable text, skipping empty cells."""
    parts = []
    for i, header in enumerate(headers):
        value = row[i] if i < len(row) else ""
        if value.strip():
            parts.append(f"**{header}**: {value}")
    return "\n".join(parts)

def _chunk_text(
    self,
    text: str,
    context: HierarchicalContext,
    doc_id: str,
    doc_kind,
    section_name: str,
    start_sequence: int,
) -> list[ContextualizedChunk]:
    """Split text into chunks with overlap.

    Text at or under ``max_chunk_size`` becomes a single chunk. Otherwise the
    window is trimmed back to the last sentence/line separator before the size
    limit, and consecutive windows overlap by ``chunk_overlap`` characters.
    """
    chunks: list[ContextualizedChunk] = []

    if len(text) <= self.max_chunk_size:
        chunk = self._create_chunk(
            content=text,
            context=context,
            chunk_type="text",
            doc_id=doc_id,
            doc_kind=doc_kind,
            section_name=section_name,
            sequence=start_sequence,
        )
        return [chunk]

    current_pos = 0
    seq = start_sequence

    while current_pos < len(text):
        end_pos = min(current_pos + self.max_chunk_size, len(text))

        # Prefer to cut at the last sentence/paragraph/line separator in the window.
        if end_pos < len(text):
            for sep in [". ", ".\n", "\n\n", "\n"]:
                last_sep = text.rfind(sep, current_pos, end_pos)
                if last_sep > current_pos:
                    end_pos = last_sep + len(sep)
                    break

        chunk_text = text[current_pos:end_pos].strip()
        if chunk_text:
            chunk = self._create_chunk(
                content=chunk_text,
                context=context,
                chunk_type="text",
                doc_id=doc_id,
                doc_kind=doc_kind,
                section_name=section_name,
                sequence=seq,
            )
            chunks.append(chunk)
            seq += 1

        if end_pos >= len(text):
            break
        # BUGFIX: guarantee forward progress. If a separator is found close to
        # current_pos, end_pos - chunk_overlap can be <= current_pos (or even
        # negative), stalling the loop forever since the break above only
        # fires when end_pos reaches the end of the text. Advance at least one
        # character per iteration.
        current_pos = max(end_pos - self.chunk_overlap, current_pos + 1)

    return chunks

# Paragraphs shorter than this many words are merged with the next paragraph
# (BR-EMBEDDING-001).
MIN_PARAGRAPH_WORDS = 20

def _split_by_paragraphs(
    self,
    text: str,
    context: HierarchicalContext,
    doc_id: str,
    doc_kind,
    section_name: str,
    start_sequence: int,
) -> list[ContextualizedChunk]:
    """Split text into chunks by paragraph per BR-EMBEDDING-001.

    Rules:
    1. Each paragraph (separated by blank lines) produces one chunk.
    2. Paragraphs with < 20 words are merged with the next paragraph.
    3. If a merged paragraph exceeds max_chunk_size, fall back to
       size-based splitting for that paragraph.
    """
    raw_paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]

    if not raw_paragraphs:
        return []

    # Merge short paragraphs (< 20 words) with the next one
    merged: list[str] = []
    buffer = ""
    for para in raw_paragraphs:
        if buffer:
            buffer = f"{buffer}\n\n{para}"
        else:
            buffer = para

        if len(buffer.split()) >= self.MIN_PARAGRAPH_WORDS:
            merged.append(buffer)
            buffer = ""

    # Flush remaining buffer: a trailing short paragraph is folded into the
    # previous chunk rather than emitted undersized on its own.
    if buffer:
        if merged:
            merged[-1] = f"{merged[-1]}\n\n{buffer}"
        else:
            merged.append(buffer)

    # Create chunks, falling back to size-based splitting for large paragraphs
    chunks: list[ContextualizedChunk] = []
    seq = start_sequence
    for para in merged:
        if len(para) > self.max_chunk_size:
            sub_chunks = self._chunk_text(
                text=para,
                context=context,
                doc_id=doc_id,
                doc_kind=doc_kind,
                section_name=section_name,
                start_sequence=seq,
            )
            chunks.extend(sub_chunks)
            seq += len(sub_chunks)
        else:
            chunk = self._create_chunk(
                content=para,
                context=context,
                chunk_type="paragraph",
                doc_id=doc_id,
                doc_kind=doc_kind,
                section_name=section_name,
                sequence=seq,
            )
            chunks.append(chunk)
            seq += 1

    return chunks

def _extract_non_table_text(self, section: ParsedSection) -> str:
    """Extract text content that's not part of tables.

    Removes each table's raw markdown from the section content verbatim.
    """
    content = section.content
    for table in section.tables:
        content = content.replace(table.raw_content, "")
    return content.strip()

def _extract_list_items(self, content: str) -> list[str]:
    """Extract `-`/`*` bullet items; fall back to the whole content."""
    import re
    items = re.findall(r"^[-*]\s+(.+)$", content, re.MULTILINE)
    return items if items else [content]

def _create_chunk(
    self,
    content: str,
    context: HierarchicalContext,
    chunk_type: str,
    doc_id: str,
    doc_kind,
    section_name: str,
    sequence: int,
    start_offset: int | None = None,
    end_offset: int | None = None,
    table_headers: list[str] | None = None,
    row_index: int | None = None,
    row_data: dict[str, str] | None = None,
) -> ContextualizedChunk:
    """Create a contextualized chunk.

    The embedded (`contextualized_content`) text is the hierarchical context
    prefix followed by the raw content; the raw content is kept separately.
    """
    context_prefix = context.as_prefix()
    contextualized = f"{context_prefix}\n\n{content}" if context_prefix else content

    # Chunk ids are deterministic: document id + sequence number.
    chunk_id = f"{doc_id}#{sequence}"

    return ContextualizedChunk(
        id=chunk_id,
        content=content,
        contextualized_content=contextualized,
        chunk_type=chunk_type,
        context=context,
        document_id=doc_id,
        document_kind=doc_kind,
        section_name=section_name,
        sequence=sequence,
        table_headers=table_headers,
        row_index=row_index,
        row_data=row_data,
        start_offset=start_offset,
        end_offset=end_offset,
    )
"""Summary services for hierarchical chunking."""

from abc import ABC, abstractmethod

import structlog

from kb_engine.smart.types import ParsedDocument, ParsedSection

logger = structlog.get_logger(__name__)


class SummaryService(ABC):
    """Abstract base class for summary generation."""

    @abstractmethod
    async def summarize_document(self, parsed: ParsedDocument) -> str:
        """Generate a one-line summary of the document."""
        pass

    @abstractmethod
    async def summarize_section(self, section: ParsedSection, doc_context: str) -> str:
        """Generate a one-line summary of a section."""
        pass


class MockSummaryService(SummaryService):
    """Mock summary service for testing (no LLM calls)."""

    async def summarize_document(self, parsed: ParsedDocument) -> str:
        """Generate mock document summary ("Doc: <title>")."""
        return f"Doc: {parsed.title}"

    async def summarize_section(self, section: ParsedSection, doc_context: str) -> str:
        """Generate mock section summary ("Sec: <name>")."""
        return f"Sec: {section.name}"


class LLMSummaryService(SummaryService):
    """Summary service using OpenAI LLM.

    On any LLM error both methods fall back to the same deterministic strings
    produced by :class:`MockSummaryService`, so callers never see an exception.
    """

    def __init__(self, model: str = "gpt-4o-mini", api_key: str | None = None) -> None:
        """Initialize LLM summary service.

        Args:
            model: OpenAI model to use.
            api_key: OpenAI API key. If None, uses OPENAI_API_KEY env var.
        """
        self.model = model
        # Client is created lazily on first use so constructing the service
        # does not require the openai package/credentials.
        self._client = None
        self._api_key = api_key

    @property
    def client(self):
        """Lazy initialize OpenAI client."""
        if self._client is None:
            from openai import AsyncOpenAI
            self._client = AsyncOpenAI(api_key=self._api_key)
        return self._client

    async def summarize_document(self, parsed: ParsedDocument) -> str:
        """Generate document summary using LLM.

        Sends the first 2000 characters of the raw content; returns the
        model's one-line summary, or "Doc: <title>" on any error.
        """
        log = logger.bind(title=parsed.title)
        log.debug("summarizer.document.start")

        # Build a condensed view of the document
        content_preview = parsed.raw_content[:2000]

        try:
            response = await self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": "Genera un resumen de UNA SOLA LÍNEA (máximo 100 caracteres) del siguiente documento. El resumen debe capturar la esencia del documento de forma concisa."
                    },
                    {
                        "role": "user",
                        "content": f"Documento: {parsed.title}\n\n{content_preview}"
                    }
                ],
                max_tokens=50,
                temperature=0.3,
            )

            summary = response.choices[0].message.content.strip()
            log.debug("summarizer.document.complete", summary_length=len(summary))
            return summary

        except Exception as e:
            # Deliberate best-effort: log and fall back to a mock-style summary.
            log.warning("summarizer.document.error", error=str(e))
            return f"Doc: {parsed.title}"

    async def summarize_section(self, section: ParsedSection, doc_context: str) -> str:
        """Generate section summary using LLM.

        Sends the first 1000 characters of the section plus the document-level
        context; returns the model's one-line summary, or "Sec: <name>" on error.
        """
        log = logger.bind(section=section.name)
        log.debug("summarizer.section.start")

        content_preview = section.content[:1000]

        try:
            response = await self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": "Genera un resumen de UNA SOLA LÍNEA (máximo 80 caracteres) de la siguiente sección de documento. El resumen debe ser conciso y capturar el propósito de la sección."
                    },
                    {
                        "role": "user",
                        "content": f"Contexto del documento: {doc_context}\n\nSección: {section.name}\n\n{content_preview}"
                    }
                ],
                max_tokens=40,
                temperature=0.3,
            )

            summary = response.choices[0].message.content.strip()
            log.debug("summarizer.section.complete", summary_length=len(summary))
            return summary

        except Exception as e:
            # Deliberate best-effort: log and fall back to a mock-style summary.
            log.warning("summarizer.section.error", error=str(e))
            return f"Sec: {section.name}"
class EntityGraphExtractor:
    """Extracts and stores entity graph data in FalkorDB.

    Extracts:
    - Document node for provenance tracking
    - Main entity as Entity node
    - Attributes as Concept nodes (linked to entity via CONTAINS)
    - States as Concept nodes
    - Related entities as Entity nodes (linked via REFERENCES)
    - Events as Event nodes (linked via PRODUCES/CONSUMES)
    - EXTRACTED_FROM edges from every domain node to the Document
    """

    def __init__(self, graph_store: FalkorDBGraphStore) -> None:
        """Initialize extractor with graph store.

        Args:
            graph_store: FalkorDB graph store instance.
        """
        self.graph_store = graph_store

    def extract_and_store(
        self,
        parsed: ParsedDocument,
        entity_info: ExtractedEntityInfo,
    ) -> tuple[int, int]:
        """Extract entities and store in graph.

        Args:
            parsed: Parsed document.
            entity_info: Extracted entity information.

        Returns:
            Tuple of (nodes_created, edges_created).

        NOTE(review): the counters are incremented on every upsert call, so
        they count write operations, not necessarily distinct new nodes/edges
        (an upsert of an already-existing node still counts) — confirm
        intended semantics against the graph store.
        """
        log = logger.bind(entity_name=entity_info.name)
        log.debug("extractor.start")

        # Fall back to the entity name when the frontmatter has no explicit id.
        doc_id = parsed.frontmatter.get("id", entity_info.name)
        doc_path = parsed.frontmatter.get("path", "")
        doc_kind = parsed.kind.value if hasattr(parsed.kind, "value") else ""
        nodes_created = 0
        edges_created = 0

        # 0. Document node for provenance
        self.graph_store.upsert_document(
            doc_id=doc_id,
            title=entity_info.name,
            path=doc_path,
            kind=doc_kind,
        )
        nodes_created += 1

        # 1. Main entity node
        entity_id = f"entity:{entity_info.name}"
        self.graph_store.upsert_entity(
            entity_id=entity_id,
            name=entity_info.name,
            # Descriptions are truncated to keep node payloads bounded.
            description=entity_info.description[:500] if entity_info.description else "",
            code_class=entity_info.code_class,
            code_table=entity_info.code_table,
            confidence=1.0,
        )
        nodes_created += 1
        self.graph_store.add_extracted_from(entity_id, "Entity", doc_id, "primary", 1.0)
        edges_created += 1
        log.debug("extractor.entity_created", entity_id=entity_id)

        # 2. Attribute nodes
        for attr in entity_info.attributes:
            concept_id = f"concept:{entity_info.name}.{attr.name}"
            self.graph_store.upsert_concept(
                concept_id=concept_id,
                name=attr.name,
                concept_type="attribute",
                description=attr.description,
                parent_entity=entity_info.name,
                properties={
                    "code": attr.code,
                    "type": attr.type,
                    "is_reference": attr.is_reference,
                    "reference_entity": attr.reference_entity,
                },
                confidence=0.95,
            )
            nodes_created += 1
            self.graph_store.add_extracted_from(concept_id, "Concept", doc_id, "primary", 0.95)
            edges_created += 1

            # CONTAINS edge
            self.graph_store.add_contains(
                entity_id=entity_id,
                concept_id=concept_id,
                confidence=1.0,
                source_doc_id=doc_id,
            )
            edges_created += 1

            # If attribute references another entity
            if attr.is_reference and attr.reference_entity:
                ref_entity_id = f"entity:{attr.reference_entity}"
                # Ensure referenced entity exists (stub)
                self.graph_store.upsert_entity(
                    entity_id=ref_entity_id,
                    name=attr.reference_entity,
                    description=f"Referenced by {entity_info.name}.{attr.name}",
                    confidence=0.7,  # Lower confidence for inferred entities
                )
                nodes_created += 1
                self.graph_store.add_extracted_from(
                    ref_entity_id, "Entity", doc_id, "referenced", 0.7
                )
                edges_created += 1

                # REFERENCES edge
                self.graph_store.add_references(
                    from_entity_id=entity_id,
                    to_entity_id=ref_entity_id,
                    via_attribute=attr.name,
                    confidence=0.9,
                    source_doc_id=doc_id,
                )
                edges_created += 1

        log.debug("extractor.attributes_created", count=len(entity_info.attributes))

        # 3. State nodes
        for state in entity_info.states:
            # "::"-separated id distinguishes states from "."-separated attributes.
            concept_id = f"concept:{entity_info.name}::{state.name}"
            self.graph_store.upsert_concept(
                concept_id=concept_id,
                name=state.name,
                concept_type="state",
                description=state.description,
                parent_entity=entity_info.name,
                properties={
                    "is_initial": state.is_initial,
                    "is_final": state.is_final,
                    "entry_conditions": state.entry_conditions,
                },
                confidence=0.95,
            )
            nodes_created += 1
            self.graph_store.add_extracted_from(concept_id, "Concept", doc_id, "primary", 0.95)
            edges_created += 1

            # CONTAINS edge
            self.graph_store.add_contains(
                entity_id=entity_id,
                concept_id=concept_id,
                confidence=1.0,
                source_doc_id=doc_id,
            )
            edges_created += 1

        log.debug("extractor.states_created", count=len(entity_info.states))

        # 4. Relations (to other entities)
        for rel in entity_info.relations:
            ref_entity_id = f"entity:{rel.target_entity}"

            # Ensure referenced entity exists (stub)
            self.graph_store.upsert_entity(
                entity_id=ref_entity_id,
                name=rel.target_entity,
                description=f"Related to {entity_info.name} via {rel.name}",
                confidence=0.7,
            )
            nodes_created += 1
            self.graph_store.add_extracted_from(
                ref_entity_id, "Entity", doc_id, "referenced", 0.7
            )
            edges_created += 1

            # REFERENCES edge
            self.graph_store.add_references(
                from_entity_id=entity_id,
                to_entity_id=ref_entity_id,
                via_attribute=rel.code or rel.name,
                cardinality=rel.cardinality,
                description=rel.description,
                confidence=0.95,
                source_doc_id=doc_id,
            )
            edges_created += 1

        log.debug("extractor.relations_created", count=len(entity_info.relations))

        # 5. Events emitted
        for event_name in entity_info.events_emitted:
            event_id = f"event:{event_name}"
            self.graph_store.upsert_event(
                event_id=event_id,
                name=event_name,
                description=f"Emitted by {entity_info.name}",
                confidence=0.9,
            )
            nodes_created += 1
            self.graph_store.add_extracted_from(event_id, "Event", doc_id, "primary", 0.9)
            edges_created += 1

            self.graph_store.add_produces(
                entity_id=entity_id,
                event_id=event_id,
                confidence=0.9,
                source_doc_id=doc_id,
            )
            edges_created += 1

        log.debug("extractor.events_emitted", count=len(entity_info.events_emitted))

        # 6. Events consumed
        for event_name in entity_info.events_consumed:
            event_id = f"event:{event_name}"
            self.graph_store.upsert_event(
                event_id=event_id,
                name=event_name,
                description=f"Consumed by {entity_info.name}",
                confidence=0.9,
            )
            nodes_created += 1
            self.graph_store.add_extracted_from(event_id, "Event", doc_id, "primary", 0.9)
            edges_created += 1

            self.graph_store.add_consumes(
                entity_id=entity_id,
                event_id=event_id,
                confidence=0.9,
                source_doc_id=doc_id,
            )
            edges_created += 1

        log.debug("extractor.events_consumed", count=len(entity_info.events_consumed))

        log.info(
            "extractor.complete",
            nodes_created=nodes_created,
            edges_created=edges_created,
        )

        return nodes_created, edges_created
class DocumentKindDetector:
    """Detects the kind of KDD document from content and metadata."""

    # Frontmatter `kind` values mapped to their enum members.
    KIND_MAP = {
        "entity": KDDDocumentKind.ENTITY,
        "use-case": KDDDocumentKind.USE_CASE,
        "rule": KDDDocumentKind.RULE,
        "process": KDDDocumentKind.PROCESS,
        "event": KDDDocumentKind.EVENT,
        "command": KDDDocumentKind.COMMAND,
        "query": KDDDocumentKind.QUERY,
        "adr": KDDDocumentKind.ADR,
        "prd": KDDDocumentKind.PRD,
        "nfr": KDDDocumentKind.NFR,
        "story": KDDDocumentKind.STORY,
        "ui-view": KDDDocumentKind.UI_VIEW,
        "ui-flow": KDDDocumentKind.UI_FLOW,
        "ui-component": KDDDocumentKind.UI_COMPONENT,
        "idea": KDDDocumentKind.IDEA,
        "requirement": KDDDocumentKind.REQUIREMENT,
        "implementation-charter": KDDDocumentKind.IMPLEMENTATION_CHARTER,
    }

    # Directory fragments that imply a kind (medium confidence, 0.7).
    _PATH_PATTERNS = {
        "entities/": KDDDocumentKind.ENTITY,
        "use-cases/": KDDDocumentKind.USE_CASE,
        "rules/": KDDDocumentKind.RULE,
        "processes/": KDDDocumentKind.PROCESS,
        "events/": KDDDocumentKind.EVENT,
        "adrs/": KDDDocumentKind.ADR,
    }

    # Filename prefixes that imply a kind (lower confidence, 0.6).
    _PREFIX_PATTERNS = {
        "uc-": KDDDocumentKind.USE_CASE,
        "evt-": KDDDocumentKind.EVENT,
        "cmd-": KDDDocumentKind.COMMAND,
        "qry-": KDDDocumentKind.QUERY,
        "adr-": KDDDocumentKind.ADR,
    }

    def detect(self, content: str, filename: str | None = None) -> DetectionResult:
        """Detect document kind from content and optional filename.

        Args:
            content: Raw markdown content.
            filename: Optional filename for additional hints.

        Returns:
            DetectionResult with kind, confidence, and detection source.
        """
        log = logger.bind(filename=filename)

        # Stage 1: explicit frontmatter `kind` (highest confidence).
        hit = self._detect_from_frontmatter(content, log)
        if hit is not None:
            return hit

        # Stage 2: filename heuristics (medium confidence).
        if filename:
            hit = self._detect_from_filename(filename, log)
            if hit is not None:
                return hit

        # Stage 3: give up.
        log.debug("detector.unknown")
        return DetectionResult(
            kind=KDDDocumentKind.UNKNOWN,
            confidence=0.0,
            detected_from="none",
        )

    def _detect_from_frontmatter(self, content: str, log) -> DetectionResult | None:
        """Return a frontmatter-based detection, or None if unavailable."""
        try:
            fm = frontmatter.loads(content)
            kind_value = fm.metadata.get("kind", "").lower().strip()
            if kind_value and kind_value in self.KIND_MAP:
                log.debug("detector.frontmatter", kind=kind_value)
                return DetectionResult(
                    kind=self.KIND_MAP[kind_value],
                    confidence=1.0,
                    detected_from="frontmatter",
                )
        except Exception as e:
            # Malformed frontmatter is non-fatal; fall through to other stages.
            log.warning("detector.frontmatter_error", error=str(e))
        return None

    def _detect_from_filename(self, filename: str, log) -> DetectionResult | None:
        """Return a filename-based detection, or None if nothing matches."""
        lowered = filename.lower()

        for fragment, kind in self._PATH_PATTERNS.items():
            if fragment in lowered:
                log.debug("detector.filename_path", kind=kind.value, pattern=fragment)
                return DetectionResult(
                    kind=kind,
                    confidence=0.7,
                    detected_from="filename",
                )

        for prefix, kind in self._PREFIX_PATTERNS.items():
            if lowered.startswith(prefix):
                log.debug("detector.filename_prefix", kind=kind.value, prefix=prefix)
                return DetectionResult(
                    kind=kind,
                    confidence=0.6,
                    detected_from="filename",
                )

        return None
class EntityParser:
    """Parser for Entity KDD documents."""

    def __init__(self, schema: TemplateSchema | None = None) -> None:
        """Initialize parser with schema (defaults to ENTITY_SCHEMA)."""
        self.schema = schema or ENTITY_SCHEMA

    def parse(self, content: str, filename: str | None = None) -> ParsedDocument:
        """Parse an entity document into a ParsedDocument.

        Extracts frontmatter, title, sections, tables, code blocks and
        [[cross references]], associates tables/code blocks with their
        sections, and records schema validation errors (non-fatal).
        """
        log = logger.bind(filename=filename)
        log.debug("parser.entity.start", content_length=len(content))

        fm = frontmatter.loads(content)
        body = fm.content
        log.debug("parser.entity.frontmatter", keys=list(fm.metadata.keys()))

        title = self._extract_title(body)
        log.debug("parser.entity.title", title=title)

        sections = self._parse_sections(body)
        log.debug("parser.entity.sections", count=len(sections), names=[s.name for s in sections])

        tables = self._extract_all_tables(body, sections)
        log.debug("parser.entity.tables", count=len(tables))

        # Associate tables with sections (matched by section name).
        for table in tables:
            for section in sections:
                if section.name == table.section_name:
                    section.tables.append(table)

        code_blocks = self._extract_code_blocks(body, sections)
        log.debug("parser.entity.code_blocks", count=len(code_blocks))

        # Associate code blocks with sections
        for block in code_blocks:
            for section in sections:
                if section.name == block.section_name:
                    section.code_blocks.append(block)

        cross_refs = self._extract_cross_references(body)
        log.debug("parser.entity.cross_refs", count=len(cross_refs))

        parsed = ParsedDocument(
            kind=KDDDocumentKind.ENTITY,
            frontmatter=dict(fm.metadata),
            title=title,
            sections=sections,
            tables=tables,
            code_blocks=code_blocks,
            cross_references=cross_refs,
            validation_errors=[],
            raw_content=content,
        )

        # Validation errors are recorded on the document, not raised.
        parsed.validation_errors = self._validate(parsed)
        if parsed.validation_errors:
            log.warning("parser.entity.validation_errors", errors=parsed.validation_errors)

        log.debug("parser.entity.complete", title=title)
        return parsed

    def extract_entity_info(self, parsed: ParsedDocument) -> ExtractedEntityInfo:
        """Extract structured entity information from parsed document."""
        log = logger.bind(entity_name=parsed.entity_name)
        log.debug("parser.extract_info.start")

        # Description = content of the first "Descripción"/"Description" section.
        description = ""
        for section in parsed.sections:
            if section.name.lower() in ["descripción", "description"]:
                description = section.content
                break

        attributes = self._extract_attributes(parsed)
        log.debug("parser.extract_info.attributes", count=len(attributes))

        relations = self._extract_relations(parsed)
        log.debug("parser.extract_info.relations", count=len(relations))

        states = self._extract_states(parsed)
        log.debug("parser.extract_info.states", count=len(states))

        invariants = self._extract_invariants(parsed)
        log.debug("parser.extract_info.invariants", count=len(invariants))

        events_emitted, events_consumed = self._extract_events(parsed)
        log.debug("parser.extract_info.events", emitted=len(events_emitted), consumed=len(events_consumed))

        return ExtractedEntityInfo(
            name=parsed.entity_name,
            aliases=parsed.aliases,
            code_class=parsed.code_class,
            code_table=parsed.code_table,
            description=description,
            attributes=attributes,
            relations=relations,
            states=states,
            invariants=invariants,
            events_emitted=events_emitted,
            events_consumed=events_consumed,
            cross_references=parsed.cross_references,
        )

    def _extract_title(self, body: str) -> str:
        """Extract title from first H1 heading; "Untitled" if none."""
        match = re.search(r"^#\s+(.+?)(?:\s*)?$", body, re.MULTILINE)
        return match.group(1).strip() if match else "Untitled"

    def _parse_sections(self, body: str) -> list[ParsedSection]:
        """Parse all sections from document body.

        A section starts at any H2-H6 heading and runs to the next heading.
        Character offsets are tracked so tables/code blocks can be mapped
        back to their section. Text before the first heading is dropped.
        """
        sections: list[ParsedSection] = []
        current_section: ParsedSection | None = None
        current_content: list[str] = []
        offset = 0

        lines = body.split("\n")

        for line in lines:
            # +1 accounts for the newline removed by split.
            line_length = len(line) + 1

            # Check for heading
            heading_match = re.match(r"^(#{2,6})\s+(.+?)(?:\s*)?$", line)

            if heading_match:
                # Save previous section
                if current_section:
                    current_section.content = "\n".join(current_content).strip()
                    current_section.end_offset = offset
                    sections.append(current_section)

                level = len(heading_match.group(1))
                name = heading_match.group(2).strip()

                current_section = ParsedSection(
                    name=name,
                    level=level,
                    content="",
                    start_offset=offset,
                )
                current_content = []
            elif current_section:
                current_content.append(line)

            offset += line_length

        # Don't forget the last section
        if current_section:
            current_section.content = "\n".join(current_content).strip()
            current_section.end_offset = offset
            sections.append(current_section)

        return sections

    def _extract_all_tables(self, body: str, sections: list[ParsedSection]) -> list[ParsedTable]:
        """Extract all markdown tables from body.

        Each table is attributed to the section whose offset range contains
        the table's start position.
        """
        tables: list[ParsedTable] = []
        # header row | separator row | one or more data rows
        table_pattern = re.compile(
            r"^\|(.+)\|\s*\n\|[-:\s|]+\|\s*\n((?:\|.+\|\s*\n?)+)",
            re.MULTILINE,
        )

        for match in table_pattern.finditer(body):
            header_line = match.group(1)
            rows_text = match.group(2)

            headers = [h.strip().strip("`") for h in header_line.split("|") if h.strip()]
            rows = []

            for row_line in rows_text.strip().split("\n"):
                # NOTE(review): this first assignment is immediately
                # overwritten by the one below — dead code left from an
                # earlier approach; candidate for removal.
                cells = [c.strip() for c in row_line.split("|") if c.strip() or row_line.count("|") > len(headers)]
                # Filter empty leading/trailing cells from split
                cells = [c.strip() for c in row_line.strip().strip("|").split("|")]
                if cells:
                    rows.append(cells)

            # Find which section this table belongs to
            table_pos = match.start()
            section_name = "Unknown"
            for section in sections:
                if section.start_offset <= table_pos < section.end_offset:
                    section_name = section.name
                    break

            tables.append(ParsedTable(
                headers=headers,
                rows=rows,
                section_name=section_name,
                raw_content=match.group(0),
            ))

        return tables

    def _extract_code_blocks(self, body: str, sections: list[ParsedSection]) -> list[ParsedCodeBlock]:
        """Extract fenced code blocks, attributed to their section by offset."""
        blocks: list[ParsedCodeBlock] = []
        pattern = re.compile(r"```(\w*)\n(.*?)```", re.DOTALL)

        for match in pattern.finditer(body):
            language = match.group(1) or "text"
            content = match.group(2)

            # Find section
            block_pos = match.start()
            section_name = "Unknown"
            for section in sections:
                if section.start_offset <= block_pos < section.end_offset:
                    section_name = section.name
                    break

            blocks.append(ParsedCodeBlock(
                language=language,
                content=content,
                section_name=section_name,
            ))

        return blocks

    def _extract_cross_references(self, body: str) -> list[str]:
        """Extract deduplicated [[Reference]] links from body.

        NOTE(review): set() deduplication makes the order non-deterministic.
        """
        pattern = re.compile(r"\[\[([^\]]+)\]\]")
        refs = pattern.findall(body)
        return list(set(refs))

    def _validate(self, parsed: ParsedDocument) -> list[str]:
        """Validate parsed document against schema (required sections only)."""
        errors = []

        # Check required sections (name or any alias, case-insensitive).
        section_names = {s.name.lower() for s in parsed.sections}
        for req in self.schema.required_sections:
            names_to_check = {req.name.lower()} | {a.lower() for a in req.aliases}
            if not names_to_check & section_names:
                errors.append(f"Missing required section: {req.name}")

        return errors

    def _extract_attributes(self, parsed: ParsedDocument) -> list[ExtractedAttribute]:
        """Extract attributes from the Atributos/Attributes table."""
        attributes = []

        for table in parsed.tables:
            if table.section_name.lower() not in ["atributos", "attributes"]:
                continue

            headers_lower = [h.lower() for h in table.headers]

            for row in table.rows:
                if len(row) < 2:
                    continue

                # Find column indices
                # NOTE(review): these lookups are loop-invariant and could be
                # hoisted above the row loop.
                name_idx = 0
                code_idx = self._find_column(headers_lower, ["code"])
                type_idx = self._find_column(headers_lower, ["tipo", "type"])
                desc_idx = self._find_column(headers_lower, ["descripción", "description"])

                name = row[name_idx].strip("`") if name_idx < len(row) else ""
                code = row[code_idx].strip("`") if code_idx is not None and code_idx < len(row) else None
                attr_type = row[type_idx] if type_idx is not None and type_idx < len(row) else "unknown"
                description = row[desc_idx] if desc_idx is not None and desc_idx < len(row) else ""

                # Check if reference type (contains [[Entity]])
                is_reference = "[[" in attr_type
                reference_entity = None
                if is_reference:
                    ref_match = re.search(r"\[\[(\w+)\]\]", attr_type)
                    if ref_match:
                        reference_entity = ref_match.group(1)

                attributes.append(ExtractedAttribute(
                    name=name,
                    code=code,
                    type=attr_type,
                    description=description,
                    is_reference=is_reference,
                    reference_entity=reference_entity,
                ))

        return attributes

    def _extract_relations(self, parsed: ParsedDocument) -> list[ExtractedRelation]:
        """Extract relations from the Relaciones/Relations table."""
        relations = []

        for table in parsed.tables:
            if table.section_name.lower() not in ["relaciones", "relations", "relationships"]:
                continue

            headers_lower = [h.lower() for h in table.headers]

            for row in table.rows:
                if len(row) < 3:
                    continue

                name_idx = 0
                code_idx = self._find_column(headers_lower, ["code"])
                card_idx = self._find_column(headers_lower, ["cardinalidad", "cardinality"])
                entity_idx = self._find_column(headers_lower, ["entidad", "entity"])
                desc_idx = self._find_column(headers_lower, ["descripción", "description"])

                name = row[name_idx].strip("`") if name_idx < len(row) else ""
                code = row[code_idx].strip("`") if code_idx is not None and code_idx < len(row) else None
                cardinality = row[card_idx] if card_idx is not None and card_idx < len(row) else ""
                target_raw = row[entity_idx] if entity_idx is not None and entity_idx < len(row) else ""
                description = row[desc_idx] if desc_idx is not None and desc_idx < len(row) else ""

                # Extract entity name from [[Entity]]; fall back to the raw cell.
                target_match = re.search(r"\[\[(\w+)\]\]", target_raw)
                target_entity = target_match.group(1) if target_match else target_raw

                relations.append(ExtractedRelation(
                    name=name,
                    code=code,
                    cardinality=cardinality,
                    target_entity=target_entity,
                    description=description,
                ))

        return relations

    def _extract_states(self, parsed: ParsedDocument) -> list[ExtractedState]:
        """Extract states from the Estados/States table."""
        states = []

        for table in parsed.tables:
            if table.section_name.lower() not in ["estados", "states"]:
                continue

            headers_lower = [h.lower() for h in table.headers]

            for row in table.rows:
                if len(row) < 2:
                    continue

                name_idx = 0
                desc_idx = self._find_column(headers_lower, ["descripción", "description"])
                cond_idx = self._find_column(headers_lower, ["condiciones de entrada", "entry conditions"])

                # Strip bold markers (e.g. **Active**) from the state name.
                name = row[name_idx].strip("*").strip()
                description = row[desc_idx] if desc_idx is not None and desc_idx < len(row) else ""
                entry_conditions = row[cond_idx] if cond_idx is not None and cond_idx < len(row) else ""

                states.append(ExtractedState(
                    name=name,
                    description=description,
                    entry_conditions=entry_conditions,
                ))

        return states

    def _extract_invariants(self, parsed: ParsedDocument) -> list[str]:
        """Extract invariants as bullet items from the Invariantes section."""
        invariants = []

        for section in parsed.sections:
            if section.name.lower() not in ["invariantes", "invariants", "constraints"]:
                continue

            # Extract list items
            for line in section.content.split("\n"):
                line = line.strip()
                if line.startswith("- ") or line.startswith("* "):
                    invariants.append(line[2:].strip())

        return invariants

    def _extract_events(self, parsed: ParsedDocument) -> tuple[list[str], list[str]]:
        """Extract events emitted and consumed from the Eventos section.

        Classification is keyword-based per line (emite/emit/produce vs.
        consume/escucha/listen); [[Event]] refs on that line are collected.
        Both lists are deduplicated via set(), so order is non-deterministic.
        """
        emitted = []
        consumed = []

        for section in parsed.sections:
            if section.name.lower() not in ["eventos", "events"]:
                continue

            for line in section.content.split("\n"):
                line_lower = line.lower()

                if "emite" in line_lower or "emit" in line_lower or "produce" in line_lower:
                    refs = re.findall(r"\[\[([^\]]+)\]\]", line)
                    emitted.extend(refs)
                elif "consume" in line_lower or "escucha" in line_lower or "listen" in line_lower:
                    refs = re.findall(r"\[\[([^\]]+)\]\]", line)
                    consumed.extend(refs)

        return list(set(emitted)), list(set(consumed))

    def _find_column(self, headers: list[str], names: list[str]) -> int | None:
        """Find column index by possible (already lowercased) names."""
        for name in names:
            if name in headers:
                return headers.index(name)
        return None
pipeline.""" - -import time -from pathlib import Path - -import structlog - -from kb_engine.smart.chunking import HierarchicalChunker, LLMSummaryService, MockSummaryService -from kb_engine.smart.extraction import EntityGraphExtractor -from kb_engine.smart.parsers import DocumentKindDetector, EntityParser -from kb_engine.smart.schemas import ENTITY_SCHEMA -from kb_engine.smart.stores import FalkorDBGraphStore -from kb_engine.smart.types import IngestionResult, KDDDocumentKind - -logger = structlog.get_logger(__name__) - - -class EntityIngestionPipeline: - """Complete pipeline for ingesting entity KDD documents. - - This pipeline: - 1. Detects document type (must be entity) - 2. Parses using EntityParser - 3. Generates hierarchical chunks with summaries - 4. Extracts entities and stores in FalkorDB graph - 5. Returns ingestion result with statistics - - Example: - ```python - from kb_engine.smart.pipelines import EntityIngestionPipeline - - pipeline = EntityIngestionPipeline(graph_path="./kb-graph.db") - - with open("domain/entities/User.md") as f: - content = f.read() - - result = await pipeline.ingest(content, filename="User.md") - print(f"Created {result.chunks_created} chunks") - print(f"Extracted {result.entities_extracted} entities") - ``` - """ - - def __init__( - self, - graph_path: str | Path = ".kb/graph.db", - use_mock_summarizer: bool = False, - max_chunk_size: int = 1024, - chunk_overlap: int = 50, - ) -> None: - """Initialize the entity ingestion pipeline. - - Args: - graph_path: Path to FalkorDB graph database file. - use_mock_summarizer: Use mock summarizer (no LLM calls) for testing. - max_chunk_size: Maximum chunk size in characters. - chunk_overlap: Overlap between text chunks. 
- """ - self._graph_path = Path(graph_path) - - # Initialize components - self._detector = DocumentKindDetector() - self._parser = EntityParser(schema=ENTITY_SCHEMA) - - # Summary service - if use_mock_summarizer: - self._summarizer = MockSummaryService() - else: - self._summarizer = LLMSummaryService() - - self._chunker = HierarchicalChunker( - summary_service=self._summarizer, - max_chunk_size=max_chunk_size, - chunk_overlap=chunk_overlap, - ) - - # Graph store (lazy init) - self._graph_store: FalkorDBGraphStore | None = None - self._extractor: EntityGraphExtractor | None = None - - def _init_graph(self) -> None: - """Lazy initialization of graph store.""" - if self._graph_store is None: - self._graph_store = FalkorDBGraphStore(self._graph_path) - self._graph_store.initialize() - self._extractor = EntityGraphExtractor(self._graph_store) - - async def ingest( - self, - content: str, - filename: str | None = None, - skip_graph: bool = False, - ) -> IngestionResult: - """Ingest an entity document. - - Args: - content: Raw markdown content of the entity document. - filename: Optional filename for context. - skip_graph: If True, skip storing to graph (for testing). - - Returns: - IngestionResult with counts and any errors. - """ - start_time = time.time() - result = IngestionResult() - log = logger.bind(filename=filename, skip_graph=skip_graph) - - log.debug("pipeline.start", content_length=len(content)) - - try: - # 1. 
Detect document type - log.debug("pipeline.step.detect.start") - detection = self._detector.detect(content, filename) - result.document_kind = detection.kind - result.detection_confidence = detection.confidence - log.debug( - "pipeline.step.detect.complete", - kind=detection.kind.value, - confidence=detection.confidence, - ) - - if detection.kind != KDDDocumentKind.ENTITY: - log.warning( - "pipeline.step.detect.rejected", - expected="entity", - got=detection.kind.value, - ) - result.validation_errors.append( - f"Expected entity document, got {detection.kind.value}" - ) - return result - - # 2. Parse document - log.debug("pipeline.step.parse.start") - parsed = self._parser.parse(content, filename) - result.validation_errors.extend(parsed.validation_errors) - log.debug( - "pipeline.step.parse.complete", - title=parsed.title, - sections=len(parsed.sections), - validation_errors=len(parsed.validation_errors), - ) - - if parsed.validation_errors: - result.warnings.append("Document has validation errors but will be processed") - - # 3. Extract entity info - log.debug("pipeline.step.extract_info.start") - entity_info = self._parser.extract_entity_info(parsed) - log.debug( - "pipeline.step.extract_info.complete", - attributes=len(entity_info.attributes), - relations=len(entity_info.relations), - states=len(entity_info.states), - ) - - # Document ID and path propagation for provenance - doc_id = parsed.frontmatter.get("id", entity_info.name) - parsed.frontmatter.setdefault("path", filename or "") - result.document_id = doc_id - log = log.bind(doc_id=doc_id) - - # 4. Generate hierarchical chunks - log.debug("pipeline.step.chunk.start") - chunks = await self._chunker.chunk(parsed, ENTITY_SCHEMA) - result.chunks_created = len(chunks) - chunk_types = {} - for c in chunks: - chunk_types[c.chunk_type] = chunk_types.get(c.chunk_type, 0) + 1 - log.debug( - "pipeline.step.chunk.complete", - total_chunks=len(chunks), - chunk_types=chunk_types, - ) - - # 5. 
Store in graph - if not skip_graph: - log.debug("pipeline.step.graph.start") - self._init_graph() - nodes, edges = self._extractor.extract_and_store(parsed, entity_info) - result.entities_extracted = nodes - result.relations_created = edges - log.debug( - "pipeline.step.graph.complete", - nodes=nodes, - edges=edges, - ) - else: - # Count what would be created - ref_attr_count = sum(1 for a in entity_info.attributes if a.is_reference) - result.entities_extracted = ( - 1 + # Document node - 1 + # main entity - len(entity_info.attributes) + - len(entity_info.states) + - len(entity_info.relations) + - ref_attr_count + # stub entities from reference attributes - len(entity_info.events_emitted) + - len(entity_info.events_consumed) - ) - result.relations_created = ( - 1 + # EXTRACTED_FROM for main entity - len(entity_info.attributes) + # EXTRACTED_FROM for attributes - len(entity_info.attributes) + # CONTAINS for attributes - len(entity_info.states) + # EXTRACTED_FROM for states - len(entity_info.states) + # CONTAINS for states - len(entity_info.relations) + # EXTRACTED_FROM for related entities - len(entity_info.relations) + # REFERENCES - ref_attr_count + # EXTRACTED_FROM for ref attr stubs - ref_attr_count + # REFERENCES from attrs - len(entity_info.events_emitted) + # EXTRACTED_FROM for events emitted - len(entity_info.events_emitted) + # PRODUCES - len(entity_info.events_consumed) + # EXTRACTED_FROM for events consumed - len(entity_info.events_consumed) # CONSUMES - ) - log.debug("pipeline.step.graph.skipped") - - result.success = True - log.info( - "pipeline.complete", - chunks=result.chunks_created, - entities=result.entities_extracted, - relations=result.relations_created, - ) - - except Exception as e: - log.exception("pipeline.error", error=str(e)) - result.validation_errors.append(f"Pipeline error: {str(e)}") - result.success = False - - result.processing_time_ms = (time.time() - start_time) * 1000 - log.debug("pipeline.timing", 
duration_ms=result.processing_time_ms) - return result - - async def ingest_file( - self, - file_path: str | Path, - skip_graph: bool = False, - ) -> IngestionResult: - """Ingest an entity document from file path. - - Args: - file_path: Path to the markdown file. - skip_graph: If True, skip storing to graph. - - Returns: - IngestionResult. - """ - path = Path(file_path) - content = path.read_text(encoding="utf-8") - return await self.ingest(content, filename=path.name, skip_graph=skip_graph) - - def close(self) -> None: - """Close graph store connection.""" - if self._graph_store: - self._graph_store.close() - - def get_graph_stats(self) -> dict: - """Get graph database statistics.""" - self._init_graph() - return self._graph_store.get_stats() - - def query_graph(self, cypher: str, params: dict | None = None) -> list[dict]: - """Execute a Cypher query on the graph. - - Args: - cypher: Cypher query string. - params: Query parameters. - - Returns: - List of result dictionaries. - """ - self._init_graph() - return self._graph_store.execute_cypher(cypher, params) diff --git a/src/kb_engine/smart/schemas/__init__.py b/src/kb_engine/smart/schemas/__init__.py deleted file mode 100644 index 4089113..0000000 --- a/src/kb_engine/smart/schemas/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -"""KDD document schemas.""" - -from kb_engine.smart.schemas.entity import ENTITY_SCHEMA - -__all__ = [ - "ENTITY_SCHEMA", -] diff --git a/src/kb_engine/smart/schemas/entity.py b/src/kb_engine/smart/schemas/entity.py deleted file mode 100644 index eef4844..0000000 --- a/src/kb_engine/smart/schemas/entity.py +++ /dev/null @@ -1,87 +0,0 @@ -"""Entity document schema definition.""" - -from kb_engine.smart.types import ( - ChunkingStrategy, - ContentExpectation, - FieldDefinition, - KDDDocumentKind, - SectionDefinition, - TemplateSchema, -) - -# Column names for table parsing (ES/EN) -ATTRIBUTES_TABLE_COLUMNS = ["Atributo", "Code", "Tipo", "Descripción"] -ATTRIBUTES_TABLE_COLUMNS_EN = ["Attribute", 
"Code", "Type", "Description"] - -RELATIONS_TABLE_COLUMNS = ["Relación", "Code", "Cardinalidad", "Entidad", "Descripción"] -RELATIONS_TABLE_COLUMNS_EN = ["Relation", "Code", "Cardinality", "Entity", "Description"] - -STATES_TABLE_COLUMNS = ["Estado", "Descripción", "Condiciones de entrada"] -STATES_TABLE_COLUMNS_EN = ["State", "Description", "Entry Conditions"] - - -ENTITY_SCHEMA = TemplateSchema( - kind=KDDDocumentKind.ENTITY, - title_is_name=True, - frontmatter_fields=[ - FieldDefinition(name="kind", required=True, field_type="string"), - FieldDefinition(name="aliases", required=False, field_type="array"), - FieldDefinition(name="code", required=False, field_type="object"), - FieldDefinition(name="tags", required=True, field_type="array"), - ], - required_sections=[ - SectionDefinition( - name="Descripción", - aliases=["Description"], - required=True, - content_expectation=ContentExpectation.TEXT, - chunking_strategy=ChunkingStrategy.SPLIT_BY_PARAGRAPHS, - description="Entity description", - ), - SectionDefinition( - name="Atributos", - aliases=["Attributes"], - required=True, - content_expectation=ContentExpectation.TABLE, - chunking_strategy=ChunkingStrategy.TABLE_ROWS, - description="Entity attributes table", - ), - ], - optional_sections=[ - SectionDefinition( - name="Relaciones", - aliases=["Relations", "Relationships"], - content_expectation=ContentExpectation.TABLE, - chunking_strategy=ChunkingStrategy.TABLE_ROWS, - description="Entity relationships table", - ), - SectionDefinition( - name="Ciclo de Vida", - aliases=["Lifecycle", "Life Cycle"], - content_expectation=ContentExpectation.MERMAID_STATE, - chunking_strategy=ChunkingStrategy.KEEP_INTACT, - description="State diagram", - ), - SectionDefinition( - name="Estados", - aliases=["States"], - content_expectation=ContentExpectation.TABLE, - chunking_strategy=ChunkingStrategy.TABLE_ROWS, - description="States table", - ), - SectionDefinition( - name="Invariantes", - aliases=["Invariants", "Constraints"], 
- content_expectation=ContentExpectation.TEXT, - chunking_strategy=ChunkingStrategy.SPLIT_BY_ITEMS, - description="Business rules", - ), - SectionDefinition( - name="Eventos", - aliases=["Events"], - content_expectation=ContentExpectation.TEXT, - chunking_strategy=ChunkingStrategy.KEEP_INTACT, - description="Events emitted/consumed", - ), - ], -) diff --git a/src/kb_engine/smart/stores/__init__.py b/src/kb_engine/smart/stores/__init__.py deleted file mode 100644 index b7d1556..0000000 --- a/src/kb_engine/smart/stores/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -"""Storage backends for the smart pipeline.""" - -from kb_engine.smart.stores.falkordb_graph import FalkorDBGraphStore - -__all__ = [ - "FalkorDBGraphStore", -] diff --git a/src/kb_engine/smart/stores/falkordb_graph.py b/src/kb_engine/smart/stores/falkordb_graph.py deleted file mode 100644 index a98cf08..0000000 --- a/src/kb_engine/smart/stores/falkordb_graph.py +++ /dev/null @@ -1,773 +0,0 @@ -"""FalkorDB graph store for knowledge graph storage.""" - -from pathlib import Path -from typing import Any - -import structlog -from redislite.falkordb_client import FalkorDB - -logger = structlog.get_logger(__name__) - - -class FalkorDBGraphStore: - """Graph store backed by FalkorDB (FalkorDBLite) embedded database. - - Provides storage for: - - Document nodes (provenance tracking) - - Entity nodes (domain entities) - - Concept nodes (attributes, states) - - Event nodes (domain events) - - EXTRACTED_FROM relationships (node-to-document provenance) - - Domain relationships (CONTAINS, REFERENCES, PRODUCES, CONSUMES) - - FalkorDB is schema-less and supports full MERGE...ON CREATE SET...ON MATCH SET syntax, - making upserts straightforward. 
- - Usage: - store = FalkorDBGraphStore("./kb-graph.db") - store.initialize() - - # Add document provenance - store.upsert_document("doc-1", "User Entity", "entities/User.md", "entity") - - # Add nodes - store.upsert_entity("entity:User", "User", "Domain user") - store.add_extracted_from("entity:User", "Entity", "doc-1", "primary", 1.0) - - # Query provenance - impact = store.get_document_impact("doc-1") - """ - - def __init__(self, db_path: str | Path) -> None: - """Initialize FalkorDB graph store. - - Args: - db_path: Path to the FalkorDB database file. - """ - self.db_path = Path(db_path) - self._db: FalkorDB | None = None - self._graph: Any = None # FalkorDB Graph object - self._initialized = False - - def initialize(self, reset: bool = False) -> None: - """Initialize the database. - - Args: - reset: If True, delete existing database and start fresh. - """ - log = logger.bind(db_path=str(self.db_path)) - - if reset and self.db_path.exists(): - log.info("falkordb.reset", action="deleting existing database") - self.db_path.unlink() - - log.debug("falkordb.initialize.start") - - # Ensure parent directory exists - self.db_path.parent.mkdir(parents=True, exist_ok=True) - - # Initialize FalkorDB with file path - self._db = FalkorDB(str(self.db_path)) - self._graph = self._db.select_graph("knowledge") - - if not self._initialized or reset: - self._create_indexes() - self._initialized = True - - log.info("falkordb.initialize.complete") - - def _create_indexes(self) -> None: - """Create indexes for better query performance. - - FalkorDB is schema-less, so we only create indexes, not schema. 
- """ - log = logger.bind(db_path=str(self.db_path)) - log.debug("falkordb.indexes.create") - - for label in ["Entity", "Concept", "Event", "Document"]: - try: - self._graph.query(f"CREATE INDEX FOR (n:{label}) ON (n.id)") - except Exception: - pass # Index may already exist - - log.debug("falkordb.indexes.created") - - @property - def graph(self) -> Any: - """Get graph instance, initializing if needed.""" - if self._graph is None: - self.initialize() - return self._graph - - def close(self) -> None: - """Close database connection.""" - # FalkorDBLite doesn't have an explicit close method - # Just release the references - self._graph = None - self._db = None - - # === Document Node Operations === - - def upsert_document( - self, - doc_id: str, - title: str, - path: str = "", - kind: str = "", - ) -> None: - """Insert or update a Document node for provenance tracking. - - Args: - doc_id: Unique document identifier. - title: Document title. - path: File path or URL of the document. - kind: Document kind (entity, use-case, etc.). - """ - log = logger.bind(doc_id=doc_id, title=title) - params = { - "id": doc_id, - "title": title, - "path": path, - "kind": kind, - } - - try: - self.graph.query( - """ - MERGE (d:Document {id: $id}) - ON CREATE SET d.title = $title, d.path = $path, d.kind = $kind - ON MATCH SET d.title = $title, d.path = $path, d.kind = $kind - """, - params=params, - ) - log.debug("falkordb.document.upserted") - except Exception as e: - log.warning("falkordb.document.upsert_failed", error=str(e)) - raise - - def add_extracted_from( - self, - node_id: str, - node_label: str, - doc_id: str, - role: str = "primary", - confidence: float = 1.0, - ) -> None: - """Create EXTRACTED_FROM edge from a domain node to a Document. - - Args: - node_id: ID of the source node (Entity, Concept, or Event). - node_label: Label of the source node ("Entity", "Concept", or "Event"). - doc_id: ID of the target Document node. 
- role: Role of the extraction ("primary" or "referenced"). - confidence: Confidence score of the extraction. - """ - params = { - "nid": node_id, - "did": doc_id, - "role": role, - "conf": confidence, - } - try: - self.graph.query( - f""" - MATCH (n:{node_label} {{id: $nid}}), (d:Document {{id: $did}}) - MERGE (n)-[r:EXTRACTED_FROM]->(d) - ON CREATE SET r.role = $role, r.confidence = $conf - ON MATCH SET r.role = $role, r.confidence = $conf - """, - params=params, - ) - except Exception as e: - logger.warning( - "falkordb.extracted_from.failed", - node_id=node_id, - doc_id=doc_id, - error=str(e), - ) - - # === Node Operations === - - def upsert_entity( - self, - entity_id: str, - name: str, - description: str = "", - code_class: str | None = None, - code_table: str | None = None, - confidence: float = 1.0, - ) -> None: - """Insert or update an Entity node. - - Uses a confidence guard: on update, only overwrites if the new - confidence is >= the existing confidence. This prevents a stub - reference (0.7) from overwriting a fully-defined entity (1.0). 
- """ - log = logger.bind(entity_id=entity_id, name=name) - params = { - "id": entity_id, - "name": name, - "descr": description[:500] if description else "", - "code_class": code_class or "", - "code_table": code_table or "", - "confidence": confidence, - } - - try: - # Step 1: Create if not exists - self.graph.query( - """ - MERGE (e:Entity {id: $id}) - ON CREATE SET e.name = $name, e.description = $descr, e.code_class = $code_class, - e.code_table = $code_table, e.confidence = $confidence - """, - params=params, - ) - # Step 2: Update only if new confidence >= existing - self.graph.query( - """ - MATCH (e:Entity {id: $id}) WHERE e.confidence <= $confidence - SET e.name = $name, e.description = $descr, e.code_class = $code_class, - e.code_table = $code_table, e.confidence = $confidence - """, - params=params, - ) - log.debug("falkordb.entity.upserted") - except Exception as e: - log.warning("falkordb.entity.upsert_failed", error=str(e)) - raise - - def upsert_concept( - self, - concept_id: str, - name: str, - concept_type: str, - description: str = "", - parent_entity: str | None = None, - properties: dict[str, Any] | None = None, - confidence: float = 1.0, - ) -> None: - """Insert or update a Concept node. - - Uses a confidence guard: on update, only overwrites if the new - confidence is >= the existing confidence. 
- """ - import json - - log = logger.bind(concept_id=concept_id, concept_type=concept_type) - params = { - "id": concept_id, - "name": name, - "ctype": concept_type, - "descr": description[:500] if description else "", - "parent": parent_entity or "", - "props": json.dumps(properties) if properties else "{}", - "confidence": confidence, - } - - try: - # Step 1: Create if not exists - self.graph.query( - """ - MERGE (c:Concept {id: $id}) - ON CREATE SET c.name = $name, c.concept_type = $ctype, c.description = $descr, - c.parent_entity = $parent, c.properties = $props, c.confidence = $confidence - """, - params=params, - ) - # Step 2: Update only if new confidence >= existing - self.graph.query( - """ - MATCH (c:Concept {id: $id}) WHERE c.confidence <= $confidence - SET c.name = $name, c.concept_type = $ctype, c.description = $descr, - c.parent_entity = $parent, c.properties = $props, c.confidence = $confidence - """, - params=params, - ) - log.debug("falkordb.concept.upserted") - except Exception as e: - log.warning("falkordb.concept.upsert_failed", error=str(e)) - raise - - def upsert_event( - self, - event_id: str, - name: str, - description: str = "", - confidence: float = 1.0, - ) -> None: - """Insert or update an Event node. - - Uses a confidence guard: on update, only overwrites if the new - confidence is >= the existing confidence. 
- """ - log = logger.bind(event_id=event_id, name=name) - params = { - "id": event_id, - "name": name, - "descr": description[:500] if description else "", - "confidence": confidence, - } - - try: - # Step 1: Create if not exists - self.graph.query( - """ - MERGE (e:Event {id: $id}) - ON CREATE SET e.name = $name, e.description = $descr, e.confidence = $confidence - """, - params=params, - ) - # Step 2: Update only if new confidence >= existing - self.graph.query( - """ - MATCH (e:Event {id: $id}) WHERE e.confidence <= $confidence - SET e.name = $name, e.description = $descr, e.confidence = $confidence - """, - params=params, - ) - log.debug("falkordb.event.upserted") - except Exception as e: - log.warning("falkordb.event.upsert_failed", error=str(e)) - raise - - # === Relationship Operations === - - def add_contains( - self, - entity_id: str, - concept_id: str, - confidence: float = 1.0, - source_doc_id: str | None = None, - ) -> None: - """Add CONTAINS relationship from Entity to Concept.""" - params = { - "eid": entity_id, - "cid": concept_id, - "conf": confidence, - "source": source_doc_id or "", - } - try: - self.graph.query( - """ - MATCH (e:Entity {id: $eid}), (c:Concept {id: $cid}) - MERGE (e)-[r:CONTAINS]->(c) - ON CREATE SET r.confidence = $conf, r.source_doc_id = $source - """, - params=params, - ) - except Exception as e: - logger.warning( - "falkordb.contains.failed", entity=entity_id, concept=concept_id, error=str(e) - ) - - def add_references( - self, - from_entity_id: str, - to_entity_id: str, - via_attribute: str | None = None, - cardinality: str | None = None, - description: str = "", - confidence: float = 1.0, - source_doc_id: str | None = None, - ) -> None: - """Add REFERENCES relationship between Entities.""" - params = { - "eid1": from_entity_id, - "eid2": to_entity_id, - "via": via_attribute or "", - "card": cardinality or "", - "descr": description, - "conf": confidence, - "source": source_doc_id or "", - } - try: - self.graph.query( - """ - 
MATCH (e1:Entity {id: $eid1}), (e2:Entity {id: $eid2}) - MERGE (e1)-[r:REFERENCES]->(e2) - ON CREATE SET r.via_attribute = $via, r.cardinality = $card, - r.description = $descr, r.confidence = $conf, r.source_doc_id = $source - """, - params=params, - ) - except Exception as e: - logger.warning( - "falkordb.references.failed", - from_id=from_entity_id, - to_id=to_entity_id, - error=str(e), - ) - - def add_produces( - self, - entity_id: str, - event_id: str, - confidence: float = 1.0, - source_doc_id: str | None = None, - ) -> None: - """Add PRODUCES relationship from Entity to Event.""" - params = { - "eid": entity_id, - "evid": event_id, - "conf": confidence, - "source": source_doc_id or "", - } - try: - self.graph.query( - """ - MATCH (e:Entity {id: $eid}), (ev:Event {id: $evid}) - MERGE (e)-[r:PRODUCES]->(ev) - ON CREATE SET r.confidence = $conf, r.source_doc_id = $source - """, - params=params, - ) - except Exception as e: - logger.warning( - "falkordb.produces.failed", entity=entity_id, event=event_id, error=str(e) - ) - - def add_consumes( - self, - entity_id: str, - event_id: str, - confidence: float = 1.0, - source_doc_id: str | None = None, - ) -> None: - """Add CONSUMES relationship from Entity to Event.""" - params = { - "eid": entity_id, - "evid": event_id, - "conf": confidence, - "source": source_doc_id or "", - } - try: - self.graph.query( - """ - MATCH (e:Entity {id: $eid}), (ev:Event {id: $evid}) - MERGE (e)-[r:CONSUMES]->(ev) - ON CREATE SET r.confidence = $conf, r.source_doc_id = $source - """, - params=params, - ) - except Exception as e: - logger.warning( - "falkordb.consumes.failed", entity=entity_id, event=event_id, error=str(e) - ) - - # === Query Operations === - - def execute_cypher(self, query: str, params: dict[str, Any] | None = None) -> list[dict]: - """Execute a Cypher query and return results as list of dicts.""" - result = self.graph.query(query, params=params or {}) - if not result.result_set: - return [] - # Extract column names 
from header - headers = [col[1] if isinstance(col, (list, tuple)) else str(col) for col in result.header] - return [dict(zip(headers, row)) for row in result.result_set] - - def get_entity(self, entity_id: str) -> dict | None: - """Get an entity by ID.""" - results = self.execute_cypher( - "MATCH (e:Entity {id: $id}) RETURN e", {"id": entity_id} - ) - return results[0] if results else None - - def get_node_graph(self, node_id: str, depth: int = 2) -> dict: - """Get a node and all related nodes up to depth. - - Works with any node type (Entity, Concept, Event). - Filters out EXTRACTED_FROM edges to only return domain relationships. - """ - # Domain relationship types only (exclude EXTRACTED_FROM) - domain_rels = "CONTAINS|REFERENCES|PRODUCES|CONSUMES" - nodes = self.execute_cypher( - f""" - MATCH (e {{id: $id}})-[:{domain_rels}*1..{depth}]-(n) - RETURN DISTINCT labels(n)[0] as node_type, n.id as id, n.name as name - """, - {"id": node_id}, - ) - - edges = self.execute_cypher( - f""" - MATCH (e {{id: $id}})-[r:{domain_rels}]-() - RETURN DISTINCT type(r) as rel_type - """, - {"id": node_id}, - ) - edge_types = [e["rel_type"] for e in edges] - - return { - "center": node_id, - "nodes": nodes, - "edge_types": edge_types, - } - - def get_entity_graph(self, entity_id: str, depth: int = 2) -> dict: - """Get an entity and all related nodes up to depth. - - Deprecated: use get_node_graph() instead. - """ - return self.get_node_graph(entity_id, depth) - - def get_all_entities(self) -> list[dict]: - """Get all entities.""" - return self.execute_cypher( - "MATCH (e:Entity) RETURN e.id as id, e.name as name, e.code_class as code_class" - ) - - def get_all_nodes(self, node_type: str | None = None) -> list[dict]: - """Get all domain nodes (Entity, Concept, Event). - - Args: - node_type: Optional filter - "entity", "concept", or "event". - If None, returns all domain node types. - - Returns: - List of dicts with label, id, name. 
- """ - if node_type: - label = node_type.capitalize() - return self.execute_cypher( - f"MATCH (n:{label}) RETURN '{label}' as label, n.id as id, n.name as name ORDER BY n.name" - ) - # Return all domain node types - results = [] - for label in ["Entity", "Concept", "Event"]: - rows = self.execute_cypher( - f"MATCH (n:{label}) RETURN '{label}' as label, n.id as id, n.name as name ORDER BY n.name" - ) - results.extend(rows) - return results - - def find_path( - self, - from_id: str, - to_id: str, - max_depth: int = 5, - ) -> list[dict]: - """Find path between two nodes.""" - return self.execute_cypher( - f""" - MATCH (a {{id: $from}})-[*1..{max_depth}]-(b {{id: $to}}) - RETURN a.name as start_name, b.name as end_name - """, - {"from": from_id, "to": to_id}, - ) - - # === Provenance Queries === - - def get_document_impact(self, doc_id: str) -> list[dict]: - """Get all nodes extracted from a given document. - - Looks up by document id first, then falls back to matching - by path suffix (ENDS WITH) so both 'specs/foo.md' and - '/specs/foo.md' resolve correctly. - - Args: - doc_id: Document identifier or file path. - - Returns: - List of dicts with node_type, id, name, role, confidence. - """ - results = self.execute_cypher( - """ - MATCH (n)-[r:EXTRACTED_FROM]->(d:Document {id: $did}) - RETURN labels(n)[0] as node_type, n.id as id, n.name as name, - r.role as role, r.confidence as confidence - """, - {"did": doc_id}, - ) - if results: - return results - # Fallback: search by path suffix - return self.execute_cypher( - """ - MATCH (n)-[r:EXTRACTED_FROM]->(d:Document) - WHERE d.id ENDS WITH $did OR d.path ENDS WITH $did - RETURN labels(n)[0] as node_type, n.id as id, n.name as name, - r.role as role, r.confidence as confidence - """, - {"did": doc_id}, - ) - - def get_node_provenance(self, node_id: str) -> list[dict]: - """Get all documents that contributed to a given node. - - Args: - node_id: Node identifier. 
- - Returns: - List of dicts with doc_id, title, path, role, confidence. - """ - return self.execute_cypher( - """ - MATCH (n {id: $nid})-[r:EXTRACTED_FROM]->(d:Document) - RETURN d.id as doc_id, d.title as title, d.path as path, - r.role as role, r.confidence as confidence - """, - {"nid": node_id}, - ) - - def get_orphan_entities(self) -> list[dict]: - """Get entities that are only referenced but have no primary document. - - These are "stub" entities created when processing references to entities - whose documents haven't been indexed yet. They have: - - confidence < 1.0 (typically 0.7) - - Only EXTRACTED_FROM edges with role="referenced", no role="primary" - - Returns: - List of dicts with id, name, confidence, referenced_by (list of doc titles). - """ - return self.execute_cypher( - """ - MATCH (e:Entity)-[r:EXTRACTED_FROM]->(d:Document) - WITH e, collect({role: r.role, doc: d.title}) as provenance - WHERE ALL(p IN provenance WHERE p.role = 'referenced') - RETURN e.id as id, e.name as name, e.confidence as confidence, - [p IN provenance | p.doc] as referenced_by - ORDER BY e.name - """ - ) - - def get_entity_completeness(self) -> list[dict]: - """Get completeness status for all entities. - - Returns entities with their provenance status: - - "complete": Has a primary document - - "stub": Only referenced, no primary document - - "orphan": No EXTRACTED_FROM edges at all - - Returns: - List of dicts with id, name, confidence, status, primary_doc, referenced_by. 
- """ - return self.execute_cypher( - """ - MATCH (e:Entity) - OPTIONAL MATCH (e)-[r:EXTRACTED_FROM]->(d:Document) - WITH e, - collect(CASE WHEN r.role = 'primary' THEN d.title END) as primary_docs, - collect(CASE WHEN r.role = 'referenced' THEN d.title END) as ref_docs - RETURN e.id as id, - e.name as name, - e.confidence as confidence, - CASE - WHEN size([p IN primary_docs WHERE p IS NOT NULL]) > 0 THEN 'complete' - WHEN size([r IN ref_docs WHERE r IS NOT NULL]) > 0 THEN 'stub' - ELSE 'orphan' - END as status, - [p IN primary_docs WHERE p IS NOT NULL] as primary_docs, - [r IN ref_docs WHERE r IS NOT NULL] as referenced_by - ORDER BY status, e.name - """ - ) - - # === Utility === - - def delete_node(self, node_id: str) -> bool: - """Delete a single node and all its relationships. - - Args: - node_id: The node ID to delete. - - Returns: - True if the node was found and deleted, False otherwise. - """ - result = self.graph.query( - "MATCH (n {id: $id}) DETACH DELETE n", - params={"id": node_id}, - ) - return result.nodes_deleted > 0 - - def delete_by_source_doc(self, source_doc_id: str) -> None: - """Delete a document and its exclusive nodes using cascade. - - Steps: - 1. Delete EXTRACTED_FROM edges pointing to the Document - 2. Delete domain relationships with source_doc_id = X - 3. Delete orphan nodes (no remaining EXTRACTED_FROM to any Document) - 4. 
Delete the Document node - """ - log = logger.bind(source_doc_id=source_doc_id) - log.debug("falkordb.delete_by_source.start") - - # Step 1: Delete EXTRACTED_FROM edges to this document - try: - self.graph.query( - """ - MATCH (n)-[r:EXTRACTED_FROM]->(d:Document {id: $doc_id}) - DELETE r - """, - params={"doc_id": source_doc_id}, - ) - except Exception: - pass - - # Step 2: Delete domain relationships with source_doc_id - for rel_type in ["CONTAINS", "REFERENCES", "PRODUCES", "CONSUMES", "RELATED_TO"]: - try: - self.graph.query( - f""" - MATCH ()-[r:{rel_type}]->() - WHERE r.source_doc_id = $doc_id - DELETE r - """, - params={"doc_id": source_doc_id}, - ) - except Exception: - pass - - # Step 3: Delete orphan nodes (no EXTRACTED_FROM to any Document) - # FalkorDB doesn't support OPTIONAL MATCH + WITH WHERE IS NULL + DELETE, - # so we use a two-step approach: find orphan IDs, then delete each. - for node_type in ["Entity", "Concept", "Event"]: - try: - # Find IDs that have EXTRACTED_FROM edges - linked = self.execute_cypher( - f"MATCH (n:{node_type})-[:EXTRACTED_FROM]->(:Document) RETURN DISTINCT n.id as id" - ) - linked_ids = {r["id"] for r in linked} - - # Find all IDs of this type - all_nodes = self.execute_cypher( - f"MATCH (n:{node_type}) RETURN n.id as id" - ) - - # Delete orphans (those without EXTRACTED_FROM) - for node in all_nodes: - if node["id"] not in linked_ids: - self.graph.query( - f"MATCH (n:{node_type} {{id: $id}}) DETACH DELETE n", - params={"id": node["id"]}, - ) - except Exception: - pass - - # Step 4: Delete the Document node - try: - self.graph.query( - """ - MATCH (d:Document {id: $doc_id}) - DETACH DELETE d - """, - params={"doc_id": source_doc_id}, - ) - except Exception: - pass - - log.info("falkordb.delete_by_source.complete") - - def get_stats(self) -> dict: - """Get database statistics.""" - stats = {} - - for node_type in ["Entity", "Concept", "Event", "Document"]: - try: - result = self.execute_cypher(f"MATCH (n:{node_type}) RETURN 
count(n) as cnt") - stats[f"{node_type.lower()}_count"] = result[0]["cnt"] if result else 0 - except Exception: - stats[f"{node_type.lower()}_count"] = 0 - - return stats diff --git a/src/kb_engine/smart/types.py b/src/kb_engine/smart/types.py deleted file mode 100644 index 3faf8b1..0000000 --- a/src/kb_engine/smart/types.py +++ /dev/null @@ -1,268 +0,0 @@ -"""Core types for the smart ingestion pipeline.""" - -from dataclasses import dataclass, field -from enum import Enum -from typing import Any - - -class KDDDocumentKind(str, Enum): - """KDD document types.""" - - ENTITY = "entity" - USE_CASE = "use-case" - RULE = "rule" - PROCESS = "process" - EVENT = "event" - COMMAND = "command" - QUERY = "query" - ADR = "adr" - PRD = "prd" - NFR = "nfr" - STORY = "story" - UI_VIEW = "ui-view" - UI_FLOW = "ui-flow" - UI_COMPONENT = "ui-component" - IDEA = "idea" - REQUIREMENT = "requirement" - IMPLEMENTATION_CHARTER = "implementation-charter" - UNKNOWN = "unknown" - - -class ChunkingStrategy(str, Enum): - """Chunking strategies for different content types.""" - - DEFAULT = "default" - KEEP_INTACT = "keep_intact" - TABLE_ROWS = "table_rows" - SPLIT_BY_ITEMS = "split_by_items" - SPLIT_BY_PARAGRAPHS = "split_by_paragraphs" - - -class ContentExpectation(str, Enum): - """Expected content types for sections.""" - - TEXT = "text" - TABLE = "table" - MERMAID_STATE = "mermaid:stateDiagram-v2" - MERMAID_FLOW = "mermaid:flowchart" - YAML = "yaml" - JSON = "json" - CODE = "code" - - -@dataclass -class DetectionResult: - """Result of document kind detection.""" - - kind: KDDDocumentKind - confidence: float - detected_from: str # "frontmatter", "filename", "content" - - -@dataclass -class FieldDefinition: - """Definition of a frontmatter field.""" - - name: str - required: bool = False - field_type: str = "string" - description: str = "" - - -@dataclass -class SectionDefinition: - """Definition of a document section.""" - - name: str - required: bool = False - aliases: list[str] = 
field(default_factory=list) - content_expectation: ContentExpectation = ContentExpectation.TEXT - chunking_strategy: ChunkingStrategy = ChunkingStrategy.DEFAULT - description: str = "" - - -@dataclass -class TemplateSchema: - """Schema for a KDD document template.""" - - kind: KDDDocumentKind - title_is_name: bool = False - frontmatter_fields: list[FieldDefinition] = field(default_factory=list) - required_sections: list[SectionDefinition] = field(default_factory=list) - optional_sections: list[SectionDefinition] = field(default_factory=list) - - -@dataclass -class ParsedTable: - """A parsed markdown table.""" - - headers: list[str] - rows: list[list[str]] - section_name: str - raw_content: str - - -@dataclass -class ParsedCodeBlock: - """A parsed code block.""" - - language: str - content: str - section_name: str - - -@dataclass -class ParsedSection: - """A parsed document section.""" - - name: str - level: int - content: str - content_type: ContentExpectation = ContentExpectation.TEXT - tables: list[ParsedTable] = field(default_factory=list) - code_blocks: list[ParsedCodeBlock] = field(default_factory=list) - start_offset: int = 0 - end_offset: int = 0 - - -@dataclass -class ParsedDocument: - """A fully parsed KDD document.""" - - kind: KDDDocumentKind - frontmatter: dict[str, Any] - title: str - sections: list[ParsedSection] - tables: list[ParsedTable] - code_blocks: list[ParsedCodeBlock] - cross_references: list[str] - validation_errors: list[str] - raw_content: str - - @property - def entity_name(self) -> str: - """Get entity name (title for entity documents).""" - return self.title - - @property - def aliases(self) -> list[str]: - """Get aliases from frontmatter.""" - return self.frontmatter.get("aliases", []) - - @property - def code_class(self) -> str | None: - """Get code class name.""" - code = self.frontmatter.get("code", {}) - return code.get("class") if isinstance(code, dict) else None - - @property - def code_table(self) -> str | None: - """Get code 
table name.""" - code = self.frontmatter.get("code", {}) - return code.get("table") if isinstance(code, dict) else None - - -@dataclass -class HierarchicalContext: - """Context for hierarchical chunking.""" - - document_summary: str - section_summaries: list[str] - heading_path: list[str] - - def as_prefix(self) -> str: - """Generate context prefix for chunk.""" - parts = [] - if self.document_summary: - parts.append(f"[Doc: {self.document_summary}]") - if self.section_summaries: - parts.append(f"[Sec: {self.section_summaries[-1]}]") - return " > ".join(parts) - - -@dataclass -class ContextualizedChunk: - """A chunk with hierarchical context.""" - - id: str - content: str - contextualized_content: str - chunk_type: str - context: HierarchicalContext - document_id: str - document_kind: KDDDocumentKind - section_name: str - sequence: int - table_headers: list[str] | None = None - row_index: int | None = None - row_data: dict[str, str] | None = None - start_offset: int | None = None - end_offset: int | None = None - - -@dataclass -class ExtractedAttribute: - """An extracted entity attribute.""" - - name: str - code: str | None - type: str - description: str - is_reference: bool = False - reference_entity: str | None = None - - -@dataclass -class ExtractedRelation: - """An extracted entity relation.""" - - name: str - code: str | None - cardinality: str - target_entity: str - description: str - - -@dataclass -class ExtractedState: - """An extracted entity state.""" - - name: str - description: str - is_initial: bool = False - is_final: bool = False - entry_conditions: str = "" - - -@dataclass -class ExtractedEntityInfo: - """All extracted info from an entity document.""" - - name: str - aliases: list[str] - code_class: str | None - code_table: str | None - description: str - attributes: list[ExtractedAttribute] - relations: list[ExtractedRelation] - states: list[ExtractedState] - invariants: list[str] - events_emitted: list[str] - events_consumed: list[str] - 
cross_references: list[str] - - -@dataclass -class IngestionResult: - """Result of document ingestion.""" - - success: bool = False - document_id: str = "" - document_kind: KDDDocumentKind = KDDDocumentKind.UNKNOWN - detection_confidence: float = 0.0 - chunks_created: int = 0 - entities_extracted: int = 0 - relations_created: int = 0 - validation_errors: list[str] = field(default_factory=list) - warnings: list[str] = field(default_factory=list) - processing_time_ms: float = 0.0 diff --git a/src/kb_engine/utils/__init__.py b/src/kb_engine/utils/__init__.py deleted file mode 100644 index 6f8a578..0000000 --- a/src/kb_engine/utils/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -"""Utility functions for KB-Engine.""" - -from kb_engine.utils.hashing import compute_content_hash -from kb_engine.utils.markdown import extract_frontmatter, parse_markdown_sections -from kb_engine.utils.tokenization import count_tokens, truncate_to_tokens - -__all__ = [ - "compute_content_hash", - "count_tokens", - "truncate_to_tokens", - "extract_frontmatter", - "parse_markdown_sections", -] diff --git a/src/kb_engine/utils/hashing.py b/src/kb_engine/utils/hashing.py deleted file mode 100644 index 228f0f5..0000000 --- a/src/kb_engine/utils/hashing.py +++ /dev/null @@ -1,11 +0,0 @@ -"""Hashing utilities.""" - -import hashlib - - -def compute_content_hash(content: str) -> str: - """Compute a SHA-256 hash of content. - - Used for detecting document changes and deduplication. - """ - return hashlib.sha256(content.encode("utf-8")).hexdigest() diff --git a/src/kb_engine/utils/markdown.py b/src/kb_engine/utils/markdown.py deleted file mode 100644 index 99c06f6..0000000 --- a/src/kb_engine/utils/markdown.py +++ /dev/null @@ -1,137 +0,0 @@ -"""Markdown parsing utilities.""" - -import re -import unicodedata -from typing import Any - -import frontmatter - - -def extract_frontmatter(content: str) -> tuple[dict[str, Any], str]: - """Extract YAML frontmatter from markdown content. 
- - Returns a tuple of (metadata dict, content without frontmatter). - """ - try: - post = frontmatter.loads(content) - return dict(post.metadata), post.content - except Exception: - return {}, content - - -def parse_markdown_sections( - content: str, -) -> list[tuple[list[str], str]]: - """Parse markdown content into sections with heading paths. - - Returns a list of (heading_path, section_content) tuples. - """ - sections: list[tuple[list[str], str]] = [] - current_path: list[str] = [] - current_content: list[str] = [] - current_levels: list[int] = [] - - lines = content.split("\n") - - for line in lines: - if line.startswith("#"): - # Save previous section - section_text = "\n".join(current_content).strip() - if section_text: - sections.append((list(current_path), section_text)) - current_content = [] - - # Parse heading - level = len(line) - len(line.lstrip("#")) - heading_text = line.lstrip("#").strip() - - # Update path - while current_levels and current_levels[-1] >= level: - current_levels.pop() - if current_path: - current_path.pop() - - current_path.append(heading_text) - current_levels.append(level) - else: - current_content.append(line) - - # Don't forget last section - section_text = "\n".join(current_content).strip() - if section_text: - sections.append((list(current_path), section_text)) - - return sections - - -def heading_to_anchor(heading: str) -> str: - """Convert a heading to a GitHub-compatible anchor. - - Algorithm matches GitHub's anchor generation: - 1. Convert to lowercase - 2. Remove anything that is not a letter, number, space, or hyphen - 3. Replace spaces with hyphens - 4. 
Strip leading/trailing hyphens - - Examples: - "Atributos" -> "atributos" - "Ciclo de Vida" -> "ciclo-de-vida" - "Entity: User" -> "entity-user" - "## Estados (v2)" -> "estados-v2" - """ - # Normalize unicode characters - text = unicodedata.normalize("NFKD", heading) - # Lowercase - text = text.lower() - # Remove anything not alphanumeric, space, or hyphen - text = re.sub(r"[^\w\s-]", "", text) - # Replace whitespace with hyphens - text = re.sub(r"[\s]+", "-", text) - # Strip leading/trailing hyphens - text = text.strip("-") - return text - - -def heading_path_to_anchor(heading_path: list[str]) -> str | None: - """Convert a heading path to an anchor using the most specific heading. - - Uses the last element of the heading_path (most specific section). - Returns None if the heading_path is empty. - """ - if not heading_path: - return None - return heading_to_anchor(heading_path[-1]) - - -def extract_snippet(content: str, max_length: int = 200) -> str: - """Extract a snippet for preview from content. - - Truncates at sentence or word boundary, adds ellipsis if truncated. - """ - # Strip markdown formatting for cleaner snippets - text = content.strip() - # Remove heading markers - text = re.sub(r"^#+\s+", "", text, flags=re.MULTILINE) - # Remove bold/italic markers - text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text) - text = re.sub(r"\*([^*]+)\*", r"\1", text) - # Remove link syntax, keep text - text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text) - # Collapse whitespace - text = re.sub(r"\s+", " ", text).strip() - - if len(text) <= max_length: - return text - - # Try to break at sentence boundary - truncated = text[:max_length] - last_period = truncated.rfind(". ") - if last_period > max_length // 2: - return truncated[: last_period + 1] - - # Break at word boundary - last_space = truncated.rfind(" ") - if last_space > max_length // 2: - return truncated[:last_space] + "..." - - return truncated + "..." 
diff --git a/src/kb_engine/utils/tokenization.py b/src/kb_engine/utils/tokenization.py deleted file mode 100644 index e432ffb..0000000 --- a/src/kb_engine/utils/tokenization.py +++ /dev/null @@ -1,38 +0,0 @@ -"""Tokenization utilities. - -Uses a simple word-based estimation when tiktoken is not available. -""" - - -def count_tokens(text: str, model: str = "gpt-4") -> int: - """Count the approximate number of tokens in text. - - Uses tiktoken if available, otherwise falls back to a simple - word-based estimation (1 token ~ 0.75 words for English). - """ - try: - import tiktoken - - encoding = tiktoken.encoding_for_model(model) - return len(encoding.encode(text)) - except (ImportError, KeyError): - # Rough estimation: ~4 chars per token on average - return max(1, len(text) // 4) - - -def truncate_to_tokens(text: str, max_tokens: int, model: str = "gpt-4") -> str: - """Truncate text to a maximum number of tokens.""" - try: - import tiktoken - - encoding = tiktoken.encoding_for_model(model) - tokens = encoding.encode(text) - if len(tokens) <= max_tokens: - return text - return encoding.decode(tokens[:max_tokens]) - except (ImportError, KeyError): - # Rough estimation: ~4 chars per token - max_chars = max_tokens * 4 - if len(text) <= max_chars: - return text - return text[:max_chars] diff --git a/src/kdd/__init__.py b/src/kdd/__init__.py deleted file mode 100644 index 5d88bc4..0000000 --- a/src/kdd/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""KDD — Knowledge-Driven Development retrieval engine for AI agents.""" - -__version__ = "0.1.0" diff --git a/src/kdd/api/__init__.py b/src/kdd/api/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/kdd/api/cli.py b/src/kdd/api/cli.py deleted file mode 100644 index 1bbac83..0000000 --- a/src/kdd/api/cli.py +++ /dev/null @@ -1,438 +0,0 @@ -"""KDD CLI — Click-based command-line interface. - -Entry point: ``kdd`` command group. 
- -Commands: - kdd index Index all specs (CMD-001/CMD-002) - kdd search Hybrid search (QRY-003) - kdd graph Graph traversal (QRY-001) - kdd impact Impact analysis (QRY-004) - kdd coverage Coverage analysis (QRY-005) - kdd violations Layer violations (QRY-006) - kdd merge Merge indices (CMD-004) - kdd status Show index status -""" - -from __future__ import annotations - -import json -import sys -from pathlib import Path - -import click - -from kdd.container import create_container - - -@click.group() -@click.version_option(version="1.0.0", prog_name="kdd") -def cli(): - """KDD — Knowledge-Driven Development retrieval engine.""" - pass - - -# --------------------------------------------------------------------------- -# kdd index -# --------------------------------------------------------------------------- - - -@cli.command() -@click.argument("specs_path", type=click.Path(exists=True)) -@click.option("--index-path", type=click.Path(), default=None, help="Output .kdd-index/ path") -@click.option("--incremental/--full", default=True, help="Incremental (default) or full reindex") -@click.option("--domain", default=None, help="Domain name for multi-domain support") -def index(specs_path: str, index_path: str | None, incremental: bool, domain: str | None): - """Index KDD specs into .kdd-index/ artifacts.""" - specs_root = Path(specs_path).resolve() - idx_path = Path(index_path) if index_path else None - container = create_container(specs_root, idx_path) - - if incremental: - from kdd.application.commands.index_incremental import index_incremental - - result = index_incremental( - specs_root, - registry=container.registry, - artifact_store=container.artifact_store, - event_bus=container.event_bus, - embedding_model=container.embedding_model, - index_level=container.index_level, - domain=domain, - ) - click.echo(f"Indexed: {result.indexed} Deleted: {result.deleted} " - f"Skipped: {result.skipped} Errors: {result.errors}") - if result.is_full_reindex: - click.echo("(full 
reindex — no prior manifest found)") - else: - from kdd.application.commands.index_incremental import index_incremental - - result = index_incremental( - specs_root, - registry=container.registry, - artifact_store=container.artifact_store, - event_bus=container.event_bus, - embedding_model=container.embedding_model, - index_level=container.index_level, - domain=domain, - ) - click.echo(f"Full index: {result.indexed} documents " - f"Skipped: {result.skipped} Errors: {result.errors}") - - click.echo(f"Index level: {container.index_level.value}") - click.echo(f"Index path: {container.index_path}") - - -# --------------------------------------------------------------------------- -# kdd search -# --------------------------------------------------------------------------- - - -@cli.command() -@click.argument("query") -@click.option("--limit", "-n", default=10, help="Max results") -@click.option("--min-score", default=0.5, help="Minimum score threshold") -@click.option("--depth", default=2, help="Graph expansion depth") -@click.option("--no-graph", is_flag=True, help="Disable graph expansion") -@click.option("--kind", multiple=True, help="Filter by kind (repeatable)") -@click.option("--index-path", type=click.Path(), default=None) -@click.option("--specs-path", type=click.Path(exists=True), default=".") -@click.option("--json-output", is_flag=True, help="Output as JSON") -def search(query: str, limit: int, min_score: float, depth: int, no_graph: bool, - kind: tuple, index_path: str | None, specs_path: str, json_output: bool): - """Search the KDD index (hybrid: semantic + graph + lexical).""" - from kdd.application.queries.retrieve_hybrid import HybridQueryInput, retrieve_hybrid - from kdd.domain.enums import KDDKind - - specs_root = Path(specs_path).resolve() - idx_path = Path(index_path) if index_path else None - container = create_container(specs_root, idx_path) - - if not container.ensure_loaded(): - click.echo("Error: No index found. 
Run 'kdd index' first.", err=True) - sys.exit(1) - - include_kinds = [KDDKind(k) for k in kind] if kind else None - - result = retrieve_hybrid( - HybridQueryInput( - query_text=query, - expand_graph=not no_graph, - depth=depth, - include_kinds=include_kinds, - min_score=min_score, - limit=limit, - ), - container.graph_store, - container.vector_store, - container.embedding_model, - ) - - if json_output: - data = { - "total_results": result.total_results, - "total_tokens": result.total_tokens, - "warnings": result.warnings, - "results": [ - {"node_id": r.node_id, "score": round(r.score, 4), - "match_source": r.match_source, "snippet": r.snippet} - for r in result.results - ], - } - click.echo(json.dumps(data, indent=2)) - else: - if result.warnings: - for w in result.warnings: - click.echo(f" Warning: {w}", err=True) - click.echo(f"Found {result.total_results} results:\n") - for r in result.results: - score_bar = "█" * int(r.score * 10) - click.echo(f" {r.score:.3f} {score_bar} {r.node_id}") - if r.snippet: - click.echo(f" {r.snippet}") - click.echo(f" source: {r.match_source}") - click.echo() - - -# --------------------------------------------------------------------------- -# kdd graph -# --------------------------------------------------------------------------- - - -@cli.command() -@click.argument("node_id") -@click.option("--depth", "-d", default=2, help="Traversal depth") -@click.option("--edge-type", multiple=True, help="Filter edge types") -@click.option("--index-path", type=click.Path(), default=None) -@click.option("--specs-path", type=click.Path(exists=True), default=".") -def graph(node_id: str, depth: int, edge_type: tuple, index_path: str | None, specs_path: str): - """Traverse the knowledge graph from a root node (QRY-001).""" - from kdd.application.queries.retrieve_graph import GraphQueryInput, retrieve_by_graph - - specs_root = Path(specs_path).resolve() - idx_path = Path(index_path) if index_path else None - container = create_container(specs_root, 
idx_path) - - if not container.ensure_loaded(): - click.echo("Error: No index found. Run 'kdd index' first.", err=True) - sys.exit(1) - - edge_types = list(edge_type) if edge_type else None - try: - result = retrieve_by_graph( - GraphQueryInput(root_node=node_id, depth=depth, edge_types=edge_types), - container.graph_store, - ) - except ValueError as e: - click.echo(f"Error: {e}", err=True) - sys.exit(1) - - click.echo(f"Center: {result.center_node.id if result.center_node else '?'}") - click.echo(f"Related nodes: {result.total_nodes} Edges: {result.total_edges}\n") - for r in result.related_nodes: - click.echo(f" {r.score:.3f} {r.node_id} ({r.snippet})") - - -# --------------------------------------------------------------------------- -# kdd impact -# --------------------------------------------------------------------------- - - -@cli.command() -@click.argument("node_id") -@click.option("--depth", "-d", default=3, help="Analysis depth") -@click.option("--index-path", type=click.Path(), default=None) -@click.option("--specs-path", type=click.Path(exists=True), default=".") -def impact(node_id: str, depth: int, index_path: str | None, specs_path: str): - """Analyze the impact of changing a node (QRY-004).""" - from kdd.application.queries.retrieve_impact import ImpactQueryInput, retrieve_impact - - specs_root = Path(specs_path).resolve() - idx_path = Path(index_path) if index_path else None - container = create_container(specs_root, idx_path) - - if not container.ensure_loaded(): - click.echo("Error: No index found.", err=True) - sys.exit(1) - - try: - result = retrieve_impact( - ImpactQueryInput(node_id=node_id, depth=depth), - container.graph_store, - ) - except ValueError as e: - click.echo(f"Error: {e}", err=True) - sys.exit(1) - - click.echo(f"Impact analysis for: {node_id}\n") - click.echo(f"Directly affected: {result.total_directly}") - for a in result.directly_affected: - click.echo(f" {a.node_id} [{a.edge_type}] — {a.impact_description}") - - if 
result.transitively_affected: - click.echo(f"\nTransitively affected: {result.total_transitively}") - for t in result.transitively_affected: - path_str = " → ".join(t.path) - click.echo(f" {t.node_id} via {path_str}") - - if result.scenarios_to_rerun: - click.echo(f"\nBDD scenarios to re-run: {len(result.scenarios_to_rerun)}") - for s in result.scenarios_to_rerun: - click.echo(f" {s.scenario_name} — {s.reason}") - - -# --------------------------------------------------------------------------- -# kdd coverage -# --------------------------------------------------------------------------- - - -@cli.command() -@click.argument("node_id") -@click.option("--index-path", type=click.Path(), default=None) -@click.option("--specs-path", type=click.Path(exists=True), default=".") -def coverage(node_id: str, index_path: str | None, specs_path: str): - """Check governance coverage for a node (QRY-005).""" - from kdd.application.queries.retrieve_coverage import CoverageQueryInput, retrieve_coverage - - specs_root = Path(specs_path).resolve() - idx_path = Path(index_path) if index_path else None - container = create_container(specs_root, idx_path) - - if not container.ensure_loaded(): - click.echo("Error: No index found.", err=True) - sys.exit(1) - - try: - result = retrieve_coverage( - CoverageQueryInput(node_id=node_id), - container.graph_store, - ) - except ValueError as e: - click.echo(f"Error: {e}", err=True) - sys.exit(1) - - click.echo(f"Coverage for {node_id}: {result.coverage_percent:.0f}%\n") - for cat in result.categories: - icon = "✓" if cat.status == "covered" else "✗" - click.echo(f" {icon} {cat.name}: {cat.status}") - if cat.found: - for fid in cat.found: - click.echo(f" → {fid}") - - -# --------------------------------------------------------------------------- -# kdd violations -# --------------------------------------------------------------------------- - - -@cli.command() -@click.option("--index-path", type=click.Path(), default=None) 
-@click.option("--specs-path", type=click.Path(exists=True), default=".") -def violations(index_path: str | None, specs_path: str): - """List all layer dependency violations (QRY-006).""" - from kdd.application.queries.retrieve_violations import ( - ViolationsQueryInput, - retrieve_violations, - ) - - specs_root = Path(specs_path).resolve() - idx_path = Path(index_path) if index_path else None - container = create_container(specs_root, idx_path) - - if not container.ensure_loaded(): - click.echo("Error: No index found.", err=True) - sys.exit(1) - - result = retrieve_violations(ViolationsQueryInput(), container.graph_store) - - click.echo(f"Total edges: {result.total_edges_analyzed}") - click.echo(f"Violations: {result.total_violations} ({result.violation_rate:.1f}%)\n") - - for v in result.violations: - click.echo(f" {v.from_node} ({v.from_layer.value}) → " - f"{v.to_node} ({v.to_layer.value}) [{v.edge_type}]") - - -# --------------------------------------------------------------------------- -# kdd merge -# --------------------------------------------------------------------------- - - -@cli.command() -@click.argument("sources", nargs=-1, required=True) -@click.option("-o", "--output", required=True, type=click.Path(), help="Output .kdd-index/ path") -@click.option("--strategy", default="last_write_wins", - type=click.Choice(["last_write_wins", "fail_on_conflict"])) -def merge(sources: tuple, output: str, strategy: str): - """Merge multiple .kdd-index/ directories (CMD-004).""" - from kdd.application.commands.merge_index import merge_index - - source_paths = [Path(s) for s in sources] - result = merge_index(source_paths, Path(output), conflict_strategy=strategy) - - if result.success: - click.echo(f"Merge successful: {result.total_nodes} nodes, " - f"{result.total_edges} edges, " - f"{result.conflicts_resolved} conflicts resolved") - click.echo(f"Output: {output}") - else: - click.echo(f"Merge failed: {result.error}", err=True) - sys.exit(1) - - -# 
--------------------------------------------------------------------------- -# kdd enrich -# --------------------------------------------------------------------------- - - -@cli.command() -@click.argument("node_id") -@click.option("--timeout", default=120, help="Claude CLI timeout in seconds") -@click.option("--model", default=None, help="Claude model override (e.g. sonnet)") -@click.option("--index-path", type=click.Path(), default=None) -@click.option("--specs-path", type=click.Path(exists=True), default=".") -@click.option("--json-output", is_flag=True, help="Output as JSON") -def enrich(node_id: str, timeout: int, model: str | None, - index_path: str | None, specs_path: str, json_output: bool): - """Enrich a node with AI agent analysis (CMD-003 / UC-003).""" - from kdd.application.commands.enrich_with_agent import enrich_with_agent - - specs_root = Path(specs_path).resolve() - idx_path = Path(index_path) if index_path else None - container = create_container(specs_root, idx_path) - - if not container.ensure_loaded(): - click.echo("Error: No index found. Run 'kdd index' first.", err=True) - sys.exit(1) - - if container.agent_client is None: - click.echo( - "Error: Claude CLI not found. 
Install it from " - "https://docs.anthropic.com/en/docs/claude-code", - err=True, - ) - sys.exit(1) - - # Apply overrides - if hasattr(container.agent_client, "timeout") and timeout != 120: - container.agent_client.timeout = timeout - if hasattr(container.agent_client, "model") and model: - container.agent_client.model = model - - result = enrich_with_agent( - node_id, - artifact_store=container.artifact_store, - agent_client=container.agent_client, - specs_root=specs_root, - ) - - if not result.success: - click.echo(f"Error: {result.error}", err=True) - sys.exit(1) - - if json_output: - click.echo(json.dumps(result.enrichment, indent=2, default=str)) - else: - enrichment = result.enrichment or {} - click.echo(f"Enrichment for: {node_id}\n") - if enrichment.get("summary"): - click.echo(f"Summary: {enrichment['summary']}\n") - click.echo(f"Implicit edges added: {result.implicit_edges}") - impact = enrichment.get("impact_analysis", {}) - if impact: - click.echo(f"Change risk: {impact.get('change_risk', 'unknown')}") - click.echo(f"Reason: {impact.get('reason', '')}") - - -# --------------------------------------------------------------------------- -# kdd status -# --------------------------------------------------------------------------- - - -@cli.command() -@click.option("--index-path", type=click.Path(), default=None) -@click.option("--specs-path", type=click.Path(exists=True), default=".") -def status(index_path: str | None, specs_path: str): - """Show index status and statistics.""" - specs_root = Path(specs_path).resolve() - idx_path = Path(index_path) if index_path else None - container = create_container(specs_root, idx_path) - - manifest = container.artifact_store.read_manifest() - if manifest is None: - click.echo("No index found. 
Run 'kdd index ' to create one.") - return - - click.echo(f"Index path: {container.index_path}") - click.echo(f"Version: {manifest.version}") - click.echo(f"Index level: {manifest.index_level.value}") - click.echo(f"Indexed at: {manifest.indexed_at}") - click.echo(f"Indexed by: {manifest.indexed_by}") - click.echo(f"Git commit: {manifest.git_commit or 'N/A'}") - click.echo(f"Structure: {manifest.structure}") - click.echo(f"Nodes: {manifest.stats.nodes}") - click.echo(f"Edges: {manifest.stats.edges}") - click.echo(f"Embeddings: {manifest.stats.embeddings}") - if manifest.embedding_model: - click.echo(f"Embed model: {manifest.embedding_model}") - if manifest.domains: - click.echo(f"Domains: {', '.join(manifest.domains)}") diff --git a/src/kdd/api/server.py b/src/kdd/api/server.py deleted file mode 100644 index 6e271cc..0000000 --- a/src/kdd/api/server.py +++ /dev/null @@ -1,409 +0,0 @@ -"""KDD REST API — FastAPI server. - -Provides the ``/v1/retrieve/*`` endpoints for AI agent integration. 
- -Endpoints: - POST /v1/retrieve/search (QRY-002: semantic search) - POST /v1/retrieve/context (QRY-003: hybrid search — primary endpoint) - GET /v1/retrieve/graph (QRY-001: graph traversal) - GET /v1/retrieve/impact (QRY-004: impact analysis) - GET /v1/retrieve/coverage (QRY-005: governance coverage) - GET /v1/retrieve/layer-violations (QRY-006: layer violations) -""" - -from __future__ import annotations - -from pathlib import Path -from typing import Annotated - -from fastapi import Depends, FastAPI, HTTPException, Query -from pydantic import BaseModel, Field - -from kdd.container import Container, create_container - -# --------------------------------------------------------------------------- -# Application factory -# --------------------------------------------------------------------------- - -app = FastAPI( - title="KDD Retrieval API", - version="1.0.0", - description="Knowledge-Driven Development retrieval engine.", -) - - -def _get_container() -> Container: - """Dependency injection: resolve the global container. - - Override ``app.dependency_overrides[_get_container]`` in tests. - """ - if not hasattr(app.state, "container"): - raise HTTPException(503, "Index not loaded. 
Start server with --specs-path.") - return app.state.container - - -# --------------------------------------------------------------------------- -# Request / Response schemas -# --------------------------------------------------------------------------- - - -class SearchRequest(BaseModel): - query_text: str = Field(..., min_length=3) - include_kinds: list[str] | None = None - include_layers: list[str] | None = None - min_score: float = 0.7 - limit: int = 10 - - -class ContextRequest(BaseModel): - query_text: str = Field(..., min_length=3) - expand_graph: bool = True - depth: int = 2 - include_kinds: list[str] | None = None - include_layers: list[str] | None = None - respect_layers: bool = True - min_score: float = 0.5 - limit: int = 10 - max_tokens: int = 8000 - - -class ScoredNodeResponse(BaseModel): - node_id: str - score: float - snippet: str | None = None - match_source: str - - -class SearchResponse(BaseModel): - results: list[ScoredNodeResponse] - total_results: int - embedding_model: str | None = None - - -class ContextResponse(BaseModel): - results: list[ScoredNodeResponse] - total_results: int - total_tokens: int - warnings: list[str] - - -class GraphNodeResponse(BaseModel): - node_id: str - score: float - snippet: str | None = None - match_source: str = "graph" - - -class GraphResponse(BaseModel): - center_node: str | None - related_nodes: list[GraphNodeResponse] - total_nodes: int - total_edges: int - - -class AffectedNodeResponse(BaseModel): - node_id: str - kind: str - edge_type: str - impact_description: str - - -class TransitiveResponse(BaseModel): - node_id: str - kind: str - path: list[str] - - -class ScenarioResponse(BaseModel): - node_id: str - scenario_name: str - reason: str - - -class ImpactResponse(BaseModel): - analyzed_node: str | None - directly_affected: list[AffectedNodeResponse] - transitively_affected: list[TransitiveResponse] - scenarios_to_rerun: list[ScenarioResponse] - total_directly: int - total_transitively: int - - -class 
CoverageCategoryResponse(BaseModel): - name: str - status: str - found: list[str] - - -class CoverageResponse(BaseModel): - node_id: str - coverage_percent: float - categories: list[CoverageCategoryResponse] - - -class ViolationResponse(BaseModel): - from_node: str - to_node: str - from_layer: str - to_layer: str - edge_type: str - - -class ViolationsResponse(BaseModel): - violations: list[ViolationResponse] - total_violations: int - total_edges_analyzed: int - violation_rate: float - - -# --------------------------------------------------------------------------- -# Endpoints -# --------------------------------------------------------------------------- - - -@app.post("/v1/retrieve/search", response_model=SearchResponse) -def retrieve_search( - body: SearchRequest, - container: Container = Depends(_get_container), -): - """QRY-002: Semantic search over embeddings.""" - if container.embedding_model is None or container.vector_store is None: - raise HTTPException(400, "Semantic search requires L2 index (embeddings).") - - if not container.ensure_loaded(): - raise HTTPException(503, "Index not loaded.") - - from kdd.application.queries.retrieve_semantic import SemanticQueryInput, retrieve_semantic - from kdd.domain.enums import KDDKind, KDDLayer - - include_kinds = [KDDKind(k) for k in body.include_kinds] if body.include_kinds else None - include_layers = [KDDLayer(l) for l in body.include_layers] if body.include_layers else None - - try: - result = retrieve_semantic( - SemanticQueryInput( - query_text=body.query_text, - include_kinds=include_kinds, - include_layers=include_layers, - min_score=body.min_score, - limit=body.limit, - ), - container.embedding_model, - container.vector_store, - container.graph_store, - ) - except ValueError as e: - raise HTTPException(400, str(e)) - - return SearchResponse( - results=[ - ScoredNodeResponse( - node_id=r.node_id, score=round(r.score, 4), - snippet=r.snippet, match_source=r.match_source, - ) - for r in result.results - ], - 
total_results=result.total_results, - embedding_model=result.embedding_model, - ) - - -@app.post("/v1/retrieve/context", response_model=ContextResponse) -def retrieve_context( - body: ContextRequest, - container: Container = Depends(_get_container), -): - """QRY-003: Hybrid search (primary endpoint for AI agents).""" - if not container.ensure_loaded(): - raise HTTPException(503, "Index not loaded.") - - from kdd.application.queries.retrieve_hybrid import HybridQueryInput, retrieve_hybrid - from kdd.domain.enums import KDDKind, KDDLayer - - include_kinds = [KDDKind(k) for k in body.include_kinds] if body.include_kinds else None - include_layers = [KDDLayer(l) for l in body.include_layers] if body.include_layers else None - - try: - result = retrieve_hybrid( - HybridQueryInput( - query_text=body.query_text, - expand_graph=body.expand_graph, - depth=body.depth, - include_kinds=include_kinds, - include_layers=include_layers, - respect_layers=body.respect_layers, - min_score=body.min_score, - limit=body.limit, - max_tokens=body.max_tokens, - ), - container.graph_store, - container.vector_store, - container.embedding_model, - ) - except ValueError as e: - raise HTTPException(400, str(e)) - - return ContextResponse( - results=[ - ScoredNodeResponse( - node_id=r.node_id, score=round(r.score, 4), - snippet=r.snippet, match_source=r.match_source, - ) - for r in result.results - ], - total_results=result.total_results, - total_tokens=result.total_tokens, - warnings=result.warnings, - ) - - -@app.get("/v1/retrieve/graph", response_model=GraphResponse) -def retrieve_graph( - node_id: str, - depth: int = 2, - edge_type: Annotated[list[str] | None, Query()] = None, - container: Container = Depends(_get_container), -): - """QRY-001: Graph traversal from a root node.""" - if not container.ensure_loaded(): - raise HTTPException(503, "Index not loaded.") - - from kdd.application.queries.retrieve_graph import GraphQueryInput, retrieve_by_graph - - try: - result = retrieve_by_graph( - 
GraphQueryInput(root_node=node_id, depth=depth, edge_types=edge_type), - container.graph_store, - ) - except ValueError as e: - raise HTTPException(404, str(e)) - - return GraphResponse( - center_node=result.center_node.id if result.center_node else None, - related_nodes=[ - GraphNodeResponse( - node_id=r.node_id, score=round(r.score, 4), snippet=r.snippet, - ) - for r in result.related_nodes - ], - total_nodes=result.total_nodes, - total_edges=result.total_edges, - ) - - -@app.get("/v1/retrieve/impact", response_model=ImpactResponse) -def retrieve_impact( - node_id: str, - depth: int = 3, - container: Container = Depends(_get_container), -): - """QRY-004: Impact analysis for a node.""" - if not container.ensure_loaded(): - raise HTTPException(503, "Index not loaded.") - - from kdd.application.queries.retrieve_impact import ImpactQueryInput, retrieve_impact - - try: - result = retrieve_impact( - ImpactQueryInput(node_id=node_id, depth=depth), - container.graph_store, - ) - except ValueError as e: - raise HTTPException(404, str(e)) - - return ImpactResponse( - analyzed_node=result.analyzed_node.id if result.analyzed_node else None, - directly_affected=[ - AffectedNodeResponse( - node_id=a.node_id, kind=a.kind, - edge_type=a.edge_type, impact_description=a.impact_description, - ) - for a in result.directly_affected - ], - transitively_affected=[ - TransitiveResponse(node_id=t.node_id, kind=t.kind, path=t.path) - for t in result.transitively_affected - ], - scenarios_to_rerun=[ - ScenarioResponse( - node_id=s.node_id, scenario_name=s.scenario_name, reason=s.reason, - ) - for s in result.scenarios_to_rerun - ], - total_directly=result.total_directly, - total_transitively=result.total_transitively, - ) - - -@app.get("/v1/retrieve/coverage", response_model=CoverageResponse) -def retrieve_coverage( - node_id: str, - container: Container = Depends(_get_container), -): - """QRY-005: Governance coverage analysis.""" - if not container.ensure_loaded(): - raise 
HTTPException(503, "Index not loaded.") - - from kdd.application.queries.retrieve_coverage import CoverageQueryInput, retrieve_coverage - - try: - result = retrieve_coverage( - CoverageQueryInput(node_id=node_id), - container.graph_store, - ) - except ValueError as e: - raise HTTPException(404, str(e)) - - return CoverageResponse( - node_id=node_id, - coverage_percent=result.coverage_percent, - categories=[ - CoverageCategoryResponse(name=c.name, status=c.status, found=c.found) - for c in result.categories - ], - ) - - -@app.get("/v1/retrieve/layer-violations", response_model=ViolationsResponse) -def retrieve_violations( - container: Container = Depends(_get_container), -): - """QRY-006: List all layer dependency violations.""" - if not container.ensure_loaded(): - raise HTTPException(503, "Index not loaded.") - - from kdd.application.queries.retrieve_violations import ( - ViolationsQueryInput, - retrieve_violations, - ) - - result = retrieve_violations(ViolationsQueryInput(), container.graph_store) - - return ViolationsResponse( - violations=[ - ViolationResponse( - from_node=v.from_node, to_node=v.to_node, - from_layer=v.from_layer.value, to_layer=v.to_layer.value, - edge_type=v.edge_type, - ) - for v in result.violations - ], - total_violations=result.total_violations, - total_edges_analyzed=result.total_edges_analyzed, - violation_rate=result.violation_rate, - ) - - -# --------------------------------------------------------------------------- -# Server entry point -# --------------------------------------------------------------------------- - - -def create_app(specs_path: str = ".", index_path: str | None = None) -> FastAPI: - """Create and configure the FastAPI app with a loaded container.""" - specs_root = Path(specs_path).resolve() - idx_path = Path(index_path) if index_path else None - container = create_container(specs_root, idx_path) - container.ensure_loaded() - app.state.container = container - return app diff --git a/src/kdd/application/__init__.py 
b/src/kdd/application/__init__.py deleted file mode 100644 index 73e4537..0000000 --- a/src/kdd/application/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""KDD application layer — extractors, commands, and queries.""" diff --git a/src/kdd/application/chunking.py b/src/kdd/application/chunking.py deleted file mode 100644 index a8b722a..0000000 --- a/src/kdd/application/chunking.py +++ /dev/null @@ -1,177 +0,0 @@ -"""Hierarchical chunking for embedding generation (BR-EMBEDDING-001). - -Selects embeddable sections per kind, splits them into paragraph-level -chunks, and enriches each chunk with context (document identity + ancestry). -""" - -from __future__ import annotations - -from dataclasses import dataclass, field - -from kdd.domain.entities import KDDDocument, Section -from kdd.domain.enums import KDDKind -from kdd.domain.rules import embeddable_sections - - -@dataclass(frozen=True) -class Chunk: - """A text chunk ready for embedding.""" - - chunk_id: str - document_id: str - section_heading: str - content: str - context_text: str - char_offset: int = 0 - - -def chunk_document( - document: KDDDocument, - *, - max_chunk_chars: int = 1500, - overlap_chars: int = 200, -) -> list[Chunk]: - """Chunk a document's embeddable sections into embedding-ready pieces. - - Steps: - 1. Identify embeddable sections via BR-EMBEDDING-001. - 2. For each section, split into paragraph chunks. - 3. Enrich each chunk with context (document identity + section heading). - - Returns an empty list if the kind has no embeddable sections (e.g. event). 
- """ - allowed = embeddable_sections(document.kind) - if not allowed: - return [] - - # Build document identity context - identity = _build_identity(document) - - chunks: list[Chunk] = [] - chunk_idx = 0 - - for section in document.sections: - if section.heading.lower() not in allowed: - continue - if not section.content.strip(): - continue - - paragraphs = _split_paragraphs(section.content, max_chunk_chars, overlap_chars) - - for offset, text in paragraphs: - context = f"{identity}\nSection: {section.heading}\n\n{text}" - chunks.append(Chunk( - chunk_id=f"{document.id}:chunk-{chunk_idx}", - document_id=document.id, - section_heading=section.heading, - content=text, - context_text=context, - char_offset=offset, - )) - chunk_idx += 1 - - return chunks - - -def _build_identity(document: KDDDocument) -> str: - """Build a concise identity string for context enrichment.""" - parts = [ - f"Document: {document.id}", - f"Kind: {document.kind.value}", - f"Layer: {document.layer.value}", - ] - title = document.front_matter.get("title") - if title: - parts.append(f"Title: {title}") - return "\n".join(parts) - - -def _split_paragraphs( - content: str, - max_chars: int, - overlap: int, -) -> list[tuple[int, str]]: - """Split content into paragraph-boundary chunks. - - Returns list of (char_offset, text) tuples. - - Strategy: - - Split on double newlines (paragraph boundaries). - - Accumulate paragraphs until max_chars is reached. - - When a single paragraph exceeds max_chars, split at sentence - boundaries within it. 
- """ - paragraphs = content.split("\n\n") - results: list[tuple[int, str]] = [] - - current_parts: list[str] = [] - current_len = 0 - current_offset = 0 - char_pos = 0 - - for para in paragraphs: - para = para.strip() - if not para: - char_pos += 2 # account for \n\n - continue - - para_len = len(para) - - if current_len + para_len + 2 > max_chars and current_parts: - # Flush current accumulation - results.append((current_offset, "\n\n".join(current_parts))) - # Overlap: keep last part if it fits - if overlap > 0 and current_parts: - last = current_parts[-1] - if len(last) <= overlap: - current_parts = [last] - current_len = len(last) - current_offset = char_pos - len(last) - 2 - else: - current_parts = [] - current_len = 0 - current_offset = char_pos - else: - current_parts = [] - current_len = 0 - current_offset = char_pos - - if para_len > max_chars and not current_parts: - # Single paragraph too large — split at sentence boundaries - sentences = _split_sentences(para) - sent_buf: list[str] = [] - sent_len = 0 - sent_offset = char_pos - - for sent in sentences: - if sent_len + len(sent) + 1 > max_chars and sent_buf: - results.append((sent_offset, " ".join(sent_buf))) - sent_buf = [] - sent_len = 0 - sent_offset = char_pos - sent_buf.append(sent) - sent_len += len(sent) + 1 - - if sent_buf: - current_parts = sent_buf - current_len = sent_len - current_offset = sent_offset - else: - if not current_parts: - current_offset = char_pos - current_parts.append(para) - current_len += para_len + 2 - - char_pos += para_len + 2 - - if current_parts: - results.append((current_offset, "\n\n".join(current_parts))) - - return results - - -def _split_sentences(text: str) -> list[str]: - """Naive sentence splitter: split on `. 
` or `.\n`.""" - import re - parts = re.split(r"(?<=\.)\s+", text) - return [p.strip() for p in parts if p.strip()] diff --git a/src/kdd/application/commands/__init__.py b/src/kdd/application/commands/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/kdd/application/commands/enrich_with_agent.py b/src/kdd/application/commands/enrich_with_agent.py deleted file mode 100644 index 2fb5a4e..0000000 --- a/src/kdd/application/commands/enrich_with_agent.py +++ /dev/null @@ -1,122 +0,0 @@ -"""CMD-003 — EnrichWithAgent command. - -Enriches an existing GraphNode using the developer's AI agent (L3). -Completely optional — requires API key and L2+ index. - -Spec: specs/02-behavior/commands/CMD-003-EnrichWithAgent.md -""" - -from __future__ import annotations - -import json -import logging -from dataclasses import dataclass -from datetime import datetime -from pathlib import Path - -from kdd.domain.entities import GraphEdge, GraphNode -from kdd.domain.ports import AgentClient, ArtifactStore - -logger = logging.getLogger(__name__) - - -@dataclass -class EnrichResult: - success: bool - enrichment: dict | None = None - implicit_edges: int = 0 - error: str | None = None - - -def enrich_with_agent( - node_id: str, - *, - artifact_store: ArtifactStore, - agent_client: AgentClient, - specs_root: Path, -) -> EnrichResult: - """Enrich a graph node using an AI agent (CMD-003 / UC-003). - - Reads the source document, builds a prompt with existing graph context, - and asks the agent to produce an improved summary + implicit relations. - """ - # 1. Find the node - node = artifact_store.read_node(node_id) - if node is None: - return EnrichResult(success=False, error=f"NODE_NOT_FOUND: {node_id}") - - # 2. Read source document - source_path = specs_root / node.source_file - if not source_path.exists(): - return EnrichResult(success=False, error=f"DOCUMENT_NOT_FOUND: {node.source_file}") - - content = source_path.read_text(encoding="utf-8") - - # 3. 
Build context (document + existing edges) - edges = artifact_store.read_edges() - related_edges = [ - e for e in edges if e.from_node == node_id or e.to_node == node_id - ] - context = _build_context(node, content, related_edges) - - # 4. Call agent - try: - enrichment = agent_client.enrich(node, context) - except Exception as e: - return EnrichResult(success=False, error=f"AGENT_ERROR: {e}") - - # 5. Store enrichment - enrichments_dir = Path(artifact_store.root) / "enrichments" if hasattr(artifact_store, "root") else None - if enrichments_dir: - enrichments_dir.mkdir(parents=True, exist_ok=True) - doc_id = node_id.split(":", 1)[-1] if ":" in node_id else node_id - out_path = enrichments_dir / f"{doc_id}.json" - out_path.write_text( - json.dumps(enrichment, indent=2, default=str), - encoding="utf-8", - ) - - # 6. Extract implicit relations from enrichment - implicit_edges: list[GraphEdge] = [] - for rel in enrichment.get("implicit_relations", []): - implicit_edges.append(GraphEdge( - from_node=node_id, - to_node=rel.get("target", ""), - edge_type=rel.get("type", "WIKI_LINK"), - source_file=node.source_file, - extraction_method="implicit", - metadata={"agent": "enrichment"}, - )) - - if implicit_edges: - artifact_store.append_edges(implicit_edges) - - return EnrichResult( - success=True, - enrichment=enrichment, - implicit_edges=len(implicit_edges), - ) - - -def _build_context( - node: GraphNode, - document_content: str, - related_edges: list[GraphEdge], -) -> str: - """Build a context string for the agent prompt.""" - parts = [ - f"# Node: {node.id}", - f"Kind: {node.kind.value}", - f"Layer: {node.layer.value}", - "", - "## Document Content", - document_content[:5000], # Truncate for token budget - "", - "## Existing Relations", - ] - for edge in related_edges[:20]: - direction = "->" if edge.from_node == node.id else "<-" - other = edge.to_node if edge.from_node == node.id else edge.from_node - parts.append(f" {direction} {other} [{edge.edge_type}]") - - return 
"\n".join(parts) diff --git a/src/kdd/application/commands/index_document.py b/src/kdd/application/commands/index_document.py deleted file mode 100644 index 07fe29c..0000000 --- a/src/kdd/application/commands/index_document.py +++ /dev/null @@ -1,196 +0,0 @@ -"""CMD-001 — IndexDocument command. - -Processes a single KDD spec file through the full indexing pipeline: -1. Read file and extract front-matter -2. Route document via BR-DOCUMENT-001 -3. Extract node + edges via kind-specific extractor -4. Validate layer dependencies (BR-LAYER-001) -5. Optional L2: chunk + embed (BR-EMBEDDING-001) -6. Write artifacts to ArtifactStore -7. Emit domain events -""" - -from __future__ import annotations - -import logging -from dataclasses import dataclass -from datetime import datetime -from pathlib import Path -from typing import Any - -from kdd.application.chunking import chunk_document -from kdd.application.extractors.registry import ExtractorRegistry -from kdd.domain.entities import Embedding, GraphEdge, GraphNode, KDDDocument -from kdd.domain.enums import IndexLevel, KDDKind, KDDLayer -from kdd.domain.events import DocumentDetected, DocumentIndexed, DocumentParsed -from kdd.domain.ports import ArtifactStore, EmbeddingModel, EventBus -from kdd.domain.rules import detect_layer, route_document -from kdd.infrastructure.parsing.hashing import compute_content_hash -from kdd.infrastructure.parsing.markdown import ( - extract_frontmatter, - parse_markdown_sections, -) -from kdd.infrastructure.parsing.wiki_links import extract_wiki_link_targets - -logger = logging.getLogger(__name__) - - -@dataclass -class IndexResult: - """Result of indexing a single document.""" - - success: bool - node_id: str | None = None - edge_count: int = 0 - embedding_count: int = 0 - skipped_reason: str | None = None - warning: str | None = None - - -def index_document( - file_path: Path, - *, - specs_root: Path, - registry: ExtractorRegistry, - artifact_store: ArtifactStore, - event_bus: EventBus | None = 
None, - embedding_model: EmbeddingModel | None = None, - index_level: IndexLevel = IndexLevel.L1, - domain: str | None = None, -) -> IndexResult: - """Index a single KDD spec file. - - Args: - file_path: Absolute path to the spec file. - specs_root: Root directory of specs (for relative path computation). - registry: Extractor registry with all kind extractors. - artifact_store: Store to write artifacts to. - event_bus: Optional event bus for domain events. - embedding_model: Optional embedding model for L2+ indexing. - index_level: Target index level (L1, L2, L3). - domain: Optional domain override. - - Returns: - IndexResult with success status and metadata. - """ - start = datetime.now() - - # 1. Read file - try: - content = file_path.read_text(encoding="utf-8") - except (FileNotFoundError, PermissionError) as e: - return IndexResult(success=False, skipped_reason=f"File error: {e}") - - # 2. Extract front-matter and route - front_matter, body = extract_frontmatter(content) - relative_path = str(file_path.relative_to(specs_root)) - route = route_document(front_matter, relative_path) - - if route.kind is None: - return IndexResult(success=False, skipped_reason="No valid kind in front-matter") - - # 3. Find extractor - extractor = registry.get(route.kind) - if extractor is None: - return IndexResult( - success=False, - skipped_reason=f"No extractor registered for kind '{route.kind.value}'", - ) - - # 4. 
Build KDDDocument - sections = parse_markdown_sections(body) - wiki_links = extract_wiki_link_targets(body) - layer = detect_layer(relative_path) or KDDLayer.DOMAIN - doc_id = front_matter.get("id", file_path.stem) - source_hash = compute_content_hash(content) - - # Emit DocumentDetected - if event_bus: - event_bus.publish(DocumentDetected( - source_path=relative_path, - source_hash=source_hash, - kind=route.kind, - layer=layer, - detected_at=start, - )) - - document = KDDDocument( - id=doc_id, - kind=route.kind, - source_path=relative_path, - source_hash=source_hash, - layer=layer, - front_matter=front_matter, - sections=sections, - wiki_links=wiki_links, - domain=domain, - ) - - # Emit DocumentParsed - if event_bus: - event_bus.publish(DocumentParsed( - source_path=relative_path, - kind=route.kind, - document_id=doc_id, - front_matter=front_matter, - section_count=len(sections), - wiki_link_count=len(wiki_links), - parsed_at=datetime.now(), - )) - - # 5. Extract node + edges - node = extractor.extract_node(document) - edges = extractor.extract_edges(document) - - # 6. Write node + edges to artifact store - artifact_store.write_node(node) - if edges: - artifact_store.append_edges(edges) - - # 7. 
Optional L2: chunk + embed - embeddings: list[Embedding] = [] - if index_level in (IndexLevel.L2, IndexLevel.L3) and embedding_model is not None: - chunks = chunk_document(document) - if chunks: - texts = [c.context_text for c in chunks] - vectors = embedding_model.encode(texts) - now = datetime.now() - for i, (chunk, vector) in enumerate(zip(chunks, vectors)): - embeddings.append(Embedding( - id=chunk.chunk_id, - document_id=doc_id, - document_kind=route.kind, - section_path=chunk.section_heading, - chunk_index=i, - raw_text=chunk.content, - context_text=chunk.context_text, - vector=vector, - model=embedding_model.model_name, - dimensions=embedding_model.dimensions, - text_hash=compute_content_hash(chunk.content), - generated_at=now, - )) - artifact_store.write_embeddings(embeddings) - - # 8. Emit DocumentIndexed - duration_ms = int((datetime.now() - start).total_seconds() * 1000) - if event_bus: - event_bus.publish(DocumentIndexed( - source_path=relative_path, - kind=route.kind, - document_id=doc_id, - node_id=node.id, - edge_count=len(edges), - embedding_count=len(embeddings), - index_level=index_level, - duration_ms=duration_ms, - indexed_at=datetime.now(), - )) - - return IndexResult( - success=True, - node_id=node.id, - edge_count=len(edges), - embedding_count=len(embeddings), - warning=route.warning, - ) diff --git a/src/kdd/application/commands/index_incremental.py b/src/kdd/application/commands/index_incremental.py deleted file mode 100644 index 2b9428e..0000000 --- a/src/kdd/application/commands/index_incremental.py +++ /dev/null @@ -1,186 +0,0 @@ -"""CMD-002 — IndexIncremental command. 
- -Uses git diff to identify changed files since the last indexed commit, -then processes only new/modified/deleted files: -- New files → index via CMD-001 -- Modified → delete old artifacts + re-index via CMD-001 -- Deleted → cascade delete artifacts -""" - -from __future__ import annotations - -import logging -from dataclasses import dataclass, field -from datetime import datetime -from pathlib import Path - -from kdd.application.commands.index_document import IndexResult, index_document -from kdd.application.extractors.registry import ExtractorRegistry -from kdd.domain.entities import IndexManifest, IndexStats -from kdd.domain.enums import IndexLevel -from kdd.domain.ports import ArtifactStore, EmbeddingModel, EventBus -from kdd.infrastructure.git.diff import get_current_commit, get_diff, scan_files - -logger = logging.getLogger(__name__) - - -@dataclass -class IncrementalResult: - """Result of an incremental indexing run.""" - - indexed: int = 0 - deleted: int = 0 - skipped: int = 0 - errors: int = 0 - results: list[IndexResult] = field(default_factory=list) - is_full_reindex: bool = False - - -def _index_file( - rel_path: str, - *, - repo_root: Path, - specs_root: Path, - registry: ExtractorRegistry, - artifact_store: ArtifactStore, - event_bus: EventBus | None, - embedding_model: EmbeddingModel | None, - index_level: IndexLevel, - domain: str | None, -) -> IndexResult: - """Index a single file given its repo-relative path.""" - file_path = repo_root / rel_path - return index_document( - file_path, - specs_root=specs_root, - registry=registry, - artifact_store=artifact_store, - event_bus=event_bus, - embedding_model=embedding_model, - index_level=index_level, - domain=domain, - ) - - -def index_incremental( - specs_root: Path, - *, - repo_root: Path | None = None, - registry: ExtractorRegistry, - artifact_store: ArtifactStore, - event_bus: EventBus | None = None, - embedding_model: EmbeddingModel | None = None, - index_level: IndexLevel = IndexLevel.L1, - 
include_patterns: list[str] | None = None, - domain: str | None = None, -) -> IncrementalResult: - """Run incremental indexing based on git diff. - - If no previous manifest exists, performs a full index of all matching files. - - Args: - specs_root: Root directory of specs (used to compute relative paths). - repo_root: Git repository root. Defaults to ``specs_root`` if not given. - registry: Extractor registry. - artifact_store: Store for reading/writing artifacts. - event_bus: Optional event bus. - embedding_model: Optional embedding model for L2+. - index_level: Target index level. - include_patterns: Glob patterns for files to include (default: ``["**/*.md"]``). - domain: Optional domain override. - """ - if include_patterns is None: - include_patterns = ["**/*.md"] - if repo_root is None: - repo_root = specs_root - - result = IncrementalResult() - - # Read existing manifest - manifest = artifact_store.read_manifest() - current_commit = get_current_commit(repo_root) - - common_kwargs = dict( - repo_root=repo_root, - specs_root=specs_root, - registry=registry, - artifact_store=artifact_store, - event_bus=event_bus, - embedding_model=embedding_model, - index_level=index_level, - domain=domain, - ) - - if manifest is None or manifest.git_commit is None: - # No previous index — full reindex - result.is_full_reindex = True - # scan_files returns paths relative to cwd (repo_root) - all_files = scan_files(repo_root, include_patterns=include_patterns) - for rel_path in all_files: - r = _index_file(rel_path, **common_kwargs) - result.results.append(r) - if r.success: - result.indexed += 1 - elif r.skipped_reason: - result.skipped += 1 - else: - result.errors += 1 - else: - # Incremental: only changed files - # get_diff returns paths relative to git root - diff = get_diff( - repo_root, - manifest.git_commit, - include_patterns=include_patterns, - ) - - # Process new files - for rel_path in diff.added: - r = _index_file(rel_path, **common_kwargs) - 
result.results.append(r) - if r.success: - result.indexed += 1 - elif r.skipped_reason: - result.skipped += 1 - else: - result.errors += 1 - - # Process modified files: delete old + re-index - for rel_path in diff.modified: - artifact_store.delete_document_artifacts(rel_path) - r = _index_file(rel_path, **common_kwargs) - result.results.append(r) - if r.success: - result.indexed += 1 - elif r.skipped_reason: - result.skipped += 1 - else: - result.errors += 1 - - # Process deleted files: cascade delete - for rel_path in diff.deleted: - artifact_store.delete_document_artifacts(rel_path) - result.deleted += 1 - - # Update manifest - total_nodes = sum(1 for r in result.results if r.success) - total_edges = sum(r.edge_count for r in result.results if r.success) - total_embeddings = sum(r.embedding_count for r in result.results if r.success) - - new_manifest = IndexManifest( - version="1.0.0", - kdd_version="1.0.0", - indexed_by="kdd-cli", - index_level=index_level, - git_commit=current_commit, - indexed_at=datetime.now(), - stats=IndexStats( - nodes=total_nodes, - edges=total_edges, - embeddings=total_embeddings, - ), - domains=[domain] if domain else [], - ) - artifact_store.write_manifest(new_manifest) - - return result diff --git a/src/kdd/application/commands/merge_index.py b/src/kdd/application/commands/merge_index.py deleted file mode 100644 index be43812..0000000 --- a/src/kdd/application/commands/merge_index.py +++ /dev/null @@ -1,223 +0,0 @@ -"""CMD-004 — MergeIndex command. - -Merges indices from multiple developers into a unified index. -Validates manifest compatibility, resolves node conflicts via -BR-MERGE-001 (last-write-wins / delete-wins), and produces a -new merged IndexManifest. 
- -Spec: specs/02-behavior/commands/CMD-004-MergeIndex.md -""" - -from __future__ import annotations - -import logging -from dataclasses import dataclass, field -from datetime import datetime -from pathlib import Path - -from kdd.domain.entities import ( - Embedding, - GraphEdge, - GraphNode, - IndexManifest, - IndexStats, -) -from kdd.domain.enums import IndexLevel -from kdd.domain.rules import resolve_deletion, resolve_node_conflict -from kdd.infrastructure.artifact.filesystem import FilesystemArtifactStore - -logger = logging.getLogger(__name__) - - -@dataclass -class MergeResult: - """Result of a merge operation.""" - - success: bool - total_nodes: int = 0 - total_edges: int = 0 - total_embeddings: int = 0 - conflicts_resolved: int = 0 - error: str | None = None - - -def merge_index( - source_paths: list[Path], - output_path: Path, - *, - conflict_strategy: str = "last_write_wins", -) -> MergeResult: - """Merge multiple .kdd-index/ directories into one. - - Args: - source_paths: Paths to source .kdd-index/ directories (min 2). - output_path: Path for the merged output .kdd-index/. - conflict_strategy: "last_write_wins" or "fail_on_conflict". - - Returns: - MergeResult with success status and statistics. - """ - if len(source_paths) < 2: - return MergeResult(success=False, error="INSUFFICIENT_SOURCES: need at least 2 indices") - - # 1. Load all manifests and validate compatibility - sources: list[tuple[FilesystemArtifactStore, IndexManifest]] = [] - for path in source_paths: - store = FilesystemArtifactStore(path) - manifest = store.read_manifest() - if manifest is None: - return MergeResult(success=False, error=f"MANIFEST_NOT_FOUND: {path}") - sources.append((store, manifest)) - - err = _validate_compatibility([m for _, m in sources]) - if err: - return MergeResult(success=False, error=err) - - # 2. 
Merge nodes - all_nodes_by_id: dict[str, list[tuple[int, GraphNode]]] = {} - for idx, (store, _) in enumerate(sources): - for node in store.read_all_nodes(): - all_nodes_by_id.setdefault(node.id, []).append((idx, node)) - - merged_nodes: list[GraphNode] = [] - conflicts = 0 - - for node_id, candidates in all_nodes_by_id.items(): - if len(candidates) == 1: - merged_nodes.append(candidates[0][1]) - continue - - # Multiple copies — check for conflict - hashes = {n.source_hash for _, n in candidates} - if len(hashes) == 1: - # Identical - merged_nodes.append(candidates[0][1]) - continue - - # Real conflict - if conflict_strategy == "fail_on_conflict": - return MergeResult( - success=False, - error=f"CONFLICT_REJECTED: conflict on node {node_id}", - ) - - # Last-write-wins - conflict_dicts = [ - { - "source_hash": n.source_hash, - "indexed_at": n.indexed_at or datetime.min, - } - for _, n in candidates - ] - result = resolve_node_conflict(conflict_dicts) - merged_nodes.append(candidates[result.winner_index][1]) - conflicts += 1 - - merged_node_ids = {n.id for n in merged_nodes} - - # 3. Merge edges (union, deduplicate, cascade delete for removed nodes) - seen_edges: set[tuple[str, str, str]] = set() - merged_edges: list[GraphEdge] = [] - for store, _ in sources: - for edge in store.read_edges(): - # Cascade: skip edges referencing nodes not in merged set - if edge.from_node not in merged_node_ids or edge.to_node not in merged_node_ids: - continue - key = (edge.from_node, edge.to_node, edge.edge_type) - if key not in seen_edges: - seen_edges.add(key) - merged_edges.append(edge) - - # 4. 
Merge embeddings (use winner's embeddings for conflicted nodes) - winner_source: dict[str, int] = {} - for node_id, candidates in all_nodes_by_id.items(): - if len(candidates) == 1: - winner_source[node_id] = candidates[0][0] - else: - conflict_dicts = [ - { - "source_hash": n.source_hash, - "indexed_at": n.indexed_at or datetime.min, - } - for _, n in candidates - ] - result = resolve_node_conflict(conflict_dicts) - winner_source[node_id] = candidates[result.winner_index][0] - - merged_embeddings: list[Embedding] = [] - for node in merged_nodes: - doc_id = node.id.split(":", 1)[-1] if ":" in node.id else node.id - src_idx = winner_source.get(node.id, 0) - src_store = sources[src_idx][0] - embs = src_store.read_embeddings(doc_id) - merged_embeddings.extend(embs) - - # 5. Write merged output - out_store = FilesystemArtifactStore(output_path) - for node in merged_nodes: - out_store.write_node(node) - if merged_edges: - out_store.append_edges(merged_edges) - if merged_embeddings: - out_store.write_embeddings(merged_embeddings) - - # Determine merged index level (minimum of all sources) - levels = [m.index_level for _, m in sources] - merged_level = IndexLevel.L1 - if all(l in (IndexLevel.L2, IndexLevel.L3) for l in levels): - merged_level = IndexLevel.L2 - if all(l == IndexLevel.L3 for l in levels): - merged_level = IndexLevel.L3 - - # Determine embedding model (must be same across all L2+ sources) - emb_model = None - emb_dims = None - for _, m in sources: - if m.embedding_model: - emb_model = m.embedding_model - emb_dims = m.embedding_dimensions - break - - manifest = IndexManifest( - version="1.0.0", - kdd_version="1.0.0", - embedding_model=emb_model, - embedding_dimensions=emb_dims, - indexed_at=datetime.now(), - indexed_by="kdd-merge", - index_level=merged_level, - stats=IndexStats( - nodes=len(merged_nodes), - edges=len(merged_edges), - embeddings=len(merged_embeddings), - ), - ) - out_store.write_manifest(manifest) - - return MergeResult( - success=True, - 
total_nodes=len(merged_nodes), - total_edges=len(merged_edges), - total_embeddings=len(merged_embeddings), - conflicts_resolved=conflicts, - ) - - -def _validate_compatibility(manifests: list[IndexManifest]) -> str | None: - """Validate that all manifests are merge-compatible. Returns error or None.""" - # Same major version - majors = {m.version.split(".")[0] for m in manifests} - if len(majors) > 1: - return f"INCOMPATIBLE_VERSION: major versions differ: {majors}" - - # Same embedding model (for L2+ indices) - models = {m.embedding_model for m in manifests if m.embedding_model} - if len(models) > 1: - return f"INCOMPATIBLE_EMBEDDING_MODEL: models differ: {models}" - - # Same structure - structures = {m.structure for m in manifests} - if len(structures) > 1: - return f"INCOMPATIBLE_STRUCTURE: structures differ: {structures}" - - return None diff --git a/src/kdd/application/commands/sync_index.py b/src/kdd/application/commands/sync_index.py deleted file mode 100644 index 12800c8..0000000 --- a/src/kdd/application/commands/sync_index.py +++ /dev/null @@ -1,66 +0,0 @@ -"""CMD-005 — SyncIndex command. - -Synchronizes index artifacts between local machine and shared server. -Supports push (upload local) and pull (download merged). - -Spec: specs/02-behavior/commands/CMD-005-SyncIndex.md -""" - -from __future__ import annotations - -import logging -from dataclasses import dataclass -from pathlib import Path - -from kdd.domain.ports import ArtifactStore, Transport - -logger = logging.getLogger(__name__) - - -@dataclass -class SyncResult: - success: bool - direction: str # "push" or "pull" - error: str | None = None - - -def sync_push( - artifact_store: ArtifactStore, - transport: Transport, - *, - index_path: str = ".kdd-index", - remote: str = "origin", -) -> SyncResult: - """Push local index artifacts to remote server (CMD-005 push). - - Privacy guarantee (REQ-003): Only transmits derived artifacts - (.kdd-index/), never original spec content. 
- """ - manifest = artifact_store.read_manifest() - if manifest is None: - return SyncResult(success=False, direction="push", error="NO_LOCAL_INDEX") - - try: - transport.push(index_path, remote) - except Exception as e: - return SyncResult(success=False, direction="push", error=f"TRANSPORT_ERROR: {e}") - - return SyncResult(success=True, direction="push") - - -def sync_pull( - transport: Transport, - *, - remote: str = "origin", - target_path: str = ".kdd-index", -) -> SyncResult: - """Pull merged index artifacts from remote server (CMD-005 pull). - - Replaces local .kdd-index/ with the merged index from the server. - """ - try: - transport.pull(remote, target_path) - except Exception as e: - return SyncResult(success=False, direction="pull", error=f"TRANSPORT_ERROR: {e}") - - return SyncResult(success=True, direction="pull") diff --git a/src/kdd/application/extractors/__init__.py b/src/kdd/application/extractors/__init__.py deleted file mode 100644 index 476e804..0000000 --- a/src/kdd/application/extractors/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Extractor framework for KDD spec files.""" diff --git a/src/kdd/application/extractors/base.py b/src/kdd/application/extractors/base.py deleted file mode 100644 index cf7b906..0000000 --- a/src/kdd/application/extractors/base.py +++ /dev/null @@ -1,217 +0,0 @@ -"""Base extractor protocol and helpers. - -Every kind-specific extractor implements this protocol so the indexing -pipeline can process any KDD spec uniformly. 
-""" - -from __future__ import annotations - -import re -from typing import Any, Protocol - -from kdd.domain.entities import GraphEdge, GraphNode, KDDDocument, Section -from kdd.domain.enums import KDDKind, KDDLayer -from kdd.domain.rules import detect_layer, is_layer_violation -from kdd.infrastructure.parsing.wiki_links import WikiLink, extract_wiki_links - - -class Extractor(Protocol): - """Protocol that every kind extractor must satisfy.""" - - kind: KDDKind - - def extract_node(self, document: KDDDocument) -> GraphNode: ... - def extract_edges(self, document: KDDDocument) -> list[GraphEdge]: ... - - -# --------------------------------------------------------------------------- -# Shared helpers available to all extractors -# --------------------------------------------------------------------------- - -# Mapping from kind to the ID prefix used in GraphNode.id -KIND_PREFIX: dict[KDDKind, str] = { - KDDKind.ENTITY: "Entity", - KDDKind.EVENT: "Event", - KDDKind.BUSINESS_RULE: "BR", - KDDKind.BUSINESS_POLICY: "BP", - KDDKind.CROSS_POLICY: "XP", - KDDKind.COMMAND: "CMD", - KDDKind.QUERY: "QRY", - KDDKind.PROCESS: "PROC", - KDDKind.USE_CASE: "UC", - KDDKind.UI_VIEW: "UIView", - KDDKind.UI_COMPONENT: "UIComp", - KDDKind.REQUIREMENT: "REQ", - KDDKind.OBJECTIVE: "OBJ", - KDDKind.PRD: "PRD", - KDDKind.ADR: "ADR", -} - - -def make_node_id(kind: KDDKind, document_id: str) -> str: - """Build a composite ``{Prefix}:{DocumentId}`` node ID.""" - prefix = KIND_PREFIX.get(kind, kind.value.upper()) - return f"{prefix}:{document_id}" - - -def find_section(sections: list[Section], *names: str) -> Section | None: - """Find the first section whose heading matches any of *names* (case-insensitive).""" - targets = {n.lower() for n in names} - for s in sections: - if s.heading.lower() in targets: - return s - return None - - -def find_sections(sections: list[Section], *names: str) -> list[Section]: - """Return all sections whose heading matches any of *names*.""" - targets = {n.lower() 
for n in names} - return [s for s in sections if s.heading.lower() in targets] - - -def find_section_with_children( - sections: list[Section], *names: str -) -> str | None: - """Find a section by heading and concatenate its content with all - immediate child sub-sections (deeper heading level). - - Returns the combined text, or ``None`` if the parent heading is not found. - This is useful for sections like ``## Flujos Alternativos`` that have - sub-headings (``### FA-1``, ``### FA-2``) carrying the actual content. - """ - targets = {n.lower() for n in names} - parent_idx: int | None = None - parent_level: int = 0 - - for i, s in enumerate(sections): - if s.heading.lower() in targets: - parent_idx = i - parent_level = s.level - break - - if parent_idx is None: - return None - - parts: list[str] = [] - parent = sections[parent_idx] - if parent.content.strip(): - parts.append(parent.content) - - # Collect children — sections at a deeper level until we hit same or shallower - for s in sections[parent_idx + 1:]: - if s.level <= parent_level: - break - parts.append(f"### {s.heading}\n\n{s.content}") - - return "\n\n".join(parts) if parts else None - - -def resolve_wiki_link_to_node_id(link: WikiLink) -> str | None: - """Best-effort resolution of a wiki-link target to a node ID. 
- - Heuristics: - - ``EVT-*`` → ``Event:{target}`` - - ``BR-*`` → ``BR:{target}`` - - ``BP-*`` → ``BP:{target}`` - - ``XP-*`` → ``XP:{target}`` - - ``CMD-*`` → ``CMD:{target}`` - - ``QRY-*`` → ``QRY:{target}`` - - ``UC-*`` → ``UC:{target}`` - - ``PROC-*`` → ``PROC:{target}`` - - ``REQ-*`` → ``REQ:{target}`` - - ``OBJ-*`` → ``OBJ:{target}`` - - ``ADR-*`` → ``ADR:{target}`` - - ``PRD-*`` → ``PRD:{target}`` - - ``UI-*`` → ``UIView:{target}`` (ambiguous, default to view) - - Otherwise → ``Entity:{target}`` (PascalCase names are typically entities) - """ - t = link.target - prefix_map = [ - ("EVT-", "Event"), - ("BR-", "BR"), - ("BP-", "BP"), - ("XP-", "XP"), - ("CMD-", "CMD"), - ("QRY-", "QRY"), - ("UC-", "UC"), - ("PROC-", "PROC"), - ("REQ-", "REQ"), - ("OBJ-", "OBJ"), - ("ADR-", "ADR"), - ("PRD-", "PRD"), - ("UI-", "UIView"), - ] - for prefix, node_prefix in prefix_map: - if t.startswith(prefix): - return f"{node_prefix}:{t}" - return f"Entity:{t}" - - -def build_wiki_link_edges( - document: KDDDocument, - from_node_id: str, - from_layer: KDDLayer, -) -> list[GraphEdge]: - """Extract WIKI_LINK edges from all wiki-links in the document body.""" - edges: list[GraphEdge] = [] - seen: set[tuple[str, str]] = set() - - full_content = "\n".join(s.content for s in document.sections) - links = extract_wiki_links(full_content) - - for link in links: - to_node_id = resolve_wiki_link_to_node_id(link) - if to_node_id is None: - continue - key = (from_node_id, to_node_id) - if key in seen: - continue - seen.add(key) - - # Determine destination layer heuristically - dest_layer = _guess_layer_from_node_id(to_node_id) - violation = False - if dest_layer is not None: - violation = is_layer_violation(from_layer, dest_layer) - - metadata: dict[str, Any] = {} - if link.domain: - metadata["domain"] = link.domain - if link.alias: - metadata["display_alias"] = link.alias - - edges.append(GraphEdge( - from_node=from_node_id, - to_node=to_node_id, - edge_type="WIKI_LINK", - 
source_file=document.source_path, - extraction_method="wiki_link", - metadata=metadata, - layer_violation=violation, - bidirectional=True, - )) - - return edges - - -def _guess_layer_from_node_id(node_id: str) -> KDDLayer | None: - """Guess the KDD layer from a node ID prefix.""" - prefix = node_id.split(":")[0] if ":" in node_id else "" - layer_map: dict[str, KDDLayer] = { - "Entity": KDDLayer.DOMAIN, - "Event": KDDLayer.DOMAIN, - "BR": KDDLayer.DOMAIN, - "BP": KDDLayer.BEHAVIOR, - "XP": KDDLayer.BEHAVIOR, - "CMD": KDDLayer.BEHAVIOR, - "QRY": KDDLayer.BEHAVIOR, - "PROC": KDDLayer.BEHAVIOR, - "UC": KDDLayer.BEHAVIOR, - "UIView": KDDLayer.EXPERIENCE, - "UIComp": KDDLayer.EXPERIENCE, - "REQ": KDDLayer.VERIFICATION, - "OBJ": KDDLayer.REQUIREMENTS, - "PRD": KDDLayer.REQUIREMENTS, - "ADR": KDDLayer.REQUIREMENTS, - } - return layer_map.get(prefix) diff --git a/src/kdd/application/extractors/kinds/__init__.py b/src/kdd/application/extractors/kinds/__init__.py deleted file mode 100644 index d9d769a..0000000 --- a/src/kdd/application/extractors/kinds/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Kind-specific extractors — one per KDDKind.""" diff --git a/src/kdd/application/extractors/kinds/adr.py b/src/kdd/application/extractors/kinds/adr.py deleted file mode 100644 index 8aad75a..0000000 --- a/src/kdd/application/extractors/kinds/adr.py +++ /dev/null @@ -1,63 +0,0 @@ -"""ADR extractor — parses ``kind: adr`` specs. - -Spec reference: PRD-KBEngine "Nodos del grafo" → adr row. -Indexed fields: context, decision, consequences. -Edges: WIKI_LINK. 
-""" - -from __future__ import annotations - -from datetime import datetime -from typing import Any - -from kdd.application.extractors.base import ( - build_wiki_link_edges, - find_section, - make_node_id, -) -from kdd.application.extractors.kinds.entity import _deduplicate_edges -from kdd.domain.entities import GraphEdge, GraphNode, KDDDocument -from kdd.domain.enums import KDDKind - - -class ADRExtractor: - """Extractor for ``kind: adr`` KDD documents.""" - - kind = KDDKind.ADR - - def extract_node(self, document: KDDDocument) -> GraphNode: - node_id = make_node_id(KDDKind.ADR, document.id) - fields: dict[str, Any] = {} - - context = find_section(document.sections, "Contexto", "Context") - if context: - fields["context"] = context.content - - decision = find_section(document.sections, "Decisión", "Decision") - if decision: - fields["decision"] = decision.content - - consequences = find_section( - document.sections, "Consecuencias", "Consequences", - ) - if consequences: - fields["consequences"] = consequences.content - - return GraphNode( - id=node_id, - kind=KDDKind.ADR, - source_file=document.source_path, - source_hash=document.source_hash, - layer=document.layer, - status=document.front_matter.get("status", "draft"), - aliases=document.front_matter.get("aliases", []), - domain=document.domain, - indexed_fields=fields, - indexed_at=datetime.now(), - ) - - def extract_edges(self, document: KDDDocument) -> list[GraphEdge]: - node_id = make_node_id(KDDKind.ADR, document.id) - edges: list[GraphEdge] = [] - edges.extend(build_wiki_link_edges(document, node_id, document.layer)) - return _deduplicate_edges(edges) diff --git a/src/kdd/application/extractors/kinds/business_policy.py b/src/kdd/application/extractors/kinds/business_policy.py deleted file mode 100644 index 9b61ec0..0000000 --- a/src/kdd/application/extractors/kinds/business_policy.py +++ /dev/null @@ -1,89 +0,0 @@ -"""Business-policy extractor — parses ``kind: business-policy`` specs. 
- -Spec reference: PRD-KBEngine "Nodos del grafo" → business-policy row. -Indexed fields: declaration, when_applies, parameters, violation. -Edges: ENTITY_RULE (entities in declaration), WIKI_LINK. -""" - -from __future__ import annotations - -from datetime import datetime -from typing import Any - -from kdd.application.extractors.base import ( - build_wiki_link_edges, - find_section, - make_node_id, - resolve_wiki_link_to_node_id, -) -from kdd.application.extractors.kinds.entity import _deduplicate_edges -from kdd.domain.entities import GraphEdge, GraphNode, KDDDocument -from kdd.domain.enums import KDDKind -from kdd.infrastructure.parsing.wiki_links import extract_wiki_links - - -class PolicyExtractor: - """Extractor for ``kind: business-policy`` KDD documents.""" - - kind = KDDKind.BUSINESS_POLICY - - def extract_node(self, document: KDDDocument) -> GraphNode: - node_id = make_node_id(KDDKind.BUSINESS_POLICY, document.id) - fields: dict[str, Any] = {} - - decl = find_section(document.sections, "Declaración", "Declaration") - if decl: - fields["declaration"] = decl.content - - when = find_section(document.sections, "Cuándo Aplica", "When Applies") - if when: - fields["when_applies"] = when.content - - params = find_section(document.sections, "Parámetros", "Parameters") - if params: - fields["parameters"] = params.content - - violation = find_section( - document.sections, - "Qué pasa si se incumple", "Violation", "What Happens on Violation", - ) - if violation: - fields["violation"] = violation.content - - return GraphNode( - id=node_id, - kind=KDDKind.BUSINESS_POLICY, - source_file=document.source_path, - source_hash=document.source_hash, - layer=document.layer, - status=document.front_matter.get("status", "draft"), - aliases=document.front_matter.get("aliases", []), - domain=document.domain, - indexed_fields=fields, - indexed_at=datetime.now(), - ) - - def extract_edges(self, document: KDDDocument) -> list[GraphEdge]: - node_id = 
make_node_id(KDDKind.BUSINESS_POLICY, document.id) - edges: list[GraphEdge] = [] - - edges.extend(build_wiki_link_edges(document, node_id, document.layer)) - - # ENTITY_RULE from declaration section - decl = find_section(document.sections, "Declaración", "Declaration") - if decl: - for link in extract_wiki_links(decl.content): - t = link.target - if not t.startswith(("EVT-", "BR-", "BP-", "XP-", "CMD-", "QRY-", - "UC-", "PROC-", "REQ-", "OBJ-", "ADR-", "PRD-", "UI-")): - to_node = resolve_wiki_link_to_node_id(link) - if to_node: - edges.append(GraphEdge( - from_node=node_id, - to_node=to_node, - edge_type="ENTITY_RULE", - source_file=document.source_path, - extraction_method="wiki_link", - )) - - return _deduplicate_edges(edges) diff --git a/src/kdd/application/extractors/kinds/business_rule.py b/src/kdd/application/extractors/kinds/business_rule.py deleted file mode 100644 index b1250eb..0000000 --- a/src/kdd/application/extractors/kinds/business_rule.py +++ /dev/null @@ -1,96 +0,0 @@ -"""Business-rule extractor — parses ``kind: business-rule`` specs. - -Spec reference: PRD-KBEngine "Nodos del grafo" → business-rule row. -Indexed fields: declaration, when_applies, violation, examples, formalization. -Edges: ENTITY_RULE, WIKI_LINK. 
-""" - -from __future__ import annotations - -from datetime import datetime -from typing import Any - -from kdd.application.extractors.base import ( - build_wiki_link_edges, - find_section, - make_node_id, - resolve_wiki_link_to_node_id, -) -from kdd.application.extractors.kinds.entity import _deduplicate_edges -from kdd.domain.entities import GraphEdge, GraphNode, KDDDocument -from kdd.domain.enums import KDDKind -from kdd.infrastructure.parsing.wiki_links import extract_wiki_links - - -class RuleExtractor: - """Extractor for ``kind: business-rule`` KDD documents.""" - - kind = KDDKind.BUSINESS_RULE - - def extract_node(self, document: KDDDocument) -> GraphNode: - node_id = make_node_id(KDDKind.BUSINESS_RULE, document.id) - fields: dict[str, Any] = {} - - decl = find_section(document.sections, "Declaración", "Declaration") - if decl: - fields["declaration"] = decl.content - - when = find_section(document.sections, "Cuándo aplica", "When Applies") - if when: - fields["when_applies"] = when.content - - why = find_section(document.sections, "Por qué existe", "Why it exists") - if why: - fields["why_exists"] = why.content - - violation = find_section( - document.sections, "Qué pasa si se incumple", "Violation", "What happens if violated" - ) - if violation: - fields["violation"] = violation.content - - examples = find_section(document.sections, "Ejemplos", "Examples") - if examples: - fields["examples"] = examples.content - - return GraphNode( - id=node_id, - kind=KDDKind.BUSINESS_RULE, - source_file=document.source_path, - source_hash=document.source_hash, - layer=document.layer, - status=document.front_matter.get("status", "draft"), - aliases=document.front_matter.get("aliases", []), - domain=document.domain, - indexed_fields=fields, - indexed_at=datetime.now(), - ) - - def extract_edges(self, document: KDDDocument) -> list[GraphEdge]: - node_id = make_node_id(KDDKind.BUSINESS_RULE, document.id) - edges: list[GraphEdge] = [] - - # WIKI_LINK edges - 
edges.extend(build_wiki_link_edges(document, node_id, document.layer)) - - # ENTITY_RULE: wiki-links to entities in ## Declaración - decl = find_section(document.sections, "Declaración", "Declaration") - if decl: - for link in extract_wiki_links(decl.content): - # Only link to entities (PascalCase names, not prefixed specs) - t = link.target - if not any(t.startswith(p) for p in ( - "EVT-", "BR-", "BP-", "XP-", "CMD-", "QRY-", - "UC-", "PROC-", "REQ-", "OBJ-", "ADR-", "PRD-", - )): - to_node = resolve_wiki_link_to_node_id(link) - if to_node: - edges.append(GraphEdge( - from_node=node_id, - to_node=to_node, - edge_type="ENTITY_RULE", - source_file=document.source_path, - extraction_method="wiki_link", - )) - - return _deduplicate_edges(edges) diff --git a/src/kdd/application/extractors/kinds/command.py b/src/kdd/application/extractors/kinds/command.py deleted file mode 100644 index f68ac1d..0000000 --- a/src/kdd/application/extractors/kinds/command.py +++ /dev/null @@ -1,89 +0,0 @@ -"""Command extractor — parses ``kind: command`` specs. - -Spec reference: PRD-KBEngine "Nodos del grafo" → command row. -Indexed fields: purpose, input_params, preconditions, postconditions, errors. -Edges: EMITS (postcondition events), WIKI_LINK. 
-""" - -from __future__ import annotations - -from datetime import datetime -from typing import Any - -from kdd.application.extractors.base import ( - build_wiki_link_edges, - find_section, - make_node_id, - resolve_wiki_link_to_node_id, -) -from kdd.application.extractors.kinds.entity import _parse_table_rows, _deduplicate_edges -from kdd.domain.entities import GraphEdge, GraphNode, KDDDocument -from kdd.domain.enums import KDDKind -from kdd.infrastructure.parsing.wiki_links import extract_wiki_links - - -class CommandExtractor: - """Extractor for ``kind: command`` KDD documents.""" - - kind = KDDKind.COMMAND - - def extract_node(self, document: KDDDocument) -> GraphNode: - node_id = make_node_id(KDDKind.COMMAND, document.id) - fields: dict[str, Any] = {} - - purpose = find_section(document.sections, "Purpose", "Propósito") - if purpose: - fields["purpose"] = purpose.content - - input_sec = find_section(document.sections, "Input", "Entrada") - if input_sec: - fields["input_params"] = _parse_table_rows(input_sec.content) - - pre = find_section(document.sections, "Preconditions", "Precondiciones") - if pre: - fields["preconditions"] = pre.content - - post = find_section(document.sections, "Postconditions", "Postcondiciones") - if post: - fields["postconditions"] = post.content - - errors = find_section(document.sections, "Possible Errors", "Errores Posibles") - if errors: - fields["errors"] = _parse_table_rows(errors.content) - - return GraphNode( - id=node_id, - kind=KDDKind.COMMAND, - source_file=document.source_path, - source_hash=document.source_hash, - layer=document.layer, - status=document.front_matter.get("status", "draft"), - aliases=document.front_matter.get("aliases", []), - domain=document.domain, - indexed_fields=fields, - indexed_at=datetime.now(), - ) - - def extract_edges(self, document: KDDDocument) -> list[GraphEdge]: - node_id = make_node_id(KDDKind.COMMAND, document.id) - edges: list[GraphEdge] = [] - - # WIKI_LINK edges - 
edges.extend(build_wiki_link_edges(document, node_id, document.layer)) - - # EMITS edges from Postconditions (EVT-* wiki-links) - post = find_section(document.sections, "Postconditions", "Postcondiciones") - if post: - for link in extract_wiki_links(post.content): - if link.target.startswith("EVT-"): - to_node = resolve_wiki_link_to_node_id(link) - if to_node: - edges.append(GraphEdge( - from_node=node_id, - to_node=to_node, - edge_type="EMITS", - source_file=document.source_path, - extraction_method="wiki_link", - )) - - return _deduplicate_edges(edges) diff --git a/src/kdd/application/extractors/kinds/cross_policy.py b/src/kdd/application/extractors/kinds/cross_policy.py deleted file mode 100644 index e80ab6f..0000000 --- a/src/kdd/application/extractors/kinds/cross_policy.py +++ /dev/null @@ -1,92 +0,0 @@ -"""Cross-policy extractor — parses ``kind: cross-policy`` specs. - -Spec reference: PRD-KBEngine "Nodos del grafo" → cross-policy row. -Indexed fields: purpose, declaration, formalization_ears, standard_behavior. -Edges: ENTITY_RULE (entities in declaration), WIKI_LINK. 
-""" - -from __future__ import annotations - -from datetime import datetime -from typing import Any - -from kdd.application.extractors.base import ( - build_wiki_link_edges, - find_section, - make_node_id, - resolve_wiki_link_to_node_id, -) -from kdd.application.extractors.kinds.entity import _deduplicate_edges -from kdd.domain.entities import GraphEdge, GraphNode, KDDDocument -from kdd.domain.enums import KDDKind -from kdd.infrastructure.parsing.wiki_links import extract_wiki_links - - -class CrossPolicyExtractor: - """Extractor for ``kind: cross-policy`` KDD documents.""" - - kind = KDDKind.CROSS_POLICY - - def extract_node(self, document: KDDDocument) -> GraphNode: - node_id = make_node_id(KDDKind.CROSS_POLICY, document.id) - fields: dict[str, Any] = {} - - purpose = find_section(document.sections, "Propósito", "Purpose") - if purpose: - fields["purpose"] = purpose.content - - decl = find_section(document.sections, "Declaración", "Declaration") - if decl: - fields["declaration"] = decl.content - - formal = find_section( - document.sections, - "Formalización EARS", "EARS Formalization", - ) - if formal: - fields["formalization_ears"] = formal.content - - behavior = find_section( - document.sections, - "Comportamiento Estándar", "Standard Behavior", - ) - if behavior: - fields["standard_behavior"] = behavior.content - - return GraphNode( - id=node_id, - kind=KDDKind.CROSS_POLICY, - source_file=document.source_path, - source_hash=document.source_hash, - layer=document.layer, - status=document.front_matter.get("status", "draft"), - aliases=document.front_matter.get("aliases", []), - domain=document.domain, - indexed_fields=fields, - indexed_at=datetime.now(), - ) - - def extract_edges(self, document: KDDDocument) -> list[GraphEdge]: - node_id = make_node_id(KDDKind.CROSS_POLICY, document.id) - edges: list[GraphEdge] = [] - - edges.extend(build_wiki_link_edges(document, node_id, document.layer)) - - # ENTITY_RULE from declaration section - decl = 
find_section(document.sections, "Declaración", "Declaration") - if decl: - for link in extract_wiki_links(decl.content): - t = link.target - if not t.startswith(("EVT-", "BR-", "BP-", "XP-", "CMD-", "QRY-", - "UC-", "PROC-", "REQ-", "OBJ-", "ADR-", "PRD-", "UI-")): - to_node = resolve_wiki_link_to_node_id(link) - if to_node: - edges.append(GraphEdge( - from_node=node_id, - to_node=to_node, - edge_type="ENTITY_RULE", - source_file=document.source_path, - extraction_method="wiki_link", - )) - - return _deduplicate_edges(edges) diff --git a/src/kdd/application/extractors/kinds/entity.py b/src/kdd/application/extractors/kinds/entity.py deleted file mode 100644 index 5cfbe77..0000000 --- a/src/kdd/application/extractors/kinds/entity.py +++ /dev/null @@ -1,196 +0,0 @@ -"""Entity extractor — parses ``kind: entity`` specs. - -Spec reference: PRD-KBEngine "Nodos del grafo" → entity row. -Indexed fields: description, attributes, relations, invariants, state_machine. -Edges: DOMAIN_RELATION, EMITS, CONSUMES, WIKI_LINK, business relations. 
-""" - -from __future__ import annotations - -import re -from datetime import datetime -from typing import Any - -from kdd.application.extractors.base import ( - build_wiki_link_edges, - find_section, - make_node_id, -) -from kdd.domain.entities import GraphEdge, GraphNode, KDDDocument, Section -from kdd.domain.enums import KDDKind -from kdd.infrastructure.parsing.wiki_links import extract_wiki_links - - -class EntityExtractor: - """Extractor for ``kind: entity`` KDD documents.""" - - kind = KDDKind.ENTITY - - def extract_node(self, document: KDDDocument) -> GraphNode: - node_id = make_node_id(KDDKind.ENTITY, document.id) - fields: dict[str, Any] = {} - - # description - desc = find_section(document.sections, "Descripción", "Description") - if desc: - fields["description"] = desc.content - - # attributes — parse table rows - attr_sec = find_section(document.sections, "Atributos", "Attributes") - if attr_sec: - fields["attributes"] = _parse_table_rows(attr_sec.content) - - # relations — parse table rows - rel_sec = find_section(document.sections, "Relaciones", "Relations", "Relationships") - if rel_sec: - fields["relations"] = _parse_table_rows(rel_sec.content) - - # invariants — list items - inv_sec = find_section(document.sections, "Invariantes", "Invariants", "Constraints") - if inv_sec: - fields["invariants"] = _parse_list_items(inv_sec.content) - - # state_machine — from Ciclo de Vida section - sm_sec = find_section(document.sections, "Ciclo de Vida", "Lifecycle", "State Machine") - if sm_sec: - fields["state_machine"] = sm_sec.content - - return GraphNode( - id=node_id, - kind=KDDKind.ENTITY, - source_file=document.source_path, - source_hash=document.source_hash, - layer=document.layer, - status=document.front_matter.get("status", "draft"), - aliases=document.front_matter.get("aliases", []), - domain=document.domain, - indexed_fields=fields, - indexed_at=datetime.now(), - ) - - def extract_edges(self, document: KDDDocument) -> list[GraphEdge]: - node_id = 
make_node_id(KDDKind.ENTITY, document.id) - edges: list[GraphEdge] = [] - - # 1. WIKI_LINK edges from all content - edges.extend(build_wiki_link_edges(document, node_id, document.layer)) - - # 2. DOMAIN_RELATION from ## Relaciones table - rel_sec = find_section(document.sections, "Relaciones", "Relations", "Relationships") - if rel_sec: - edges.extend( - _extract_relation_edges(rel_sec, node_id, document.source_path) - ) - - # 3. EMITS / CONSUMES from lifecycle events table or sections - for section in document.sections: - heading_lower = section.heading.lower() - if heading_lower in ("eventos del ciclo de vida", "lifecycle events"): - edges.extend( - _extract_event_edges(section, node_id, document.source_path) - ) - - return _deduplicate_edges(edges) - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - -_TABLE_ROW_RE = re.compile(r"^\|(.+)\|$", re.MULTILINE) - - -def _parse_table_rows(content: str) -> list[dict[str, str]]: - """Parse a Markdown table into a list of dicts.""" - lines = [ - line.strip() - for line in content.strip().splitlines() - if line.strip().startswith("|") - ] - if len(lines) < 2: - return [] - - headers = [h.strip().strip("`") for h in lines[0].strip("|").split("|")] - rows: list[dict[str, str]] = [] - for line in lines[2:]: # skip separator line - cells = [c.strip() for c in line.strip("|").split("|")] - if len(cells) >= len(headers): - rows.append(dict(zip(headers, cells))) - return rows - - -def _parse_list_items(content: str) -> list[str]: - """Extract ``- item`` list items from Markdown content.""" - items: list[str] = [] - for line in content.splitlines(): - line = line.strip() - if line.startswith("- ") or line.startswith("* "): - items.append(line[2:].strip()) - return items - - -def _extract_relation_edges( - section: Section, from_node: str, source_file: str, -) -> list[GraphEdge]: - """Extract DOMAIN_RELATION 
and business edges from a relations table.""" - edges: list[GraphEdge] = [] - rows = _parse_table_rows(section.content) - for row in rows: - # Find target entity in any column containing [[...]] - target = None - for val in row.values(): - links = extract_wiki_links(val) - if links: - from kdd.application.extractors.base import resolve_wiki_link_to_node_id - target = resolve_wiki_link_to_node_id(links[0]) - break - if not target: - continue - - # Relation name (first column usually) - rel_name = next(iter(row.values()), "") - cardinality = row.get("Cardinalidad", row.get("Cardinality", "")) - - edges.append(GraphEdge( - from_node=from_node, - to_node=target, - edge_type="DOMAIN_RELATION", - source_file=source_file, - extraction_method="section_content", - metadata={"relation": rel_name, "cardinality": cardinality}, - )) - - return edges - - -def _extract_event_edges( - section: Section, from_node: str, source_file: str, -) -> list[GraphEdge]: - """Extract EMITS edges from lifecycle event tables/lists.""" - edges: list[GraphEdge] = [] - links = extract_wiki_links(section.content) - for link in links: - if link.target.startswith("EVT-"): - from kdd.application.extractors.base import resolve_wiki_link_to_node_id - to_node = resolve_wiki_link_to_node_id(link) - if to_node: - edges.append(GraphEdge( - from_node=from_node, - to_node=to_node, - edge_type="EMITS", - source_file=source_file, - extraction_method="wiki_link", - )) - return edges - - -def _deduplicate_edges(edges: list[GraphEdge]) -> list[GraphEdge]: - """Remove duplicate edges (same from/to/type).""" - seen: set[tuple[str, str, str]] = set() - result: list[GraphEdge] = [] - for e in edges: - key = (e.from_node, e.to_node, e.edge_type) - if key not in seen: - seen.add(key) - result.append(e) - return result diff --git a/src/kdd/application/extractors/kinds/event.py b/src/kdd/application/extractors/kinds/event.py deleted file mode 100644 index cecb6ef..0000000 --- a/src/kdd/application/extractors/kinds/event.py 
+++ /dev/null @@ -1,69 +0,0 @@ -"""Event extractor — parses ``kind: event`` specs. - -Spec reference: PRD-KBEngine "Nodos del grafo" → event row. -Indexed fields: description, payload, producer, consumers. -Edges: WIKI_LINK only. -Note: Events produce NO embeddings per BR-EMBEDDING-001. -""" - -from __future__ import annotations - -from datetime import datetime -from typing import Any - -from kdd.application.extractors.base import ( - build_wiki_link_edges, - find_section, - make_node_id, -) -from kdd.application.extractors.kinds.entity import ( - _deduplicate_edges, - _parse_table_rows, -) -from kdd.domain.entities import GraphEdge, GraphNode, KDDDocument -from kdd.domain.enums import KDDKind - - -class EventExtractor: - """Extractor for ``kind: event`` KDD documents.""" - - kind = KDDKind.EVENT - - def extract_node(self, document: KDDDocument) -> GraphNode: - node_id = make_node_id(KDDKind.EVENT, document.id) - fields: dict[str, Any] = {} - - desc = find_section(document.sections, "Descripción", "Description") - if desc: - fields["description"] = desc.content - - payload = find_section(document.sections, "Payload") - if payload: - fields["payload"] = _parse_table_rows(payload.content) - - producer = find_section(document.sections, "Productor", "Producer") - if producer: - fields["producer"] = producer.content - - consumers = find_section(document.sections, "Consumidores", "Consumers") - if consumers: - fields["consumers"] = consumers.content - - return GraphNode( - id=node_id, - kind=KDDKind.EVENT, - source_file=document.source_path, - source_hash=document.source_hash, - layer=document.layer, - status=document.front_matter.get("status", "draft"), - aliases=document.front_matter.get("aliases", []), - domain=document.domain, - indexed_fields=fields, - indexed_at=datetime.now(), - ) - - def extract_edges(self, document: KDDDocument) -> list[GraphEdge]: - node_id = make_node_id(KDDKind.EVENT, document.id) - edges: list[GraphEdge] = [] - 
edges.extend(build_wiki_link_edges(document, node_id, document.layer)) - return _deduplicate_edges(edges) diff --git a/src/kdd/application/extractors/kinds/objective.py b/src/kdd/application/extractors/kinds/objective.py deleted file mode 100644 index d9cf3c0..0000000 --- a/src/kdd/application/extractors/kinds/objective.py +++ /dev/null @@ -1,64 +0,0 @@ -"""Objective extractor — parses ``kind: objective`` specs. - -Spec reference: PRD-KBEngine "Nodos del grafo" → objective row. -Indexed fields: actor, objective, success_criteria. -Edges: WIKI_LINK. -""" - -from __future__ import annotations - -from datetime import datetime -from typing import Any - -from kdd.application.extractors.base import ( - build_wiki_link_edges, - find_section, - make_node_id, -) -from kdd.application.extractors.kinds.entity import _deduplicate_edges -from kdd.domain.entities import GraphEdge, GraphNode, KDDDocument -from kdd.domain.enums import KDDKind - - -class ObjectiveExtractor: - """Extractor for ``kind: objective`` KDD documents.""" - - kind = KDDKind.OBJECTIVE - - def extract_node(self, document: KDDDocument) -> GraphNode: - node_id = make_node_id(KDDKind.OBJECTIVE, document.id) - fields: dict[str, Any] = {} - - actor = find_section(document.sections, "Actor", "Actors") - if actor: - fields["actor"] = actor.content - - objective = find_section(document.sections, "Objetivo", "Objective") - if objective: - fields["objective"] = objective.content - - criteria = find_section( - document.sections, - "Criterios de éxito", "Success Criteria", - ) - if criteria: - fields["success_criteria"] = criteria.content - - return GraphNode( - id=node_id, - kind=KDDKind.OBJECTIVE, - source_file=document.source_path, - source_hash=document.source_hash, - layer=document.layer, - status=document.front_matter.get("status", "draft"), - aliases=document.front_matter.get("aliases", []), - domain=document.domain, - indexed_fields=fields, - indexed_at=datetime.now(), - ) - - def extract_edges(self, document: 
KDDDocument) -> list[GraphEdge]: - node_id = make_node_id(KDDKind.OBJECTIVE, document.id) - edges: list[GraphEdge] = [] - edges.extend(build_wiki_link_edges(document, node_id, document.layer)) - return _deduplicate_edges(edges) diff --git a/src/kdd/application/extractors/kinds/prd.py b/src/kdd/application/extractors/kinds/prd.py deleted file mode 100644 index 735a6df..0000000 --- a/src/kdd/application/extractors/kinds/prd.py +++ /dev/null @@ -1,82 +0,0 @@ -"""PRD extractor — parses ``kind: prd`` specs. - -Spec reference: PRD-KBEngine "Nodos del grafo" → prd row. -Indexed fields: problem, scope, users, metrics, dependencies. -Edges: WIKI_LINK. -""" - -from __future__ import annotations - -from datetime import datetime -from typing import Any - -from kdd.application.extractors.base import ( - build_wiki_link_edges, - find_section, - find_section_with_children, - make_node_id, -) -from kdd.application.extractors.kinds.entity import _deduplicate_edges -from kdd.domain.entities import GraphEdge, GraphNode, KDDDocument -from kdd.domain.enums import KDDKind - - -class PRDExtractor: - """Extractor for ``kind: prd`` KDD documents.""" - - kind = KDDKind.PRD - - def extract_node(self, document: KDDDocument) -> GraphNode: - node_id = make_node_id(KDDKind.PRD, document.id) - fields: dict[str, Any] = {} - - problem = find_section( - document.sections, - "Problema / Oportunidad", "Problem / Opportunity", - "Problema", "Problem", - ) - if problem: - fields["problem"] = problem.content - - scope = find_section_with_children( - document.sections, "Alcance", "Scope", - ) - if scope: - fields["scope"] = scope - - users = find_section_with_children( - document.sections, - "Usuarios y Jobs-to-be-done", "Users and Jobs-to-be-done", - ) - if users: - fields["users"] = users - - metrics = find_section( - document.sections, - "Métricas de éxito y telemetría", "Success Metrics", - ) - if metrics: - fields["metrics"] = metrics.content - - deps = find_section(document.sections, "Dependencias", 
"Dependencies") - if deps: - fields["dependencies"] = deps.content - - return GraphNode( - id=node_id, - kind=KDDKind.PRD, - source_file=document.source_path, - source_hash=document.source_hash, - layer=document.layer, - status=document.front_matter.get("status", "draft"), - aliases=document.front_matter.get("aliases", []), - domain=document.domain, - indexed_fields=fields, - indexed_at=datetime.now(), - ) - - def extract_edges(self, document: KDDDocument) -> list[GraphEdge]: - node_id = make_node_id(KDDKind.PRD, document.id) - edges: list[GraphEdge] = [] - edges.extend(build_wiki_link_edges(document, node_id, document.layer)) - return _deduplicate_edges(edges) diff --git a/src/kdd/application/extractors/kinds/process.py b/src/kdd/application/extractors/kinds/process.py deleted file mode 100644 index cd2df31..0000000 --- a/src/kdd/application/extractors/kinds/process.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Process extractor — parses ``kind: process`` specs. - -Spec reference: PRD-KBEngine "Nodos del grafo" → process row. -Indexed fields: participants, steps, mermaid_flow. -Edges: WIKI_LINK. 
-""" - -from __future__ import annotations - -from datetime import datetime -from typing import Any - -from kdd.application.extractors.base import ( - build_wiki_link_edges, - find_section, - find_section_with_children, - make_node_id, -) -from kdd.application.extractors.kinds.entity import _deduplicate_edges -from kdd.domain.entities import GraphEdge, GraphNode, KDDDocument -from kdd.domain.enums import KDDKind - - -class ProcessExtractor: - """Extractor for ``kind: process`` KDD documents.""" - - kind = KDDKind.PROCESS - - def extract_node(self, document: KDDDocument) -> GraphNode: - node_id = make_node_id(KDDKind.PROCESS, document.id) - fields: dict[str, Any] = {} - - participants = find_section( - document.sections, "Participantes", "Participants", - ) - if participants: - fields["participants"] = participants.content - - steps = find_section_with_children( - document.sections, "Pasos", "Steps", - ) - if steps: - fields["steps"] = steps - - diagram = find_section(document.sections, "Diagrama", "Diagram") - if diagram: - fields["mermaid_flow"] = diagram.content - - return GraphNode( - id=node_id, - kind=KDDKind.PROCESS, - source_file=document.source_path, - source_hash=document.source_hash, - layer=document.layer, - status=document.front_matter.get("status", "draft"), - aliases=document.front_matter.get("aliases", []), - domain=document.domain, - indexed_fields=fields, - indexed_at=datetime.now(), - ) - - def extract_edges(self, document: KDDDocument) -> list[GraphEdge]: - node_id = make_node_id(KDDKind.PROCESS, document.id) - edges: list[GraphEdge] = [] - edges.extend(build_wiki_link_edges(document, node_id, document.layer)) - return _deduplicate_edges(edges) diff --git a/src/kdd/application/extractors/kinds/query.py b/src/kdd/application/extractors/kinds/query.py deleted file mode 100644 index 325b09c..0000000 --- a/src/kdd/application/extractors/kinds/query.py +++ /dev/null @@ -1,64 +0,0 @@ -"""Query extractor — parses ``kind: query`` specs. 
- -Spec reference: PRD-KBEngine "Nodos del grafo" → query row. -Indexed fields: purpose, input_params, output_structure, errors. -Edges: WIKI_LINK. -""" - -from __future__ import annotations - -from datetime import datetime -from typing import Any - -from kdd.application.extractors.base import ( - build_wiki_link_edges, - find_section, - make_node_id, -) -from kdd.application.extractors.kinds.entity import _parse_table_rows, _deduplicate_edges -from kdd.domain.entities import GraphEdge, GraphNode, KDDDocument -from kdd.domain.enums import KDDKind - - -class QueryExtractor: - """Extractor for ``kind: query`` KDD documents.""" - - kind = KDDKind.QUERY - - def extract_node(self, document: KDDDocument) -> GraphNode: - node_id = make_node_id(KDDKind.QUERY, document.id) - fields: dict[str, Any] = {} - - purpose = find_section(document.sections, "Purpose", "Propósito") - if purpose: - fields["purpose"] = purpose.content - - input_sec = find_section(document.sections, "Input", "Entrada") - if input_sec: - fields["input_params"] = _parse_table_rows(input_sec.content) - - output_sec = find_section(document.sections, "Output", "Salida") - if output_sec: - fields["output_structure"] = output_sec.content - - errors = find_section(document.sections, "Possible Errors", "Errores Posibles") - if errors: - fields["errors"] = _parse_table_rows(errors.content) - - return GraphNode( - id=node_id, - kind=KDDKind.QUERY, - source_file=document.source_path, - source_hash=document.source_hash, - layer=document.layer, - status=document.front_matter.get("status", "draft"), - aliases=document.front_matter.get("aliases", []), - domain=document.domain, - indexed_fields=fields, - indexed_at=datetime.now(), - ) - - def extract_edges(self, document: KDDDocument) -> list[GraphEdge]: - node_id = make_node_id(KDDKind.QUERY, document.id) - edges = build_wiki_link_edges(document, node_id, document.layer) - return _deduplicate_edges(edges) diff --git a/src/kdd/application/extractors/kinds/requirement.py 
b/src/kdd/application/extractors/kinds/requirement.py deleted file mode 100644 index 8ef8830..0000000 --- a/src/kdd/application/extractors/kinds/requirement.py +++ /dev/null @@ -1,64 +0,0 @@ -"""Requirement extractor — parses ``kind: requirement`` specs. - -Spec reference: PRD-KBEngine "Nodos del grafo" → requirement row. -Indexed fields: description, acceptance_criteria, traceability. -Edges: WIKI_LINK. -""" - -from __future__ import annotations - -from datetime import datetime -from typing import Any - -from kdd.application.extractors.base import ( - build_wiki_link_edges, - find_section, - make_node_id, -) -from kdd.application.extractors.kinds.entity import _deduplicate_edges -from kdd.domain.entities import GraphEdge, GraphNode, KDDDocument -from kdd.domain.enums import KDDKind - - -class RequirementExtractor: - """Extractor for ``kind: requirement`` KDD documents.""" - - kind = KDDKind.REQUIREMENT - - def extract_node(self, document: KDDDocument) -> GraphNode: - node_id = make_node_id(KDDKind.REQUIREMENT, document.id) - fields: dict[str, Any] = {} - - desc = find_section(document.sections, "Descripción", "Description") - if desc: - fields["description"] = desc.content - - criteria = find_section( - document.sections, - "Criterios de Aceptación", "Acceptance Criteria", - ) - if criteria: - fields["acceptance_criteria"] = criteria.content - - trace = find_section(document.sections, "Trazabilidad", "Traceability") - if trace: - fields["traceability"] = trace.content - - return GraphNode( - id=node_id, - kind=KDDKind.REQUIREMENT, - source_file=document.source_path, - source_hash=document.source_hash, - layer=document.layer, - status=document.front_matter.get("status", "draft"), - aliases=document.front_matter.get("aliases", []), - domain=document.domain, - indexed_fields=fields, - indexed_at=datetime.now(), - ) - - def extract_edges(self, document: KDDDocument) -> list[GraphEdge]: - node_id = make_node_id(KDDKind.REQUIREMENT, document.id) - edges: list[GraphEdge] 
= [] - edges.extend(build_wiki_link_edges(document, node_id, document.layer)) - return _deduplicate_edges(edges) diff --git a/src/kdd/application/extractors/kinds/ui_component.py b/src/kdd/application/extractors/kinds/ui_component.py deleted file mode 100644 index a428c03..0000000 --- a/src/kdd/application/extractors/kinds/ui_component.py +++ /dev/null @@ -1,61 +0,0 @@ -"""UI-component extractor — parses ``kind: ui-component`` specs. - -Spec reference: PRD-KBEngine "Nodos del grafo" → ui-component row. -Indexed fields: description, entities, use_cases. -Edges: WIKI_LINK. -""" - -from __future__ import annotations - -from datetime import datetime -from typing import Any - -from kdd.application.extractors.base import ( - build_wiki_link_edges, - find_section, - make_node_id, -) -from kdd.application.extractors.kinds.entity import _deduplicate_edges -from kdd.domain.entities import GraphEdge, GraphNode, KDDDocument -from kdd.domain.enums import KDDKind - - -class UIComponentExtractor: - """Extractor for ``kind: ui-component`` KDD documents.""" - - kind = KDDKind.UI_COMPONENT - - def extract_node(self, document: KDDDocument) -> GraphNode: - node_id = make_node_id(KDDKind.UI_COMPONENT, document.id) - fields: dict[str, Any] = {} - - desc = find_section(document.sections, "Descripción", "Description") - if desc: - fields["description"] = desc.content - - entities = find_section(document.sections, "Entidades", "Entities") - if entities: - fields["entities"] = entities.content - - use_cases = find_section(document.sections, "Casos de Uso", "Use Cases") - if use_cases: - fields["use_cases"] = use_cases.content - - return GraphNode( - id=node_id, - kind=KDDKind.UI_COMPONENT, - source_file=document.source_path, - source_hash=document.source_hash, - layer=document.layer, - status=document.front_matter.get("status", "draft"), - aliases=document.front_matter.get("aliases", []), - domain=document.domain, - indexed_fields=fields, - indexed_at=datetime.now(), - ) - - def 
extract_edges(self, document: KDDDocument) -> list[GraphEdge]: - node_id = make_node_id(KDDKind.UI_COMPONENT, document.id) - edges: list[GraphEdge] = [] - edges.extend(build_wiki_link_edges(document, node_id, document.layer)) - return _deduplicate_edges(edges) diff --git a/src/kdd/application/extractors/kinds/ui_view.py b/src/kdd/application/extractors/kinds/ui_view.py deleted file mode 100644 index e372960..0000000 --- a/src/kdd/application/extractors/kinds/ui_view.py +++ /dev/null @@ -1,69 +0,0 @@ -"""UI-view extractor — parses ``kind: ui-view`` specs. - -Spec reference: PRD-KBEngine "Nodos del grafo" → ui-view row. -Indexed fields: description, layout, components, states, behavior. -Edges: WIKI_LINK. -""" - -from __future__ import annotations - -from datetime import datetime -from typing import Any - -from kdd.application.extractors.base import ( - build_wiki_link_edges, - find_section, - make_node_id, -) -from kdd.application.extractors.kinds.entity import _deduplicate_edges -from kdd.domain.entities import GraphEdge, GraphNode, KDDDocument -from kdd.domain.enums import KDDKind - - -class UIViewExtractor: - """Extractor for ``kind: ui-view`` KDD documents.""" - - kind = KDDKind.UI_VIEW - - def extract_node(self, document: KDDDocument) -> GraphNode: - node_id = make_node_id(KDDKind.UI_VIEW, document.id) - fields: dict[str, Any] = {} - - desc = find_section(document.sections, "Descripción", "Description") - if desc: - fields["description"] = desc.content - - layout = find_section(document.sections, "Layout", "Diseño") - if layout: - fields["layout"] = layout.content - - components = find_section(document.sections, "Componentes", "Components") - if components: - fields["components"] = components.content - - states = find_section(document.sections, "Estados", "States") - if states: - fields["states"] = states.content - - behavior = find_section(document.sections, "Comportamiento", "Behavior") - if behavior: - fields["behavior"] = behavior.content - - return 
GraphNode( - id=node_id, - kind=KDDKind.UI_VIEW, - source_file=document.source_path, - source_hash=document.source_hash, - layer=document.layer, - status=document.front_matter.get("status", "draft"), - aliases=document.front_matter.get("aliases", []), - domain=document.domain, - indexed_fields=fields, - indexed_at=datetime.now(), - ) - - def extract_edges(self, document: KDDDocument) -> list[GraphEdge]: - node_id = make_node_id(KDDKind.UI_VIEW, document.id) - edges: list[GraphEdge] = [] - edges.extend(build_wiki_link_edges(document, node_id, document.layer)) - return _deduplicate_edges(edges) diff --git a/src/kdd/application/extractors/kinds/use_case.py b/src/kdd/application/extractors/kinds/use_case.py deleted file mode 100644 index b4c47f8..0000000 --- a/src/kdd/application/extractors/kinds/use_case.py +++ /dev/null @@ -1,137 +0,0 @@ -"""Use-case extractor — parses ``kind: use-case`` specs. - -Spec reference: PRD-KBEngine "Nodos del grafo" → use-case row. -Indexed fields: description, actors, preconditions, main_flow, alternatives, - exceptions, postconditions. -Edges: UC_APPLIES_RULE, UC_EXECUTES_CMD, UC_STORY, WIKI_LINK. 
-""" - -from __future__ import annotations - -from datetime import datetime -from typing import Any - -from kdd.application.extractors.base import ( - build_wiki_link_edges, - find_section, - find_section_with_children, - make_node_id, - resolve_wiki_link_to_node_id, -) -from kdd.application.extractors.kinds.entity import _deduplicate_edges -from kdd.domain.entities import GraphEdge, GraphNode, KDDDocument -from kdd.domain.enums import KDDKind -from kdd.infrastructure.parsing.wiki_links import extract_wiki_links - - -class UseCaseExtractor: - """Extractor for ``kind: use-case`` KDD documents.""" - - kind = KDDKind.USE_CASE - - def extract_node(self, document: KDDDocument) -> GraphNode: - node_id = make_node_id(KDDKind.USE_CASE, document.id) - fields: dict[str, Any] = {} - - desc = find_section(document.sections, "Descripción", "Description") - if desc: - fields["description"] = desc.content - - actors = find_section(document.sections, "Actores", "Actors") - if actors: - fields["actors"] = actors.content - - pre = find_section(document.sections, "Precondiciones", "Preconditions") - if pre: - fields["preconditions"] = pre.content - - flow = find_section(document.sections, "Flujo Principal", "Main Flow") - if flow: - fields["main_flow"] = flow.content - - alt = find_section_with_children( - document.sections, "Flujos Alternativos", "Alternative Flows" - ) - if alt: - fields["alternatives"] = alt - - exc = find_section_with_children( - document.sections, "Excepciones", "Exceptions" - ) - if exc: - fields["exceptions"] = exc - - post = find_section(document.sections, "Postcondiciones", "Postconditions") - if post: - fields["postconditions"] = post.content - - return GraphNode( - id=node_id, - kind=KDDKind.USE_CASE, - source_file=document.source_path, - source_hash=document.source_hash, - layer=document.layer, - status=document.front_matter.get("status", "draft"), - aliases=document.front_matter.get("aliases", []), - domain=document.domain, - indexed_fields=fields, - 
indexed_at=datetime.now(), - ) - - def extract_edges(self, document: KDDDocument) -> list[GraphEdge]: - node_id = make_node_id(KDDKind.USE_CASE, document.id) - edges: list[GraphEdge] = [] - - # WIKI_LINK edges - edges.extend(build_wiki_link_edges(document, node_id, document.layer)) - - # UC_APPLIES_RULE from ## Reglas Aplicadas - rules_sec = find_section( - document.sections, "Reglas Aplicadas", "Applied Rules", "Rules Applied" - ) - if rules_sec: - for link in extract_wiki_links(rules_sec.content): - t = link.target - if t.startswith("BR-") or t.startswith("BP-") or t.startswith("XP-"): - to_node = resolve_wiki_link_to_node_id(link) - if to_node: - edges.append(GraphEdge( - from_node=node_id, - to_node=to_node, - edge_type="UC_APPLIES_RULE", - source_file=document.source_path, - extraction_method="wiki_link", - )) - - # UC_EXECUTES_CMD from ## Comandos Ejecutados - cmds_sec = find_section( - document.sections, "Comandos Ejecutados", "Commands Executed" - ) - if cmds_sec: - for link in extract_wiki_links(cmds_sec.content): - if link.target.startswith("CMD-"): - to_node = resolve_wiki_link_to_node_id(link) - if to_node: - edges.append(GraphEdge( - from_node=node_id, - to_node=to_node, - edge_type="UC_EXECUTES_CMD", - source_file=document.source_path, - extraction_method="wiki_link", - )) - - # UC_STORY from OBJ-* wiki-links anywhere in the document - full_content = "\n".join(s.content for s in document.sections) - for link in extract_wiki_links(full_content): - if link.target.startswith("OBJ-"): - to_node = resolve_wiki_link_to_node_id(link) - if to_node: - edges.append(GraphEdge( - from_node=node_id, - to_node=to_node, - edge_type="UC_STORY", - source_file=document.source_path, - extraction_method="wiki_link", - )) - - return _deduplicate_edges(edges) diff --git a/src/kdd/application/extractors/registry.py b/src/kdd/application/extractors/registry.py deleted file mode 100644 index 4a0673b..0000000 --- a/src/kdd/application/extractors/registry.py +++ /dev/null @@ 
-1,68 +0,0 @@ -"""Extractor registry — maps KDDKind to extractor instances. - -Auto-registers all extractors when the kinds sub-package is imported. -""" - -from __future__ import annotations - -from kdd.application.extractors.base import Extractor -from kdd.domain.enums import KDDKind - - -class ExtractorRegistry: - """Registry that maps :class:`KDDKind` to :class:`Extractor` instances.""" - - def __init__(self) -> None: - self._extractors: dict[KDDKind, Extractor] = {} - - def register(self, extractor: Extractor) -> None: - """Register an extractor for its ``kind``.""" - self._extractors[extractor.kind] = extractor - - def get(self, kind: KDDKind) -> Extractor | None: - """Return the extractor for *kind*, or ``None``.""" - return self._extractors.get(kind) - - @property - def registered_kinds(self) -> set[KDDKind]: - return set(self._extractors.keys()) - - def __len__(self) -> int: - return len(self._extractors) - - -def create_default_registry() -> ExtractorRegistry: - """Create a registry pre-loaded with all 15 extractors.""" - from kdd.application.extractors.kinds.adr import ADRExtractor - from kdd.application.extractors.kinds.business_policy import PolicyExtractor - from kdd.application.extractors.kinds.business_rule import RuleExtractor - from kdd.application.extractors.kinds.command import CommandExtractor - from kdd.application.extractors.kinds.cross_policy import CrossPolicyExtractor - from kdd.application.extractors.kinds.entity import EntityExtractor - from kdd.application.extractors.kinds.event import EventExtractor - from kdd.application.extractors.kinds.objective import ObjectiveExtractor - from kdd.application.extractors.kinds.prd import PRDExtractor - from kdd.application.extractors.kinds.process import ProcessExtractor - from kdd.application.extractors.kinds.query import QueryExtractor - from kdd.application.extractors.kinds.requirement import RequirementExtractor - from kdd.application.extractors.kinds.ui_component import UIComponentExtractor - 
from kdd.application.extractors.kinds.ui_view import UIViewExtractor - from kdd.application.extractors.kinds.use_case import UseCaseExtractor - - registry = ExtractorRegistry() - registry.register(EntityExtractor()) - registry.register(EventExtractor()) - registry.register(RuleExtractor()) - registry.register(PolicyExtractor()) - registry.register(CrossPolicyExtractor()) - registry.register(CommandExtractor()) - registry.register(QueryExtractor()) - registry.register(ProcessExtractor()) - registry.register(UseCaseExtractor()) - registry.register(UIViewExtractor()) - registry.register(UIComponentExtractor()) - registry.register(RequirementExtractor()) - registry.register(ObjectiveExtractor()) - registry.register(PRDExtractor()) - registry.register(ADRExtractor()) - return registry diff --git a/src/kdd/application/queries/__init__.py b/src/kdd/application/queries/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/kdd/application/queries/index_loader.py b/src/kdd/application/queries/index_loader.py deleted file mode 100644 index 1c17cbc..0000000 --- a/src/kdd/application/queries/index_loader.py +++ /dev/null @@ -1,69 +0,0 @@ -"""Index loader — reads .kdd-index/ artifacts into memory stores. - -Bridges the write side (ArtifactStore on disk) with the read side -(GraphStore + VectorStore in memory). Caches the loaded state and -reloads when the manifest changes. -""" - -from __future__ import annotations - -from kdd.domain.ports import ArtifactStore, GraphStore, VectorStore - - -class IndexLoader: - """Loads index artifacts into in-memory stores for querying. 
- - Usage:: - - loader = IndexLoader(artifact_store, graph_store, vector_store) - loader.load() # populates graph + vector stores - # … run queries against graph_store / vector_store … - """ - - def __init__( - self, - artifact_store: ArtifactStore, - graph_store: GraphStore, - vector_store: VectorStore | None = None, - ) -> None: - self._artifacts = artifact_store - self._graph = graph_store - self._vector = vector_store - self._loaded_manifest_hash: str | None = None - - @property - def is_loaded(self) -> bool: - return self._loaded_manifest_hash is not None - - def load(self, *, force: bool = False) -> bool: - """Load artifacts into memory stores. - - Returns True if stores were (re)loaded, False if the cache was still - valid and *force* was not requested. - """ - manifest = self._artifacts.read_manifest() - if manifest is None: - return False - - # Cache check: skip reload if manifest hasn't changed - manifest_hash = f"{manifest.indexed_at}:{manifest.stats.nodes}:{manifest.stats.edges}" - if not force and self._loaded_manifest_hash == manifest_hash: - return False - - # Load graph - nodes = self._artifacts.read_all_nodes() - edges = self._artifacts.read_edges() - self._graph.load(nodes, edges) - - # Load vectors (optional — L2+ only) - if self._vector is not None: - embeddings = self._artifacts.read_all_embeddings() - if embeddings: - self._vector.load(embeddings) - - self._loaded_manifest_hash = manifest_hash - return True - - def reload(self) -> bool: - """Force reload of artifacts.""" - return self.load(force=True) diff --git a/src/kdd/application/queries/retrieve_coverage.py b/src/kdd/application/queries/retrieve_coverage.py deleted file mode 100644 index 886bc7a..0000000 --- a/src/kdd/application/queries/retrieve_coverage.py +++ /dev/null @@ -1,132 +0,0 @@ -"""QRY-005 — RetrieveCoverage. - -Validates governance coverage for a node. Determines what related -artifacts **should exist** for a given kind and which are missing. 
-Corresponds to ``GET /v1/retrieve/coverage``. - -Spec: specs/02-behavior/queries/QRY-005-RetrieveCoverage.md -""" - -from __future__ import annotations - -from dataclasses import dataclass, field - -from kdd.domain.entities import GraphNode -from kdd.domain.enums import EdgeType, KDDKind -from kdd.domain.ports import GraphStore - - -@dataclass -class CoverageCategory: - """A required category of related artifacts.""" - - name: str - description: str - edge_type: str - status: str # "covered", "missing", "partial" - found: list[str] # node IDs - - -@dataclass -class CoverageQueryInput: - node_id: str - - -@dataclass -class CoverageQueryResult: - analyzed_node: GraphNode | None - categories: list[CoverageCategory] - present: int - missing: int - coverage_percent: float - - -# Coverage rules per kind: which related artifact types should exist -_COVERAGE_RULES: dict[KDDKind, list[tuple[str, str, str]]] = { - # (category_name, description, edge_type_to_check) - KDDKind.ENTITY: [ - ("events", "Domain events emitted by this entity", EdgeType.EMITS.value), - ("business_rules", "Business rules for this entity", EdgeType.ENTITY_RULE.value), - ("use_cases", "Use cases involving this entity", EdgeType.WIKI_LINK.value), - ], - KDDKind.COMMAND: [ - ("events", "Events emitted by this command", EdgeType.EMITS.value), - ("use_cases", "Use cases that execute this command", EdgeType.UC_EXECUTES_CMD.value), - ], - KDDKind.USE_CASE: [ - ("commands", "Commands executed by this use case", EdgeType.UC_EXECUTES_CMD.value), - ("rules", "Business rules applied", EdgeType.UC_APPLIES_RULE.value), - ("requirements", "Requirements tracing to this UC", EdgeType.REQ_TRACES_TO.value), - ], - KDDKind.BUSINESS_RULE: [ - ("entity", "Entity this rule validates", EdgeType.ENTITY_RULE.value), - ("use_cases", "Use cases that apply this rule", EdgeType.UC_APPLIES_RULE.value), - ], - KDDKind.REQUIREMENT: [ - ("traces", "Artifacts this requirement traces to", EdgeType.REQ_TRACES_TO.value), - ], -} - - -def 
retrieve_coverage( - query: CoverageQueryInput, - graph_store: GraphStore, -) -> CoverageQueryResult: - """Execute a coverage analysis query (QRY-005). - - Raises ValueError if node not found or kind has no coverage rules. - """ - if not graph_store.has_node(query.node_id): - raise ValueError(f"NODE_NOT_FOUND: {query.node_id}") - - node = graph_store.get_node(query.node_id) - if node is None: - raise ValueError(f"NODE_NOT_FOUND: {query.node_id}") - - rules = _COVERAGE_RULES.get(node.kind) - if rules is None: - raise ValueError(f"UNKNOWN_KIND: no coverage rules for kind '{node.kind.value}'") - - # Collect all edges involving this node - incoming = graph_store.incoming_edges(query.node_id) - outgoing = graph_store.outgoing_edges(query.node_id) - all_edges = incoming + outgoing - - categories: list[CoverageCategory] = [] - present = 0 - missing = 0 - - for cat_name, cat_desc, edge_type in rules: - # Find edges of this type connecting to/from our node - found_ids: list[str] = [] - for edge in all_edges: - if edge.edge_type == edge_type: - other = edge.to_node if edge.from_node == query.node_id else edge.from_node - if other not in found_ids: - found_ids.append(other) - - if found_ids: - status = "covered" - present += 1 - else: - status = "missing" - missing += 1 - - categories.append(CoverageCategory( - name=cat_name, - description=cat_desc, - edge_type=edge_type, - status=status, - found=found_ids, - )) - - total = present + missing - coverage_pct = (present / total * 100) if total > 0 else 0.0 - - return CoverageQueryResult( - analyzed_node=node, - categories=categories, - present=present, - missing=missing, - coverage_percent=round(coverage_pct, 1), - ) diff --git a/src/kdd/application/queries/retrieve_graph.py b/src/kdd/application/queries/retrieve_graph.py deleted file mode 100644 index c12f911..0000000 --- a/src/kdd/application/queries/retrieve_graph.py +++ /dev/null @@ -1,119 +0,0 @@ -"""QRY-001 — RetrieveByGraph. 
- -Graph traversal starting from a root node, following edges by type and -depth. Corresponds to ``GET /v1/retrieve/graph``. - -Spec: specs/02-behavior/queries/QRY-001-RetrieveByGraph.md -""" - -from __future__ import annotations - -from dataclasses import dataclass - -from kdd.domain.entities import GraphEdge, GraphNode, ScoredNode -from kdd.domain.enums import KDDKind -from kdd.domain.ports import GraphStore - - -@dataclass -class GraphQueryInput: - root_node: str - depth: int = 2 - edge_types: list[str] | None = None - include_kinds: list[KDDKind] | None = None - respect_layers: bool = True - - -@dataclass -class GraphQueryResult: - center_node: GraphNode | None - related_nodes: list[ScoredNode] - edges: list[GraphEdge] - total_nodes: int - total_edges: int - - -def retrieve_by_graph( - query: GraphQueryInput, - graph_store: GraphStore, -) -> GraphQueryResult: - """Execute a graph traversal query (QRY-001). - - Raises ValueError if root_node is not found. - """ - if not graph_store.has_node(query.root_node): - raise ValueError(f"NODE_NOT_FOUND: {query.root_node}") - - nodes, edges = graph_store.traverse( - root=query.root_node, - depth=query.depth, - edge_types=query.edge_types, - respect_layers=query.respect_layers, - ) - - # Filter by kind if requested - if query.include_kinds: - kind_set = set(query.include_kinds) - nodes = [n for n in nodes if n.kind in kind_set] - - center = graph_store.get_node(query.root_node) - - # Score by distance from root (center=1.0, further=lower) - # We use a simple heuristic: nodes found via BFS get decreasing scores - scored: list[ScoredNode] = [] - for node in nodes: - if node.id == query.root_node: - continue - # Approximate distance: count hops via shortest edge path - dist = _estimate_distance(node.id, query.root_node, edges) - score = 1.0 / (1.0 + dist) - scored.append(ScoredNode( - node_id=node.id, - score=score, - snippet=_build_snippet(node), - match_source="graph", - )) - - scored.sort(key=lambda s: s.score, 
reverse=True) - - return GraphQueryResult( - center_node=center, - related_nodes=scored, - edges=edges, - total_nodes=len(scored) + (1 if center else 0), - total_edges=len(edges), - ) - - -def _estimate_distance( - node_id: str, - root_id: str, - edges: list[GraphEdge], -) -> int: - """Rough hop count between root and node via BFS on edge list.""" - from collections import deque - - adj: dict[str, set[str]] = {} - for e in edges: - adj.setdefault(e.from_node, set()).add(e.to_node) - adj.setdefault(e.to_node, set()).add(e.from_node) - - visited = {root_id} - queue: deque[tuple[str, int]] = deque([(root_id, 0)]) - while queue: - current, dist = queue.popleft() - if current == node_id: - return dist - for neighbor in adj.get(current, set()): - if neighbor not in visited: - visited.add(neighbor) - queue.append((neighbor, dist + 1)) - return 999 # unreachable - - -def _build_snippet(node: GraphNode) -> str: - """Build a short snippet from node indexed_fields.""" - title = node.indexed_fields.get("title", "") - if title: - return f"[{node.kind.value}] {title}" - return f"[{node.kind.value}] {node.id}" diff --git a/src/kdd/application/queries/retrieve_hybrid.py b/src/kdd/application/queries/retrieve_hybrid.py deleted file mode 100644 index e7ed89b..0000000 --- a/src/kdd/application/queries/retrieve_hybrid.py +++ /dev/null @@ -1,238 +0,0 @@ -"""QRY-003 — RetrieveHybrid. - -The primary query for AI agents. Combines semantic (QRY-002), graph -(QRY-001), and lexical search. Results fused with combined scoring. -Corresponds to ``POST /v1/retrieve/context``. 
- -Spec: specs/02-behavior/queries/QRY-003-RetrieveHybrid.md - -Fusion scoring priority: -- semantic + graph = highest -- semantic only = medium-high -- graph only = medium -- lexical only = low -""" - -from __future__ import annotations - -from dataclasses import dataclass - -from kdd.domain.entities import GraphEdge, ScoredNode -from kdd.domain.enums import KDDKind, KDDLayer -from kdd.domain.ports import EmbeddingModel, GraphStore, VectorStore -from kdd.infrastructure.parsing.tokenization import count_tokens - - -@dataclass -class HybridQueryInput: - query_text: str - expand_graph: bool = True - depth: int = 2 - include_kinds: list[KDDKind] | None = None - include_layers: list[KDDLayer] | None = None - respect_layers: bool = True - min_score: float = 0.5 - limit: int = 10 - max_tokens: int = 8000 - - -@dataclass -class HybridQueryResult: - results: list[ScoredNode] - graph_expansion: list[GraphEdge] - total_results: int - total_tokens: int - warnings: list[str] - - -# Fusion score weights -_WEIGHT_SEMANTIC = 0.6 -_WEIGHT_GRAPH = 0.3 -_WEIGHT_LEXICAL = 0.1 - - -def retrieve_hybrid( - query: HybridQueryInput, - graph_store: GraphStore, - vector_store: VectorStore | None = None, - embedding_model: EmbeddingModel | None = None, -) -> HybridQueryResult: - """Execute a hybrid search combining semantic, graph, and lexical (QRY-003). 
- - Gracefully degrades: - - No vector_store/embedding_model → graph + lexical only (warning) - """ - if len(query.query_text.strip()) < 3: - raise ValueError("QUERY_TOO_SHORT: query_text must be at least 3 characters") - - warnings: list[str] = [] - # Accumulators: node_id → {source: score} - scores: dict[str, dict[str, float]] = {} - - # ── Phase 1: Semantic search ────────────────────────────────────── - if vector_store is not None and embedding_model is not None: - vectors = embedding_model.encode([query.query_text]) - matches = vector_store.search( - vector=vectors[0], - limit=query.limit * 3, - min_score=query.min_score * 0.8, # slightly lower threshold for fusion - ) - for emb_id, score in matches: - node_id = _emb_id_to_node_id(emb_id, graph_store) - if node_id is None: - continue - scores.setdefault(node_id, {})["semantic"] = max( - scores.get(node_id, {}).get("semantic", 0), score - ) - else: - warnings.append("NO_EMBEDDINGS: index is L1, semantic search skipped") - - # ── Phase 2: Lexical search ─────────────────────────────────────── - lexical_nodes = graph_store.text_search(query.query_text) - for node in lexical_nodes: - if _kind_layer_filter(node, query.include_kinds, query.include_layers): - scores.setdefault(node.id, {})["lexical"] = 0.5 - - # ── Phase 3: Graph expansion ────────────────────────────────────── - all_graph_edges: list[GraphEdge] = [] - if query.expand_graph: - # Expand from all nodes found so far - seed_ids = list(scores.keys()) - for seed_id in seed_ids: - if not graph_store.has_node(seed_id): - continue - nodes, edges = graph_store.traverse( - root=seed_id, - depth=query.depth, - respect_layers=query.respect_layers, - ) - all_graph_edges.extend(edges) - for n in nodes: - if n.id == seed_id: - continue - if _kind_layer_filter(n, query.include_kinds, query.include_layers): - scores.setdefault(n.id, {})["graph"] = 0.5 - - # ── Phase 4: Fusion scoring ─────────────────────────────────────── - fused: list[ScoredNode] = [] - for node_id, 
sources in scores.items(): - node = graph_store.get_node(node_id) - if node is None: - continue - if not _kind_layer_filter(node, query.include_kinds, query.include_layers): - continue - - score = _compute_fusion_score(sources) - if score < query.min_score: - continue - - match_source = _determine_match_source(sources) - snippet = _build_snippet(node) - fused.append(ScoredNode( - node_id=node_id, - score=score, - snippet=snippet, - match_source=match_source, - )) - - # Sort by score, apply limit - fused.sort(key=lambda s: s.score, reverse=True) - - # Token truncation - final_results: list[ScoredNode] = [] - total_tokens = 0 - for scored in fused: - snippet_tokens = count_tokens(scored.snippet or "") - if total_tokens + snippet_tokens > query.max_tokens and final_results: - break - final_results.append(scored) - total_tokens += snippet_tokens - if len(final_results) >= query.limit: - break - - # Deduplicate graph edges - seen: set[tuple[str, str, str]] = set() - unique_edges: list[GraphEdge] = [] - for e in all_graph_edges: - key = (e.from_node, e.to_node, e.edge_type) - if key not in seen: - seen.add(key) - unique_edges.append(e) - - return HybridQueryResult( - results=final_results, - graph_expansion=unique_edges, - total_results=len(final_results), - total_tokens=total_tokens, - warnings=warnings, - ) - - -# --------------------------------------------------------------------------- -# Internal helpers -# --------------------------------------------------------------------------- - - -def _emb_id_to_node_id(emb_id: str, graph_store: GraphStore) -> str | None: - """Resolve an embedding ID to the owning node ID.""" - from kdd.application.extractors.base import KIND_PREFIX - - doc_id = emb_id.rsplit(":chunk-", 1)[0] if ":chunk-" in emb_id else emb_id.split(":")[0] - - for prefix in KIND_PREFIX.values(): - candidate = f"{prefix}:{doc_id}" - if graph_store.has_node(candidate): - return candidate - if graph_store.has_node(doc_id): - return doc_id - return None - - -def 
_kind_layer_filter(node, include_kinds, include_layers) -> bool: - if include_kinds and node.kind not in include_kinds: - return False - if include_layers and node.layer not in include_layers: - return False - return True - - -def _compute_fusion_score(sources: dict[str, float]) -> float: - """Weighted fusion of multiple retrieval sources.""" - semantic = sources.get("semantic", 0) - graph = sources.get("graph", 0) - lexical = sources.get("lexical", 0) - - # Bonus for multi-source matches - source_count = sum(1 for v in sources.values() if v > 0) - bonus = 0.1 * (source_count - 1) if source_count > 1 else 0 - - weighted = ( - semantic * _WEIGHT_SEMANTIC - + graph * _WEIGHT_GRAPH - + lexical * _WEIGHT_LEXICAL - + bonus - ) - - # Normalize to [0, 1] - return min(weighted / (_WEIGHT_SEMANTIC + _WEIGHT_GRAPH + _WEIGHT_LEXICAL + 0.2), 1.0) - - -def _determine_match_source(sources: dict[str, float]) -> str: - has_semantic = sources.get("semantic", 0) > 0 - has_graph = sources.get("graph", 0) > 0 - has_lexical = sources.get("lexical", 0) > 0 - - if has_semantic and has_graph: - return "fusion" - if has_semantic: - return "semantic" - if has_graph: - return "graph" - return "lexical" - - -def _build_snippet(node) -> str: - title = node.indexed_fields.get("title", "") - if title: - return f"[{node.kind.value}] {title}" - return f"[{node.kind.value}] {node.id}" diff --git a/src/kdd/application/queries/retrieve_impact.py b/src/kdd/application/queries/retrieve_impact.py deleted file mode 100644 index 56e72e8..0000000 --- a/src/kdd/application/queries/retrieve_impact.py +++ /dev/null @@ -1,154 +0,0 @@ -"""QRY-004 — RetrieveImpact. - -Impact analysis from a node. Returns all directly and transitively -affected nodes, dependency chains, and BDD scenarios to re-run. -Corresponds to ``GET /v1/retrieve/impact``. 
- -Spec: specs/02-behavior/queries/QRY-004-RetrieveImpact.md -""" - -from __future__ import annotations - -from dataclasses import dataclass, field - -from kdd.domain.entities import GraphEdge, GraphNode, ScoredNode -from kdd.domain.enums import EdgeType -from kdd.domain.ports import GraphStore - - -@dataclass -class AffectedNode: - """A node affected by a change, with the dependency path.""" - - node_id: str - kind: str - edge_type: str - impact_description: str - - -@dataclass -class TransitivelyAffected: - """A node transitively affected, with the full dependency chain.""" - - node_id: str - kind: str - path: list[str] # node IDs from root to this node - edge_types: list[str] # edge types along the path - - -@dataclass -class ScenarioToRerun: - """A BDD scenario that should be re-run after a change.""" - - node_id: str - scenario_name: str - reason: str - - -@dataclass -class ImpactQueryInput: - node_id: str - change_type: str = "modify_attribute" - depth: int = 3 - - -@dataclass -class ImpactQueryResult: - analyzed_node: GraphNode | None - directly_affected: list[AffectedNode] - transitively_affected: list[TransitivelyAffected] - scenarios_to_rerun: list[ScenarioToRerun] - total_directly: int - total_transitively: int - - -def retrieve_impact( - query: ImpactQueryInput, - graph_store: GraphStore, -) -> ImpactQueryResult: - """Execute an impact analysis query (QRY-004). - - Follows *incoming* edges to find nodes that depend on the queried node. 
- """ - if not graph_store.has_node(query.node_id): - raise ValueError(f"NODE_NOT_FOUND: {query.node_id}") - - analyzed = graph_store.get_node(query.node_id) - - # Phase 1: Direct dependents (nodes with edges pointing TO query.node_id) - direct_edges = graph_store.incoming_edges(query.node_id) - directly_affected: list[AffectedNode] = [] - direct_ids: set[str] = set() - - for edge in direct_edges: - pred_node = graph_store.get_node(edge.from_node) - if pred_node is None: - continue - direct_ids.add(pred_node.id) - directly_affected.append(AffectedNode( - node_id=pred_node.id, - kind=pred_node.kind.value, - edge_type=edge.edge_type, - impact_description=_describe_impact(edge, query.change_type), - )) - - # Phase 2: Transitive dependents (BFS on incoming edges beyond depth 1) - transitively_affected: list[TransitivelyAffected] = [] - if query.depth > 1: - reverse_results = graph_store.reverse_traverse(query.node_id, query.depth) - for node, path_edges in reverse_results: - if node.id in direct_ids or node.id == query.node_id: - continue - path_ids = [query.node_id] - edge_types = [] - for e in path_edges: - path_ids.append(e.from_node) - edge_types.append(e.edge_type) - transitively_affected.append(TransitivelyAffected( - node_id=node.id, - kind=node.kind.value, - path=path_ids, - edge_types=edge_types, - )) - - # Phase 3: Find BDD scenarios (nodes with VALIDATES edges) - scenarios: list[ScenarioToRerun] = [] - all_affected_ids = direct_ids | {t.node_id for t in transitively_affected} - all_affected_ids.add(query.node_id) - - for edge in graph_store.all_edges(): - if edge.edge_type == EdgeType.VALIDATES.value: - if edge.to_node in all_affected_ids: - feature_node = graph_store.get_node(edge.from_node) - if feature_node: - scenarios.append(ScenarioToRerun( - node_id=feature_node.id, - scenario_name=feature_node.indexed_fields.get("title", feature_node.id), - reason=f"Validates {edge.to_node} which is affected", - )) - - return ImpactQueryResult( - 
analyzed_node=analyzed, - directly_affected=directly_affected, - transitively_affected=transitively_affected, - scenarios_to_rerun=scenarios, - total_directly=len(directly_affected), - total_transitively=len(transitively_affected), - ) - - -def _describe_impact(edge: GraphEdge, change_type: str) -> str: - """Generate a human-readable impact description.""" - type_desc = { - "ENTITY_RULE": "Business rule validates this entity", - "UC_APPLIES_RULE": "Use case applies this rule", - "UC_EXECUTES_CMD": "Use case executes this command", - "EMITS": "Emits this event", - "CONSUMES": "Consumes this event", - "WIKI_LINK": "References this artifact", - "DOMAIN_RELATION": "Has a domain relationship", - "REQ_TRACES_TO": "Requirement traces to this artifact", - "VALIDATES": "Validates this artifact via BDD scenarios", - } - desc = type_desc.get(edge.edge_type, f"Connected via {edge.edge_type}") - return f"{desc} — change type: {change_type}" diff --git a/src/kdd/application/queries/retrieve_semantic.py b/src/kdd/application/queries/retrieve_semantic.py deleted file mode 100644 index e91c283..0000000 --- a/src/kdd/application/queries/retrieve_semantic.py +++ /dev/null @@ -1,119 +0,0 @@ -"""QRY-002 — RetrieveSemantic. - -Semantic search over embeddings, finding document fragments most similar -to query text. Corresponds to ``POST /v1/retrieve/search``. 
- -Spec: specs/02-behavior/queries/QRY-002-RetrieveSemantic.md -""" - -from __future__ import annotations - -from dataclasses import dataclass, field - -from kdd.domain.entities import ScoredNode -from kdd.domain.enums import KDDKind, KDDLayer -from kdd.domain.ports import EmbeddingModel, GraphStore, VectorStore - - -@dataclass -class SemanticQueryInput: - query_text: str - include_kinds: list[KDDKind] | None = None - include_layers: list[KDDLayer] | None = None - min_score: float = 0.7 - limit: int = 10 - - -@dataclass -class SemanticQueryResult: - results: list[ScoredNode] - total_results: int - embedding_model: str - - -def retrieve_semantic( - query: SemanticQueryInput, - embedding_model: EmbeddingModel, - vector_store: VectorStore, - graph_store: GraphStore, -) -> SemanticQueryResult: - """Execute a semantic search query (QRY-002). - - Raises ValueError if query_text is too short. - """ - if len(query.query_text.strip()) < 3: - raise ValueError("QUERY_TOO_SHORT: query_text must be at least 3 characters") - - # Encode query - vectors = embedding_model.encode([query.query_text]) - query_vector = vectors[0] - - # Search vector store - matches = vector_store.search( - vector=query_vector, - limit=query.limit * 3, # over-fetch to account for post-filtering - min_score=query.min_score, - ) - - # Resolve embedding IDs to nodes and filter - seen_nodes: set[str] = set() - results: list[ScoredNode] = [] - - for emb_id, score in matches: - # Embedding ID format: "{document_id}:chunk-{n}" - doc_id = emb_id.rsplit(":chunk-", 1)[0] if ":chunk-" in emb_id else emb_id.split(":")[0] - - # Find the node for this document - node = _find_node_for_doc(doc_id, graph_store) - if node is None: - continue - - # Deduplicate by node - if node.id in seen_nodes: - continue - seen_nodes.add(node.id) - - # Filter by kind - if query.include_kinds and node.kind not in query.include_kinds: - continue - - # Filter by layer - if query.include_layers and node.layer not in query.include_layers: - 
continue - - results.append(ScoredNode( - node_id=node.id, - score=score, - snippet=_build_snippet(node, emb_id), - match_source="semantic", - )) - - if len(results) >= query.limit: - break - - return SemanticQueryResult( - results=results, - total_results=len(results), - embedding_model=embedding_model.model_name, - ) - - -def _find_node_for_doc(doc_id: str, graph_store: GraphStore): - """Try to find a GraphNode whose document_id matches.""" - # Try direct lookup with common prefixes - from kdd.application.extractors.base import KIND_PREFIX - - for prefix in KIND_PREFIX.values(): - node = graph_store.get_node(f"{prefix}:{doc_id}") - if node is not None: - return node - - # Fallback: search by ID substring - return graph_store.get_node(doc_id) - - -def _build_snippet(node, emb_id: str) -> str: - title = node.indexed_fields.get("title", "") - if title: - return f"[{node.kind.value}] {title}" - return f"[{node.kind.value}] {node.id}" diff --git a/src/kdd/application/queries/retrieve_violations.py b/src/kdd/application/queries/retrieve_violations.py deleted file mode 100644 index 4140884..0000000 --- a/src/kdd/application/queries/retrieve_violations.py +++ /dev/null @@ -1,84 +0,0 @@ -"""QRY-006 — RetrieveLayerViolations. - -Detects and reports all edges violating KDD layer dependencies -(BR-LAYER-001). Corresponds to ``GET /v1/retrieve/layer-violations``. 
- -Spec: specs/02-behavior/queries/QRY-006-RetrieveLayerViolations.md -""" - -from __future__ import annotations - -from dataclasses import dataclass - -from kdd.domain.entities import GraphEdge, LayerViolation -from kdd.domain.enums import KDDKind, KDDLayer -from kdd.domain.ports import GraphStore - - -@dataclass -class ViolationsQueryInput: - include_kinds: list[KDDKind] | None = None - include_layers: list[KDDLayer] | None = None - - -@dataclass -class ViolationsQueryResult: - violations: list[LayerViolation] - total_violations: int - total_edges_analyzed: int - violation_rate: float # percentage - - -def retrieve_violations( - query: ViolationsQueryInput, - graph_store: GraphStore, -) -> ViolationsQueryResult: - """Execute a layer-violations query (QRY-006).""" - all_edges = graph_store.all_edges() - violation_edges = graph_store.find_violations() - - # Apply optional filters — include if EITHER endpoint matches - if query.include_kinds or query.include_layers: - filtered: list[GraphEdge] = [] - for edge in violation_edges: - from_node = graph_store.get_node(edge.from_node) - to_node = graph_store.get_node(edge.to_node) - - if query.include_kinds: - from_match = from_node and from_node.kind in query.include_kinds - to_match = to_node and to_node.kind in query.include_kinds - if not (from_match or to_match): - continue - - if query.include_layers: - from_match = from_node and from_node.layer in query.include_layers - to_match = to_node and to_node.layer in query.include_layers - if not (from_match or to_match): - continue - - filtered.append(edge) - violation_edges = filtered - - # Build violation details - violations: list[LayerViolation] = [] - for edge in violation_edges: - from_node = graph_store.get_node(edge.from_node) - to_node = graph_store.get_node(edge.to_node) - - violations.append(LayerViolation( - from_node=edge.from_node, - to_node=edge.to_node, - from_layer=from_node.layer if from_node else KDDLayer.DOMAIN, - to_layer=to_node.layer if to_node else 
KDDLayer.DOMAIN, - edge_type=edge.edge_type, - )) - - total = len(all_edges) - rate = (len(violations) / total * 100) if total > 0 else 0.0 - - return ViolationsQueryResult( - violations=violations, - total_violations=len(violations), - total_edges_analyzed=total, - violation_rate=round(rate, 2), - ) diff --git a/src/kdd/container.py b/src/kdd/container.py deleted file mode 100644 index 1cff88a..0000000 --- a/src/kdd/container.py +++ /dev/null @@ -1,119 +0,0 @@ -"""Dependency injection container. - -Wires all infrastructure adapters and application components based on -available resources (embedding model, API keys). Used by CLI and API -entry points. -""" - -from __future__ import annotations - -import logging -import shutil -from dataclasses import dataclass, field -from pathlib import Path - -from kdd.application.extractors.registry import ExtractorRegistry, create_default_registry -from kdd.application.queries.index_loader import IndexLoader -from kdd.domain.enums import IndexLevel -from kdd.domain.ports import AgentClient, EmbeddingModel, EventBus, GraphStore, VectorStore -from kdd.domain.rules import detect_index_level -from kdd.infrastructure.artifact.filesystem import FilesystemArtifactStore -from kdd.infrastructure.events.bus import InMemoryEventBus -from kdd.infrastructure.graph.networkx_store import NetworkXGraphStore - -logger = logging.getLogger(__name__) - - -@dataclass -class Container: - """Holds all wired dependencies for a KDD session.""" - - specs_root: Path - index_path: Path - index_level: IndexLevel - artifact_store: FilesystemArtifactStore - graph_store: NetworkXGraphStore - vector_store: VectorStore | None - embedding_model: EmbeddingModel | None - event_bus: EventBus - registry: ExtractorRegistry - loader: IndexLoader - agent_client: AgentClient | None = None - - def ensure_loaded(self) -> bool: - """Load index into memory if not already loaded.""" - return self.loader.load() - - -def create_container( - specs_root: Path, - index_path: Path 
| None = None, - *, - embedding_model_name: str | None = None, -) -> Container: - """Create a fully wired Container. - - Auto-detects index level based on available resources: - - L1: always (no embedding model needed) - - L2: if sentence-transformers + hnswlib are available - - L3: not auto-detected (requires explicit agent config) - """ - if index_path is None: - index_path = specs_root.parent / ".kdd-index" - - artifact_store = FilesystemArtifactStore(index_path) - graph_store = NetworkXGraphStore() - event_bus = InMemoryEventBus() - registry = create_default_registry() - - # Attempt to load embedding model and vector store - embedding_model: EmbeddingModel | None = None - vector_store: VectorStore | None = None - - if embedding_model_name is not False: - try: - from kdd.infrastructure.embedding.sentence_transformer import ( - SentenceTransformerModel, - ) - from kdd.infrastructure.vector.hnswlib_store import HNSWLibVectorStore - - model_name = embedding_model_name or "all-mpnet-base-v2" - embedding_model = SentenceTransformerModel(model_name) - vector_store = HNSWLibVectorStore() - logger.info("L2 available: embedding model '%s' loaded", model_name) - except ImportError: - logger.info("L2 not available: sentence-transformers or hnswlib not installed") - except Exception as e: - logger.warning("L2 not available: %s", e) - - # Attempt to detect Claude CLI for L3 enrichment - agent_client: AgentClient | None = None - if shutil.which("claude"): - try: - from kdd.infrastructure.agent.claude_cli import ClaudeCliAgentClient - - agent_client = ClaudeCliAgentClient() - logger.info("L3 available: Claude CLI detected") - except Exception as e: - logger.warning("L3 not available: %s", e) - - index_level = detect_index_level( - embedding_model_available=embedding_model is not None, - agent_api_available=agent_client is not None, - ) - - loader = IndexLoader(artifact_store, graph_store, vector_store) - - return Container( - specs_root=specs_root, - index_path=index_path, - 
index_level=index_level, - artifact_store=artifact_store, - graph_store=graph_store, - vector_store=vector_store, - embedding_model=embedding_model, - event_bus=event_bus, - registry=registry, - loader=loader, - agent_client=agent_client, - ) diff --git a/src/kdd/domain/__init__.py b/src/kdd/domain/__init__.py deleted file mode 100644 index b2005c4..0000000 --- a/src/kdd/domain/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""KDD domain layer — entities, enums, events, rules, and ports.""" diff --git a/src/kdd/domain/entities.py b/src/kdd/domain/entities.py deleted file mode 100644 index 833f6b5..0000000 --- a/src/kdd/domain/entities.py +++ /dev/null @@ -1,201 +0,0 @@ -"""Domain entities for KDD. - -All entities are Pydantic BaseModels aligned with their spec definitions -under ``specs/01-domain/entities/``. -""" - -from __future__ import annotations - -from datetime import datetime -from typing import Any -from uuid import UUID - -from pydantic import BaseModel, Field - -from kdd.domain.enums import ( - DocumentStatus, - IndexLevel, - KDDKind, - KDDLayer, - QueryStatus, - RetrievalStrategy, -) - - -# --------------------------------------------------------------------------- -# Value objects -# --------------------------------------------------------------------------- - - -class Section(BaseModel): - """A Markdown section extracted from a KDD document.""" - - heading: str - level: int - content: str - path: str = "" # hierarchical path e.g. 
"descripcion.atributos" - - -class IndexStats(BaseModel): - """Aggregate counts stored in an IndexManifest.""" - - nodes: int = 0 - edges: int = 0 - embeddings: int = 0 - enrichments: int = 0 - - -class ScoredNode(BaseModel): - """A graph node scored by the retrieval engine.""" - - node_id: str - score: float - snippet: str | None = None - match_source: str # "semantic", "graph", "lexical", "fusion" - - -class LayerViolation(BaseModel): - """A detected layer-dependency violation between two nodes.""" - - from_node: str - to_node: str - from_layer: KDDLayer - to_layer: KDDLayer - edge_type: str - - -# --------------------------------------------------------------------------- -# Core entities -# --------------------------------------------------------------------------- - - -class KDDDocument(BaseModel): - """A parsed KDD spec file — the atomic input to the indexing pipeline. - - Spec: specs/01-domain/entities/KDDDocument.md - """ - - id: str - kind: KDDKind - source_path: str - source_hash: str - layer: KDDLayer - front_matter: dict[str, Any] - sections: list[Section] - wiki_links: list[str] = Field(default_factory=list) - status: DocumentStatus = DocumentStatus.DETECTED - indexed_at: datetime | None = None - domain: str | None = None - - -class GraphNode(BaseModel): - """A node in the knowledge graph, produced by indexing a KDDDocument. - - Spec: specs/01-domain/entities/GraphNode.md - """ - - id: str # "{Kind}:{DocumentId}" e.g. "Entity:Pedido" - kind: KDDKind - source_file: str - source_hash: str - layer: KDDLayer - status: str = "draft" # artifact status (draft/review/approved/deprecated) - aliases: list[str] = Field(default_factory=list) - domain: str | None = None - indexed_fields: dict[str, Any] = Field(default_factory=dict) - indexed_at: datetime | None = None - - -class GraphEdge(BaseModel): - """A typed, directed relationship between two GraphNodes. 
- - Spec: specs/01-domain/entities/GraphEdge.md - """ - - from_node: str - to_node: str - edge_type: str # SCREAMING_SNAKE = structural, snake_case = business - source_file: str - extraction_method: str # "wiki_link", "section_content", "implicit" - metadata: dict[str, Any] = Field(default_factory=dict) - layer_violation: bool = False - bidirectional: bool = False - - -class Embedding(BaseModel): - """A semantic vector generated from a paragraph of a KDDDocument. - - Spec: specs/01-domain/entities/Embedding.md - """ - - id: str # "{document_id}:{section_path}:{chunk_index}" - document_id: str - document_kind: KDDKind - section_path: str - chunk_index: int - raw_text: str - context_text: str - vector: list[float] - model: str - dimensions: int - text_hash: str - generated_at: datetime - - -class IndexManifest(BaseModel): - """Metadata for a generated index stored in ``.kdd-index/manifest.json``. - - Spec: specs/01-domain/entities/IndexManifest.md - """ - - version: str # semver - kdd_version: str - embedding_model: str | None = None - embedding_dimensions: int | None = None - indexed_at: datetime - indexed_by: str - structure: str = "single-domain" # "single-domain" | "multi-domain" - index_level: IndexLevel - stats: IndexStats = Field(default_factory=IndexStats) - domains: list[str] = Field(default_factory=list) - git_commit: str | None = None - - -class RetrievalQuery(BaseModel): - """A query from an AI agent or developer to the retrieval engine. 
- - Spec: specs/01-domain/entities/RetrievalQuery.md - """ - - id: UUID - strategy: RetrievalStrategy - query_text: str | None = None - root_node: str | None = None - depth: int = 2 - edge_types: list[str] = Field(default_factory=list) - include_kinds: list[KDDKind] = Field(default_factory=list) - include_layers: list[KDDLayer] = Field(default_factory=list) - respect_layers: bool = True - min_score: float = 0.7 - limit: int = 10 - max_tokens: int = 8000 - status: QueryStatus = QueryStatus.RECEIVED - received_at: datetime - completed_at: datetime | None = None - duration_ms: int | None = None - caller: str | None = None - - -class RetrievalResult(BaseModel): - """The response returned by the retrieval engine for a RetrievalQuery. - - Spec: specs/01-domain/entities/RetrievalResult.md - """ - - query_id: UUID - strategy: RetrievalStrategy - results: list[ScoredNode] - graph_expansion: list[GraphEdge] = Field(default_factory=list) - total_nodes: int - total_tokens: int | None = None - layer_violations: list[LayerViolation] = Field(default_factory=list) diff --git a/src/kdd/domain/enums.py b/src/kdd/domain/enums.py deleted file mode 100644 index 4ee06d4..0000000 --- a/src/kdd/domain/enums.py +++ /dev/null @@ -1,122 +0,0 @@ -"""Domain enumerations for KDD. - -Spec references: -- KDDKind: PRD-KBEngine (Nodos del grafo), BR-DOCUMENT-001 -- KDDLayer: BR-LAYER-001 -- EdgeType: GraphEdge entity spec (structural edge types) -- IndexLevel: BR-INDEX-001 -- RetrievalStrategy: RetrievalQuery entity spec -""" - -from __future__ import annotations - -from enum import Enum - - -class KDDKind(str, Enum): - """The 15 KDD artifact types recognized by the engine. - - Each value corresponds to the ``kind`` field in a spec's front-matter - and maps to a dedicated extractor (BR-DOCUMENT-001). 
- """ - - ENTITY = "entity" - EVENT = "event" - BUSINESS_RULE = "business-rule" - BUSINESS_POLICY = "business-policy" - CROSS_POLICY = "cross-policy" - COMMAND = "command" - QUERY = "query" - PROCESS = "process" - USE_CASE = "use-case" - UI_VIEW = "ui-view" - UI_COMPONENT = "ui-component" - REQUIREMENT = "requirement" - OBJECTIVE = "objective" - PRD = "prd" - ADR = "adr" - - -class KDDLayer(str, Enum): - """KDD layers ordered from bottom (requirements) to top (verification). - - The numeric prefix determines the dependency direction: - higher layers may reference lower layers, not the reverse (BR-LAYER-001). - ``00-requirements`` is exempt from this rule. - """ - - REQUIREMENTS = "00-requirements" - DOMAIN = "01-domain" - BEHAVIOR = "02-behavior" - EXPERIENCE = "03-experience" - VERIFICATION = "04-verification" - - @property - def numeric(self) -> int: - """Return the numeric prefix (0-4) for layer comparison.""" - return int(self.value[:2]) - - -class DocumentStatus(str, Enum): - """Lifecycle states of a KDDDocument in the indexing pipeline.""" - - DETECTED = "detected" - PARSING = "parsing" - INDEXED = "indexed" - STALE = "stale" - DELETED = "deleted" - - -class QueryStatus(str, Enum): - """Lifecycle states of a RetrievalQuery.""" - - RECEIVED = "received" - RESOLVING = "resolving" - COMPLETED = "completed" - FAILED = "failed" - - -class EdgeType(str, Enum): - """Structural (SCREAMING_SNAKE_CASE) edge types extracted by the engine. - - Business-domain edges (snake_case) are free-form strings defined by spec - authors and are NOT enumerated here. 
- """ - - WIKI_LINK = "WIKI_LINK" - DOMAIN_RELATION = "DOMAIN_RELATION" - ENTITY_RULE = "ENTITY_RULE" - ENTITY_POLICY = "ENTITY_POLICY" - EMITS = "EMITS" - CONSUMES = "CONSUMES" - UC_APPLIES_RULE = "UC_APPLIES_RULE" - UC_EXECUTES_CMD = "UC_EXECUTES_CMD" - UC_STORY = "UC_STORY" - VIEW_TRIGGERS_UC = "VIEW_TRIGGERS_UC" - VIEW_USES_COMPONENT = "VIEW_USES_COMPONENT" - COMPONENT_USES_ENTITY = "COMPONENT_USES_ENTITY" - REQ_TRACES_TO = "REQ_TRACES_TO" - VALIDATES = "VALIDATES" - DECIDES_FOR = "DECIDES_FOR" - CROSS_DOMAIN_REF = "CROSS_DOMAIN_REF" - - -class IndexLevel(str, Enum): - """Progressive indexing levels (BR-INDEX-001). - - L1 is always available. L2 requires a local embedding model. - L3 requires an AI agent API key. - """ - - L1 = "L1" - L2 = "L2" - L3 = "L3" - - -class RetrievalStrategy(str, Enum): - """Available retrieval strategies (RetrievalQuery spec).""" - - GRAPH = "graph" - SEMANTIC = "semantic" - HYBRID = "hybrid" - IMPACT = "impact" diff --git a/src/kdd/domain/events.py b/src/kdd/domain/events.py deleted file mode 100644 index 6832cf1..0000000 --- a/src/kdd/domain/events.py +++ /dev/null @@ -1,189 +0,0 @@ -"""Domain events for KDD. - -All events are frozen dataclasses aligned with their spec definitions -under ``specs/01-domain/events/``. - -Each event is immutable (frozen=True) and carries the payload described -in the corresponding EVT-* spec. -""" - -from __future__ import annotations - -from dataclasses import dataclass, field -from datetime import datetime -from typing import Any -from uuid import UUID - -from kdd.domain.enums import IndexLevel, KDDKind, KDDLayer, RetrievalStrategy - - -# --------------------------------------------------------------------------- -# Document lifecycle events -# --------------------------------------------------------------------------- - - -@dataclass(frozen=True) -class DocumentDetected: - """A spec file with valid front-matter was found in /specs. 
- - Spec: EVT-KDDDocument-Detected - """ - - source_path: str - source_hash: str - kind: KDDKind - layer: KDDLayer - detected_at: datetime - - -@dataclass(frozen=True) -class DocumentParsed: - """A KDDDocument was successfully parsed by its kind extractor. - - Spec: EVT-KDDDocument-Parsed - """ - - document_id: str - source_path: str - kind: KDDKind - front_matter: dict[str, Any] - section_count: int - wiki_link_count: int - parsed_at: datetime - - -@dataclass(frozen=True) -class DocumentIndexed: - """A KDDDocument completed the full indexing pipeline. - - Spec: EVT-KDDDocument-Indexed - """ - - document_id: str - source_path: str - kind: KDDKind - node_id: str - edge_count: int - embedding_count: int - index_level: IndexLevel - duration_ms: int - indexed_at: datetime - - -@dataclass(frozen=True) -class DocumentStale: - """A previously-indexed KDDDocument was modified on disk. - - Spec: EVT-KDDDocument-Stale - """ - - document_id: str - source_path: str - previous_hash: str - current_hash: str - detected_at: datetime - - -@dataclass(frozen=True) -class DocumentDeleted: - """A previously-indexed KDDDocument was removed from the filesystem. - - Spec: EVT-KDDDocument-Deleted - """ - - document_id: str - source_path: str - node_id: str - edge_count: int - embedding_count: int - deleted_at: datetime - - -# --------------------------------------------------------------------------- -# Merge events -# --------------------------------------------------------------------------- - - -@dataclass(frozen=True) -class MergeRequested: - """A merge of multiple developer indexes was requested. - - Spec: EVT-Index-MergeRequested - """ - - merge_id: UUID - source_manifests: list[str] = field(default_factory=list) - developer_ids: list[str] = field(default_factory=list) - target_version: str = "" - requested_at: datetime = field(default_factory=datetime.now) - requested_by: str = "" - - -@dataclass(frozen=True) -class MergeCompleted: - """A merge of indexes completed successfully. 
- - Spec: EVT-Index-MergeCompleted - """ - - merge_id: UUID - merged_manifest_id: str - source_count: int - total_nodes: int - total_edges: int - total_embeddings: int - conflicts_resolved: int - duration_ms: int - completed_at: datetime - - -# --------------------------------------------------------------------------- -# Query lifecycle events -# --------------------------------------------------------------------------- - - -@dataclass(frozen=True) -class QueryReceived: - """A retrieval query was received from an agent or developer. - - Spec: EVT-RetrievalQuery-Received - """ - - query_id: UUID - strategy: RetrievalStrategy - query_text: str | None = None - root_node: str | None = None - caller: str | None = None - received_at: datetime = field(default_factory=datetime.now) - - -@dataclass(frozen=True) -class QueryCompleted: - """A retrieval query was resolved successfully. - - Spec: EVT-RetrievalQuery-Completed - """ - - query_id: UUID - strategy: RetrievalStrategy - total_results: int - top_score: float - total_tokens: int - duration_ms: int - completed_at: datetime - - -@dataclass(frozen=True) -class QueryFailed: - """A retrieval query failed during validation or resolution. - - Spec: EVT-RetrievalQuery-Failed - """ - - query_id: UUID - strategy: RetrievalStrategy - error_code: str - error_message: str - phase: str # "validation" or "resolution" - duration_ms: int - failed_at: datetime diff --git a/src/kdd/domain/ports.py b/src/kdd/domain/ports.py deleted file mode 100644 index b8049c1..0000000 --- a/src/kdd/domain/ports.py +++ /dev/null @@ -1,134 +0,0 @@ -"""Port definitions (hexagonal architecture). - -Each Protocol defines a boundary that infrastructure adapters must satisfy. -The domain and application layers depend only on these Protocols, never on -concrete implementations. 
-""" - -from __future__ import annotations - -from typing import Any, Protocol, runtime_checkable - -from kdd.domain.entities import ( - Embedding, - GraphEdge, - GraphNode, - IndexManifest, -) - - -# --------------------------------------------------------------------------- -# Storage ports -# --------------------------------------------------------------------------- - - -@runtime_checkable -class ArtifactStore(Protocol): - """Read/write .kdd-index/ artifacts on disk.""" - - def write_manifest(self, manifest: IndexManifest) -> None: ... - def read_manifest(self) -> IndexManifest | None: ... - def write_node(self, node: GraphNode) -> None: ... - def read_node(self, node_id: str) -> GraphNode | None: ... - def append_edges(self, edges: list[GraphEdge]) -> None: ... - def read_edges(self) -> list[GraphEdge]: ... - def write_embeddings(self, embeddings: list[Embedding]) -> None: ... - def read_embeddings(self, document_id: str) -> list[Embedding]: ... - def read_all_nodes(self) -> list[GraphNode]: ... - def read_all_embeddings(self) -> list[Embedding]: ... - def delete_document_artifacts(self, document_id: str) -> None: ... - - -@runtime_checkable -class GraphStore(Protocol): - """In-memory graph loaded from artifacts for querying.""" - - def load(self, nodes: list[GraphNode], edges: list[GraphEdge]) -> None: ... - def get_node(self, node_id: str) -> GraphNode | None: ... - def has_node(self, node_id: str) -> bool: ... - def traverse( - self, - root: str, - depth: int, - edge_types: list[str] | None = None, - respect_layers: bool = True, - ) -> tuple[list[GraphNode], list[GraphEdge]]: ... - def text_search( - self, - query: str, - fields: list[str] | None = None, - ) -> list[GraphNode]: ... - def neighbors(self, node_id: str) -> list[GraphNode]: ... - def incoming_edges(self, node_id: str) -> list[GraphEdge]: ... - def outgoing_edges(self, node_id: str) -> list[GraphEdge]: ... 
- def reverse_traverse( - self, - root: str, - depth: int, - ) -> list[tuple[GraphNode, list[GraphEdge]]]: ... - def all_edges(self) -> list[GraphEdge]: ... - def find_violations(self) -> list[GraphEdge]: ... - - -@runtime_checkable -class VectorStore(Protocol): - """In-memory vector index loaded from artifacts for semantic search.""" - - def load(self, embeddings: list[Embedding]) -> None: ... - def search( - self, - vector: list[float], - limit: int = 10, - min_score: float = 0.0, - ) -> list[tuple[str, float]]: ... # (embedding_id, score) - - -# --------------------------------------------------------------------------- -# Embedding port -# --------------------------------------------------------------------------- - - -@runtime_checkable -class EmbeddingModel(Protocol): - """Generates embedding vectors from text.""" - - @property - def model_name(self) -> str: ... - - @property - def dimensions(self) -> int: ... - - def encode(self, texts: list[str]) -> list[list[float]]: ... - - -# --------------------------------------------------------------------------- -# Event bus port -# --------------------------------------------------------------------------- - - -@runtime_checkable -class EventBus(Protocol): - """Publish/subscribe in-memory event bus.""" - - def publish(self, event: Any) -> None: ... - def subscribe(self, event_type: type, handler: Any) -> None: ... - - -# --------------------------------------------------------------------------- -# External integration ports -# --------------------------------------------------------------------------- - - -@runtime_checkable -class AgentClient(Protocol): - """Communicates with an AI agent for L3 enrichment (CMD-003).""" - - def enrich(self, node: GraphNode, context: str) -> dict[str, Any]: ... - - -@runtime_checkable -class Transport(Protocol): - """Push/pull .kdd-index/ artifacts to a remote (CMD-005).""" - - def push(self, index_path: str, remote: str) -> None: ... 
- def pull(self, remote: str, target_path: str) -> None: ... diff --git a/src/kdd/domain/rules.py b/src/kdd/domain/rules.py deleted file mode 100644 index eeb6c83..0000000 --- a/src/kdd/domain/rules.py +++ /dev/null @@ -1,229 +0,0 @@ -"""Business rules as pure functions. - -Each function implements a spec from ``specs/01-domain/rules/`` and is -fully deterministic — no I/O, no side-effects, easy to unit-test. -""" - -from __future__ import annotations - -from dataclasses import dataclass -from typing import Any - -from kdd.domain.enums import IndexLevel, KDDKind, KDDLayer - - -# --------------------------------------------------------------------------- -# BR-DOCUMENT-001 — Kind Router -# --------------------------------------------------------------------------- - -# Front-matter ``kind`` string → KDDKind enum -KIND_LOOKUP: dict[str, KDDKind] = {k.value: k for k in KDDKind} - -# kind → expected folder prefix (for location warnings) -KIND_EXPECTED_PATH: dict[KDDKind, str] = { - KDDKind.ENTITY: "01-domain/entities/", - KDDKind.EVENT: "01-domain/events/", - KDDKind.BUSINESS_RULE: "01-domain/rules/", - KDDKind.BUSINESS_POLICY: "02-behavior/policies/", - KDDKind.CROSS_POLICY: "02-behavior/policies/", - KDDKind.COMMAND: "02-behavior/commands/", - KDDKind.QUERY: "02-behavior/queries/", - KDDKind.PROCESS: "02-behavior/processes/", - KDDKind.USE_CASE: "02-behavior/use-cases/", - KDDKind.UI_VIEW: "03-experience/views/", - KDDKind.UI_COMPONENT: "03-experience/views/", - KDDKind.REQUIREMENT: "04-verification/criteria/", - KDDKind.OBJECTIVE: "00-requirements/objectives/", - KDDKind.PRD: "00-requirements/", - KDDKind.ADR: "00-requirements/decisions/", -} - - -@dataclass -class RouteResult: - """Result of routing a document to its kind.""" - - kind: KDDKind | None - warning: str | None = None - - -def route_document( - front_matter: dict[str, Any] | None, - source_path: str, -) -> RouteResult: - """Determine the KDDKind and validate location (BR-DOCUMENT-001). 
- - Returns ``RouteResult(kind=None)`` when the document should be ignored - (no front-matter or unrecognised kind). - """ - if not front_matter: - return RouteResult(kind=None) - - kind_str = str(front_matter.get("kind", "")).lower().strip() - if not kind_str or kind_str not in KIND_LOOKUP: - return RouteResult(kind=None) - - kind = KIND_LOOKUP[kind_str] - - # Check expected location - expected = KIND_EXPECTED_PATH.get(kind, "") - warning = None - if expected and expected not in source_path: - warning = ( - f"{kind.value} '{source_path}' found outside " - f"expected path '{expected}'" - ) - - return RouteResult(kind=kind, warning=warning) - - -# --------------------------------------------------------------------------- -# BR-EMBEDDING-001 — Embedding Strategy -# --------------------------------------------------------------------------- - -# kind → set of embeddable section headings (normalised to lowercase). -# An empty set means the kind produces no embeddings at all. -EMBEDDABLE_SECTIONS: dict[KDDKind, set[str]] = { - KDDKind.ENTITY: {"descripción", "description"}, - KDDKind.EVENT: set(), # no embeddings - KDDKind.BUSINESS_RULE: {"declaración", "declaration", "cuándo aplica", "when applies"}, - KDDKind.BUSINESS_POLICY: {"declaración", "declaration"}, - KDDKind.CROSS_POLICY: {"propósito", "purpose", "declaración", "declaration"}, - KDDKind.COMMAND: {"purpose", "propósito"}, - KDDKind.QUERY: {"purpose", "propósito"}, - KDDKind.PROCESS: {"participantes", "participants", "pasos", "steps"}, - KDDKind.USE_CASE: {"descripción", "description", "flujo principal", "main flow"}, - KDDKind.UI_VIEW: {"descripción", "description", "comportamiento", "behavior"}, - KDDKind.UI_COMPONENT: {"descripción", "description"}, - KDDKind.REQUIREMENT: {"descripción", "description"}, - KDDKind.OBJECTIVE: {"objetivo", "objective"}, - KDDKind.PRD: {"problema / oportunidad", "problem / opportunity"}, - KDDKind.ADR: {"contexto", "context", "decisión", "decision"}, -} - - -def 
embeddable_sections(kind: KDDKind) -> set[str]: - """Return the set of embeddable section headings for a kind (BR-EMBEDDING-001).""" - return EMBEDDABLE_SECTIONS.get(kind, set()) - - -# --------------------------------------------------------------------------- -# BR-INDEX-001 — Index Level detection -# --------------------------------------------------------------------------- - - -def detect_index_level( - *, - embedding_model_available: bool, - agent_api_available: bool, -) -> IndexLevel: - """Determine the highest indexing level available (BR-INDEX-001).""" - if agent_api_available and embedding_model_available: - return IndexLevel.L3 - if embedding_model_available: - return IndexLevel.L2 - return IndexLevel.L1 - - -# --------------------------------------------------------------------------- -# BR-LAYER-001 — Layer Validation -# --------------------------------------------------------------------------- - -# Layer prefix → KDDLayer -LAYER_BY_PREFIX: dict[str, KDDLayer] = { - "00-requirements": KDDLayer.REQUIREMENTS, - "01-domain": KDDLayer.DOMAIN, - "02-behavior": KDDLayer.BEHAVIOR, - "03-experience": KDDLayer.EXPERIENCE, - "04-verification": KDDLayer.VERIFICATION, -} - - -def detect_layer(source_path: str) -> KDDLayer | None: - """Infer the KDD layer from a file's path prefix.""" - for prefix, layer in LAYER_BY_PREFIX.items(): - if prefix in source_path: - return layer - return None - - -def is_layer_violation( - origin_layer: KDDLayer, - destination_layer: KDDLayer, -) -> bool: - """Return True if an edge from origin to destination violates layer deps. - - Rules (BR-LAYER-001): - - ``00-requirements`` is exempt — never a violation when it's the origin. - - A violation occurs when the origin's numeric layer is > 0 and is - strictly lower than the destination's numeric layer - (i.e. a lower layer references a higher layer). 
- """ - if origin_layer == KDDLayer.REQUIREMENTS: - return False - return origin_layer.numeric < destination_layer.numeric - - -# --------------------------------------------------------------------------- -# BR-MERGE-001 — Merge Conflict Resolution -# --------------------------------------------------------------------------- - -@dataclass -class ConflictResult: - """Outcome of a node conflict resolution.""" - - winner_index: int # index into the list of candidates - reason: str - - -def resolve_node_conflict( - candidates: list[dict[str, Any]], -) -> ConflictResult: - """Resolve a node conflict using last-write-wins (BR-MERGE-001). - - Each candidate dict must contain ``indexed_at`` (ISO datetime string - or datetime object) and ``source_hash``. - - If all hashes are identical the first candidate wins (equivalent copies). - """ - if len(candidates) == 1: - return ConflictResult(winner_index=0, reason="single") - - # Identical hashes → take first - hashes = {c["source_hash"] for c in candidates} - if len(hashes) == 1: - return ConflictResult(winner_index=0, reason="identical") - - # Last-write-wins by indexed_at - best_idx = 0 - best_ts = candidates[0]["indexed_at"] - for i, c in enumerate(candidates[1:], start=1): - if c["indexed_at"] > best_ts: - best_ts = c["indexed_at"] - best_idx = i - - return ConflictResult(winner_index=best_idx, reason="last-write-wins") - - -def resolve_deletion( - present_in: list[bool], - modified_after_deletion: bool = False, -) -> tuple[bool, str | None]: - """Decide whether a node should be deleted during merge (BR-MERGE-001). - - ``present_in`` is a list of booleans, one per source index, indicating - whether the node is present in that index. - - Returns ``(should_delete, warning)``. - - Delete-wins: if the node is absent from *any* index, it's deleted. - - If it was modified in another index after the deletion, a warning is - returned. 
- """ - if all(present_in): - return False, None - - warning = None - if modified_after_deletion: - warning = "Node was modified in another index after deletion" - - return True, warning diff --git a/src/kdd/infrastructure/__init__.py b/src/kdd/infrastructure/__init__.py deleted file mode 100644 index ed7476b..0000000 --- a/src/kdd/infrastructure/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""KDD infrastructure layer — adapters for storage, parsing, events, and git.""" diff --git a/src/kdd/infrastructure/agent/__init__.py b/src/kdd/infrastructure/agent/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/kdd/infrastructure/agent/claude_cli.py b/src/kdd/infrastructure/agent/claude_cli.py deleted file mode 100644 index 75540d5..0000000 --- a/src/kdd/infrastructure/agent/claude_cli.py +++ /dev/null @@ -1,127 +0,0 @@ -"""AgentClient adapter that invokes the Claude CLI (``claude -p``). - -Requires the ``claude`` binary available in PATH (Claude Code subscription). -No API key needed — uses the developer's existing Claude CLI auth. 
-""" - -from __future__ import annotations - -import json -import logging -import os -import subprocess -from typing import Any - -from kdd.domain.entities import GraphNode - -logger = logging.getLogger(__name__) - - -class ClaudeCliAgentClient: - """Adapter: AgentClient → ``claude -p`` subprocess.""" - - def __init__( - self, - timeout: int = 120, - claude_path: str | None = None, - model: str | None = None, - ) -> None: - self.timeout = timeout - self.claude_path = claude_path or "claude" - self.model = model - - def enrich(self, node: GraphNode, context: str) -> dict[str, Any]: - """Call Claude CLI to enrich a graph node (CMD-003).""" - prompt = _build_enrichment_prompt(node, context) - - cmd = [self.claude_path, "-p", prompt, "--output-format", "json"] - if self.model: - cmd.extend(["--model", self.model]) - - # Filter CLAUDECODE* env vars so nested claude invocation works - env = {k: v for k, v in os.environ.items() if not k.startswith("CLAUDECODE")} - - try: - result = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=self.timeout, - env=env, - ) - except FileNotFoundError: - raise RuntimeError( - f"Claude CLI not found at '{self.claude_path}'. 
" - "Install it from https://docs.anthropic.com/en/docs/claude-code" - ) - except subprocess.TimeoutExpired: - raise RuntimeError( - f"Claude CLI timed out after {self.timeout}s enriching {node.id}" - ) - - if result.returncode != 0: - stderr = result.stderr.strip()[:500] if result.stderr else "(no stderr)" - raise RuntimeError( - f"Claude CLI exited with code {result.returncode}: {stderr}" - ) - - # Parse the CLI envelope: {"type":"result","subtype":"success","result":"..."} - try: - envelope = json.loads(result.stdout) - except json.JSONDecodeError as e: - raise RuntimeError(f"Claude CLI returned invalid JSON envelope: {e}") - - model_text = envelope.get("result", "") - return _parse_enrichment_response(model_text) - - -def _build_enrichment_prompt(node: GraphNode, context: str) -> str: - """Build the enrichment prompt for Claude.""" - return f"""\ -You are a KDD (Knowledge-Driven Development) analyst. Given the following \ -specification node and its context, produce a JSON object with exactly these keys: - -- "summary": A concise 2-3 sentence summary of the specification's purpose and scope. -- "implicit_relations": An array of objects, each with "target" (node ID like "Entity:Pedido") \ -and "type" (edge type like "DEPENDS_ON", "TRIGGERS", "VALIDATES"). -- "impact_analysis": An object with "change_risk" ("low"|"medium"|"high") and \ -"reason" (one sentence explaining why). - -Respond ONLY with the JSON object, no markdown fences, no explanation. - ---- -{context} -""" - - -def _parse_enrichment_response(text: str) -> dict[str, Any]: - """Parse the model's response text into a structured dict. - - Handles markdown fences defensively and fills missing keys with defaults. 
- """ - cleaned = text.strip() - - # Strip markdown fences if present - if cleaned.startswith("```"): - lines = cleaned.split("\n") - # Remove first line (```json or ```) - lines = lines[1:] - # Remove last line if it's closing fence - if lines and lines[-1].strip() == "```": - lines = lines[:-1] - cleaned = "\n".join(lines).strip() - - try: - data = json.loads(cleaned) - except json.JSONDecodeError as e: - raise RuntimeError(f"Model returned invalid JSON: {e}\nRaw text: {text[:300]}") - - if not isinstance(data, dict): - raise RuntimeError(f"Expected JSON object, got {type(data).__name__}") - - # Fill missing keys with defaults - data.setdefault("summary", "") - data.setdefault("implicit_relations", []) - data.setdefault("impact_analysis", {"change_risk": "medium", "reason": "unknown"}) - - return data diff --git a/src/kdd/infrastructure/artifact/__init__.py b/src/kdd/infrastructure/artifact/__init__.py deleted file mode 100644 index a1e0995..0000000 --- a/src/kdd/infrastructure/artifact/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Artifact store: filesystem-based .kdd-index/ read/write.""" diff --git a/src/kdd/infrastructure/artifact/filesystem.py b/src/kdd/infrastructure/artifact/filesystem.py deleted file mode 100644 index 6895f98..0000000 --- a/src/kdd/infrastructure/artifact/filesystem.py +++ /dev/null @@ -1,236 +0,0 @@ -"""Filesystem-based ArtifactStore implementation. - -Manages the ``.kdd-index/`` directory layout described in PRD Appendix A:: - - .kdd-index/ - ├── manifest.json - ├── nodes/{kind}/{id}.json - ├── edges/edges.jsonl - └── embeddings/{kind}/{doc_id}.json - -Implements the ``ArtifactStore`` port from ``kdd.domain.ports``. 
-""" - -from __future__ import annotations - -import json -from pathlib import Path - -from kdd.domain.entities import Embedding, GraphEdge, GraphNode, IndexManifest -from kdd.domain.enums import KDDKind - - -class FilesystemArtifactStore: - """Read/write ``.kdd-index/`` artifacts on the local filesystem.""" - - def __init__(self, index_path: str | Path) -> None: - self._root = Path(index_path) - - @property - def root(self) -> Path: - return self._root - - # ------------------------------------------------------------------ - # Manifest - # ------------------------------------------------------------------ - - def write_manifest(self, manifest: IndexManifest) -> None: - self._root.mkdir(parents=True, exist_ok=True) - path = self._root / "manifest.json" - path.write_text( - manifest.model_dump_json(indent=2), - encoding="utf-8", - ) - - def read_manifest(self) -> IndexManifest | None: - path = self._root / "manifest.json" - if not path.exists(): - return None - data = json.loads(path.read_text(encoding="utf-8")) - return IndexManifest.model_validate(data) - - # ------------------------------------------------------------------ - # Nodes - # ------------------------------------------------------------------ - - def _node_dir(self, kind: KDDKind) -> Path: - return self._root / "nodes" / kind.value - - def _node_path(self, node: GraphNode) -> Path: - # ID is "{Kind}:{DocId}" — use DocId as filename - doc_id = node.id.split(":", 1)[-1] if ":" in node.id else node.id - return self._node_dir(node.kind) / f"{doc_id}.json" - - def write_node(self, node: GraphNode) -> None: - path = self._node_path(node) - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(node.model_dump_json(indent=2), encoding="utf-8") - - def read_node(self, node_id: str) -> GraphNode | None: - # Search all kind subdirectories for the node file - nodes_dir = self._root / "nodes" - if not nodes_dir.exists(): - return None - for kind_dir in nodes_dir.iterdir(): - if not kind_dir.is_dir(): - 
continue - doc_id = node_id.split(":", 1)[-1] if ":" in node_id else node_id - path = kind_dir / f"{doc_id}.json" - if path.exists(): - data = json.loads(path.read_text(encoding="utf-8")) - return GraphNode.model_validate(data) - return None - - def read_all_nodes(self) -> list[GraphNode]: - """Read every node from the store.""" - nodes: list[GraphNode] = [] - nodes_dir = self._root / "nodes" - if not nodes_dir.exists(): - return nodes - for kind_dir in nodes_dir.iterdir(): - if not kind_dir.is_dir(): - continue - for path in sorted(kind_dir.glob("*.json")): - data = json.loads(path.read_text(encoding="utf-8")) - nodes.append(GraphNode.model_validate(data)) - return nodes - - # ------------------------------------------------------------------ - # Edges - # ------------------------------------------------------------------ - - def _edges_path(self) -> Path: - return self._root / "edges" / "edges.jsonl" - - def append_edges(self, edges: list[GraphEdge]) -> None: - path = self._edges_path() - path.parent.mkdir(parents=True, exist_ok=True) - with path.open("a", encoding="utf-8") as f: - for edge in edges: - f.write(edge.model_dump_json() + "\n") - - def read_edges(self) -> list[GraphEdge]: - path = self._edges_path() - if not path.exists(): - return [] - edges: list[GraphEdge] = [] - for line in path.read_text(encoding="utf-8").splitlines(): - line = line.strip() - if line: - data = json.loads(line) - edges.append(GraphEdge.model_validate(data)) - return edges - - # ------------------------------------------------------------------ - # Embeddings - # ------------------------------------------------------------------ - - def _embedding_path(self, kind: KDDKind, document_id: str) -> Path: - return self._root / "embeddings" / kind.value / f"{document_id}.json" - - def write_embeddings(self, embeddings: list[Embedding]) -> None: - if not embeddings: - return - # Group by (kind, document_id) and write one file per document - by_doc: dict[tuple[str, str], list[Embedding]] = 
{} - for emb in embeddings: - key = (emb.document_kind.value, emb.document_id) - by_doc.setdefault(key, []).append(emb) - - for (kind_val, doc_id), doc_embeddings in by_doc.items(): - kind = KDDKind(kind_val) - path = self._embedding_path(kind, doc_id) - path.parent.mkdir(parents=True, exist_ok=True) - data = [e.model_dump(mode="json") for e in doc_embeddings] - path.write_text( - json.dumps(data, indent=2, default=str), - encoding="utf-8", - ) - - def read_embeddings(self, document_id: str) -> list[Embedding]: - emb_dir = self._root / "embeddings" - if not emb_dir.exists(): - return [] - results: list[Embedding] = [] - for kind_dir in emb_dir.iterdir(): - if not kind_dir.is_dir(): - continue - path = kind_dir / f"{document_id}.json" - if path.exists(): - data = json.loads(path.read_text(encoding="utf-8")) - for item in data: - results.append(Embedding.model_validate(item)) - return results - - def read_all_embeddings(self) -> list[Embedding]: - """Read every embedding from the store.""" - emb_dir = self._root / "embeddings" - if not emb_dir.exists(): - return [] - results: list[Embedding] = [] - for kind_dir in emb_dir.iterdir(): - if not kind_dir.is_dir(): - continue - for path in sorted(kind_dir.glob("*.json")): - data = json.loads(path.read_text(encoding="utf-8")) - for item in data: - results.append(Embedding.model_validate(item)) - return results - - # ------------------------------------------------------------------ - # Cascade delete - # ------------------------------------------------------------------ - - def delete_document_artifacts(self, document_id: str) -> None: - """Remove a document's node, edges, and embeddings. - - - Deletes the node JSON file. - - Rewrites ``edges.jsonl`` excluding edges involving this document's node. - - Deletes the embedding JSON file. - """ - # 1. 
Find and delete node file - nodes_dir = self._root / "nodes" - if nodes_dir.exists(): - for kind_dir in nodes_dir.iterdir(): - if not kind_dir.is_dir(): - continue - path = kind_dir / f"{document_id}.json" - if path.exists(): - # Read node to get its ID for edge filtering - data = json.loads(path.read_text(encoding="utf-8")) - node_id = data.get("id", "") - path.unlink() - # Remove empty kind directory - if not any(kind_dir.iterdir()): - kind_dir.rmdir() - # 2. Filter edges - self._remove_edges_for_node(node_id) - break - - # 3. Delete embeddings - emb_dir = self._root / "embeddings" - if emb_dir.exists(): - for kind_dir in emb_dir.iterdir(): - if not kind_dir.is_dir(): - continue - path = kind_dir / f"{document_id}.json" - if path.exists(): - path.unlink() - if not any(kind_dir.iterdir()): - kind_dir.rmdir() - - def _remove_edges_for_node(self, node_id: str) -> None: - """Rewrite edges.jsonl excluding edges that reference *node_id*.""" - path = self._edges_path() - if not path.exists(): - return - kept: list[str] = [] - for line in path.read_text(encoding="utf-8").splitlines(): - line = line.strip() - if not line: - continue - data = json.loads(line) - if data.get("from_node") == node_id or data.get("to_node") == node_id: - continue - kept.append(line) - path.write_text("\n".join(kept) + ("\n" if kept else ""), encoding="utf-8") diff --git a/src/kdd/infrastructure/embedding/__init__.py b/src/kdd/infrastructure/embedding/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/kdd/infrastructure/embedding/sentence_transformer.py b/src/kdd/infrastructure/embedding/sentence_transformer.py deleted file mode 100644 index 5c56b57..0000000 --- a/src/kdd/infrastructure/embedding/sentence_transformer.py +++ /dev/null @@ -1,30 +0,0 @@ -"""Sentence-transformers based EmbeddingModel implementation. - -Wraps a HuggingFace sentence-transformers model for local L2 embedding -generation. Implements the ``EmbeddingModel`` port. 
-""" - -from __future__ import annotations - - -class SentenceTransformerModel: - """Local embedding model using sentence-transformers.""" - - def __init__(self, model_name: str = "all-mpnet-base-v2") -> None: - from sentence_transformers import SentenceTransformer - - self._model = SentenceTransformer(model_name) - self._model_name = model_name - self._dimensions = self._model.get_sentence_embedding_dimension() - - @property - def model_name(self) -> str: - return self._model_name - - @property - def dimensions(self) -> int: - return self._dimensions - - def encode(self, texts: list[str]) -> list[list[float]]: - embeddings = self._model.encode(texts, show_progress_bar=False) - return [e.tolist() for e in embeddings] diff --git a/src/kdd/infrastructure/events/__init__.py b/src/kdd/infrastructure/events/__init__.py deleted file mode 100644 index 23cd61e..0000000 --- a/src/kdd/infrastructure/events/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""In-memory event bus.""" diff --git a/src/kdd/infrastructure/events/bus.py b/src/kdd/infrastructure/events/bus.py deleted file mode 100644 index 3634f81..0000000 --- a/src/kdd/infrastructure/events/bus.py +++ /dev/null @@ -1,26 +0,0 @@ -"""In-memory event bus. - -Simple publish/subscribe for domain events. Handlers are called -synchronously in registration order. Implements ``EventBus`` port. 
-""" - -from __future__ import annotations - -from collections import defaultdict -from typing import Any, Callable - - -class InMemoryEventBus: - """Synchronous in-memory event bus.""" - - def __init__(self) -> None: - self._handlers: dict[type, list[Callable[..., Any]]] = defaultdict(list) - - def subscribe(self, event_type: type, handler: Callable[..., Any]) -> None: - """Register *handler* to be called when *event_type* is published.""" - self._handlers[event_type].append(handler) - - def publish(self, event: Any) -> None: - """Dispatch *event* to all registered handlers for its type.""" - for handler in self._handlers.get(type(event), []): - handler(event) diff --git a/src/kdd/infrastructure/git/__init__.py b/src/kdd/infrastructure/git/__init__.py deleted file mode 100644 index 9a686a3..0000000 --- a/src/kdd/infrastructure/git/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Git integration: diff detection for incremental indexing.""" diff --git a/src/kdd/infrastructure/git/diff.py b/src/kdd/infrastructure/git/diff.py deleted file mode 100644 index ba67e8c..0000000 --- a/src/kdd/infrastructure/git/diff.py +++ /dev/null @@ -1,146 +0,0 @@ -"""Simplified git diff adapter for incremental indexing. - -Decoupled from ``RepositoryConfig`` — operates on plain ``Path`` + pattern -lists. Ported and simplified from ``kb_engine/git/scanner.py``. 
-""" - -from __future__ import annotations - -import subprocess -from dataclasses import dataclass, field -from pathlib import Path - - -@dataclass -class DiffResult: - """Files changed between two git states.""" - - added: list[str] = field(default_factory=list) - modified: list[str] = field(default_factory=list) - deleted: list[str] = field(default_factory=list) - - -def _run_git(repo: Path, *args: str) -> str: - """Run a git command in *repo* and return stdout.""" - result = subprocess.run( - ["git", *args], - cwd=repo, - capture_output=True, - text=True, - check=True, - ) - return result.stdout.strip() - - -def is_git_repo(repo: Path) -> bool: - """Return True if *repo* is inside a git working tree.""" - try: - _run_git(repo, "rev-parse", "--git-dir") - return True - except (subprocess.CalledProcessError, FileNotFoundError, NotADirectoryError): - return False - - -def get_current_commit(repo: Path) -> str | None: - """Return the current HEAD commit hash, or None.""" - try: - return _run_git(repo, "rev-parse", "HEAD") - except (subprocess.CalledProcessError, FileNotFoundError): - return None - - -def get_diff( - repo: Path, - since_commit: str, - *, - include_patterns: list[str] | None = None, -) -> DiffResult: - """Return files added/modified/deleted since *since_commit*. - - If *include_patterns* is given, only files matching at least one glob - pattern are included. 
- """ - try: - output = _run_git( - repo, "diff", "--name-status", since_commit, "HEAD", - ) - except subprocess.CalledProcessError: - return DiffResult() - - result = DiffResult() - for line in output.splitlines(): - if not line.strip(): - continue - parts = line.split("\t", 1) - if len(parts) != 2: - continue - status, filepath = parts[0].strip(), parts[1].strip() - - if include_patterns and not _matches_any(filepath, include_patterns): - continue - - if status.startswith("A"): - result.added.append(filepath) - elif status.startswith("M") or status.startswith("R"): - result.modified.append(filepath) - elif status.startswith("D"): - result.deleted.append(filepath) - - return result - - -def scan_files( - repo: Path, - *, - include_patterns: list[str] | None = None, -) -> list[str]: - """Return all tracked files in *repo*, optionally filtered by patterns.""" - try: - output = _run_git(repo, "ls-files") - except (subprocess.CalledProcessError, FileNotFoundError): - return [] - - files = output.splitlines() if output else [] - if include_patterns: - files = [f for f in files if _matches_any(f, include_patterns)] - return sorted(files) - - -def _matches_any(filepath: str, patterns: list[str]) -> bool: - """Return True if *filepath* matches any of the glob *patterns*. - - Handles ``**`` patterns manually since ``PurePath.match`` doesn't - support recursive globs reliably across Python versions. - """ - import fnmatch - - for pattern in patterns: - # Simple pattern without ** — use fnmatch directly - if "**" not in pattern: - if fnmatch.fnmatch(filepath, pattern): - return True - continue - - # Pattern like "specs/**/*.md" or "**/*.md" - # Split on **/ and check prefix + suffix - parts = pattern.split("**/", 1) - prefix = parts[0] # e.g. "specs/" or "" - suffix = parts[1] if len(parts) > 1 else "" # e.g. 
"*.md" - - # Check prefix match - if prefix and not filepath.startswith(prefix): - continue - - # Check suffix match on the remainder - remainder = filepath[len(prefix):] - # The suffix may apply to any nested level, so check the filename - # or any subpath. For "*.md" we check if the file itself matches. - if fnmatch.fnmatch(remainder, suffix): - return True - # Also check just the filename for deeply nested paths - filename = Path(filepath).name - if fnmatch.fnmatch(filename, suffix): - if not prefix or filepath.startswith(prefix): - return True - - return False diff --git a/src/kdd/infrastructure/graph/__init__.py b/src/kdd/infrastructure/graph/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/kdd/infrastructure/graph/networkx_store.py b/src/kdd/infrastructure/graph/networkx_store.py deleted file mode 100644 index 4cc15be..0000000 --- a/src/kdd/infrastructure/graph/networkx_store.py +++ /dev/null @@ -1,261 +0,0 @@ -"""NetworkX-based GraphStore implementation. - -Loads nodes and edges from artifacts into a NetworkX DiGraph for -in-memory querying. Implements the ``GraphStore`` port. 
- -Used for: -- QRY-001 RetrieveByGraph (BFS traversal) -- QRY-003 RetrieveHybrid (graph expansion phase) -- QRY-004 RetrieveImpact (reverse edge traversal) -- QRY-005 RetrieveCoverage (neighbor analysis) -- QRY-006 RetrieveLayerViolations (violation detection) -""" - -from __future__ import annotations - -from collections import deque - -import networkx as nx - -from kdd.domain.entities import GraphEdge, GraphNode - - -class NetworkXGraphStore: - """In-memory graph backed by a NetworkX DiGraph.""" - - def __init__(self) -> None: - self._graph: nx.DiGraph = nx.DiGraph() - self._nodes: dict[str, GraphNode] = {} - - # ------------------------------------------------------------------ - # GraphStore port interface - # ------------------------------------------------------------------ - - def load(self, nodes: list[GraphNode], edges: list[GraphEdge]) -> None: - """Load nodes and edges into the graph, replacing any prior state.""" - self._graph.clear() - self._nodes.clear() - - for node in nodes: - self._graph.add_node(node.id, data=node) - self._nodes[node.id] = node - - for edge in edges: - self._graph.add_edge( - edge.from_node, - edge.to_node, - data=edge, - ) - - def traverse( - self, - root: str, - depth: int, - edge_types: list[str] | None = None, - respect_layers: bool = True, - ) -> tuple[list[GraphNode], list[GraphEdge]]: - """BFS traversal from *root* up to *depth* hops. - - Follows both outgoing and incoming edges (undirected traversal) - so that, e.g., starting from a command you can reach both the - entities it references and the use-cases that reference it. - - When *respect_layers* is True, edges marked as layer violations - are excluded from traversal. 
- """ - if root not in self._graph: - return [], [] - - visited_nodes: set[str] = {root} - collected_edges: list[GraphEdge] = [] - queue: deque[tuple[str, int]] = deque([(root, 0)]) - - while queue: - current, dist = queue.popleft() - if dist >= depth: - continue - - # Outgoing edges - for _, neighbor, edata in self._graph.out_edges(current, data=True): - edge: GraphEdge = edata["data"] - if not self._edge_matches(edge, edge_types, respect_layers): - continue - collected_edges.append(edge) - if neighbor not in visited_nodes: - visited_nodes.add(neighbor) - queue.append((neighbor, dist + 1)) - - # Incoming edges (reverse traversal) - for neighbor, _, edata in self._graph.in_edges(current, data=True): - edge = edata["data"] - if not self._edge_matches(edge, edge_types, respect_layers): - continue - collected_edges.append(edge) - if neighbor not in visited_nodes: - visited_nodes.add(neighbor) - queue.append((neighbor, dist + 1)) - - result_nodes = [ - self._nodes[nid] for nid in visited_nodes if nid in self._nodes - ] - # Deduplicate edges (same edge may be encountered from both ends) - seen_edges: set[tuple[str, str, str]] = set() - unique_edges: list[GraphEdge] = [] - for e in collected_edges: - key = (e.from_node, e.to_node, e.edge_type) - if key not in seen_edges: - seen_edges.add(key) - unique_edges.append(e) - - return result_nodes, unique_edges - - def text_search( - self, - query: str, - fields: list[str] | None = None, - ) -> list[GraphNode]: - """Case-insensitive lexical search over node indexed_fields. - - If *fields* is None, searches all indexed_fields values. 
- """ - query_lower = query.lower() - results: list[GraphNode] = [] - - for node in self._nodes.values(): - if self._node_matches_text(node, query_lower, fields): - results.append(node) - - return results - - def neighbors(self, node_id: str) -> list[GraphNode]: - """Return all directly connected nodes (successors + predecessors).""" - if node_id not in self._graph: - return [] - neighbor_ids: set[str] = set() - neighbor_ids.update(self._graph.successors(node_id)) - neighbor_ids.update(self._graph.predecessors(node_id)) - return [self._nodes[nid] for nid in neighbor_ids if nid in self._nodes] - - def all_edges(self) -> list[GraphEdge]: - """Return every edge in the graph.""" - return [data["data"] for _, _, data in self._graph.edges(data=True)] - - def find_violations(self) -> list[GraphEdge]: - """Return all edges marked as layer violations.""" - return [ - data["data"] - for _, _, data in self._graph.edges(data=True) - if data["data"].layer_violation - ] - - # ------------------------------------------------------------------ - # Additional query helpers - # ------------------------------------------------------------------ - - def get_node(self, node_id: str) -> GraphNode | None: - """Look up a single node by ID.""" - return self._nodes.get(node_id) - - def has_node(self, node_id: str) -> bool: - return node_id in self._nodes - - def node_count(self) -> int: - return len(self._nodes) - - def edge_count(self) -> int: - return self._graph.number_of_edges() - - def incoming_edges(self, node_id: str) -> list[GraphEdge]: - """Return all edges pointing *to* this node (dependents).""" - if node_id not in self._graph: - return [] - return [ - data["data"] - for _, _, data in self._graph.in_edges(node_id, data=True) - ] - - def outgoing_edges(self, node_id: str) -> list[GraphEdge]: - """Return all edges originating *from* this node.""" - if node_id not in self._graph: - return [] - return [ - data["data"] - for _, _, data in self._graph.out_edges(node_id, data=True) - ] 
- - def reverse_traverse( - self, - root: str, - depth: int, - ) -> list[tuple[GraphNode, list[GraphEdge]]]: - """BFS traversal following *incoming* edges only. - - Used by QRY-004 (impact analysis) to find nodes that depend on *root*. - Returns a list of (node, path_edges) pairs where path_edges is the - chain of edges from root to that node (in reverse dependency order). - """ - if root not in self._graph: - return [] - - results: list[tuple[GraphNode, list[GraphEdge]]] = [] - visited: set[str] = {root} - # (current_node_id, current_depth, path_edges_so_far) - queue: deque[tuple[str, int, list[GraphEdge]]] = deque([(root, 0, [])]) - - while queue: - current, dist, path = queue.popleft() - if dist >= depth: - continue - - for pred, _, edata in self._graph.in_edges(current, data=True): - if pred in visited: - continue - visited.add(pred) - edge: GraphEdge = edata["data"] - new_path = path + [edge] - if pred in self._nodes: - results.append((self._nodes[pred], new_path)) - queue.append((pred, dist + 1, new_path)) - - return results - - # ------------------------------------------------------------------ - # Internals - # ------------------------------------------------------------------ - - @staticmethod - def _edge_matches( - edge: GraphEdge, - edge_types: list[str] | None, - respect_layers: bool, - ) -> bool: - if respect_layers and edge.layer_violation: - return False - if edge_types is not None and edge.edge_type not in edge_types: - return False - return True - - @staticmethod - def _node_matches_text( - node: GraphNode, - query_lower: str, - fields: list[str] | None, - ) -> bool: - if fields is not None: - search_values = [ - str(v) - for k, v in node.indexed_fields.items() - if k in fields and v is not None - ] - else: - search_values = [ - str(v) - for v in node.indexed_fields.values() - if v is not None - ] - - # Also search node ID and aliases - search_values.append(node.id) - search_values.extend(node.aliases) - - return any(query_lower in val.lower() for 
val in search_values) diff --git a/src/kdd/infrastructure/parsing/__init__.py b/src/kdd/infrastructure/parsing/__init__.py deleted file mode 100644 index c23f143..0000000 --- a/src/kdd/infrastructure/parsing/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Parsing utilities: markdown, hashing, tokenization, wiki-links.""" diff --git a/src/kdd/infrastructure/parsing/hashing.py b/src/kdd/infrastructure/parsing/hashing.py deleted file mode 100644 index a3bf3e6..0000000 --- a/src/kdd/infrastructure/parsing/hashing.py +++ /dev/null @@ -1,13 +0,0 @@ -"""Hashing utilities. - -Ported from ``kb_engine/utils/hashing.py``. -""" - -from __future__ import annotations - -import hashlib - - -def compute_content_hash(content: str) -> str: - """Compute a SHA-256 hex digest of *content*.""" - return hashlib.sha256(content.encode("utf-8")).hexdigest() diff --git a/src/kdd/infrastructure/parsing/markdown.py b/src/kdd/infrastructure/parsing/markdown.py deleted file mode 100644 index 3ca2672..0000000 --- a/src/kdd/infrastructure/parsing/markdown.py +++ /dev/null @@ -1,117 +0,0 @@ -"""Markdown parsing utilities. - -Ported from ``kb_engine/utils/markdown.py`` with wiki-link awareness removed -(wiki-links have their own module) and Section dataclass integration. -""" - -from __future__ import annotations - -import re -import unicodedata -from typing import Any - -import frontmatter - -from kdd.domain.entities import Section - - -def extract_frontmatter(content: str) -> tuple[dict[str, Any], str]: - """Extract YAML front-matter from markdown content. - - Returns ``(metadata_dict, body_without_frontmatter)``. - If parsing fails, returns ``({}, original_content)``. - """ - try: - post = frontmatter.loads(content) - return dict(post.metadata), post.content - except Exception: - return {}, content - - -def parse_markdown_sections(content: str) -> list[Section]: - """Parse markdown body into a list of :class:`Section` objects. 
- - Each section captures its heading text, heading level (1-6), the raw - content below it, and a dot-separated hierarchical ``path`` built from - the heading ancestry (e.g. ``"descripción.atributos"``). - """ - sections: list[Section] = [] - current_headings: list[str] = [] - current_levels: list[int] = [] - current_lines: list[str] = [] - - def _flush() -> None: - text = "\n".join(current_lines).strip() - if current_headings: - path = ".".join( - heading_to_anchor(h) for h in current_headings - ) - sections.append(Section( - heading=current_headings[-1], - level=current_levels[-1] if current_levels else 1, - content=text, - path=path, - )) - - for line in content.split("\n"): - if line.startswith("#"): - _flush() - current_lines = [] - - level = len(line) - len(line.lstrip("#")) - heading_text = line.lstrip("#").strip() - - # Maintain hierarchy: pop deeper or equal headings - while current_levels and current_levels[-1] >= level: - current_levels.pop() - if current_headings: - current_headings.pop() - - current_headings.append(heading_text) - current_levels.append(level) - else: - current_lines.append(line) - - _flush() - return sections - - -def heading_to_anchor(heading: str) -> str: - """Convert a heading to a GitHub-compatible anchor slug. - - Algorithm: lowercase → strip non-alphanumeric (keep spaces/hyphens) - → spaces to hyphens → strip trailing hyphens. - """ - text = unicodedata.normalize("NFKD", heading) - text = text.lower() - text = re.sub(r"[^\w\s-]", "", text) - text = re.sub(r"[\s]+", "-", text) - text = text.strip("-") - return text - - -def extract_snippet(content: str, max_length: int = 200) -> str: - """Extract a plain-text snippet from markdown content. - - Strips formatting, truncates at sentence or word boundary. 
- """ - text = content.strip() - text = re.sub(r"^#+\s+", "", text, flags=re.MULTILINE) - text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text) - text = re.sub(r"\*([^*]+)\*", r"\1", text) - text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text) - text = re.sub(r"\s+", " ", text).strip() - - if len(text) <= max_length: - return text - - truncated = text[:max_length] - last_period = truncated.rfind(". ") - if last_period > max_length // 2: - return truncated[: last_period + 1] - - last_space = truncated.rfind(" ") - if last_space > max_length // 2: - return truncated[:last_space] + "..." - - return truncated + "..." diff --git a/src/kdd/infrastructure/parsing/tokenization.py b/src/kdd/infrastructure/parsing/tokenization.py deleted file mode 100644 index 402d17e..0000000 --- a/src/kdd/infrastructure/parsing/tokenization.py +++ /dev/null @@ -1,23 +0,0 @@ -"""Token counting and truncation. - -Ported from ``kb_engine/utils/tokenization.py``. -Uses a simple character-based estimation (no tiktoken dependency). -""" - -from __future__ import annotations - -# Average ~4 characters per token for English text. -_CHARS_PER_TOKEN = 4 - - -def count_tokens(text: str) -> int: - """Return an approximate token count for *text*.""" - return max(1, len(text) // _CHARS_PER_TOKEN) - - -def truncate_to_tokens(text: str, max_tokens: int) -> str: - """Truncate *text* so it fits within *max_tokens* (approx).""" - max_chars = max_tokens * _CHARS_PER_TOKEN - if len(text) <= max_chars: - return text - return text[:max_chars] diff --git a/src/kdd/infrastructure/parsing/wiki_links.py b/src/kdd/infrastructure/parsing/wiki_links.py deleted file mode 100644 index a6bde12..0000000 --- a/src/kdd/infrastructure/parsing/wiki_links.py +++ /dev/null @@ -1,69 +0,0 @@ -"""Wiki-link extraction from markdown content. 
- -Handles two syntaxes: -- ``[[Target]]`` — intra-domain link -- ``[[domain::Target]]`` — cross-domain link -- ``[[Target|Display]]`` — link with display alias - -Returns structured results so callers can distinguish link types. -""" - -from __future__ import annotations - -import re -from dataclasses import dataclass - -# Matches [[...]] allowing nested pipes and double-colons. -_WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]") - - -@dataclass(frozen=True) -class WikiLink: - """A parsed wiki-link.""" - - raw: str # full text inside [[ ]] - target: str # resolved target name (without domain prefix or alias) - domain: str | None # non-None for cross-domain ``[[domain::Target]]`` - alias: str | None # non-None for ``[[Target|Alias]]`` - - -def extract_wiki_links(content: str) -> list[WikiLink]: - """Extract all ``[[...]]`` wiki-links from *content*.""" - results: list[WikiLink] = [] - for match in _WIKI_LINK_RE.finditer(content): - raw = match.group(1).strip() - if not raw: - continue - - domain: str | None = None - alias: str | None = None - target = raw - - # Cross-domain: [[domain::Target]] - if "::" in target: - parts = target.split("::", 1) - domain = parts[0].strip() - target = parts[1].strip() - - # Display alias: [[Target|Alias]] - if "|" in target: - parts = target.split("|", 1) - target = parts[0].strip() - alias = parts[1].strip() - - results.append(WikiLink( - raw=raw, - target=target, - domain=domain, - alias=alias, - )) - - return results - - -def extract_wiki_link_targets(content: str) -> list[str]: - """Return just the target names from all wiki-links in *content*. - - Convenience wrapper that returns a flat list of strings. 
- """ - return [link.target for link in extract_wiki_links(content)] diff --git a/src/kdd/infrastructure/vector/__init__.py b/src/kdd/infrastructure/vector/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/kdd/infrastructure/vector/hnswlib_store.py b/src/kdd/infrastructure/vector/hnswlib_store.py deleted file mode 100644 index 870c2c9..0000000 --- a/src/kdd/infrastructure/vector/hnswlib_store.py +++ /dev/null @@ -1,103 +0,0 @@ -"""HNSWLib-based VectorStore implementation. - -Loads embedding vectors into an hnswlib HNSW index for fast approximate -nearest-neighbor search. Implements the ``VectorStore`` port. - -Used for: -- QRY-002 RetrieveSemantic -- QRY-003 RetrieveHybrid (semantic phase) -""" - -from __future__ import annotations - -import numpy as np - -from kdd.domain.entities import Embedding - - -class HNSWLibVectorStore: - """In-memory vector index backed by hnswlib.""" - - def __init__(self) -> None: - self._index = None # hnswlib.Index, created on load() - self._id_map: list[str] = [] # positional index → embedding ID - self._dimensions: int = 0 - - # ------------------------------------------------------------------ - # VectorStore port interface - # ------------------------------------------------------------------ - - def load(self, embeddings: list[Embedding]) -> None: - """Build an HNSW index from a list of Embedding entities.""" - import hnswlib - - if not embeddings: - self._index = None - self._id_map = [] - return - - self._dimensions = embeddings[0].dimensions - n = len(embeddings) - - # Build index - index = hnswlib.Index(space="cosine", dim=self._dimensions) - # ef_construction and M tuned for small-medium indices (< 10k vectors) - index.init_index(max_elements=max(n, 16), ef_construction=200, M=16) - - vectors = np.array( - [emb.vector for emb in embeddings], dtype=np.float32 - ) - ids = np.arange(n, dtype=np.int64) - index.add_items(vectors, ids) - index.set_ef(50) # query-time ef - - self._index = index - 
self._id_map = [emb.id for emb in embeddings] - - def search( - self, - vector: list[float], - limit: int = 10, - min_score: float = 0.0, - ) -> list[tuple[str, float]]: - """Find nearest embeddings. - - Returns list of ``(embedding_id, cosine_similarity_score)`` sorted - by score descending. - - hnswlib returns *distances* in cosine space where - ``distance = 1 - similarity``, so we convert to similarity. - """ - if self._index is None or not self._id_map: - return [] - - k = min(limit, len(self._id_map)) - query = np.array([vector], dtype=np.float32) - - labels, distances = self._index.knn_query(query, k=k) - - results: list[tuple[str, float]] = [] - for label, dist in zip(labels[0], distances[0]): - score = 1.0 - float(dist) - if score < min_score: - continue - emb_id = self._id_map[int(label)] - results.append((emb_id, score)) - - # Sort by score descending (hnswlib returns sorted by distance asc - # which is score desc, but filter may have changed order) - results.sort(key=lambda x: x[1], reverse=True) - return results - - # ------------------------------------------------------------------ - # Helpers - # ------------------------------------------------------------------ - - @property - def size(self) -> int: - """Number of vectors in the index.""" - return len(self._id_map) - - @property - def dimensions(self) -> int: - return self._dimensions diff --git a/src/mcp.ts b/src/mcp.ts new file mode 100644 index 0000000..7f57f1b --- /dev/null +++ b/src/mcp.ts @@ -0,0 +1,271 @@ +/** + * MCP server — exposes KDD search tools. 
+ * + * Tools: kdd_search, kdd_find_spec, kdd_related, kdd_impact, + * kdd_read_section, kdd_list, kdd_stats + */ + +import { Server } from "@modelcontextprotocol/sdk/server/index.js"; +import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"; +import { + CallToolRequestSchema, + ListToolsRequestSchema, +} from "@modelcontextprotocol/sdk/types.js"; +import { resolve } from "node:path"; +import { createContainer, type Container } from "./container.ts"; +import { hybridSearch } from "./application/queries/hybrid-search.ts"; +import { graphQuery } from "./application/queries/graph-query.ts"; +import { impactQuery } from "./application/queries/impact-query.ts"; +import type { KDDKind } from "./domain/types.ts"; + +const INDEX_PATH = resolve(process.env.KDD_INDEX_PATH ?? ".kdd-index"); +const SPECS_PATH = resolve(process.env.KDD_SPECS_PATH ?? "specs"); + +let container: Container; + +async function getContainer(): Promise { + if (!container) { + container = await createContainer(INDEX_PATH); + } + return container; +} + +const server = new Server( + { name: "kdd", version: "1.0.0" }, + { capabilities: { tools: {} } }, +); + +server.setRequestHandler(ListToolsRequestSchema, async () => ({ + tools: [ + { + name: "kdd_search", + description: "Search KDD specifications using hybrid retrieval (semantic + graph + lexical). Returns scored results with snippets.", + inputSchema: { + type: "object" as const, + properties: { + query: { type: "string", description: "Search query text (min 3 chars)" }, + kind: { type: "string", description: "Filter by kind (comma-separated: entity, command, use-case, etc.)" }, + limit: { type: "number", description: "Max results (default: 10)" }, + min_score: { type: "number", description: "Minimum score threshold (default: 0.3)" }, + }, + required: ["query"], + }, + }, + { + name: "kdd_find_spec", + description: "Quick lookup of a specific KDD spec by name or ID. 
Convenience wrapper for search with limit=5.", + inputSchema: { + type: "object" as const, + properties: { + name: { type: "string", description: "Spec name or ID to find" }, + }, + required: ["name"], + }, + }, + { + name: "kdd_related", + description: "Find related specs via knowledge graph traversal (BFS from root node).", + inputSchema: { + type: "object" as const, + properties: { + node_id: { type: "string", description: "Root node ID (e.g. Entity:KDDDocument)" }, + depth: { type: "number", description: "Traversal depth (default: 2)" }, + kind: { type: "string", description: "Filter by kind (comma-separated)" }, + }, + required: ["node_id"], + }, + }, + { + name: "kdd_impact", + description: "Impact analysis: what breaks if this spec changes? Uses reverse BFS to find dependents.", + inputSchema: { + type: "object" as const, + properties: { + node_id: { type: "string", description: "Node ID to analyze" }, + depth: { type: "number", description: "Analysis depth (default: 3)" }, + }, + required: ["node_id"], + }, + }, + { + name: "kdd_read_section", + description: "Read the raw markdown content of a spec file, optionally a specific section by anchor.", + inputSchema: { + type: "object" as const, + properties: { + file: { type: "string", description: "Relative path within specs/ (e.g. 01-domain/entities/KDDDocument.md)" }, + anchor: { type: "string", description: "Section anchor to jump to (e.g. 
#descripción)" }, + }, + required: ["file"], + }, + }, + { + name: "kdd_list", + description: "List all indexed KDD nodes, optionally filtered by kind or domain.", + inputSchema: { + type: "object" as const, + properties: { + kind: { type: "string", description: "Filter by kind (comma-separated)" }, + domain: { type: "string", description: "Filter by domain" }, + }, + }, + }, + { + name: "kdd_stats", + description: "Get index statistics: node count, edge count, embedding count, etc.", + inputSchema: { type: "object" as const, properties: {} }, + }, + ], +})); + +server.setRequestHandler(CallToolRequestSchema, async (request) => { + const { name, arguments: args } = request.params; + const c = await getContainer(); + + try { + switch (name) { + case "kdd_search": { + const query = String(args?.query ?? ""); + const includeKinds = args?.kind + ? String(args.kind).split(",") as KDDKind[] + : undefined; + const result = await hybridSearch( + { + queryText: query, + limit: Number(args?.limit ?? 10), + minScore: Number(args?.min_score ?? 0.3), + includeKinds, + }, + c.graphStore, + c.vectorStore, + c.encodeFn, + ); + return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] }; + } + + case "kdd_find_spec": { + const query = String(args?.name ?? ""); + const result = await hybridSearch( + { queryText: query, limit: 5, minScore: 0.1 }, + c.graphStore, + c.vectorStore, + c.encodeFn, + ); + return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] }; + } + + case "kdd_related": { + const nodeId = String(args?.node_id ?? ""); + const includeKinds = args?.kind + ? String(args.kind).split(",") as KDDKind[] + : undefined; + const result = graphQuery( + { + rootNode: nodeId, + depth: Number(args?.depth ?? 2), + includeKinds, + }, + c.graphStore, + ); + return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] }; + } + + case "kdd_impact": { + const nodeId = String(args?.node_id ?? 
""); + const result = impactQuery( + { nodeId, depth: Number(args?.depth ?? 3) }, + c.graphStore, + ); + return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] }; + } + + case "kdd_read_section": { + const filePath = resolve(SPECS_PATH, String(args?.file ?? "")); + const file = Bun.file(filePath); + if (!(await file.exists())) { + return { content: [{ type: "text", text: `File not found: ${args?.file}` }], isError: true }; + } + let text = await file.text(); + + // If anchor specified, extract that section + const anchor = args?.anchor ? String(args.anchor).replace(/^#/, "") : null; + if (anchor) { + const lines = text.split("\n"); + let start = -1; + let end = lines.length; + let headingLevel = 0; + + for (let i = 0; i < lines.length; i++) { + const line = lines[i]!; + if (line.startsWith("#")) { + const slug = line + .replace(/^#+\s*/, "") + .normalize("NFKD") + .toLowerCase() + .replace(/[^\w\s-]/g, "") + .replace(/\s+/g, "-") + .replace(/^-+|-+$/g, ""); + if (slug === anchor && start === -1) { + start = i; + headingLevel = line.length - line.replace(/^#+/, "").length; + } else if (start !== -1) { + const level = line.length - line.replace(/^#+/, "").length; + if (level <= headingLevel) { + end = i; + break; + } + } + } + } + + if (start >= 0) { + text = lines.slice(start, end).join("\n"); + } + } + + return { content: [{ type: "text", text }] }; + } + + case "kdd_list": { + const allNodes = c.graphStore.allNodes(); + let filtered = allNodes; + + if (args?.kind) { + const kinds = new Set(String(args.kind).split(",")); + filtered = filtered.filter((n) => kinds.has(n.kind)); + } + if (args?.domain) { + filtered = filtered.filter((n) => n.domain === String(args!.domain)); + } + + const items = filtered.map((n) => ({ + id: n.id, + kind: n.kind, + layer: n.layer, + source_file: n.source_file, + title: n.indexed_fields.title ?? 
n.id, + })); + return { content: [{ type: "text", text: JSON.stringify(items, null, 2) }] }; + } + + case "kdd_stats": { + const stats = { + manifest: c.manifest, + nodes: c.graphStore.nodeCount(), + edges: c.graphStore.edgeCount(), + embeddings: c.vectorStore?.size ?? 0, + }; + return { content: [{ type: "text", text: JSON.stringify(stats, null, 2) }] }; + } + + default: + return { content: [{ type: "text", text: `Unknown tool: ${name}` }], isError: true }; + } + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + return { content: [{ type: "text", text: `Error: ${message}` }], isError: true }; + } +}); + +const transport = new StdioServerTransport(); +await server.connect(transport); diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index 2d21fe3..0000000 --- a/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for KB-Engine.""" diff --git a/tests/api/__init__.py b/tests/api/__init__.py deleted file mode 100644 index c775f2c..0000000 --- a/tests/api/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""API tests for KB-Engine.""" diff --git a/tests/api/test_health.py b/tests/api/test_health.py deleted file mode 100644 index 22514a0..0000000 --- a/tests/api/test_health.py +++ /dev/null @@ -1,78 +0,0 @@ -"""Tests for health check endpoints.""" - -import pytest -from fastapi.testclient import TestClient - -from kb_engine.api.main import app -from kb_engine.api.routers import health as health_router -from kb_engine.config.settings import Settings - - -@pytest.fixture -def client() -> TestClient: - """Create a test client.""" - return TestClient(app) - - -@pytest.mark.api -class TestHealthEndpoints: - """Tests for health endpoints.""" - - def test_health_check(self, client: TestClient) -> None: - """Test basic health check endpoint.""" - response = client.get("/health") - - assert response.status_code == 200 - assert response.json() == {"status": "ok"} - - def test_readiness_check(self, client: TestClient, 
monkeypatch: pytest.MonkeyPatch) -> None: - """Test readiness check endpoint.""" - class DummyTraceability: - async def list_documents(self, limit: int = 1): - return [] - - class DummyVector: - async def get_collection_info(self): - return {"count": 0} - - class DummyFactory: - def __init__(self, settings): - self._settings = settings - - async def get_traceability_repository(self): - return DummyTraceability() - - async def get_vector_repository(self): - return DummyVector() - - async def get_graph_repository(self): - return None - - async def close(self): - return None - - def fake_settings() -> Settings: - return Settings( - _env_file=None, - profile="local", - traceability_store="sqlite", - vector_store="chroma", - graph_store="none", - ) - - monkeypatch.setattr(health_router, "RepositoryFactory", DummyFactory) - monkeypatch.setattr(health_router, "get_settings", fake_settings) - - response = client.get("/health/ready") - - assert response.status_code == 200 - data = response.json() - assert data["status"] == "ok" - assert data["checks"] == {"traceability": "ok", "vector": "ok", "graph": "skipped"} - - def test_liveness_check(self, client: TestClient) -> None: - """Test liveness check endpoint.""" - response = client.get("/health/live") - - assert response.status_code == 200 - assert response.json() == {"status": "ok"} diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index 2885cb0..0000000 --- a/tests/conftest.py +++ /dev/null @@ -1,86 +0,0 @@ -"""Pytest configuration and fixtures.""" - -import pytest -from uuid import uuid4 - -from kb_engine.core.models.document import Chunk, ChunkType, Document, DocumentStatus -from kb_engine.core.models.graph import Edge, EdgeType, Node, NodeType - - -@pytest.fixture -def sample_document() -> Document: - """Create a sample document for testing.""" - return Document( - id=uuid4(), - title="Test Document", - content="""# Test Document - -## Entity: User - -A user in the system. 
- -### Attributes - -- **id**: Unique identifier -- **name**: User's full name -- **email**: Email address - -## Use Case: Login - -### Actors -- User - -### Main Flow -1. User enters credentials -2. System validates -3. User is logged in -""", - source_path="/test/document.md", - domain="test", - tags=["test", "sample"], - status=DocumentStatus.PENDING, - repo_name="test-repo", - relative_path="test/document.md", - ) - - -@pytest.fixture -def sample_chunk(sample_document: Document) -> Chunk: - """Create a sample chunk for testing.""" - return Chunk( - id=uuid4(), - document_id=sample_document.id, - content="A user in the system with attributes like id, name, and email.", - chunk_type=ChunkType.ENTITY, - sequence=0, - heading_path=["Test Document", "Entity: User"], - section_anchor="entity-user", - ) - - -@pytest.fixture -def sample_node() -> Node: - """Create a sample node for testing.""" - return Node( - id=uuid4(), - name="User", - node_type=NodeType.ENTITY, - description="A user in the system", - properties={"domain": "test"}, - ) - - -@pytest.fixture -def sample_edge(sample_node: Node) -> Edge: - """Create a sample edge for testing.""" - target_node = Node( - id=uuid4(), - name="Login", - node_type=NodeType.USE_CASE, - ) - return Edge( - id=uuid4(), - source_id=sample_node.id, - target_id=target_node.id, - edge_type=EdgeType.PERFORMS, - ) diff --git a/tests/factories.py b/tests/factories.py deleted file mode 100644 index b0e2bcd..0000000 --- a/tests/factories.py +++ /dev/null @@ -1,76 +0,0 @@ -"""Test factories using factory_boy.""" - -from uuid import uuid4 - -import factory - -from kb_engine.core.models.document import Chunk, ChunkType, Document, DocumentStatus -from kb_engine.core.models.embedding import Embedding -from kb_engine.core.models.graph import Edge, EdgeType, Node, NodeType - - -class DocumentFactory(factory.Factory): - """Factory for creating Document instances.""" - - class Meta: - model = Document - - id = factory.LazyFunction(uuid4) - title = 
factory.Sequence(lambda n: f"Document {n}") - content = factory.Faker("paragraph", nb_sentences=5) - source_path = factory.Sequence(lambda n: f"/docs/document_{n}.md") - domain = factory.Faker("word") - tags = factory.LazyFunction(lambda: ["test"]) - status = DocumentStatus.PENDING - - -class ChunkFactory(factory.Factory): - """Factory for creating Chunk instances.""" - - class Meta: - model = Chunk - - id = factory.LazyFunction(uuid4) - document_id = factory.LazyFunction(uuid4) - content = factory.Faker("paragraph", nb_sentences=2) - chunk_type = ChunkType.DEFAULT - sequence = factory.Sequence(lambda n: n) - heading_path = factory.LazyFunction(lambda: ["Section"]) - - -class EmbeddingFactory(factory.Factory): - """Factory for creating Embedding instances.""" - - class Meta: - model = Embedding - - id = factory.LazyFunction(uuid4) - chunk_id = factory.LazyFunction(uuid4) - document_id = factory.LazyFunction(uuid4) - vector = factory.LazyFunction(lambda: [0.1] * 768) - model = "all-mpnet-base-v2" - dimensions = 768 - - -class NodeFactory(factory.Factory): - """Factory for creating Node instances.""" - - class Meta: - model = Node - - id = factory.LazyFunction(uuid4) - name = factory.Faker("word") - node_type = NodeType.ENTITY - description = factory.Faker("sentence") - - -class EdgeFactory(factory.Factory): - """Factory for creating Edge instances.""" - - class Meta: - model = Edge - - id = factory.LazyFunction(uuid4) - source_id = factory.LazyFunction(uuid4) - target_id = factory.LazyFunction(uuid4) - edge_type = EdgeType.RELATED_TO diff --git a/tests/fixtures/.gitkeep b/tests/fixtures/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/tests/fixtures/entities/Usuario.md b/tests/fixtures/entities/Usuario.md deleted file mode 100644 index 5f756e4..0000000 --- a/tests/fixtures/entities/Usuario.md +++ /dev/null @@ -1,90 +0,0 @@ ---- -kind: entity -aliases: - - User - - Cuenta de Usuario -code: - class: User - table: users -tags: - - entity - - core 
---- - -# Usuario - -## Descripción - -Representa a una persona que interactúa con el sistema. El Usuario es la entidad central del dominio de identidad y autenticación. Gestiona la información personal, credenciales de acceso y permisos dentro de la plataforma. - -Características principales: -- Puede autenticarse mediante email/contraseña o proveedores OAuth -- Tiene un perfil editable con información personal -- Puede pertenecer a múltiples organizaciones con diferentes roles - -## Atributos - -| Atributo | Code | Tipo | Descripción | -|----------|------|------|-------------| -| `id` | `id` | uuid | Identificador único del usuario | -| `email` | `email` | string | Correo electrónico único, usado para login | -| `nombre` | `firstName` | string | Nombre de pila del usuario | -| `apellido` | `lastName` | string | Apellido del usuario | -| `estado` | `status` | enum | Estado del ciclo de vida | -| `avatar_url` | `avatarUrl` | string? | URL de la imagen de perfil | -| `organizacion_id` | `organizationId` | [[Organizacion]] | Organización principal del usuario | -| `created_at` | `createdAt` | timestamp | Fecha de registro | -| `last_login_at` | `lastLoginAt` | timestamp? 
| Último acceso al sistema | - -## Relaciones - -| Relación | Code | Cardinalidad | Entidad | Descripción | -|----------|------|--------------|---------|-------------| -| `pertenece a` | `organization` | N:1 | [[Organizacion]] | Organización principal | -| `tiene` | `roles` | 1:N | [[RolUsuario]] | Roles asignados en diferentes contextos | -| `creó` | `documents` | 1:N | [[Documento]] | Documentos creados por el usuario | -| `participa en` | `projects` | N:M | [[Proyecto]] | Proyectos donde colabora | - -## Ciclo de Vida - -```mermaid -stateDiagram-v2 - [*] --> Pendiente: registrar - - Pendiente --> Activo: verificar_email - Pendiente --> Pendiente: reenviar_verificacion - - Activo --> Activo: actualizar_perfil - Activo --> Suspendido: suspender [violacion_terminos] - Activo --> Inactivo: desactivar - - Suspendido --> Activo: reactivar [revision_aprobada] - Suspendido --> Eliminado: eliminar - - Inactivo --> Activo: reactivar - Inactivo --> Eliminado: eliminar [solicitud_gdpr] - - Eliminado --> [*] -``` - -### Estados - -| Estado | Descripción | Condiciones de entrada | -|--------|-------------|------------------------| -| **Pendiente** | Usuario registrado pero sin verificar email | Registro completado | -| **Activo** | Usuario con acceso completo al sistema | Email verificado o reactivación | -| **Suspendido** | Acceso temporalmente revocado | Violación de términos de servicio | -| **Inactivo** | Usuario que desactivó voluntariamente su cuenta | Solicitud del usuario | -| **Eliminado** | Datos anonimizados, cuenta no recuperable | Solicitud GDPR o eliminación admin | - -## Invariantes - -- El email debe ser único en todo el sistema -- Un usuario activo debe tener al menos un rol asignado -- El estado no puede cambiar de Eliminado a ningún otro estado -- last_login_at solo se actualiza cuando el usuario está Activo - -## Eventos - -- **Emite**: [[EVT-Usuario-Registrado]], [[EVT-Usuario-Verificado]], [[EVT-Usuario-Actualizado]], [[EVT-Usuario-Suspendido]], 
[[EVT-Usuario-Eliminado]] -- **Consume**: [[EVT-Organizacion-Eliminada]] (desactiva usuarios asociados) diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py deleted file mode 100644 index 6c442ce..0000000 --- a/tests/integration/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Integration tests for KB-Engine.""" diff --git a/tests/integration/pipelines/__init__.py b/tests/integration/pipelines/__init__.py deleted file mode 100644 index 75ce0a5..0000000 --- a/tests/integration/pipelines/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Integration tests for pipelines.""" diff --git a/tests/integration/repositories/__init__.py b/tests/integration/repositories/__init__.py deleted file mode 100644 index 2b7685b..0000000 --- a/tests/integration/repositories/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Integration tests for repositories.""" diff --git a/tests/integration/test_smart_pipeline.py b/tests/integration/test_smart_pipeline.py deleted file mode 100644 index 573cde2..0000000 --- a/tests/integration/test_smart_pipeline.py +++ /dev/null @@ -1,550 +0,0 @@ -"""Integration tests for the smart pipeline with FalkorDB.""" - -import asyncio -from pathlib import Path - -import pytest - -from kb_engine.smart import ( - DocumentKindDetector, - EntityIngestionPipeline, - EntityParser, - FalkorDBGraphStore, - KDDDocumentKind, -) - -# Test fixtures -FIXTURE_PATH = Path(__file__).parent.parent / "fixtures" / "entities" / "Usuario.md" -TEST_GRAPH_PATH = Path("/tmp/kb-engine-test-graph.db") -TEST_PROVENANCE_GRAPH_PATH = Path("/tmp/kb-engine-test-provenance-graph.db") - - -def _cleanup_db(path: Path) -> None: - """Remove a FalkorDB database file and its settings file.""" - if path.exists(): - path.unlink() - settings = Path(str(path) + ".settings") - if settings.exists(): - settings.unlink() - - -@pytest.fixture(autouse=True) -def cleanup_graph(): - """Clean up test graph before and after tests.""" - _cleanup_db(TEST_GRAPH_PATH) - _cleanup_db(TEST_PROVENANCE_GRAPH_PATH) - 
yield - _cleanup_db(TEST_GRAPH_PATH) - _cleanup_db(TEST_PROVENANCE_GRAPH_PATH) - - -class TestDocumentKindDetector: - """Tests for document kind detection.""" - - def test_detect_entity_from_frontmatter(self): - """Should detect entity kind from frontmatter.""" - content = FIXTURE_PATH.read_text() - detector = DocumentKindDetector() - - result = detector.detect(content, "Usuario.md") - - assert result.kind == KDDDocumentKind.ENTITY - assert result.confidence == 1.0 - assert result.detected_from == "frontmatter" - - def test_detect_unknown_without_frontmatter(self): - """Should return unknown when no frontmatter kind is present.""" - content = "# Some Entity\n\nDescription here." - detector = DocumentKindDetector() - - result = detector.detect(content, "Product.md") - - assert result.kind == KDDDocumentKind.UNKNOWN - - -class TestEntityParser: - """Tests for entity document parsing.""" - - def test_parse_entity_document(self): - """Should parse entity document structure.""" - content = FIXTURE_PATH.read_text() - parser = EntityParser() - - parsed = parser.parse(content, "Usuario.md") - - assert parsed.kind == KDDDocumentKind.ENTITY - assert parsed.title == "Usuario" - assert parsed.entity_name == "Usuario" - assert "User" in parsed.aliases - assert parsed.code_class == "User" - assert parsed.code_table == "users" - - def test_extract_entity_info(self): - """Should extract entity info from parsed document.""" - content = FIXTURE_PATH.read_text() - parser = EntityParser() - - parsed = parser.parse(content, "Usuario.md") - entity_info = parser.extract_entity_info(parsed) - - # Check attributes - assert len(entity_info.attributes) >= 8 - attr_names = [a.name for a in entity_info.attributes] - assert "id" in attr_names - assert "email" in attr_names - - # Check relations - assert len(entity_info.relations) >= 4 - - # Check states - assert len(entity_info.states) >= 5 - - # Check events - assert len(entity_info.events_emitted) >= 5 - - -class TestFalkorDBGraphStore: - 
"""Tests for FalkorDB graph store.""" - - def test_initialize_and_upsert(self): - """Should initialize store and upsert entities.""" - store = FalkorDBGraphStore(TEST_GRAPH_PATH) - store.initialize() - - # Upsert entity - store.upsert_entity( - entity_id="entity:Test", - name="Test", - description="Test entity", - code_class="Test", - ) - - # Query - results = store.execute_cypher( - "MATCH (e:Entity {name: 'Test'}) RETURN e.name as name" - ) - - assert len(results) == 1 - assert results[0]["name"] == "Test" - - store.close() - - def test_relationships(self): - """Should create and query relationships.""" - store = FalkorDBGraphStore(TEST_GRAPH_PATH) - store.initialize() - - # Create entities - store.upsert_entity("entity:A", "EntityA", "First entity") - store.upsert_concept("concept:A.attr", "attr", "attribute", "An attribute", "EntityA") - - # Create relationship - store.add_contains("entity:A", "concept:A.attr") - - # Query relationship - results = store.execute_cypher(""" - MATCH (e:Entity)-[:CONTAINS]->(c:Concept) - RETURN e.name as entity, c.name as concept - """) - - assert len(results) == 1 - assert results[0]["entity"] == "EntityA" - assert results[0]["concept"] == "attr" - - store.close() - - -class TestEntityIngestionPipeline: - """Integration tests for the full pipeline.""" - - @pytest.mark.asyncio - async def test_ingest_entity_document_skip_graph(self): - """Should ingest entity document without storing to graph.""" - content = FIXTURE_PATH.read_text() - - pipeline = EntityIngestionPipeline( - graph_path=TEST_GRAPH_PATH, - use_mock_summarizer=True, - ) - - result = await pipeline.ingest(content, filename="Usuario.md", skip_graph=True) - - assert result.success - assert result.document_kind == KDDDocumentKind.ENTITY - assert result.document_id == "Usuario" - assert result.chunks_created > 0 - assert result.entities_extracted > 0 - assert result.relations_created > 0 - assert len(result.validation_errors) == 0 - - @pytest.mark.asyncio - async def 
test_ingest_entity_document_with_graph(self): - """Should ingest entity document and store to FalkorDB graph.""" - content = FIXTURE_PATH.read_text() - - pipeline = EntityIngestionPipeline( - graph_path=TEST_GRAPH_PATH, - use_mock_summarizer=True, - ) - - result = await pipeline.ingest(content, filename="Usuario.md", skip_graph=False) - - assert result.success - assert result.entities_extracted > 0 - assert result.relations_created > 0 - - # Verify data in graph - stats = pipeline.get_graph_stats() - assert stats["entity_count"] > 0 - - # Query the graph - entities = pipeline.query_graph( - "MATCH (e:Entity {name: 'Usuario'}) RETURN e.name as name" - ) - assert len(entities) == 1 - assert entities[0]["name"] == "Usuario" - - # Query relationships (CONTAINS) - relations = pipeline.query_graph(""" - MATCH (e:Entity {name: 'Usuario'})-[:CONTAINS]->(c:Concept) - RETURN c.name as concept, c.concept_type as ctype - """) - assert len(relations) > 0 - - pipeline.close() - - @pytest.mark.asyncio - async def test_reject_non_entity_document(self): - """Should reject non-entity documents.""" - content = """--- -kind: use-case ---- - -# Login de Usuario - -## Resumen - -El usuario inicia sesión. 
-""" - pipeline = EntityIngestionPipeline( - graph_path=TEST_GRAPH_PATH, - use_mock_summarizer=True, - ) - - result = await pipeline.ingest(content, filename="UC-Login.md", skip_graph=True) - - assert not result.success - assert "Expected entity document" in result.validation_errors[0] - - -class TestGraphProvenance: - """Tests for graph-native provenance with Document nodes and EXTRACTED_FROM edges.""" - - def test_provenance_extracted_from_edges(self): - """Indexing a document should create EXTRACTED_FROM edges to a Document node.""" - store = FalkorDBGraphStore(TEST_PROVENANCE_GRAPH_PATH) - store.initialize(reset=True) - - # Create a document and entity with provenance - store.upsert_document("doc-1", "User", "entities/User.md", "entity") - store.upsert_entity("entity:User", "User", "Domain user", confidence=1.0) - store.add_extracted_from("entity:User", "Entity", "doc-1", "primary", 1.0) - - # Verify Document node - docs = store.execute_cypher("MATCH (d:Document) RETURN d.id as id, d.title as title") - assert len(docs) == 1 - assert docs[0]["id"] == "doc-1" - - # Verify EXTRACTED_FROM edge - edges = store.execute_cypher(""" - MATCH (n:Entity)-[r:EXTRACTED_FROM]->(d:Document) - RETURN n.name as name, r.role as role, d.id as doc_id - """) - assert len(edges) == 1 - assert edges[0]["name"] == "User" - assert edges[0]["role"] == "primary" - assert edges[0]["doc_id"] == "doc-1" - - store.close() - - def test_multi_document_provenance(self): - """Two documents sharing an entity should both have EXTRACTED_FROM edges.""" - store = FalkorDBGraphStore(TEST_PROVENANCE_GRAPH_PATH) - store.initialize(reset=True) - - # Doc A defines Entity X, references Entity Y - store.upsert_document("doc-A", "Entity X", "x.md", "entity") - store.upsert_entity("entity:X", "X", "Main entity X", confidence=1.0) - store.add_extracted_from("entity:X", "Entity", "doc-A", "primary", 1.0) - store.upsert_entity("entity:Y", "Y", "Referenced by X", confidence=0.7) - 
store.add_extracted_from("entity:Y", "Entity", "doc-A", "referenced", 0.7) - - # Doc B defines Entity Y, references Entity X - store.upsert_document("doc-B", "Entity Y", "y.md", "entity") - store.upsert_entity("entity:Y", "Y", "Main entity Y", confidence=1.0) - store.add_extracted_from("entity:Y", "Entity", "doc-B", "primary", 1.0) - store.upsert_entity("entity:X", "X", "Referenced by Y", confidence=0.7) - store.add_extracted_from("entity:X", "Entity", "doc-B", "referenced", 0.7) - - # Entity X should have EXTRACTED_FROM edges to both documents - x_provenance = store.get_node_provenance("entity:X") - assert len(x_provenance) == 2 - x_doc_ids = {p["doc_id"] for p in x_provenance} - assert x_doc_ids == {"doc-A", "doc-B"} - - # Entity Y should have EXTRACTED_FROM edges to both documents - y_provenance = store.get_node_provenance("entity:Y") - assert len(y_provenance) == 2 - y_doc_ids = {p["doc_id"] for p in y_provenance} - assert y_doc_ids == {"doc-A", "doc-B"} - - store.close() - - def test_delete_preserves_shared_entities(self): - """Deleting one document should preserve entities shared with another.""" - store = FalkorDBGraphStore(TEST_PROVENANCE_GRAPH_PATH) - store.initialize(reset=True) - - # Two docs both contribute to entity:Shared - store.upsert_document("doc-1", "Doc 1", "d1.md", "entity") - store.upsert_document("doc-2", "Doc 2", "d2.md", "entity") - - store.upsert_entity("entity:Shared", "Shared", "Shared entity", confidence=1.0) - store.add_extracted_from("entity:Shared", "Entity", "doc-1", "primary", 1.0) - store.add_extracted_from("entity:Shared", "Entity", "doc-2", "primary", 1.0) - - store.upsert_entity("entity:OnlyDoc1", "OnlyDoc1", "Only in doc 1", confidence=1.0) - store.add_extracted_from("entity:OnlyDoc1", "Entity", "doc-1", "primary", 1.0) - - # Add domain relationship with source_doc_id for doc-1 - store.upsert_concept("concept:attr1", "attr1", "attribute", "An attr", "Shared") - store.add_extracted_from("concept:attr1", "Concept", "doc-1", 
"primary", 0.95) - store.add_contains("entity:Shared", "concept:attr1", source_doc_id="doc-1") - - # Delete doc-1 - store.delete_by_source_doc("doc-1") - - # entity:Shared should survive (still has EXTRACTED_FROM to doc-2) - shared = store.get_entity("entity:Shared") - assert shared is not None - - # entity:OnlyDoc1 should be deleted (orphan) - only1 = store.get_entity("entity:OnlyDoc1") - assert only1 is None - - # concept:attr1 should be deleted (orphan, no EXTRACTED_FROM left) - concepts = store.execute_cypher( - "MATCH (c:Concept {id: 'concept:attr1'}) RETURN c" - ) - assert len(concepts) == 0 - - # doc-1 Document node should be gone - d1 = store.execute_cypher("MATCH (d:Document {id: 'doc-1'}) RETURN d") - assert len(d1) == 0 - - # doc-2 Document node should survive - d2 = store.execute_cypher("MATCH (d:Document {id: 'doc-2'}) RETURN d") - assert len(d2) == 1 - - store.close() - - def test_confidence_guard(self): - """Upserting with lower confidence should not overwrite higher-confidence data.""" - store = FalkorDBGraphStore(TEST_PROVENANCE_GRAPH_PATH) - store.initialize(reset=True) - - # First upsert with high confidence - store.upsert_entity( - "entity:Guarded", "Guarded", "Full description", confidence=1.0 - ) - - # Second upsert with lower confidence - store.upsert_entity( - "entity:Guarded", "GuardedOverwritten", "Stub description", confidence=0.7 - ) - - # Should retain the higher-confidence values - result = store.execute_cypher( - "MATCH (e:Entity {id: 'entity:Guarded'}) RETURN e.name as name, e.description as descr, e.confidence as conf" - ) - assert len(result) == 1 - assert result[0]["name"] == "Guarded" - assert result[0]["descr"] == "Full description" - assert result[0]["conf"] == 1.0 - - store.close() - - def test_confidence_guard_allows_equal_or_higher(self): - """Upserting with equal or higher confidence should overwrite.""" - store = FalkorDBGraphStore(TEST_PROVENANCE_GRAPH_PATH) - store.initialize(reset=True) - - 
store.upsert_entity("entity:G", "Original", "Original desc", confidence=0.7) - store.upsert_entity("entity:G", "Updated", "Updated desc", confidence=1.0) - - result = store.execute_cypher( - "MATCH (e:Entity {id: 'entity:G'}) RETURN e.name as name, e.confidence as conf" - ) - assert result[0]["name"] == "Updated" - assert result[0]["conf"] == 1.0 - - store.close() - - def test_get_document_impact(self): - """get_document_impact should return all nodes extracted from a document.""" - store = FalkorDBGraphStore(TEST_PROVENANCE_GRAPH_PATH) - store.initialize(reset=True) - - store.upsert_document("doc-impact", "Impact Doc", "impact.md", "entity") - store.upsert_entity("entity:A", "A", "Entity A", confidence=1.0) - store.add_extracted_from("entity:A", "Entity", "doc-impact", "primary", 1.0) - store.upsert_concept("concept:A.x", "x", "attribute", "attr x", confidence=0.95) - store.add_extracted_from("concept:A.x", "Concept", "doc-impact", "primary", 0.95) - store.upsert_event("event:ACreated", "ACreated", "A created", confidence=0.9) - store.add_extracted_from("event:ACreated", "Event", "doc-impact", "primary", 0.9) - - impact = store.get_document_impact("doc-impact") - assert len(impact) == 3 - node_types = {i["node_type"] for i in impact} - assert node_types == {"Entity", "Concept", "Event"} - - store.close() - - def test_get_node_provenance(self): - """get_node_provenance should return all documents that contributed to a node.""" - store = FalkorDBGraphStore(TEST_PROVENANCE_GRAPH_PATH) - store.initialize(reset=True) - - store.upsert_document("doc-p1", "Doc P1", "p1.md", "entity") - store.upsert_document("doc-p2", "Doc P2", "p2.md", "entity") - - store.upsert_entity("entity:Multi", "Multi", "Multi-source", confidence=1.0) - store.add_extracted_from("entity:Multi", "Entity", "doc-p1", "primary", 1.0) - store.add_extracted_from("entity:Multi", "Entity", "doc-p2", "referenced", 0.7) - - provenance = store.get_node_provenance("entity:Multi") - assert len(provenance) == 2 - 
roles = {p["role"] for p in provenance} - assert roles == {"primary", "referenced"} - doc_ids = {p["doc_id"] for p in provenance} - assert doc_ids == {"doc-p1", "doc-p2"} - - store.close() - - def test_get_stats_includes_document_count(self): - """get_stats should include document_count.""" - store = FalkorDBGraphStore(TEST_PROVENANCE_GRAPH_PATH) - store.initialize(reset=True) - - store.upsert_document("doc-s1", "Stats Doc", "s1.md", "entity") - store.upsert_entity("entity:S", "S", "Stats entity", confidence=1.0) - - stats = store.get_stats() - assert "document_count" in stats - assert stats["document_count"] == 1 - assert stats["entity_count"] == 1 - - store.close() - - @pytest.mark.asyncio - async def test_full_pipeline_creates_provenance(self): - """Full pipeline ingestion should create Document + EXTRACTED_FROM edges.""" - content = FIXTURE_PATH.read_text() - - pipeline = EntityIngestionPipeline( - graph_path=TEST_PROVENANCE_GRAPH_PATH, - use_mock_summarizer=True, - ) - - result = await pipeline.ingest(content, filename="Usuario.md", skip_graph=False) - assert result.success - - # Should have a Document node - stats = pipeline.get_graph_stats() - assert stats["document_count"] >= 1 - - # Should have EXTRACTED_FROM edges - edges = pipeline.query_graph(""" - MATCH (n)-[r:EXTRACTED_FROM]->(d:Document) - RETURN count(r) as cnt - """) - assert edges[0]["cnt"] > 0 - - # Main entity should have provenance to the document - provenance = pipeline.query_graph(""" - MATCH (e:Entity {name: 'Usuario'})-[r:EXTRACTED_FROM]->(d:Document) - RETURN d.id as doc_id, r.role as role - """) - assert len(provenance) == 1 - assert provenance[0]["role"] == "primary" - - pipeline.close() - - -# Quick manual test -async def main(): - """Run a quick test of the pipeline.""" - print("=" * 60) - print("Testing Smart Pipeline with FalkorDB") - print("=" * 60) - - # Clean up - if TEST_GRAPH_PATH.exists(): - TEST_GRAPH_PATH.unlink() - - content = FIXTURE_PATH.read_text() - print(f"\nLoaded: 
{FIXTURE_PATH.name} ({len(content)} chars)") - - # Create pipeline - pipeline = EntityIngestionPipeline( - graph_path=TEST_GRAPH_PATH, - use_mock_summarizer=True, - ) - - # Ingest with graph storage - print("\nIngesting document...") - result = await pipeline.ingest(content, filename="Usuario.md", skip_graph=False) - - print(f"\nResult:") - print(f" - Success: {result.success}") - print(f" - Document ID: {result.document_id}") - print(f" - Chunks created: {result.chunks_created}") - print(f" - Entities extracted: {result.entities_extracted}") - print(f" - Relations created: {result.relations_created}") - print(f" - Processing time: {result.processing_time_ms:.2f}ms") - - # Query graph - print("\n" + "-" * 60) - print("Querying FalkorDB Graph") - print("-" * 60) - - print("\nEntities:") - entities = pipeline.query_graph("MATCH (e:Entity) RETURN e.name as name, e.code_class as code") - for e in entities: - print(f" - {e['name']} ({e['code']})") - - print("\nRelationships from Usuario:") - # Query each relationship type - for rel_type in ["CONTAINS", "REFERENCES", "PRODUCES", "CONSUMES"]: - rels = pipeline.query_graph(f""" - MATCH (e:Entity {{name: 'Usuario'}})-[r:{rel_type}]->(n) - RETURN label(n) as target_type, n.name as target_name - LIMIT 5 - """) - for r in rels: - print(f" - {rel_type} -> {r['target_type']}:{r['target_name']}") - - print("\nGraph stats:") - stats = pipeline.get_graph_stats() - for key, value in stats.items(): - print(f" - {key}: {value}") - - pipeline.close() - - print("\n" + "=" * 60) - print("Test completed successfully!") - print("=" * 60) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py deleted file mode 100644 index 7d76d90..0000000 --- a/tests/unit/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Unit tests for KB-Engine.""" diff --git a/tests/unit/chunking/__init__.py b/tests/unit/chunking/__init__.py deleted file mode 100644 index f2c8b27..0000000 --- 
a/tests/unit/chunking/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Unit tests for chunking module.""" diff --git a/tests/unit/chunking/test_chunking.py b/tests/unit/chunking/test_chunking.py deleted file mode 100644 index 60d14a1..0000000 --- a/tests/unit/chunking/test_chunking.py +++ /dev/null @@ -1,230 +0,0 @@ -"""Tests for chunking functionality.""" - -import pytest - -from kb_engine.chunking import ChunkerFactory, ChunkingConfig -from kb_engine.chunking.strategies import ( - DefaultChunkingStrategy, - EntityChunkingStrategy, - RuleChunkingStrategy, - UseCaseChunkingStrategy, -) -from kb_engine.core.models.document import ChunkType, Document - - -@pytest.mark.unit -class TestChunkingConfig: - """Tests for ChunkingConfig.""" - - def test_default_config(self) -> None: - """Test default configuration values.""" - config = ChunkingConfig() - - assert config.min_chunk_size == 100 - assert config.target_chunk_size == 512 - assert config.max_chunk_size == 1024 - assert config.overlap_size == 50 - - def test_custom_config(self) -> None: - """Test custom configuration.""" - config = ChunkingConfig( - min_chunk_size=50, - target_chunk_size=256, - max_chunk_size=512, - ) - - assert config.min_chunk_size == 50 - assert config.target_chunk_size == 256 - assert config.max_chunk_size == 512 - - -@pytest.mark.unit -class TestEntityChunkingStrategy: - """Tests for EntityChunkingStrategy.""" - - def test_can_handle_entity_content(self) -> None: - """Test detection of entity content.""" - strategy = EntityChunkingStrategy() - doc = Document(title="Test", content="") - - entity_content = """ -## Entity: User - -A user represents an authenticated individual. 
- -### Attributes -- **id**: Unique identifier -- **name**: Full name -- **email**: Email address -""" - assert strategy.can_handle(doc, entity_content) is True - - def test_cannot_handle_non_entity_content(self) -> None: - """Test rejection of non-entity content.""" - strategy = EntityChunkingStrategy() - doc = Document(title="Test", content="") - - non_entity_content = "This is just regular text without entity patterns." - assert strategy.can_handle(doc, non_entity_content) is False - - def test_chunk_type(self) -> None: - """Test chunk type is ENTITY.""" - strategy = EntityChunkingStrategy() - assert strategy.chunk_type == ChunkType.ENTITY - - -@pytest.mark.unit -class TestUseCaseChunkingStrategy: - """Tests for UseCaseChunkingStrategy.""" - - def test_can_handle_use_case_content(self) -> None: - """Test detection of use case content.""" - strategy = UseCaseChunkingStrategy() - doc = Document(title="Test", content="") - - use_case_content = """ -## Use Case: Login - -### Actors -- User - -### Preconditions -- User has an account - -### Main Flow -1. User enters credentials -2. System validates -""" - assert strategy.can_handle(doc, use_case_content) is True - - def test_chunk_type(self) -> None: - """Test chunk type is USE_CASE.""" - strategy = UseCaseChunkingStrategy() - assert strategy.chunk_type == ChunkType.USE_CASE - - -@pytest.mark.unit -class TestRuleChunkingStrategy: - """Tests for RuleChunkingStrategy.""" - - def test_can_handle_rule_content(self) -> None: - """Test detection of rule content.""" - strategy = RuleChunkingStrategy() - doc = Document(title="Test", content="") - - rule_content = """ -## Business Rules - -### RN-001: Validation Rule -When a user submits a form, then all required fields must be filled. 
-""" - assert strategy.can_handle(doc, rule_content) is True - - def test_chunk_type(self) -> None: - """Test chunk type is RULE.""" - strategy = RuleChunkingStrategy() - assert strategy.chunk_type == ChunkType.RULE - - -@pytest.mark.unit -class TestDefaultChunkingStrategy: - """Tests for DefaultChunkingStrategy.""" - - def test_can_handle_any_content(self) -> None: - """Test that default strategy handles any content.""" - strategy = DefaultChunkingStrategy() - doc = Document(title="Test", content="") - - assert strategy.can_handle(doc, "Any content") is True - assert strategy.can_handle(doc, "") is True - - def test_chunk_type(self) -> None: - """Test chunk type is DEFAULT.""" - strategy = DefaultChunkingStrategy() - assert strategy.chunk_type == ChunkType.DEFAULT - - def test_chunk_small_content(self) -> None: - """Test chunking small content.""" - strategy = DefaultChunkingStrategy() - doc = Document(title="Test", content="Small content") - - chunks = strategy.chunk(doc, "Small content") - - assert len(chunks) == 1 - assert chunks[0].content == "Small content" - assert chunks[0].chunk_type == ChunkType.DEFAULT - - -@pytest.mark.unit -class TestChunkerFactory: - """Tests for ChunkerFactory.""" - - def test_factory_initialization(self) -> None: - """Test factory initializes with strategies.""" - factory = ChunkerFactory() - - types = factory.get_available_chunk_types() - assert ChunkType.ENTITY in types - assert ChunkType.USE_CASE in types - assert ChunkType.RULE in types - assert ChunkType.DEFAULT in types - - def test_strategy_selection_entity(self) -> None: - """Test strategy selection for entity content.""" - factory = ChunkerFactory() - doc = Document(title="Test", content="") - - entity_content = "## Entity: User\n\n- **id**: UUID" - strategy = factory.get_strategy_for_content(doc, entity_content) - - assert strategy.chunk_type == ChunkType.ENTITY - - def test_strategy_selection_default(self) -> None: - """Test fallback to default strategy.""" - factory = 
ChunkerFactory() - doc = Document(title="Test", content="") - - generic_content = "Some generic text without special patterns." - strategy = factory.get_strategy_for_content(doc, generic_content) - - assert strategy.chunk_type == ChunkType.DEFAULT - - def test_chunk_document(self, sample_document: Document) -> None: - """Test chunking a full document.""" - factory = ChunkerFactory() - - chunks = factory.chunk_document(sample_document) - - assert len(chunks) > 0 - assert all(c.document_id == sample_document.id for c in chunks) - # Verify sequences are unique - sequences = [c.sequence for c in chunks] - assert len(sequences) == len(set(sequences)) - - def test_chunk_document_with_json_parser(self) -> None: - """Test chunking a JSON document.""" - factory = ChunkerFactory() - doc = Document( - title="config", - content='{"database": {"host": "localhost", "port": 5432}}', - ) - - chunks = factory.chunk_document(doc, parser="json") - - assert len(chunks) > 0 - assert all(c.document_id == doc.id for c in chunks) - - def test_chunk_document_with_plaintext_parser(self) -> None: - """Test chunking a plain text document.""" - factory = ChunkerFactory() - doc = Document( - title="readme", - content="First paragraph.\n\nSecond paragraph.\n\nThird paragraph.", - ) - - chunks = factory.chunk_document(doc, parser="plaintext") - - assert len(chunks) == 3 - assert all(c.document_id == doc.id for c in chunks) - sequences = [c.sequence for c in chunks] - assert sequences == [0, 1, 2] diff --git a/tests/unit/chunking/test_hierarchical_chunker.py b/tests/unit/chunking/test_hierarchical_chunker.py deleted file mode 100644 index d29c173..0000000 --- a/tests/unit/chunking/test_hierarchical_chunker.py +++ /dev/null @@ -1,216 +0,0 @@ -"""Tests for HierarchicalChunker paragraph splitting (BR-EMBEDDING-001).""" - -import pytest - -from kb_engine.smart.chunking import HierarchicalChunker, MockSummaryService -from kb_engine.smart.types import ( - ChunkingStrategy, - ContentExpectation, - 
KDDDocumentKind, - ParsedDocument, - ParsedSection, - SectionDefinition, - TemplateSchema, -) - -# Reusable paragraphs with >= 20 words each -P_PEDIDO = ( - "Representa un pedido de compra realizado por un Usuario registrado " - "en la plataforma de comercio electrónico del sistema principal de ventas." -) # 21 words -P_CICLO = ( - "El pedido tiene un ciclo de vida completo que va desde borrador hasta " - "entregado, pasando por confirmado, en preparación y finalmente enviado." -) # 22 words -P_LINEAS = ( - "Cada pedido contiene una o más líneas con productos seleccionados " - "del catálogo vigente, incluyendo cantidades y precios unitarios actualizados al momento." -) # 21 words - - -def _make_schema( - chunking_strategy: ChunkingStrategy = ChunkingStrategy.SPLIT_BY_PARAGRAPHS, - section_name: str = "Descripción", -) -> TemplateSchema: - return TemplateSchema( - kind=KDDDocumentKind.ENTITY, - required_sections=[ - SectionDefinition( - name=section_name, - required=True, - content_expectation=ContentExpectation.TEXT, - chunking_strategy=chunking_strategy, - ), - ], - ) - - -def _make_parsed( - content: str, - section_name: str = "Descripción", - doc_id: str = "TEST-001", -) -> ParsedDocument: - return ParsedDocument( - kind=KDDDocumentKind.ENTITY, - frontmatter={"id": doc_id}, - title="TestEntity", - sections=[ - ParsedSection( - name=section_name, - level=2, - content=content, - ), - ], - tables=[], - code_blocks=[], - cross_references=[], - validation_errors=[], - raw_content=content, - ) - - -def _make_chunker(max_chunk_size: int = 1024) -> HierarchicalChunker: - return HierarchicalChunker( - summary_service=MockSummaryService(), - max_chunk_size=max_chunk_size, - ) - - -@pytest.mark.unit -class TestSplitByParagraphs: - """Tests for SPLIT_BY_PARAGRAPHS strategy per BR-EMBEDDING-001.""" - - @pytest.mark.asyncio - async def test_two_paragraphs_produce_two_chunks(self): - """Each paragraph (>= 20 words) produces an independent chunk.""" - text = 
f"{P_PEDIDO}\n\n{P_CICLO}" - chunker = _make_chunker() - chunks = await chunker.chunk(_make_parsed(text), _make_schema()) - - assert len(chunks) == 2 - assert chunks[0].chunk_type == "paragraph" - assert chunks[1].chunk_type == "paragraph" - assert "pedido de compra" in chunks[0].content - assert "ciclo de vida" in chunks[1].content - - @pytest.mark.asyncio - async def test_short_paragraph_merged_with_next(self): - """Paragraphs with < 20 words are merged with the next one.""" - short = "Párrafo corto introductorio." - text = f"{short}\n\n{P_PEDIDO}" - chunker = _make_chunker() - chunks = await chunker.chunk(_make_parsed(text), _make_schema()) - - assert len(chunks) == 1 - assert "Párrafo corto" in chunks[0].content - assert "pedido de compra" in chunks[0].content - - @pytest.mark.asyncio - async def test_multiple_short_paragraphs_merged_until_threshold(self): - """Multiple short paragraphs accumulate until >= 20 words.""" - p1 = "Primera frase corta." - p2 = "Segunda frase corta." - p3 = "Tercera frase corta." - text = f"{p1}\n\n{p2}\n\n{p3}\n\n{P_PEDIDO}" - chunker = _make_chunker() - chunks = await chunker.chunk(_make_parsed(text), _make_schema()) - - assert len(chunks) == 1 - assert "Primera frase" in chunks[0].content - assert "pedido de compra" in chunks[0].content - - @pytest.mark.asyncio - async def test_trailing_short_paragraph_appended_to_last(self): - """A trailing short paragraph is appended to the last chunk.""" - trailing = "Nota final breve." - text = f"{P_PEDIDO}\n\n{trailing}" - chunker = _make_chunker() - chunks = await chunker.chunk(_make_parsed(text), _make_schema()) - - assert len(chunks) == 1 - assert "Nota final breve." 
in chunks[0].content - assert "pedido de compra" in chunks[0].content - - @pytest.mark.asyncio - async def test_single_paragraph_produces_one_chunk(self): - """A single paragraph produces exactly one chunk.""" - chunker = _make_chunker() - chunks = await chunker.chunk(_make_parsed(P_PEDIDO), _make_schema()) - - assert len(chunks) == 1 - assert chunks[0].chunk_type == "paragraph" - - @pytest.mark.asyncio - async def test_empty_content_produces_no_chunks(self): - """Empty content produces zero chunks.""" - chunker = _make_chunker() - chunks = await chunker.chunk(_make_parsed(""), _make_schema()) - - assert len(chunks) == 0 - - @pytest.mark.asyncio - async def test_large_paragraph_falls_back_to_size_split(self): - """A paragraph exceeding max_chunk_size is split by size.""" - large = " ".join(f"word{i}" for i in range(300)) - chunker = _make_chunker(max_chunk_size=200) - chunks = await chunker.chunk(_make_parsed(large), _make_schema()) - - assert len(chunks) > 1 - for chunk in chunks: - assert chunk.chunk_type == "text" - - @pytest.mark.asyncio - async def test_chunk_ids_are_sequential(self): - """Chunk IDs follow the doc_id#sequence pattern.""" - text = f"{P_PEDIDO}\n\n{P_CICLO}" - chunker = _make_chunker() - chunks = await chunker.chunk( - _make_parsed(text, doc_id="ENT-001"), _make_schema() - ) - - assert chunks[0].id == "ENT-001#0" - assert chunks[1].id == "ENT-001#1" - - @pytest.mark.asyncio - async def test_three_paragraphs_produce_three_chunks(self): - """Three substantial paragraphs produce three chunks (BR-EMBEDDING-001 example).""" - text = f"{P_PEDIDO}\n\n{P_CICLO}\n\n{P_LINEAS}" - chunker = _make_chunker() - chunks = await chunker.chunk(_make_parsed(text), _make_schema()) - - assert len(chunks) == 3 - assert "pedido de compra" in chunks[0].content - assert "ciclo de vida" in chunks[1].content - assert "líneas con productos" in chunks[2].content - - @pytest.mark.asyncio - async def test_contextualized_content_includes_prefix(self): - """Each chunk's 
contextualized_content includes the hierarchical prefix.""" - chunker = _make_chunker() - chunks = await chunker.chunk(_make_parsed(P_PEDIDO), _make_schema()) - - assert len(chunks) == 1 - assert chunks[0].contextualized_content != chunks[0].content - assert chunks[0].content in chunks[0].contextualized_content - - @pytest.mark.asyncio - async def test_whitespace_only_paragraphs_ignored(self): - """Blank lines between paragraphs don't produce empty chunks.""" - text = f"{P_PEDIDO}\n\n \n\n{P_CICLO}" - chunker = _make_chunker() - chunks = await chunker.chunk(_make_parsed(text), _make_schema()) - - assert len(chunks) == 2 - - -@pytest.mark.unit -class TestEntitySchemaUsesParagraphSplitting: - """Verify entity schema uses SPLIT_BY_PARAGRAPHS for Descripción.""" - - def test_descripcion_strategy(self): - from kb_engine.smart.schemas.entity import ENTITY_SCHEMA - - descripcion = next( - s for s in ENTITY_SCHEMA.required_sections if s.name == "Descripción" - ) - assert descripcion.chunking_strategy == ChunkingStrategy.SPLIT_BY_PARAGRAPHS diff --git a/tests/unit/chunking/test_parsers.py b/tests/unit/chunking/test_parsers.py deleted file mode 100644 index 7f3cf9a..0000000 --- a/tests/unit/chunking/test_parsers.py +++ /dev/null @@ -1,169 +0,0 @@ -"""Tests for content parsers.""" - -import pytest - -from kb_engine.chunking.parsers import get_parser, parse_json, parse_markdown, parse_plaintext, parse_rst, parse_yaml - - -@pytest.mark.unit -class TestParseMarkdown: - """Tests for the markdown parser.""" - - def test_headings_and_hierarchy(self) -> None: - content = "# Title\n\nIntro text.\n\n## Section A\n\nContent A.\n\n### Sub A1\n\nContent A1.\n\n## Section B\n\nContent B." 
- sections = parse_markdown(content) - - assert sections[0] == (["Title"], "Intro text.") - assert sections[1] == (["Title", "Section A"], "Content A.") - assert sections[2] == (["Title", "Section A", "Sub A1"], "Content A1.") - assert sections[3] == (["Title", "Section B"], "Content B.") - - def test_no_headings(self) -> None: - content = "Just plain text\nwith multiple lines." - sections = parse_markdown(content) - - assert len(sections) == 1 - assert sections[0] == ([], "Just plain text\nwith multiple lines.") - - def test_empty_content(self) -> None: - sections = parse_markdown("") - assert sections == [] - - -@pytest.mark.unit -class TestParseJson: - """Tests for the JSON parser.""" - - def test_object_key_paths(self) -> None: - content = '{"name": "Alice", "age": 30}' - sections = parse_json(content) - - paths = [s[0] for s in sections] - assert ["name"] in paths - assert ["age"] in paths - - def test_nested_object(self) -> None: - content = '{"user": {"name": "Alice", "email": "a@b.com"}}' - sections = parse_json(content) - - paths = [s[0] for s in sections] - assert ["user", "name"] in paths - assert ["user", "email"] in paths - - def test_array_of_objects(self) -> None: - content = '[{"name": "Alice"}, {"name": "Bob"}]' - sections = parse_json(content) - - # Each element gets its own section, labeled by name field - paths = [s[0] for s in sections] - assert any("Alice" in p for p in paths) - assert any("Bob" in p for p in paths) - - def test_invalid_json_fallback(self) -> None: - content = "not valid json {{" - sections = parse_json(content) - - assert len(sections) == 1 - assert sections[0] == ([], content) - - def test_empty_object(self) -> None: - content = "{}" - sections = parse_json(content) - - # Empty object produces no keys, fallback to raw content - assert len(sections) == 1 - assert sections[0] == ([], content) - - -@pytest.mark.unit -class TestParseYaml: - """Tests for the YAML parser.""" - - def test_nested_structure(self) -> None: - content = 
"database:\n host: localhost\n port: 5432\napp:\n name: myapp" - sections = parse_yaml(content) - - paths = [s[0] for s in sections] - assert ["database", "host"] in paths - assert ["database", "port"] in paths - assert ["app", "name"] in paths - - def test_invalid_yaml_fallback(self) -> None: - content = ":\n - :\n - :" - sections = parse_yaml(content) - # Should not crash; returns at least one section - assert len(sections) >= 1 - - def test_empty_yaml(self) -> None: - content = "" - sections = parse_yaml(content) - assert len(sections) == 1 - assert sections[0] == ([], "") - - -@pytest.mark.unit -class TestParseRst: - """Tests for the RST parser.""" - - def test_headings_with_underlines(self) -> None: - content = "Title\n=====\n\nIntro text.\n\nSection A\n---------\n\nContent A.\n\nSection B\n---------\n\nContent B." - sections = parse_rst(content) - - assert sections[0] == (["Title"], "Intro text.") - assert sections[1] == (["Title", "Section A"], "Content A.") - assert sections[2] == (["Title", "Section B"], "Content B.") - - def test_hierarchy_by_adornment_char(self) -> None: - content = "Top\n===\n\nText.\n\nSub\n---\n\nSub text.\n\nSubSub\n~~~~~~\n\nDeep text." - sections = parse_rst(content) - - assert sections[0] == (["Top"], "Text.") - assert sections[1] == (["Top", "Sub"], "Sub text.") - assert sections[2] == (["Top", "Sub", "SubSub"], "Deep text.") - - def test_no_headings(self) -> None: - content = "Just some plain text\nwithout any headings." - sections = parse_rst(content) - - assert len(sections) == 1 - assert sections[0] == ([], "Just some plain text\nwithout any headings.") - - -@pytest.mark.unit -class TestParsePlaintext: - """Tests for the plaintext parser.""" - - def test_paragraphs_by_blank_lines(self) -> None: - content = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph." 
- sections = parse_plaintext(content) - - assert len(sections) == 3 - assert sections[0] == ([], "First paragraph.") - assert sections[1] == ([], "Second paragraph.") - assert sections[2] == ([], "Third paragraph.") - - def test_single_paragraph(self) -> None: - content = "Just one paragraph without blanks." - sections = parse_plaintext(content) - - assert len(sections) == 1 - assert sections[0] == ([], "Just one paragraph without blanks.") - - def test_empty_content(self) -> None: - sections = parse_plaintext("") - assert len(sections) == 1 - assert sections[0] == ([], "") - - -@pytest.mark.unit -class TestGetParser: - """Tests for the parser registry.""" - - def test_get_known_parsers(self) -> None: - for name in ("markdown", "json", "yaml", "rst", "plaintext"): - parser = get_parser(name) - assert callable(parser) - - def test_unknown_parser_raises(self) -> None: - with pytest.raises(ValueError, match="Unknown parser"): - get_parser("nonexistent") diff --git a/tests/unit/config/test_settings.py b/tests/unit/config/test_settings.py deleted file mode 100644 index 7d17516..0000000 --- a/tests/unit/config/test_settings.py +++ /dev/null @@ -1,58 +0,0 @@ -"""Tests for Settings validation.""" - -import pytest - -from kb_engine.config.settings import Settings - - -def test_local_profile_requires_local_stores() -> None: - with pytest.raises(ValueError, match="profile=local requires traceability_store=sqlite"): - Settings( - _env_file=None, - profile="local", - traceability_store="postgres", - vector_store="chroma", - graph_store="none", - ) - - -def test_server_profile_requires_server_stores() -> None: - with pytest.raises(ValueError, match="profile=server requires traceability_store=postgres"): - Settings( - _env_file=None, - profile="server", - traceability_store="sqlite", - vector_store="qdrant", - graph_store="neo4j", - ) - - with pytest.raises(ValueError, match="profile=server requires vector_store=qdrant"): - Settings( - _env_file=None, - profile="server", - 
traceability_store="postgres", - vector_store="chroma", - graph_store="neo4j", - ) - - with pytest.raises(ValueError, match="profile=server requires graph_store=neo4j|none"): - Settings( - _env_file=None, - profile="server", - traceability_store="postgres", - vector_store="qdrant", - graph_store="sqlite", - ) - - -def test_openai_requires_api_key() -> None: - with pytest.raises(ValueError, match="openai_api_key is required"): - Settings( - _env_file=None, - profile="local", - traceability_store="sqlite", - vector_store="chroma", - graph_store="none", - embedding_provider="openai", - openai_api_key=None, - ) diff --git a/tests/unit/core/__init__.py b/tests/unit/core/__init__.py deleted file mode 100644 index 62f7145..0000000 --- a/tests/unit/core/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Unit tests for core module.""" diff --git a/tests/unit/core/test_models.py b/tests/unit/core/test_models.py deleted file mode 100644 index 9e60269..0000000 --- a/tests/unit/core/test_models.py +++ /dev/null @@ -1,258 +0,0 @@ -"""Tests for core domain models.""" - -import pytest -from uuid import uuid4 - -from kb_engine.core.models.document import Chunk, ChunkType, Document, DocumentStatus -from kb_engine.core.models.embedding import Embedding -from kb_engine.core.models.graph import Edge, EdgeType, Node, NodeType -from kb_engine.core.models.search import ( - DocumentReference, - RetrievalMode, - RetrievalResponse, - SearchFilters, -) -from kb_engine.core.models.repository import RepositoryConfig - - -@pytest.mark.unit -class TestDocument: - """Tests for Document model.""" - - def test_create_document(self) -> None: - doc = Document(title="Test", content="Test content") - assert doc.title == "Test" - assert doc.content == "Test content" - assert doc.status == DocumentStatus.PENDING - assert doc.id is not None - - def test_document_with_metadata(self) -> None: - doc = Document( - title="Test", - content="Content", - domain="test-domain", - tags=["tag1", "tag2"], - metadata={"key": 
"value"}, - ) - assert doc.domain == "test-domain" - assert doc.tags == ["tag1", "tag2"] - assert doc.metadata == {"key": "value"} - - def test_document_git_fields(self) -> None: - doc = Document( - title="Test", - content="Content", - repo_name="my-repo", - relative_path="docs/test.md", - git_commit="abc123", - git_remote_url="https://github.com/org/repo", - ) - assert doc.repo_name == "my-repo" - assert doc.relative_path == "docs/test.md" - assert doc.git_commit == "abc123" - assert doc.git_remote_url == "https://github.com/org/repo" - - -@pytest.mark.unit -class TestChunk: - """Tests for Chunk model.""" - - def test_create_chunk(self) -> None: - doc_id = uuid4() - chunk = Chunk( - document_id=doc_id, - content="Chunk content", - chunk_type=ChunkType.ENTITY, - ) - assert chunk.document_id == doc_id - assert chunk.content == "Chunk content" - assert chunk.chunk_type == ChunkType.ENTITY - - def test_chunk_with_heading_path(self) -> None: - chunk = Chunk( - document_id=uuid4(), - content="Content", - heading_path=["Section 1", "Subsection 1.1"], - ) - assert chunk.heading_path == ["Section 1", "Subsection 1.1"] - - def test_chunk_section_anchor(self) -> None: - chunk = Chunk( - document_id=uuid4(), - content="Content", - section_anchor="atributos", - ) - assert chunk.section_anchor == "atributos" - - -@pytest.mark.unit -class TestDocumentReference: - """Tests for DocumentReference model.""" - - def test_create_reference(self) -> None: - ref = DocumentReference( - url="file:///path/to/doc.md#atributos", - document_path="docs/entities/Usuario.md", - section_anchor="atributos", - title="Usuario", - section_title="Atributos", - score=0.92, - snippet="Representa a una persona que interactúa con el sistema.", - domain="core", - tags=["entity"], - chunk_type="entity", - retrieval_mode=RetrievalMode.VECTOR, - ) - assert ref.url == "file:///path/to/doc.md#atributos" - assert ref.section_anchor == "atributos" - assert ref.score == 0.92 - - def test_reference_defaults(self) -> 
None: - ref = DocumentReference( - url="file:///doc.md", - document_path="doc.md", - title="Doc", - ) - assert ref.section_anchor is None - assert ref.score == 0.0 - assert ref.snippet == "" - assert ref.tags == [] - assert ref.retrieval_mode == RetrievalMode.VECTOR - - -@pytest.mark.unit -class TestRetrievalResponse: - """Tests for RetrievalResponse model.""" - - def test_create_response(self) -> None: - ref = DocumentReference( - url="file:///doc.md", - document_path="doc.md", - title="Doc", - score=0.8, - ) - response = RetrievalResponse( - query="test query", - references=[ref], - total_count=1, - processing_time_ms=42.5, - ) - assert response.query == "test query" - assert len(response.references) == 1 - assert response.total_count == 1 - assert response.processing_time_ms == 42.5 - - -@pytest.mark.unit -class TestRepositoryConfig: - """Tests for RepositoryConfig model.""" - - def test_create_config(self) -> None: - config = RepositoryConfig( - name="my-repo", - local_path="/path/to/repo", - ) - assert config.name == "my-repo" - assert config.branch == "main" - assert config.include_patterns == ["**/*.md"] - assert config.exclude_patterns == [] - - def test_config_with_remote(self) -> None: - config = RepositoryConfig( - name="my-repo", - local_path="/path/to/repo", - remote_url="https://github.com/org/repo", - base_url_template="{remote}/blob/{branch}/{path}", - ) - assert config.remote_url == "https://github.com/org/repo" - - -@pytest.mark.unit -class TestEmbedding: - """Tests for Embedding model.""" - - def test_create_embedding(self) -> None: - chunk_id = uuid4() - doc_id = uuid4() - vector = [0.1] * 768 - - embedding = Embedding( - chunk_id=chunk_id, - document_id=doc_id, - vector=vector, - model="all-mpnet-base-v2", - dimensions=768, - ) - assert embedding.chunk_id == chunk_id - assert len(embedding.vector) == 768 - - def test_embedding_payload(self) -> None: - chunk_id = uuid4() - doc_id = uuid4() - - embedding = Embedding( - chunk_id=chunk_id, - 
document_id=doc_id, - vector=[0.1], - model="test-model", - dimensions=1, - metadata={"key": "value"}, - ) - - payload = embedding.payload - assert payload["chunk_id"] == str(chunk_id) - assert payload["document_id"] == str(doc_id) - assert payload["model"] == "test-model" - assert payload["key"] == "value" - - -@pytest.mark.unit -class TestNode: - """Tests for Node model.""" - - def test_create_node(self) -> None: - node = Node(name="TestEntity", node_type=NodeType.ENTITY, description="A test entity") - assert node.name == "TestEntity" - assert node.node_type == NodeType.ENTITY - - def test_node_types(self) -> None: - for node_type in NodeType: - node = Node(name="Test", node_type=node_type) - assert node.node_type == node_type - - -@pytest.mark.unit -class TestEdge: - """Tests for Edge model.""" - - def test_create_edge(self) -> None: - source_id = uuid4() - target_id = uuid4() - edge = Edge(source_id=source_id, target_id=target_id, edge_type=EdgeType.DEPENDS_ON) - assert edge.source_id == source_id - assert edge.edge_type == EdgeType.DEPENDS_ON - - def test_edge_types(self) -> None: - for edge_type in EdgeType: - edge = Edge(source_id=uuid4(), target_id=uuid4(), edge_type=edge_type) - assert edge.edge_type == edge_type - - -@pytest.mark.unit -class TestSearchFilters: - """Tests for SearchFilters model.""" - - def test_empty_filters(self) -> None: - filters = SearchFilters() - assert filters.document_ids is None - assert filters.domains is None - - def test_filters_with_values(self) -> None: - doc_id = uuid4() - filters = SearchFilters( - document_ids=[doc_id], - domains=["domain1"], - tags=["tag1"], - ) - assert filters.document_ids == [doc_id] - assert filters.domains == ["domain1"] diff --git a/tests/unit/extraction/__init__.py b/tests/unit/extraction/__init__.py deleted file mode 100644 index 4e100bf..0000000 --- a/tests/unit/extraction/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Unit tests for extraction module.""" diff --git 
a/tests/unit/extraction/test_extraction.py b/tests/unit/extraction/test_extraction.py deleted file mode 100644 index 157ea39..0000000 --- a/tests/unit/extraction/test_extraction.py +++ /dev/null @@ -1,184 +0,0 @@ -"""Tests for entity extraction functionality.""" - -import pytest - -from kb_engine.core.models.document import Chunk, ChunkType, Document -from kb_engine.core.models.graph import NodeType -from kb_engine.extraction import ExtractionConfig, ExtractionPipelineFactory -from kb_engine.extraction.extractors import FrontmatterExtractor, PatternExtractor -from kb_engine.extraction.models import ExtractedNode - - -@pytest.mark.unit -class TestExtractionConfig: - """Tests for ExtractionConfig.""" - - def test_default_config(self) -> None: - """Test default configuration values.""" - config = ExtractionConfig() - - assert config.use_llm is False - assert config.confidence_threshold == 0.7 - assert config.enable_frontmatter_extraction is True - assert config.enable_pattern_extraction is True - - def test_custom_config(self) -> None: - """Test custom configuration.""" - config = ExtractionConfig( - use_llm=False, - confidence_threshold=0.9, - ) - - assert config.use_llm is False - assert config.confidence_threshold == 0.9 - - -@pytest.mark.unit -class TestFrontmatterExtractor: - """Tests for FrontmatterExtractor.""" - - def test_extractor_properties(self) -> None: - """Test extractor properties.""" - extractor = FrontmatterExtractor() - - assert extractor.name == "frontmatter" - assert extractor.priority == 10 - - def test_can_extract_with_metadata(self) -> None: - """Test can_extract with document metadata.""" - extractor = FrontmatterExtractor() - doc = Document( - title="Test", - content="Content", - metadata={"key": "value"}, - ) - chunk = Chunk(document_id=doc.id, content="Content") - - assert extractor.can_extract(chunk, doc) is True - - def test_cannot_extract_without_metadata(self) -> None: - """Test can_extract without document metadata.""" - extractor = 
FrontmatterExtractor() - doc = Document(title="Test", content="Content", metadata={}) - chunk = Chunk(document_id=doc.id, content="Content") - - assert extractor.can_extract(chunk, doc) is False - - @pytest.mark.asyncio - async def test_extract_from_frontmatter(self) -> None: - """Test extraction from frontmatter.""" - extractor = FrontmatterExtractor() - doc = Document( - title="Test Document", - content="Content", - domain="test-domain", - tags=["tag1", "tag2"], - metadata={"tags": ["tag1", "tag2"]}, - ) - chunk = Chunk(document_id=doc.id, content="Content") - - result = await extractor.extract(chunk, doc) - - assert len(result.nodes) > 0 - # Should have document node and concept nodes for tags/domain - node_types = [n.node_type for n in result.nodes] - assert NodeType.DOCUMENT in node_types - assert NodeType.CONCEPT in node_types - - -@pytest.mark.unit -class TestPatternExtractor: - """Tests for PatternExtractor.""" - - def test_extractor_properties(self) -> None: - """Test extractor properties.""" - extractor = PatternExtractor() - - assert extractor.name == "pattern" - assert extractor.priority == 20 - - def test_can_extract_with_content(self) -> None: - """Test can_extract with sufficient content.""" - extractor = PatternExtractor() - doc = Document(title="Test", content="Content") - chunk = Chunk(document_id=doc.id, content="This is sufficient content for extraction.") - - assert extractor.can_extract(chunk, doc) is True - - def test_cannot_extract_short_content(self) -> None: - """Test can_extract with very short content.""" - extractor = PatternExtractor() - doc = Document(title="Test", content="") - chunk = Chunk(document_id=doc.id, content="Short") - - assert extractor.can_extract(chunk, doc) is False - - @pytest.mark.asyncio - async def test_extract_actor(self) -> None: - """Test extraction of actor entities.""" - extractor = PatternExtractor() - doc = Document(title="Test", content="Content") - chunk = Chunk( - document_id=doc.id, - content="The actor: 
Administrator manages the system.", - ) - - result = await extractor.extract(chunk, doc) - - actor_nodes = [n for n in result.nodes if n.node_type == NodeType.ACTOR] - assert len(actor_nodes) > 0 - - @pytest.mark.asyncio - async def test_extract_relationship(self) -> None: - """Test extraction of relationships.""" - extractor = PatternExtractor() - doc = Document(title="Test", content="Content") - chunk = Chunk( - document_id=doc.id, - content="The OrderService depends on PaymentService for processing.", - ) - - result = await extractor.extract(chunk, doc) - - assert len(result.edges) > 0 - - -@pytest.mark.unit -class TestExtractionPipelineFactory: - """Tests for ExtractionPipelineFactory.""" - - def test_create_pipeline_default(self) -> None: - """Test creating pipeline with default config.""" - factory = ExtractionPipelineFactory() - pipeline = factory.create_pipeline() - - assert pipeline is not None - - def test_create_pipeline_no_llm(self) -> None: - """Test creating pipeline without LLM.""" - config = ExtractionConfig(use_llm=False) - factory = ExtractionPipelineFactory(config) - pipeline = factory.create_pipeline() - - # Should still have frontmatter and pattern extractors - assert pipeline is not None - - -@pytest.mark.unit -class TestExtractedNode: - """Tests for ExtractedNode model.""" - - def test_create_extracted_node(self) -> None: - """Test creating an extracted node.""" - node = ExtractedNode( - name="TestEntity", - node_type=NodeType.ENTITY, - description="A test entity", - confidence=0.9, - extraction_method="test", - ) - - assert node.name == "TestEntity" - assert node.node_type == NodeType.ENTITY - assert node.confidence == 0.9 - assert node.extraction_method == "test" diff --git a/tests/unit/git/__init__.py b/tests/unit/git/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/unit/git/test_scanner.py b/tests/unit/git/test_scanner.py deleted file mode 100644 index 32bcd93..0000000 --- a/tests/unit/git/test_scanner.py +++ 
/dev/null @@ -1,129 +0,0 @@ -"""Tests for Git repository scanner.""" - -import subprocess -from pathlib import Path - -import pytest - -from kb_engine.core.models.repository import RepositoryConfig -from kb_engine.git.scanner import GitRepoScanner - - -@pytest.fixture -def git_repo(tmp_path: Path) -> Path: - """Create a temporary Git repository with some files.""" - repo_path = tmp_path / "test-repo" - repo_path.mkdir() - - # Init repo - subprocess.run(["git", "init"], cwd=repo_path, capture_output=True, check=True) - subprocess.run( - ["git", "config", "user.email", "test@test.com"], - cwd=repo_path, capture_output=True, check=True, - ) - subprocess.run( - ["git", "config", "user.name", "Test"], - cwd=repo_path, capture_output=True, check=True, - ) - - # Create markdown files - (repo_path / "docs").mkdir() - (repo_path / "docs" / "entity.md").write_text("# Entity\n\nContent here.\n") - (repo_path / "docs" / "process.md").write_text("# Process\n\nSteps here.\n") - (repo_path / "README.md").write_text("# Test Repo\n\nA test repository.\n") - (repo_path / "src").mkdir() - (repo_path / "src" / "main.py").write_text("print('hello')\n") - - # Commit - subprocess.run(["git", "add", "."], cwd=repo_path, capture_output=True, check=True) - subprocess.run( - ["git", "commit", "-m", "Initial commit"], - cwd=repo_path, capture_output=True, check=True, - ) - - return repo_path - - -@pytest.mark.unit -class TestGitRepoScanner: - """Tests for GitRepoScanner.""" - - def test_is_git_repo(self, git_repo: Path) -> None: - config = RepositoryConfig(name="test", local_path=str(git_repo)) - scanner = GitRepoScanner(config) - assert scanner.is_git_repo() is True - - def test_is_not_git_repo(self, tmp_path: Path) -> None: - config = RepositoryConfig(name="test", local_path=str(tmp_path)) - scanner = GitRepoScanner(config) - assert scanner.is_git_repo() is False - - def test_get_current_commit(self, git_repo: Path) -> None: - config = RepositoryConfig(name="test", local_path=str(git_repo)) 
- scanner = GitRepoScanner(config) - commit = scanner.get_current_commit() - assert len(commit) == 40 # Full SHA - - def test_scan_files_default_pattern(self, git_repo: Path) -> None: - config = RepositoryConfig(name="test", local_path=str(git_repo)) - scanner = GitRepoScanner(config) - files = scanner.scan_files() - assert "README.md" in files - assert "docs/entity.md" in files - assert "docs/process.md" in files - assert "src/main.py" not in files # Not .md - - def test_scan_files_custom_pattern(self, git_repo: Path) -> None: - config = RepositoryConfig( - name="test", - local_path=str(git_repo), - include_patterns=["**/*.py"], - ) - scanner = GitRepoScanner(config) - files = scanner.scan_files() - assert "src/main.py" in files - assert "README.md" not in files - - def test_scan_files_with_exclude(self, git_repo: Path) -> None: - config = RepositoryConfig( - name="test", - local_path=str(git_repo), - include_patterns=["**/*.md"], - exclude_patterns=["README.md"], - ) - scanner = GitRepoScanner(config) - files = scanner.scan_files() - assert "README.md" not in files - assert "docs/entity.md" in files - - def test_get_changed_files(self, git_repo: Path) -> None: - config = RepositoryConfig(name="test", local_path=str(git_repo)) - scanner = GitRepoScanner(config) - - initial_commit = scanner.get_current_commit() - - # Modify a file and create a new commit - (git_repo / "docs" / "entity.md").write_text("# Entity\n\nUpdated content.\n") - subprocess.run(["git", "add", "."], cwd=git_repo, capture_output=True, check=True) - subprocess.run( - ["git", "commit", "-m", "Update entity"], - cwd=git_repo, capture_output=True, check=True, - ) - - changed = scanner.get_changed_files(initial_commit) - assert "docs/entity.md" in changed - assert "docs/process.md" not in changed - - def test_read_file(self, git_repo: Path) -> None: - config = RepositoryConfig(name="test", local_path=str(git_repo)) - scanner = GitRepoScanner(config) - content = scanner.read_file("docs/entity.md") - 
assert "# Entity" in content - - def test_get_current_branch(self, git_repo: Path) -> None: - config = RepositoryConfig(name="test", local_path=str(git_repo)) - scanner = GitRepoScanner(config) - branch = scanner.get_current_branch() - # Depending on git version, may be "main" or "master" - assert isinstance(branch, str) - assert len(branch) > 0 diff --git a/tests/unit/git/test_url_resolver.py b/tests/unit/git/test_url_resolver.py deleted file mode 100644 index 354df91..0000000 --- a/tests/unit/git/test_url_resolver.py +++ /dev/null @@ -1,78 +0,0 @@ -"""Tests for URL resolver.""" - -import pytest - -from kb_engine.core.models.repository import RepositoryConfig -from kb_engine.git.url_resolver import URLResolver - - -@pytest.mark.unit -class TestURLResolver: - """Tests for URLResolver.""" - - def test_resolve_local(self) -> None: - config = RepositoryConfig( - name="test-repo", - local_path="/tmp/test-repo", - ) - resolver = URLResolver(config) - url = resolver.resolve("docs/entity.md", "atributos") - assert url.startswith("file://") - assert url.endswith("/test-repo/docs/entity.md#atributos") - - def test_resolve_local_no_anchor(self) -> None: - config = RepositoryConfig( - name="test-repo", - local_path="/tmp/test-repo", - ) - resolver = URLResolver(config) - url = resolver.resolve("docs/entity.md") - assert url.startswith("file://") - assert url.endswith("/test-repo/docs/entity.md") - assert "#" not in url - - def test_resolve_remote_https(self) -> None: - config = RepositoryConfig( - name="test-repo", - local_path="/tmp/test-repo", - remote_url="https://github.com/org/repo.git", - branch="main", - ) - resolver = URLResolver(config) - url = resolver.resolve("docs/entity.md", "atributos") - assert url == "https://github.com/org/repo/blob/main/docs/entity.md#atributos" - - def test_resolve_remote_ssh(self) -> None: - config = RepositoryConfig( - name="test-repo", - local_path="/tmp/test-repo", - remote_url="git@github.com:org/repo.git", - branch="develop", - ) - 
resolver = URLResolver(config) - url = resolver.resolve("README.md") - assert url == "https://github.com/org/repo/blob/develop/README.md" - - def test_resolve_with_template(self) -> None: - config = RepositoryConfig( - name="test-repo", - local_path="/tmp/test-repo", - remote_url="https://github.com/org/repo", - branch="main", - base_url_template="{remote}/blob/{branch}/{path}", - ) - resolver = URLResolver(config) - url = resolver.resolve("docs/entity.md", "sec") - assert url == "https://github.com/org/repo/blob/main/docs/entity.md#sec" - - def test_normalize_ssh_url(self) -> None: - result = URLResolver._normalize_remote_url("git@github.com:org/repo.git") - assert result == "https://github.com/org/repo" - - def test_normalize_https_url(self) -> None: - result = URLResolver._normalize_remote_url("https://github.com/org/repo.git") - assert result == "https://github.com/org/repo" - - def test_normalize_clean_url(self) -> None: - result = URLResolver._normalize_remote_url("https://github.com/org/repo") - assert result == "https://github.com/org/repo" diff --git a/tests/unit/repositories/__init__.py b/tests/unit/repositories/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/unit/repositories/test_sqlite_graph.py b/tests/unit/repositories/test_sqlite_graph.py deleted file mode 100644 index 4fb9bfd..0000000 --- a/tests/unit/repositories/test_sqlite_graph.py +++ /dev/null @@ -1,130 +0,0 @@ -"""Tests for SQLite graph repository.""" - -import pytest -from uuid import uuid4 - -from kb_engine.core.models.graph import Edge, EdgeType, Node, NodeType -from kb_engine.repositories.graph.sqlite import SQLiteGraphRepository - - -@pytest.fixture -async def graph_repo(tmp_path) -> SQLiteGraphRepository: - """Create a SQLite graph repository for testing.""" - db_path = str(tmp_path / "test.db") - repo = SQLiteGraphRepository(db_path=db_path) - await repo.initialize() - yield repo - await repo.close() - - -@pytest.mark.unit -class TestSQLiteGraphRepository: 
- """Tests for SQLiteGraphRepository.""" - - @pytest.mark.asyncio - async def test_create_and_get_node(self, graph_repo: SQLiteGraphRepository) -> None: - node = Node( - name="User", - node_type=NodeType.ENTITY, - description="A user entity", - ) - created = await graph_repo.create_node(node) - assert created.name == "User" - - fetched = await graph_repo.get_node(node.id) - assert fetched is not None - assert fetched.name == "User" - assert fetched.node_type == NodeType.ENTITY - - @pytest.mark.asyncio - async def test_find_nodes_by_type(self, graph_repo: SQLiteGraphRepository) -> None: - await graph_repo.create_node( - Node(name="User", node_type=NodeType.ENTITY) - ) - await graph_repo.create_node( - Node(name="Login", node_type=NodeType.USE_CASE) - ) - await graph_repo.create_node( - Node(name="Admin", node_type=NodeType.ENTITY) - ) - - entities = await graph_repo.find_nodes(node_type="entity") - assert len(entities) == 2 - - use_cases = await graph_repo.find_nodes(node_type="use_case") - assert len(use_cases) == 1 - - @pytest.mark.asyncio - async def test_find_nodes_by_name(self, graph_repo: SQLiteGraphRepository) -> None: - await graph_repo.create_node( - Node(name="User", node_type=NodeType.ENTITY) - ) - await graph_repo.create_node( - Node(name="Admin", node_type=NodeType.ENTITY) - ) - - results = await graph_repo.find_nodes(name_pattern="Us") - assert len(results) == 1 - assert results[0].name == "User" - - @pytest.mark.asyncio - async def test_create_and_get_edges(self, graph_repo: SQLiteGraphRepository) -> None: - node1 = Node(name="User", node_type=NodeType.ENTITY) - node2 = Node(name="Login", node_type=NodeType.USE_CASE) - await graph_repo.create_node(node1) - await graph_repo.create_node(node2) - - edge = Edge( - source_id=node1.id, - target_id=node2.id, - edge_type=EdgeType.PERFORMS, - ) - await graph_repo.create_edge(edge) - - edges = await graph_repo.get_edges(node1.id, direction="out") - assert len(edges) == 1 - assert edges[0].edge_type == 
EdgeType.PERFORMS - - @pytest.mark.asyncio - async def test_traverse(self, graph_repo: SQLiteGraphRepository) -> None: - # Create a simple graph: A -> B -> C - a = Node(name="A", node_type=NodeType.ENTITY) - b = Node(name="B", node_type=NodeType.ENTITY) - c = Node(name="C", node_type=NodeType.ENTITY) - await graph_repo.create_node(a) - await graph_repo.create_node(b) - await graph_repo.create_node(c) - - await graph_repo.create_edge( - Edge(source_id=a.id, target_id=b.id, edge_type=EdgeType.RELATED_TO) - ) - await graph_repo.create_edge( - Edge(source_id=b.id, target_id=c.id, edge_type=EdgeType.RELATED_TO) - ) - - # Traverse with max_hops=1 should find B - results_1 = await graph_repo.traverse(a.id, max_hops=1) - target_names_1 = {t.name for _, _, t in results_1} - assert "B" in target_names_1 - - # Traverse with max_hops=2 should find B and C - results_2 = await graph_repo.traverse(a.id, max_hops=2) - target_names_2 = {t.name for _, _, t in results_2} - assert "B" in target_names_2 - assert "C" in target_names_2 - - @pytest.mark.asyncio - async def test_delete_by_document(self, graph_repo: SQLiteGraphRepository) -> None: - doc_id = uuid4() - node = Node( - name="User", - node_type=NodeType.ENTITY, - source_document_id=doc_id, - ) - await graph_repo.create_node(node) - - deleted = await graph_repo.delete_by_document(doc_id) - assert deleted >= 1 - - fetched = await graph_repo.get_node(node.id) - assert fetched is None diff --git a/tests/unit/repositories/test_sqlite_traceability.py b/tests/unit/repositories/test_sqlite_traceability.py deleted file mode 100644 index ea64525..0000000 --- a/tests/unit/repositories/test_sqlite_traceability.py +++ /dev/null @@ -1,163 +0,0 @@ -"""Tests for SQLite traceability repository.""" - -import pytest -from uuid import uuid4 - -from kb_engine.core.models.document import Chunk, ChunkType, Document, DocumentStatus -from kb_engine.repositories.traceability.sqlite import SQLiteRepository - - -@pytest.fixture -async def 
sqlite_repo(tmp_path) -> SQLiteRepository: - """Create a SQLite repository for testing.""" - db_path = str(tmp_path / "test.db") - repo = SQLiteRepository(db_path=db_path) - await repo.initialize() - yield repo - await repo.close() - - -@pytest.mark.unit -class TestSQLiteRepository: - """Tests for SQLiteRepository.""" - - @pytest.mark.asyncio - async def test_save_and_get_document(self, sqlite_repo: SQLiteRepository) -> None: - doc = Document( - title="Test Doc", - content="# Test\n\nContent here.", - domain="test", - tags=["entity"], - ) - saved = await sqlite_repo.save_document(doc) - assert saved.id == doc.id - - fetched = await sqlite_repo.get_document(doc.id) - assert fetched is not None - assert fetched.title == "Test Doc" - assert fetched.domain == "test" - assert fetched.tags == ["entity"] - - @pytest.mark.asyncio - async def test_get_document_by_external_id(self, sqlite_repo: SQLiteRepository) -> None: - doc = Document( - title="Test", - content="Content", - external_id="repo:path/to/file.md", - ) - await sqlite_repo.save_document(doc) - - fetched = await sqlite_repo.get_document_by_external_id("repo:path/to/file.md") - assert fetched is not None - assert fetched.id == doc.id - - @pytest.mark.asyncio - async def test_get_document_by_relative_path(self, sqlite_repo: SQLiteRepository) -> None: - doc = Document( - title="Test", - content="Content", - repo_name="my-repo", - relative_path="docs/entity.md", - ) - await sqlite_repo.save_document(doc) - - fetched = await sqlite_repo.get_document_by_relative_path("my-repo", "docs/entity.md") - assert fetched is not None - assert fetched.id == doc.id - - @pytest.mark.asyncio - async def test_list_documents(self, sqlite_repo: SQLiteRepository) -> None: - for i in range(3): - await sqlite_repo.save_document( - Document(title=f"Doc {i}", content=f"Content {i}") - ) - - docs = await sqlite_repo.list_documents() - assert len(docs) == 3 - - @pytest.mark.asyncio - async def test_update_document(self, sqlite_repo: 
SQLiteRepository) -> None: - doc = Document(title="Original", content="Content") - await sqlite_repo.save_document(doc) - - doc.title = "Updated" - doc.status = DocumentStatus.INDEXED - await sqlite_repo.update_document(doc) - - fetched = await sqlite_repo.get_document(doc.id) - assert fetched is not None - assert fetched.title == "Updated" - assert fetched.status == DocumentStatus.INDEXED - - @pytest.mark.asyncio - async def test_delete_document(self, sqlite_repo: SQLiteRepository) -> None: - doc = Document(title="To Delete", content="Content") - await sqlite_repo.save_document(doc) - - deleted = await sqlite_repo.delete_document(doc.id) - assert deleted is True - - fetched = await sqlite_repo.get_document(doc.id) - assert fetched is None - - @pytest.mark.asyncio - async def test_save_and_get_chunks(self, sqlite_repo: SQLiteRepository) -> None: - doc = Document(title="Doc", content="Content") - await sqlite_repo.save_document(doc) - - chunks = [ - Chunk( - document_id=doc.id, - content="Chunk 1", - chunk_type=ChunkType.ENTITY, - sequence=0, - heading_path=["Doc", "Section 1"], - section_anchor="section-1", - ), - Chunk( - document_id=doc.id, - content="Chunk 2", - chunk_type=ChunkType.DEFAULT, - sequence=1, - heading_path=["Doc", "Section 2"], - section_anchor="section-2", - ), - ] - - saved = await sqlite_repo.save_chunks(chunks) - assert len(saved) == 2 - - fetched = await sqlite_repo.get_chunks_by_document(doc.id) - assert len(fetched) == 2 - assert fetched[0].content == "Chunk 1" - assert fetched[0].section_anchor == "section-1" - assert fetched[1].content == "Chunk 2" - - @pytest.mark.asyncio - async def test_get_chunk(self, sqlite_repo: SQLiteRepository) -> None: - doc = Document(title="Doc", content="Content") - await sqlite_repo.save_document(doc) - - chunk = Chunk(document_id=doc.id, content="Test chunk") - await sqlite_repo.save_chunks([chunk]) - - fetched = await sqlite_repo.get_chunk(chunk.id) - assert fetched is not None - assert fetched.content == 
"Test chunk" - - @pytest.mark.asyncio - async def test_delete_chunks_by_document(self, sqlite_repo: SQLiteRepository) -> None: - doc = Document(title="Doc", content="Content") - await sqlite_repo.save_document(doc) - - chunks = [ - Chunk(document_id=doc.id, content="Chunk 1", sequence=0), - Chunk(document_id=doc.id, content="Chunk 2", sequence=1), - ] - await sqlite_repo.save_chunks(chunks) - - deleted = await sqlite_repo.delete_chunks_by_document(doc.id) - assert deleted == 2 - - remaining = await sqlite_repo.get_chunks_by_document(doc.id) - assert len(remaining) == 0 diff --git a/tests/unit/test_cli_graph.py b/tests/unit/test_cli_graph.py deleted file mode 100644 index d25da7c..0000000 --- a/tests/unit/test_cli_graph.py +++ /dev/null @@ -1,410 +0,0 @@ -"""Tests for CLI graph commands.""" - -import json -from unittest.mock import MagicMock, patch - -from click.testing import CliRunner - -from kb_engine.cli import cli - - -@patch("kb_engine.cli._get_graph_store") -class TestGraphStats: - def test_stats_human(self, mock_get_store): - store = MagicMock() - store.get_stats.return_value = { - "entity_count": 10, - "concept_count": 5, - "event_count": 3, - "document_count": 8, - } - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "stats"]) - assert result.exit_code == 0 - assert "Entities: 10" in result.output - assert "Concepts: 5" in result.output - assert "Events: 3" in result.output - assert "Documents: 8" in result.output - assert "Total domain nodes: 18" in result.output - - def test_stats_json(self, mock_get_store): - store = MagicMock() - store.get_stats.return_value = { - "entity_count": 10, - "concept_count": 5, - "event_count": 3, - "document_count": 8, - } - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "stats", "--json"]) - assert result.exit_code == 0 - data = json.loads(result.output) - assert data["entity_count"] == 10 - - -@patch("kb_engine.cli._get_graph_store") -class TestGraphLs: 
- def test_ls_all(self, mock_get_store): - store = MagicMock() - store.get_all_nodes.return_value = [ - {"label": "Entity", "id": "entity:User", "name": "User"}, - {"label": "Concept", "id": "concept:email", "name": "Email"}, - ] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "ls"]) - assert result.exit_code == 0 - assert "Found 2 nodes" in result.output - assert "[Entity] entity:User" in result.output - assert "[Concept] concept:email" in result.output - store.get_all_nodes.assert_called_once_with(None) - - def test_ls_filtered(self, mock_get_store): - store = MagicMock() - store.get_all_nodes.return_value = [ - {"label": "Entity", "id": "entity:User", "name": "User"}, - ] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "ls", "--type", "entity"]) - assert result.exit_code == 0 - store.get_all_nodes.assert_called_once_with("entity") - - def test_ls_empty(self, mock_get_store): - store = MagicMock() - store.get_all_nodes.return_value = [] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "ls"]) - assert result.exit_code == 0 - assert "No nodes found" in result.output - - def test_ls_json(self, mock_get_store): - store = MagicMock() - store.get_all_nodes.return_value = [ - {"label": "Entity", "id": "entity:User", "name": "User"}, - ] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "ls", "--json"]) - assert result.exit_code == 0 - data = json.loads(result.output) - assert data["count"] == 1 - assert data["nodes"][0]["id"] == "entity:User" - - -@patch("kb_engine.cli._get_graph_store") -class TestGraphInspect: - def test_inspect_human(self, mock_get_store): - store = MagicMock() - store.get_node_graph.return_value = { - "center": "entity:User", - "nodes": [{"node_type": "Concept", "id": "concept:email", "name": "Email"}], - "edge_types": ["CONTAINS"], - } - store.get_node_provenance.return_value = [ - {"doc_id": "doc-1", 
"title": "User Entity", "path": "entities/User.md", "role": "primary", "confidence": 1.0}, - ] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "inspect", "entity:User"]) - assert result.exit_code == 0 - assert "Node: entity:User" in result.output - assert "[Concept] concept:email" in result.output - assert "CONTAINS" in result.output - assert "[primary] doc-1" in result.output - - def test_inspect_json(self, mock_get_store): - store = MagicMock() - store.get_node_graph.return_value = { - "center": "entity:User", - "nodes": [], - "edge_types": [], - } - store.get_node_provenance.return_value = [] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "inspect", "entity:User", "--json"]) - assert result.exit_code == 0 - data = json.loads(result.output) - assert data["neighborhood"]["center"] == "entity:User" - assert data["provenance"] == [] - - def test_inspect_custom_depth(self, mock_get_store): - store = MagicMock() - store.get_node_graph.return_value = {"center": "entity:User", "nodes": [], "edge_types": []} - store.get_node_provenance.return_value = [] - mock_get_store.return_value = store - - CliRunner().invoke(cli, ["graph", "inspect", "entity:User", "-d", "3"]) - store.get_node_graph.assert_called_once_with("entity:User", depth=3) - - -@patch("kb_engine.cli._get_graph_store") -class TestGraphPath: - def test_path_found(self, mock_get_store): - store = MagicMock() - store.find_path.return_value = [{"start_name": "User", "end_name": "Order"}] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "path", "entity:User", "entity:Order"]) - assert result.exit_code == 0 - assert "Path found: User -> Order" in result.output - - def test_path_not_found(self, mock_get_store): - store = MagicMock() - store.find_path.return_value = [] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "path", "entity:User", "entity:Order"]) - assert 
result.exit_code == 0 - assert "No path found" in result.output - - def test_path_json(self, mock_get_store): - store = MagicMock() - store.find_path.return_value = [{"start_name": "User", "end_name": "Order"}] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "path", "entity:User", "entity:Order", "--json"]) - assert result.exit_code == 0 - data = json.loads(result.output) - assert data["reachable"] is True - assert data["from"] == "entity:User" - - def test_path_custom_depth(self, mock_get_store): - store = MagicMock() - store.find_path.return_value = [] - mock_get_store.return_value = store - - CliRunner().invoke(cli, ["graph", "path", "entity:User", "entity:Order", "--max-depth", "3"]) - store.find_path.assert_called_once_with("entity:User", "entity:Order", max_depth=3) - - -@patch("kb_engine.cli._get_graph_store") -class TestGraphImpact: - def test_impact_human(self, mock_get_store): - store = MagicMock() - store.get_document_impact.return_value = [ - {"node_type": "Entity", "id": "entity:User", "name": "User", "role": "primary", "confidence": 1.0}, - ] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "impact", "doc-1"]) - assert result.exit_code == 0 - assert "1 nodes" in result.output - assert "[Entity] entity:User" in result.output - - def test_impact_empty(self, mock_get_store): - store = MagicMock() - store.get_document_impact.return_value = [] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "impact", "doc-1"]) - assert result.exit_code == 0 - assert "No nodes found" in result.output - - def test_impact_json(self, mock_get_store): - store = MagicMock() - store.get_document_impact.return_value = [ - {"node_type": "Entity", "id": "entity:User", "name": "User", "role": "primary", "confidence": 1.0}, - ] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "impact", "doc-1", "--json"]) - assert result.exit_code == 0 - 
data = json.loads(result.output) - assert data["doc_id"] == "doc-1" - assert data["count"] == 1 - - -@patch("kb_engine.cli._get_graph_store") -class TestGraphProvenance: - def test_provenance_human(self, mock_get_store): - store = MagicMock() - store.get_node_provenance.return_value = [ - {"doc_id": "doc-1", "title": "User Entity", "path": "entities/User.md", "role": "primary", "confidence": 1.0}, - ] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "provenance", "entity:User"]) - assert result.exit_code == 0 - assert "1 documents" in result.output - assert "[primary] doc-1" in result.output - assert "entities/User.md" in result.output - - def test_provenance_empty(self, mock_get_store): - store = MagicMock() - store.get_node_provenance.return_value = [] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "provenance", "entity:User"]) - assert result.exit_code == 0 - assert "No provenance records" in result.output - - def test_provenance_json(self, mock_get_store): - store = MagicMock() - store.get_node_provenance.return_value = [ - {"doc_id": "doc-1", "title": "User Entity", "path": "entities/User.md", "role": "primary", "confidence": 1.0}, - ] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "provenance", "entity:User", "--json"]) - assert result.exit_code == 0 - data = json.loads(result.output) - assert data["node_id"] == "entity:User" - assert data["count"] == 1 - - -@patch("kb_engine.cli._get_graph_store") -class TestGraphCypher: - def test_cypher_results(self, mock_get_store): - store = MagicMock() - store.execute_cypher.return_value = [ - {"type": "Entity", "cnt": 10}, - {"type": "Concept", "cnt": 5}, - ] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "cypher", "MATCH (n) RETURN labels(n)[0] as type, count(n) as cnt"]) - assert result.exit_code == 0 - assert "type" in result.output - assert "Entity" in result.output - 
- def test_cypher_empty(self, mock_get_store): - store = MagicMock() - store.execute_cypher.return_value = [] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "cypher", "MATCH (n:Foo) RETURN n"]) - assert result.exit_code == 0 - assert "no results" in result.output - - def test_cypher_json(self, mock_get_store): - store = MagicMock() - store.execute_cypher.return_value = [{"type": "Entity", "cnt": 10}] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "cypher", "MATCH (n) RETURN n", "--json"]) - assert result.exit_code == 0 - data = json.loads(result.output) - assert data["count"] == 1 - - def test_cypher_error(self, mock_get_store): - store = MagicMock() - store.execute_cypher.side_effect = Exception("Invalid syntax") - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "cypher", "INVALID QUERY"]) - assert result.exit_code != 0 - - def test_cypher_error_json(self, mock_get_store): - store = MagicMock() - store.execute_cypher.side_effect = Exception("Invalid syntax") - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "cypher", "INVALID QUERY", "--json"]) - assert result.exit_code != 0 - data = json.loads(result.output) - assert "error" in data - - -@patch("kb_engine.cli._get_graph_store") -class TestGraphDelete: - def test_delete_with_force(self, mock_get_store): - store = MagicMock() - store.delete_node.return_value = True - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "delete", "entity:User", "-f"]) - assert result.exit_code == 0 - assert "Deleted node: entity:User" in result.output - store.delete_node.assert_called_once_with("entity:User") - - def test_delete_not_found(self, mock_get_store): - store = MagicMock() - store.delete_node.return_value = False - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "delete", "entity:Ghost", "-f"]) - assert 
result.exit_code == 0 - assert "Node not found: entity:Ghost" in result.output - - def test_delete_confirmation_yes(self, mock_get_store): - store = MagicMock() - store.delete_node.return_value = True - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "delete", "entity:User"], input="y\n") - assert result.exit_code == 0 - assert "Deleted node: entity:User" in result.output - - def test_delete_confirmation_no(self, mock_get_store): - store = MagicMock() - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "delete", "entity:User"], input="n\n") - assert result.exit_code != 0 # click.confirm abort - store.delete_node.assert_not_called() - - def test_delete_json(self, mock_get_store): - store = MagicMock() - store.delete_node.return_value = True - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "delete", "entity:User", "-f", "--json"]) - assert result.exit_code == 0 - data = json.loads(result.output) - assert data["deleted"] is True - assert data["node_id"] == "entity:User" - - -@patch("kb_engine.cli._get_graph_store") -class TestGraphOrphans: - def test_orphans_human(self, mock_get_store): - store = MagicMock() - store.get_orphan_entities.return_value = [ - {"name": "Order", "confidence": 0.7, "referenced_by": ["User Entity"]}, - ] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "orphans"]) - assert result.exit_code == 0 - assert "1 orphan" in result.output - assert "Order" in result.output - - def test_orphans_json(self, mock_get_store): - store = MagicMock() - store.get_orphan_entities.return_value = [] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "orphans", "--json"]) - assert result.exit_code == 0 - data = json.loads(result.output) - assert data["count"] == 0 - - -@patch("kb_engine.cli._get_graph_store") -class TestGraphCompleteness: - def test_completeness_human(self, mock_get_store): - 
store = MagicMock() - store.get_entity_completeness.return_value = [ - {"id": "entity:User", "name": "User", "confidence": 1.0, "status": "complete", "primary_docs": ["User Entity"], "referenced_by": []}, - ] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "completeness"]) - assert result.exit_code == 0 - assert "1 total" in result.output - assert "[OK] User" in result.output - - def test_completeness_json(self, mock_get_store): - store = MagicMock() - store.get_entity_completeness.return_value = [] - mock_get_store.return_value = store - - result = CliRunner().invoke(cli, ["graph", "completeness", "--json"]) - assert result.exit_code == 0 - data = json.loads(result.output) - assert data["count"] == 0 diff --git a/tests/unit/test_mcp_server.py b/tests/unit/test_mcp_server.py deleted file mode 100644 index e9a3a39..0000000 --- a/tests/unit/test_mcp_server.py +++ /dev/null @@ -1,369 +0,0 @@ -"""Tests for MCP server tools.""" - -import json -from datetime import datetime -from unittest.mock import AsyncMock, patch -from uuid import uuid4 - -import pytest - -from kb_engine.core.models.document import Chunk, ChunkType, Document, DocumentStatus -from kb_engine.core.models.graph import Edge, EdgeType, Node, NodeType -from kb_engine.core.models.search import DocumentReference, RetrievalResponse - - -# --- Fixtures --- - - -@pytest.fixture(autouse=True) -def _reset_services(): - """Reset global service state between tests.""" - import kb_engine.mcp_server as mod - - mod._retrieval_service = None - mod._graph_repo = None - mod._traceability_repo = None - mod._factory = None - yield - mod._retrieval_service = None - mod._graph_repo = None - mod._traceability_repo = None - mod._factory = None - - -@pytest.fixture -def mock_retrieval_service(): - return AsyncMock() - - -@pytest.fixture -def mock_graph_repo(): - return AsyncMock() - - -@pytest.fixture -def mock_traceability_repo(): - return AsyncMock() - - -def _patch_services(retrieval, 
graph, traceability): - """Patch _get_services to return our mocks.""" - import kb_engine.mcp_server as mod - - mod._retrieval_service = retrieval - mod._graph_repo = graph - mod._traceability_repo = traceability - - -def _make_doc(**kwargs): - defaults = { - "id": uuid4(), - "title": "Test Document", - "content": "Some content", - "source_path": "/repo/docs/test.md", - "relative_path": "docs/test.md", - "domain": "testing", - "status": DocumentStatus.INDEXED, - "metadata": {}, - "indexed_at": datetime(2025, 1, 15, 10, 30), - } - defaults.update(kwargs) - return Document(**defaults) - - -def _make_ref(**kwargs): - defaults = { - "url": "file:///repo/docs/test.md#section", - "document_path": "docs/test.md", - "title": "Test Document", - "section_title": "Introduction", - "score": 0.87654, - "snippet": "This is a test snippet with some content.", - "chunk_type": "paragraph", - "domain": "testing", - } - defaults.update(kwargs) - return DocumentReference(**defaults) - - -# --- kdd_search tests --- - - -@pytest.mark.unit -class TestKddSearch: - @pytest.mark.asyncio - async def test_basic_search(self, mock_retrieval_service, mock_graph_repo, mock_traceability_repo): - _patch_services(mock_retrieval_service, mock_graph_repo, mock_traceability_repo) - - mock_retrieval_service.search.return_value = RetrievalResponse( - query="security model", - references=[_make_ref(), _make_ref(title="Second", score=0.75)], - total_count=2, - processing_time_ms=42.0, - ) - - from kb_engine.mcp_server import kdd_search - - result = await kdd_search(query="security model", limit=5) - data = json.loads(result) - - assert data["query"] == "security model" - assert data["total"] == 2 - assert len(data["results"]) == 2 - assert data["results"][0]["score"] == 0.8765 - assert data["results"][0]["title"] == "Test Document" - assert data["results"][0]["url"] == "file:///repo/docs/test.md#section" - - mock_retrieval_service.search.assert_awaited_once() - call_kwargs = 
mock_retrieval_service.search.call_args.kwargs - assert call_kwargs["query"] == "security model" - assert call_kwargs["limit"] == 5 - - @pytest.mark.asyncio - async def test_search_with_filters(self, mock_retrieval_service, mock_graph_repo, mock_traceability_repo): - _patch_services(mock_retrieval_service, mock_graph_repo, mock_traceability_repo) - - mock_retrieval_service.search.return_value = RetrievalResponse( - query="test", references=[], total_count=0, - ) - - from kb_engine.mcp_server import kdd_search - - await kdd_search( - query="test", - chunk_types=["header"], - domains=["architecture"], - tags=["adr"], - score_threshold=0.5, - ) - - call_kwargs = mock_retrieval_service.search.call_args.kwargs - assert call_kwargs["filters"] is not None - assert call_kwargs["filters"].chunk_types == ["header"] - assert call_kwargs["filters"].domains == ["architecture"] - assert call_kwargs["filters"].tags == ["adr"] - assert call_kwargs["score_threshold"] == 0.5 - - @pytest.mark.asyncio - async def test_search_empty_results(self, mock_retrieval_service, mock_graph_repo, mock_traceability_repo): - _patch_services(mock_retrieval_service, mock_graph_repo, mock_traceability_repo) - - mock_retrieval_service.search.return_value = RetrievalResponse( - query="nonexistent", references=[], total_count=0, - ) - - from kb_engine.mcp_server import kdd_search - - result = await kdd_search(query="nonexistent") - data = json.loads(result) - - assert data["total"] == 0 - assert data["results"] == [] - - @pytest.mark.asyncio - async def test_search_result_structure(self, mock_retrieval_service, mock_graph_repo, mock_traceability_repo): - _patch_services(mock_retrieval_service, mock_graph_repo, mock_traceability_repo) - - mock_retrieval_service.search.return_value = RetrievalResponse( - query="q", - references=[_make_ref()], - total_count=1, - ) - - from kb_engine.mcp_server import kdd_search - - result = await kdd_search(query="q") - data = json.loads(result) - item = data["results"][0] 
- - expected_keys = {"url", "title", "section", "score", "snippet", "type", "domain", "retrieval_mode"} - assert set(item.keys()) == expected_keys - assert item["section"] == "Introduction" - assert item["type"] == "paragraph" - assert item["domain"] == "testing" - - -# --- kdd_related tests --- - - -@pytest.mark.unit -class TestKddRelated: - @pytest.mark.asyncio - async def test_related_entities(self, mock_retrieval_service, mock_graph_repo, mock_traceability_repo): - _patch_services(mock_retrieval_service, mock_graph_repo, mock_traceability_repo) - - doc = _make_doc() - start_node = Node(name="SecurityModel", node_type=NodeType.CONCEPT) - target_node = Node( - name="AuthModule", - node_type=NodeType.ENTITY, - source_document_id=doc.id, - ) - edge = Edge( - source_id=start_node.id, - target_id=target_node.id, - edge_type=EdgeType.REFERENCES, - ) - - mock_graph_repo.find_nodes.return_value = [start_node] - mock_graph_repo.traverse.return_value = [(start_node, edge, target_node)] - mock_traceability_repo.get_document.return_value = doc - - from kb_engine.mcp_server import kdd_related - - result = await kdd_related(entity="SecurityModel", depth=1) - data = json.loads(result) - - assert data["entity"]["name"] == "SecurityModel" - assert len(data["related"]) == 1 - assert data["related"][0]["name"] == "AuthModule" - assert data["related"][0]["relationship"] == "REFERENCES" - assert data["related"][0]["document_url"] == f"file://{doc.source_path}" - - mock_graph_repo.find_nodes.assert_awaited_once_with(name_pattern="SecurityModel") - mock_graph_repo.traverse.assert_awaited_once() - - @pytest.mark.asyncio - async def test_related_no_graph(self, mock_retrieval_service, mock_traceability_repo): - _patch_services(mock_retrieval_service, None, mock_traceability_repo) - - from kb_engine.mcp_server import kdd_related - - result = await kdd_related(entity="Something") - data = json.loads(result) - - assert "error" in data - assert "not available" in data["error"] - - 
@pytest.mark.asyncio - async def test_related_entity_not_found(self, mock_retrieval_service, mock_graph_repo, mock_traceability_repo): - _patch_services(mock_retrieval_service, mock_graph_repo, mock_traceability_repo) - - mock_graph_repo.find_nodes.return_value = [] - - from kb_engine.mcp_server import kdd_related - - result = await kdd_related(entity="NonExistent") - data = json.loads(result) - - assert data["related"] == [] - assert "No entity found" in data["message"] - - @pytest.mark.asyncio - async def test_related_deduplicates_targets(self, mock_retrieval_service, mock_graph_repo, mock_traceability_repo): - _patch_services(mock_retrieval_service, mock_graph_repo, mock_traceability_repo) - - start = Node(name="A", node_type=NodeType.CONCEPT) - target = Node(name="B", node_type=NodeType.ENTITY) - edge1 = Edge(source_id=start.id, target_id=target.id, edge_type=EdgeType.REFERENCES) - edge2 = Edge(source_id=start.id, target_id=target.id, edge_type=EdgeType.CONTAINS) - - mock_graph_repo.find_nodes.return_value = [start] - mock_graph_repo.traverse.return_value = [(start, edge1, target), (start, edge2, target)] - mock_traceability_repo.get_document.return_value = None - - from kb_engine.mcp_server import kdd_related - - result = await kdd_related(entity="A") - data = json.loads(result) - - assert len(data["related"]) == 1 - - -# --- kdd_list tests --- - - -@pytest.mark.unit -class TestKddList: - @pytest.mark.asyncio - async def test_list_documents(self, mock_retrieval_service, mock_graph_repo, mock_traceability_repo): - _patch_services(mock_retrieval_service, mock_graph_repo, mock_traceability_repo) - - doc = _make_doc() - chunks = [ - Chunk(document_id=doc.id, content="chunk 1"), - Chunk(document_id=doc.id, content="chunk 2"), - ] - - mock_traceability_repo.list_documents.return_value = [doc] - mock_traceability_repo.get_chunks_by_document.return_value = chunks - - from kb_engine.mcp_server import kdd_list - - result = await kdd_list() - data = json.loads(result) - 
- assert data["total"] == 1 - item = data["documents"][0] - assert item["path"] == "docs/test.md" - assert item["title"] == "Test Document" - assert item["status"] == "indexed" - assert item["chunks"] == 2 - assert item["indexed_at"] is not None - - @pytest.mark.asyncio - async def test_list_filter_by_kind(self, mock_retrieval_service, mock_graph_repo, mock_traceability_repo): - _patch_services(mock_retrieval_service, mock_graph_repo, mock_traceability_repo) - - doc_adr = _make_doc(title="ADR-001", metadata={"kind": "adr"}) - doc_challenge = _make_doc(title="DC-001", metadata={"kind": "challenge"}) - - mock_traceability_repo.list_documents.return_value = [doc_adr, doc_challenge] - mock_traceability_repo.get_chunks_by_document.return_value = [] - - from kb_engine.mcp_server import kdd_list - - result = await kdd_list(kind="adr") - data = json.loads(result) - - assert data["total"] == 1 - assert data["documents"][0]["title"] == "ADR-001" - assert data["documents"][0]["kind"] == "adr" - - @pytest.mark.asyncio - async def test_list_filter_by_status(self, mock_retrieval_service, mock_graph_repo, mock_traceability_repo): - _patch_services(mock_retrieval_service, mock_graph_repo, mock_traceability_repo) - - doc_indexed = _make_doc(status=DocumentStatus.INDEXED) - doc_pending = _make_doc(title="Pending", status=DocumentStatus.PENDING) - - mock_traceability_repo.list_documents.return_value = [doc_indexed, doc_pending] - mock_traceability_repo.get_chunks_by_document.return_value = [] - - from kb_engine.mcp_server import kdd_list - - result = await kdd_list(status="indexed") - data = json.loads(result) - - assert data["total"] == 1 - assert data["documents"][0]["status"] == "indexed" - - @pytest.mark.asyncio - async def test_list_filter_by_domain(self, mock_retrieval_service, mock_graph_repo, mock_traceability_repo): - _patch_services(mock_retrieval_service, mock_graph_repo, mock_traceability_repo) - - mock_traceability_repo.list_documents.return_value = [] - - from 
kb_engine.mcp_server import kdd_list - - await kdd_list(domain="architecture") - - call_kwargs = mock_traceability_repo.list_documents.call_args.kwargs - assert call_kwargs["filters"].domains == ["architecture"] - - @pytest.mark.asyncio - async def test_list_result_structure(self, mock_retrieval_service, mock_graph_repo, mock_traceability_repo): - _patch_services(mock_retrieval_service, mock_graph_repo, mock_traceability_repo) - - doc = _make_doc(metadata={"kind": "adr"}) - mock_traceability_repo.list_documents.return_value = [doc] - mock_traceability_repo.get_chunks_by_document.return_value = [] - - from kb_engine.mcp_server import kdd_list - - result = await kdd_list() - data = json.loads(result) - item = data["documents"][0] - - expected_keys = {"path", "title", "kind", "domain", "status", "chunks", "indexed_at"} - assert set(item.keys()) == expected_keys diff --git a/tests/unit/utils/__init__.py b/tests/unit/utils/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/unit/utils/test_markdown.py b/tests/unit/utils/test_markdown.py deleted file mode 100644 index 49d2602..0000000 --- a/tests/unit/utils/test_markdown.py +++ /dev/null @@ -1,146 +0,0 @@ -"""Tests for markdown utility functions.""" - -import pytest - -from kb_engine.utils.markdown import ( - extract_frontmatter, - extract_snippet, - heading_path_to_anchor, - heading_to_anchor, - parse_markdown_sections, -) - - -@pytest.mark.unit -class TestHeadingToAnchor: - """Tests for heading_to_anchor function.""" - - def test_simple_heading(self) -> None: - assert heading_to_anchor("Atributos") == "atributos" - - def test_heading_with_spaces(self) -> None: - assert heading_to_anchor("Ciclo de Vida") == "ciclo-de-vida" - - def test_heading_with_colon(self) -> None: - assert heading_to_anchor("Entity: User") == "entity-user" - - def test_heading_with_parentheses(self) -> None: - assert heading_to_anchor("Estados (v2)") == "estados-v2" - - def test_heading_with_accents(self) -> None: - 
assert heading_to_anchor("Descripción") == "descripcion" - - def test_heading_with_special_chars(self) -> None: - assert heading_to_anchor("¿Qué es?") == "que-es" - - def test_empty_heading(self) -> None: - assert heading_to_anchor("") == "" - - def test_heading_all_special(self) -> None: - assert heading_to_anchor("---") == "" - - -@pytest.mark.unit -class TestHeadingPathToAnchor: - """Tests for heading_path_to_anchor function.""" - - def test_with_path(self) -> None: - assert heading_path_to_anchor(["Usuario", "Atributos"]) == "atributos" - - def test_single_element(self) -> None: - assert heading_path_to_anchor(["Descripción"]) == "descripcion" - - def test_empty_path(self) -> None: - assert heading_path_to_anchor([]) is None - - -@pytest.mark.unit -class TestExtractSnippet: - """Tests for extract_snippet function.""" - - def test_short_content(self) -> None: - text = "This is short content." - assert extract_snippet(text) == "This is short content." - - def test_long_content_truncated(self) -> None: - text = "A" * 300 - snippet = extract_snippet(text, max_length=200) - assert len(snippet) <= 203 # 200 + "..." - - def test_strips_markdown(self) -> None: - text = "**Bold text** and *italic* with [link](http://example.com)" - snippet = extract_snippet(text) - assert "**" not in snippet - assert "*" not in snippet - assert "](http" not in snippet - assert "Bold text" in snippet - assert "link" in snippet - - def test_sentence_boundary_truncation(self) -> None: - text = "First sentence. Second sentence. 
" + "A" * 200 - snippet = extract_snippet(text, max_length=50) - # Should truncate at sentence boundary if possible - assert snippet.endswith(".") or snippet.endswith("...") - - -@pytest.mark.unit -class TestExtractFrontmatter: - """Tests for extract_frontmatter function.""" - - def test_with_frontmatter(self) -> None: - content = """--- -title: Test -tags: - - entity ---- - -# Content here -""" - metadata, body = extract_frontmatter(content) - assert metadata["title"] == "Test" - assert metadata["tags"] == ["entity"] - assert "# Content here" in body - - def test_without_frontmatter(self) -> None: - content = "# Just a heading\n\nSome content." - metadata, body = extract_frontmatter(content) - assert metadata == {} - assert "# Just a heading" in body - - -@pytest.mark.unit -class TestParseMarkdownSections: - """Tests for parse_markdown_sections function.""" - - def test_basic_sections(self) -> None: - content = """# Title - -Intro text. - -## Section A - -Section A content. - -## Section B - -Section B content. -""" - sections = parse_markdown_sections(content) - assert len(sections) >= 3 - assert sections[0][0] == ["Title"] - assert "Intro text" in sections[0][1] - - def test_nested_sections(self) -> None: - content = """# Doc - -## Parent - -### Child - -Child content. 
-""" - sections = parse_markdown_sections(content) - # The child section should have full path - child_sections = [s for s in sections if len(s[0]) == 3] - assert len(child_sections) == 1 - assert child_sections[0][0] == ["Doc", "Parent", "Child"] diff --git a/tests/v2/__init__.py b/tests/v2/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/v2/api/__init__.py b/tests/v2/api/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/v2/api/test_cli.py b/tests/v2/api/test_cli.py deleted file mode 100644 index f6fc736..0000000 --- a/tests/v2/api/test_cli.py +++ /dev/null @@ -1,129 +0,0 @@ -"""Tests for the kdd CLI (Click commands).""" - -from __future__ import annotations - -from datetime import datetime -from pathlib import Path -from unittest.mock import MagicMock, patch - -import pytest -from click.testing import CliRunner - -from kdd.api.cli import cli -from kdd.domain.entities import IndexManifest, IndexStats -from kdd.domain.enums import IndexLevel - - -@pytest.fixture -def runner(): - return CliRunner() - - -class TestCliVersion: - def test_version_flag(self, runner): - result = runner.invoke(cli, ["--version"]) - assert result.exit_code == 0 - assert "kdd" in result.output - assert "1.0.0" in result.output - - -class TestCliStatus: - def test_status_no_index(self, runner, tmp_path): - result = runner.invoke(cli, ["status", "--specs-path", str(tmp_path)]) - assert result.exit_code == 0 - assert "No index found" in result.output - - def test_status_with_index(self, runner, tmp_path): - # Create a minimal .kdd-index/ with manifest - from kdd.infrastructure.artifact.filesystem import FilesystemArtifactStore - - idx_path = tmp_path / ".kdd-index" - store = FilesystemArtifactStore(idx_path) - store.write_manifest(IndexManifest( - version="1.0.0", - kdd_version="1.0.0", - indexed_at=datetime(2025, 1, 15), - indexed_by="test", - index_level=IndexLevel.L1, - stats=IndexStats(nodes=10, edges=20, embeddings=0), - )) - - # 
specs_path is a dir; index_path is parent/.kdd-index - specs = tmp_path / "specs" - specs.mkdir() - - result = runner.invoke(cli, [ - "status", "--specs-path", str(specs), "--index-path", str(idx_path), - ]) - assert result.exit_code == 0 - assert "Nodes:" in result.output - assert "10" in result.output - - -class TestCliIndex: - def test_index_nonexistent_path(self, runner): - result = runner.invoke(cli, ["index", "/nonexistent/path"]) - assert result.exit_code != 0 - - @patch("kdd.api.cli.create_container") - def test_index_invokes_incremental(self, mock_create, runner, tmp_path): - """Verify that 'kdd index ' calls index_incremental.""" - # Create specs directory - specs = tmp_path / "specs" - specs.mkdir() - - mock_container = MagicMock() - mock_container.index_level.value = "L1" - mock_container.index_path = str(tmp_path / ".kdd-index") - mock_create.return_value = mock_container - - with patch("kdd.application.commands.index_incremental.index_incremental") as mock_idx: - from dataclasses import dataclass - - @dataclass - class FakeResult: - indexed: int = 5 - deleted: int = 0 - skipped: int = 0 - errors: int = 0 - is_full_reindex: bool = False - - mock_idx.return_value = FakeResult() - - result = runner.invoke(cli, ["index", str(specs)]) - - assert result.exit_code == 0 - assert "Indexed: 5" in result.output - - -class TestCliSearch: - @patch("kdd.api.cli.create_container") - def test_search_no_index(self, mock_create, runner, tmp_path): - specs = tmp_path / "specs" - specs.mkdir() - - mock_container = MagicMock() - mock_container.ensure_loaded.return_value = False - mock_create.return_value = mock_container - - result = runner.invoke(cli, [ - "search", "test query", "--specs-path", str(specs), - ]) - assert result.exit_code != 0 - assert "No index found" in result.output - - -class TestCliMerge: - def test_merge_missing_sources(self, runner, tmp_path): - src1 = tmp_path / "idx1" - src2 = tmp_path / "idx2" - out = tmp_path / "out" - - src1.mkdir() - 
src2.mkdir() - - result = runner.invoke(cli, [ - "merge", str(src1), str(src2), "-o", str(out), - ]) - # Should fail because no manifests - assert result.exit_code != 0 or "failed" in result.output.lower() diff --git a/tests/v2/api/test_server.py b/tests/v2/api/test_server.py deleted file mode 100644 index d897417..0000000 --- a/tests/v2/api/test_server.py +++ /dev/null @@ -1,178 +0,0 @@ -"""Tests for the kdd REST API server.""" - -from __future__ import annotations - -from datetime import datetime - -import pytest -from fastapi.testclient import TestClient - -from kdd.api.server import app, _get_container -from kdd.domain.entities import GraphEdge, GraphNode, IndexManifest, IndexStats -from kdd.domain.enums import EdgeType, IndexLevel, KDDKind, KDDLayer -from kdd.infrastructure.graph.networkx_store import NetworkXGraphStore - - -# --------------------------------------------------------------------------- -# Fixtures -# --------------------------------------------------------------------------- - - -def _node(id: str, kind: KDDKind, layer: KDDLayer, **fields) -> GraphNode: - return GraphNode( - id=id, kind=kind, source_file=f"{id}.md", source_hash="abc", - layer=layer, indexed_fields=fields, - ) - - -def _edge(from_node: str, to_node: str, edge_type: str, violation: bool = False) -> GraphEdge: - return GraphEdge( - from_node=from_node, to_node=to_node, edge_type=edge_type, - source_file="test.md", extraction_method="section_content", - layer_violation=violation, - ) - - -NODES = [ - _node("Entity:A", KDDKind.ENTITY, KDDLayer.DOMAIN, title="Entity A"), - _node("Entity:B", KDDKind.ENTITY, KDDLayer.DOMAIN, title="Entity B"), - _node("CMD:C1", KDDKind.COMMAND, KDDLayer.BEHAVIOR, title="Command C1"), - _node("REQ:R1", KDDKind.REQUIREMENT, KDDLayer.VERIFICATION, title="Requirement R1"), -] - -EDGES = [ - _edge("Entity:A", "Entity:B", EdgeType.DOMAIN_RELATION.value), - _edge("CMD:C1", "Entity:A", EdgeType.WIKI_LINK.value), - _edge("Entity:A", "REQ:R1", 
EdgeType.WIKI_LINK.value, violation=True), -] - - -class FakeContainer: - """Minimal container for API tests.""" - - def __init__(self): - self.graph_store = NetworkXGraphStore() - self.graph_store.load(NODES, EDGES) - self.vector_store = None - self.embedding_model = None - - def ensure_loaded(self): - return True - - -@pytest.fixture -def client(): - container = FakeContainer() - app.dependency_overrides[_get_container] = lambda: container - yield TestClient(app) - app.dependency_overrides.clear() - - -# --------------------------------------------------------------------------- -# Tests -# --------------------------------------------------------------------------- - - -class TestContextEndpoint: - """POST /v1/retrieve/context — QRY-003 hybrid search.""" - - def test_context_search(self, client): - resp = client.post("/v1/retrieve/context", json={ - "query_text": "Entity A", - "min_score": 0.0, - }) - assert resp.status_code == 200 - data = resp.json() - assert "results" in data - assert "total_results" in data - # L1 index → no semantic, should get lexical + graph results - assert any("NO_EMBEDDINGS" in w for w in data["warnings"]) - - def test_context_short_query(self, client): - resp = client.post("/v1/retrieve/context", json={"query_text": "ab"}) - assert resp.status_code == 422 # pydantic validation: min_length=3 - - -class TestGraphEndpoint: - """GET /v1/retrieve/graph — QRY-001 traversal.""" - - def test_graph_traversal(self, client): - resp = client.get("/v1/retrieve/graph", params={"node_id": "Entity:A"}) - assert resp.status_code == 200 - data = resp.json() - assert data["center_node"] == "Entity:A" - assert data["total_nodes"] > 1 - - def test_graph_not_found(self, client): - resp = client.get("/v1/retrieve/graph", params={"node_id": "Entity:MISSING"}) - assert resp.status_code == 404 - - -class TestImpactEndpoint: - """GET /v1/retrieve/impact — QRY-004.""" - - def test_impact_analysis(self, client): - resp = client.get("/v1/retrieve/impact", 
params={"node_id": "Entity:A"}) - assert resp.status_code == 200 - data = resp.json() - assert data["analyzed_node"] == "Entity:A" - assert isinstance(data["directly_affected"], list) - - def test_impact_not_found(self, client): - resp = client.get("/v1/retrieve/impact", params={"node_id": "NOPE"}) - assert resp.status_code == 404 - - -class TestCoverageEndpoint: - """GET /v1/retrieve/coverage — QRY-005.""" - - def test_coverage(self, client): - resp = client.get("/v1/retrieve/coverage", params={"node_id": "Entity:A"}) - assert resp.status_code == 200 - data = resp.json() - assert data["node_id"] == "Entity:A" - assert "coverage_percent" in data - assert isinstance(data["categories"], list) - - def test_coverage_not_found(self, client): - resp = client.get("/v1/retrieve/coverage", params={"node_id": "NOPE"}) - assert resp.status_code == 404 - - -class TestViolationsEndpoint: - """GET /v1/retrieve/layer-violations — QRY-006.""" - - def test_violations_list(self, client): - resp = client.get("/v1/retrieve/layer-violations") - assert resp.status_code == 200 - data = resp.json() - assert data["total_violations"] >= 1 - assert data["total_edges_analyzed"] >= 1 - assert len(data["violations"]) >= 1 - # Verify the known violation - v = data["violations"][0] - assert "from_node" in v - assert "from_layer" in v - - -class TestSearchEndpoint: - """POST /v1/retrieve/search — QRY-002 semantic. 
Requires L2.""" - - def test_search_requires_l2(self, client): - resp = client.post("/v1/retrieve/search", json={"query_text": "test query"}) - assert resp.status_code == 400 - assert "L2" in resp.json()["detail"] - - -class TestNoContainerLoaded: - """When no container is set.""" - - def test_503_without_container(self): - app.dependency_overrides.clear() - # Reset app state - if hasattr(app.state, "container"): - del app.state.container - - client = TestClient(app, raise_server_exceptions=False) - resp = client.get("/v1/retrieve/layer-violations") - assert resp.status_code == 503 diff --git a/tests/v2/application/__init__.py b/tests/v2/application/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/v2/application/commands/__init__.py b/tests/v2/application/commands/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/v2/application/commands/test_index_document.py b/tests/v2/application/commands/test_index_document.py deleted file mode 100644 index 87787bb..0000000 --- a/tests/v2/application/commands/test_index_document.py +++ /dev/null @@ -1,191 +0,0 @@ -"""Tests for CMD-001 IndexDocument command.""" - -from pathlib import Path - -import pytest - -from kdd.application.commands.index_document import IndexResult, index_document -from kdd.application.extractors.registry import create_default_registry -from kdd.domain.events import DocumentDetected, DocumentIndexed, DocumentParsed -from kdd.infrastructure.artifact.filesystem import FilesystemArtifactStore -from kdd.infrastructure.events.bus import InMemoryEventBus - -# Root of the specs/ directory -SPECS_ROOT = Path(__file__).resolve().parents[4] / "specs" - - -@pytest.fixture -def artifact_store(tmp_path): - return FilesystemArtifactStore(tmp_path / ".kdd-index") - - -@pytest.fixture -def registry(): - return create_default_registry() - - -@pytest.fixture -def event_bus(): - return InMemoryEventBus() - - -class TestIndexDocumentL1: - """Test L1 indexing (no 
embeddings).""" - - def test_index_entity(self, artifact_store, registry): - result = index_document( - SPECS_ROOT / "01-domain" / "entities" / "KDDDocument.md", - specs_root=SPECS_ROOT, - registry=registry, - artifact_store=artifact_store, - ) - assert result.success is True - assert result.node_id == "Entity:KDDDocument" - assert result.edge_count > 0 - assert result.embedding_count == 0 - - def test_index_creates_node_file(self, artifact_store, registry): - index_document( - SPECS_ROOT / "01-domain" / "entities" / "KDDDocument.md", - specs_root=SPECS_ROOT, - registry=registry, - artifact_store=artifact_store, - ) - node = artifact_store.read_node("Entity:KDDDocument") - assert node is not None - assert node.kind.value == "entity" - - def test_index_creates_edges(self, artifact_store, registry): - index_document( - SPECS_ROOT / "01-domain" / "entities" / "KDDDocument.md", - specs_root=SPECS_ROOT, - registry=registry, - artifact_store=artifact_store, - ) - edges = artifact_store.read_edges() - assert len(edges) > 0 - - def test_index_event(self, artifact_store, registry): - result = index_document( - SPECS_ROOT / "01-domain" / "events" / "EVT-KDDDocument-Indexed.md", - specs_root=SPECS_ROOT, - registry=registry, - artifact_store=artifact_store, - ) - assert result.success is True - assert result.node_id == "Event:EVT-KDDDocument-Indexed" - - def test_index_command(self, artifact_store, registry): - result = index_document( - SPECS_ROOT / "02-behavior" / "commands" / "CMD-001-IndexDocument.md", - specs_root=SPECS_ROOT, - registry=registry, - artifact_store=artifact_store, - ) - assert result.success is True - assert result.node_id == "CMD:CMD-001" - - def test_index_use_case(self, artifact_store, registry): - result = index_document( - SPECS_ROOT / "02-behavior" / "use-cases" / "UC-001-IndexDocument.md", - specs_root=SPECS_ROOT, - registry=registry, - artifact_store=artifact_store, - ) - assert result.success is True - assert result.node_id == "UC:UC-001" - - def 
test_index_prd(self, artifact_store, registry): - result = index_document( - SPECS_ROOT / "00-requirements" / "PRD-KBEngine.md", - specs_root=SPECS_ROOT, - registry=registry, - artifact_store=artifact_store, - ) - assert result.success is True - assert result.node_id == "PRD:PRD-KBEngine" - - -class TestIndexDocumentSkips: - """Test documents that should be skipped.""" - - def test_nonexistent_file(self, artifact_store, registry): - result = index_document( - SPECS_ROOT / "nonexistent.md", - specs_root=SPECS_ROOT, - registry=registry, - artifact_store=artifact_store, - ) - assert result.success is False - assert "File error" in result.skipped_reason - - def test_no_frontmatter(self, artifact_store, registry, tmp_path): - md = tmp_path / "plain.md" - md.write_text("# Just a plain markdown file\n\nNo front-matter here.\n") - result = index_document( - md, - specs_root=tmp_path, - registry=registry, - artifact_store=artifact_store, - ) - assert result.success is False - assert "kind" in result.skipped_reason.lower() - - def test_unknown_kind(self, artifact_store, registry, tmp_path): - md = tmp_path / "unknown.md" - md.write_text("---\nid: X-001\nkind: spaceship\n---\n\n# Spaceship\n") - result = index_document( - md, - specs_root=tmp_path, - registry=registry, - artifact_store=artifact_store, - ) - assert result.success is False - assert "kind" in result.skipped_reason.lower() - - -class TestIndexDocumentEvents: - """Test domain event emission.""" - - def test_emits_document_detected(self, artifact_store, registry, event_bus): - events = [] - event_bus.subscribe(DocumentDetected, lambda e: events.append(e)) - - index_document( - SPECS_ROOT / "01-domain" / "entities" / "KDDDocument.md", - specs_root=SPECS_ROOT, - registry=registry, - artifact_store=artifact_store, - event_bus=event_bus, - ) - assert len(events) == 1 - assert events[0].kind.value == "entity" - - def test_emits_document_parsed(self, artifact_store, registry, event_bus): - events = [] - 
event_bus.subscribe(DocumentParsed, lambda e: events.append(e)) - - index_document( - SPECS_ROOT / "01-domain" / "entities" / "KDDDocument.md", - specs_root=SPECS_ROOT, - registry=registry, - artifact_store=artifact_store, - event_bus=event_bus, - ) - assert len(events) == 1 - assert events[0].section_count > 0 - - def test_emits_document_indexed(self, artifact_store, registry, event_bus): - events = [] - event_bus.subscribe(DocumentIndexed, lambda e: events.append(e)) - - index_document( - SPECS_ROOT / "01-domain" / "entities" / "KDDDocument.md", - specs_root=SPECS_ROOT, - registry=registry, - artifact_store=artifact_store, - event_bus=event_bus, - ) - assert len(events) == 1 - assert events[0].node_id == "Entity:KDDDocument" - assert events[0].edge_count > 0 diff --git a/tests/v2/application/commands/test_index_incremental.py b/tests/v2/application/commands/test_index_incremental.py deleted file mode 100644 index 28afe73..0000000 --- a/tests/v2/application/commands/test_index_incremental.py +++ /dev/null @@ -1,227 +0,0 @@ -"""Tests for CMD-002 IndexIncremental command.""" - -import subprocess -from pathlib import Path - -import pytest - -from kdd.application.commands.index_incremental import ( - IncrementalResult, - index_incremental, -) -from kdd.application.extractors.registry import create_default_registry -from kdd.domain.entities import IndexManifest, IndexStats -from kdd.domain.enums import IndexLevel -from kdd.infrastructure.artifact.filesystem import FilesystemArtifactStore - - -@pytest.fixture -def git_specs(tmp_path): - """Create a temporary git repo with a few spec files.""" - specs = tmp_path / "specs" - specs.mkdir() - - # Init git repo - subprocess.run(["git", "init"], cwd=tmp_path, capture_output=True) - subprocess.run( - ["git", "config", "user.email", "test@test.com"], - cwd=tmp_path, capture_output=True, - ) - subprocess.run( - ["git", "config", "user.name", "Test"], - cwd=tmp_path, capture_output=True, - ) - - # Create directory structure - 
domain = specs / "01-domain" / "entities" - domain.mkdir(parents=True) - - # Create a spec file - entity = domain / "Order.md" - entity.write_text( - "---\nid: Order\nkind: entity\nstatus: draft\n---\n\n" - "# Order\n\n## Descripción\n\nAn order entity.\n" - ) - - subprocess.run(["git", "add", "."], cwd=tmp_path, capture_output=True) - subprocess.run( - ["git", "commit", "-m", "init"], - cwd=tmp_path, capture_output=True, - ) - - return specs - - -@pytest.fixture -def artifact_store(tmp_path): - return FilesystemArtifactStore(tmp_path / ".kdd-index") - - -@pytest.fixture -def registry(): - return create_default_registry() - - -class TestFullReindex: - """No previous manifest → full reindex.""" - - def test_indexes_all_files(self, git_specs, artifact_store, registry): - result = index_incremental( - git_specs, - repo_root=git_specs.parent, - registry=registry, - artifact_store=artifact_store, - ) - assert result.is_full_reindex is True - assert result.indexed >= 1 - - def test_creates_manifest(self, git_specs, artifact_store, registry): - index_incremental( - git_specs, - repo_root=git_specs.parent, - registry=registry, - artifact_store=artifact_store, - ) - manifest = artifact_store.read_manifest() - assert manifest is not None - assert manifest.git_commit is not None - assert manifest.stats.nodes >= 1 - - def test_creates_node(self, git_specs, artifact_store, registry): - index_incremental( - git_specs, - repo_root=git_specs.parent, - registry=registry, - artifact_store=artifact_store, - ) - node = artifact_store.read_node("Entity:Order") - assert node is not None - - -class TestIncrementalNew: - """New file added since last index.""" - - def test_indexes_new_file(self, git_specs, artifact_store, registry): - # First: full index - index_incremental( - git_specs, - repo_root=git_specs.parent, - registry=registry, - artifact_store=artifact_store, - ) - - # Add a new file and commit - events_dir = git_specs / "01-domain" / "events" - events_dir.mkdir(parents=True, 
exist_ok=True) - evt = events_dir / "EVT-Order-Created.md" - evt.write_text( - "---\nid: EVT-Order-Created\nkind: event\nstatus: draft\n---\n\n" - "# EVT-Order-Created\n\n## Descripción\n\nOrder was created.\n" - ) - subprocess.run(["git", "add", "."], cwd=git_specs.parent, capture_output=True) - subprocess.run( - ["git", "commit", "-m", "add event"], - cwd=git_specs.parent, capture_output=True, - ) - - # Incremental - result = index_incremental( - git_specs, - repo_root=git_specs.parent, - registry=registry, - artifact_store=artifact_store, - ) - assert result.is_full_reindex is False - assert result.indexed >= 1 - - # Verify new node exists - node = artifact_store.read_node("Event:EVT-Order-Created") - assert node is not None - - -class TestIncrementalModified: - """Modified file since last index.""" - - def test_reindexes_modified_file(self, git_specs, artifact_store, registry): - index_incremental( - git_specs, - repo_root=git_specs.parent, - registry=registry, - artifact_store=artifact_store, - ) - - # Modify the entity file - entity = git_specs / "01-domain" / "entities" / "Order.md" - entity.write_text( - "---\nid: Order\nkind: entity\nstatus: review\n---\n\n" - "# Order\n\n## Descripción\n\nAn order entity (updated).\n" - ) - subprocess.run(["git", "add", "."], cwd=git_specs.parent, capture_output=True) - subprocess.run( - ["git", "commit", "-m", "update order"], - cwd=git_specs.parent, capture_output=True, - ) - - result = index_incremental( - git_specs, - repo_root=git_specs.parent, - registry=registry, - artifact_store=artifact_store, - ) - assert result.indexed >= 1 - - # Verify updated node - node = artifact_store.read_node("Entity:Order") - assert node is not None - assert node.status == "review" - - -class TestIncrementalDeleted: - """Deleted file since last index.""" - - def test_cascade_deletes(self, git_specs, artifact_store, registry): - index_incremental( - git_specs, - repo_root=git_specs.parent, - registry=registry, - 
artifact_store=artifact_store, - ) - - # Delete the entity file - entity = git_specs / "01-domain" / "entities" / "Order.md" - entity.unlink() - subprocess.run(["git", "add", "."], cwd=git_specs.parent, capture_output=True) - subprocess.run( - ["git", "commit", "-m", "delete order"], - cwd=git_specs.parent, capture_output=True, - ) - - result = index_incremental( - git_specs, - repo_root=git_specs.parent, - registry=registry, - artifact_store=artifact_store, - ) - assert result.deleted >= 1 - - -class TestIncrementalNoChanges: - """No changes since last index.""" - - def test_noop(self, git_specs, artifact_store, registry): - index_incremental( - git_specs, - repo_root=git_specs.parent, - registry=registry, - artifact_store=artifact_store, - ) - - result = index_incremental( - git_specs, - repo_root=git_specs.parent, - registry=registry, - artifact_store=artifact_store, - ) - assert result.is_full_reindex is False - assert result.indexed == 0 - assert result.deleted == 0 diff --git a/tests/v2/application/commands/test_merge_index.py b/tests/v2/application/commands/test_merge_index.py deleted file mode 100644 index 6fd12ea..0000000 --- a/tests/v2/application/commands/test_merge_index.py +++ /dev/null @@ -1,267 +0,0 @@ -"""Tests for CMD-004 MergeIndex.""" - -from datetime import datetime, timedelta -from pathlib import Path - -import pytest - -from kdd.application.commands.merge_index import merge_index -from kdd.domain.entities import ( - Embedding, - GraphEdge, - GraphNode, - IndexManifest, - IndexStats, -) -from kdd.domain.enums import IndexLevel, KDDKind, KDDLayer -from kdd.infrastructure.artifact.filesystem import FilesystemArtifactStore - - -def _make_manifest( - *, - version: str = "1.0.0", - embedding_model: str | None = "all-mpnet-base-v2", - index_level: IndexLevel = IndexLevel.L1, - nodes: int = 1, - edges: int = 1, -) -> IndexManifest: - return IndexManifest( - version=version, - kdd_version="1.0.0", - embedding_model=embedding_model, - 
indexed_at=datetime.now(), - indexed_by="test", - index_level=index_level, - stats=IndexStats(nodes=nodes, edges=edges), - ) - - -def _make_node(id: str, source_hash: str = "abc", indexed_at: datetime | None = None) -> GraphNode: - return GraphNode( - id=id, - kind=KDDKind.ENTITY, - source_file=f"{id}.md", - source_hash=source_hash, - layer=KDDLayer.DOMAIN, - indexed_at=indexed_at or datetime.now(), - ) - - -def _make_edge(from_node: str, to_node: str) -> GraphEdge: - return GraphEdge( - from_node=from_node, - to_node=to_node, - edge_type="WIKI_LINK", - source_file="test.md", - extraction_method="section_content", - ) - - -def _populate_store(store: FilesystemArtifactStore, manifest, nodes, edges) -> None: - store.write_manifest(manifest) - for node in nodes: - store.write_node(node) - if edges: - store.append_edges(edges) - - -class TestMergeIndexSuccess: - """Merge without conflicts.""" - - def test_merge_disjoint_nodes(self, tmp_path): - src1, src2 = tmp_path / "idx1", tmp_path / "idx2" - out = tmp_path / "merged" - - s1 = FilesystemArtifactStore(src1) - s2 = FilesystemArtifactStore(src2) - - node_a = _make_node("Entity:A") - node_b = _make_node("Entity:B") - edge_ab = _make_edge("Entity:A", "Entity:B") - - _populate_store(s1, _make_manifest(), [node_a], []) - _populate_store(s2, _make_manifest(), [node_b], []) - - result = merge_index([src1, src2], out) - - assert result.success - assert result.total_nodes == 2 - assert result.conflicts_resolved == 0 - - def test_merge_identical_nodes_no_conflict(self, tmp_path): - src1, src2 = tmp_path / "idx1", tmp_path / "idx2" - out = tmp_path / "merged" - - s1 = FilesystemArtifactStore(src1) - s2 = FilesystemArtifactStore(src2) - - node = _make_node("Entity:A", source_hash="same_hash") - - _populate_store(s1, _make_manifest(), [node], []) - _populate_store(s2, _make_manifest(), [node], []) - - result = merge_index([src1, src2], out) - - assert result.success - assert result.total_nodes == 1 - assert 
result.conflicts_resolved == 0 - - def test_merge_edges_union(self, tmp_path): - src1, src2 = tmp_path / "idx1", tmp_path / "idx2" - out = tmp_path / "merged" - - s1 = FilesystemArtifactStore(src1) - s2 = FilesystemArtifactStore(src2) - - node_a = _make_node("Entity:A") - node_b = _make_node("Entity:B") - node_c = _make_node("Entity:C") - - _populate_store(s1, _make_manifest(), [node_a, node_b], [_make_edge("Entity:A", "Entity:B")]) - _populate_store(s2, _make_manifest(), [node_b, node_c], [_make_edge("Entity:B", "Entity:C")]) - - result = merge_index([src1, src2], out) - - assert result.success - assert result.total_nodes == 3 - assert result.total_edges == 2 - - def test_writes_manifest(self, tmp_path): - src1, src2 = tmp_path / "idx1", tmp_path / "idx2" - out = tmp_path / "merged" - - s1 = FilesystemArtifactStore(src1) - s2 = FilesystemArtifactStore(src2) - - _populate_store(s1, _make_manifest(), [_make_node("Entity:A")], []) - _populate_store(s2, _make_manifest(), [_make_node("Entity:B")], []) - - merge_index([src1, src2], out) - - out_store = FilesystemArtifactStore(out) - manifest = out_store.read_manifest() - assert manifest is not None - assert manifest.stats.nodes == 2 - assert manifest.indexed_by == "kdd-merge" - - -class TestMergeConflictResolution: - """Merge with node conflicts — last-write-wins.""" - - def test_last_write_wins(self, tmp_path): - src1, src2 = tmp_path / "idx1", tmp_path / "idx2" - out = tmp_path / "merged" - - s1 = FilesystemArtifactStore(src1) - s2 = FilesystemArtifactStore(src2) - - old = _make_node("Entity:A", source_hash="old", indexed_at=datetime(2024, 1, 1)) - new = _make_node("Entity:A", source_hash="new", indexed_at=datetime(2025, 1, 1)) - - _populate_store(s1, _make_manifest(), [old], []) - _populate_store(s2, _make_manifest(), [new], []) - - result = merge_index([src1, src2], out) - - assert result.success - assert result.conflicts_resolved == 1 - assert result.total_nodes == 1 - - # Verify the winner - out_store = 
FilesystemArtifactStore(out) - merged_node = out_store.read_node("Entity:A") - assert merged_node is not None - assert merged_node.source_hash == "new" - - def test_fail_on_conflict_strategy(self, tmp_path): - src1, src2 = tmp_path / "idx1", tmp_path / "idx2" - out = tmp_path / "merged" - - s1 = FilesystemArtifactStore(src1) - s2 = FilesystemArtifactStore(src2) - - _populate_store(s1, _make_manifest(), [_make_node("Entity:A", source_hash="v1")], []) - _populate_store(s2, _make_manifest(), [_make_node("Entity:A", source_hash="v2")], []) - - result = merge_index([src1, src2], out, conflict_strategy="fail_on_conflict") - - assert not result.success - assert "CONFLICT_REJECTED" in result.error - - -class TestMergeValidation: - """Validation of manifest compatibility.""" - - def test_insufficient_sources(self, tmp_path): - result = merge_index([tmp_path / "only_one"], tmp_path / "out") - assert not result.success - assert "INSUFFICIENT_SOURCES" in result.error - - def test_missing_manifest(self, tmp_path): - src1, src2 = tmp_path / "idx1", tmp_path / "idx2" - src1.mkdir() - src2.mkdir() - - s1 = FilesystemArtifactStore(src1) - s1.write_manifest(_make_manifest()) - - result = merge_index([src1, src2], tmp_path / "out") - assert not result.success - assert "MANIFEST_NOT_FOUND" in result.error - - def test_incompatible_versions(self, tmp_path): - src1, src2 = tmp_path / "idx1", tmp_path / "idx2" - - s1 = FilesystemArtifactStore(src1) - s2 = FilesystemArtifactStore(src2) - - _populate_store(s1, _make_manifest(version="1.0.0"), [_make_node("Entity:A")], []) - _populate_store(s2, _make_manifest(version="2.0.0"), [_make_node("Entity:B")], []) - - result = merge_index([src1, src2], tmp_path / "out") - assert not result.success - assert "INCOMPATIBLE_VERSION" in result.error - - def test_incompatible_embedding_models(self, tmp_path): - src1, src2 = tmp_path / "idx1", tmp_path / "idx2" - - s1 = FilesystemArtifactStore(src1) - s2 = FilesystemArtifactStore(src2) - - 
_populate_store( - s1, _make_manifest(embedding_model="model-a"), [_make_node("Entity:A")], [], - ) - _populate_store( - s2, _make_manifest(embedding_model="model-b"), [_make_node("Entity:B")], [], - ) - - result = merge_index([src1, src2], tmp_path / "out") - assert not result.success - assert "INCOMPATIBLE_EMBEDDING_MODEL" in result.error - - -class TestMergeEdgeCascade: - """Edges referencing removed nodes are dropped.""" - - def test_cascade_delete_orphan_edges(self, tmp_path): - src1, src2 = tmp_path / "idx1", tmp_path / "idx2" - out = tmp_path / "merged" - - s1 = FilesystemArtifactStore(src1) - s2 = FilesystemArtifactStore(src2) - - node_a = _make_node("Entity:A") - node_b = _make_node("Entity:B") - - # src1 has A→B edge, src2 has only node C (B is not in src2) - _populate_store(s1, _make_manifest(), [node_a, node_b], [_make_edge("Entity:A", "Entity:B")]) - # src2 doesn't have node_b but has an edge referencing a nonexistent node - _populate_store( - s2, _make_manifest(), [_make_node("Entity:C")], - [_make_edge("Entity:C", "Entity:GHOST")], - ) - - result = merge_index([src1, src2], out) - assert result.success - # Only the A→B edge survives (both endpoints exist), not C→GHOST - assert result.total_edges == 1 diff --git a/tests/v2/application/commands/test_sync_index.py b/tests/v2/application/commands/test_sync_index.py deleted file mode 100644 index 4d19dbd..0000000 --- a/tests/v2/application/commands/test_sync_index.py +++ /dev/null @@ -1,274 +0,0 @@ -"""Tests for CMD-005 SyncIndex and CMD-003 EnrichWithAgent.""" - -from __future__ import annotations - -from pathlib import Path -from typing import Any - -import pytest - -from kdd.application.commands.sync_index import SyncResult, sync_pull, sync_push -from kdd.domain.entities import GraphEdge, GraphNode, IndexManifest, IndexStats -from kdd.domain.enums import IndexLevel, KDDKind, KDDLayer -from kdd.domain.ports import ArtifactStore, Transport - - -# 
--------------------------------------------------------------------------- -# Fakes -# --------------------------------------------------------------------------- - - -class FakeTransport: - """Records push/pull calls.""" - - def __init__(self, *, fail: bool = False): - self.calls: list[tuple[str, ...]] = [] - self._fail = fail - - def push(self, index_path: str, remote: str) -> None: - if self._fail: - raise ConnectionError("network down") - self.calls.append(("push", index_path, remote)) - - def pull(self, remote: str, target_path: str) -> None: - if self._fail: - raise ConnectionError("network down") - self.calls.append(("pull", remote, target_path)) - - -class FakeArtifactStore: - """Minimal ArtifactStore for sync tests.""" - - def __init__(self, has_manifest: bool = True): - self._has_manifest = has_manifest - - def read_manifest(self): - if not self._has_manifest: - return None - from datetime import datetime - return IndexManifest( - version="1.0.0", - kdd_version="1.0.0", - indexed_at=datetime.now(), - indexed_by="test", - index_level=IndexLevel.L1, - stats=IndexStats(nodes=5, edges=3), - ) - - # Required by Protocol (not used in sync tests) - def write_manifest(self, m): ... - def write_node(self, n): ... - def read_node(self, nid): ... - def append_edges(self, e): ... - def read_edges(self): return [] - def write_embeddings(self, e): ... - def read_embeddings(self, did): return [] - def read_all_nodes(self): return [] - def read_all_embeddings(self): return [] - def delete_document_artifacts(self, did): ... 
- - -# --------------------------------------------------------------------------- -# sync_push tests -# --------------------------------------------------------------------------- - - -class TestSyncPush: - def test_push_success(self): - transport = FakeTransport() - store = FakeArtifactStore() - result = sync_push(store, transport) - - assert result.success - assert result.direction == "push" - assert len(transport.calls) == 1 - assert transport.calls[0][0] == "push" - - def test_push_no_manifest(self): - transport = FakeTransport() - store = FakeArtifactStore(has_manifest=False) - result = sync_push(store, transport) - - assert not result.success - assert "NO_LOCAL_INDEX" in result.error - - def test_push_transport_error(self): - transport = FakeTransport(fail=True) - store = FakeArtifactStore() - result = sync_push(store, transport) - - assert not result.success - assert "TRANSPORT_ERROR" in result.error - - -# --------------------------------------------------------------------------- -# sync_pull tests -# --------------------------------------------------------------------------- - - -class TestSyncPull: - def test_pull_success(self): - transport = FakeTransport() - result = sync_pull(transport) - - assert result.success - assert result.direction == "pull" - assert len(transport.calls) == 1 - assert transport.calls[0][0] == "pull" - - def test_pull_transport_error(self): - transport = FakeTransport(fail=True) - result = sync_pull(transport) - - assert not result.success - assert "TRANSPORT_ERROR" in result.error - - -# --------------------------------------------------------------------------- -# EnrichWithAgent tests -# --------------------------------------------------------------------------- - - -class FakeAgentClient: - """Returns a canned enrichment.""" - - def enrich(self, node, context: str) -> dict: - return { - "summary": "Enriched summary", - "implicit_relations": [ - {"target": "Entity:GraphNode", "type": "WIKI_LINK"}, - ], - } - - -class 
FailingAgentClient: - def enrich(self, node, context: str) -> dict: - raise RuntimeError("Agent unreachable") - - -class InMemoryArtifactStore: - """ArtifactStore that holds state in memory for test verification.""" - - def __init__(self): - self._nodes: dict[str, GraphNode] = {} - self._edges: list[GraphEdge] = [] - self.root = "/tmp/test-index" - - def write_manifest(self, m): ... - - def read_manifest(self): - return None - - def write_node(self, node: GraphNode): - self._nodes[node.id] = node - - def read_node(self, node_id: str): - return self._nodes.get(node_id) - - def append_edges(self, edges): - self._edges.extend(edges) - - def read_edges(self): - return list(self._edges) - - def write_embeddings(self, e): ... - def read_embeddings(self, did): return [] - def read_all_nodes(self): return list(self._nodes.values()) - def read_all_embeddings(self): return [] - def delete_document_artifacts(self, did): ... - - -class TestEnrichWithAgent: - def test_enrich_success(self, tmp_path): - from kdd.application.commands.enrich_with_agent import enrich_with_agent - - # Setup: create a spec file and a node - spec_file = tmp_path / "entity.md" - spec_file.write_text("# Entity\nSome content", encoding="utf-8") - - store = InMemoryArtifactStore() - node = GraphNode( - id="Entity:Test", - kind=KDDKind.ENTITY, - source_file="entity.md", - source_hash="abc", - layer=KDDLayer.DOMAIN, - ) - store.write_node(node) - - result = enrich_with_agent( - "Entity:Test", - artifact_store=store, - agent_client=FakeAgentClient(), - specs_root=tmp_path, - ) - - assert result.success - assert result.enrichment is not None - assert result.implicit_edges == 1 - assert len(store._edges) == 1 - assert store._edges[0].extraction_method == "implicit" - - def test_enrich_node_not_found(self, tmp_path): - from kdd.application.commands.enrich_with_agent import enrich_with_agent - - store = InMemoryArtifactStore() - - result = enrich_with_agent( - "Entity:Missing", - artifact_store=store, - 
agent_client=FakeAgentClient(), - specs_root=tmp_path, - ) - - assert not result.success - assert "NODE_NOT_FOUND" in result.error - - def test_enrich_agent_error(self, tmp_path): - from kdd.application.commands.enrich_with_agent import enrich_with_agent - - spec_file = tmp_path / "entity.md" - spec_file.write_text("# Entity\nContent", encoding="utf-8") - - store = InMemoryArtifactStore() - node = GraphNode( - id="Entity:Test", - kind=KDDKind.ENTITY, - source_file="entity.md", - source_hash="abc", - layer=KDDLayer.DOMAIN, - ) - store.write_node(node) - - result = enrich_with_agent( - "Entity:Test", - artifact_store=store, - agent_client=FailingAgentClient(), - specs_root=tmp_path, - ) - - assert not result.success - assert "AGENT_ERROR" in result.error - - def test_enrich_document_missing(self, tmp_path): - from kdd.application.commands.enrich_with_agent import enrich_with_agent - - store = InMemoryArtifactStore() - node = GraphNode( - id="Entity:Test", - kind=KDDKind.ENTITY, - source_file="does_not_exist.md", - source_hash="abc", - layer=KDDLayer.DOMAIN, - ) - store.write_node(node) - - result = enrich_with_agent( - "Entity:Test", - artifact_store=store, - agent_client=FakeAgentClient(), - specs_root=tmp_path, - ) - - assert not result.success - assert "DOCUMENT_NOT_FOUND" in result.error diff --git a/tests/v2/application/extractors/__init__.py b/tests/v2/application/extractors/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/v2/application/extractors/conftest.py b/tests/v2/application/extractors/conftest.py deleted file mode 100644 index de6ee4f..0000000 --- a/tests/v2/application/extractors/conftest.py +++ /dev/null @@ -1,88 +0,0 @@ -"""Shared fixtures for extractor tests. - -Provides helpers that build a KDDDocument from a real spec file on disk -or from synthetic markdown content, so extractors can be tested against -actual project specs or fabricated examples. 
-""" - -from __future__ import annotations - -from pathlib import Path - -import pytest - -from kdd.domain.entities import KDDDocument -from kdd.domain.enums import KDDKind, KDDLayer -from kdd.domain.rules import detect_layer, route_document -from kdd.infrastructure.parsing.hashing import compute_content_hash -from kdd.infrastructure.parsing.markdown import extract_frontmatter, parse_markdown_sections -from kdd.infrastructure.parsing.wiki_links import extract_wiki_link_targets - -# Root of the specs/ directory in this project -SPECS_ROOT = Path(__file__).resolve().parents[4] / "specs" - - -def build_synthetic_document( - content: str, - *, - spec_path: str | None = None, -) -> KDDDocument: - """Build a KDDDocument from synthetic markdown content. - - Useful for kinds that don't have real spec files yet. - The content must include YAML front-matter with at least ``id`` and ``kind``. - """ - fm, body = extract_frontmatter(content) - assert "kind" in fm, "Synthetic content must have 'kind' in front-matter" - assert "id" in fm, "Synthetic content must have 'id' in front-matter" - - if spec_path is None: - spec_path = f"specs/synthetic/{fm['id']}.md" - - route = route_document(fm, spec_path) - assert route.kind is not None, f"Could not route kind '{fm['kind']}'" - - sections = parse_markdown_sections(body) - wiki_links = extract_wiki_link_targets(body) - layer = detect_layer(spec_path) or KDDLayer.DOMAIN - - return KDDDocument( - id=fm["id"], - kind=route.kind, - source_path=spec_path, - source_hash=compute_content_hash(content), - layer=layer, - front_matter=fm, - sections=sections, - wiki_links=wiki_links, - ) - - -def build_document(spec_path: str) -> KDDDocument: - """Build a KDDDocument from a real spec file path (relative to repo root). 
- - Example: ``build_document("specs/01-domain/entities/KDDDocument.md")`` - """ - full_path = Path(__file__).resolve().parents[4] / spec_path - content = full_path.read_text(encoding="utf-8") - fm, body = extract_frontmatter(content) - route = route_document(fm, spec_path) - assert route.kind is not None, f"Could not route {spec_path}" - - sections = parse_markdown_sections(body) - wiki_links = extract_wiki_link_targets(body) - layer = detect_layer(spec_path) - assert layer is not None, f"Could not detect layer for {spec_path}" - - doc_id = fm.get("id", full_path.stem) - - return KDDDocument( - id=doc_id, - kind=route.kind, - source_path=spec_path, - source_hash=compute_content_hash(content), - layer=layer, - front_matter=fm, - sections=sections, - wiki_links=wiki_links, - ) diff --git a/tests/v2/application/extractors/test_command.py b/tests/v2/application/extractors/test_command.py deleted file mode 100644 index 684c33a..0000000 --- a/tests/v2/application/extractors/test_command.py +++ /dev/null @@ -1,60 +0,0 @@ -"""Tests for CommandExtractor against real spec files.""" - -from kdd.application.extractors.kinds.command import CommandExtractor -from kdd.domain.enums import KDDKind - -from .conftest import build_document - - -class TestCommandExtractor: - """Parse CMD-001-IndexDocument.md.""" - - def setup_method(self): - self.doc = build_document("specs/02-behavior/commands/CMD-001-IndexDocument.md") - self.extractor = CommandExtractor() - - def test_kind(self): - assert self.extractor.kind == KDDKind.COMMAND - - def test_node_id(self): - node = self.extractor.extract_node(self.doc) - assert node.id == "CMD:CMD-001" - - def test_node_has_purpose(self): - node = self.extractor.extract_node(self.doc) - assert "purpose" in node.indexed_fields - assert len(node.indexed_fields["purpose"]) > 10 - - def test_node_has_input_params(self): - node = self.extractor.extract_node(self.doc) - params = node.indexed_fields.get("input_params", []) - assert len(params) >= 2 # 
source_path, index_path, force - - def test_node_has_preconditions(self): - node = self.extractor.extract_node(self.doc) - assert "preconditions" in node.indexed_fields - - def test_node_has_postconditions(self): - node = self.extractor.extract_node(self.doc) - assert "postconditions" in node.indexed_fields - - def test_node_has_errors(self): - node = self.extractor.extract_node(self.doc) - errors = node.indexed_fields.get("errors", []) - assert len(errors) >= 1 - - def test_edges_include_wiki_links(self): - edges = self.extractor.extract_edges(self.doc) - edge_types = {e.edge_type for e in edges} - assert "WIKI_LINK" in edge_types - - def test_edges_include_emits(self): - edges = self.extractor.extract_edges(self.doc) - emits = [e for e in edges if e.edge_type == "EMITS"] - # CMD-001 postconditions reference EVT-KDDDocument-Detected, etc. - assert len(emits) >= 1 - - def test_edges_from_correct_node(self): - edges = self.extractor.extract_edges(self.doc) - for edge in edges: - assert edge.from_node == "CMD:CMD-001" diff --git a/tests/v2/application/extractors/test_entity.py b/tests/v2/application/extractors/test_entity.py deleted file mode 100644 index 2c693c2..0000000 --- a/tests/v2/application/extractors/test_entity.py +++ /dev/null @@ -1,114 +0,0 @@ -"""Tests for EntityExtractor against real spec files. - -Validates BDD: index-entity.feature SCN-001 (node + edges for entity). 
-""" - -from kdd.application.extractors.kinds.entity import EntityExtractor -from kdd.domain.enums import KDDKind, KDDLayer - -from .conftest import build_document - - -class TestEntityExtractor: - """Parse real entity spec: KDDDocument.md.""" - - def setup_method(self): - self.doc = build_document("specs/01-domain/entities/KDDDocument.md") - self.extractor = EntityExtractor() - - def test_kind(self): - assert self.extractor.kind == KDDKind.ENTITY - - def test_node_id(self): - node = self.extractor.extract_node(self.doc) - assert node.id == "Entity:KDDDocument" - - def test_node_kind_and_layer(self): - node = self.extractor.extract_node(self.doc) - assert node.kind == KDDKind.ENTITY - assert node.layer == KDDLayer.DOMAIN - - def test_node_has_description(self): - node = self.extractor.extract_node(self.doc) - assert "description" in node.indexed_fields - assert len(node.indexed_fields["description"]) > 10 - - def test_node_has_attributes(self): - node = self.extractor.extract_node(self.doc) - attrs = node.indexed_fields.get("attributes", []) - assert len(attrs) > 0 - # KDDDocument has id, kind, source_path, etc. 
- attr_names = {a.get("Atributo", a.get("Attribute", "")) for a in attrs} - assert "`id`" in attr_names or "id" in attr_names - - def test_node_has_relations(self): - node = self.extractor.extract_node(self.doc) - rels = node.indexed_fields.get("relations", []) - assert len(rels) > 0 - - def test_node_has_invariants(self): - node = self.extractor.extract_node(self.doc) - invs = node.indexed_fields.get("invariants", []) - assert len(invs) > 0 - - def test_node_has_lifecycle(self): - node = self.extractor.extract_node(self.doc) - assert "state_machine" in node.indexed_fields - - def test_node_status_from_frontmatter(self): - node = self.extractor.extract_node(self.doc) - assert node.status == "draft" - - def test_node_aliases_from_frontmatter(self): - node = self.extractor.extract_node(self.doc) - assert "Document" in node.aliases or "Spec" in node.aliases - - def test_edges_include_wiki_links(self): - edges = self.extractor.extract_edges(self.doc) - edge_types = {e.edge_type for e in edges} - assert "WIKI_LINK" in edge_types - - def test_edges_include_domain_relations(self): - edges = self.extractor.extract_edges(self.doc) - domain_rels = [e for e in edges if e.edge_type == "DOMAIN_RELATION"] - # KDDDocument has relations to GraphNode, GraphEdge, Embedding, IndexManifest - assert len(domain_rels) >= 1 - - def test_edges_have_correct_source(self): - edges = self.extractor.extract_edges(self.doc) - for edge in edges: - assert edge.from_node == "Entity:KDDDocument" - assert edge.source_file == self.doc.source_path - - def test_edges_include_emits(self): - edges = self.extractor.extract_edges(self.doc) - emits = [e for e in edges if e.edge_type == "EMITS"] - # KDDDocument lifecycle has EVT-KDDDocument-Detected, etc. 
- assert len(emits) >= 1 - for e in emits: - assert e.to_node.startswith("Event:") - - def test_no_duplicate_edges(self): - edges = self.extractor.extract_edges(self.doc) - keys = [(e.from_node, e.to_node, e.edge_type) for e in edges] - assert len(keys) == len(set(keys)) - - -class TestEntityExtractorGraphNode: - """Parse GraphNode.md — simpler entity with fewer sections.""" - - def setup_method(self): - self.doc = build_document("specs/01-domain/entities/GraphNode.md") - self.extractor = EntityExtractor() - - def test_node_id(self): - node = self.extractor.extract_node(self.doc) - assert node.id == "Entity:GraphNode" - - def test_has_description(self): - node = self.extractor.extract_node(self.doc) - assert "description" in node.indexed_fields - - def test_has_attributes(self): - node = self.extractor.extract_node(self.doc) - assert len(node.indexed_fields.get("attributes", [])) > 0 diff --git a/tests/v2/application/extractors/test_event.py b/tests/v2/application/extractors/test_event.py deleted file mode 100644 index 5e3ce1a..0000000 --- a/tests/v2/application/extractors/test_event.py +++ /dev/null @@ -1,54 +0,0 @@ -"""Tests for EventExtractor against real spec files.""" - -from kdd.application.extractors.kinds.event import EventExtractor -from kdd.domain.enums import KDDKind - -from .conftest import build_document - - -class TestEventExtractor: - """Parse EVT-KDDDocument-Indexed.md.""" - - def setup_method(self): - self.doc = build_document("specs/01-domain/events/EVT-KDDDocument-Indexed.md") - self.extractor = EventExtractor() - - def test_kind(self): - assert self.extractor.kind == KDDKind.EVENT - - def test_node_id(self): - node = self.extractor.extract_node(self.doc) - assert node.id == "Event:EVT-KDDDocument-Indexed" - - def test_node_has_description(self): - node = self.extractor.extract_node(self.doc) - assert "description" in node.indexed_fields - assert len(node.indexed_fields["description"]) > 10 - - def test_node_has_payload(self): - node = 
self.extractor.extract_node(self.doc) - assert "payload" in node.indexed_fields - payload = node.indexed_fields["payload"] - assert isinstance(payload, list) - assert len(payload) >= 1 - # Payload rows should have Campo column - field_names = [r.get("Campo", "") for r in payload] - assert any("document_id" in f for f in field_names) - - def test_node_has_producer(self): - node = self.extractor.extract_node(self.doc) - assert "producer" in node.indexed_fields - - def test_node_has_consumers(self): - node = self.extractor.extract_node(self.doc) - assert "consumers" in node.indexed_fields - - def test_edges_include_wiki_links(self): - edges = self.extractor.extract_edges(self.doc) - edge_types = {e.edge_type for e in edges} - assert "WIKI_LINK" in edge_types - - def test_edges_from_correct_node(self): - edges = self.extractor.extract_edges(self.doc) - for edge in edges: - assert edge.from_node == "Event:EVT-KDDDocument-Indexed" diff --git a/tests/v2/application/extractors/test_prd.py b/tests/v2/application/extractors/test_prd.py deleted file mode 100644 index 64e8fc1..0000000 --- a/tests/v2/application/extractors/test_prd.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Tests for PRDExtractor against real spec files.""" - -from kdd.application.extractors.kinds.prd import PRDExtractor -from kdd.domain.enums import KDDKind - -from .conftest import build_document - - -class TestPRDExtractor: - """Parse PRD-KBEngine.md.""" - - def setup_method(self): - self.doc = build_document("specs/00-requirements/PRD-KBEngine.md") - self.extractor = PRDExtractor() - - def test_kind(self): - assert self.extractor.kind == KDDKind.PRD - - def test_node_id(self): - node = self.extractor.extract_node(self.doc) - assert node.id == "PRD:PRD-KBEngine" - - def test_node_has_problem(self): - node = self.extractor.extract_node(self.doc) - assert "problem" in node.indexed_fields - assert len(node.indexed_fields["problem"]) > 20 - - def test_node_has_scope(self): - node = 
self.extractor.extract_node(self.doc) - assert "scope" in node.indexed_fields - assert "alcance" in node.indexed_fields["scope"].lower() or \ - "v1" in node.indexed_fields["scope"].lower() - - def test_node_has_users(self): - node = self.extractor.extract_node(self.doc) - assert "users" in node.indexed_fields - - def test_edges_include_wiki_links(self): - edges = self.extractor.extract_edges(self.doc) - edge_types = {e.edge_type for e in edges} - assert "WIKI_LINK" in edge_types - - def test_edges_from_correct_node(self): - edges = self.extractor.extract_edges(self.doc) - for edge in edges: - assert edge.from_node == "PRD:PRD-KBEngine" diff --git a/tests/v2/application/extractors/test_query.py b/tests/v2/application/extractors/test_query.py deleted file mode 100644 index 7aa736f..0000000 --- a/tests/v2/application/extractors/test_query.py +++ /dev/null @@ -1,50 +0,0 @@ -"""Tests for QueryExtractor against real spec files.""" - -from kdd.application.extractors.kinds.query import QueryExtractor -from kdd.domain.enums import KDDKind - -from .conftest import build_document - - -class TestQueryExtractor: - """Parse QRY-001-RetrieveByGraph.md.""" - - def setup_method(self): - self.doc = build_document("specs/02-behavior/queries/QRY-001-RetrieveByGraph.md") - self.extractor = QueryExtractor() - - def test_kind(self): - assert self.extractor.kind == KDDKind.QUERY - - def test_node_id(self): - node = self.extractor.extract_node(self.doc) - assert node.id == "QRY:QRY-001" - - def test_node_has_purpose(self): - node = self.extractor.extract_node(self.doc) - assert "purpose" in node.indexed_fields - assert len(node.indexed_fields["purpose"]) > 10 - - def test_node_has_input_params(self): - node = self.extractor.extract_node(self.doc) - params = node.indexed_fields.get("input_params", []) - assert len(params) >= 1 # root_node, depth, etc. 
- - def test_node_has_output(self): - node = self.extractor.extract_node(self.doc) - assert "output_structure" in node.indexed_fields - - def test_node_has_errors(self): - node = self.extractor.extract_node(self.doc) - errors = node.indexed_fields.get("errors", []) - assert len(errors) >= 1 - - def test_edges_include_wiki_links(self): - edges = self.extractor.extract_edges(self.doc) - edge_types = {e.edge_type for e in edges} - assert "WIKI_LINK" in edge_types - - def test_edges_from_correct_node(self): - edges = self.extractor.extract_edges(self.doc) - for edge in edges: - assert edge.from_node == "QRY:QRY-001" diff --git a/tests/v2/application/extractors/test_registry.py b/tests/v2/application/extractors/test_registry.py deleted file mode 100644 index 52fdb19..0000000 --- a/tests/v2/application/extractors/test_registry.py +++ /dev/null @@ -1,42 +0,0 @@ -"""Tests for kdd.application.extractors.registry.""" - -from kdd.application.extractors.registry import ExtractorRegistry, create_default_registry -from kdd.domain.enums import KDDKind - - -class TestExtractorRegistry: - def test_register_and_get(self): - from kdd.application.extractors.kinds.entity import EntityExtractor - - registry = ExtractorRegistry() - ext = EntityExtractor() - registry.register(ext) - assert registry.get(KDDKind.ENTITY) is ext - - def test_get_unknown_returns_none(self): - registry = ExtractorRegistry() - assert registry.get(KDDKind.EVENT) is None - - def test_len(self): - registry = ExtractorRegistry() - assert len(registry) == 0 - from kdd.application.extractors.kinds.entity import EntityExtractor - registry.register(EntityExtractor()) - assert len(registry) == 1 - - -class TestCreateDefaultRegistry: - def test_has_15_extractors(self): - registry = create_default_registry() - assert len(registry) == 15 - - def test_finds_correct_extractor_per_kind(self): - registry = create_default_registry() - assert registry.registered_kinds == set(KDDKind) - - def 
test_each_extractor_has_matching_kind(self): - registry = create_default_registry() - for kind in registry.registered_kinds: - ext = registry.get(kind) - assert ext is not None - assert ext.kind == kind diff --git a/tests/v2/application/extractors/test_requirement.py b/tests/v2/application/extractors/test_requirement.py deleted file mode 100644 index c87292e..0000000 --- a/tests/v2/application/extractors/test_requirement.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Tests for RequirementExtractor against real spec files.""" - -from kdd.application.extractors.kinds.requirement import RequirementExtractor -from kdd.domain.enums import KDDKind - -from .conftest import build_document - - -class TestRequirementExtractor: - """Parse REQ-001-Performance.md.""" - - def setup_method(self): - self.doc = build_document("specs/04-verification/criteria/REQ-001-Performance.md") - self.extractor = RequirementExtractor() - - def test_kind(self): - assert self.extractor.kind == KDDKind.REQUIREMENT - - def test_node_id(self): - node = self.extractor.extract_node(self.doc) - assert node.id == "REQ:REQ-001" - - def test_node_has_description(self): - node = self.extractor.extract_node(self.doc) - assert "description" in node.indexed_fields - assert len(node.indexed_fields["description"]) > 10 - - def test_node_has_acceptance_criteria(self): - node = self.extractor.extract_node(self.doc) - assert "acceptance_criteria" in node.indexed_fields - assert "CA-1" in node.indexed_fields["acceptance_criteria"] - - def test_node_has_traceability(self): - node = self.extractor.extract_node(self.doc) - assert "traceability" in node.indexed_fields - assert "UC-001" in node.indexed_fields["traceability"] - - def test_edges_include_wiki_links(self): - edges = self.extractor.extract_edges(self.doc) - edge_types = {e.edge_type for e in edges} - assert "WIKI_LINK" in edge_types - - def test_edges_from_correct_node(self): - edges = self.extractor.extract_edges(self.doc) - for edge in edges: - assert 
edge.from_node == "REQ:REQ-001" diff --git a/tests/v2/application/extractors/test_rule.py b/tests/v2/application/extractors/test_rule.py deleted file mode 100644 index 3f89e80..0000000 --- a/tests/v2/application/extractors/test_rule.py +++ /dev/null @@ -1,56 +0,0 @@ -"""Tests for RuleExtractor against real spec files.""" - -from kdd.application.extractors.kinds.business_rule import RuleExtractor -from kdd.domain.enums import KDDKind - -from .conftest import build_document - - -class TestRuleExtractor: - """Parse BR-DOCUMENT-001.md.""" - - def setup_method(self): - self.doc = build_document("specs/01-domain/rules/BR-DOCUMENT-001.md") - self.extractor = RuleExtractor() - - def test_kind(self): - assert self.extractor.kind == KDDKind.BUSINESS_RULE - - def test_node_id(self): - node = self.extractor.extract_node(self.doc) - assert node.id == "BR:BR-DOCUMENT-001" - - def test_node_has_declaration(self): - node = self.extractor.extract_node(self.doc) - assert "declaration" in node.indexed_fields - assert len(node.indexed_fields["declaration"]) > 10 - - def test_node_has_when_applies(self): - node = self.extractor.extract_node(self.doc) - assert "when_applies" in node.indexed_fields - - def test_node_has_violation(self): - node = self.extractor.extract_node(self.doc) - assert "violation" in node.indexed_fields - - def test_node_has_examples(self): - node = self.extractor.extract_node(self.doc) - assert "examples" in node.indexed_fields - - def test_edges_include_wiki_links(self): - edges = self.extractor.extract_edges(self.doc) - edge_types = {e.edge_type for e in edges} - assert "WIKI_LINK" in edge_types - - def test_edges_include_entity_rule(self): - edges = self.extractor.extract_edges(self.doc) - entity_rules = [e for e in edges if e.edge_type == "ENTITY_RULE"] - # BR-DOCUMENT-001 declaration references KDDDocument - assert len(entity_rules) >= 1 - targets = {e.to_node for e in entity_rules} - assert any("KDDDocument" in t for t in targets) - - def 
test_edges_from_correct_node(self): - edges = self.extractor.extract_edges(self.doc) - for edge in edges: - assert edge.from_node == "BR:BR-DOCUMENT-001" diff --git a/tests/v2/application/extractors/test_synthetic_kinds.py b/tests/v2/application/extractors/test_synthetic_kinds.py deleted file mode 100644 index 78947b3..0000000 --- a/tests/v2/application/extractors/test_synthetic_kinds.py +++ /dev/null @@ -1,497 +0,0 @@ -"""Tests for extractors that don't yet have real spec files. - -Uses synthetic markdown content to verify extractor behavior for: -business-policy, cross-policy, process, ui-view, ui-component, -objective, adr. -""" - -from kdd.application.extractors.kinds.adr import ADRExtractor -from kdd.application.extractors.kinds.business_policy import PolicyExtractor -from kdd.application.extractors.kinds.cross_policy import CrossPolicyExtractor -from kdd.application.extractors.kinds.objective import ObjectiveExtractor -from kdd.application.extractors.kinds.process import ProcessExtractor -from kdd.application.extractors.kinds.ui_component import UIComponentExtractor -from kdd.application.extractors.kinds.ui_view import UIViewExtractor -from kdd.domain.enums import KDDKind - -from .conftest import build_synthetic_document - - -# --------------------------------------------------------------------------- -# Business Policy -# --------------------------------------------------------------------------- - -_BP_CONTENT = """\ ---- -id: BP-NAMING-001 -kind: business-policy -status: draft ---- - -# BP-NAMING-001 — Naming Policy - -## Declaración - -Todos los nombres de [[KDDDocument]] deben seguir el patrón PascalCase. - -## Cuándo Aplica - -Cuando se crea un nuevo documento KDD. - -## Parámetros - -Ninguno configurable. - -## Qué pasa si se incumple - -El pipeline de indexación emite un warning y continúa. 
-""" - - -class TestPolicyExtractor: - def setup_method(self): - self.doc = build_synthetic_document( - _BP_CONTENT, - spec_path="specs/02-behavior/policies/BP-NAMING-001.md", - ) - self.extractor = PolicyExtractor() - - def test_kind(self): - assert self.extractor.kind == KDDKind.BUSINESS_POLICY - - def test_node_id(self): - node = self.extractor.extract_node(self.doc) - assert node.id == "BP:BP-NAMING-001" - - def test_node_has_declaration(self): - node = self.extractor.extract_node(self.doc) - assert "declaration" in node.indexed_fields - assert "PascalCase" in node.indexed_fields["declaration"] - - def test_node_has_when_applies(self): - node = self.extractor.extract_node(self.doc) - assert "when_applies" in node.indexed_fields - - def test_node_has_violation(self): - node = self.extractor.extract_node(self.doc) - assert "violation" in node.indexed_fields - - def test_edges_include_entity_rule(self): - edges = self.extractor.extract_edges(self.doc) - entity_rules = [e for e in edges if e.edge_type == "ENTITY_RULE"] - assert len(entity_rules) >= 1 - assert any("KDDDocument" in e.to_node for e in entity_rules) - - def test_edges_from_correct_node(self): - edges = self.extractor.extract_edges(self.doc) - for edge in edges: - assert edge.from_node == "BP:BP-NAMING-001" - - -# --------------------------------------------------------------------------- -# Cross Policy -# --------------------------------------------------------------------------- - -_XP_CONTENT = """\ ---- -id: XP-AUDIT-001 -kind: cross-policy -status: draft ---- - -# XP-AUDIT-001 — Audit Trail - -## Propósito - -Garantizar trazabilidad de todas las operaciones. - -## Declaración - -Toda operación sobre [[IndexManifest]] debe registrar un evento de auditoría. - -## Formalización EARS - -When a document is indexed, the system shall emit an audit event. - -## Comportamiento Estándar - -El sistema registra: timestamp, operación, usuario, resultado. 
-""" - - -class TestCrossPolicyExtractor: - def setup_method(self): - self.doc = build_synthetic_document( - _XP_CONTENT, - spec_path="specs/02-behavior/policies/XP-AUDIT-001.md", - ) - self.extractor = CrossPolicyExtractor() - - def test_kind(self): - assert self.extractor.kind == KDDKind.CROSS_POLICY - - def test_node_id(self): - node = self.extractor.extract_node(self.doc) - assert node.id == "XP:XP-AUDIT-001" - - def test_node_has_purpose(self): - node = self.extractor.extract_node(self.doc) - assert "purpose" in node.indexed_fields - - def test_node_has_declaration(self): - node = self.extractor.extract_node(self.doc) - assert "declaration" in node.indexed_fields - - def test_node_has_formalization(self): - node = self.extractor.extract_node(self.doc) - assert "formalization_ears" in node.indexed_fields - - def test_node_has_standard_behavior(self): - node = self.extractor.extract_node(self.doc) - assert "standard_behavior" in node.indexed_fields - - def test_edges_from_correct_node(self): - edges = self.extractor.extract_edges(self.doc) - for edge in edges: - assert edge.from_node == "XP:XP-AUDIT-001" - - -# --------------------------------------------------------------------------- -# Process -# --------------------------------------------------------------------------- - -_PROC_CONTENT = """\ ---- -id: PROC-001 -kind: process -status: draft ---- - -# PROC-001 — Index Pipeline - -## Participantes - -- [[KDDDocument]] producer -- IndexPipeline orchestrator - -## Pasos - -### Paso 1: Detección - -El sistema detecta cambios via git diff. - -### Paso 2: Extracción - -El extractor procesa el documento y genera [[GraphNode]]. 
- -## Diagrama - -```mermaid -graph LR - A[Detect] --> B[Extract] --> C[Store] -``` -""" - - -class TestProcessExtractor: - def setup_method(self): - self.doc = build_synthetic_document( - _PROC_CONTENT, - spec_path="specs/02-behavior/processes/PROC-001.md", - ) - self.extractor = ProcessExtractor() - - def test_kind(self): - assert self.extractor.kind == KDDKind.PROCESS - - def test_node_id(self): - node = self.extractor.extract_node(self.doc) - assert node.id == "PROC:PROC-001" - - def test_node_has_participants(self): - node = self.extractor.extract_node(self.doc) - assert "participants" in node.indexed_fields - - def test_node_has_steps(self): - node = self.extractor.extract_node(self.doc) - assert "steps" in node.indexed_fields - # Steps should include sub-sections - assert "Detección" in node.indexed_fields["steps"] - - def test_node_has_mermaid_flow(self): - node = self.extractor.extract_node(self.doc) - assert "mermaid_flow" in node.indexed_fields - assert "mermaid" in node.indexed_fields["mermaid_flow"] - - def test_edges_from_correct_node(self): - edges = self.extractor.extract_edges(self.doc) - for edge in edges: - assert edge.from_node == "PROC:PROC-001" - - -# --------------------------------------------------------------------------- -# UI View -# --------------------------------------------------------------------------- - -_UI_VIEW_CONTENT = """\ ---- -id: UI-Dashboard -kind: ui-view -status: draft ---- - -# UI-Dashboard - -## Descripción - -Vista principal del dashboard que muestra el estado del índice. - -## Layout - -Grid de 2 columnas con sidebar. - -## Componentes - -- StatsCard -- GraphViewer -- SearchBar - -## Estados - -- Loading -- Ready -- Error - -## Comportamiento - -Al cargar, la vista solicita datos del [[IndexManifest]]. 
-""" - - -class TestUIViewExtractor: - def setup_method(self): - self.doc = build_synthetic_document( - _UI_VIEW_CONTENT, - spec_path="specs/03-experience/views/UI-Dashboard.md", - ) - self.extractor = UIViewExtractor() - - def test_kind(self): - assert self.extractor.kind == KDDKind.UI_VIEW - - def test_node_id(self): - node = self.extractor.extract_node(self.doc) - assert node.id == "UIView:UI-Dashboard" - - def test_node_has_description(self): - node = self.extractor.extract_node(self.doc) - assert "description" in node.indexed_fields - - def test_node_has_layout(self): - node = self.extractor.extract_node(self.doc) - assert "layout" in node.indexed_fields - - def test_node_has_components(self): - node = self.extractor.extract_node(self.doc) - assert "components" in node.indexed_fields - - def test_node_has_behavior(self): - node = self.extractor.extract_node(self.doc) - assert "behavior" in node.indexed_fields - - def test_edges_from_correct_node(self): - edges = self.extractor.extract_edges(self.doc) - for edge in edges: - assert edge.from_node == "UIView:UI-Dashboard" - - -# --------------------------------------------------------------------------- -# UI Component -# --------------------------------------------------------------------------- - -_UI_COMP_CONTENT = """\ ---- -id: UI-SearchBar -kind: ui-component -status: draft ---- - -# UI-SearchBar - -## Descripción - -Componente de barra de búsqueda con autocompletado. - -## Entidades - -Consume datos de [[GraphNode]] y [[Embedding]]. - -## Casos de Uso - -Utilizado en [[UC-004-RetrieveContext]]. 
-""" - - -class TestUIComponentExtractor: - def setup_method(self): - self.doc = build_synthetic_document( - _UI_COMP_CONTENT, - spec_path="specs/03-experience/views/UI-SearchBar.md", - ) - self.extractor = UIComponentExtractor() - - def test_kind(self): - assert self.extractor.kind == KDDKind.UI_COMPONENT - - def test_node_id(self): - node = self.extractor.extract_node(self.doc) - assert node.id == "UIComp:UI-SearchBar" - - def test_node_has_description(self): - node = self.extractor.extract_node(self.doc) - assert "description" in node.indexed_fields - - def test_node_has_entities(self): - node = self.extractor.extract_node(self.doc) - assert "entities" in node.indexed_fields - - def test_node_has_use_cases(self): - node = self.extractor.extract_node(self.doc) - assert "use_cases" in node.indexed_fields - - def test_edges_from_correct_node(self): - edges = self.extractor.extract_edges(self.doc) - for edge in edges: - assert edge.from_node == "UIComp:UI-SearchBar" - - -# --------------------------------------------------------------------------- -# Objective -# --------------------------------------------------------------------------- - -_OBJ_CONTENT = """\ ---- -id: OBJ-001 -kind: objective -status: draft ---- - -# OBJ-001 — Agent Retrieval - -## Actor - -Agente de IA (Claude Code, Codex, Cursor). - -## Objetivo - -Obtener contexto preciso de specs KDD para ejecutar tareas de desarrollo -con alta precisión, referenciando [[KDDDocument]] y [[GraphNode]]. 
- -## Criterios de éxito - -- Retrieval precision >= 90% -- Tiempo de respuesta P95 < 300ms -""" - - -class TestObjectiveExtractor: - def setup_method(self): - self.doc = build_synthetic_document( - _OBJ_CONTENT, - spec_path="specs/00-requirements/objectives/OBJ-001.md", - ) - self.extractor = ObjectiveExtractor() - - def test_kind(self): - assert self.extractor.kind == KDDKind.OBJECTIVE - - def test_node_id(self): - node = self.extractor.extract_node(self.doc) - assert node.id == "OBJ:OBJ-001" - - def test_node_has_actor(self): - node = self.extractor.extract_node(self.doc) - assert "actor" in node.indexed_fields - - def test_node_has_objective(self): - node = self.extractor.extract_node(self.doc) - assert "objective" in node.indexed_fields - - def test_node_has_success_criteria(self): - node = self.extractor.extract_node(self.doc) - assert "success_criteria" in node.indexed_fields - - def test_edges_from_correct_node(self): - edges = self.extractor.extract_edges(self.doc) - for edge in edges: - assert edge.from_node == "OBJ:OBJ-001" - - -# --------------------------------------------------------------------------- -# ADR -# --------------------------------------------------------------------------- - -_ADR_CONTENT = """\ ---- -id: ADR-0001 -kind: adr -status: accepted ---- - -# ADR-0001 — Repository Pattern - -## Contexto - -El sistema necesita abstraer el almacenamiento para soportar -múltiples backends: SQLite, PostgreSQL, filesystem. - -## Decisión - -Adoptamos el Repository Pattern con interfaces definidas -como Protocols de Python, referenciando [[ArtifactStore]]. - -## Consecuencias - -- Positivas: facilidad de testing, intercambio de backends. -- Negativas: capa de abstracción adicional, posible overhead. 
-""" - - -class TestADRExtractor: - def setup_method(self): - self.doc = build_synthetic_document( - _ADR_CONTENT, - spec_path="specs/00-requirements/decisions/ADR-0001.md", - ) - self.extractor = ADRExtractor() - - def test_kind(self): - assert self.extractor.kind == KDDKind.ADR - - def test_node_id(self): - node = self.extractor.extract_node(self.doc) - assert node.id == "ADR:ADR-0001" - - def test_node_has_context(self): - node = self.extractor.extract_node(self.doc) - assert "context" in node.indexed_fields - assert "almacenamiento" in node.indexed_fields["context"] - - def test_node_has_decision(self): - node = self.extractor.extract_node(self.doc) - assert "decision" in node.indexed_fields - assert "Repository Pattern" in node.indexed_fields["decision"] - - def test_node_has_consequences(self): - node = self.extractor.extract_node(self.doc) - assert "consequences" in node.indexed_fields - - def test_edges_include_wiki_links(self): - edges = self.extractor.extract_edges(self.doc) - edge_types = {e.edge_type for e in edges} - assert "WIKI_LINK" in edge_types - - def test_edges_from_correct_node(self): - edges = self.extractor.extract_edges(self.doc) - for edge in edges: - assert edge.from_node == "ADR:ADR-0001" diff --git a/tests/v2/application/extractors/test_use_case.py b/tests/v2/application/extractors/test_use_case.py deleted file mode 100644 index 246f80a..0000000 --- a/tests/v2/application/extractors/test_use_case.py +++ /dev/null @@ -1,82 +0,0 @@ -"""Tests for UseCaseExtractor against real spec files.""" - -from kdd.application.extractors.kinds.use_case import UseCaseExtractor -from kdd.domain.enums import KDDKind - -from .conftest import build_document - - -class TestUseCaseExtractor: - """Parse UC-001-IndexDocument.md.""" - - def setup_method(self): - self.doc = build_document("specs/02-behavior/use-cases/UC-001-IndexDocument.md") - self.extractor = UseCaseExtractor() - - def test_kind(self): - assert self.extractor.kind == KDDKind.USE_CASE - - def 
test_node_id(self): - node = self.extractor.extract_node(self.doc) - assert node.id == "UC:UC-001" - - def test_node_has_description(self): - node = self.extractor.extract_node(self.doc) - assert "description" in node.indexed_fields - assert len(node.indexed_fields["description"]) > 10 - - def test_node_has_actors(self): - node = self.extractor.extract_node(self.doc) - assert "actors" in node.indexed_fields - assert "Developer" in node.indexed_fields["actors"] - - def test_node_has_main_flow(self): - node = self.extractor.extract_node(self.doc) - assert "main_flow" in node.indexed_fields - - def test_node_has_preconditions(self): - node = self.extractor.extract_node(self.doc) - assert "preconditions" in node.indexed_fields - - def test_node_has_postconditions(self): - node = self.extractor.extract_node(self.doc) - assert "postconditions" in node.indexed_fields - - def test_node_has_alternatives(self): - node = self.extractor.extract_node(self.doc) - # Alternatives may be in sub-sections (### FA-1, FA-2, etc.) - # so the H2 "Flujos Alternativos" itself may have no content. - # The extractor collects sub-sections as well. - assert "alternatives" in node.indexed_fields - - def test_node_has_exceptions(self): - node = self.extractor.extract_node(self.doc) - # Same: exceptions live in sub-sections (### EX-1, etc.) - assert "exceptions" in node.indexed_fields - - def test_edges_include_wiki_links(self): - edges = self.extractor.extract_edges(self.doc) - edge_types = {e.edge_type for e in edges} - assert "WIKI_LINK" in edge_types - - def test_edges_include_uc_applies_rule(self): - edges = self.extractor.extract_edges(self.doc) - rule_edges = [e for e in edges if e.edge_type == "UC_APPLIES_RULE"] - # UC-001 references BR-DOCUMENT-001, BR-EMBEDDING-001, etc. 
- assert len(rule_edges) >= 1 - # Targets should be BR:, BP:, or XP: - for e in rule_edges: - assert e.to_node.startswith(("BR:", "BP:", "XP:")) - - def test_edges_include_uc_executes_cmd(self): - edges = self.extractor.extract_edges(self.doc) - cmd_edges = [e for e in edges if e.edge_type == "UC_EXECUTES_CMD"] - # UC-001 references CMD-001-IndexDocument - assert len(cmd_edges) >= 1 - for e in cmd_edges: - assert e.to_node.startswith("CMD:") - - def test_edges_from_correct_node(self): - edges = self.extractor.extract_edges(self.doc) - for edge in edges: - assert edge.from_node == "UC:UC-001" diff --git a/tests/v2/application/queries/__init__.py b/tests/v2/application/queries/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/v2/application/queries/conftest.py b/tests/v2/application/queries/conftest.py deleted file mode 100644 index 54d38cc..0000000 --- a/tests/v2/application/queries/conftest.py +++ /dev/null @@ -1,145 +0,0 @@ -"""Shared fixtures for query tests. - -Provides a pre-built graph store and vector store with a representative -KDD knowledge graph. 
-""" - -from datetime import datetime - -import pytest - -from kdd.domain.entities import Embedding, GraphEdge, GraphNode -from kdd.domain.enums import EdgeType, KDDKind, KDDLayer -from kdd.infrastructure.graph.networkx_store import NetworkXGraphStore -from kdd.infrastructure.vector.hnswlib_store import HNSWLibVectorStore - - -def _node( - id: str, - kind: KDDKind, - layer: KDDLayer, - **fields, -) -> GraphNode: - return GraphNode( - id=id, - kind=kind, - source_file=f"{id}.md", - source_hash="abc123", - layer=layer, - indexed_fields=fields, - ) - - -def _edge( - from_node: str, - to_node: str, - edge_type: str, - violation: bool = False, -) -> GraphEdge: - return GraphEdge( - from_node=from_node, - to_node=to_node, - edge_type=edge_type, - source_file="test.md", - extraction_method="section_content", - layer_violation=violation, - ) - - -# A representative mini-graph of the KB Engine specs -NODES = [ - _node("Entity:KDDDocument", KDDKind.ENTITY, KDDLayer.DOMAIN, - title="KDDDocument", description="Atomic input unit"), - _node("Entity:GraphNode", KDDKind.ENTITY, KDDLayer.DOMAIN, - title="GraphNode", description="Node in the knowledge graph"), - _node("BR:BR-DOCUMENT-001", KDDKind.BUSINESS_RULE, KDDLayer.DOMAIN, - title="Kind Router", description="Routes documents to extractors"), - _node("BR:BR-LAYER-001", KDDKind.BUSINESS_RULE, KDDLayer.DOMAIN, - title="Layer Validation", description="Validates layer dependencies"), - _node("CMD:CMD-001", KDDKind.COMMAND, KDDLayer.BEHAVIOR, - title="IndexDocument", description="Index a single document"), - _node("CMD:CMD-002", KDDKind.COMMAND, KDDLayer.BEHAVIOR, - title="IndexIncremental", description="Incremental re-indexing"), - _node("UC:UC-001", KDDKind.USE_CASE, KDDLayer.BEHAVIOR, - title="IndexDocument", description="Full indexing flow"), - _node("UC:UC-004", KDDKind.USE_CASE, KDDLayer.BEHAVIOR, - title="RetrieveContext", description="Hybrid search for agents"), - _node("QRY:QRY-003", KDDKind.QUERY, KDDLayer.BEHAVIOR, - 
title="RetrieveHybrid", description="Fusion search"), - _node("REQ:REQ-001", KDDKind.REQUIREMENT, KDDLayer.VERIFICATION, - title="Performance", description="Performance requirements"), -] - -EDGES = [ - # Entity relationships - _edge("Entity:KDDDocument", "BR:BR-DOCUMENT-001", EdgeType.ENTITY_RULE.value), - _edge("Entity:KDDDocument", "BR:BR-LAYER-001", EdgeType.ENTITY_RULE.value), - _edge("Entity:KDDDocument", "Entity:GraphNode", EdgeType.DOMAIN_RELATION.value), - # Command relationships - _edge("CMD:CMD-001", "Entity:KDDDocument", EdgeType.WIKI_LINK.value), - _edge("CMD:CMD-002", "CMD:CMD-001", EdgeType.WIKI_LINK.value), - # UC relationships - _edge("UC:UC-001", "CMD:CMD-001", EdgeType.UC_EXECUTES_CMD.value), - _edge("UC:UC-001", "BR:BR-DOCUMENT-001", EdgeType.UC_APPLIES_RULE.value), - _edge("UC:UC-001", "BR:BR-LAYER-001", EdgeType.UC_APPLIES_RULE.value), - _edge("UC:UC-004", "QRY:QRY-003", EdgeType.UC_EXECUTES_CMD.value), - # Requirement traceability - _edge("REQ:REQ-001", "UC:UC-001", EdgeType.REQ_TRACES_TO.value), - _edge("REQ:REQ-001", "UC:UC-004", EdgeType.REQ_TRACES_TO.value), - # Layer violation: domain entity references verification requirement - _edge("Entity:KDDDocument", "REQ:REQ-001", EdgeType.WIKI_LINK.value, violation=True), -] - - -@pytest.fixture -def graph_store(): - """A NetworkX graph store loaded with the mini KB Engine graph.""" - store = NetworkXGraphStore() - store.load(NODES, EDGES) - return store - - -@pytest.fixture -def vector_store(): - """A vector store with embeddings for some of the nodes.""" - store = HNSWLibVectorStore() - dim = 8 - - embeddings = [ - _emb("KDDDocument:chunk-0", "KDDDocument", KDDKind.ENTITY, - _make_vec(dim, [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])), - _emb("BR-DOCUMENT-001:chunk-0", "BR-DOCUMENT-001", KDDKind.BUSINESS_RULE, - _make_vec(dim, [0.8, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])), - _emb("CMD-001:chunk-0", "CMD-001", KDDKind.COMMAND, - _make_vec(dim, [0.5, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0])), - 
_emb("UC-001:chunk-0", "UC-001", KDDKind.USE_CASE, - _make_vec(dim, [0.3, 0.3, 0.3, 0.5, 0.0, 0.0, 0.0, 0.0])), - _emb("QRY-003:chunk-0", "QRY-003", KDDKind.QUERY, - _make_vec(dim, [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0])), - _emb("REQ-001:chunk-0", "REQ-001", KDDKind.REQUIREMENT, - _make_vec(dim, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0])), - ] - store.load(embeddings) - return store - - -def _make_vec(dim: int, values: list[float]) -> list[float]: - """Ensure vector has exactly *dim* dimensions.""" - return (values + [0.0] * dim)[:dim] - - -def _emb(id: str, doc_id: str, kind: KDDKind, vector: list[float]) -> Embedding: - return Embedding( - id=id, - document_id=doc_id, - document_kind=kind, - section_path="Descripción", - chunk_index=0, - raw_text="test", - context_text="test context", - vector=vector, - model="test-model", - dimensions=len(vector), - text_hash="hash", - generated_at=datetime.now(), - ) diff --git a/tests/v2/application/queries/test_index_loader.py b/tests/v2/application/queries/test_index_loader.py deleted file mode 100644 index c43ae72..0000000 --- a/tests/v2/application/queries/test_index_loader.py +++ /dev/null @@ -1,145 +0,0 @@ -"""Tests for IndexLoader.""" - -from datetime import datetime - -import pytest - -from kdd.application.queries.index_loader import IndexLoader -from kdd.domain.entities import ( - Embedding, - GraphEdge, - GraphNode, - IndexManifest, - IndexStats, -) -from kdd.domain.enums import IndexLevel, KDDKind, KDDLayer -from kdd.infrastructure.artifact.filesystem import FilesystemArtifactStore -from kdd.infrastructure.graph.networkx_store import NetworkXGraphStore -from kdd.infrastructure.vector.hnswlib_store import HNSWLibVectorStore - - -@pytest.fixture -def artifact_dir(tmp_path): - return tmp_path / ".kdd-index" - - -@pytest.fixture -def populated_artifacts(artifact_dir): - """Write minimal artifacts to disk.""" - store = FilesystemArtifactStore(artifact_dir) - - manifest = IndexManifest( - version="1.0.0", - 
kdd_version="1.0", - indexed_at=datetime.now(), - indexed_by="test", - index_level=IndexLevel.L1, - stats=IndexStats(nodes=1, edges=1, embeddings=0), - ) - store.write_manifest(manifest) - - node = GraphNode( - id="Entity:Test", - kind=KDDKind.ENTITY, - source_file="test.md", - source_hash="abc", - layer=KDDLayer.DOMAIN, - indexed_fields={"title": "Test"}, - ) - store.write_node(node) - - edge = GraphEdge( - from_node="Entity:Test", - to_node="BR:BR-001", - edge_type="ENTITY_RULE", - source_file="test.md", - extraction_method="section_content", - ) - store.append_edges([edge]) - - return store - - -class TestIndexLoader: - def test_load_populates_graph(self, populated_artifacts, artifact_dir): - graph = NetworkXGraphStore() - loader = IndexLoader(populated_artifacts, graph) - - assert not loader.is_loaded - loaded = loader.load() - assert loaded - assert loader.is_loaded - assert graph.node_count() == 1 - assert graph.has_node("Entity:Test") - - def test_load_caches(self, populated_artifacts, artifact_dir): - graph = NetworkXGraphStore() - loader = IndexLoader(populated_artifacts, graph) - - loader.load() - assert not loader.load() # second call uses cache - - def test_reload_forces_refresh(self, populated_artifacts, artifact_dir): - graph = NetworkXGraphStore() - loader = IndexLoader(populated_artifacts, graph) - - loader.load() - assert loader.reload() # forced reload - - def test_load_without_manifest(self, artifact_dir): - store = FilesystemArtifactStore(artifact_dir) - graph = NetworkXGraphStore() - loader = IndexLoader(store, graph) - - loaded = loader.load() - assert not loaded - assert not loader.is_loaded - - def test_load_with_vector_store(self, artifact_dir): - store = FilesystemArtifactStore(artifact_dir) - - # Write manifest and node - manifest = IndexManifest( - version="1.0.0", - kdd_version="1.0", - indexed_at=datetime.now(), - indexed_by="test", - index_level=IndexLevel.L2, - embedding_model="test-model", - embedding_dimensions=4, - 
stats=IndexStats(nodes=1, edges=0, embeddings=1), - ) - store.write_manifest(manifest) - - node = GraphNode( - id="Entity:Test", - kind=KDDKind.ENTITY, - source_file="test.md", - source_hash="abc", - layer=KDDLayer.DOMAIN, - ) - store.write_node(node) - - emb = Embedding( - id="Test:chunk-0", - document_id="Test", - document_kind=KDDKind.ENTITY, - section_path="Descripción", - chunk_index=0, - raw_text="test", - context_text="test context", - vector=[1.0, 0.0, 0.0, 0.0], - model="test-model", - dimensions=4, - text_hash="hash", - generated_at=datetime.now(), - ) - store.write_embeddings([emb]) - - graph = NetworkXGraphStore() - vector = HNSWLibVectorStore() - loader = IndexLoader(store, graph, vector) - - loader.load() - assert graph.node_count() == 1 - assert vector.size == 1 diff --git a/tests/v2/application/queries/test_retrieve_coverage.py b/tests/v2/application/queries/test_retrieve_coverage.py deleted file mode 100644 index 836e9cd..0000000 --- a/tests/v2/application/queries/test_retrieve_coverage.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Tests for QRY-005 RetrieveCoverage.""" - -import pytest - -from kdd.application.queries.retrieve_coverage import ( - CoverageQueryInput, - retrieve_coverage, -) - - -class TestRetrieveCoverage: - def test_entity_coverage(self, graph_store): - """Entity:KDDDocument has business rules connected.""" - result = retrieve_coverage( - CoverageQueryInput(node_id="Entity:KDDDocument"), - graph_store, - ) - assert result.analyzed_node is not None - assert result.analyzed_node.id == "Entity:KDDDocument" - - # Should have categories defined for entities - cat_names = {c.name for c in result.categories} - assert "business_rules" in cat_names - - # business_rules should be covered (BR-DOCUMENT-001, BR-LAYER-001) - br_cat = next(c for c in result.categories if c.name == "business_rules") - assert br_cat.status == "covered" - assert len(br_cat.found) >= 1 - - def test_coverage_percentage(self, graph_store): - result = retrieve_coverage( - 
CoverageQueryInput(node_id="Entity:KDDDocument"), - graph_store, - ) - assert 0 <= result.coverage_percent <= 100 - assert result.present + result.missing == len(result.categories) - - def test_uc_coverage(self, graph_store): - """UC-001 has commands and rules connected.""" - result = retrieve_coverage( - CoverageQueryInput(node_id="UC:UC-001"), - graph_store, - ) - cat_names = {c.name for c in result.categories} - assert "commands" in cat_names - assert "rules" in cat_names - - def test_node_not_found(self, graph_store): - with pytest.raises(ValueError, match="NODE_NOT_FOUND"): - retrieve_coverage( - CoverageQueryInput(node_id="Entity:Missing"), - graph_store, - ) - - def test_unknown_kind_no_rules(self, graph_store): - """QRY:QRY-003 is a query kind — no coverage rules defined.""" - with pytest.raises(ValueError, match="UNKNOWN_KIND"): - retrieve_coverage( - CoverageQueryInput(node_id="QRY:QRY-003"), - graph_store, - ) diff --git a/tests/v2/application/queries/test_retrieve_graph.py b/tests/v2/application/queries/test_retrieve_graph.py deleted file mode 100644 index f07c59a..0000000 --- a/tests/v2/application/queries/test_retrieve_graph.py +++ /dev/null @@ -1,80 +0,0 @@ -"""Tests for QRY-001 RetrieveByGraph.""" - -import pytest - -from kdd.application.queries.retrieve_graph import ( - GraphQueryInput, - retrieve_by_graph, -) - - -class TestRetrieveByGraph: - def test_basic_traversal(self, graph_store): - result = retrieve_by_graph( - GraphQueryInput(root_node="Entity:KDDDocument", depth=1), - graph_store, - ) - assert result.center_node is not None - assert result.center_node.id == "Entity:KDDDocument" - node_ids = {s.node_id for s in result.related_nodes} - assert "BR:BR-DOCUMENT-001" in node_ids - assert "CMD:CMD-001" in node_ids - - def test_depth_2_reaches_uc(self, graph_store): - result = retrieve_by_graph( - GraphQueryInput(root_node="Entity:KDDDocument", depth=2), - graph_store, - ) - node_ids = {s.node_id for s in result.related_nodes} - assert 
"UC:UC-001" in node_ids - - def test_edge_type_filter(self, graph_store): - result = retrieve_by_graph( - GraphQueryInput( - root_node="Entity:KDDDocument", - depth=2, - edge_types=["ENTITY_RULE"], - ), - graph_store, - ) - node_ids = {s.node_id for s in result.related_nodes} - assert "BR:BR-DOCUMENT-001" in node_ids - assert "CMD:CMD-001" not in node_ids - - def test_kind_filter(self, graph_store): - from kdd.domain.enums import KDDKind - result = retrieve_by_graph( - GraphQueryInput( - root_node="Entity:KDDDocument", - depth=2, - include_kinds=[KDDKind.BUSINESS_RULE], - ), - graph_store, - ) - assert all( - graph_store.get_node(s.node_id).kind == KDDKind.BUSINESS_RULE - for s in result.related_nodes - ) - - def test_scores_descending(self, graph_store): - result = retrieve_by_graph( - GraphQueryInput(root_node="Entity:KDDDocument", depth=3), - graph_store, - ) - scores = [s.score for s in result.related_nodes] - assert scores == sorted(scores, reverse=True) - - def test_node_not_found(self, graph_store): - with pytest.raises(ValueError, match="NODE_NOT_FOUND"): - retrieve_by_graph( - GraphQueryInput(root_node="Entity:Missing"), - graph_store, - ) - - def test_edges_returned(self, graph_store): - result = retrieve_by_graph( - GraphQueryInput(root_node="Entity:KDDDocument", depth=1), - graph_store, - ) - assert result.total_edges > 0 - assert len(result.edges) == result.total_edges diff --git a/tests/v2/application/queries/test_retrieve_hybrid.py b/tests/v2/application/queries/test_retrieve_hybrid.py deleted file mode 100644 index 4261faa..0000000 --- a/tests/v2/application/queries/test_retrieve_hybrid.py +++ /dev/null @@ -1,157 +0,0 @@ -"""Tests for QRY-003 RetrieveHybrid.""" - -import pytest - -from kdd.application.queries.retrieve_hybrid import ( - HybridQueryInput, - retrieve_hybrid, -) -from kdd.domain.enums import KDDKind - - -class FakeEmbeddingModel: - """A fake embedding model that returns fixed vectors.""" - - @property - def model_name(self) -> str: - 
return "test-model" - - @property - def dimensions(self) -> int: - return 8 - - def encode(self, texts: list[str]) -> list[list[float]]: - # Return a vector pointing toward doc1 (KDDDocument) - return [[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] * len(texts) - - -class TestRetrieveHybrid: - def test_basic_hybrid_search(self, graph_store, vector_store): - result = retrieve_hybrid( - HybridQueryInput(query_text="KDDDocument indexing"), - graph_store, - vector_store, - FakeEmbeddingModel(), - ) - assert result.total_results > 0 - assert len(result.results) == result.total_results - - def test_fusion_scoring(self, graph_store, vector_store): - """Nodes found via both semantic AND graph should score highest.""" - result = retrieve_hybrid( - HybridQueryInput( - query_text="KDDDocument", - expand_graph=True, - depth=1, - min_score=0.1, - ), - graph_store, - vector_store, - FakeEmbeddingModel(), - ) - # Find nodes with fusion match_source - fusion_nodes = [r for r in result.results if r.match_source == "fusion"] - non_fusion = [r for r in result.results if r.match_source != "fusion"] - if fusion_nodes and non_fusion: - assert fusion_nodes[0].score >= non_fusion[0].score - - def test_graph_expansion_edges(self, graph_store, vector_store): - result = retrieve_hybrid( - HybridQueryInput( - query_text="KDDDocument", - expand_graph=True, - depth=1, - min_score=0.1, - ), - graph_store, - vector_store, - FakeEmbeddingModel(), - ) - assert len(result.graph_expansion) > 0 - - def test_no_graph_expansion(self, graph_store, vector_store): - result = retrieve_hybrid( - HybridQueryInput( - query_text="KDDDocument", - expand_graph=False, - min_score=0.1, - ), - graph_store, - vector_store, - FakeEmbeddingModel(), - ) - assert result.graph_expansion == [] - - def test_kind_filter(self, graph_store, vector_store): - result = retrieve_hybrid( - HybridQueryInput( - query_text="KDDDocument", - include_kinds=[KDDKind.ENTITY], - min_score=0.1, - ), - graph_store, - vector_store, - 
FakeEmbeddingModel(), - ) - for r in result.results: - node = graph_store.get_node(r.node_id) - assert node.kind == KDDKind.ENTITY - - def test_degrades_without_vector_store(self, graph_store): - """L1 mode: no vector store, should still return graph+lexical results.""" - result = retrieve_hybrid( - HybridQueryInput( - query_text="IndexDocument", - min_score=0.1, - ), - graph_store, - vector_store=None, - embedding_model=None, - ) - assert "NO_EMBEDDINGS" in result.warnings[0] - # Should still find nodes via lexical search - assert result.total_results > 0 - - def test_limit_respected(self, graph_store, vector_store): - result = retrieve_hybrid( - HybridQueryInput( - query_text="KDDDocument", - limit=2, - min_score=0.1, - ), - graph_store, - vector_store, - FakeEmbeddingModel(), - ) - assert result.total_results <= 2 - - def test_query_too_short(self, graph_store): - with pytest.raises(ValueError, match="QUERY_TOO_SHORT"): - retrieve_hybrid( - HybridQueryInput(query_text="ab"), - graph_store, - ) - - def test_results_sorted_by_score(self, graph_store, vector_store): - result = retrieve_hybrid( - HybridQueryInput(query_text="KDDDocument", min_score=0.1), - graph_store, - vector_store, - FakeEmbeddingModel(), - ) - scores = [r.score for r in result.results] - assert scores == sorted(scores, reverse=True) - - def test_max_tokens_truncation(self, graph_store, vector_store): - result = retrieve_hybrid( - HybridQueryInput( - query_text="KDDDocument", - max_tokens=5, - min_score=0.1, - ), - graph_store, - vector_store, - FakeEmbeddingModel(), - ) - # Should have fewer results due to token limit - assert result.total_tokens <= 10 # some slack for estimation diff --git a/tests/v2/application/queries/test_retrieve_impact.py b/tests/v2/application/queries/test_retrieve_impact.py deleted file mode 100644 index 6a00900..0000000 --- a/tests/v2/application/queries/test_retrieve_impact.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Tests for QRY-004 RetrieveImpact.""" - -import pytest - 
-from kdd.application.queries.retrieve_impact import ( - ImpactQueryInput, - retrieve_impact, -) - - -class TestRetrieveImpact: - def test_direct_dependents(self, graph_store): - """Entity:KDDDocument has incoming edges from CMD-001 and Entity:GraphNode.""" - result = retrieve_impact( - ImpactQueryInput(node_id="Entity:KDDDocument", depth=1), - graph_store, - ) - assert result.analyzed_node is not None - assert result.analyzed_node.id == "Entity:KDDDocument" - direct_ids = {a.node_id for a in result.directly_affected} - assert "CMD:CMD-001" in direct_ids - - def test_transitive_dependents(self, graph_store): - """UC-001 -> CMD-001 -> Entity:KDDDocument at depth 2.""" - result = retrieve_impact( - ImpactQueryInput(node_id="Entity:KDDDocument", depth=3), - graph_store, - ) - trans_ids = {a.node_id for a in result.transitively_affected} - # UC-001 depends on CMD-001 which depends on Entity:KDDDocument - # But UC-001 also depends on BR-DOCUMENT-001 which is an outgoing edge from Entity - # The important thing is transitive impact is found - assert result.total_transitively >= 0 - - def test_leaf_node_no_dependents(self, graph_store): - """CMD-002 has no incoming edges in our fixture, so no dependents.""" - result = retrieve_impact( - ImpactQueryInput(node_id="CMD:CMD-002", depth=2), - graph_store, - ) - assert result.total_directly == 0 - assert result.total_transitively == 0 - - def test_node_not_found(self, graph_store): - with pytest.raises(ValueError, match="NODE_NOT_FOUND"): - retrieve_impact( - ImpactQueryInput(node_id="Entity:Missing"), - graph_store, - ) - - def test_depth_limits_traversal(self, graph_store): - """Depth 1 should only find direct dependents.""" - result = retrieve_impact( - ImpactQueryInput(node_id="Entity:KDDDocument", depth=1), - graph_store, - ) - assert result.total_transitively == 0 - - def test_impact_includes_edge_type(self, graph_store): - result = retrieve_impact( - ImpactQueryInput(node_id="Entity:KDDDocument", depth=1), - graph_store, 
- ) - for affected in result.directly_affected: - assert affected.edge_type != "" - assert affected.impact_description != "" diff --git a/tests/v2/application/queries/test_retrieve_violations.py b/tests/v2/application/queries/test_retrieve_violations.py deleted file mode 100644 index 8869e0b..0000000 --- a/tests/v2/application/queries/test_retrieve_violations.py +++ /dev/null @@ -1,54 +0,0 @@ -"""Tests for QRY-006 RetrieveLayerViolations.""" - -from kdd.application.queries.retrieve_violations import ( - ViolationsQueryInput, - retrieve_violations, -) -from kdd.domain.enums import KDDKind, KDDLayer - - -class TestRetrieveViolations: - def test_finds_violations(self, graph_store): - result = retrieve_violations( - ViolationsQueryInput(), - graph_store, - ) - assert result.total_violations == 1 - v = result.violations[0] - assert v.from_node == "Entity:KDDDocument" - assert v.to_node == "REQ:REQ-001" - assert v.from_layer == KDDLayer.DOMAIN - assert v.to_layer == KDDLayer.VERIFICATION - - def test_violation_rate(self, graph_store): - result = retrieve_violations( - ViolationsQueryInput(), - graph_store, - ) - assert result.total_edges_analyzed > 0 - assert result.violation_rate > 0 - expected_rate = (1 / result.total_edges_analyzed) * 100 - assert abs(result.violation_rate - round(expected_rate, 2)) < 0.01 - - def test_filter_by_kind(self, graph_store): - result = retrieve_violations( - ViolationsQueryInput(include_kinds=[KDDKind.ENTITY]), - graph_store, - ) - # The violation is from Entity, should still be found - assert result.total_violations == 1 - - def test_filter_excludes_violation(self, graph_store): - result = retrieve_violations( - ViolationsQueryInput(include_kinds=[KDDKind.COMMAND]), - graph_store, - ) - # The violation is Entity->REQ, filtering by COMMAND excludes it - assert result.total_violations == 0 - - def test_filter_by_layer(self, graph_store): - result = retrieve_violations( - ViolationsQueryInput(include_layers=[KDDLayer.DOMAIN]), - graph_store, 
- ) - assert result.total_violations == 1 diff --git a/tests/v2/application/test_chunking.py b/tests/v2/application/test_chunking.py deleted file mode 100644 index 2f249a6..0000000 --- a/tests/v2/application/test_chunking.py +++ /dev/null @@ -1,129 +0,0 @@ -"""Tests for kdd.application.chunking module.""" - -from kdd.application.chunking import Chunk, chunk_document -from kdd.domain.entities import KDDDocument, Section -from kdd.domain.enums import KDDKind, KDDLayer - - -def _make_document( - kind: KDDKind, - sections: list[Section], - doc_id: str = "TEST-001", -) -> KDDDocument: - return KDDDocument( - id=doc_id, - kind=kind, - source_path="specs/01-domain/entities/Test.md", - source_hash="abc123", - layer=KDDLayer.DOMAIN, - front_matter={"id": doc_id, "kind": kind.value, "status": "draft"}, - sections=sections, - ) - - -class TestChunkDocument: - def test_event_produces_no_chunks(self): - """Events have no embeddable sections per BR-EMBEDDING-001.""" - doc = _make_document( - KDDKind.EVENT, - [Section(heading="Descripción", level=2, content="Some event description.")], - ) - chunks = chunk_document(doc) - assert chunks == [] - - def test_entity_embeds_description(self): - doc = _make_document( - KDDKind.ENTITY, - [ - Section(heading="Descripción", level=2, content="An entity that represents orders."), - Section(heading="Atributos", level=2, content="| name | type |\n|---|---|\n| id | UUID |"), - ], - ) - chunks = chunk_document(doc) - assert len(chunks) >= 1 - # Only Descripción should be chunked, not Atributos - assert all(c.section_heading == "Descripción" for c in chunks) - - def test_chunk_has_context_text(self): - doc = _make_document( - KDDKind.ENTITY, - [Section(heading="Descripción", level=2, content="An entity representing orders.")], - ) - chunks = chunk_document(doc) - assert len(chunks) == 1 - # Context should include document identity - assert "Document: TEST-001" in chunks[0].context_text - assert "Kind: entity" in chunks[0].context_text - assert 
"Section: Descripción" in chunks[0].context_text - - def test_chunk_id_format(self): - doc = _make_document( - KDDKind.ENTITY, - [Section(heading="Descripción", level=2, content="Short description.")], - ) - chunks = chunk_document(doc) - assert chunks[0].chunk_id == "TEST-001:chunk-0" - - def test_multiple_embeddable_sections(self): - """Business rules embed both Declaración and Cuándo Aplica.""" - doc = _make_document( - KDDKind.BUSINESS_RULE, - [ - Section(heading="Declaración", level=2, content="Rule declaration text."), - Section(heading="Cuándo Aplica", level=2, content="When this rule applies."), - Section(heading="Ejemplos", level=2, content="Example text not embedded."), - ], - ) - chunks = chunk_document(doc) - headings = {c.section_heading for c in chunks} - assert "Declaración" in headings - assert "Cuándo Aplica" in headings - assert "Ejemplos" not in headings - - def test_long_content_splits(self): - """Content exceeding max_chunk_chars is split into multiple chunks.""" - long_text = "\n\n".join(f"Paragraph {i} with enough text." 
for i in range(50)) - doc = _make_document( - KDDKind.ENTITY, - [Section(heading="Descripción", level=2, content=long_text)], - ) - chunks = chunk_document(doc, max_chunk_chars=200, overlap_chars=50) - assert len(chunks) > 1 - - def test_empty_section_skipped(self): - doc = _make_document( - KDDKind.ENTITY, - [ - Section(heading="Descripción", level=2, content=""), - Section(heading="Atributos", level=2, content="attr"), - ], - ) - chunks = chunk_document(doc) - assert chunks == [] - - def test_use_case_embeds_description_and_main_flow(self): - doc = _make_document( - KDDKind.USE_CASE, - [ - Section(heading="Descripción", level=2, content="UC description."), - Section(heading="Flujo Principal", level=2, content="Step 1, step 2."), - Section(heading="Actores", level=2, content="Developer"), - ], - ) - chunks = chunk_document(doc) - headings = {c.section_heading for c in chunks} - assert "Descripción" in headings - assert "Flujo Principal" in headings - assert "Actores" not in headings - - def test_prd_embeds_problem(self): - doc = _make_document( - KDDKind.PRD, - [ - Section(heading="Problema / Oportunidad", level=2, content="The problem statement."), - Section(heading="Alcance", level=2, content="Scope details."), - ], - ) - chunks = chunk_document(doc) - assert len(chunks) == 1 - assert chunks[0].section_heading == "Problema / Oportunidad" diff --git a/tests/v2/conftest.py b/tests/v2/conftest.py deleted file mode 100644 index a50d920..0000000 --- a/tests/v2/conftest.py +++ /dev/null @@ -1 +0,0 @@ -"""Shared fixtures for v2 (kdd package) tests.""" diff --git a/tests/v2/domain/__init__.py b/tests/v2/domain/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/v2/domain/test_entities.py b/tests/v2/domain/test_entities.py deleted file mode 100644 index a8ed11d..0000000 --- a/tests/v2/domain/test_entities.py +++ /dev/null @@ -1,284 +0,0 @@ -"""Tests for kdd.domain.entities.""" - -import pytest -from datetime import datetime -from uuid import 
uuid4 - -from kdd.domain.enums import ( - DocumentStatus, - IndexLevel, - KDDKind, - KDDLayer, - QueryStatus, - RetrievalStrategy, -) -from kdd.domain.entities import ( - Embedding, - GraphEdge, - GraphNode, - IndexManifest, - IndexStats, - KDDDocument, - LayerViolation, - RetrievalQuery, - RetrievalResult, - ScoredNode, - Section, -) - - -class TestSection: - def test_construction(self): - s = Section(heading="Descripción", level=2, content="Some text") - assert s.heading == "Descripción" - assert s.level == 2 - assert s.path == "" - - def test_with_path(self): - s = Section(heading="Atributos", level=2, content="...", path="descripcion.atributos") - assert s.path == "descripcion.atributos" - - -class TestKDDDocument: - def test_minimal_construction(self): - doc = KDDDocument( - id="Pedido", - kind=KDDKind.ENTITY, - source_path="specs/01-domain/entities/Pedido.md", - source_hash="abc123", - layer=KDDLayer.DOMAIN, - front_matter={"kind": "entity"}, - sections=[], - ) - assert doc.id == "Pedido" - assert doc.kind == KDDKind.ENTITY - assert doc.status == DocumentStatus.DETECTED - assert doc.wiki_links == [] - assert doc.indexed_at is None - assert doc.domain is None - - def test_with_all_fields(self): - now = datetime.now() - doc = KDDDocument( - id="Pedido", - kind=KDDKind.ENTITY, - source_path="specs/01-domain/entities/Pedido.md", - source_hash="abc123", - layer=KDDLayer.DOMAIN, - front_matter={"kind": "entity", "aliases": ["Orden"]}, - sections=[Section(heading="Desc", level=2, content="text")], - wiki_links=["Usuario", "LineaPedido"], - status=DocumentStatus.INDEXED, - indexed_at=now, - domain="core", - ) - assert len(doc.wiki_links) == 2 - assert doc.status == DocumentStatus.INDEXED - assert doc.domain == "core" - - def test_kind_must_be_valid(self): - with pytest.raises(ValueError): - KDDDocument( - id="x", - kind="invalid-kind", - source_path="x", - source_hash="x", - layer=KDDLayer.DOMAIN, - front_matter={}, - sections=[], - ) - - -class TestGraphNode: - def 
test_construction(self): - node = GraphNode( - id="Entity:Pedido", - kind=KDDKind.ENTITY, - source_file="specs/01-domain/entities/Pedido.md", - source_hash="abc123", - layer=KDDLayer.DOMAIN, - ) - assert node.id == "Entity:Pedido" - assert node.status == "draft" - assert node.aliases == [] - assert node.indexed_fields == {} - - def test_with_indexed_fields(self): - node = GraphNode( - id="Entity:Pedido", - kind=KDDKind.ENTITY, - source_file="specs/01-domain/entities/Pedido.md", - source_hash="abc123", - layer=KDDLayer.DOMAIN, - indexed_fields={ - "description": "Represents an order", - "attributes": [{"name": "id", "type": "uuid"}], - }, - ) - assert "description" in node.indexed_fields - assert len(node.indexed_fields["attributes"]) == 1 - - -class TestGraphEdge: - def test_structural_edge(self): - edge = GraphEdge( - from_node="UC:UC-001", - to_node="Entity:KDDDocument", - edge_type="WIKI_LINK", - source_file="specs/02-behavior/use-cases/UC-001.md", - extraction_method="wiki_link", - ) - assert edge.layer_violation is False - assert edge.bidirectional is False - assert edge.metadata == {} - - def test_business_edge(self): - edge = GraphEdge( - from_node="Entity:Pedido", - to_node="Entity:Usuario", - edge_type="pertenece_a", - source_file="specs/01-domain/entities/Pedido.md", - extraction_method="section_content", - metadata={"cardinality": "N:1"}, - ) - assert edge.edge_type == "pertenece_a" - assert edge.metadata["cardinality"] == "N:1" - - def test_layer_violation_flag(self): - edge = GraphEdge( - from_node="Entity:X", - to_node="UC:UC-001", - edge_type="WIKI_LINK", - source_file="specs/01-domain/entities/X.md", - extraction_method="wiki_link", - layer_violation=True, - ) - assert edge.layer_violation is True - - -class TestEmbedding: - def test_construction(self): - emb = Embedding( - id="Pedido:descripcion:0", - document_id="Pedido", - document_kind=KDDKind.ENTITY, - section_path="descripcion", - chunk_index=0, - raw_text="An order placed by a user", - 
context_text="[entity: Pedido] > [Descripción] > An order placed by a user", - vector=[0.1] * 768, - model="nomic-embed-text-v1.5", - dimensions=768, - text_hash="hash123", - generated_at=datetime.now(), - ) - assert emb.id == "Pedido:descripcion:0" - assert len(emb.vector) == 768 - assert emb.dimensions == 768 - - -class TestIndexManifest: - def test_l1_manifest(self): - m = IndexManifest( - version="1.0.0", - kdd_version="1.0", - indexed_at=datetime.now(), - indexed_by="dev-alice", - index_level=IndexLevel.L1, - ) - assert m.embedding_model is None - assert m.embedding_dimensions is None - assert m.stats.nodes == 0 - assert m.structure == "single-domain" - - def test_l2_manifest(self): - m = IndexManifest( - version="1.0.0", - kdd_version="1.0", - embedding_model="nomic-embed-text-v1.5", - embedding_dimensions=768, - indexed_at=datetime.now(), - indexed_by="dev-bob", - index_level=IndexLevel.L2, - stats=IndexStats(nodes=47, edges=132, embeddings=31), - ) - assert m.embedding_model == "nomic-embed-text-v1.5" - assert m.stats.embeddings == 31 - - def test_multi_domain(self): - m = IndexManifest( - version="1.0.0", - kdd_version="1.0", - indexed_at=datetime.now(), - indexed_by="dev-alice", - index_level=IndexLevel.L1, - structure="multi-domain", - domains=["core", "auth"], - ) - assert m.structure == "multi-domain" - assert len(m.domains) == 2 - - -class TestRetrievalQuery: - def test_hybrid_query(self): - q = RetrievalQuery( - id=uuid4(), - strategy=RetrievalStrategy.HYBRID, - query_text="indexing pipeline", - received_at=datetime.now(), - ) - assert q.depth == 2 - assert q.min_score == 0.7 - assert q.limit == 10 - assert q.max_tokens == 8000 - assert q.respect_layers is True - assert q.status == QueryStatus.RECEIVED - - def test_graph_query(self): - q = RetrievalQuery( - id=uuid4(), - strategy=RetrievalStrategy.GRAPH, - root_node="Entity:Pedido", - depth=3, - edge_types=["EMITS", "DOMAIN_RELATION"], - received_at=datetime.now(), - ) - assert q.root_node == 
"Entity:Pedido" - assert len(q.edge_types) == 2 - assert q.query_text is None - - -class TestRetrievalResult: - def test_construction(self): - qid = uuid4() - r = RetrievalResult( - query_id=qid, - strategy=RetrievalStrategy.HYBRID, - results=[ - ScoredNode(node_id="Entity:Pedido", score=0.95, match_source="fusion"), - ScoredNode(node_id="UC:UC-001", score=0.82, match_source="semantic"), - ], - total_nodes=2, - total_tokens=1500, - ) - assert r.total_nodes == 2 - assert r.results[0].score > r.results[1].score - assert r.layer_violations == [] - - def test_with_violations(self): - r = RetrievalResult( - query_id=uuid4(), - strategy=RetrievalStrategy.GRAPH, - results=[], - total_nodes=0, - layer_violations=[ - LayerViolation( - from_node="Entity:X", - to_node="UC:UC-001", - from_layer=KDDLayer.DOMAIN, - to_layer=KDDLayer.BEHAVIOR, - edge_type="WIKI_LINK", - ), - ], - ) - assert len(r.layer_violations) == 1 diff --git a/tests/v2/domain/test_enums.py b/tests/v2/domain/test_enums.py deleted file mode 100644 index 332bd2c..0000000 --- a/tests/v2/domain/test_enums.py +++ /dev/null @@ -1,119 +0,0 @@ -"""Tests for kdd.domain.enums.""" - -from kdd.domain.enums import ( - DocumentStatus, - EdgeType, - IndexLevel, - KDDKind, - KDDLayer, - QueryStatus, - RetrievalStrategy, -) - - -class TestKDDKind: - """KDDKind covers the 15 artifact types from PRD-KBEngine.""" - - def test_has_15_members(self): - assert len(KDDKind) == 15 - - def test_all_values_are_lowercase_kebab(self): - for kind in KDDKind: - assert kind.value == kind.value.lower() - assert " " not in kind.value - - def test_string_serialisation(self): - assert KDDKind.ENTITY == "entity" - assert KDDKind.USE_CASE == "use-case" - assert KDDKind.BUSINESS_RULE == "business-rule" - - def test_from_string(self): - assert KDDKind("entity") is KDDKind.ENTITY - assert KDDKind("command") is KDDKind.COMMAND - assert KDDKind("adr") is KDDKind.ADR - - def test_expected_kinds(self): - expected = { - "entity", "event", "business-rule", 
"business-policy", - "cross-policy", "command", "query", "process", "use-case", - "ui-view", "ui-component", "requirement", "objective", - "prd", "adr", - } - assert {k.value for k in KDDKind} == expected - - -class TestKDDLayer: - """KDDLayer has 5 values with correct numeric ordering.""" - - def test_has_5_members(self): - assert len(KDDLayer) == 5 - - def test_numeric_ordering(self): - assert KDDLayer.REQUIREMENTS.numeric == 0 - assert KDDLayer.DOMAIN.numeric == 1 - assert KDDLayer.BEHAVIOR.numeric == 2 - assert KDDLayer.EXPERIENCE.numeric == 3 - assert KDDLayer.VERIFICATION.numeric == 4 - - def test_ordering_ascending(self): - layers = list(KDDLayer) - for i in range(len(layers) - 1): - assert layers[i].numeric < layers[i + 1].numeric - - def test_string_values(self): - assert KDDLayer.REQUIREMENTS == "00-requirements" - assert KDDLayer.DOMAIN == "01-domain" - assert KDDLayer.VERIFICATION == "04-verification" - - -class TestEdgeType: - """EdgeType covers 16 structural edge types from GraphEdge spec.""" - - def test_has_16_members(self): - assert len(EdgeType) == 16 - - def test_all_screaming_snake_case(self): - for et in EdgeType: - assert et.value == et.value.upper() - assert " " not in et.value - - def test_key_types_exist(self): - assert EdgeType.WIKI_LINK.value == "WIKI_LINK" - assert EdgeType.EMITS.value == "EMITS" - assert EdgeType.UC_APPLIES_RULE.value == "UC_APPLIES_RULE" - assert EdgeType.CROSS_DOMAIN_REF.value == "CROSS_DOMAIN_REF" - - -class TestIndexLevel: - def test_has_3_levels(self): - assert len(IndexLevel) == 3 - - def test_values(self): - assert IndexLevel.L1 == "L1" - assert IndexLevel.L2 == "L2" - assert IndexLevel.L3 == "L3" - - -class TestRetrievalStrategy: - def test_has_4_strategies(self): - assert len(RetrievalStrategy) == 4 - - def test_values(self): - assert RetrievalStrategy.GRAPH == "graph" - assert RetrievalStrategy.SEMANTIC == "semantic" - assert RetrievalStrategy.HYBRID == "hybrid" - assert RetrievalStrategy.IMPACT == "impact" - 
- -class TestDocumentStatus: - def test_has_5_states(self): - assert len(DocumentStatus) == 5 - - def test_lifecycle_states(self): - values = {s.value for s in DocumentStatus} - assert values == {"detected", "parsing", "indexed", "stale", "deleted"} - - -class TestQueryStatus: - def test_has_4_states(self): - assert len(QueryStatus) == 4 diff --git a/tests/v2/domain/test_events.py b/tests/v2/domain/test_events.py deleted file mode 100644 index 8f2a66a..0000000 --- a/tests/v2/domain/test_events.py +++ /dev/null @@ -1,210 +0,0 @@ -"""Tests for kdd.domain.events.""" - -from dataclasses import FrozenInstanceError -from datetime import datetime -from uuid import uuid4 - -import pytest - -from kdd.domain.enums import IndexLevel, KDDKind, KDDLayer, RetrievalStrategy -from kdd.domain.events import ( - DocumentDeleted, - DocumentDetected, - DocumentIndexed, - DocumentParsed, - DocumentStale, - MergeCompleted, - MergeRequested, - QueryCompleted, - QueryFailed, - QueryReceived, -) - - -class TestDocumentDetected: - def test_creation(self): - evt = DocumentDetected( - source_path="specs/01-domain/entities/Pedido.md", - source_hash="abc123", - kind=KDDKind.ENTITY, - layer=KDDLayer.DOMAIN, - detected_at=datetime.now(), - ) - assert evt.kind == KDDKind.ENTITY - assert evt.layer == KDDLayer.DOMAIN - - def test_frozen(self): - evt = DocumentDetected( - source_path="x", - source_hash="x", - kind=KDDKind.ENTITY, - layer=KDDLayer.DOMAIN, - detected_at=datetime.now(), - ) - with pytest.raises(FrozenInstanceError): - evt.source_path = "y" - - -class TestDocumentParsed: - def test_payload_completeness(self): - evt = DocumentParsed( - document_id="Pedido", - source_path="specs/01-domain/entities/Pedido.md", - kind=KDDKind.ENTITY, - front_matter={"kind": "entity", "aliases": ["Orden"]}, - section_count=5, - wiki_link_count=3, - parsed_at=datetime.now(), - ) - assert evt.section_count == 5 - assert evt.wiki_link_count == 3 - assert "aliases" in evt.front_matter - - -class 
TestDocumentIndexed: - def test_payload_completeness(self): - evt = DocumentIndexed( - document_id="Pedido", - source_path="specs/01-domain/entities/Pedido.md", - kind=KDDKind.ENTITY, - node_id="Entity:Pedido", - edge_count=5, - embedding_count=3, - index_level=IndexLevel.L2, - duration_ms=150, - indexed_at=datetime.now(), - ) - assert evt.node_id == "Entity:Pedido" - assert evt.index_level == IndexLevel.L2 - - def test_l1_has_zero_embeddings(self): - evt = DocumentIndexed( - document_id="EVT-Pedido-Creado", - source_path="specs/01-domain/events/EVT-Pedido-Creado.md", - kind=KDDKind.EVENT, - node_id="Event:EVT-Pedido-Creado", - edge_count=2, - embedding_count=0, - index_level=IndexLevel.L1, - duration_ms=50, - indexed_at=datetime.now(), - ) - assert evt.embedding_count == 0 - - -class TestDocumentStale: - def test_hashes_differ(self): - evt = DocumentStale( - document_id="Pedido", - source_path="specs/01-domain/entities/Pedido.md", - previous_hash="abc123", - current_hash="def456", - detected_at=datetime.now(), - ) - assert evt.previous_hash != evt.current_hash - - -class TestDocumentDeleted: - def test_payload_completeness(self): - evt = DocumentDeleted( - document_id="Pedido", - source_path="specs/01-domain/entities/Pedido.md", - node_id="Entity:Pedido", - edge_count=5, - embedding_count=3, - deleted_at=datetime.now(), - ) - assert evt.edge_count == 5 - - -class TestMergeRequested: - def test_payload_completeness(self): - evt = MergeRequested( - merge_id=uuid4(), - source_manifests=["manifest-a", "manifest-b"], - developer_ids=["alice", "bob"], - target_version="1.0.0", - requested_at=datetime.now(), - requested_by="system", - ) - assert len(evt.source_manifests) == 2 - assert len(evt.developer_ids) == 2 - - -class TestMergeCompleted: - def test_payload_completeness(self): - evt = MergeCompleted( - merge_id=uuid4(), - merged_manifest_id="merged-001", - source_count=2, - total_nodes=47, - total_edges=132, - total_embeddings=31, - conflicts_resolved=1, - 
duration_ms=500, - completed_at=datetime.now(), - ) - assert evt.conflicts_resolved == 1 - - -class TestQueryReceived: - def test_payload_completeness(self): - evt = QueryReceived( - query_id=uuid4(), - strategy=RetrievalStrategy.HYBRID, - query_text="indexing pipeline", - caller="agent-codex", - ) - assert evt.strategy == RetrievalStrategy.HYBRID - - def test_graph_query_no_text(self): - evt = QueryReceived( - query_id=uuid4(), - strategy=RetrievalStrategy.GRAPH, - root_node="Entity:Pedido", - ) - assert evt.query_text is None - assert evt.root_node == "Entity:Pedido" - - -class TestQueryCompleted: - def test_payload_completeness(self): - evt = QueryCompleted( - query_id=uuid4(), - strategy=RetrievalStrategy.HYBRID, - total_results=5, - top_score=0.95, - total_tokens=1500, - duration_ms=120, - completed_at=datetime.now(), - ) - assert evt.duration_ms == 120 - - -class TestQueryFailed: - def test_payload_completeness(self): - evt = QueryFailed( - query_id=uuid4(), - strategy=RetrievalStrategy.SEMANTIC, - error_code="INDEX_UNAVAILABLE", - error_message="No index loaded", - phase="resolution", - duration_ms=5, - failed_at=datetime.now(), - ) - assert evt.phase == "resolution" - assert evt.error_code == "INDEX_UNAVAILABLE" - - -class TestAllEventsAreFrozen: - """Verify every event class is immutable.""" - - EVENT_CLASSES = [ - DocumentDetected, DocumentParsed, DocumentIndexed, - DocumentStale, DocumentDeleted, - MergeRequested, MergeCompleted, - QueryReceived, QueryCompleted, QueryFailed, - ] - - def test_count(self): - assert len(self.EVENT_CLASSES) == 10 diff --git a/tests/v2/domain/test_rules.py b/tests/v2/domain/test_rules.py deleted file mode 100644 index df267c0..0000000 --- a/tests/v2/domain/test_rules.py +++ /dev/null @@ -1,324 +0,0 @@ -"""Tests for kdd.domain.rules — pure function tests for all 5 BRs.""" - -import pytest - -from kdd.domain.enums import IndexLevel, KDDKind, KDDLayer -from kdd.domain.rules import ( - EMBEDDABLE_SECTIONS, - KIND_EXPECTED_PATH, - 
detect_index_level, - detect_layer, - embeddable_sections, - is_layer_violation, - resolve_deletion, - resolve_node_conflict, - route_document, -) - - -# --------------------------------------------------------------------------- -# BR-DOCUMENT-001 — route_document() -# --------------------------------------------------------------------------- - - -class TestRouteDocument: - """BR-DOCUMENT-001: Kind Router.""" - - @pytest.mark.parametrize("kind_str,expected_kind", [ - ("entity", KDDKind.ENTITY), - ("event", KDDKind.EVENT), - ("business-rule", KDDKind.BUSINESS_RULE), - ("business-policy", KDDKind.BUSINESS_POLICY), - ("cross-policy", KDDKind.CROSS_POLICY), - ("command", KDDKind.COMMAND), - ("query", KDDKind.QUERY), - ("process", KDDKind.PROCESS), - ("use-case", KDDKind.USE_CASE), - ("ui-view", KDDKind.UI_VIEW), - ("ui-component", KDDKind.UI_COMPONENT), - ("requirement", KDDKind.REQUIREMENT), - ("objective", KDDKind.OBJECTIVE), - ("prd", KDDKind.PRD), - ("adr", KDDKind.ADR), - ]) - def test_all_15_kinds(self, kind_str, expected_kind): - result = route_document( - front_matter={"kind": kind_str}, - source_path=f"specs/{KIND_EXPECTED_PATH[expected_kind]}SomeFile.md", - ) - assert result.kind == expected_kind - assert result.warning is None - - def test_no_front_matter(self): - result = route_document(front_matter=None, source_path="specs/README.md") - assert result.kind is None - - def test_empty_front_matter(self): - result = route_document(front_matter={}, source_path="specs/README.md") - assert result.kind is None - - def test_unrecognised_kind(self): - result = route_document( - front_matter={"kind": "unknown-type"}, - source_path="specs/01-domain/foo.md", - ) - assert result.kind is None - - def test_missing_kind_field(self): - result = route_document( - front_matter={"title": "Something"}, - source_path="specs/01-domain/foo.md", - ) - assert result.kind is None - - def test_wrong_location_emits_warning(self): - result = route_document( - front_matter={"kind": 
"entity"}, - source_path="specs/02-behavior/MiEntidad.md", - ) - assert result.kind == KDDKind.ENTITY # front-matter wins - assert result.warning is not None - assert "outside expected path" in result.warning - - def test_correct_location_no_warning(self): - result = route_document( - front_matter={"kind": "entity"}, - source_path="specs/01-domain/entities/Pedido.md", - ) - assert result.kind == KDDKind.ENTITY - assert result.warning is None - - def test_kind_case_insensitive(self): - result = route_document( - front_matter={"kind": "Entity"}, - source_path="specs/01-domain/entities/Pedido.md", - ) - assert result.kind == KDDKind.ENTITY - - def test_kind_whitespace_stripped(self): - result = route_document( - front_matter={"kind": " entity "}, - source_path="specs/01-domain/entities/Pedido.md", - ) - assert result.kind == KDDKind.ENTITY - - -# --------------------------------------------------------------------------- -# BR-EMBEDDING-001 — embeddable_sections() -# --------------------------------------------------------------------------- - - -class TestEmbeddableSections: - """BR-EMBEDDING-001: Embedding Strategy.""" - - def test_entity_embeds_description(self): - sections = embeddable_sections(KDDKind.ENTITY) - assert "descripción" in sections or "description" in sections - - def test_event_has_no_embeddings(self): - sections = embeddable_sections(KDDKind.EVENT) - assert len(sections) == 0 - - def test_business_rule_embeds_declaration_and_when_applies(self): - sections = embeddable_sections(KDDKind.BUSINESS_RULE) - assert "declaración" in sections or "declaration" in sections - assert "cuándo aplica" in sections or "when applies" in sections - - def test_use_case_embeds_description_and_main_flow(self): - sections = embeddable_sections(KDDKind.USE_CASE) - assert "descripción" in sections or "description" in sections - assert "flujo principal" in sections or "main flow" in sections - - def test_all_15_kinds_have_entry(self): - for kind in KDDKind: - # Must not 
raise KeyError - result = embeddable_sections(kind) - assert isinstance(result, set) - - def test_command_embeds_purpose(self): - sections = embeddable_sections(KDDKind.COMMAND) - assert "purpose" in sections or "propósito" in sections - - def test_prd_embeds_problem(self): - sections = embeddable_sections(KDDKind.PRD) - assert "problema / oportunidad" in sections or "problem / opportunity" in sections - - def test_adr_embeds_context_and_decision(self): - sections = embeddable_sections(KDDKind.ADR) - assert "contexto" in sections or "context" in sections - assert "decisión" in sections or "decision" in sections - - -# --------------------------------------------------------------------------- -# BR-INDEX-001 — detect_index_level() -# --------------------------------------------------------------------------- - - -class TestDetectIndexLevel: - """BR-INDEX-001: Index Level.""" - - def test_no_resources_returns_l1(self): - assert detect_index_level( - embedding_model_available=False, - agent_api_available=False, - ) == IndexLevel.L1 - - def test_embedding_only_returns_l2(self): - assert detect_index_level( - embedding_model_available=True, - agent_api_available=False, - ) == IndexLevel.L2 - - def test_both_returns_l3(self): - assert detect_index_level( - embedding_model_available=True, - agent_api_available=True, - ) == IndexLevel.L3 - - def test_agent_without_embedding_returns_l1(self): - # Agent API without embedding model cannot do L3 - assert detect_index_level( - embedding_model_available=False, - agent_api_available=True, - ) == IndexLevel.L1 - - -# --------------------------------------------------------------------------- -# BR-LAYER-001 — is_layer_violation() -# --------------------------------------------------------------------------- - - -class TestIsLayerViolation: - """BR-LAYER-001: Layer Validation. 
- - Validates BDD: layer-validation.feature SCN-001..004 - """ - - def test_upper_to_lower_is_valid(self): - # SCN-001: 02-behavior → 01-domain - assert is_layer_violation(KDDLayer.BEHAVIOR, KDDLayer.DOMAIN) is False - - def test_lower_to_upper_is_violation(self): - # SCN-002: 01-domain → 02-behavior - assert is_layer_violation(KDDLayer.DOMAIN, KDDLayer.BEHAVIOR) is True - - def test_requirements_always_valid(self): - # SCN-003: 00-requirements → any - for layer in KDDLayer: - assert is_layer_violation(KDDLayer.REQUIREMENTS, layer) is False - - def test_same_layer_is_valid(self): - # SCN-004: same layer - for layer in KDDLayer: - assert is_layer_violation(layer, layer) is False - - def test_verification_to_domain(self): - assert is_layer_violation(KDDLayer.VERIFICATION, KDDLayer.DOMAIN) is False - - def test_domain_to_verification(self): - assert is_layer_violation(KDDLayer.DOMAIN, KDDLayer.VERIFICATION) is True - - def test_domain_to_experience(self): - assert is_layer_violation(KDDLayer.DOMAIN, KDDLayer.EXPERIENCE) is True - - def test_experience_to_behavior(self): - assert is_layer_violation(KDDLayer.EXPERIENCE, KDDLayer.BEHAVIOR) is False - - @pytest.mark.parametrize("origin,dest,expected", [ - (KDDLayer.VERIFICATION, KDDLayer.EXPERIENCE, False), - (KDDLayer.VERIFICATION, KDDLayer.BEHAVIOR, False), - (KDDLayer.VERIFICATION, KDDLayer.DOMAIN, False), - (KDDLayer.EXPERIENCE, KDDLayer.DOMAIN, False), - (KDDLayer.BEHAVIOR, KDDLayer.DOMAIN, False), - (KDDLayer.DOMAIN, KDDLayer.BEHAVIOR, True), - (KDDLayer.DOMAIN, KDDLayer.EXPERIENCE, True), - (KDDLayer.DOMAIN, KDDLayer.VERIFICATION, True), - (KDDLayer.BEHAVIOR, KDDLayer.EXPERIENCE, True), - (KDDLayer.BEHAVIOR, KDDLayer.VERIFICATION, True), - (KDDLayer.EXPERIENCE, KDDLayer.VERIFICATION, True), - ]) - def test_all_layer_combinations(self, origin, dest, expected): - assert is_layer_violation(origin, dest) is expected - - -class TestDetectLayer: - def test_all_prefixes(self): - assert 
detect_layer("00-requirements/PRD.md") == KDDLayer.REQUIREMENTS - assert detect_layer("01-domain/entities/X.md") == KDDLayer.DOMAIN - assert detect_layer("02-behavior/commands/CMD.md") == KDDLayer.BEHAVIOR - assert detect_layer("03-experience/views/V.md") == KDDLayer.EXPERIENCE - assert detect_layer("04-verification/criteria/R.md") == KDDLayer.VERIFICATION - - def test_unknown_path(self): - assert detect_layer("random/path.md") is None - - def test_nested_path(self): - assert detect_layer("specs/01-domain/entities/Pedido.md") == KDDLayer.DOMAIN - - -# --------------------------------------------------------------------------- -# BR-MERGE-001 — resolve_node_conflict() / resolve_deletion() -# --------------------------------------------------------------------------- - - -class TestResolveNodeConflict: - """BR-MERGE-001: Merge Conflict Resolution — last-write-wins.""" - - def test_single_candidate(self): - result = resolve_node_conflict([ - {"source_hash": "abc", "indexed_at": "2026-02-15T10:00:00"}, - ]) - assert result.winner_index == 0 - assert result.reason == "single" - - def test_identical_hashes(self): - result = resolve_node_conflict([ - {"source_hash": "abc", "indexed_at": "2026-02-15T10:00:00"}, - {"source_hash": "abc", "indexed_at": "2026-02-15T10:15:00"}, - ]) - assert result.winner_index == 0 - assert result.reason == "identical" - - def test_last_write_wins(self): - result = resolve_node_conflict([ - {"source_hash": "abc", "indexed_at": "2026-02-15T10:00:00"}, - {"source_hash": "xyz", "indexed_at": "2026-02-15T10:15:00"}, - ]) - assert result.winner_index == 1 - assert result.reason == "last-write-wins" - - def test_three_way_conflict(self): - result = resolve_node_conflict([ - {"source_hash": "aaa", "indexed_at": "2026-02-15T10:00:00"}, - {"source_hash": "bbb", "indexed_at": "2026-02-15T10:30:00"}, - {"source_hash": "ccc", "indexed_at": "2026-02-15T10:15:00"}, - ]) - assert result.winner_index == 1 # 10:30 is latest - assert result.reason == 
"last-write-wins" - - -class TestResolveDeletion: - """BR-MERGE-001: Delete-wins.""" - - def test_all_present(self): - deleted, warning = resolve_deletion([True, True]) - assert deleted is False - assert warning is None - - def test_one_deleted(self): - deleted, warning = resolve_deletion([True, False]) - assert deleted is True - assert warning is None - - def test_deleted_with_modification_warning(self): - deleted, warning = resolve_deletion( - [True, False], - modified_after_deletion=True, - ) - assert deleted is True - assert warning is not None - assert "modified" in warning.lower() - - def test_all_deleted(self): - deleted, warning = resolve_deletion([False, False]) - assert deleted is True diff --git a/tests/v2/infrastructure/__init__.py b/tests/v2/infrastructure/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/v2/infrastructure/test_artifact.py b/tests/v2/infrastructure/test_artifact.py deleted file mode 100644 index 117a325..0000000 --- a/tests/v2/infrastructure/test_artifact.py +++ /dev/null @@ -1,231 +0,0 @@ -"""Tests for kdd.infrastructure.artifact.filesystem.""" - -from datetime import datetime - -import pytest - -from kdd.domain.entities import Embedding, GraphEdge, GraphNode, IndexManifest, IndexStats -from kdd.domain.enums import IndexLevel, KDDKind, KDDLayer -from kdd.infrastructure.artifact.filesystem import FilesystemArtifactStore - - -@pytest.fixture -def store(tmp_path): - """An ArtifactStore rooted in a temp directory.""" - return FilesystemArtifactStore(tmp_path / ".kdd-index") - - -@pytest.fixture -def sample_manifest(): - return IndexManifest( - version="1.0.0", - kdd_version="1.0", - indexed_at=datetime(2026, 2, 15, 10, 0, 0), - indexed_by="test-dev", - index_level=IndexLevel.L1, - stats=IndexStats(nodes=2, edges=3), - ) - - -@pytest.fixture -def sample_node(): - return GraphNode( - id="Entity:Pedido", - kind=KDDKind.ENTITY, - source_file="specs/01-domain/entities/Pedido.md", - source_hash="abc123", - 
layer=KDDLayer.DOMAIN, - indexed_fields={"description": "An order"}, - indexed_at=datetime(2026, 2, 15, 10, 0, 0), - ) - - -@pytest.fixture -def sample_edges(): - return [ - GraphEdge( - from_node="Entity:Pedido", - to_node="Entity:Usuario", - edge_type="DOMAIN_RELATION", - source_file="specs/01-domain/entities/Pedido.md", - extraction_method="section_content", - ), - GraphEdge( - from_node="Entity:Pedido", - to_node="Event:EVT-Pedido-Creado", - edge_type="EMITS", - source_file="specs/01-domain/entities/Pedido.md", - extraction_method="wiki_link", - ), - ] - - -@pytest.fixture -def sample_embedding(): - return Embedding( - id="Pedido:descripcion:0", - document_id="Pedido", - document_kind=KDDKind.ENTITY, - section_path="descripcion", - chunk_index=0, - raw_text="An order placed by a user", - context_text="[entity: Pedido] > An order placed by a user", - vector=[0.1] * 10, - model="test-model", - dimensions=10, - text_hash="h1", - generated_at=datetime(2026, 2, 15, 10, 0, 0), - ) - - -class TestManifestRoundTrip: - def test_write_and_read(self, store, sample_manifest): - store.write_manifest(sample_manifest) - loaded = store.read_manifest() - assert loaded is not None - assert loaded.version == "1.0.0" - assert loaded.indexed_by == "test-dev" - assert loaded.stats.nodes == 2 - - def test_read_nonexistent(self, store): - assert store.read_manifest() is None - - def test_creates_directory(self, store, sample_manifest): - store.write_manifest(sample_manifest) - assert (store.root / "manifest.json").exists() - - -class TestNodeRoundTrip: - def test_write_and_read(self, store, sample_node): - store.write_node(sample_node) - loaded = store.read_node("Entity:Pedido") - assert loaded is not None - assert loaded.id == "Entity:Pedido" - assert loaded.kind == KDDKind.ENTITY - assert loaded.indexed_fields["description"] == "An order" - - def test_read_nonexistent(self, store): - assert store.read_node("Entity:Missing") is None - - def test_file_structure(self, store, 
sample_node): - store.write_node(sample_node) - path = store.root / "nodes" / "entity" / "Pedido.json" - assert path.exists() - - def test_read_all_nodes(self, store, sample_node): - store.write_node(sample_node) - node2 = GraphNode( - id="CMD:CMD-001", - kind=KDDKind.COMMAND, - source_file="specs/02-behavior/commands/CMD-001.md", - source_hash="def456", - layer=KDDLayer.BEHAVIOR, - ) - store.write_node(node2) - all_nodes = store.read_all_nodes() - assert len(all_nodes) == 2 - ids = {n.id for n in all_nodes} - assert "Entity:Pedido" in ids - assert "CMD:CMD-001" in ids - - -class TestEdgeRoundTrip: - def test_append_and_read(self, store, sample_edges): - store.append_edges(sample_edges) - loaded = store.read_edges() - assert len(loaded) == 2 - assert loaded[0].from_node == "Entity:Pedido" - assert loaded[1].edge_type == "EMITS" - - def test_append_multiple_times(self, store, sample_edges): - store.append_edges(sample_edges[:1]) - store.append_edges(sample_edges[1:]) - loaded = store.read_edges() - assert len(loaded) == 2 - - def test_read_empty(self, store): - assert store.read_edges() == [] - - def test_jsonl_format(self, store, sample_edges): - store.append_edges(sample_edges) - path = store.root / "edges" / "edges.jsonl" - lines = path.read_text().strip().splitlines() - assert len(lines) == 2 - - -class TestEmbeddingRoundTrip: - def test_write_and_read(self, store, sample_embedding): - store.write_embeddings([sample_embedding]) - loaded = store.read_embeddings("Pedido") - assert len(loaded) == 1 - assert loaded[0].id == "Pedido:descripcion:0" - assert len(loaded[0].vector) == 10 - - def test_read_empty(self, store): - assert store.read_embeddings("Missing") == [] - - def test_file_structure(self, store, sample_embedding): - store.write_embeddings([sample_embedding]) - path = store.root / "embeddings" / "entity" / "Pedido.json" - assert path.exists() - - def test_write_empty_list(self, store): - store.write_embeddings([]) - assert not (store.root / 
"embeddings").exists() - - -class TestCascadeDelete: - def test_deletes_node_edges_embeddings( - self, store, sample_node, sample_edges, sample_embedding - ): - # Setup - store.write_node(sample_node) - store.append_edges(sample_edges) - store.write_embeddings([sample_embedding]) - - # Act - store.delete_document_artifacts("Pedido") - - # Assert: node gone - assert store.read_node("Entity:Pedido") is None - - # Assert: edges involving Pedido removed - remaining_edges = store.read_edges() - for edge in remaining_edges: - assert "Pedido" not in edge.from_node - assert "Pedido" not in edge.to_node - - # Assert: embeddings gone - assert store.read_embeddings("Pedido") == [] - - def test_delete_nonexistent_is_noop(self, store): - # Should not raise - store.delete_document_artifacts("Missing") - - def test_preserves_other_documents(self, store, sample_node, sample_edges): - other_node = GraphNode( - id="CMD:CMD-001", - kind=KDDKind.COMMAND, - source_file="specs/02-behavior/commands/CMD-001.md", - source_hash="def456", - layer=KDDLayer.BEHAVIOR, - ) - other_edge = GraphEdge( - from_node="CMD:CMD-001", - to_node="Event:EVT-X", - edge_type="EMITS", - source_file="specs/02-behavior/commands/CMD-001.md", - extraction_method="wiki_link", - ) - store.write_node(sample_node) - store.write_node(other_node) - store.append_edges(sample_edges + [other_edge]) - - store.delete_document_artifacts("Pedido") - - # Other node untouched - assert store.read_node("CMD:CMD-001") is not None - # Other edge preserved - remaining = store.read_edges() - assert len(remaining) == 1 - assert remaining[0].from_node == "CMD:CMD-001" diff --git a/tests/v2/infrastructure/test_claude_cli.py b/tests/v2/infrastructure/test_claude_cli.py deleted file mode 100644 index 42a4890..0000000 --- a/tests/v2/infrastructure/test_claude_cli.py +++ /dev/null @@ -1,156 +0,0 @@ -"""Tests for ClaudeCliAgentClient adapter.""" - -from __future__ import annotations - -import json -import subprocess -from unittest.mock import 
MagicMock, patch - -import pytest - -from kdd.domain.entities import GraphNode -from kdd.domain.enums import KDDKind, KDDLayer -from kdd.infrastructure.agent.claude_cli import ( - ClaudeCliAgentClient, - _parse_enrichment_response, -) - - -@pytest.fixture -def sample_node() -> GraphNode: - return GraphNode( - id="Entity:Pedido", - kind=KDDKind.ENTITY, - source_file="01-domain/entities/Pedido.md", - source_hash="abc123", - layer=KDDLayer.DOMAIN, - ) - - -@pytest.fixture -def valid_enrichment() -> dict: - return { - "summary": "Pedido represents a customer order.", - "implicit_relations": [ - {"target": "Entity:Cliente", "type": "DEPENDS_ON"}, - ], - "impact_analysis": {"change_risk": "high", "reason": "Core entity."}, - } - - -def _make_envelope(enrichment: dict) -> str: - """Build a Claude CLI JSON envelope.""" - return json.dumps({ - "type": "result", - "subtype": "success", - "result": json.dumps(enrichment), - }) - - -class TestEnrichSuccess: - def test_enrich_success(self, sample_node, valid_enrichment): - client = ClaudeCliAgentClient(timeout=30) - mock_result = MagicMock() - mock_result.returncode = 0 - mock_result.stdout = _make_envelope(valid_enrichment) - mock_result.stderr = "" - - with patch("kdd.infrastructure.agent.claude_cli.subprocess.run", return_value=mock_result) as mock_run: - result = client.enrich(sample_node, "some context") - - assert result["summary"] == "Pedido represents a customer order." 
- assert len(result["implicit_relations"]) == 1 - assert result["impact_analysis"]["change_risk"] == "high" - - # Verify CLAUDECODE* env vars are filtered - call_kwargs = mock_run.call_args - env = call_kwargs.kwargs["env"] - for key in env: - assert not key.startswith("CLAUDECODE"), f"CLAUDECODE var leaked: {key}" - - def test_enrich_with_model_override(self, sample_node, valid_enrichment): - client = ClaudeCliAgentClient(model="sonnet") - mock_result = MagicMock() - mock_result.returncode = 0 - mock_result.stdout = _make_envelope(valid_enrichment) - mock_result.stderr = "" - - with patch("kdd.infrastructure.agent.claude_cli.subprocess.run", return_value=mock_result) as mock_run: - client.enrich(sample_node, "context") - - cmd = mock_run.call_args.args[0] - assert "--model" in cmd - assert "sonnet" in cmd - - -class TestEnrichErrors: - def test_enrich_timeout(self, sample_node): - client = ClaudeCliAgentClient(timeout=1) - - with patch( - "kdd.infrastructure.agent.claude_cli.subprocess.run", - side_effect=subprocess.TimeoutExpired(cmd="claude", timeout=1), - ): - with pytest.raises(RuntimeError, match="timed out"): - client.enrich(sample_node, "context") - - def test_enrich_cli_not_found(self, sample_node): - client = ClaudeCliAgentClient() - - with patch( - "kdd.infrastructure.agent.claude_cli.subprocess.run", - side_effect=FileNotFoundError, - ): - with pytest.raises(RuntimeError, match="not found"): - client.enrich(sample_node, "context") - - def test_enrich_nonzero_exit(self, sample_node): - client = ClaudeCliAgentClient() - mock_result = MagicMock() - mock_result.returncode = 1 - mock_result.stderr = "some error" - - with patch("kdd.infrastructure.agent.claude_cli.subprocess.run", return_value=mock_result): - with pytest.raises(RuntimeError, match="exited with code 1"): - client.enrich(sample_node, "context") - - def test_enrich_invalid_json_envelope(self, sample_node): - client = ClaudeCliAgentClient() - mock_result = MagicMock() - mock_result.returncode = 0 
- mock_result.stdout = "not json at all" - mock_result.stderr = "" - - with patch("kdd.infrastructure.agent.claude_cli.subprocess.run", return_value=mock_result): - with pytest.raises(RuntimeError, match="invalid JSON envelope"): - client.enrich(sample_node, "context") - - -class TestParseEnrichmentResponse: - def test_parse_valid_json(self, valid_enrichment): - result = _parse_enrichment_response(json.dumps(valid_enrichment)) - assert result["summary"] == "Pedido represents a customer order." - - def test_parse_with_markdown_fences(self, valid_enrichment): - text = "```json\n" + json.dumps(valid_enrichment) + "\n```" - result = _parse_enrichment_response(text) - assert result["summary"] == "Pedido represents a customer order." - - def test_parse_with_bare_fences(self, valid_enrichment): - text = "```\n" + json.dumps(valid_enrichment) + "\n```" - result = _parse_enrichment_response(text) - assert result["summary"] == "Pedido represents a customer order." - - def test_parse_missing_keys_defaults(self): - result = _parse_enrichment_response("{}") - assert result["summary"] == "" - assert result["implicit_relations"] == [] - assert result["impact_analysis"]["change_risk"] == "medium" - - def test_parse_invalid_json_raises(self): - with pytest.raises(RuntimeError, match="invalid JSON"): - _parse_enrichment_response("this is not json") - - def test_parse_non_object_raises(self): - with pytest.raises(RuntimeError, match="Expected JSON object"): - _parse_enrichment_response("[1, 2, 3]") diff --git a/tests/v2/infrastructure/test_event_bus.py b/tests/v2/infrastructure/test_event_bus.py deleted file mode 100644 index 16cf5dd..0000000 --- a/tests/v2/infrastructure/test_event_bus.py +++ /dev/null @@ -1,69 +0,0 @@ -"""Tests for kdd.infrastructure.events.bus.""" - -from dataclasses import dataclass - -from kdd.infrastructure.events.bus import InMemoryEventBus - - -@dataclass(frozen=True) -class FakeEventA: - value: str - - -@dataclass(frozen=True) -class FakeEventB: - count: int 
- - -class TestInMemoryEventBus: - def test_publish_calls_handler(self): - bus = InMemoryEventBus() - received = [] - bus.subscribe(FakeEventA, lambda e: received.append(e)) - - bus.publish(FakeEventA(value="hello")) - - assert len(received) == 1 - assert received[0].value == "hello" - - def test_multiple_handlers(self): - bus = InMemoryEventBus() - results_1 = [] - results_2 = [] - bus.subscribe(FakeEventA, lambda e: results_1.append(e.value)) - bus.subscribe(FakeEventA, lambda e: results_2.append(e.value.upper())) - - bus.publish(FakeEventA(value="test")) - - assert results_1 == ["test"] - assert results_2 == ["TEST"] - - def test_different_event_types(self): - bus = InMemoryEventBus() - a_events = [] - b_events = [] - bus.subscribe(FakeEventA, lambda e: a_events.append(e)) - bus.subscribe(FakeEventB, lambda e: b_events.append(e)) - - bus.publish(FakeEventA(value="a")) - bus.publish(FakeEventB(count=42)) - - assert len(a_events) == 1 - assert len(b_events) == 1 - assert b_events[0].count == 42 - - def test_no_handler_is_noop(self): - bus = InMemoryEventBus() - # Should not raise - bus.publish(FakeEventA(value="ignored")) - - def test_handler_order_preserved(self): - bus = InMemoryEventBus() - order = [] - bus.subscribe(FakeEventA, lambda e: order.append(1)) - bus.subscribe(FakeEventA, lambda e: order.append(2)) - bus.subscribe(FakeEventA, lambda e: order.append(3)) - - bus.publish(FakeEventA(value="x")) - - assert order == [1, 2, 3] diff --git a/tests/v2/infrastructure/test_git_diff.py b/tests/v2/infrastructure/test_git_diff.py deleted file mode 100644 index c4d85ba..0000000 --- a/tests/v2/infrastructure/test_git_diff.py +++ /dev/null @@ -1,144 +0,0 @@ -"""Tests for kdd.infrastructure.git.diff.""" - -import subprocess -from pathlib import Path - -import pytest - -from kdd.infrastructure.git.diff import ( - DiffResult, - get_current_commit, - get_diff, - is_git_repo, - scan_files, -) - - -@pytest.fixture -def git_repo(tmp_path): - """Create a minimal git 
repository for testing.""" - subprocess.run(["git", "init"], cwd=tmp_path, capture_output=True, check=True) - subprocess.run( - ["git", "config", "user.email", "test@test.com"], - cwd=tmp_path, capture_output=True, check=True, - ) - subprocess.run( - ["git", "config", "user.name", "Test"], - cwd=tmp_path, capture_output=True, check=True, - ) - - # Create initial files - specs = tmp_path / "specs" / "01-domain" / "entities" - specs.mkdir(parents=True) - (specs / "Pedido.md").write_text("---\nkind: entity\n---\n# Pedido\n") - (specs / "Usuario.md").write_text("---\nkind: entity\n---\n# Usuario\n") - (tmp_path / "README.md").write_text("# Project\n") - - subprocess.run(["git", "add", "."], cwd=tmp_path, capture_output=True, check=True) - subprocess.run( - ["git", "commit", "-m", "initial"], - cwd=tmp_path, capture_output=True, check=True, - ) - return tmp_path - - -class TestIsGitRepo: - def test_valid_repo(self, git_repo): - assert is_git_repo(git_repo) is True - - def test_not_a_repo(self, tmp_path): - assert is_git_repo(tmp_path) is False - - -class TestGetCurrentCommit: - def test_returns_hash(self, git_repo): - commit = get_current_commit(git_repo) - assert commit is not None - assert len(commit) == 40 # full SHA - - def test_not_a_repo(self, tmp_path): - assert get_current_commit(tmp_path) is None - - -class TestScanFiles: - def test_all_files(self, git_repo): - files = scan_files(git_repo) - assert "README.md" in files - assert "specs/01-domain/entities/Pedido.md" in files - - def test_with_pattern(self, git_repo): - files = scan_files(git_repo, include_patterns=["specs/**/*.md"]) - assert "README.md" not in files - assert "specs/01-domain/entities/Pedido.md" in files - assert len(files) == 2 - - -class TestGetDiff: - def test_detect_added_file(self, git_repo): - base = get_current_commit(git_repo) - - # Add a new file - new_file = git_repo / "specs" / "01-domain" / "entities" / "Nuevo.md" - new_file.write_text("---\nkind: entity\n---\n# Nuevo\n") - 
subprocess.run(["git", "add", "."], cwd=git_repo, capture_output=True, check=True) - subprocess.run( - ["git", "commit", "-m", "add"], - cwd=git_repo, capture_output=True, check=True, - ) - - diff = get_diff(git_repo, base) - assert "specs/01-domain/entities/Nuevo.md" in diff.added - assert diff.deleted == [] - - def test_detect_modified_file(self, git_repo): - base = get_current_commit(git_repo) - - # Modify existing file - pedido = git_repo / "specs" / "01-domain" / "entities" / "Pedido.md" - pedido.write_text("---\nkind: entity\n---\n# Pedido\n\nModified.\n") - subprocess.run(["git", "add", "."], cwd=git_repo, capture_output=True, check=True) - subprocess.run( - ["git", "commit", "-m", "modify"], - cwd=git_repo, capture_output=True, check=True, - ) - - diff = get_diff(git_repo, base) - assert "specs/01-domain/entities/Pedido.md" in diff.modified - - def test_detect_deleted_file(self, git_repo): - base = get_current_commit(git_repo) - - # Delete a file - (git_repo / "specs" / "01-domain" / "entities" / "Usuario.md").unlink() - subprocess.run(["git", "add", "."], cwd=git_repo, capture_output=True, check=True) - subprocess.run( - ["git", "commit", "-m", "delete"], - cwd=git_repo, capture_output=True, check=True, - ) - - diff = get_diff(git_repo, base) - assert "specs/01-domain/entities/Usuario.md" in diff.deleted - - def test_with_include_pattern(self, git_repo): - base = get_current_commit(git_repo) - - # Add files in and out of pattern - (git_repo / "README.md").write_text("# Modified\n") - new_spec = git_repo / "specs" / "01-domain" / "entities" / "Nuevo.md" - new_spec.write_text("---\nkind: entity\n---\n") - subprocess.run(["git", "add", "."], cwd=git_repo, capture_output=True, check=True) - subprocess.run( - ["git", "commit", "-m", "mixed"], - cwd=git_repo, capture_output=True, check=True, - ) - - diff = get_diff(git_repo, base, include_patterns=["specs/**/*.md"]) - all_files = diff.added + diff.modified + diff.deleted - assert all("specs/" in f for f in 
all_files) - - def test_no_changes(self, git_repo): - base = get_current_commit(git_repo) - diff = get_diff(git_repo, base) - assert diff.added == [] - assert diff.modified == [] - assert diff.deleted == [] diff --git a/tests/v2/infrastructure/test_hnswlib_store.py b/tests/v2/infrastructure/test_hnswlib_store.py deleted file mode 100644 index 2c30047..0000000 --- a/tests/v2/infrastructure/test_hnswlib_store.py +++ /dev/null @@ -1,93 +0,0 @@ -"""Tests for HNSWLibVectorStore.""" - -from datetime import datetime - -import pytest - -from kdd.domain.entities import Embedding -from kdd.domain.enums import KDDKind -from kdd.infrastructure.vector.hnswlib_store import HNSWLibVectorStore - - -def _embedding(id: str, vector: list[float], doc_id: str = "doc1") -> Embedding: - return Embedding( - id=id, - document_id=doc_id, - document_kind=KDDKind.ENTITY, - section_path="Descripción", - chunk_index=0, - raw_text="test text", - context_text="test context", - vector=vector, - model="test-model", - dimensions=len(vector), - text_hash="abc123", - generated_at=datetime.now(), - ) - - -@pytest.fixture -def store(): - return HNSWLibVectorStore() - - -@pytest.fixture -def loaded_store(): - """Store with 4 vectors in 3D space.""" - s = HNSWLibVectorStore() - embeddings = [ - _embedding("doc1:chunk-0", [1.0, 0.0, 0.0], "doc1"), - _embedding("doc2:chunk-0", [0.9, 0.1, 0.0], "doc2"), - _embedding("doc3:chunk-0", [0.0, 1.0, 0.0], "doc3"), - _embedding("doc4:chunk-0", [0.0, 0.0, 1.0], "doc4"), - ] - s.load(embeddings) - return s - - -class TestLoad: - def test_empty_load(self, store): - store.load([]) - assert store.size == 0 - - def test_load_sets_dimensions(self, loaded_store): - assert loaded_store.dimensions == 3 - assert loaded_store.size == 4 - - -class TestSearch: - def test_find_nearest(self, loaded_store): - # Query near doc1 [1, 0, 0] — doc1 and doc2 should be closest - results = loaded_store.search([1.0, 0.0, 0.0], limit=2) - assert len(results) == 2 - ids = [r[0] for r in 
results] - assert "doc1:chunk-0" in ids - assert "doc2:chunk-0" in ids - - def test_scores_are_sorted_descending(self, loaded_store): - results = loaded_store.search([1.0, 0.0, 0.0], limit=4) - scores = [r[1] for r in results] - assert scores == sorted(scores, reverse=True) - - def test_min_score_filter(self, loaded_store): - results = loaded_store.search([1.0, 0.0, 0.0], limit=4, min_score=0.9) - # Only doc1 (exact match, score=1.0) and doc2 (very similar) should pass - assert all(score >= 0.9 for _, score in results) - - def test_high_min_score_filters_all(self, loaded_store): - results = loaded_store.search([0.5, 0.5, 0.5], limit=4, min_score=0.999) - # No exact match exists - assert len(results) <= 1 # might get one close match - - def test_empty_store_returns_empty(self, store): - results = store.search([1.0, 0.0, 0.0], limit=5) - assert results == [] - - def test_limit_respected(self, loaded_store): - results = loaded_store.search([1.0, 0.0, 0.0], limit=1) - assert len(results) == 1 - - def test_orthogonal_query(self, loaded_store): - # Query [0, 0, 1] should match doc4 best - results = loaded_store.search([0.0, 0.0, 1.0], limit=1) - assert results[0][0] == "doc4:chunk-0" diff --git a/tests/v2/infrastructure/test_markdown.py b/tests/v2/infrastructure/test_markdown.py deleted file mode 100644 index 5237145..0000000 --- a/tests/v2/infrastructure/test_markdown.py +++ /dev/null @@ -1,136 +0,0 @@ -"""Tests for kdd.infrastructure.parsing.markdown.""" - -from kdd.infrastructure.parsing.markdown import ( - extract_frontmatter, - extract_snippet, - heading_to_anchor, - parse_markdown_sections, -) - - -class TestExtractFrontmatter: - def test_valid_frontmatter(self): - content = "---\nkind: entity\naliases: [Orden]\n---\n\n# Pedido\n\nBody." 
- meta, body = extract_frontmatter(content) - assert meta["kind"] == "entity" - assert meta["aliases"] == ["Orden"] - assert body.strip().startswith("# Pedido") - - def test_no_frontmatter(self): - content = "# Just a heading\n\nSome text." - meta, body = extract_frontmatter(content) - assert meta == {} - assert "Just a heading" in body - - def test_empty_string(self): - meta, body = extract_frontmatter("") - assert meta == {} - assert body == "" - - def test_invalid_yaml(self): - content = "---\n: bad: yaml: here\n---\nBody" - meta, body = extract_frontmatter(content) - # Should not crash — returns something (depends on frontmatter lib) - assert isinstance(meta, dict) - - -class TestParseMarkdownSections: - def test_single_section(self): - content = "# Title\n\nParagraph one.\n\nParagraph two." - sections = parse_markdown_sections(content) - assert len(sections) == 1 - assert sections[0].heading == "Title" - assert sections[0].level == 1 - assert "Paragraph one." in sections[0].content - - def test_nested_sections(self): - content = ( - "# Top\n\nIntro.\n\n" - "## Child\n\nChild text.\n\n" - "### Grandchild\n\nDeep text." - ) - sections = parse_markdown_sections(content) - assert len(sections) == 3 - assert sections[0].heading == "Top" - assert sections[1].heading == "Child" - assert sections[2].heading == "Grandchild" - # Path should be hierarchical - assert sections[2].path == "top.child.grandchild" - - def test_sibling_sections(self): - content = "## A\n\nText A.\n\n## B\n\nText B." 
- sections = parse_markdown_sections(content) - assert len(sections) == 2 - assert sections[0].heading == "A" - assert sections[1].heading == "B" - # Siblings should have independent paths - assert sections[0].path == "a" - assert sections[1].path == "b" - - def test_empty_content(self): - sections = parse_markdown_sections("") - assert sections == [] - - def test_content_without_headings(self): - sections = parse_markdown_sections("Just some text without headings.") - assert sections == [] - - def test_heading_with_special_chars(self): - content = "## Descripción (v2)\n\nContent." - sections = parse_markdown_sections(content) - assert sections[0].heading == "Descripción (v2)" - - def test_real_entity_structure(self): - content = ( - "## Descripción\n\nEntity description.\n\n" - "## Atributos\n\n| Name | Type |\n|---|---|\n| id | uuid |\n\n" - "## Relaciones\n\nRelation info.\n\n" - "## Invariantes\n\n- Invariant 1." - ) - sections = parse_markdown_sections(content) - assert len(sections) == 4 - headings = [s.heading for s in sections] - assert headings == ["Descripción", "Atributos", "Relaciones", "Invariantes"] - - -class TestHeadingToAnchor: - def test_simple(self): - assert heading_to_anchor("Atributos") == "atributos" - - def test_spaces(self): - assert heading_to_anchor("Ciclo de Vida") == "ciclo-de-vida" - - def test_special_chars(self): - assert heading_to_anchor("Entity: User") == "entity-user" - - def test_parentheses(self): - assert heading_to_anchor("Estados (v2)") == "estados-v2" - - def test_accented(self): - assert heading_to_anchor("Descripción") == "descripcion" - - -class TestExtractSnippet: - def test_short_content(self): - assert extract_snippet("Hello world.") == "Hello world." - - def test_truncation_at_sentence(self): - # ". " must be past the halfway mark of max_length for sentence truncation - text = "This is a fairly long first sentence. 
" + "A" * 200 - snippet = extract_snippet(text, max_length=60) - assert snippet == "This is a fairly long first sentence." - - def test_truncation_at_word(self): - text = "word " * 100 - snippet = extract_snippet(text, max_length=50) - assert snippet.endswith("...") - assert len(snippet) <= 55 # some margin for "..." - - def test_strips_markdown(self): - text = "**Bold** and *italic* and [link](http://x.com)" - snippet = extract_snippet(text) - assert "**" not in snippet - assert "*" not in snippet - assert "http" not in snippet - assert "Bold" in snippet - assert "link" in snippet diff --git a/tests/v2/infrastructure/test_networkx_store.py b/tests/v2/infrastructure/test_networkx_store.py deleted file mode 100644 index 5687d0d..0000000 --- a/tests/v2/infrastructure/test_networkx_store.py +++ /dev/null @@ -1,239 +0,0 @@ -"""Tests for NetworkXGraphStore.""" - -from datetime import datetime - -import pytest - -from kdd.domain.entities import GraphEdge, GraphNode -from kdd.domain.enums import KDDKind, KDDLayer -from kdd.infrastructure.graph.networkx_store import NetworkXGraphStore - - -# --------------------------------------------------------------------------- -# Fixtures -# --------------------------------------------------------------------------- - -def _node(id: str, kind: KDDKind = KDDKind.ENTITY, layer: KDDLayer = KDDLayer.DOMAIN, **fields) -> GraphNode: - return GraphNode( - id=id, - kind=kind, - source_file=f"{id}.md", - source_hash="abc123", - layer=layer, - indexed_fields=fields, - ) - - -def _edge( - from_node: str, - to_node: str, - edge_type: str = "WIKI_LINK", - violation: bool = False, -) -> GraphEdge: - return GraphEdge( - from_node=from_node, - to_node=to_node, - edge_type=edge_type, - source_file="test.md", - extraction_method="wiki_link", - layer_violation=violation, - ) - - -@pytest.fixture -def store(): - return NetworkXGraphStore() - - -@pytest.fixture -def loaded_store(): - """A store with a small graph: - Entity:KDDDocument -> 
BR:BR-DOCUMENT-001 -> UC:UC-001 - -> CMD:CMD-001 -> UC:UC-001 - Plus a violation edge from Entity -> REQ (lower -> higher) - """ - s = NetworkXGraphStore() - nodes = [ - _node("Entity:KDDDocument", KDDKind.ENTITY, KDDLayer.DOMAIN, title="KDDDocument"), - _node("BR:BR-DOCUMENT-001", KDDKind.BUSINESS_RULE, KDDLayer.DOMAIN, title="Kind Router"), - _node("CMD:CMD-001", KDDKind.COMMAND, KDDLayer.BEHAVIOR, title="IndexDocument"), - _node("UC:UC-001", KDDKind.USE_CASE, KDDLayer.BEHAVIOR, title="IndexDocument"), - _node("REQ:REQ-001", KDDKind.REQUIREMENT, KDDLayer.VERIFICATION, title="Performance"), - ] - edges = [ - _edge("Entity:KDDDocument", "BR:BR-DOCUMENT-001", "ENTITY_RULE"), - _edge("CMD:CMD-001", "Entity:KDDDocument", "WIKI_LINK"), - _edge("UC:UC-001", "CMD:CMD-001", "UC_EXECUTES_CMD"), - _edge("UC:UC-001", "BR:BR-DOCUMENT-001", "UC_APPLIES_RULE"), - _edge("REQ:REQ-001", "UC:UC-001", "REQ_TRACES_TO"), - # Layer violation: domain -> verification - _edge("Entity:KDDDocument", "REQ:REQ-001", "WIKI_LINK", violation=True), - ] - s.load(nodes, edges) - return s - - -# --------------------------------------------------------------------------- -# Tests -# --------------------------------------------------------------------------- - -class TestLoad: - def test_empty_load(self, store): - store.load([], []) - assert store.node_count() == 0 - assert store.edge_count() == 0 - - def test_load_nodes_and_edges(self, loaded_store): - assert loaded_store.node_count() == 5 - assert loaded_store.edge_count() == 6 - - def test_reload_clears_previous(self, loaded_store): - loaded_store.load([], []) - assert loaded_store.node_count() == 0 - - -class TestGetNode: - def test_existing_node(self, loaded_store): - node = loaded_store.get_node("Entity:KDDDocument") - assert node is not None - assert node.kind == KDDKind.ENTITY - - def test_missing_node(self, loaded_store): - assert loaded_store.get_node("Entity:Missing") is None - - def test_has_node(self, loaded_store): - assert 
loaded_store.has_node("Entity:KDDDocument") - assert not loaded_store.has_node("Entity:Missing") - - -class TestTraverse: - def test_depth_1_from_entity(self, loaded_store): - nodes, edges = loaded_store.traverse("Entity:KDDDocument", depth=1) - node_ids = {n.id for n in nodes} - assert "Entity:KDDDocument" in node_ids - assert "BR:BR-DOCUMENT-001" in node_ids - assert "CMD:CMD-001" in node_ids - # UC-001 is 2 hops away, should not be found at depth 1 - assert "UC:UC-001" not in node_ids - - def test_depth_2_reaches_further(self, loaded_store): - nodes, edges = loaded_store.traverse("Entity:KDDDocument", depth=2) - node_ids = {n.id for n in nodes} - assert "UC:UC-001" in node_ids - - def test_edge_type_filter(self, loaded_store): - nodes, edges = loaded_store.traverse( - "Entity:KDDDocument", depth=2, edge_types=["ENTITY_RULE"] - ) - node_ids = {n.id for n in nodes} - assert "BR:BR-DOCUMENT-001" in node_ids - # CMD-001 connected via WIKI_LINK, should be filtered out - assert "CMD:CMD-001" not in node_ids - - def test_respect_layers_excludes_violations(self, loaded_store): - nodes, _ = loaded_store.traverse( - "Entity:KDDDocument", depth=1, respect_layers=True - ) - node_ids = {n.id for n in nodes} - # REQ-001 connected via violation edge, should be excluded - assert "REQ:REQ-001" not in node_ids - - def test_ignore_layers_includes_violations(self, loaded_store): - nodes, _ = loaded_store.traverse( - "Entity:KDDDocument", depth=1, respect_layers=False - ) - node_ids = {n.id for n in nodes} - assert "REQ:REQ-001" in node_ids - - def test_unknown_root_returns_empty(self, loaded_store): - nodes, edges = loaded_store.traverse("Entity:Missing", depth=2) - assert nodes == [] - assert edges == [] - - -class TestTextSearch: - def test_search_by_title(self, loaded_store): - results = loaded_store.text_search("Kind Router") - assert len(results) == 1 - assert results[0].id == "BR:BR-DOCUMENT-001" - - def test_case_insensitive(self, loaded_store): - results = 
loaded_store.text_search("kind router") - assert len(results) == 1 - - def test_search_by_node_id(self, loaded_store): - results = loaded_store.text_search("CMD-001") - assert any(n.id == "CMD:CMD-001" for n in results) - - def test_search_no_match(self, loaded_store): - results = loaded_store.text_search("nonexistent-term-xyz") - assert len(results) == 0 - - def test_search_with_field_filter(self, loaded_store): - results = loaded_store.text_search("Performance", fields=["title"]) - assert len(results) == 1 - assert results[0].id == "REQ:REQ-001" - - -class TestNeighbors: - def test_neighbors_includes_both_directions(self, loaded_store): - neighbors = loaded_store.neighbors("BR:BR-DOCUMENT-001") - neighbor_ids = {n.id for n in neighbors} - # Entity -> BR (outgoing from entity = incoming to BR) - assert "Entity:KDDDocument" in neighbor_ids - # UC-001 -> BR (outgoing from UC = incoming to BR) - assert "UC:UC-001" in neighbor_ids - - def test_missing_node_returns_empty(self, loaded_store): - assert loaded_store.neighbors("Entity:Missing") == [] - - -class TestEdgeQueries: - def test_incoming_edges(self, loaded_store): - incoming = loaded_store.incoming_edges("BR:BR-DOCUMENT-001") - assert len(incoming) == 2 # Entity->BR and UC->BR - from_ids = {e.from_node for e in incoming} - assert "Entity:KDDDocument" in from_ids - assert "UC:UC-001" in from_ids - - def test_outgoing_edges(self, loaded_store): - outgoing = loaded_store.outgoing_edges("UC:UC-001") - assert len(outgoing) == 2 # UC->CMD and UC->BR - to_ids = {e.to_node for e in outgoing} - assert "CMD:CMD-001" in to_ids - assert "BR:BR-DOCUMENT-001" in to_ids - - def test_all_edges(self, loaded_store): - assert len(loaded_store.all_edges()) == 6 - - def test_find_violations(self, loaded_store): - violations = loaded_store.find_violations() - assert len(violations) == 1 - assert violations[0].from_node == "Entity:KDDDocument" - assert violations[0].to_node == "REQ:REQ-001" - - -class TestReverseTraverse: - def 
test_reverse_from_entity(self, loaded_store): - """CMD-001 references Entity:KDDDocument, so it's a dependent.""" - results = loaded_store.reverse_traverse("Entity:KDDDocument", depth=2) - dependent_ids = {node.id for node, _ in results} - assert "CMD:CMD-001" in dependent_ids - - def test_reverse_with_path(self, loaded_store): - results = loaded_store.reverse_traverse("Entity:KDDDocument", depth=3) - for node, path in results: - if node.id == "UC:UC-001": - # UC-001 -> CMD-001 -> Entity via incoming edges - assert len(path) >= 1 - break - - def test_reverse_from_leaf(self, loaded_store): - """REQ-001 has an incoming violation edge from Entity:KDDDocument.""" - results = loaded_store.reverse_traverse("REQ:REQ-001", depth=2) - dependent_ids = {node.id for node, _ in results} - assert "Entity:KDDDocument" in dependent_ids - - def test_reverse_missing_node(self, loaded_store): - results = loaded_store.reverse_traverse("Entity:Missing", depth=2) - assert results == [] diff --git a/tests/v2/infrastructure/test_wiki_links.py b/tests/v2/infrastructure/test_wiki_links.py deleted file mode 100644 index 59e2f6f..0000000 --- a/tests/v2/infrastructure/test_wiki_links.py +++ /dev/null @@ -1,81 +0,0 @@ -"""Tests for kdd.infrastructure.parsing.wiki_links.""" - -from kdd.infrastructure.parsing.wiki_links import ( - WikiLink, - extract_wiki_link_targets, - extract_wiki_links, -) - - -class TestExtractWikiLinks: - def test_simple_link(self): - links = extract_wiki_links("See [[KDDDocument]] for details.") - assert len(links) == 1 - assert links[0].target == "KDDDocument" - assert links[0].domain is None - assert links[0].alias is None - - def test_multiple_links(self): - content = "The [[KDDDocument]] produces a [[GraphNode]] and [[GraphEdge|edges]]." 
- links = extract_wiki_links(content) - assert len(links) == 3 - targets = [l.target for l in links] - assert "KDDDocument" in targets - assert "GraphNode" in targets - assert "GraphEdge" in targets - - def test_cross_domain_link(self): - links = extract_wiki_links("References [[auth::Usuario]] entity.") - assert len(links) == 1 - assert links[0].target == "Usuario" - assert links[0].domain == "auth" - assert links[0].alias is None - - def test_alias_link(self): - links = extract_wiki_links("The [[GraphEdge|edges]] connect nodes.") - assert len(links) == 1 - assert links[0].target == "GraphEdge" - assert links[0].alias == "edges" - assert links[0].domain is None - - def test_cross_domain_with_alias(self): - links = extract_wiki_links("See [[payments::Pedido|order]].") - assert len(links) == 1 - assert links[0].target == "Pedido" - assert links[0].domain == "payments" - assert links[0].alias == "order" - - def test_no_links(self): - links = extract_wiki_links("No links here, just [markdown](http://x.com).") - assert links == [] - - def test_empty_brackets(self): - links = extract_wiki_links("Empty [[ ]] should be skipped.") - assert links == [] - - def test_multiline(self): - content = "Line one [[A]].\nLine two [[B]].\nLine three [[C]]." 
- links = extract_wiki_links(content) - assert len(links) == 3 - - def test_event_reference(self): - links = extract_wiki_links("Emits [[EVT-Pedido-Confirmado]].") - assert links[0].target == "EVT-Pedido-Confirmado" - - def test_spec_id_reference(self): - links = extract_wiki_links("See [[BR-DOCUMENT-001]] and [[UC-001-IndexDocument]].") - assert len(links) == 2 - targets = [l.target for l in links] - assert "BR-DOCUMENT-001" in targets - assert "UC-001-IndexDocument" in targets - - def test_frozen(self): - links = extract_wiki_links("[[A]]") - import dataclasses - assert dataclasses.is_dataclass(links[0]) - - -class TestExtractWikiLinkTargets: - def test_returns_flat_list(self): - targets = extract_wiki_link_targets("[[A]] and [[B|alias]] and [[d::C]]") - assert targets == ["A", "B", "C"] diff --git a/tests/v2/test_container.py b/tests/v2/test_container.py deleted file mode 100644 index c9b6fdc..0000000 --- a/tests/v2/test_container.py +++ /dev/null @@ -1,56 +0,0 @@ -"""Tests for the DI container (container.py).""" - -from __future__ import annotations - -from pathlib import Path -from unittest.mock import patch - -import pytest - -from kdd.container import Container, create_container -from kdd.domain.enums import IndexLevel - - -class TestContainerCreation: - def test_creates_l1_container_without_deps(self, tmp_path): - """When sentence-transformers is not installed, container is L1.""" - specs = tmp_path / "specs" - specs.mkdir() - - with patch.dict("sys.modules", {"sentence_transformers": None}): - container = create_container(specs) - - assert isinstance(container, Container) - assert container.index_level == IndexLevel.L1 - assert container.embedding_model is None - assert container.vector_store is None - assert container.specs_root == specs - - def test_default_index_path(self, tmp_path): - specs = tmp_path / "specs" - specs.mkdir() - - with patch.dict("sys.modules", {"sentence_transformers": None}): - container = create_container(specs) - - # Default: 
parent of specs_root / .kdd-index - assert container.index_path == tmp_path / ".kdd-index" - - def test_custom_index_path(self, tmp_path): - specs = tmp_path / "specs" - specs.mkdir() - custom = tmp_path / "custom-idx" - - with patch.dict("sys.modules", {"sentence_transformers": None}): - container = create_container(specs, custom) - - assert container.index_path == custom - - def test_ensure_loaded_returns_false_without_index(self, tmp_path): - specs = tmp_path / "specs" - specs.mkdir() - - with patch.dict("sys.modules", {"sentence_transformers": None}): - container = create_container(specs) - - assert container.ensure_loaded() is False diff --git a/tsconfig.json b/tsconfig.json new file mode 100644 index 0000000..7664559 --- /dev/null +++ b/tsconfig.json @@ -0,0 +1,22 @@ +{ + "compilerOptions": { + "lib": ["ESNext"], + "target": "ESNext", + "module": "Preserve", + "moduleDetection": "force", + "jsx": "react-jsx", + "allowJs": true, + "moduleResolution": "bundler", + "allowImportingTsExtensions": true, + "verbatimModuleSyntax": true, + "noEmit": true, + "strict": true, + "skipLibCheck": true, + "noFallthroughCasesInSwitch": true, + "noUncheckedIndexedAccess": true, + "noImplicitOverride": true, + "noUnusedLocals": false, + "noUnusedParameters": false, + "noPropertyAccessFromIndexSignature": false + } +}