2 changes: 2 additions & 0 deletions .gitignore
@@ -7,3 +7,5 @@ __pycache__


.chunks

.whoosh
5 changes: 4 additions & 1 deletion Makefile
@@ -1,4 +1,4 @@
.PHONY: all test mypy
.PHONY: all test mypy test_verbose

all: test mypy

@@ -7,3 +7,6 @@ test:

mypy:
mypy src/docs_buddy/ tests/

test_verbose:
pytest -s tests/
7 changes: 6 additions & 1 deletion pyproject.toml
@@ -17,7 +17,8 @@ classifiers = [
]
requires-python = ">=3.11"
dependencies = [
"python-frontmatter>=1.1.0"
"python-frontmatter>=1.1.0",
"Whoosh-Reloaded>=2.7.5",
]

[project.optional-dependencies]
@@ -36,3 +37,7 @@ disallow_incomplete_defs = true
[[tool.mypy.overrides]]
module = ["frontmatter.*"]
follow_untyped_imports = true

[[tool.mypy.overrides]]
module = ["whoosh.*"]
follow_untyped_imports = true
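
The new Whoosh-Reloaded dependency backs the `WhooshDocumentIndex` adapter this PR wires in below. As a rough sketch of the underlying Whoosh API it builds on (the schema and field names here are illustrative assumptions, not necessarily what the adapter uses):

```python
# Minimal Whoosh sketch; schema and field names are illustrative assumptions.
import os

from whoosh import index
from whoosh.fields import ID, TEXT, Schema
from whoosh.qparser import QueryParser

os.makedirs(".whoosh", exist_ok=True)  # matches the new .gitignore entry
schema = Schema(path=ID(stored=True), chunk=TEXT(stored=True))
ix = index.create_in(".whoosh", schema)

writer = ix.writer()
writer.add_document(path="DOCUMENTATION_AI_GUIDE.md", chunk="A deployment is ...")
writer.commit()

with ix.searcher() as searcher:
    query = QueryParser("chunk", ix.schema).parse("deployment")
    for hit in searcher.search(query):
        print(hit["path"])
```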
213 changes: 179 additions & 34 deletions src/docs_buddy/adapters/__init__.py
@@ -1,15 +1,18 @@
"""Docs buddy adapters reside here"""

from contextlib import contextmanager
from typing import Iterator
from typing import Iterator, Any
from pathlib import Path
import subprocess
import shutil
import tempfile
import json

import frontmatter

from docs_buddy.common import PathLike
from docs_buddy import domain
from .whoosh_index import WhooshDocumentIndex


class FakeRepoStorage:
@@ -93,39 +96,58 @@ def _update_repository(directory: str) -> None:
subprocess.run(["git", "pull"], cwd=directory, check=True, capture_output=True)


class FakeIntermediateStorage:
"""In memory implementation of intermediate storage provider"""

def __init__(self, destination):
self._destination = Path(destination)
self.sink = {}

def __repr__(self):
classname = type(self).__name__
return f"{classname}({self._destination!r})"

@contextmanager
def get_temp_location(self):
temp_location = str(self._destination) + ".tmp"
self.sink[temp_location] = {}
try:
yield temp_location
finally:
self.sink.pop(temp_location, None)

def replace_destination(self, temp_location: PathLike) -> None:
self.sink[str(self._destination)] = self.sink.pop(temp_location)


class FakeDocsStorage:
"""In-memory test implementation of DocsArtifactStorage protocol."""

def __init__(self, source: PathLike, destination: PathLike):
self._source = Path(source)
self._destination = Path(destination)
self._intermediate_storage = FakeIntermediateStorage(destination)
self.actions: list = []
self.read_paths: set = set()

self.sources = {
"src/content/Docs/index.md": SAMPLE_DOC_2,
"src/content/Development_Page/welcome/index.mdx": SAMPLE_DOC_1,
"src/content/Docs/index.md": _SAMPLE_DOC_2,
"src/content/Development_Page/welcome/index.mdx": _SAMPLE_DOC_1,
}

self.sink: dict = {}

def __repr__(self):
classname = type(self).__name__
return f"{classname}({self._source!r}, {self._destination!r})"

@property
def destination_sink(self):
return self.sink[str(self._destination)]
def sink(self):
return self._intermediate_storage.sink

@contextmanager
def get_temp_location(self):
temp_location = str(self._destination) + ".tmp"
self.sink[temp_location] = {}
self.actions.append(("MKDIR", temp_location))
try:
with self._intermediate_storage.get_temp_location() as temp_location:
self.actions.append(("MKDIR", temp_location))
yield temp_location
finally:
self.sink.pop(temp_location, None)

def get_source_paths(self) -> Iterator[PathLike]:
for k in self.sources.keys():
@@ -139,13 +161,129 @@ def read_from_source(self, nested_path: PathLike) -> str:
def write_to_location(
self, content: str, path: PathLike, base_dir: PathLike
) -> None:
self.sink[str(base_dir)][str(path)] = content
self._intermediate_storage.sink[str(base_dir)][str(path)] = content

def replace_destination(self, temp_location: PathLike) -> None:
self.sink[str(self._destination)] = self.sink.pop(temp_location)
self._intermediate_storage.replace_destination(temp_location)
self.actions.append(("RMRF", str(self._destination)))
self.actions.append(("MV", str(temp_location), str(self._destination)))


class FakeDocumentChunksPipeline:
"""Fake implementation of the document chunk pipeline"""

def __init__(self, source: PathLike, destination: PathLike):
self._source = source
self._destination = destination
self._intermediate_storage = FakeIntermediateStorage(destination)

self._chunks = [_SAMPLE_CHUNK_1, _SAMPLE_CHUNK_2]
self.actions: list = []

@property
def sink(self):
return self._intermediate_storage.sink

@contextmanager
def get_temp_location(self):
with self._intermediate_storage.get_temp_location() as temp_location:
self.actions.append(("MKDIR", temp_location))
yield temp_location

def replace_destination(self, temp_location: PathLike) -> None:
self._intermediate_storage.replace_destination(temp_location)
self.actions.append(("RMRF", str(self._destination)))
self.actions.append(("MV", str(temp_location), str(self._destination)))

def get_document_chunks(self):
return (
domain.DocumentChunk.fromstring(json.dumps(chunk)) for chunk in self._chunks
)


class FakeIndex:
"""Implements a document index for in memory testing"""

def __init__(self, pipeline):
self._pipeline = pipeline

def fit(self, chunks, destination):
"""Index document chunks in memory"""
self._pipeline.sink[destination] = list(chunks)
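
Taken together, these fakes compose the same way the real adapters are expected to: index into a temp location, then swap it into place. A minimal sketch of that flow in a test (the source and destination strings are illustrative):

```python
# Sketch of composing the fakes in a test; paths are illustrative.
from docs_buddy.adapters import FakeDocumentChunksPipeline, FakeIndex

pipeline = FakeDocumentChunksPipeline(source="docs/", destination=".chunks")
doc_index = FakeIndex(pipeline)

with pipeline.get_temp_location() as temp_location:
    doc_index.fit(pipeline.get_document_chunks(), temp_location)
    pipeline.replace_destination(temp_location)

assert pipeline.sink[".chunks"]  # chunks were swapped in under the destination
assert ("MV", ".chunks.tmp", ".chunks") in pipeline.actions
```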


class FileSystemIntermediateStorage:
"""File system implementation of the intermediate storage protocol"""

def __init__(self, destination: PathLike):
self._destination = Path(destination)

def __repr__(self):
classname = type(self).__name__
return f"{classname}({self._destination!r})"

@contextmanager
def get_temp_location(self, prefix=""):
"""
Create a temporary directory for atomic writes.

Yields:
Path to temporary directory
"""
with tempfile.TemporaryDirectory(
prefix=(prefix or f"{self._destination.name}_")
) as temp_dir:
yield Path(temp_dir)

def replace_destination(self, temp_location: PathLike) -> None:
"""Replaces the destination with the provided temp_location"""

temp_path = Path(temp_location)
dest_path = self._destination

if dest_path.exists():
shutil.rmtree(dest_path)

shutil.move(str(temp_path), str(dest_path))
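
A minimal usage sketch of the write-then-swap pattern this class provides (the destination path is illustrative):

```python
# Sketch: write into a temp dir, then swap it over the destination.
from docs_buddy.adapters import FileSystemIntermediateStorage

storage = FileSystemIntermediateStorage("build/chunks")
with storage.get_temp_location() as temp_dir:
    (temp_dir / "0.json").write_text("{}", encoding="utf-8")
    storage.replace_destination(temp_dir)
# build/chunks now holds the new files; TemporaryDirectory's cleanup
# tolerates the directory having already been moved away.
```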


class FileSystemDocumentChunksPipeline:
"""Implements the filesytem document chunks pipeline protocol"""

def __init__(
self,
source: PathLike,
destination: PathLike,
doc_extensions: tuple[str, ...] = ("json",),
):
self._source = Path(source)
self._dest = Path(destination)
self._intermediate_storage = FileSystemIntermediateStorage(destination)
self._doc_extensions = doc_extensions

def get_document_chunks(self) -> Iterator[domain.DocumentChunk]:
"""Yields chunked documents from the source"""
doc_paths = (
p
for ext in self._doc_extensions
for p in Path(self._source).rglob(f"*.{ext}")
)

for path in doc_paths:
content = _read_file(path)
chunk = domain.DocumentChunk.fromstring(content)
yield chunk

@contextmanager
def get_temp_location(self):
"""Yields a temp location by delegating to intermediate storage"""
with self._intermediate_storage.get_temp_location() as temp_location:
yield temp_location

def replace_destination(self, temp_location: PathLike) -> None:
"""Replace destination with temp location by delegating to intermediate storage"""
self._intermediate_storage.replace_destination(temp_location)
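
The pipeline reads one JSON object per `*.json` file under the source tree. A sketch of writing a chunk file that `get_document_chunks()` would pick up, with the key layout inferred from the sample chunks at the bottom of this file:

```python
# Sketch of a chunk file on disk; keys mirror the _SAMPLE_CHUNK_* dicts below.
import json
from pathlib import Path

chunk_file = Path(".chunks") / "13000.json"
chunk_file.parent.mkdir(parents=True, exist_ok=True)
chunk_file.write_text(
    json.dumps(
        {
            "chunk": "## What is a Deployment? ...",
            "index": 13000,
            "path": "DOCUMENTATION_AI_GUIDE.md",
            "metadata": {},
        }
    ),
    encoding="utf-8",
)
```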


class FileSystemDocsStorage:
"""File system implementation of DocsArtifactStorage protocol."""
@@ -167,6 +305,7 @@ def __init__(
self._destination = Path(destination)
self._source = Path(source)
self._doc_extensions = doc_extensions
self._intermediate_storage = FileSystemIntermediateStorage(destination)

def __repr__(self):
classname = type(self).__name__
@@ -180,9 +319,7 @@ def get_temp_location(self, prefix=""):
Yields:
Path to temporary directory
"""
with tempfile.TemporaryDirectory(
prefix=(prefix or f"{self._destination.name}_")
) as temp_dir:
with self._intermediate_storage.get_temp_location() as temp_dir:
yield Path(temp_dir)

def get_source_paths(self) -> Iterator[PathLike]:
@@ -212,7 +349,7 @@ def read_from_source(self, nested_path: PathLike) -> str:
Document content as string
"""
full_path = Path(self._source) / nested_path
return self._read_file(full_path)
return _read_file(full_path)

def write_to_location(
self, content: str, path: PathLike, base_dir: PathLike
@@ -231,32 +368,26 @@ def write_to_location(
def replace_destination(self, temp_location: PathLike) -> None:
"""Replaces the destination with the provided temp_location"""

temp_path = Path(temp_location)
dest_path = self._destination

if dest_path.exists():
shutil.rmtree(dest_path)

shutil.move(str(temp_path), str(dest_path))
self._intermediate_storage.replace_destination(temp_location)

@staticmethod
def _is_empty_dir(path: Path) -> bool:
"""Check if a directory exists and is empty."""
return path.is_dir() and not any(path.iterdir())

@staticmethod
def _read_file(path: Path, encoding: str = "utf-8") -> str:
"""Read file content with specified encoding."""
with open(path, "rt", encoding=encoding) as f:
return f.read()

@staticmethod
def _write_file(path: Path, content: str, encoding: str = "utf-8") -> None:
"""Write content to file with specified encoding."""
with open(path, "wt", encoding=encoding) as f:
f.write(content)


def _read_file(path: Path, encoding: str = "utf-8") -> str:
"""Read file content with specified encoding."""
with open(path, "rt", encoding=encoding) as f:
return f.read()


def frontmatter_metadata_extractor(text: str) -> tuple[dict, str]:
"""
Extract metadata from document text using frontmatter.
@@ -270,7 +401,7 @@ def frontmatter_metadata_extractor(text: str) -> tuple[dict, str]:
return frontmatter.parse(text)
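
A quick check of the extractor against one of the sample documents below (`frontmatter.parse` returns a `(metadata, content)` tuple):

```python
# Sketch: the sample doc is module-private, imported here only for illustration.
from docs_buddy.adapters import _SAMPLE_DOC_1, frontmatter_metadata_extractor

metadata, content = frontmatter_metadata_extractor(_SAMPLE_DOC_1)
assert metadata["title"] == "Open Source Community"
```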


SAMPLE_DOC_1 = """\
_SAMPLE_DOC_1 = """\
---
title: Open Source Community
description: Learn how Starlight can help you build greener documentation sites and reduce your carbon footprint.
@@ -295,7 +426,7 @@ def frontmatter_metadata_extractor(text: str) -> tuple[dict, str]:
<Calendar />
"""

SAMPLE_DOC_2 = """\
_SAMPLE_DOC_2 = """\
---
title: "Akash Network Documentation"
linkTitle: "Documentation"
@@ -323,3 +454,17 @@ def frontmatter_metadata_extractor(text: str) -> tuple[dict, str]:
- **[Deployment](/docs/developers/deployment)** - Console, CLI, SDKs, SDL, and AuthZ
- **[Contributing](/docs/developers/contributing)** - Contribute to Akash codebase and documentation
"""

_SAMPLE_CHUNK_1 = {
"chunk": '```markdown\n## What is a Deployment?\n\nA deployment is your application running on the Akash Network. When you \ncreate a deployment, you\'re requesting compute resources (CPU, RAM, storage) \nfrom providers on the network.\n\nThink of it like renting a server, but:\n- Pay only for what you use (per-block pricing)\n- Choose from multiple providers bidding on your request\n- Your app runs in an isolated container\n```\n\n### For Developers (Technical Users)\n\n**Audience:** Developers integrating Akash\n\n**Requirements:**\n- Assume CLI/programming familiarity\n- Focus on concepts and integration patterns\n- Provide multi-language examples (curl, Go, TypeScript)\n- Link to detailed API reference\n- Show best practices and common patterns\n- Include error handling\n\n**Tone:** Professional, technical, concise\n\n**Example:**\n```markdown\n## Query Providers via gRPC\n\nThe provider query service returns all registered providers and their attributes.\n\n\\```go\nclient, _ := provider.NewQueryClient(conn)\nres, _ := client.Providers(context.Background(), &provider.QueryProvidersRequest{})\n\\```\n\nFilter by attribute:\n\\```go\nreq := &provider.QueryProvidersRequest{\n Filters: &provider.ProviderFilters{\n Attributes: []*v1beta3.Attribute{\n {Key: "region", Value: "us-west"},\n },\n },\n}\n\\```\n```\n\n### For Providers (System Administrators)\n\n**Audience:** DevOps engineers, system administrators\n\n**Requirements:**\n- Assume Linux/Kubernetes knowledge\n- Be precise with commands and versions\n- Include all prerequisites\n- Provide verification steps\n- Add comprehensive troubleshooting\n- Emphasize security best practices\n- Provide automated solutions first, manual as fallback\n\n**Tone:** Direct, technical, security-conscious\n\n**Example:**\n```markdown\n## STEP 3: Configure Persistent Storage\n\nInstall Rook-Ceph for persistent storage classes (beta1, beta2, beta3).\n\n**Prerequisites:**\n- Dedicated drives (not partitions) on each worker node\n- Minimum 4 SSDs or 2 NVMe SSDs across cluster\n- Drives mus',
"index": 13000,
"path": "DOCUMENTATION_AI_GUIDE.md",
"metadata": {},
}

_SAMPLE_CHUNK_2 = {
"chunk": 'Providers(context.Background(), &provider.QueryProvidersRequest{})\n\\```\n\nFilter by attribute:\n\\```go\nreq := &provider.QueryProvidersRequest{\n Filters: &provider.ProviderFilters{\n Attributes: []*v1beta3.Attribute{\n {Key: "region", Value: "us-west"},\n },\n },\n}\n\\```\n```\n\n### For Providers (System Administrators)\n\n**Audience:** DevOps engineers, system administrators\n\n**Requirements:**\n- Assume Linux/Kubernetes knowledge\n- Be precise with commands and versions\n- Include all prerequisites\n- Provide verification steps\n- Add comprehensive troubleshooting\n- Emphasize security best practices\n- Provide automated solutions first, manual as fallback\n\n**Tone:** Direct, technical, security-conscious\n\n**Example:**\n```markdown\n## STEP 3: Configure Persistent Storage\n\nInstall Rook-Ceph for persistent storage classes (beta1, beta2, beta3).\n\n**Prerequisites:**\n- Dedicated drives (not partitions) on each worker node\n- Minimum 4 SSDs or 2 NVMe SSDs across cluster\n- Drives must be unformatted\n\n\\```bash\n# Verify available drives (should show no filesystem)\nlsblk -f\n\n# Expected: Empty FSTYPE column for target drives\n\\```\n\n**Important:** Do not use system drives or shared partitions. Rook-Ceph \nrequires exclusive access to raw block devices.\n```\n\n### For Node Operators (Blockchain Engineers)\n\n**Audience:** Blockchain node operators, validators\n\n**Requirements:**\n- Assume blockchain experience\n- Focus on node operations and security\n- Document upgrade procedures clearly\n- Include monitoring and alerting\n- Separate architecture (for devs) from operations (for ops)\n- Provide recovery procedures\n\n**Tone:** Technical, security-focused, precise\n\n**Example:**\n```markdown\n## Validator Security with TMKMS\n\nTMKMS (Tendermint Key Management System) separates your validator key \nfrom the node, adding a critical security layer.\n\n**Architecture:**\n- Validator node runs on Akash (no private key)\n- TMKMS runs on local machine (holds private key)\n- Stunnel provides encrypted c',
"index": 14000,
"path": "DOCUMENTATION_AI_GUIDE.md",
"metadata": {},
}
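
End to end, the file-system pipeline and the Whoosh index are expected to compose the same way the fakes above do. A minimal sketch, assuming `WhooshDocumentIndex` implements the same `fit(chunks, destination)` protocol as `FakeIndex` (its constructor is not shown in this diff, so the no-argument call is an assumption):

```python
# Sketch of a full index rebuild; WhooshDocumentIndex's constructor is assumed.
from docs_buddy.adapters import FileSystemDocumentChunksPipeline
from docs_buddy.adapters.whoosh_index import WhooshDocumentIndex

pipeline = FileSystemDocumentChunksPipeline(".chunks", ".whoosh")
doc_index = WhooshDocumentIndex()  # hypothetical no-arg constructor

with pipeline.get_temp_location() as temp_location:
    doc_index.fit(pipeline.get_document_chunks(), temp_location)
    pipeline.replace_destination(temp_location)
```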