Skip to content
Open
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
0a77a9d
feat: implement documents embed CLI command
priyankeshh Aug 24, 2025
400b067
demo: add comprehensive demonstration of embed command functionality
priyankeshh Aug 24, 2025
93c7d02
refactor: organize test files and create integration testing suite
priyankeshh Aug 24, 2025
2f63419
docs: add comprehensive testing documentation and instructions
priyankeshh Aug 24, 2025
b5b1d33
test: complete end-to-end validation of embed functionality with real…
priyankeshh Aug 24, 2025
b06f6df
feat: implement configurable embedding system with random vectors
priyankeshh Aug 24, 2025
8fc7d08
feat: enhance embedding functionality and improve document processing
JonnyTran Aug 24, 2025
b1ffa82
Apply ruff formatting
priyankeshh Aug 27, 2025
95dd534
Add PyMuPDF integration with document metadata in extralit-server
priyankeshh Aug 27, 2025
dcf53e0
fix: address review feedback - proper dataset creation, remove test f…
priyankeshh Aug 27, 2025
8138efd
refactor: minimize code and remove llama-index dependency
priyankeshh Aug 29, 2025
7a765d2
style: apply pre-commit formatting
priyankeshh Aug 29, 2025
d875219
fix: simplify RQ job to use HF space service for margin lookup
priyankeshh Aug 29, 2025
eb415ee
fix: remove redundant PyMuPDF job definition from extralit-server
priyankeshh Sep 1, 2025
1b39f1f
fix: simplify table context to follow existing workflow patterns
priyankeshh Sep 1, 2025
7321381
fix: fetch analysis metadata from document for table extraction
priyankeshh Sep 1, 2025
fd1b9e9
Merge branch 'develop' into feat/document-embedding-cli
priyankeshh Sep 1, 2025
9faff49
chore: remove redundant ocr/tables.py (functionality already in workf…
priyankeshh Sep 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,4 +158,7 @@ extralit-server/src/extralit_server/static
extralit/site

# Development files
**/*.db
**/*.db

# PDF files
*.pdf
15 changes: 15 additions & 0 deletions extralit-server/src/extralit_ocr/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright 2024-present, Extralit Labs, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# extralit_ocr package for PyMuPDF integration
155 changes: 155 additions & 0 deletions extralit-server/src/extralit_ocr/jobs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
# Copyright 2024-present, Extralit Labs, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
RQ jobs for PDF text extraction using PyMuPDF with document metadata integration.

This module provides the job functions called by extralit-server workflows.
It imports existing metadata classes and uses margin information from document metadata.
"""

import logging
import os
from typing import Any
from uuid import UUID

import requests
from rq import get_current_job

# Import existing metadata classes from extralit-server
try:
from extralit_server.api.schemas.v1.document.metadata import LayoutAnalysisMetadata
from extralit_server.database import SyncSessionLocal
from extralit_server.models.database import Document
except ImportError:
# Fallback for testing/development
LayoutAnalysisMetadata = None
SyncSessionLocal = None
Document = None

LOGGER = logging.getLogger(__name__)


def get_document_margins(document_id: UUID) -> tuple[int, int, int, int]:
    """
    Retrieve margin information from document metadata in the database.

    The margins are read from the document's ``metadata_`` mapping at
    ``analysis_metadata -> layout_analysis -> margin_analysis -> estimated_margins``
    using the keys ``left_px``, ``top_px``, ``right_px`` and ``bottom_px``.
    Values are returned exactly as stored; no unit conversion is applied.

    NOTE(review): the stored keys are suffixed ``_px`` while this function has
    been documented as returning PDF points. Confirm which unit the consumer
    of these margins expects.

    Args:
        document_id: UUID of the document

    Returns:
        Tuple of (left, top, right, bottom) margins. The defaults
        ``(0, 50, 0, 30)`` are returned when the database classes could not be
        imported, the document or its metadata is missing, no estimated
        margins are stored, or the lookup raises. A single side whose stored
        value is absent, null or not a number falls back to that side's default.
    """
    default_margins = (0, 50, 0, 30)  # Default margins if no metadata available
    side_keys = ("left_px", "top_px", "right_px", "bottom_px")

    # The project imports are optional at module load time (testing/development).
    if not SyncSessionLocal or not Document:
        LOGGER.info("Database classes not available, using default margins for %s", document_id)
        return default_margins

    try:
        with SyncSessionLocal() as db:
            document = db.query(Document).filter(Document.id == document_id).first()
            if not document or not document.metadata_:
                LOGGER.info("No metadata found for document %s, using default margins", document_id)
                return default_margins

            # Walk down to the margin analysis section of the document metadata.
            analysis_metadata = document.metadata_.get("analysis_metadata", {})
            layout_analysis = analysis_metadata.get("layout_analysis", {})
            margin_analysis = layout_analysis.get("margin_analysis", {})

            if margin_analysis and "estimated_margins" in margin_analysis:
                estimated_margins = margin_analysis["estimated_margins"]

                resolved = []
                for key, fallback in zip(side_keys, default_margins):
                    value = estimated_margins.get(key)
                    if value is None:
                        # Missing key or JSON null: use this side's default.
                        value = fallback
                    elif isinstance(value, bool) or not isinstance(value, (int, float)):
                        # Never hand a non-numeric margin to the caller.
                        LOGGER.warning(
                            "Ignoring non-numeric %s=%r for document %s, using default %s",
                            key,
                            value,
                            document_id,
                            fallback,
                        )
                        value = fallback
                    resolved.append(value)

                margins = tuple(resolved)
                LOGGER.info("Using document-specific margins for %s: %s", document_id, margins)
                return margins

    except Exception as e:
        # Best effort: a failed lookup must not abort the caller, so fall
        # through to the defaults but keep the traceback for diagnosis.
        LOGGER.warning("Error retrieving margins for document %s: %s", document_id, e, exc_info=True)

    LOGGER.info("Using default margins for document %s: %s", document_id, default_margins)
    return default_margins


def pymupdf_to_markdown_job(
    document_id: UUID,
    s3_url: str,
    filename: str,
    job_metadata: dict[str, Any],
    workspace_name: str,
) -> dict[str, Any]:
    """
    RQ job to extract markdown from a PDF through the extraction service.

    The PDF is downloaded from ``s3_url`` over HTTP and uploaded to the
    ``/extract_with_document/{document_id}`` endpoint of the service configured
    by the ``EXTRALIT_HF_SPACE_URL`` environment variable (default
    ``http://localhost:8000``). The service is expected to look up the
    document-specific margins itself; this function only relays the
    ``margins_used``, ``markdown`` and ``metadata`` fields of its JSON response.

    Failures are not raised: any exception is logged and reported through an
    error dictionary, so the job itself always returns normally.

    Args:
        document_id: UUID of the document being processed
        s3_url: URL of the PDF file, fetched with an HTTP GET
        filename: Original filename, forwarded as the upload's file name
        job_metadata: Additional job metadata (currently not read by this function)
        workspace_name: Name of the workspace, echoed in the result

    Returns:
        On success, a dictionary with ``status`` set to ``"success"`` plus the
        document ID, filename, margins used, extraction method, job ID,
        workspace name, markdown and metadata. On failure, a dictionary with
        ``status`` set to ``"error"``, the document ID, the error message and
        the job ID.
    """
    job = get_current_job()
    job_id = job.id if job else None

    # (connect, read) timeouts in seconds. Without them a stalled download or
    # an unresponsive extraction service would block the worker indefinitely.
    request_timeout = (10, 300)

    LOGGER.info("Starting PyMuPDF extraction for document %s", document_id)

    try:
        # Strip a trailing slash so the joined endpoint never contains "//".
        service_url = os.getenv("EXTRALIT_HF_SPACE_URL", "http://localhost:8000").rstrip("/")

        # Download the PDF
        pdf_response = requests.get(s3_url, timeout=request_timeout)
        pdf_response.raise_for_status()

        # Call extraction service with document ID (service will fetch margins from DB)
        files = {"pdf": (filename, pdf_response.content, "application/pdf")}

        extract_response = requests.post(
            f"{service_url}/extract_with_document/{document_id}",
            files=files,
            timeout=request_timeout,
        )
        extract_response.raise_for_status()
        extraction_result = extract_response.json()

        result = {
            "status": "success",
            "document_id": str(document_id),
            "filename": filename,
            "margins_used": extraction_result.get("margins_used"),
            "extraction_method": "pymupdf4llm",
            "job_id": job_id,
            "workspace_name": workspace_name,
            "markdown": extraction_result.get("markdown"),
            "metadata": extraction_result.get("metadata"),
        }

        LOGGER.info("PyMuPDF extraction completed for document %s", document_id)
        return result

    except Exception as e:
        # Job boundary: report the failure in the result instead of raising,
        # but keep the traceback in the logs.
        LOGGER.exception("PyMuPDF extraction failed for document %s: %s", document_id, e)
        return {
            "status": "error",
            "document_id": str(document_id),
            "error": str(e),
            "job_id": job_id,
        }
2 changes: 2 additions & 0 deletions extralit/src/extralit/cli/documents/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from extralit.cli.documents.add import add_document
from extralit.cli.documents.delete import delete_document
from extralit.cli.documents.embed import embed_documents
from extralit.cli.documents.import_bib import import_bib
from extralit.cli.documents.import_history import list_import_histories
from extralit.cli.documents.list import list_documents
Expand All @@ -28,6 +29,7 @@
app.command(name="add")(add_document)
app.command(name="import")(import_bib)
app.command(name="delete")(delete_document)
app.command(name="embed")(embed_documents)

# Import history commands - new structure
app.command(name="history")(list_import_histories)
Expand Down
Loading