Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 39 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -147,12 +147,14 @@ You can follow these steps to generate a PageIndex tree from a PDF document.
pip3 install --upgrade -r requirements.txt
```

### 2. Set your OpenAI API key
### 2. Set your API key

Create a `.env` file in the root directory and add your API key:

```bash
CHATGPT_API_KEY=your_openai_key_here
OPENAI_API_KEY=your_openai_key_here
# or
CHATGPT_API_KEY=your_openai_key_here # legacy, still supported
```

### 3. Run PageIndex on your PDF
Expand Down Expand Up @@ -189,7 +191,41 @@ python3 run_pageindex.py --md_path /path/to/your/document.md
> Note: in this function, we use "#" to determine node heading and their levels. For example, "##" is level 2, "###" is level 3, etc. Make sure your markdown file is formatted correctly. If your Markdown file was converted from a PDF or HTML, we don't recommend using this function, since most existing conversion tools cannot preserve the original hierarchy. Instead, use our [PageIndex OCR](https://pageindex.ai/blog/ocr), which is designed to preserve the original hierarchy, to convert the PDF to a markdown file and then use this function.
</details>

<!--
---

# 🐍 Python API

### Index & Retrieve

```python
from pageindex import PageIndexClient

client = PageIndexClient(workspace="~/.pageindex")

# Index a document (PDF or Markdown)
doc_id = client.index("path/to/document.pdf")

# Retrieve
client.get_document(doc_id) # metadata: name, type, page count
client.get_document_structure(doc_id) # full tree structure
client.get_page_content(doc_id, pages="5-7") # page content
```

### Agent-based QA (OpenAI Agents)

For a complete agent QA example using the [OpenAI Agents SDK](https://github.com/openai/openai-agents-python), see [`examples/openai_agents_demo.py`](examples/openai_agents_demo.py).

```bash
# Install optional dependency
pip install openai-agents

# Run the demo
python examples/openai_agents_demo.py
```

---

<!--
# ☁️ Improved Tree Generation with PageIndex OCR

This repo is designed for generating the PageIndex tree structure of simple PDFs, but many real-world use cases involve complex PDFs that are hard to parse with classic Python tools. Extracting high-quality text from PDF documents remains a non-trivial challenge: most OCR tools only extract page-level content, losing the broader document context and hierarchy.
Expand Down
165 changes: 165 additions & 0 deletions examples/openai_agents_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
"""
PageIndex x OpenAI Agents Demo

Demonstrates how to use PageIndexClient with the OpenAI Agents SDK
to build a document QA agent with 3 tools:
- get_document()
- get_document_structure()
- get_page_content()

Requirements:
pip install openai-agents

Steps:
1 — Index PDF and inspect tree structure
2 — Inspect document metadata
3 — Ask a question (agent auto-calls tools)
4 — Reload from workspace and verify persistence
"""
import os
import sys
import asyncio
import concurrent.futures
import requests
from pathlib import Path

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from agents import Agent, Runner, function_tool
from agents.stream_events import RawResponsesStreamEvent, RunItemStreamEvent
from openai.types.responses import ResponseTextDeltaEvent

from pageindex import PageIndexClient
import pageindex.utils as utils

PDF_URL = "https://arxiv.org/pdf/2501.12948.pdf"  # DeepSeek-R1 paper used as the demo document
PDF_PATH = "tests/pdfs/deepseek-r1.pdf"           # local download target for the PDF
WORKSPACE = "./pageindex_workspace"               # PageIndexClient persistence directory

# System prompt steering the agent's tool-use strategy: check metadata first,
# then use the tree structure to locate relevant page ranges, then fetch only
# tight page ranges. This is runtime text sent to the model — do not reformat.
AGENT_SYSTEM_PROMPT = """
You are PageIndex, a document QA assistant.
TOOL USE:
- Call get_document() first to confirm status and page/line count.
- Call get_document_structure() to find relevant page ranges (use node summaries and start_index/end_index).
- Call get_page_content(pages="5-7") with tight ranges. Never fetch the whole doc.
- When calling tool call, output one short sentence explaining reason.
ANSWERING: Answer based only on tool output. Be concise.
"""


def query_agent(
    client: PageIndexClient,
    doc_id: str,
    prompt: str,
    verbose: bool = False,
) -> str:
    """Run a document QA agent using the OpenAI Agents SDK.

    Streams text output token-by-token and returns the answer string (the
    text generated after the last tool call, since the buffer is reset on
    each tool invocation). Tool calls are always printed; verbose=True also
    prints arguments and output previews.

    Args:
        client: PageIndexClient holding the indexed document.
        doc_id: ID of the document to answer questions about.
        prompt: The user question.
        verbose: If True, also print tool-call arguments and a 200-char
            preview of each tool's output.

    Returns:
        The streamed answer text accumulated after the final tool call.
    """

    @function_tool
    def get_document() -> str:
        """Get document metadata: status, page count, name, and description."""
        return client.get_document(doc_id)

    @function_tool
    def get_document_structure() -> str:
        """Get the document's full tree structure (without text) to find relevant sections."""
        return client.get_document_structure(doc_id)

    @function_tool
    def get_page_content(pages: str) -> str:
        """
        Get the text content of specific pages or line numbers.
        Use tight ranges: e.g. '5-7' for pages 5 to 7, '3,8' for pages 3 and 8, '12' for page 12.
        For Markdown documents, use line numbers from the structure's line_num field.
        """
        return client.get_page_content(doc_id, pages)

    agent = Agent(
        name="PageIndex",
        instructions=AGENT_SYSTEM_PROMPT,
        tools=[get_document, get_document_structure, get_page_content],
        model=client.model,
    )

    async def _run():
        collected = []  # text deltas of the current assistant message
        streamed_run = Runner.run_streamed(agent, prompt)
        async for event in streamed_run.stream_events():
            if isinstance(event, RawResponsesStreamEvent):
                if isinstance(event.data, ResponseTextDeltaEvent):
                    delta = event.data.delta
                    print(delta, end="", flush=True)
                    collected.append(delta)
            elif isinstance(event, RunItemStreamEvent):
                if event.name == "tool_called":
                    # Flush any partial text, then reset the buffer so the
                    # returned string only contains post-tool-call text.
                    if collected:
                        print()
                    collected.clear()
                    raw = event.item.raw_item
                    args = getattr(raw, "arguments", "{}")
                    args_str = f"({args})" if verbose else "()"
                    print(f"\nCalling tool: {raw.name}{args_str}")
                elif event.name == "tool_output" and verbose:
                    output = str(event.item.output)
                    preview = output[:200] + "..." if len(output) > 200 else output
                    print(f"Tool output: {preview}")
        if collected:
            print()
        return "".join(collected)

    # Keep this try minimal: previously the threaded agent run itself sat
    # inside it, so a RuntimeError raised *during* the run (re-raised by
    # Future.result()) was misread as "no running loop" and silently
    # triggered a second, duplicate asyncio.run(_run()).
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop in this thread (plain script): run directly.
        return asyncio.run(_run())
    # A loop is already running (e.g. Jupyter/async context): execute in a
    # worker thread so asyncio.run() gets its own fresh event loop.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, _run()).result()


# ── Download PDF if needed ─────────────────────────────────────────────────────
_pdf_file = Path(PDF_PATH)
if not _pdf_file.exists():
    print(f"Downloading {PDF_URL} ...")
    _pdf_file.parent.mkdir(parents=True, exist_ok=True)
    with requests.get(PDF_URL, stream=True, timeout=30) as resp:
        resp.raise_for_status()
        with open(PDF_PATH, "wb") as out:
            for part in resp.iter_content(chunk_size=8192):
                if part:
                    out.write(part)
    print("Download complete.\n")

# ── Setup ──────────────────────────────────────────────────────────────────────
client = PageIndexClient(workspace=WORKSPACE)

# ── Step 1: Index + Tree ───────────────────────────────────────────────────────
RULE = "=" * 60
print(RULE)
print("Step 1: Indexing PDF and inspecting tree structure")
print(RULE)
# Reuse a previously indexed document if its cached id is still valid.
_doc_id_file = Path(WORKSPACE).expanduser() / "demo_doc_id.txt"
_cached_id = _doc_id_file.read_text().strip() if _doc_id_file.exists() else None
if _cached_id is not None and _cached_id in client.documents:
    doc_id = _cached_id
    print(f"\nLoaded cached doc_id: {doc_id}")
else:
    doc_id = client.index(PDF_PATH)
    _doc_id_file.parent.mkdir(parents=True, exist_ok=True)
    _doc_id_file.write_text(doc_id)
    print(f"\nIndexed. doc_id: {doc_id}")
print("\nTree Structure (top-level sections):")
utils.print_tree(client.documents[doc_id]["structure"])

# ── Step 2: Document Metadata ──────────────────────────────────────────────────
print("\n" + RULE)
print("Step 2: Document Metadata (get_document)")
print(RULE)
print(client.get_document(doc_id))

# ── Step 3: Agent Query ────────────────────────────────────────────────────────
print("\n" + RULE)
print("Step 3: Agent Query (auto tool-use)")
print(RULE)
question = "What reward design does DeepSeek-R1-Zero use, and why was it chosen over supervised fine-tuning?"
print(f"\nQuestion: '{question}'\n")
query_agent(client, doc_id, question, verbose=True)
4 changes: 3 additions & 1 deletion pageindex/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
from .page_index import *
from .page_index_md import md_to_tree
from .page_index_md import md_to_tree
from .retrieve import get_document, get_document_structure, get_page_content
from .client import PageIndexClient
118 changes: 118 additions & 0 deletions pageindex/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import os
import uuid
import json
import asyncio
from pathlib import Path

from .page_index import page_index
from .page_index_md import md_to_tree
from .retrieve import get_document, get_document_structure, get_page_content


class PageIndexClient:
    """
    A client for indexing and retrieving document content.
    Flow: index() -> get_document() / get_document_structure() / get_page_content()

    For agent-based QA, see examples/openai_agents_demo.py.
    """
    def __init__(self, api_key: str = None, model: str = "gpt-4o-2024-11-20", workspace: str = None):
        # Prefer an explicit key; otherwise fall back to the legacy
        # CHATGPT_API_KEY env var when OPENAI_API_KEY is not already set.
        # NOTE: mutates os.environ so downstream OpenAI calls pick it up.
        if api_key:
            os.environ["OPENAI_API_KEY"] = api_key
        elif not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"):
            os.environ["OPENAI_API_KEY"] = os.getenv("CHATGPT_API_KEY")
        self.model = model
        # In-memory registry: doc_id -> document record (see index()).
        self.documents = {}
        self.workspace = Path(workspace).expanduser() if workspace else None
        if self.workspace:
            # Persistent mode: ensure the directory exists, then hydrate
            # self.documents from any previously saved *.json records.
            self.workspace.mkdir(parents=True, exist_ok=True)
            self._load_workspace()

    def index(self, file_path: str, mode: str = "auto") -> str:
        """Index a PDF or Markdown document and return its document_id.

        Args:
            file_path: Path to the document on disk.
            mode: 'auto' (pick by extension), 'pdf', or 'md'.

        Returns:
            A freshly generated UUID string identifying the document.

        Raises:
            FileNotFoundError: If file_path does not exist.
            ValueError: If the format cannot be determined/handled.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        doc_id = str(uuid.uuid4())
        ext = os.path.splitext(file_path)[1].lower()

        if mode == "pdf" or (mode == "auto" and ext == '.pdf'):
            print(f"Indexing PDF: {file_path}")
            doc_type = 'pdf'
            result = page_index(
                doc=file_path,
                model=self.model,
                if_add_node_summary='yes',
                if_add_node_text='yes',
                if_add_node_id='yes',
                if_add_doc_description='yes'
            )
        elif mode == "md" or (mode == "auto" and ext in ['.md', '.markdown']):
            print(f"Indexing Markdown: {file_path}")
            doc_type = 'md'
            # md_to_tree is async; run it to completion here. NOTE(review):
            # this will raise if called from within a running event loop.
            result = asyncio.run(md_to_tree(
                md_path=file_path,
                if_thinning=False,
                if_add_node_summary='yes',
                summary_token_threshold=200,
                model=self.model,
                if_add_doc_description='yes',
                if_add_node_text='yes',
                if_add_node_id='yes'
            ))
        else:
            raise ValueError(f"Unsupported file format for: {file_path}")

        # Single record-construction path for both formats (previously
        # duplicated in each branch).
        self.documents[doc_id] = {
            'id': doc_id,
            'path': file_path,
            'type': doc_type,
            'structure': result['structure'],
            'doc_name': result.get('doc_name', ''),
            'doc_description': result.get('doc_description', '')
        }

        print(f"Indexing complete. Document ID: {doc_id}")
        if self.workspace:
            self._save_doc(doc_id)
        return doc_id

    def _save_doc(self, doc_id: str):
        """Persist one document record as <workspace>/<doc_id>.json."""
        path = self.workspace / f"{doc_id}.json"
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.documents[doc_id], f, ensure_ascii=False, indent=2)

    def _load_workspace(self):
        """Load all *.json records from the workspace into self.documents.

        The filename stem is used as the doc_id key (matches _save_doc's
        naming). Corrupt or unreadable files are skipped with a warning.
        """
        loaded = 0
        for path in self.workspace.glob("*.json"):
            try:
                with open(path, "r", encoding="utf-8") as f:
                    doc = json.load(f)
                self.documents[path.stem] = doc
                loaded += 1
            except (json.JSONDecodeError, OSError) as e:
                print(f"Warning: skipping corrupt workspace file {path.name}: {e}")
        if loaded:
            print(f"Loaded {loaded} document(s) from workspace.")

    def get_document(self, doc_id: str) -> str:
        """Return document metadata JSON."""
        return get_document(self.documents, doc_id)

    def get_document_structure(self, doc_id: str) -> str:
        """Return document tree structure JSON (without text fields)."""
        return get_document_structure(self.documents, doc_id)

    def get_page_content(self, doc_id: str, pages: str) -> str:
        """Return page content for the given pages string (e.g. '5-7', '3,8', '12')."""
        return get_page_content(self.documents, doc_id, pages)
Loading