Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 17 additions & 7 deletions examples/agentic_vectorless_rag_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
- get_page_content() — retrieve text content of specific pages

Steps:
1 — Index a PDF and view its tree structure index
1 — Index a PDF/md and view its tree structure index
2 — View document metadata
3 — Ask a question (agent reasons over the index and auto-calls tools)

Expand Down Expand Up @@ -44,9 +44,16 @@
# Prompt text for the PageIndex document-QA agent. It tells the model which
# tools to call (get_document, get_document_structure, get_page_content), to
# give a one-sentence reason before each tool call, and to answer only from
# tool output.
# Presumably passed to the agent as its system instructions — confirm at the
# call site. Everything inside the literal is runtime text: changing a single
# character changes what the model is told.
# NOTE(review): the get_document() / get_document_structure() /
# get_page_content() bullets appear twice with different wording (a shorter
# page-range variant and a longer PDF/Markdown variant) — confirm which set
# is intended to ship.
AGENT_SYSTEM_PROMPT = """
You are PageIndex, a document QA assistant.
TOOL USE:
- Call get_document() first to confirm status and page/line count.
- Call get_document_structure() to identify relevant page ranges.
- Call get_page_content(pages="5-7") with tight ranges; never fetch the whole document.
- Call get_document() first to confirm status, type (pdf/md), and
page/line count.
- Call get_document_structure() to identify relevant sections.
- Call get_page_content(pages=...) to fetch content:
- For PDF documents: use page number ranges like "5-7" or "3,8"
- For Markdown documents: use ONLY the exact line_num values from the
structure,
separated by commas. Example: "3,53,69". Do NOT use ranges.
Each line_num represents a complete section node — you will receive
the full content of that section.
- Before each tool call, output one short sentence explaining the reason.
Answer based only on tool output. Be concise.
"""
Expand All @@ -72,9 +79,12 @@ def get_document_structure() -> str:
@function_tool
def get_page_content(pages: str) -> str:
    """
    Get the text content of specific pages (PDF) or sections (Markdown).

    Args:
        pages: Selector string, forwarded unchanged to the client.
            For PDF: use tight page ranges, e.g. '5-7' for pages 5 to 7,
            '3,8' for pages 3 and 8, '12' for page 12.
            For Markdown: use EXACT line_num values from
            get_document_structure(), comma-separated, e.g. '3,53,69'.
            Each line_num returns the FULL text of that section node.
            Do NOT guess or interpolate line numbers — only use values
            that appear in the structure.

    Returns:
        The result of client.get_page_content() for the document
        identified by doc_id (annotated as str).
    """
    # `client` and `doc_id` are free names resolved from the enclosing
    # scope; this wrapper only binds them so the tool takes `pages` alone.
    return client.get_page_content(doc_id, pages)

Expand Down
10 changes: 6 additions & 4 deletions pageindex/retrieve.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,17 +55,19 @@ def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:

def _get_md_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
"""
For Markdown documents, 'pages' are line numbers.
Find nodes whose line_num falls within [min(page_nums), max(page_nums)] and return their text.
For Markdown documents, page_nums are line_num values of section
nodes.
Return the text of nodes whose line_num exactly matches one of the
requested values.
"""
min_line, max_line = min(page_nums), max(page_nums)
requested = set(page_nums)
results = []
seen = set()

def _traverse(nodes):
for node in nodes:
ln = node.get('line_num')
if ln and min_line <= ln <= max_line and ln not in seen:
if ln and ln in requested and ln not in seen:
seen.add(ln)
results.append({'page': ln, 'content': node.get('text', '')})
if node.get('nodes'):
Expand Down
Loading