Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ log/*
logs/
parts/*
json_results/*
.idea/
32 changes: 16 additions & 16 deletions pageindex/page_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ async def check_title_appearance(item, page_list, start_index=1, model=None):
}}
Directly return the final JSON structure. Do not output anything else."""

response = await ChatGPT_API_async(model=model, prompt=prompt)
response = await OpenAI_API_async(model=model, prompt=prompt)
response = extract_json(response)
if 'answer' in response:
answer = response['answer']
Expand Down Expand Up @@ -64,7 +64,7 @@ async def check_title_appearance_in_start(title, page_text, model=None, logger=N
}}
Directly return the final JSON structure. Do not output anything else."""

response = await ChatGPT_API_async(model=model, prompt=prompt)
response = await OpenAI_API_async(model=model, prompt=prompt)
response = extract_json(response)
if logger:
logger.info(f"Response: {response}")
Expand Down Expand Up @@ -116,7 +116,7 @@ def toc_detector_single_page(content, model=None):
Directly return the final JSON structure. Do not output anything else.
Please note: abstract,summary, notation list, figure list, table list, etc. are not table of contents."""

response = ChatGPT_API(model=model, prompt=prompt)
response = OpenAI_API(model=model, prompt=prompt)
# print('response', response)
json_content = extract_json(response)
return json_content['toc_detected']
Expand All @@ -135,7 +135,7 @@ def check_if_toc_extraction_is_complete(content, toc, model=None):
Directly return the final JSON structure. Do not output anything else."""

prompt = prompt + '\n Document:\n' + content + '\n Table of contents:\n' + toc
response = ChatGPT_API(model=model, prompt=prompt)
response = OpenAI_API(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content['completed']

Expand All @@ -153,7 +153,7 @@ def check_if_toc_transformation_is_complete(content, toc, model=None):
Directly return the final JSON structure. Do not output anything else."""

prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc
response = ChatGPT_API(model=model, prompt=prompt)
response = OpenAI_API(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content['completed']

Expand All @@ -165,7 +165,7 @@ def extract_toc_content(content, model=None):

Directly return the full table of contents content. Do not output anything else."""

response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
response, finish_reason = OpenAI_API_with_finish_reason(model=model, prompt=prompt)

if_complete = check_if_toc_transformation_is_complete(content, response, model)
if if_complete == "yes" and finish_reason == "finished":
Expand All @@ -176,7 +176,7 @@ def extract_toc_content(content, model=None):
{"role": "assistant", "content": response},
]
prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
new_response, finish_reason = OpenAI_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
response = response + new_response
if_complete = check_if_toc_transformation_is_complete(content, response, model)

Expand All @@ -186,7 +186,7 @@ def extract_toc_content(content, model=None):
{"role": "assistant", "content": response},
]
prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
new_response, finish_reason = OpenAI_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
response = response + new_response
if_complete = check_if_toc_transformation_is_complete(content, response, model)

Expand All @@ -212,7 +212,7 @@ def detect_page_index(toc_content, model=None):
}}
Directly return the final JSON structure. Do not output anything else."""

response = ChatGPT_API(model=model, prompt=prompt)
response = OpenAI_API(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content['page_index_given_in_toc']

Expand Down Expand Up @@ -261,7 +261,7 @@ def toc_index_extractor(toc, content, model=None):
Directly return the final JSON structure. Do not output anything else."""

prompt = toc_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content
response = ChatGPT_API(model=model, prompt=prompt)
response = OpenAI_API(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content

Expand Down Expand Up @@ -289,7 +289,7 @@ def toc_transformer(toc_content, model=None):
Directly return the final JSON structure, do not output anything else. """

prompt = init_prompt + '\n Given table of contents\n:' + toc_content
last_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
last_complete, finish_reason = OpenAI_API_with_finish_reason(model=model, prompt=prompt)
if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model)
if if_complete == "yes" and finish_reason == "finished":
last_complete = extract_json(last_complete)
Expand All @@ -313,7 +313,7 @@ def toc_transformer(toc_content, model=None):

Please continue the json structure, directly output the remaining part of the json structure."""

new_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
new_complete, finish_reason = OpenAI_API_with_finish_reason(model=model, prompt=prompt)

if new_complete.startswith('```json'):
new_complete = get_json_content(new_complete)
Expand Down Expand Up @@ -474,7 +474,7 @@ def add_page_number_to_toc(part, structure, model=None):
Directly return the final JSON structure. Do not output anything else."""

prompt = fill_prompt_seq + f"\n\nCurrent Partial Document:\n{part}\n\nGiven Structure\n{json.dumps(structure, indent=2)}\n"
current_json_raw = ChatGPT_API(model=model, prompt=prompt)
current_json_raw = OpenAI_API(model=model, prompt=prompt)
json_result = extract_json(current_json_raw)

for item in json_result:
Expand Down Expand Up @@ -524,7 +524,7 @@ def generate_toc_continue(toc_content, part, model="gpt-4o-2024-11-20"):
Directly return the additional part of the final JSON structure. Do not output anything else."""

prompt = prompt + '\nGiven text\n:' + part + '\nPrevious tree structure\n:' + json.dumps(toc_content, indent=2)
response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
response, finish_reason = OpenAI_API_with_finish_reason(model=model, prompt=prompt)
if finish_reason == 'finished':
return extract_json(response)
else:
Expand Down Expand Up @@ -558,7 +558,7 @@ def generate_toc_init(part, model=None):
Directly return the final JSON structure. Do not output anything else."""

prompt = prompt + '\nGiven text\n:' + part
response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
response, finish_reason = OpenAI_API_with_finish_reason(model=model, prompt=prompt)

if finish_reason == 'finished':
return extract_json(response)
Expand Down Expand Up @@ -743,7 +743,7 @@ def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20
Directly return the final JSON structure. Do not output anything else."""

prompt = toc_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content
response = ChatGPT_API(model=model, prompt=prompt)
response = OpenAI_API(model=model, prompt=prompt)
json_content = extract_json(response)
return convert_physical_index_to_int(json_content['physical_index'])

Expand Down
2 changes: 1 addition & 1 deletion pageindex/page_index_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad
MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'tests/markdowns/', f'{MD_NAME}.md')


MODEL="gpt-4.1"
MODEL = os.getenv('OPENAI_MODEL', 'deepseek-chat')
IF_THINNING=False
THINNING_THRESHOLD=5000
SUMMARY_TOKEN_THRESHOLD=200
Expand Down
29 changes: 15 additions & 14 deletions pageindex/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import openai
import logging
import os
import re
from datetime import datetime
import time
import json
Expand All @@ -17,18 +18,18 @@
from pathlib import Path
from types import SimpleNamespace as config

CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "deepseek-chat")
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.deepseek.com")

def count_tokens(text, model=None):
    """Count the number of tokens in *text*.

    The ``model`` parameter is accepted for interface compatibility with
    callers but is deliberately ignored: the generic ``cl100k_base``
    encoding is always used, so counting works even for model names that
    tiktoken does not recognize (e.g. non-OpenAI models such as
    ``deepseek-chat``).

    Args:
        text: The string to tokenize. ``None`` or empty returns 0.
        model: Ignored; kept for backward compatibility.

    Returns:
        int: Number of tokens in ``text`` under the cl100k_base encoding.
    """
    if not text:
        return 0
    # Ignore the model parameter and force the generic encoding;
    # tiktoken.encoding_for_model() would raise KeyError for unknown models.
    enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(text))

def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
def OpenAI_API_with_finish_reason(model, prompt, api_key=OPENAI_API_KEY, chat_history=None):
max_retries = 10
client = openai.OpenAI(api_key=api_key)
client = openai.OpenAI(api_key=api_key, base_url=OPENAI_BASE_URL)
for i in range(max_retries):
try:
if chat_history:
Expand Down Expand Up @@ -58,9 +59,9 @@ def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_



def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
def OpenAI_API(model, prompt, api_key=OPENAI_API_KEY, chat_history=None):
max_retries = 10
client = openai.OpenAI(api_key=api_key)
client = openai.OpenAI(api_key=api_key, base_url=OPENAI_BASE_URL)
for i in range(max_retries):
try:
if chat_history:
Expand All @@ -86,12 +87,12 @@ def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
return "Error"


async def ChatGPT_API_async(model, prompt, api_key=CHATGPT_API_KEY):
async def OpenAI_API_async(model, prompt, api_key=OPENAI_API_KEY):
max_retries = 10
messages = [{"role": "user", "content": prompt}]
for i in range(max_retries):
try:
async with openai.AsyncOpenAI(api_key=api_key) as client:
async with openai.AsyncOpenAI(api_key=api_key, base_url=OPENAI_BASE_URL) as client:
response = await client.chat.completions.create(
model=model,
messages=messages,
Expand Down Expand Up @@ -609,7 +610,7 @@ async def generate_node_summary(node, model=None):

Directly return the description, do not include any other text.
"""
response = await ChatGPT_API_async(model, prompt)
response = await OpenAI_API_async(model, prompt)
return response


Expand Down Expand Up @@ -654,7 +655,7 @@ def generate_doc_description(structure, model=None):

Directly return the description, do not include any other text.
"""
response = ChatGPT_API(model, prompt)
response = OpenAI_API(model, prompt)
return response


Expand Down
46 changes: 24 additions & 22 deletions run_pageindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,18 @@
import json
from pageindex import *
from pageindex.page_index_md import md_to_tree
from dotenv import load_dotenv
load_dotenv()

if __name__ == "__main__":
# Set up argument parser
parser = argparse.ArgumentParser(description='Process PDF or Markdown document and generate structure')
parser.add_argument('--pdf_path', type=str, help='Path to the PDF file')
parser.add_argument('--md_path', type=str, help='Path to the Markdown file')

parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use')
parser.add_argument('--model', type=str, default='deepseek-chat', help='Model to use')

parser.add_argument('--toc-check-pages', type=int, default=20,
parser.add_argument('--toc-check-pages', type=int, default=20,
help='Number of pages to check for table of contents (PDF only)')
parser.add_argument('--max-pages-per-node', type=int, default=10,
help='Maximum number of pages per node (PDF only)')
Expand All @@ -27,7 +29,7 @@
help='Whether to add doc description to the doc')
parser.add_argument('--if-add-node-text', type=str, default='no',
help='Whether to add text to the node')

# Markdown specific arguments
parser.add_argument('--if-thinning', type=str, default='no',
help='Whether to apply tree thinning for markdown (markdown only)')
Expand All @@ -36,20 +38,20 @@
parser.add_argument('--summary-token-threshold', type=int, default=200,
help='Token threshold for generating summaries (markdown only)')
args = parser.parse_args()

# Validate that exactly one file type is specified
if not args.pdf_path and not args.md_path:
raise ValueError("Either --pdf_path or --md_path must be specified")
if args.pdf_path and args.md_path:
raise ValueError("Only one of --pdf_path or --md_path can be specified")

if args.pdf_path:
# Validate PDF file
if not args.pdf_path.lower().endswith('.pdf'):
raise ValueError("PDF file must have .pdf extension")
if not os.path.isfile(args.pdf_path):
raise ValueError(f"PDF file not found: {args.pdf_path}")

# Process PDF file
# Configure options
opt = config(
Expand All @@ -66,35 +68,35 @@
# Process the PDF
toc_with_page_number = page_index_main(args.pdf_path, opt)
print('Parsing done, saving to file...')

# Save results
pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0]
pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0]
output_dir = './results'
output_file = f'{output_dir}/{pdf_name}_structure.json'
os.makedirs(output_dir, exist_ok=True)

with open(output_file, 'w', encoding='utf-8') as f:
json.dump(toc_with_page_number, f, indent=2)

print(f'Tree structure saved to: {output_file}')

elif args.md_path:
# Validate Markdown file
if not args.md_path.lower().endswith(('.md', '.markdown')):
raise ValueError("Markdown file must have .md or .markdown extension")
if not os.path.isfile(args.md_path):
raise ValueError(f"Markdown file not found: {args.md_path}")

# Process markdown file
print('Processing markdown file...')

# Process the markdown
import asyncio

# Use ConfigLoader to get consistent defaults (matching PDF behavior)
from pageindex.utils import ConfigLoader
config_loader = ConfigLoader()

# Create options dict with user args
user_opt = {
'model': args.model,
Expand All @@ -103,10 +105,10 @@
'if_add_node_text': args.if_add_node_text,
'if_add_node_id': args.if_add_node_id
}

# Load config with defaults from config.yaml
opt = config_loader.load(user_opt)

toc_with_page_number = asyncio.run(md_to_tree(
md_path=args.md_path,
if_thinning=args.if_thinning.lower() == 'yes',
Expand All @@ -118,16 +120,16 @@
if_add_node_text=opt.if_add_node_text,
if_add_node_id=opt.if_add_node_id
))

print('Parsing done, saving to file...')

# Save results
md_name = os.path.splitext(os.path.basename(args.md_path))[0]
md_name = os.path.splitext(os.path.basename(args.md_path))[0]
output_dir = './results'
output_file = f'{output_dir}/{md_name}_structure.json'
os.makedirs(output_dir, exist_ok=True)

with open(output_file, 'w', encoding='utf-8') as f:
json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)

print(f'Tree structure saved to: {output_file}')