From c8f2240cab37c6e62aac83e9be624d36a0ca66a1 Mon Sep 17 00:00:00 2001
From: funnamer <439325484@qq.com>
Date: Wed, 11 Mar 2026 20:21:02 +0800
Subject: [PATCH] feat: support any OpenAI-compatible API endpoint

---
 .gitignore                 |  1 +
 pageindex/page_index.py    | 32 +++++++++++++-------------
 pageindex/page_index_md.py |  2 +-
 pageindex/utils.py         | 29 ++++++++++++------------
 run_pageindex.py           | 46 ++++++++++++++++++++------------------
 5 files changed, 57 insertions(+), 53 deletions(-)

diff --git a/.gitignore b/.gitignore
index 47d38baef..3db9d3575 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,4 @@ log/*
 logs/
 parts/*
 json_results/*
+.idea/
\ No newline at end of file
diff --git a/pageindex/page_index.py b/pageindex/page_index.py
index 39018c4df..8fb75feb9 100644
--- a/pageindex/page_index.py
+++ b/pageindex/page_index.py
@@ -36,7 +36,7 @@ async def check_title_appearance(item, page_list, start_index=1, model=None):
     }}
     Directly return the final JSON structure. Do not output anything else."""

-    response = await ChatGPT_API_async(model=model, prompt=prompt)
+    response = await OpenAI_API_async(model=model, prompt=prompt)
     response = extract_json(response)
     if 'answer' in response:
         answer = response['answer']
@@ -64,7 +64,7 @@ async def check_title_appearance_in_start(title, page_text, model=None, logger=N
     }}
     Directly return the final JSON structure. Do not output anything else."""

-    response = await ChatGPT_API_async(model=model, prompt=prompt)
+    response = await OpenAI_API_async(model=model, prompt=prompt)
     response = extract_json(response)
     if logger:
         logger.info(f"Response: {response}")
@@ -116,7 +116,7 @@ def toc_detector_single_page(content, model=None):
     Directly return the final JSON structure. Do not output anything else.
    Please note: abstract,summary, notation list, figure list, table list, etc. are not table of contents."""

-    response = ChatGPT_API(model=model, prompt=prompt)
+    response = OpenAI_API(model=model, prompt=prompt)
     # print('response', response)
     json_content = extract_json(response)
     return json_content['toc_detected']
@@ -135,7 +135,7 @@ def check_if_toc_extraction_is_complete(content, toc, model=None):
     Directly return the final JSON structure. Do not output anything else."""

     prompt = prompt + '\n Document:\n' + content + '\n Table of contents:\n' + toc
-    response = ChatGPT_API(model=model, prompt=prompt)
+    response = OpenAI_API(model=model, prompt=prompt)
     json_content = extract_json(response)
     return json_content['completed']
@@ -153,7 +153,7 @@ def check_if_toc_transformation_is_complete(content, toc, model=None):
     Directly return the final JSON structure. Do not output anything else."""

     prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc
-    response = ChatGPT_API(model=model, prompt=prompt)
+    response = OpenAI_API(model=model, prompt=prompt)
     json_content = extract_json(response)
     return json_content['completed']
@@ -165,7 +165,7 @@ def extract_toc_content(content, model=None):
     Directly return the full table of contents content. Do not output anything else."""

-    response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
+    response, finish_reason = OpenAI_API_with_finish_reason(model=model, prompt=prompt)
     if_complete = check_if_toc_transformation_is_complete(content, response, model)
     if if_complete == "yes" and finish_reason == "finished":
@@ -176,7 +176,7 @@ def extract_toc_content(content, model=None):
             {"role": "assistant", "content": response},
         ]
         prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
-        new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
+        new_response, finish_reason = OpenAI_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
         response = response + new_response
         if_complete = check_if_toc_transformation_is_complete(content, response, model)
@@ -186,7 +186,7 @@ def extract_toc_content(content, model=None):
             {"role": "assistant", "content": response},
         ]
         prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
-        new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
+        new_response, finish_reason = OpenAI_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
         response = response + new_response
         if_complete = check_if_toc_transformation_is_complete(content, response, model)
@@ -212,7 +212,7 @@ def detect_page_index(toc_content, model=None):
     }}
     Directly return the final JSON structure. Do not output anything else."""

-    response = ChatGPT_API(model=model, prompt=prompt)
+    response = OpenAI_API(model=model, prompt=prompt)
     json_content = extract_json(response)
     return json_content['page_index_given_in_toc']
@@ -261,7 +261,7 @@ def toc_index_extractor(toc, content, model=None):
     Directly return the final JSON structure. Do not output anything else."""

     prompt = toc_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content
-    response = ChatGPT_API(model=model, prompt=prompt)
+    response = OpenAI_API(model=model, prompt=prompt)
     json_content = extract_json(response)
     return json_content
@@ -289,7 +289,7 @@ def toc_transformer(toc_content, model=None):
     Directly return the final JSON structure, do not output anything else.
     """
     prompt = init_prompt + '\n Given table of contents\n:' + toc_content
-    last_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
+    last_complete, finish_reason = OpenAI_API_with_finish_reason(model=model, prompt=prompt)
     if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model)
     if if_complete == "yes" and finish_reason == "finished":
         last_complete = extract_json(last_complete)
@@ -313,7 +313,7 @@ def toc_transformer(toc_content, model=None):
        Please continue the json structure, directly output the remaining part of the json structure."""

-        new_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
+        new_complete, finish_reason = OpenAI_API_with_finish_reason(model=model, prompt=prompt)

        if new_complete.startswith('```json'):
            new_complete = get_json_content(new_complete)
@@ -474,7 +474,7 @@ def add_page_number_to_toc(part, structure, model=None):
     Directly return the final JSON structure. Do not output anything else."""

     prompt = fill_prompt_seq + f"\n\nCurrent Partial Document:\n{part}\n\nGiven Structure\n{json.dumps(structure, indent=2)}\n"
-    current_json_raw = ChatGPT_API(model=model, prompt=prompt)
+    current_json_raw = OpenAI_API(model=model, prompt=prompt)
     json_result = extract_json(current_json_raw)

     for item in json_result:
@@ -524,7 +524,7 @@ def generate_toc_continue(toc_content, part, model="gpt-4o-2024-11-20"):
     Directly return the additional part of the final JSON structure. Do not output anything else."""

     prompt = prompt + '\nGiven text\n:' + part + '\nPrevious tree structure\n:' + json.dumps(toc_content, indent=2)
-    response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
+    response, finish_reason = OpenAI_API_with_finish_reason(model=model, prompt=prompt)
     if finish_reason == 'finished':
         return extract_json(response)
     else:
@@ -558,7 +558,7 @@ def generate_toc_init(part, model=None):
     Directly return the final JSON structure. Do not output anything else."""

     prompt = prompt + '\nGiven text\n:' + part
-    response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
+    response, finish_reason = OpenAI_API_with_finish_reason(model=model, prompt=prompt)

     if finish_reason == 'finished':
         return extract_json(response)
@@ -743,7 +743,7 @@ def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20
     Directly return the final JSON structure. Do not output anything else."""

     prompt = toc_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content
-    response = ChatGPT_API(model=model, prompt=prompt)
+    response = OpenAI_API(model=model, prompt=prompt)
     json_content = extract_json(response)
     return convert_physical_index_to_int(json_content['physical_index'])
diff --git a/pageindex/page_index_md.py b/pageindex/page_index_md.py
index 70e8de086..df3d485b0 100644
--- a/pageindex/page_index_md.py
+++ b/pageindex/page_index_md.py
@@ -306,7 +306,7 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad

     MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'tests/markdowns/', f'{MD_NAME}.md')

-    MODEL="gpt-4.1"
+    MODEL = os.getenv('OPENAI_MODEL', 'deepseek-chat')
     IF_THINNING=False
     THINNING_THRESHOLD=5000
     SUMMARY_TOKEN_THRESHOLD=200
diff --git a/pageindex/utils.py b/pageindex/utils.py
index dc7acd888..a53342cea 100644
--- a/pageindex/utils.py
+++ b/pageindex/utils.py
@@ -2,6 +2,7 @@
 import openai
 import logging
 import os
+import re
 from datetime import datetime
 import time
 import json
@@ -17,18 +18,18 @@
 from pathlib import Path
 from types import SimpleNamespace as config

-CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+OPENAI_MODEL = os.getenv("OPENAI_MODEL", "deepseek-chat")
+OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.deepseek.com")

 def count_tokens(text, model=None):
-    if not text:
-        return 0
-    enc = tiktoken.encoding_for_model(model)
-    tokens = enc.encode(text)
-    return len(tokens)
+    # Ignore the model argument: custom endpoints may serve models unknown to tiktoken, so always use the generic cl100k_base encoding
+    enc = tiktoken.get_encoding("cl100k_base")
+    return len(enc.encode(text))

-def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
+def OpenAI_API_with_finish_reason(model, prompt, api_key=OPENAI_API_KEY, chat_history=None):
     max_retries = 10
-    client = openai.OpenAI(api_key=api_key)
+    client = openai.OpenAI(api_key=api_key, base_url=OPENAI_BASE_URL)
     for i in range(max_retries):
         try:
             if chat_history:
@@ -58,9 +59,9 @@ def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):


-def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
+def OpenAI_API(model, prompt, api_key=OPENAI_API_KEY, chat_history=None):
     max_retries = 10
-    client = openai.OpenAI(api_key=api_key)
+    client = openai.OpenAI(api_key=api_key, base_url=OPENAI_BASE_URL)
     for i in range(max_retries):
         try:
             if chat_history:
@@ -86,12 +87,12 @@ def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
     return "Error"


-async def ChatGPT_API_async(model, prompt, api_key=CHATGPT_API_KEY):
+async def OpenAI_API_async(model, prompt, api_key=OPENAI_API_KEY):
     max_retries = 10
     messages = [{"role": "user", "content": prompt}]
     for i in range(max_retries):
         try:
-            async with openai.AsyncOpenAI(api_key=api_key) as client:
+            async with openai.AsyncOpenAI(api_key=api_key, base_url=OPENAI_BASE_URL) as client:
                 response = await client.chat.completions.create(
                     model=model,
                     messages=messages,
@@ -609,7 +610,7 @@ async def generate_node_summary(node, model=None):
     Directly return the description, do not include any other text.
     """

-    response = await ChatGPT_API_async(model, prompt)
+    response = await OpenAI_API_async(model, prompt)
     return response


@@ -654,7 +655,7 @@ def generate_doc_description(structure, model=None):
     Directly return the description, do not include any other text.
     """

-    response = ChatGPT_API(model, prompt)
+    response = OpenAI_API(model, prompt)
     return response


diff --git a/run_pageindex.py b/run_pageindex.py
index 107024505..45e5641a5 100644
--- a/run_pageindex.py
+++ b/run_pageindex.py
@@ -3,6 +3,8 @@
 import json
 from pageindex import *
 from pageindex.page_index_md import md_to_tree
+from dotenv import load_dotenv
+load_dotenv()

 if __name__ == "__main__":
     # Set up argument parser
@@ -10,9 +12,9 @@
     parser.add_argument('--pdf_path', type=str, help='Path to the PDF file')
     parser.add_argument('--md_path', type=str, help='Path to the Markdown file')
-    parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use')
+    parser.add_argument('--model', type=str, default='deepseek-chat', help='Model to use')
-    parser.add_argument('--toc-check-pages', type=int, default=20, 
+    parser.add_argument('--toc-check-pages', type=int, default=20,
                         help='Number of pages to check for table of contents (PDF only)')
     parser.add_argument('--max-pages-per-node', type=int, default=10,
                         help='Maximum number of pages per node (PDF only)')
@@ -27,7 +29,7 @@
                         help='Whether to add doc description to the doc')
     parser.add_argument('--if-add-node-text', type=str, default='no',
                         help='Whether to add text to the node')
-    
+
     # Markdown specific arguments
     parser.add_argument('--if-thinning', type=str, default='no',
                         help='Whether to apply tree thinning for markdown (markdown only)')
@@ -36,20 +38,20 @@
     parser.add_argument('--summary-token-threshold', type=int, default=200,
                         help='Token threshold for generating summaries (markdown only)')
     args = parser.parse_args()
-    
+
     # Validate that exactly one file type is specified
     if not args.pdf_path and not args.md_path:
         raise ValueError("Either --pdf_path or --md_path must be specified")
     if args.pdf_path and args.md_path:
         raise ValueError("Only one of --pdf_path or --md_path can be specified")
-    
+
     if args.pdf_path:
         # Validate PDF file
         if not args.pdf_path.lower().endswith('.pdf'):
             raise ValueError("PDF file must have .pdf extension")
         if not os.path.isfile(args.pdf_path):
             raise ValueError(f"PDF file not found: {args.pdf_path}")
-    
+
         # Process PDF file
         # Configure options
         opt = config(
@@ -66,35 +68,35 @@
         # Process the PDF
         toc_with_page_number = page_index_main(args.pdf_path, opt)
         print('Parsing done, saving to file...')
-    
+
         # Save results
-        pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0]    
+        pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0]
         output_dir = './results'
         output_file = f'{output_dir}/{pdf_name}_structure.json'
         os.makedirs(output_dir, exist_ok=True)
-    
+
         with open(output_file, 'w', encoding='utf-8') as f:
             json.dump(toc_with_page_number, f, indent=2)
-    
+
         print(f'Tree structure saved to: {output_file}')
-    
+
     elif args.md_path:
         # Validate Markdown file
         if not args.md_path.lower().endswith(('.md', '.markdown')):
             raise ValueError("Markdown file must have .md or .markdown extension")
         if not os.path.isfile(args.md_path):
             raise ValueError(f"Markdown file not found: {args.md_path}")
-    
+
         # Process markdown file
         print('Processing markdown file...')
-    
+
         # Process the markdown
         import asyncio
-    
+
         # Use ConfigLoader to get consistent defaults (matching PDF behavior)
         from pageindex.utils import ConfigLoader
         config_loader = ConfigLoader()
-    
+
         # Create options dict with user args
         user_opt = {
             'model': args.model,
@@ -103,10 +105,10 @@
             'if_add_node_text': args.if_add_node_text,
             'if_add_node_id': args.if_add_node_id
         }
-    
+
         # Load config with defaults from config.yaml
         opt = config_loader.load(user_opt)
-    
+
         toc_with_page_number = asyncio.run(md_to_tree(
             md_path=args.md_path,
             if_thinning=args.if_thinning.lower() == 'yes',
@@ -118,16 +120,16 @@
             if_add_node_text=opt.if_add_node_text,
             if_add_node_id=opt.if_add_node_id
         ))
-    
+
         print('Parsing done, saving to file...')
-    
+
         # Save results
-        md_name = os.path.splitext(os.path.basename(args.md_path))[0]    
+        md_name = os.path.splitext(os.path.basename(args.md_path))[0]
         output_dir = './results'
         output_file = f'{output_dir}/{md_name}_structure.json'
         os.makedirs(output_dir, exist_ok=True)
-    
+
         with open(output_file, 'w', encoding='utf-8') as f:
             json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)
-    
+
         print(f'Tree structure saved to: {output_file}')
\ No newline at end of file
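
---
With this patch, pointing PageIndex at any OpenAI-compatible endpoint comes down to three
environment variables, which pageindex/utils.py reads once at import time. A minimal usage
sketch, assuming the target endpoint implements the standard chat-completions protocol; the
API key below is a placeholder, and DeepSeek stands in for any compatible provider:

    import os

    # Set these before importing pageindex.utils: the module captures them
    # via os.getenv() at import time, not per call.
    os.environ["OPENAI_API_KEY"] = "sk-..."                     # placeholder key
    os.environ["OPENAI_BASE_URL"] = "https://api.deepseek.com"  # any compatible endpoint
    os.environ["OPENAI_MODEL"] = "deepseek-chat"                # a model served there

    from pageindex.utils import OpenAI_API, OPENAI_MODEL

    # One round trip through the patched helper; per the diff it retries up to
    # 10 times and returns "Error" if the endpoint never answers.
    print(OpenAI_API(model=OPENAI_MODEL, prompt="Reply with one word: ok"))

The same variables can also live in a .env file, since run_pageindex.py now calls
load_dotenv() before parsing arguments.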