diff --git a/.gitignore b/.gitignore index e0b82c1..ddbb497 100644 --- a/.gitignore +++ b/.gitignore @@ -19,12 +19,12 @@ wheels/ *.egg-info/ .installed.cfg *.egg -data/* + # Virtual Environment venv/ env/ ENV/ -.venv +.venv/ # IDE .vscode/ @@ -34,51 +34,69 @@ ENV/ *~ .DS_Store +# Claude Code +.claude/ + # API Keys - 중요! .env -.env.local -.env.development -.env.test -.env.production - +.env.* apis/gemini_keys.yaml -!apis/gemini_keys.yaml.template +apis/*.yaml +!apis/*-example.yaml +!apis/*.template.yaml # Logs *.log +logs/ # Jupyter Notebook -.ipynb_checkpoints +.ipynb_checkpoints/ # pytest .pytest_cache/ .coverage +htmlcov/ # MyPy .mypy_cache/ .dmypy.json dmypy.json -# database +# Data - 원본 데이터 +data/ + +# Output - 생성된 결과물 +output/ +output_*/ +I_origin_*/ + +# Temp - 임시 파일 +temp/ + +# Archives +*.zip +*.tar.gz +*.rar + +# Generated JSON (except input templates) +pipeline_output*.json +qa_difficulty_analysis_*.json +qa_for_review_*.json +eval_results_*.json + +# Keep input templates +!test_*_input.json + +# Database/Token info/ token.json -*.json -test_input.json -# env +# Docs (if generated) .bemad/ -docs/ -pipeline_ui/backend/checkpoints/* -pipeline_ui/backend/output/* -pipeline_ui/backend/uploads/* -# Frontend (Node.js) -pipeline_ui/frontend/node_modules/* +# Pipeline UI +pipeline_ui/backend/checkpoints/ +pipeline_ui/backend/output/ +pipeline_ui/backend/uploads/ +pipeline_ui/frontend/node_modules/ pipeline_ui/frontend/package-lock.json - - -I_origin_0/* -I_origin_1/* -I_origin_2/* - -output/* \ No newline at end of file diff --git a/capture_html_images.py b/capture_html_images.py new file mode 100644 index 0000000..4491bcb --- /dev/null +++ b/capture_html_images.py @@ -0,0 +1,127 @@ +""" +Capture HTML files from output_* directories as images using Playwright. 
async def capture_html_file_async(
    html_path: Path,
    output_path: Path,
    width: int = 800,
) -> None:
    """Render one HTML file in headless Chromium and save it as an image.

    Args:
        html_path: HTML file to render; it is read as UTF-8.
        output_path: Where the screenshot is written.
        width: Viewport width in pixels. The viewport height starts at 600,
            and the screenshot is taken with ``full_page=True``.
    """
    html_content = html_path.read_text(encoding="utf-8")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page(viewport={"width": width, "height": 600})
            await page.set_content(html_content)
            await page.screenshot(path=output_path, full_page=True)
        finally:
            await browser.close()


async def capture_batch_async(
    html_files: List[Path],
    output_dir: Path,
    width: int = 800,
    force: bool = False,
) -> None:
    """Capture several HTML files, reusing a single browser instance.

    Each ``<name>.html`` is written to ``output_dir/<name>.png``. A failure on
    one file is reported and does not stop the rest of the batch.

    Args:
        html_files: HTML files to render.
        output_dir: Directory that receives the PNG files.
        width: Viewport width in pixels.
        force: When True, existing PNG files are re-rendered and overwritten.
            When False (the default), they are skipped.
    """
    # Decide what needs rendering before starting a browser, so a batch with
    # nothing to do never launches Chromium.
    pending = []
    for html_path in html_files:
        output_path = output_dir / f"{html_path.stem}.png"
        if output_path.exists() and not force:
            print(f" [SKIP] {output_path.name} already exists")
            continue
        pending.append((html_path, output_path))

    if not pending:
        return

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            for html_path, output_path in pending:
                try:
                    html_content = html_path.read_text(encoding="utf-8")
                    page = await browser.new_page(viewport={"width": width, "height": 600})
                    try:
                        await page.set_content(html_content)
                        await page.screenshot(path=output_path, full_page=True)
                    finally:
                        # Close the page even when rendering fails, so a long
                        # batch does not accumulate open pages.
                        await page.close()
                    print(f" [OK] {html_path.name} -> {output_path.name}")
                except Exception as e:
                    # Best effort: report the file and continue the batch.
                    print(f" [ERROR] {html_path.name}: {e}")
        finally:
            await browser.close()


def main():
    """Command-line entry point: convert ``output_*/html/*.html`` to PNG images.

    For every selected output directory, the HTML files in its ``html/``
    subdirectory are rendered into its ``images/`` subdirectory.
    """
    parser = argparse.ArgumentParser(description="Capture HTML files as images")
    parser.add_argument(
        "--output-dirs",
        nargs="+",
        default=None,
        help="Specific output directories to process (e.g., output_academic output_finance)",
    )
    parser.add_argument(
        "--width",
        type=int,
        default=800,
        help="Viewport width for rendering (default: 800)",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Overwrite existing images",
    )
    args = parser.parse_args()

    base_dir = Path(__file__).parent

    # Use the directories named on the command line, or discover output_*.
    if args.output_dirs:
        output_dirs = [base_dir / d for d in args.output_dirs]
    else:
        output_dirs = [d for d in sorted(base_dir.glob("output_*")) if d.is_dir()]

    if not output_dirs:
        print("No output_* directories found.")
        return

    print(f"Found {len(output_dirs)} output directories to process")

    for output_dir in output_dirs:
        html_dir = output_dir / "html"
        if not html_dir.exists():
            print(f"\n[SKIP] {output_dir.name}: no html/ subdirectory")
            continue

        html_files = sorted(html_dir.glob("*.html"))
        if not html_files:
            print(f"\n[SKIP] {output_dir.name}: no HTML files found")
            continue

        images_dir = output_dir / "images"

        # Without --force, only files that have no image yet are processed.
        if not args.force:
            html_files = [
                f for f in html_files
                if not (images_dir / f"{f.stem}.png").exists()
            ]

        if not html_files:
            print(f"\n[SKIP] {output_dir.name}: all files already processed")
            continue

        # Create the target directory only when there is something to write.
        images_dir.mkdir(exist_ok=True)

        print(f"\n[Processing] {output_dir.name}: {len(html_files)} HTML files")
        # force must be forwarded: the batch function skips existing images
        # on its own unless told otherwise.
        asyncio.run(
            capture_batch_async(html_files, images_dir, args.width, force=args.force)
        )

    print("\nDone!")
0000000..aa9740d --- /dev/null +++ b/eval/evaluate_vllm.py @@ -0,0 +1,648 @@ +#!/usr/bin/env python3 +""" +vLLM 서버를 사용한 Table QA 평가 스크립트. + +output_* 디렉토리의 HTML 테이블 이미지와 QA 데이터를 사용하여 +멀티모달 모델의 Table QA 성능을 평가합니다. + +Usage: + # 단일 도메인 평가 + python -m eval.evaluate_vllm --domain public --vllm-url http://localhost:8000/v1 + + # 모든 도메인 평가 + python -m eval.evaluate_vllm --all-domains --vllm-url http://localhost:8000/v1 + + # 특정 모델 사용 + python -m eval.evaluate_vllm --domain business --model Qwen/Qwen2-VL-7B-Instruct + + # LLM-as-Judge 포함 + python -m eval.evaluate_vllm --domain finance --use-judge --judge-model gpt-4o +""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import logging +import os +import sys +from dataclasses import dataclass, field +from datetime import datetime +from pathlib import Path +from typing import List, Dict, Any, Optional + +# 프로젝트 루트를 path에 추가 +project_root = Path(__file__).parent.parent +if str(project_root) not in sys.path: + sys.path.insert(0, str(project_root)) + +from eval.dataset import QAItem, EvalDataset +from eval.inference import VLLMClient, InferenceRequest, InferenceResponse, run_inference +from eval.metrics import compute_metrics, aggregate_metrics, EvalResult, AggregatedMetrics +from eval.evaluate import evaluate_predictions, generate_report, print_report +from eval.llm_judge import create_judge_client + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + + +# 도메인별 output 디렉토리 매핑 +DOMAIN_DIRS = { + "academic": "output_academic", + "business": "output_business", + "finance": "output_finance", + "medical": "output_medical", + "public": "output_public", +} + +# 기본 프롬프트 템플릿 +DEFAULT_PROMPT_TEMPLATE = """당신은 테이블 이미지를 분석하여 질문에 답하는 AI 어시스턴트입니다. +주어진 테이블 이미지를 주의 깊게 분석한 후, 질문에 대해 정확하고 간결하게 답변해주세요. 
@dataclass
class EvalConfig:
    """Settings for one Table QA evaluation run against a vLLM server."""
    domain: str
    vllm_url: str = "http://localhost:8000/v1"
    model: str = "default"
    max_tokens: int = 512
    temperature: float = 0.0
    max_concurrent: int = 10
    timeout: float = 120.0
    use_judge: bool = False
    judge_provider: str = "openai"
    judge_model: Optional[str] = None
    judge_api_key: Optional[str] = None
    output_dir: Optional[Path] = None
    limit: Optional[int] = None  # maximum number of samples to evaluate (debugging)
    qa_types: Optional[List[str]] = None  # evaluate only these QA types


def find_table_images(
    output_dir: Path,
    pair_id: str,
    image_paths: List[str],
) -> List[Path]:
    """
    Find the table images that belong to one QA entry.

    Captured images are looked up in ``output_dir/images/``.

    File name pattern:
        - pair_id: "B_origin_3_3_0"
        - Images: "B_origin_3_3_0_table_0.png", "B_origin_3_3_0_table_1.png"

    Lookup order:
        1. ``images/{pair_id}_table_*.png``
        2. ``images/{pair_id}*.png`` when step 1 found nothing
        3. the original ``image_paths`` that exist, as given or relative to
           the project root

    Args:
        output_dir: An ``output_*`` directory.
        pair_id: QA pair ID. It is matched literally.
        image_paths: Original image paths recorded for the entry.

    Returns:
        Sorted list of the image paths found, without duplicates.
    """
    import glob  # local import: only glob.escape is needed here

    images_dir = output_dir / "images"
    # Escape the id so glob metacharacters in it ("[", "*", "?") are matched
    # literally and cannot pull in files that belong to another pair.
    safe_id = glob.escape(pair_id)

    found_images: List[Path] = []

    if images_dir.exists():
        found_images = list(images_dir.glob(f"{safe_id}_table_*.png"))
        if not found_images:
            # Looser pattern. It also covers every image named after an HTML
            # file whose name starts with pair_id, so no separate lookup via
            # the html/ directory is needed.
            found_images = list(images_dir.glob(f"{safe_id}*.png"))

    if not found_images:
        # Fallback: original image paths, as given or under the project root.
        for orig_path in image_paths:
            candidate = Path(orig_path)
            if candidate.exists():
                found_images.append(candidate)
                continue
            full_path = project_root / orig_path
            if full_path.exists():
                found_images.append(full_path)

    return sorted(set(found_images))
def create_inference_requests(
    dataset: EvalDataset,
    prompt_template: Optional[str] = None,
) -> List[InferenceRequest]:
    """
    Build one inference request per QA item.

    Items with a context use ``CONTEXT_PROMPT_TEMPLATE`` and the others use
    ``DEFAULT_PROMPT_TEMPLATE``, unless ``prompt_template`` overrides both.

    Args:
        dataset: Evaluation dataset to iterate.
        prompt_template: Optional template with a ``{question}`` placeholder
            and, optionally, a ``{context}`` placeholder.

    Returns:
        List of InferenceRequest, in dataset order.
    """
    requests = []

    for item in dataset:
        if item.context:
            template = prompt_template or CONTEXT_PROMPT_TEMPLATE
        else:
            template = prompt_template or DEFAULT_PROMPT_TEMPLATE

        # Always supply both fields. str.format ignores unused keyword
        # arguments, and a custom template that contains {context} no longer
        # raises KeyError for items that have no context.
        prompt = template.format(
            question=item.question,
            context=item.context or "",
        )

        requests.append(
            InferenceRequest(
                id=item.id,
                prompt=prompt,
                ground_truth=item.answer,
                qa_type=item.qa_type,
                image_paths=item.image_paths,
            )
        )

    return requests


async def evaluate_domain(
    config: EvalConfig,
) -> tuple[List[EvalResult], AggregatedMetrics, Dict[str, Any]]:
    """
    Run the evaluation for a single domain.

    Args:
        config: Evaluation settings; ``config.domain`` selects the directory.

    Returns:
        (per-item results, aggregated metrics, metadata). When no QA item can
        be loaded, ``([], AggregatedMetrics(), {})`` is returned.

    Raises:
        ValueError: ``config.domain`` is not a key of ``DOMAIN_DIRS``.
        FileNotFoundError: The domain's output directory does not exist.
    """
    domain_dir_name = DOMAIN_DIRS.get(config.domain)
    if not domain_dir_name:
        raise ValueError(f"Unknown domain: {config.domain}. Available: {list(DOMAIN_DIRS.keys())}")

    output_dir = project_root / domain_dir_name
    if not output_dir.exists():
        raise FileNotFoundError(f"Output directory not found: {output_dir}")

    logger.info(f"Evaluating domain: {config.domain}")
    logger.info(f"Output directory: {output_dir}")

    # 1. Load the dataset.
    dataset = load_qa_from_pipeline_output(
        output_dir,
        limit=config.limit,
        qa_types=config.qa_types,
    )

    if len(dataset) == 0:
        logger.error("No QA items loaded. Check if pipeline_output.json exists and contains valid data.")
        return [], AggregatedMetrics(), {}

    logger.info(f"Loaded {len(dataset)} QA items")
    logger.info(f"Type distribution: {dataset.get_type_distribution()}")

    # 2. Build the inference requests.
    requests = create_inference_requests(dataset)

    # 3. Create the vLLM client.
    client = VLLMClient(
        base_url=config.vllm_url,
        model=config.model,
        max_tokens=config.max_tokens,
        temperature=config.temperature,
        timeout=config.timeout,
        max_concurrent=config.max_concurrent,
    )

    # 4. Run inference; raw responses are also written when output_dir is set.
    inference_output = None
    if config.output_dir:
        config.output_dir.mkdir(parents=True, exist_ok=True)
        inference_output = config.output_dir / f"{config.domain}_inference.json"

    logger.info(f"Running inference on {len(requests)} requests...")
    responses = await run_inference(client, requests, output_path=inference_output)

    # 5. Evaluate.
    # NOTE(review): questions are paired with responses by position, which
    # assumes run_inference returns responses in request order -- confirm.
    predictions = [
        {
            "id": r.id,
            "prediction": r.prediction,
            "ground_truth": r.ground_truth,
            "qa_type": r.qa_type,
            "question": dataset.items[i].question if i < len(dataset.items) else "",
        }
        for i, r in enumerate(responses)
    ]

    # Optional LLM-as-Judge client.
    judge_client = None
    if config.use_judge:
        judge_client = create_judge_client(
            provider=config.judge_provider,
            model=config.judge_model,
            api_key=config.judge_api_key,
        )

    questions = [item.question for item in dataset.items]
    results, aggregated = await evaluate_predictions(
        predictions,
        use_judge=config.use_judge,
        judge_client=judge_client,
        questions=questions,
    )

    metadata = {
        "domain": config.domain,
        "model": config.model,
        "vllm_url": config.vllm_url,
        "total_items": len(dataset),
        "type_distribution": dataset.get_type_distribution(),
        "timestamp": datetime.now().isoformat(),
    }

    return results, aggregated, metadata


async def evaluate_all_domains(
    config: EvalConfig,
) -> Dict[str, tuple[List[EvalResult], AggregatedMetrics, Dict[str, Any]]]:
    """
    Run the evaluation for every domain in ``DOMAIN_DIRS``.

    A domain that fails is logged and recorded as
    ``([], AggregatedMetrics(), {"error": ...})``; the others still run.

    Args:
        config: Base settings; its ``domain`` field is ignored.

    Returns:
        Mapping of domain name to (results, aggregated metrics, metadata).
    """
    from dataclasses import replace  # local import: only used here

    all_results = {}

    for domain in DOMAIN_DIRS:
        # Copy every setting and change only the domain, so fields added to
        # EvalConfig later are carried over automatically.
        domain_config = replace(config, domain=domain)

        try:
            results, aggregated, metadata = await evaluate_domain(domain_config)
            all_results[domain] = (results, aggregated, metadata)
            print_report(aggregated)
        except Exception as e:
            # Keep going with the other domains, but keep the traceback.
            logger.error(f"Failed to evaluate domain {domain}: {e}", exc_info=True)
            all_results[domain] = ([], AggregatedMetrics(), {"error": str(e)})

    return all_results


def save_results(
    results: List[EvalResult],
    aggregated: AggregatedMetrics,
    metadata: Dict[str, Any],
    output_dir: Path,
    domain: str,
) -> None:
    """Write the full report and a compact summary for one domain.

    Creates ``output_dir`` when needed and writes
    ``{domain}_evaluation_report.json`` and ``{domain}_summary.json``.

    Args:
        results: Per-item evaluation results.
        aggregated: Aggregated metrics for the domain.
        metadata: Run metadata embedded in the report.
        output_dir: Destination directory.
        domain: Domain name, used as the file name prefix.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Full report.
    report = generate_report(results, aggregated, metadata)
    report_path = output_dir / f"{domain}_evaluation_report.json"
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(report, f, ensure_ascii=False, indent=2)
    logger.info(f"Saved report to {report_path}")

    # Compact summary of the headline metrics.
    summary_path = output_dir / f"{domain}_summary.json"
    summary = {
        "domain": domain,
        "total_count": aggregated.total_count,
        "exact_match": aggregated.exact_match_avg,
        "f1_score": aggregated.f1_score_avg,
        "contains_match": aggregated.contains_match_avg,
        "bleu_score": aggregated.bleu_score_avg,
        "by_type": aggregated.by_type,
    }
    # Judge scores are included only when a judge average is present.
    if aggregated.judge_overall_avg is not None:
        summary["judge_overall"] = aggregated.judge_overall_avg
        summary["judge_accuracy"] = aggregated.judge_accuracy

    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)
    logger.info(f"Saved summary to {summary_path}")
"anthropic"], + help="Judge 제공자 (default: openai)", + ) + parser.add_argument( + "--judge-model", + help="Judge 모델 (default: gpt-4o-mini)", + ) + parser.add_argument( + "--judge-api-key", + help="Judge API 키 (환경변수에서 가져오지 않을 경우)", + ) + + # 출력 설정 + parser.add_argument( + "--output-dir", + type=Path, + default=Path("eval_results"), + help="결과 저장 디렉토리 (default: eval_results)", + ) + + # 필터링 옵션 + parser.add_argument( + "--limit", + type=int, + help="평가할 최대 샘플 수 (디버깅용)", + ) + parser.add_argument( + "--qa-types", + nargs="+", + help="특정 QA 타입만 평가 (예: lookup compare arithmetic)", + ) + + args = parser.parse_args() + + # 인자 검증 + if not args.domain and not args.all_domains: + parser.error("--domain 또는 --all-domains 중 하나를 지정해야 합니다.") + + # 설정 생성 + config = EvalConfig( + domain=args.domain or "public", # all-domains일 때 기본값 + vllm_url=args.vllm_url, + model=args.model, + max_tokens=args.max_tokens, + temperature=args.temperature, + max_concurrent=args.max_concurrent, + timeout=args.timeout, + use_judge=args.use_judge, + judge_provider=args.judge_provider, + judge_model=args.judge_model, + judge_api_key=args.judge_api_key, + output_dir=args.output_dir, + limit=args.limit, + qa_types=args.qa_types, + ) + + # 평가 실행 + if args.all_domains: + all_results = asyncio.run(evaluate_all_domains(config)) + + # 전체 요약 저장 + if config.output_dir: + config.output_dir.mkdir(parents=True, exist_ok=True) + + all_summary = {} + for domain, (results, aggregated, metadata) in all_results.items(): + if results: + save_results(results, aggregated, metadata, config.output_dir, domain) + all_summary[domain] = { + "total_count": aggregated.total_count, + "exact_match": aggregated.exact_match_avg, + "f1_score": aggregated.f1_score_avg, + } + + summary_path = config.output_dir / "all_domains_summary.json" + with open(summary_path, "w", encoding="utf-8") as f: + json.dump(all_summary, ensure_ascii=False, indent=2, fp=f) + logger.info(f"Saved all-domains summary to {summary_path}") + else: + results, 
@dataclass
class FilterConfig:
    """Settings for measuring QA difficulty and filtering by it."""
    vllm_url: str = "http://localhost:8000/v1"
    model_name: str = ""  # empty: detect from the vLLM server
    trials: int = 10  # attempts per QA
    min_accuracy: float = 0.3  # lower bound of the target accuracy (inclusive)
    max_accuracy: float = 0.6  # upper bound of the target accuracy (inclusive)
    temperature: float = 0.7  # non-zero to get varied responses
    max_tokens: int = 512
    timeout: int = 60
    max_workers: int = 4  # parallelism setting; not read by the functions in this block


@dataclass
class QADifficultyResult:
    """Measured difficulty of one QA pair."""
    pair_id: str
    table_index: int
    qa_index: int
    question: str
    answer: str
    qa_type: str
    correct_count: int
    total_trials: int
    accuracy: float
    responses: List[str] = field(default_factory=list)
    difficulty_category: str = ""  # e.g. easy, medium, hard, very_hard

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict; only the first 3 responses are kept."""
        return {
            "pair_id": self.pair_id,
            "table_index": self.table_index,
            "qa_index": self.qa_index,
            "question": self.question,
            "answer": self.answer,
            "qa_type": self.qa_type,
            "correct_count": self.correct_count,
            "total_trials": self.total_trials,
            "accuracy": self.accuracy,
            "difficulty_category": self.difficulty_category,
            "sample_responses": self.responses[:3],  # samples only
        }


def get_vllm_model_name(vllm_url: str) -> str:
    """Return the id of the first model listed by the vLLM server.

    Falls back to ``"default"`` when the request fails or no model is listed.
    """
    import requests
    try:
        response = requests.get(f"{vllm_url}/models", timeout=10)
        response.raise_for_status()
        models = response.json().get("data", [])
        if models:
            return models[0]["id"]
    except Exception as e:
        logger.warning(f"Failed to get model name from vLLM: {e}")
    return "default"


def find_table_images(output_dir: Path, pair_id: str) -> List[Path]:
    """Return the sorted ``images/{pair_id}_table_*.png`` files of a pair.

    ``pair_id`` is matched literally. Returns an empty list when the
    ``images`` directory does not exist.
    """
    import glob  # local import: only glob.escape is needed here

    images_dir = output_dir / "images"
    if not images_dir.exists():
        return []

    # Escape the id so glob metacharacters in it cannot match other pairs.
    return sorted(images_dir.glob(f"{glob.escape(pair_id)}_table_*.png"))


def encode_image_base64(image_path: Path) -> str:
    """Return the file's bytes encoded as a base64 string."""
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")


def run_single_inference(
    vllm_url: str,
    model_name: str,
    image_base64: str,
    question: str,
    config: FilterConfig,
) -> Optional[str]:
    """Ask the model one question about one table image.

    Args:
        vllm_url: Base URL of the OpenAI-compatible vLLM endpoint.
        model_name: Model id sent in the request.
        image_base64: Base64-encoded PNG, sent as a data URL.
        question: Question text.
        config: Supplies ``max_tokens``, ``temperature`` and ``timeout``.

    Returns:
        The stripped answer text, or None when the request or the parsing of
        its response fails.
    """
    import requests

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{image_base64}"
                    }
                },
                {
                    "type": "text",
                    "text": f"Based on the table image, answer the following question concisely.\n\nQuestion: {question}\n\nAnswer:"
                }
            ]
        }
    ]

    try:
        response = requests.post(
            f"{vllm_url}/chat/completions",
            json={
                "model": model_name,
                "messages": messages,
                "max_tokens": config.max_tokens,
                "temperature": config.temperature,
            },
            timeout=config.timeout,
        )
        response.raise_for_status()
        result = response.json()
        return result["choices"][0]["message"]["content"].strip()
    except Exception as e:
        # A failed request is counted as a wrong trial by the caller, so it
        # must be visible at the default log level.
        logger.warning(f"Inference failed: {e}")
        return None


def check_answer_correct(prediction: str, ground_truth: str) -> bool:
    """Decide whether a prediction counts as correct.

    A prediction is correct when it is an exact match, when its F1 score is
    above 0.8, or when one normalized answer contains the other. An empty
    prediction, or one that is empty after normalization, is never correct.
    """
    if not prediction:
        return False

    if exact_match(prediction, ground_truth):
        return True

    if f1_score(prediction, ground_truth) > 0.8:
        return True

    norm_pred = normalize_answer(prediction)
    norm_gt = normalize_answer(ground_truth)
    # The empty string is a substring of every string, so without this guard
    # a prediction that normalizes to nothing would always be accepted.
    if not norm_pred or not norm_gt:
        return False

    return norm_gt in norm_pred or norm_pred in norm_gt


def measure_qa_difficulty(
    vllm_url: str,
    model_name: str,
    image_base64: str,
    question: str,
    answer: str,
    config: FilterConfig,
) -> Tuple[int, List[str]]:
    """Ask the same question ``config.trials`` times and count correct answers.

    Returns:
        (number of correct answers, list of non-empty responses). Failed or
        empty responses are not recorded and do not count as correct.
    """
    correct_count = 0
    responses = []

    for _ in range(config.trials):
        response = run_single_inference(
            vllm_url, model_name, image_base64, question, config
        )
        if response:
            responses.append(response)
            if check_answer_correct(response, answer):
                correct_count += 1

    return correct_count, responses
def filter_qa_for_domain(
    domain: str,
    config: FilterConfig,
    limit: Optional[int] = None,
    dry_run: bool = False,
) -> Dict[str, Any]:
    """Measure the difficulty of a domain's QA pairs and pick review candidates.

    Reads ``pipeline_output.json`` from the domain's output directory, asks
    the model each question ``config.trials`` times against the entry's first
    table image, and writes two timestamped files into that directory:
    ``qa_difficulty_analysis_*.json`` (all results) and
    ``qa_for_review_*.json`` (QAs whose accuracy lies within
    ``[config.min_accuracy, config.max_accuracy]``).

    Args:
        domain: Key of ``DOMAIN_DIRS``.
        config: Filtering settings.
        limit: When set (and non-zero), only the first ``limit`` entries are
            processed.
        dry_run: Only count entries, QA pairs and images; no inference and no
            files are written.

    Returns:
        A summary dict. For a dry run it holds the counts; otherwise the
        statistics and the paths of the two written files.

    Raises:
        ValueError: ``domain`` is not a key of ``DOMAIN_DIRS``.
        FileNotFoundError: ``pipeline_output.json`` is missing, or the images
            directory is missing on a real run.
    """
    domain_dir = DOMAIN_DIRS.get(domain)
    if not domain_dir:
        raise ValueError(f"Unknown domain: {domain}")

    output_dir = project_root / domain_dir
    pipeline_output_path = output_dir / "pipeline_output.json"

    if not pipeline_output_path.exists():
        raise FileNotFoundError(f"pipeline_output.json not found: {pipeline_output_path}")

    images_dir = output_dir / "images"

    with open(pipeline_output_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    logger.info(f"Loaded {len(data)} entries from {pipeline_output_path}")

    if limit:
        data = data[:limit]
        logger.info(f"Limited to {limit} entries")

    if dry_run:
        # Count only; a missing images directory is reported, not fatal.
        total_qa = sum(len(entry.get("qa_results", [])) for entry in data)
        images_exist = images_dir.exists()
        image_count = len(list(images_dir.glob("*.png"))) if images_exist else 0
        logger.info(f"Dry run: {len(data)} entries, {total_qa} QA pairs")
        logger.info(f"Images directory: {'exists' if images_exist else 'NOT FOUND'} ({image_count} images)")
        if not images_exist:
            # The message names the capture script that ships with this tool.
            logger.warning("이미지 디렉토리가 없습니다. 먼저 capture_html_images.py를 실행하세요.")
        return {
            "domain": domain,
            "entries": len(data),
            "total_qa": total_qa,
            "images_exist": images_exist,
            "image_count": image_count,
            "dry_run": True,
        }

    # A real run needs the captured images.
    if not images_dir.exists():
        raise FileNotFoundError(
            f"Images directory not found: {images_dir}\n"
            "먼저 capture_html_images.py를 실행하여 HTML을 이미지로 변환하세요."
        )

    # Use the configured model name, or ask the server.
    model_name = config.model_name or get_vllm_model_name(config.vllm_url)
    logger.info(f"Using model: {model_name}")

    all_results: List[QADifficultyResult] = []
    stats = {
        "total_qa": 0,
        "too_easy": 0,
        "easy": 0,
        "medium": 0,
        "hard": 0,
        "very_hard": 0,
        "skipped": 0,
    }

    for entry_idx, entry in enumerate(data):
        pair_id = entry.get("pair_id", entry.get("name", f"entry_{entry_idx}"))
        qa_results = entry.get("qa_results", [])

        if not qa_results:
            continue

        image_files = find_table_images(output_dir, pair_id)
        if not image_files:
            logger.warning(f"No images found for {pair_id}, skipping")
            stats["skipped"] += len(qa_results)
            continue

        # Only the first image is used (TODO: multi-image support).
        image_base64 = encode_image_base64(image_files[0])

        logger.info(f"[{entry_idx + 1}/{len(data)}] Processing {pair_id} ({len(qa_results)} QAs)")

        for qa_idx, qa in enumerate(qa_results):
            question = qa.get("question", "")
            answer = qa.get("answer", "")
            qa_type = qa.get("type", "unknown")

            # A QA without a question or an answer cannot be scored.
            if not question or not answer:
                stats["skipped"] += 1
                continue

            stats["total_qa"] += 1

            correct_count, responses = measure_qa_difficulty(
                config.vllm_url,
                model_name,
                image_base64,
                question,
                answer,
                config,
            )

            # The denominator is the number of trials, so a failed request
            # counts as a wrong answer.
            accuracy = correct_count / config.trials if config.trials > 0 else 0
            difficulty = categorize_difficulty(accuracy)
            stats[difficulty] += 1

            all_results.append(
                QADifficultyResult(
                    pair_id=pair_id,
                    table_index=0,
                    qa_index=qa_idx,
                    question=question,
                    answer=answer,
                    qa_type=qa_type,
                    correct_count=correct_count,
                    total_trials=config.trials,
                    accuracy=accuracy,
                    responses=responses,
                    difficulty_category=difficulty,
                )
            )

            # Progress log; the mark shows whether the QA is in the target range.
            status = "✓" if config.min_accuracy <= accuracy <= config.max_accuracy else "✗"
            logger.info(f" [{qa_idx + 1}/{len(qa_results)}] {qa_type}: {correct_count}/{config.trials} ({accuracy:.0%}) [{difficulty}] {status}")

    # Keep the QAs whose accuracy is inside the target range.
    filtered_results = [
        r for r in all_results
        if config.min_accuracy <= r.accuracy <= config.max_accuracy
    ]

    # Full analysis file.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = output_dir / f"qa_difficulty_analysis_{timestamp}.json"

    output_data = {
        "domain": domain,
        "config": {
            "vllm_url": config.vllm_url,
            "model_name": model_name,
            "trials": config.trials,
            "min_accuracy": config.min_accuracy,
            "max_accuracy": config.max_accuracy,
            "temperature": config.temperature,
        },
        "stats": stats,
        "filtered_count": len(filtered_results),
        "all_results": [r.to_dict() for r in all_results],
        "filtered_for_review": [r.to_dict() for r in filtered_results],
        "timestamp": timestamp,
    }

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    logger.info(f"Results saved to {output_file}")

    # Short list for human review.
    review_file = output_dir / f"qa_for_review_{timestamp}.json"
    review_data = {
        "domain": domain,
        "description": f"QA pairs with accuracy between {config.min_accuracy:.0%} and {config.max_accuracy:.0%}",
        "count": len(filtered_results),
        "items": [
            {
                "pair_id": r.pair_id,
                "qa_type": r.qa_type,
                "question": r.question,
                "answer": r.answer,
                "accuracy": f"{r.accuracy:.0%} ({r.correct_count}/{r.total_trials})",
                "sample_model_responses": r.responses[:3],
            }
            for r in filtered_results
        ],
    }

    with open(review_file, "w", encoding="utf-8") as f:
        json.dump(review_data, f, ensure_ascii=False, indent=2)

    logger.info(f"Review list saved to {review_file}")

    return {
        "domain": domain,
        "total_qa": stats["total_qa"],
        "stats": stats,
        "filtered_for_review": len(filtered_results),
        "output_file": str(output_file),
        "review_file": str(review_file),
    }
+Examples: + # business 도메인 필터링 + python filter_qa_by_difficulty.py --domain business + + # 모든 도메인 + python filter_qa_by_difficulty.py --all + + # 커스텀 설정 (5회 시도, 20-50% 정확도) + python filter_qa_by_difficulty.py --domain business --trials 5 --min-acc 0.2 --max-acc 0.5 + + # vLLM 서버 지정 + python filter_qa_by_difficulty.py --domain business --vllm-url http://localhost:8000/v1 + + # 테스트 (3개 entry만) + python filter_qa_by_difficulty.py --domain business --limit 3 + +Difficulty Categories: + - too_easy: 90-100% accuracy (제외) + - easy: 70-89% accuracy + - medium: 30-69% accuracy (검수 대상) + - hard: 1-29% accuracy + - very_hard: 0% accuracy + """ + ) + + parser.add_argument( + "--domain", + nargs="+", + choices=list(DOMAIN_DIRS.keys()), + help="필터링할 도메인(들)", + ) + parser.add_argument( + "--all", + action="store_true", + help="모든 도메인 필터링", + ) + parser.add_argument( + "--vllm-url", + default="http://localhost:8000/v1", + help="vLLM 서버 URL (default: http://localhost:8000/v1)", + ) + parser.add_argument( + "--model", + default="", + help="모델 이름 (미지정시 vLLM에서 자동 감지)", + ) + parser.add_argument( + "--trials", + type=int, + default=10, + help="각 QA당 시도 횟수 (default: 10)", + ) + parser.add_argument( + "--min-acc", + type=float, + default=0.3, + help="최소 정확도 (default: 0.3)", + ) + parser.add_argument( + "--max-acc", + type=float, + default=0.6, + help="최대 정확도 (default: 0.6)", + ) + parser.add_argument( + "--temperature", + type=float, + default=0.7, + help="샘플링 temperature (default: 0.7)", + ) + parser.add_argument( + "--limit", + type=int, + help="처리할 최대 entry 수 (테스트용)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="실제 추론 없이 확인만", + ) + + args = parser.parse_args() + + # 도메인 결정 + if args.all: + domains = list(DOMAIN_DIRS.keys()) + elif args.domain: + domains = args.domain + else: + parser.error("--domain 또는 --all을 지정해야 합니다.") + + # 설정 + config = FilterConfig( + vllm_url=args.vllm_url, + model_name=args.model, + trials=args.trials, + min_accuracy=args.min_acc, 
+ max_accuracy=args.max_acc, + temperature=args.temperature, + ) + + logger.info(f"Domains: {domains}") + logger.info(f"Config: trials={config.trials}, accuracy range={config.min_accuracy:.0%}-{config.max_accuracy:.0%}") + + # 각 도메인 처리 + results = [] + for domain in domains: + logger.info(f"\n{'='*60}") + logger.info(f"Processing domain: {domain}") + logger.info(f"{'='*60}") + + try: + result = filter_qa_for_domain( + domain=domain, + config=config, + limit=args.limit, + dry_run=args.dry_run, + ) + results.append(result) + except Exception as e: + logger.error(f"Failed to process {domain}: {e}") + results.append({"domain": domain, "error": str(e)}) + + # 요약 + print("\n" + "=" * 60) + print(" QA Difficulty Filtering Summary") + print("=" * 60) + for result in results: + domain = result.get("domain", "unknown") + if "error" in result: + print(f" {domain}: ERROR - {result['error']}") + elif result.get("dry_run"): + img_status = "✓" if result.get("images_exist") else "✗ (run capture_html_to_images.py first)" + print(f" {domain}: {result.get('total_qa', 0)} QA pairs, {result.get('image_count', 0)} images {img_status} (dry run)") + else: + stats = result.get("stats", {}) + filtered = result.get("filtered_for_review", 0) + total = result.get("total_qa", 0) + print(f" {domain}:") + print(f" Total QA: {total}") + print(f" too_easy: {stats.get('too_easy', 0)}, easy: {stats.get('easy', 0)}") + print(f" medium: {stats.get('medium', 0)}, hard: {stats.get('hard', 0)}, very_hard: {stats.get('very_hard', 0)}") + print(f" → For review: {filtered} ({filtered/total*100:.1f}% of total)" if total > 0 else "") + print("=" * 60) + + +if __name__ == "__main__": + main() diff --git a/generate_synthetic_table/flow.py b/generate_synthetic_table/flow.py index 8be68dd..ce2d976 100644 --- a/generate_synthetic_table/flow.py +++ b/generate_synthetic_table/flow.py @@ -49,6 +49,7 @@ class TableState(TypedDict, total=False): synthetic_json: dict # 파싱된 합성 데이터 JSON qa_results: List[Dict] # 생성된 QA 쌍 
def generate_long_sequence_node(llm: ChatOpenAI) -> Callable[[TableState], TableState]:
    """Build the graph node that appends context-dependent ``long_sequence`` QA pairs.

    The returned node formats the domain's ``generate_long_sequence`` prompt
    with the state's ``synthetic_table`` HTML, calls the LLM once, and appends
    the ``qa_pairs`` from the parsed reply to the state's ``qa_results``.

    The node is best-effort. It returns the state unchanged when:
      * the state already carries ``errors``,
      * no ``generate_long_sequence`` prompt exists for the domain,
      * there is no synthetic table, or
      * the prompt template is missing an expected placeholder.

    Args:
        llm: Chat model handed to ``_call_llm``.

    Returns:
        A node function mapping a ``TableState`` to an updated ``TableState``.
        Whenever the LLM was actually called, ``token_usage`` in the returned
        state includes the tokens of that call, even if the reply could not be
        parsed.
    """

    def _node(state: TableState) -> TableState:
        logger.info("Entering node: generate_long_sequence")

        # An upstream failure makes further generation pointless; check this
        # first so no prompt is loaded for a state that will be passed through.
        if state.get("errors"):
            return state

        # The long_sequence prompt is optional per domain: skip when absent.
        try:
            prompt_template = _load_prompt("generate_long_sequence", state.get("domain"))
        except ValueError:
            logger.info("No generate_long_sequence prompt found, skipping long_sequence generation")
            return state

        synthetic_html = state.get("synthetic_table")
        if not synthetic_html:
            logger.warning("No synthetic table for long_sequence generation, skipping")
            return state

        try:
            prompt = prompt_template.format(synthetic_html=synthetic_html)
        except KeyError as e:
            logger.warning(f"long_sequence prompt missing placeholder: {e}, skipping")
            return state

        response_text, token_usage = _call_llm(llm, prompt, return_token_usage=True)

        logger.info(f"Long sequence generation token usage: {token_usage}")

        # Tokens are spent whether or not the reply parses, so account for
        # them on every path from here on. `or 0` tolerates a missing/None
        # counter (assumes counters are ints otherwise — TODO confirm).
        total_token_usage = (state.get("token_usage") or 0) + (token_usage or 0)

        response_json = robust_json_parse(response_text)

        # Only a dict carrying a list under 'qa_pairs' is usable; anything
        # else (None, a bare list, a non-list value) counts as a failed parse.
        long_seq_qa = response_json.get("qa_pairs") if isinstance(response_json, dict) else None
        if not isinstance(long_seq_qa, list):
            logger.warning("long_sequence generation did not return valid JSON or 'qa_pairs' key.")
            return {**state, "token_usage": total_token_usage}

        # Copy before extending so the incoming state's list is not mutated.
        existing_qa = list(state.get("qa_results") or [])
        existing_qa.extend(long_seq_qa)
        logger.info(f"Added {len(long_seq_qa)} long_sequence QA pairs. Total QA: {len(existing_qa)}")
        return {**state, "qa_results": existing_qa, "token_usage": total_token_usage}

    return _node
Using cross-image QA prompt.") + try: + prompt_template = _load_prompt("generate_qa_from_multi_image", state.get("domain")) + except ValueError: + logger.warning("Multi-image prompt not found, falling back to single-image prompt") + prompt_template = _load_prompt("generate_qa_from_image", state.get("domain")) + else: + prompt_template = _load_prompt("generate_qa_from_image", state.get("domain")) + prompt = prompt_template response_text, token_usage = _call_llm(llm, prompt, image_urls=image_data_urls, return_token_usage=True) @@ -727,7 +793,10 @@ def _node(state: TableState) -> TableState: logger.warning("QA generation from image did not return valid JSON or 'qa_pairs' key.") logger.info(f"Returning token_usage: {token_usage}") - return {**state, "qa_results": qa_results, "token_usage": token_usage} + result_state = {**state, "qa_results": qa_results, "token_usage": token_usage} + if is_multi_image: + result_state["is_multi_image"] = True + return result_state return _node @@ -800,6 +869,7 @@ def build_synthetic_table_graph( if not skip_qa: graph.add_node("generate_qa", generate_qa_node(llm)) + graph.add_node("generate_long_sequence", generate_long_sequence_node(llm)) # Routing based on provider and input type def route_start(state: TableState) -> str: @@ -852,7 +922,8 @@ def route_start(state: TableState) -> str: graph.add_edge("parse_synthetic_table", END) else: graph.add_edge("parse_synthetic_table", "generate_qa") - graph.add_edge("generate_qa", END) + graph.add_edge("generate_qa", "generate_long_sequence") + graph.add_edge("generate_long_sequence", END) return graph diff --git a/generate_synthetic_table/prompts/academic.yaml b/generate_synthetic_table/prompts/academic.yaml index 87543eb..ea2caed 100644 --- a/generate_synthetic_table/prompts/academic.yaml +++ b/generate_synthetic_table/prompts/academic.yaml @@ -6,7 +6,7 @@ generate_qa: | {synthetic_html} ### [Instructions] - 1. **Candidate Generation & Filtering**: Internally generate 10 diverse QA pairs first. 
Then, for each Reasoning Type, select and output only the single most perfect QA pair that passes the [Validation Criteria]. (Total 10 pairs) + 1. **Candidate Generation & Filtering**: Internally generate 9 diverse QA pairs first. Then, for each Reasoning Type, select and output only the single most perfect QA pair that passes the [Validation Criteria]. (Total 9 pairs) 2. **Domain Suitability**: Questions must maintain an academic tone and accurately handle experimental results, performance metrics (Accuracy, F1-score, etc.), model names, and statistical significance. 3. **Strict Constraints**: - Answers must be derived ONLY from the table (and provided context). No external knowledge. @@ -14,6 +14,7 @@ generate_qa: | - Output format must strictly follow JSON. - Questions and Answers MUST be written in Korean. - reasoning_annotation MUST be written in English and MUST be a single string (not a list). + - **DO NOT use real model/dataset names** (e.g., BERT, GPT, ResNet, ImageNet). Use fictional names like "Model-A", "Dataset-X", "Method-알파". ### [Validation Criteria] - Is the answer uniquely determined within the table? @@ -21,16 +22,15 @@ generate_qa: | - Is the question clear and unambiguous? (e.g., "Best model" -> "Model with highest Accuracy") ### [Reasoning Type Definitions (Academic Domain)] - (1) lookup: Retrieve specific model performance or value without condition/calculation. (e.g., "What is the ImageNet Top-1 Accuracy of ResNet-50?") + (1) lookup: Retrieve specific model performance or value without condition/calculation. (e.g., "What is the Top-1 Accuracy of Model-A?") (2) filter: Select rows/columns meeting specific conditions (performance, params, etc.). (e.g., "List all models with parameters under 10M.") - (3) aggregate: Statistical aggregation of experimental results (Sum, Avg, Max, Min, Count). (e.g., "What is the average F1-score of all BERT variants?") - (4) compare: Compare performance against baseline or between models. 
(e.g., "Does the Proposed Method have a higher BLEU score than SOTA?") + (3) aggregate: Statistical aggregation of experimental results (Sum, Avg, Max, Min, Count). (e.g., "What is the average F1-score of all model variants?") + (4) compare: Compare performance against baseline or between models. (e.g., "Does the Proposed Method have a higher BLEU score than the baseline?") (5) arithmetic: Specific calculation beyond simple comparison (difference, growth rate). (e.g., "What is the percentage improvement of Large model over Base model?") (6) temporal: Deduce trends over years or epochs. (e.g., "Which model published after 2020 has the best performance?") (7) multi_hop: Multi-step inference finding a value first, then using it as a key. (e.g., "What is the Precision of the model with the highest Recall?") (8) implicit_reference: Referring to specific metrics contextually without explicit column name. (e.g., "Which is the best performing model?" -> implies Bolded value or Accuracy column) (9) ellipsis: Recovering omitted comparisons or criteria from table structure. (e.g., "How much did performance drop in 'w/o attention'?" -> implies comparison to Full Model) - (10) long_sequence (Context-Dependent): Requires interpreting 'Experimental Setup' or 'Hypothesis' text (Context) to filter table data. **Requirement**: Must generate a hypothetical [Context] paragraph needed to solve the question. ### [Output Format (JSON)] {{ @@ -42,7 +42,7 @@ generate_qa: | "reasoning_annotation": "Step-by-step logic to derive answer (MUST be a string, not a list)", "context": null }}, - ... (One per Reasoning Type => Total 10) + ... (One per Reasoning Type => Total 9) ] }} @@ -51,7 +51,7 @@ generate_qa_from_image: | Your mission is to analyze the provided academic table image and generate Question-Answer (QA) pairs that fit the specified Reasoning Type definitions. ### [Instructions] - 1. **Candidate Generation & Filtering**: Internally generate 10 diverse QA pairs first. 
Then, for each Reasoning Type, select and output only the single most perfect QA pair that passes the [Validation Criteria]. (Total 10 pairs) + 1. **Candidate Generation & Filtering**: Internally generate 9 diverse QA pairs first. Then, for each Reasoning Type, select and output only the single most perfect QA pair that passes the [Validation Criteria]. (Total 9 pairs) 2. **Domain Suitability**: Questions must maintain an academic tone and accurately handle experimental results, performance metrics (Accuracy, F1-score, etc.), model names, and statistical significance. 3. **Strict Constraints**: - Answers must be derived ONLY from the table (and provided context). No external knowledge. @@ -59,6 +59,7 @@ generate_qa_from_image: | - Output format must strictly follow JSON. - Questions and Answers MUST be written in Korean. - reasoning_annotation MUST be written in English and MUST be a single string (not a list). + - **DO NOT use real model/dataset names** (e.g., BERT, GPT, ResNet, ImageNet). Use fictional names like "Model-A", "Dataset-X", "Method-알파". ### [Validation Criteria] - Is the answer uniquely determined within the table? @@ -66,16 +67,15 @@ generate_qa_from_image: | - Is the question clear and unambiguous? (e.g., "Best model" -> "Model with highest Accuracy") ### [Reasoning Type Definitions (Academic Domain)] - (1) lookup: Retrieve specific model performance or value without condition/calculation. (e.g., "What is the ImageNet Top-1 Accuracy of ResNet-50?") + (1) lookup: Retrieve specific model performance or value without condition/calculation. (e.g., "What is the Top-1 Accuracy of Model-A?") (2) filter: Select rows/columns meeting specific conditions (performance, params, etc.). (e.g., "List all models with parameters under 10M.") - (3) aggregate: Statistical aggregation of experimental results (Sum, Avg, Max, Min, Count). (e.g., "What is the average F1-score of all BERT variants?") - (4) compare: Compare performance against baseline or between models. 
(e.g., "Does the Proposed Method have a higher BLEU score than SOTA?") + (3) aggregate: Statistical aggregation of experimental results (Sum, Avg, Max, Min, Count). (e.g., "What is the average F1-score of all model variants?") + (4) compare: Compare performance against baseline or between models. (e.g., "Does the Proposed Method have a higher BLEU score than the baseline?") (5) arithmetic: Specific calculation beyond simple comparison (difference, growth rate). (e.g., "What is the percentage improvement of Large model over Base model?") (6) temporal: Deduce trends over years or epochs. (e.g., "Which model published after 2020 has the best performance?") (7) multi_hop: Multi-step inference finding a value first, then using it as a key. (e.g., "What is the Precision of the model with the highest Recall?") (8) implicit_reference: Referring to specific metrics contextually without explicit column name. (e.g., "Which is the best performing model?" -> implies Bolded value or Accuracy column) (9) ellipsis: Recovering omitted comparisons or criteria from table structure. (e.g., "How much did performance drop in 'w/o attention'?" -> implies comparison to Full Model) - (10) long_sequence (Context-Dependent): Requires interpreting 'Experimental Setup' or 'Hypothesis' text (Context) to filter table data. **Requirement**: Must generate a hypothetical [Context] paragraph needed to solve the question. ### [Output Format (JSON)] {{ @@ -87,7 +87,59 @@ generate_qa_from_image: | "reasoning_annotation": "Step-by-step logic to derive answer (MUST be a string, not a list)", "context": null }}, - ... (One per Reasoning Type => Total 10) + ... (One per Reasoning Type => Total 9) + ] + }} + Return ONLY the JSON object. + +generate_qa_from_multi_image: | + You are an 'AI Data Researcher' specialized in building high-quality QA datasets that require understanding MULTIPLE academic/scientific table images together. 
+ Your mission is to analyze ALL provided academic table images and generate Question-Answer (QA) pairs that REQUIRE information from MULTIPLE images to answer. + + **⚠️ CRITICAL REQUIREMENT: CROSS-IMAGE REASONING ⚠️** + - Every QA pair MUST require information from AT LEAST TWO images to answer correctly. + - Questions answerable from a single image are INVALID. + - Focus on comparisons, aggregations, or inferences that span multiple experimental results, model comparisons, or benchmark tables. + + ### [Instructions] + 1. **Analyze All Images**: First, understand what data each image contains and how they relate (e.g., different datasets, different model ablations, training vs test results). + 2. **Generate Cross-Image QA**: Create 9 diverse QA pairs where each question requires synthesizing information from multiple images. + 3. **Strict Constraints**: + - Answers must be derived from combining data across images. No external knowledge. + - Each QA pair must correspond to exactly one Reasoning Type. + - Output format must strictly follow JSON. + - Questions and Answers MUST be written in Korean. + - reasoning_annotation MUST be written in English, MUST be a single string, and MUST specify which images were used. + - **DO NOT use real model/dataset names** (e.g., BERT, GPT, ImageNet). Use fictional names like "Model-A", "Dataset-X", "Method-알파". + + ### [Validation Criteria] + - Does the answer REQUIRE data from multiple images? (Single-image answers are INVALID) + - Is the reasoning process logically flawless? + - Is the question clear about what experimental data is being compared or combined? + + ### [Cross-Image Reasoning Type Definitions (Academic Domain)] + (1) cross_lookup: Retrieve and combine performance values from different result tables. (e.g., "What is Model-A's accuracy on both Dataset-X and Dataset-Y from the two tables?") + (2) cross_filter: Filter models across benchmark tables based on conditions. 
(e.g., "Which models achieve >90% accuracy on both datasets shown in the two images?") + (3) cross_aggregate: Aggregate experimental results spanning multiple benchmarks. (e.g., "What is the average F1-score of Method-가 across all evaluation tables?") + (4) cross_compare: Compare model performance between different experimental settings. (e.g., "Does the proposed method outperform the baseline on both in-domain and out-of-domain tests?") + (5) cross_arithmetic: Calculate performance differences using data from multiple tables. (e.g., "What is the accuracy improvement of Model-A from the ablation table to the full model table?") + (6) cross_temporal: Identify experimental trends by combining multiple result tables. (e.g., "Based on both training curves, which model converges faster?") + (7) cross_multi_hop: Multi-step academic inference across tables. (e.g., "Find the best model on Dataset-X in Image 1, then find its parameters in Image 2.") + (8) cross_implicit: Answer questions requiring implicit understanding of relationships between results. (e.g., "Which approach is most efficient?" requires combining accuracy and parameter count from multiple tables) + (9) cross_synthesis: Synthesize research insights only possible by viewing all tables together. (e.g., "Based on both the main results and ablation study, which component contributes most to performance?") + + ### [Output Format (JSON)] + {{ + "qa_pairs": [ + {{ + "question": "Question requiring multiple academic images to answer", + "answer": "Answer derived from multiple images", + "type": "cross_lookup", + "reasoning_annotation": "Step 1: From Image 1, extract X. Step 2: From Image 2, extract Y. Step 3: Combine to get answer.", + "context": null, + "images_used": ["image_1", "image_2"] + }}, + ... (One per Reasoning Type => Total 9) ] }} Return ONLY the JSON object. @@ -111,8 +163,9 @@ generate_synthetic_table: | 3. 
**⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:** - **ALL data cell values MUST be replaced with completely new synthetic values.** - **NEVER copy any original data values** - generate fresh, realistic alternatives. - - For student/model names: Generate DIFFERENT names - - For university names: Generate DIFFERENT names + - **NEVER use real model/dataset/university names** (BERT, GPT, ResNet, ImageNet, MIT, Stanford, etc.). Use fictional names like "Model-A", "Dataset-X", "University-가". + - For student/model names: Generate DIFFERENT fictional names + - For university names: Generate DIFFERENT fictional names - For grades/scores: Generate DIFFERENT realistic values - For course/research topics: Generate DIFFERENT titles - For dates: Generate DIFFERENT plausible dates @@ -151,7 +204,8 @@ generate_synthetic_table_from_image: | 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:** - **NEVER copy the data values from the image** - this is NOT an OCR task - **ALL cell content must be completely NEW and DIFFERENT** - - For student/model names: Generate DIFFERENT names + - **NEVER use real model/dataset/university names** (BERT, GPT, ResNet, ImageNet, MIT, Stanford, etc.). Use fictional names like "Model-A", "Dataset-X", "University-가". + - For student/model names: Generate DIFFERENT fictional names - For grades/scores: Generate DIFFERENT values - For course/research topics: Generate DIFFERENT titles 4. **Styling:** Use **Tailwind CSS** classes exclusively (NO inline styles). @@ -170,3 +224,40 @@ generate_synthetic_table_from_image: | - Score in image: "점수A" → Generate: "점수B" ⚠️ If the generated content is identical or very similar to the image, the output is INVALID. + +generate_long_sequence: | + You are an 'AI Data Researcher' specialized in creating context-dependent QA pairs for academic/scientific tables. + Your mission is to generate a single high-quality "long_sequence" type QA pair that requires interpreting external context to answer questions about the table. 
+ + **Input Table:** + {synthetic_html} + + ### [Instructions] + 1. **Generate ONE long_sequence QA pair** that requires reading and understanding a context paragraph to filter or interpret the table data. + 2. **Create a realistic academic context** (e.g., "Experimental Setup", "Research Hypothesis", "Ablation Study Goals") that provides information needed to answer the question. + 3. **The question must be unanswerable without the context** - the context should contain key criteria or conditions. + 4. **Strict Constraints**: + - Answer must be derived from BOTH the table AND the context. Neither alone is sufficient. + - Questions and Answers MUST be written in Korean. + - reasoning_annotation MUST be written in English and MUST be a single string. + - Context must be written in Korean and be 2-4 sentences long. + - **DO NOT use real model/dataset names** (e.g., BERT, GPT, ResNet). Use fictional names. + + ### [Example Scenarios (Academic)] + - Context describes experimental conditions (dataset size, hardware) → Question asks which models meet the criteria + - Context outlines baseline comparison requirements → Question asks which methods show improvement + - Context specifies evaluation metrics of interest → Question asks for rankings based on those metrics + + ### [Output Format (JSON)] + {{ + "qa_pairs": [ + {{ + "question": "Question requiring context to answer", + "answer": "Answer derived from table + context", + "type": "long_sequence", + "reasoning_annotation": "Step 1: Extract key criteria from context. Step 2: Apply criteria to table. Step 3: Derive answer.", + "context": "실험 설정에 따르면... (2-4 sentences of academic context in Korean)" + }} + ] + }} + Return ONLY the JSON object. 
diff --git a/generate_synthetic_table/prompts/business.yaml b/generate_synthetic_table/prompts/business.yaml index 18ebc27..40594c5 100644 --- a/generate_synthetic_table/prompts/business.yaml +++ b/generate_synthetic_table/prompts/business.yaml @@ -6,7 +6,7 @@ generate_qa: | {synthetic_html} ### [Instructions] - 1. **Candidate Generation & Filtering**: Internally generate 10 diverse QA pairs first. Then, for each Reasoning Type, select and output only the single most perfect QA pair that passes the [Validation Criteria]. (Total 10 pairs) + 1. **Candidate Generation & Filtering**: Internally generate 9 diverse QA pairs first. Then, for each Reasoning Type, select and output only the single most perfect QA pair that passes the [Validation Criteria]. (Total 9 pairs) 2. **Domain Suitability**: Questions must maintain a business tone and accurately handle revenue, profit margins, growth rates, market share, and employee performance metrics. 3. **Strict Constraints**: - Answers must be derived ONLY from the table (and provided context). No external knowledge. @@ -14,6 +14,7 @@ generate_qa: | - Output format must strictly follow JSON. - Questions and Answers MUST be written in Korean. - reasoning_annotation MUST be written in English and MUST be a single string (not a list). + - **DO NOT use real company names** (e.g., Samsung, Apple, Google). Use fictional names like "A사", "B기업", "가나다 주식회사". ### [Validation Criteria] - Is the answer uniquely determined within the table? @@ -30,7 +31,6 @@ generate_qa: | (7) multi_hop: Multi-step inference finding a value first, then using it as a key. (e.g., "What is the name of the Branch Manager of the branch with the #1 Revenue?") (8) implicit_reference: Referring to specific metrics contextually without explicit column name. (e.g., "Which is the most profitable project?" -> implies Profit Margin column) (9) ellipsis: Recovering omitted comparisons or criteria from table structure. (e.g., "What is Q4 performance?" 
-> implies continuation from Q1-Q3 context) - (10) long_sequence (Context-Dependent): Requires interpreting 'Management Goals' or 'Market Conditions' text (Context) to filter table data. **Requirement**: Must generate a hypothetical [Context] paragraph needed to solve the question. ### [Output Format (JSON)] {{ @@ -42,7 +42,7 @@ generate_qa: | "reasoning_annotation": "Step-by-step logic to derive answer (MUST be a string, not a list)", "context": null }}, - ... (One per Reasoning Type => Total 10) + ... (One per Reasoning Type => Total 9) ] }} @@ -51,7 +51,7 @@ generate_qa_from_image: | Your mission is to analyze the provided business table image and generate Question-Answer (QA) pairs that fit the specified Reasoning Type definitions. ### [Instructions] - 1. **Candidate Generation & Filtering**: Internally generate 10 diverse QA pairs first. Then, for each Reasoning Type, select and output only the single most perfect QA pair that passes the [Validation Criteria]. (Total 10 pairs) + 1. **Candidate Generation & Filtering**: Internally generate 9 diverse QA pairs first. Then, for each Reasoning Type, select and output only the single most perfect QA pair that passes the [Validation Criteria]. (Total 9 pairs) 2. **Domain Suitability**: Questions must maintain a business tone and accurately handle revenue, profit margins, growth rates, market share, and employee performance metrics. 3. **Strict Constraints**: - Answers must be derived ONLY from the table (and provided context). No external knowledge. @@ -59,6 +59,7 @@ generate_qa_from_image: | - Output format must strictly follow JSON. - Questions and Answers MUST be written in Korean. - reasoning_annotation MUST be written in English and MUST be a single string (not a list). + - **DO NOT use real company names** (e.g., Samsung, Apple, Google). Use fictional names like "A사", "B기업", "가나다 주식회사". ### [Validation Criteria] - Is the answer uniquely determined within the table? 
@@ -75,7 +76,6 @@ generate_qa_from_image: | (7) multi_hop: Multi-step inference finding a value first, then using it as a key. (e.g., "What is the name of the Branch Manager of the branch with the #1 Revenue?") (8) implicit_reference: Referring to specific metrics contextually without explicit column name. (e.g., "Which is the most profitable project?" -> implies Profit Margin column) (9) ellipsis: Recovering omitted comparisons or criteria from table structure. (e.g., "What is Q4 performance?" -> implies continuation from Q1-Q3 context) - (10) long_sequence (Context-Dependent): Requires interpreting 'Management Goals' or 'Market Conditions' text (Context) to filter table data. **Requirement**: Must generate a hypothetical [Context] paragraph needed to solve the question. ### [Output Format (JSON)] {{ @@ -87,7 +87,59 @@ generate_qa_from_image: | "reasoning_annotation": "Step-by-step logic to derive answer (MUST be a string, not a list)", "context": null }}, - ... (One per Reasoning Type => Total 10) + ... (One per Reasoning Type => Total 9) + ] + }} + Return ONLY the JSON object. + +generate_qa_from_multi_image: | + You are an 'AI Data Researcher' specialized in building high-quality QA datasets that require understanding MULTIPLE table images together. + Your mission is to analyze ALL provided business table images and generate Question-Answer (QA) pairs that REQUIRE information from MULTIPLE images to answer. + + **⚠️ CRITICAL REQUIREMENT: CROSS-IMAGE REASONING ⚠️** + - Every QA pair MUST require information from AT LEAST TWO images to answer correctly. + - Questions answerable from a single image are INVALID. + - Focus on comparisons, aggregations, or inferences that span multiple tables. + + ### [Instructions] + 1. **Analyze All Images**: First, understand what data each image contains and how they relate to each other (e.g., same company different periods, different departments, related metrics). + 2. 
**Generate Cross-Image QA**: Create 9 diverse QA pairs where each question requires synthesizing information from multiple images. + 3. **Strict Constraints**: + - Answers must be derived from combining data across images. No external knowledge. + - Each QA pair must correspond to exactly one Reasoning Type. + - Output format must strictly follow JSON. + - Questions and Answers MUST be written in Korean. + - reasoning_annotation MUST be written in English, MUST be a single string, and MUST specify which images were used. + - **DO NOT use real company names** (e.g., Samsung, Apple, Google). Use fictional names like "A사", "B기업", "가나다 주식회사". + + ### [Validation Criteria] + - Does the answer REQUIRE data from multiple images? (Single-image answers are INVALID) + - Is the reasoning process logically flawless? + - Is the question clear about what is being compared or combined? + + ### [Cross-Image Reasoning Type Definitions (Business Domain)] + (1) cross_lookup: Retrieve and combine specific values from different images. (e.g., "What is the total Q1 revenue of Branch A from both Table 1 and Table 2?") + (2) cross_filter: Filter rows across tables based on conditions spanning multiple images. (e.g., "Which departments appear in both tables and have positive profit margins in both?") + (3) cross_aggregate: Aggregate data spanning multiple images. (e.g., "What is the combined total revenue across all branches shown in both images?") + (4) cross_compare: Compare values or trends between different images. (e.g., "Which table shows higher average profit margin - Table 1 or Table 2?") + (5) cross_arithmetic: Calculate differences, ratios, or changes using data from multiple images. (e.g., "What is the revenue growth rate from the Q1 table to the Q2 table for Branch A?") + (6) cross_temporal: Identify trends or changes by combining time-series data from multiple images. 
(e.g., "Combining both annual reports, which department showed continuous growth?") + (7) cross_multi_hop: Multi-step inference requiring lookups across images. (e.g., "Find the top performer in Image 1, then find their metrics in Image 2.") + (8) cross_implicit: Answer questions requiring implicit understanding of relationships between images. (e.g., "Which region improved the most?" when improvement requires comparing two period tables) + (9) cross_synthesis: Synthesize insights that are only possible by viewing all images together. (e.g., "Based on both the budget table and the results table, which projects exceeded their targets?") + + ### [Output Format (JSON)] + {{ + "qa_pairs": [ + {{ + "question": "Question requiring multiple images to answer", + "answer": "Answer derived from multiple images", + "type": "cross_lookup", + "reasoning_annotation": "Step 1: From Image 1, extract X. Step 2: From Image 2, extract Y. Step 3: Combine to get answer.", + "context": null, + "images_used": ["image_1", "image_2"] + }}, + ... (One per Reasoning Type => Total 9) ] }} Return ONLY the JSON object. @@ -112,7 +164,8 @@ generate_synthetic_table: | 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:** - **ALL data cell values MUST be replaced with completely new synthetic values.** - **NEVER copy any original data values** - generate fresh, realistic alternatives. - - For company/team names: Generate DIFFERENT names (e.g., "A팀" → "B팀") + - **NEVER use real company/brand names** (Samsung, Apple, Google, 현대, LG, etc.). Use fictional names like "A사", "가나다 기업", "XYZ Corp". + - For company/team names: Generate DIFFERENT fictional names (e.g., "A팀" → "B팀") - For employee names: Generate DIFFERENT Korean names (e.g., "김OO" → "박OO") - For business metrics: Generate DIFFERENT numbers (e.g., "100억" → "150억") - For strategy/description text: Write DIFFERENT content with similar structure @@ -156,8 +209,9 @@ generate_synthetic_table_from_image: | 3. 
**⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:** - **NEVER copy the data values from the image** - this is NOT an OCR task - **ALL cell content must be completely NEW and DIFFERENT from the original** + - **NEVER use real company/brand names** (Samsung, Apple, Google, 현대, LG, etc.). Use fictional names like "A사", "가나다 기업", "XYZ Corp". - Generate COMPLETELY NEW synthetic business values for all data cells: - * For company/team names: Generate DIFFERENT names (e.g., "A팀" → "B팀") + * For company/team names: Generate DIFFERENT fictional names (e.g., "A팀" → "B팀") * For business metrics: Generate DIFFERENT numbers (e.g., "100억" → "150억") * For strategy/description text: Write DIFFERENT content with similar structure * For bullet point items: Create DIFFERENT but domain-appropriate items @@ -181,3 +235,40 @@ generate_synthetic_table_from_image: | ⚠️ If the generated content is identical or very similar to the image, the output is INVALID. Remember: The output should be a new synthetic business dataset, not a transcription of the original. + +generate_long_sequence: | + You are an 'AI Data Researcher' specialized in creating context-dependent QA pairs for business tables. + Your mission is to generate a single high-quality "long_sequence" type QA pair that requires interpreting external context to answer questions about the table. + + **Input Table:** + {synthetic_html} + + ### [Instructions] + 1. **Generate ONE long_sequence QA pair** that requires reading and understanding a context paragraph to filter or interpret the table data. + 2. **Create a realistic business context** (e.g., "Management Goals", "Market Conditions", "Strategic Guidelines") that provides information needed to answer the question. + 3. **The question must be unanswerable without the context** - the context should contain key criteria or conditions. + 4. **Strict Constraints**: + - Answer must be derived from BOTH the table AND the context. Neither alone is sufficient. 
+ - Questions and Answers MUST be written in Korean. + - reasoning_annotation MUST be written in English and MUST be a single string. + - Context must be written in Korean and be 2-4 sentences long. + - **DO NOT use real company names** (e.g., Samsung, Apple, Google). Use fictional names. + + ### [Example Scenarios (Business)] + - Context describes a target market condition → Question asks which products/departments meet the criteria + - Context outlines budget constraints → Question asks which projects are feasible + - Context specifies performance thresholds → Question asks which teams qualify + + ### [Output Format (JSON)] + {{ + "qa_pairs": [ + {{ + "question": "Question requiring context to answer", + "answer": "Answer derived from table + context", + "type": "long_sequence", + "reasoning_annotation": "Step 1: Extract key criteria from context. Step 2: Apply criteria to table. Step 3: Derive answer.", + "context": "경영 목표에 따르면... (2-4 sentences of business context in Korean)" + }} + ] + }} + Return ONLY the JSON object. diff --git a/generate_synthetic_table/prompts/default.yaml b/generate_synthetic_table/prompts/default.yaml index 0ca2645..4940e0a 100644 --- a/generate_synthetic_table/prompts/default.yaml +++ b/generate_synthetic_table/prompts/default.yaml @@ -77,6 +77,52 @@ generate_qa_from_image: | Return ONLY the JSON object, no additional text. +generate_qa_from_multi_image: | + You are an expert in creating educational and reasoning questions from tabular data. + Your task is to analyze ALL provided table images and generate Question-Answer (QA) pairs that REQUIRE information from MULTIPLE images to answer. + + **⚠️ CRITICAL REQUIREMENT: CROSS-IMAGE REASONING ⚠️** + - Every QA pair MUST require information from AT LEAST TWO images to answer correctly. + - Questions answerable from a single image are INVALID. + - Focus on comparisons, aggregations, or inferences that span multiple tables. + + ### [Instructions] + 1. 
**Analyze All Images**: First, understand what data each image contains and how they relate to each other. + 2. **Generate Cross-Image QA**: Create 5 diverse QA pairs where each question requires synthesizing information from multiple images. + 3. **Strict Constraints**: + - Answers must be derived from combining data across images. No external knowledge. + - Output format must strictly follow JSON. + - Questions and Answers MUST be written in Korean. + - reasoning_annotation MUST be written in English, MUST be a single string, and MUST specify which images were used. + - **DO NOT use real company/institution names**. Use fictional names. + + ### [Validation Criteria] + - Does the answer REQUIRE data from multiple images? (Single-image answers are INVALID) + - Is the reasoning process logically flawless? + - Is the question clear about what is being compared or combined? + + ### [Cross-Image Reasoning Types] + - **cross_lookup**: Retrieve and combine specific values from different images. + - **cross_compare**: Compare values or trends between different images. + - **cross_aggregate**: Aggregate data spanning multiple images. + - **cross_arithmetic**: Calculate differences, ratios, or changes using data from multiple images. + - **cross_synthesis**: Synthesize insights only possible by viewing all images together. + + ### [Output Format (JSON)] + {{ + "qa_pairs": [ + {{ + "question": "Question requiring multiple images to answer", + "answer": "Answer derived from multiple images", + "type": "cross_lookup", + "reasoning_annotation": "Step 1: From Image 1, extract X. Step 2: From Image 2, extract Y. Step 3: Combine to get answer.", + "images_used": ["image_1", "image_2"] + }}, + ... (Total 5 cross-image QA pairs) + ] + }} + Return ONLY the JSON object. + generate_synthetic_table: | You are a Synthetic Data Generator specialized in creating completely NEW data while preserving table structure. 
@@ -329,3 +375,40 @@ validate_parsed_table: | Return a JSON object with the following keys: - "valid": boolean (true if valid, false otherwise) - "reason": string (brief explanation of the decision) + +generate_long_sequence: | + You are an 'AI Data Researcher' specialized in creating context-dependent QA pairs for tables. + Your mission is to generate a single high-quality "long_sequence" type QA pair that requires interpreting external context to answer questions about the table. + + **Input Table:** + {synthetic_html} + + ### [Instructions] + 1. **Generate ONE long_sequence QA pair** that requires reading and understanding a context paragraph to filter or interpret the table data. + 2. **Create a realistic context paragraph** (e.g., guidelines, criteria, conditions) that provides information needed to answer the question. + 3. **The question must be unanswerable without the context** - the context should contain key criteria or conditions. + 4. **Strict Constraints**: + - Answer must be derived from BOTH the table AND the context. Neither alone is sufficient. + - Questions and Answers MUST be written in Korean. + - reasoning_annotation MUST be written in English and MUST be a single string. + - Context must be written in Korean and be 2-4 sentences long. + - **DO NOT use real company/institution names**. Use fictional names. + + ### [Example Scenarios] + - Context describes selection criteria → Question asks which items meet the criteria + - Context outlines rules/thresholds → Question asks which entries qualify + - Context specifies conditions → Question asks for items matching those conditions + + ### [Output Format (JSON)] + {{ + "qa_pairs": [ + {{ + "question": "Question requiring context to answer", + "answer": "Answer derived from table + context", + "type": "long_sequence", + "reasoning_annotation": "Step 1: Extract key criteria from context. Step 2: Apply criteria to table. Step 3: Derive answer.", + "context": "조건에 따르면... 
(2-4 sentences of context in Korean)" + }} + ] + }} + Return ONLY the JSON object. diff --git a/generate_synthetic_table/prompts/finance.yaml b/generate_synthetic_table/prompts/finance.yaml index 77d9927..6e700e0 100644 --- a/generate_synthetic_table/prompts/finance.yaml +++ b/generate_synthetic_table/prompts/finance.yaml @@ -6,7 +6,7 @@ generate_qa: | {synthetic_html} ### [Instructions] - 1. **Candidate Generation & Filtering**: Internally generate 10 diverse QA pairs first. Then, for each Reasoning Type, select and output only the single most perfect QA pair that passes the [Validation Criteria]. (Total 10 pairs) + 1. **Candidate Generation & Filtering**: Internally generate 9 diverse QA pairs first. Then, for each Reasoning Type, select and output only the single most perfect QA pair that passes the [Validation Criteria]. (Total 9 pairs) 2. **Domain Suitability**: Questions must maintain a financial professional tone and accurately handle stock prices, financial statements (Assets, Liabilities, Equity), investment metrics (PER, PBR, ROE), interest rates, and exchange rates. 3. **Strict Constraints**: - Answers must be derived ONLY from the table (and provided context). No external knowledge. @@ -14,6 +14,7 @@ generate_qa: | - Output format must strictly follow JSON. - Questions and Answers MUST be written in Korean. - reasoning_annotation MUST be written in English and MUST be a single string (not a list). + - **DO NOT use real company names** (e.g., Samsung, Apple, Google, 현대, SK). Use fictional names like "A사", "B기업", "가나다 주식회사". ### [Validation Criteria] - Is the answer uniquely determined within the table? @@ -21,7 +22,7 @@ generate_qa: | - Is the question clear and unambiguous? (e.g., "Most undervalued stock" -> "Stock with lowest PER") ### [Reasoning Type Definitions (Finance Domain)] - (1) lookup: Retrieve specific stock price or financial figures without condition/calculation. 
(e.g., "What is Samsung Electronics' 2023 dividend?") + (1) lookup: Retrieve specific stock price or financial figures without condition/calculation. (e.g., "What is Company A's 2023 dividend?") (2) filter: Select rows/columns meeting specific conditions (Market Cap cap, specific sector). (e.g., "List all companies with Debt Ratio under 100%.") (3) aggregate: Statistical aggregation of portfolio or time-series data using Sum/Avg etc. (e.g., "What is the total valuation of held stocks?") (4) compare: Compare financial health between companies or investment metrics. (e.g., "Which company has a higher ROE, Company A or B?") @@ -30,7 +31,6 @@ generate_qa: | (7) multi_hop: Multi-step inference finding a value first, then using it as a key. (e.g., "Who is the largest shareholder of the company with #1 Market Cap?") (8) implicit_reference: Referring to specific metrics contextually without explicit column name. (e.g., "Which stock has strong dividend tendency?" -> implies Dividend Yield column) (9) ellipsis: Recovering omitted comparisons or criteria from table structure. (e.g., "What is Net Income?" -> implies continuation from Operating Profit column context) - (10) long_sequence (Context-Dependent): Requires interpreting 'Market Outlook' or 'Investment Strategy' text (Context) to filter table data. **Requirement**: Must generate a hypothetical [Context] paragraph needed to solve the question. ### [Output Format (JSON)] {{ @@ -42,7 +42,7 @@ generate_qa: | "reasoning_annotation": "Step-by-step logic to derive answer (MUST be a string, not a list)", "context": null }}, - ... (One per Reasoning Type => Total 10) + ... (One per Reasoning Type => Total 9) ] }} @@ -51,7 +51,7 @@ generate_qa_from_image: | Your mission is to analyze the provided financial table image and generate Question-Answer (QA) pairs that fit the specified Reasoning Type definitions. ### [Instructions] - 1. **Candidate Generation & Filtering**: Internally generate 10 diverse QA pairs first. 
Then, for each Reasoning Type, select and output only the single most perfect QA pair that passes the [Validation Criteria]. (Total 10 pairs) + 1. **Candidate Generation & Filtering**: Internally generate 9 diverse QA pairs first. Then, for each Reasoning Type, select and output only the single most perfect QA pair that passes the [Validation Criteria]. (Total 9 pairs) 2. **Domain Suitability**: Questions must maintain a financial professional tone and accurately handle stock prices, financial statements (Assets, Liabilities, Equity), investment metrics (PER, PBR, ROE), interest rates, and exchange rates. 3. **Strict Constraints**: - Answers must be derived ONLY from the table (and provided context). No external knowledge. @@ -59,6 +59,7 @@ generate_qa_from_image: | - Output format must strictly follow JSON. - Questions and Answers MUST be written in Korean. - reasoning_annotation MUST be written in English and MUST be a single string (not a list). + - **DO NOT use real company names** (e.g., Samsung, Apple, Google, 현대, SK). Use fictional names like "A사", "B기업", "가나다 주식회사". ### [Validation Criteria] - Is the answer uniquely determined within the table? @@ -66,7 +67,7 @@ generate_qa_from_image: | - Is the question clear and unambiguous? (e.g., "Most undervalued stock" -> "Stock with lowest PER") ### [Reasoning Type Definitions (Finance Domain)] - (1) lookup: Retrieve specific stock price or financial figures without condition/calculation. (e.g., "What is Samsung Electronics' 2023 dividend?") + (1) lookup: Retrieve specific stock price or financial figures without condition/calculation. (e.g., "What is Company A's 2023 dividend?") (2) filter: Select rows/columns meeting specific conditions (Market Cap cap, specific sector). (e.g., "List all companies with Debt Ratio under 100%.") (3) aggregate: Statistical aggregation of portfolio or time-series data using Sum/Avg etc. 
(e.g., "What is the total valuation of held stocks?") (4) compare: Compare financial health between companies or investment metrics. (e.g., "Which company has a higher ROE, Company A or B?") @@ -75,7 +76,6 @@ generate_qa_from_image: | (7) multi_hop: Multi-step inference finding a value first, then using it as a key. (e.g., "Who is the largest shareholder of the company with #1 Market Cap?") (8) implicit_reference: Referring to specific metrics contextually without explicit column name. (e.g., "Which stock has strong dividend tendency?" -> implies Dividend Yield column) (9) ellipsis: Recovering omitted comparisons or criteria from table structure. (e.g., "What is Net Income?" -> implies continuation from Operating Profit column context) - (10) long_sequence (Context-Dependent): Requires interpreting 'Market Outlook' or 'Investment Strategy' text (Context) to filter table data. **Requirement**: Must generate a hypothetical [Context] paragraph needed to solve the question. ### [Output Format (JSON)] {{ @@ -87,7 +87,59 @@ generate_qa_from_image: | "reasoning_annotation": "Step-by-step logic to derive answer (MUST be a string, not a list)", "context": null }}, - ... (One per Reasoning Type => Total 10) + ... (One per Reasoning Type => Total 9) + ] + }} + Return ONLY the JSON object. + +generate_qa_from_multi_image: | + You are an 'AI Data Researcher' specialized in building high-quality QA datasets that require understanding MULTIPLE financial table images together. + Your mission is to analyze ALL provided financial table images and generate Question-Answer (QA) pairs that REQUIRE information from MULTIPLE images to answer. + + **⚠️ CRITICAL REQUIREMENT: CROSS-IMAGE REASONING ⚠️** + - Every QA pair MUST require information from AT LEAST TWO images to answer correctly. + - Questions answerable from a single image are INVALID. + - Focus on comparisons, aggregations, or inferences that span multiple financial statements or reports. + + ### [Instructions] + 1. 
**Analyze All Images**: First, understand what data each image contains and how they relate (e.g., different fiscal periods, income statement vs balance sheet, different securities). + 2. **Generate Cross-Image QA**: Create 9 diverse QA pairs where each question requires synthesizing information from multiple images. + 3. **Strict Constraints**: + - Answers must be derived from combining data across images. No external knowledge. + - Each QA pair must correspond to exactly one Reasoning Type. + - Output format must strictly follow JSON. + - Questions and Answers MUST be written in Korean. + - reasoning_annotation MUST be written in English, MUST be a single string, and MUST specify which images were used. + - **DO NOT use real company/fund names** (e.g., Samsung, Apple, Vanguard). Use fictional names like "A사", "B펀드", "가나다증권". + + ### [Validation Criteria] + - Does the answer REQUIRE data from multiple images? (Single-image answers are INVALID) + - Is the reasoning process logically flawless? + - Is the question clear about what is being compared or combined? + + ### [Cross-Image Reasoning Type Definitions (Finance Domain)] + (1) cross_lookup: Retrieve and combine specific financial values from different statements. (e.g., "What are the combined total assets across the Q1 and Q2 balance sheets?") + (2) cross_filter: Filter entries across financial statements based on conditions. (e.g., "Which accounts show positive growth in both the income statement and cash flow statement?") + (3) cross_aggregate: Aggregate financial data spanning multiple periods or statements. (e.g., "What is the total revenue across all quarterly reports shown?") + (4) cross_compare: Compare financial ratios or metrics between different periods or portfolios. (e.g., "Did the debt-to-equity ratio improve from Table 1 to Table 2?") + (5) cross_arithmetic: Calculate financial metrics using data from multiple statements. 
(e.g., "Calculate the year-over-year revenue growth using data from both annual reports.") + (6) cross_temporal: Identify financial trends by combining multiple period data. (e.g., "Based on all quarterly statements, what is the profit margin trend?") + (7) cross_multi_hop: Multi-step financial inference across statements. (e.g., "Find the highest dividend stock in Image 1, then find its P/E ratio in Image 2.") + (8) cross_implicit: Answer questions requiring understanding relationships between financial statements. (e.g., "Which company is more leveraged?" requires comparing debt from multiple sources) + (9) cross_synthesis: Synthesize financial insights only possible by viewing all statements together. (e.g., "Based on both income and cash flow statements, which segments are cash-generative?") + + ### [Output Format (JSON)] + {{ + "qa_pairs": [ + {{ + "question": "Question requiring multiple financial images to answer", + "answer": "Answer derived from multiple images", + "type": "cross_lookup", + "reasoning_annotation": "Step 1: From Image 1, extract X. Step 2: From Image 2, extract Y. Step 3: Combine to get answer.", + "context": null, + "images_used": ["image_1", "image_2"] + }}, + ... (One per Reasoning Type => Total 9) ] }} Return ONLY the JSON object. @@ -111,7 +163,8 @@ generate_synthetic_table: | 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:** - **ALL data cell values MUST be replaced with completely new synthetic values.** - **NEVER copy any original data values** - generate fresh, realistic alternatives. - - For company names: Generate DIFFERENT names (e.g., "A회사" → "B회사") + - **NEVER use real company/brand names** (Samsung, Apple, Google, 현대, SK, LG, etc.). Use fictional names like "A사", "가나다 기업", "XYZ Corp". 
+ - For company names: Generate DIFFERENT fictional names (e.g., "A회사" → "B회사") - For financial figures: Generate DIFFERENT amounts (similar magnitude, different values) - For percentages/ratios: Generate DIFFERENT metrics - For dates: Generate DIFFERENT plausible dates @@ -150,7 +203,8 @@ generate_synthetic_table_from_image: | 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:** - **NEVER copy the data values from the image** - this is NOT an OCR task - **ALL cell content must be completely NEW and DIFFERENT** - - For company names: Generate DIFFERENT names + - **NEVER use real company/brand names** (Samsung, Apple, Google, 현대, SK, LG, etc.). Use fictional names like "A사", "가나다 기업", "XYZ Corp". + - For company names: Generate DIFFERENT fictional names - For financial figures: Generate DIFFERENT amounts - For percentages/ratios: Generate DIFFERENT metrics 4. **Styling:** Use **Tailwind CSS** classes exclusively (NO inline styles). @@ -169,3 +223,40 @@ generate_synthetic_table_from_image: | - Amount in image: "50억" → Generate: "80억" ⚠️ If the generated content is identical or very similar to the image, the output is INVALID. + +generate_long_sequence: | + You are an 'AI Data Researcher' specialized in creating context-dependent QA pairs for financial tables. + Your mission is to generate a single high-quality "long_sequence" type QA pair that requires interpreting external context to answer questions about the table. + + **Input Table:** + {synthetic_html} + + ### [Instructions] + 1. **Generate ONE long_sequence QA pair** that requires reading and understanding a context paragraph to filter or interpret the table data. + 2. **Create a realistic financial context** (e.g., "Market Outlook", "Investment Strategy", "Risk Guidelines") that provides information needed to answer the question. + 3. **The question must be unanswerable without the context** - the context should contain key criteria or conditions. + 4. 
**Strict Constraints**: + - Answer must be derived from BOTH the table AND the context. Neither alone is sufficient. + - Questions and Answers MUST be written in Korean. + - reasoning_annotation MUST be written in English and MUST be a single string. + - Context must be written in Korean and be 2-4 sentences long. + - **DO NOT use real company names** (e.g., Samsung, Apple, Google). Use fictional names. + + ### [Example Scenarios (Finance)] + - Context describes investment criteria (PER < 15, ROE > 10%) → Question asks which stocks qualify + - Context outlines risk tolerance levels → Question asks which portfolio allocation is appropriate + - Context specifies sector preferences → Question asks which companies match the strategy + + ### [Output Format (JSON)] + {{ + "qa_pairs": [ + {{ + "question": "Question requiring context to answer", + "answer": "Answer derived from table + context", + "type": "long_sequence", + "reasoning_annotation": "Step 1: Extract key criteria from context. Step 2: Apply criteria to table. Step 3: Derive answer.", + "context": "투자 전략에 따르면... (2-4 sentences of financial context in Korean)" + }} + ] + }} + Return ONLY the JSON object. diff --git a/generate_synthetic_table/prompts/medical.yaml b/generate_synthetic_table/prompts/medical.yaml index 7bf995c..eb16745 100644 --- a/generate_synthetic_table/prompts/medical.yaml +++ b/generate_synthetic_table/prompts/medical.yaml @@ -6,7 +6,7 @@ generate_qa: | {synthetic_html} ### [Instructions] - 1. **Candidate Generation & Filtering**: Internally generate 10 diverse QA pairs first. Then, for each Reasoning Type, select and output only the single most perfect QA pair that passes the [Validation Criteria]. (Total 10 pairs) + 1. **Candidate Generation & Filtering**: Internally generate 9 diverse QA pairs first. Then, for each Reasoning Type, select and output only the single most perfect QA pair that passes the [Validation Criteria]. (Total 9 pairs) 2. 
**Domain Suitability**: Questions must maintain medical professionalism and accurately handle patient vital signs, diagnosis names, medication dosages, lab values, and prognosis. 3. **Strict Constraints**: - Answers must be derived ONLY from the table (and provided context). No external knowledge. @@ -15,6 +15,7 @@ generate_qa: | - **Privacy**: Assume patient names/IDs are pseudonymized synthetic data. - Questions and Answers MUST be written in Korean. - reasoning_annotation MUST be written in English and MUST be a single string (not a list). + - **DO NOT use real hospital/drug/institution names**. Use fictional names like "A병원", "약물-X", "환자ID-001". ### [Validation Criteria] - Is the answer uniquely determined within the table? @@ -22,7 +23,7 @@ generate_qa: | - Is the question clear and unambiguous? (e.g., "Patient in bad condition" -> "Patient with systolic BP under 90mmHg") ### [Reasoning Type Definitions (Medical Domain)] - (1) lookup: Retrieve specific patient lab results or medication info without condition/calculation. (e.g., "What is the blood glucose level of Patient ID-101?") + (1) lookup: Retrieve specific patient lab results or medication info without condition/calculation. (e.g., "What is the blood glucose level of Patient ID-001?") (2) filter: Select rows/columns meeting specific conditions (abnormal range, specific disease). (e.g., "List all patients with temperature above 38°C.") (3) aggregate: Statistical aggregation of patient group data (Mean LOS, Prevalence). (e.g., "What is the average age of patients in Ward A?") (4) compare: Compare efficacy between treatment groups or patient status pre/post. (e.g., "Is cholesterol level lower post-medication than pre-medication?") @@ -31,7 +32,6 @@ generate_qa: | (7) multi_hop: Multi-step inference finding a value first, then using it as a key. 
(e.g., "Who is the attending physician of the patient prescribed the highest dosage?") (8) implicit_reference: Referring to specific metrics contextually without explicit column name. (e.g., "Which group is at risk of hypertension?" -> implies Systolic/Diastolic BP columns) (9) ellipsis: Recovering omitted comparisons or criteria from table structure. (e.g., "What is the 2nd test result?" -> implies column next to 1st test) - (10) long_sequence (Context-Dependent): Requires interpreting 'Clinical Protocol' or 'Exclusion Criteria' text (Context) to filter table data. **Requirement**: Must generate a hypothetical [Context] paragraph needed to solve the question. ### [Output Format (JSON)] {{ @@ -43,7 +43,7 @@ generate_qa: | "reasoning_annotation": "Step-by-step logic to derive answer (MUST be a string, not a list)", "context": null }}, - ... (One per Reasoning Type => Total 10) + ... (One per Reasoning Type => Total 9) ] }} @@ -52,7 +52,7 @@ generate_qa_from_image: | Your mission is to analyze the provided medical table image and generate Question-Answer (QA) pairs that fit the specified Reasoning Type definitions. ### [Instructions] - 1. **Candidate Generation & Filtering**: Internally generate 10 diverse QA pairs first. Then, for each Reasoning Type, select and output only the single most perfect QA pair that passes the [Validation Criteria]. (Total 10 pairs) + 1. **Candidate Generation & Filtering**: Internally generate 9 diverse QA pairs first. Then, for each Reasoning Type, select and output only the single most perfect QA pair that passes the [Validation Criteria]. (Total 9 pairs) 2. **Domain Suitability**: Questions must maintain medical professionalism and accurately handle patient vital signs, diagnosis names, medication dosages, lab values, and prognosis. 3. **Strict Constraints**: - Answers must be derived ONLY from the table (and provided context). No external knowledge. 
@@ -60,6 +60,7 @@ generate_qa_from_image: | - Output format must strictly follow JSON. - Questions and Answers MUST be written in Korean. - reasoning_annotation MUST be written in English. + - **DO NOT use real hospital/drug/institution names**. Use fictional names like "A병원", "약물-X", "환자ID-001". ### [Validation Criteria] - Is the answer uniquely determined within the table? @@ -67,7 +68,7 @@ generate_qa_from_image: | - Is the question clear and unambiguous? (e.g., "Patient in bad condition" -> "Patient with systolic BP under 90mmHg") ### [Reasoning Type Definitions (Medical Domain)] - (1) lookup: Retrieve specific patient lab results or medication info without condition/calculation. (e.g., "What is the blood glucose level of Patient ID-101?") + (1) lookup: Retrieve specific patient lab results or medication info without condition/calculation. (e.g., "What is the blood glucose level of Patient ID-001?") (2) filter: Select rows/columns meeting specific conditions (abnormal range, specific disease). (e.g., "List all patients with temperature above 38°C.") (3) aggregate: Statistical aggregation of patient group data (Mean LOS, Prevalence). (e.g., "What is the average age of patients in Ward A?") (4) compare: Compare efficacy between treatment groups or patient status pre/post. (e.g., "Is cholesterol level lower post-medication than pre-medication?") @@ -76,7 +77,6 @@ generate_qa_from_image: | (7) multi_hop: Multi-step inference finding a value first, then using it as a key. (e.g., "Who is the attending physician of the patient prescribed the highest dosage?") (8) implicit_reference: Referring to specific metrics contextually without explicit column name. (e.g., "Which group is at risk of hypertension?" -> implies Systolic/Diastolic BP columns) (9) ellipsis: Recovering omitted comparisons or criteria from table structure. (e.g., "What is the 2nd test result?" 
-> implies column next to 1st test) - (10) long_sequence (Context-Dependent): Requires interpreting 'Clinical Protocol' or 'Exclusion Criteria' text (Context) to filter table data. **Requirement**: Must generate a hypothetical [Context] paragraph needed to solve the question. ### [Output Format (JSON)] {{ @@ -88,7 +88,59 @@ generate_qa_from_image: | "reasoning_annotation": "Step-by-step logic to derive answer", "context": null }}, - ... (One per Reasoning Type => Total 10) + ... (One per Reasoning Type => Total 9) + ] + }} + Return ONLY the JSON object. + +generate_qa_from_multi_image: | + You are an 'AI Data Researcher' specialized in building high-quality QA datasets that require understanding MULTIPLE medical/clinical table images together. + Your mission is to analyze ALL provided medical table images and generate Question-Answer (QA) pairs that REQUIRE information from MULTIPLE images to answer. + + **⚠️ CRITICAL REQUIREMENT: CROSS-IMAGE REASONING ⚠️** + - Every QA pair MUST require information from AT LEAST TWO images to answer correctly. + - Questions answerable from a single image are INVALID. + - Focus on comparisons, aggregations, or inferences that span multiple clinical records, lab results, or patient cohorts. + + ### [Instructions] + 1. **Analyze All Images**: First, understand what data each image contains and how they relate (e.g., different time points, different patient groups, lab results vs vital signs). + 2. **Generate Cross-Image QA**: Create 9 diverse QA pairs where each question requires synthesizing information from multiple images. + 3. **Strict Constraints**: + - Answers must be derived from combining data across images. No external knowledge. + - Each QA pair must correspond to exactly one Reasoning Type. + - Output format must strictly follow JSON. + - Questions and Answers MUST be written in Korean. + - reasoning_annotation MUST be written in English, MUST be a single string, and MUST specify which images were used. 
+ - **DO NOT use real hospital/drug/institution names**. Use fictional names like "A병원", "약물-X", "환자ID-001". + + ### [Validation Criteria] + - Does the answer REQUIRE data from multiple images? (Single-image answers are INVALID) + - Is the reasoning process logically flawless? + - Is the question clear about what clinical data is being compared or combined? + + ### [Cross-Image Reasoning Type Definitions (Medical Domain)] + (1) cross_lookup: Retrieve and combine patient data from different clinical records. (e.g., "What is the patient's blood glucose level before and after treatment from both tables?") + (2) cross_filter: Filter patients across clinical datasets based on conditions. (e.g., "Which patients appear in both the treatment and follow-up tables with improved vitals?") + (3) cross_aggregate: Aggregate clinical data spanning multiple patient cohorts. (e.g., "What is the average age of patients across both study groups?") + (4) cross_compare: Compare clinical outcomes between different time points or treatment groups. (e.g., "Did the treatment group in Table 2 show better outcomes than the control in Table 1?") + (5) cross_arithmetic: Calculate clinical metrics using data from multiple records. (e.g., "What is the change in BMI from the baseline table to the 6-month follow-up table?") + (6) cross_temporal: Identify patient progression by combining multiple visit records. (e.g., "Based on both admission and discharge tables, which patients showed improvement?") + (7) cross_multi_hop: Multi-step clinical inference across records. (e.g., "Find the patient with highest creatinine in Image 1, then find their blood pressure in Image 2.") + (8) cross_implicit: Answer questions requiring implicit understanding of clinical relationships. (e.g., "Which patients are at higher risk?" requires combining data from multiple clinical assessments) + (9) cross_synthesis: Synthesize clinical insights only possible by viewing all records together.
(e.g., "Based on both lab results and medication tables, which patients may need dose adjustment?") + + ### [Output Format (JSON)] + {{ + "qa_pairs": [ + {{ + "question": "Question requiring multiple clinical images to answer", + "answer": "Answer derived from multiple images", + "type": "cross_lookup", + "reasoning_annotation": "Step 1: From Image 1, extract X. Step 2: From Image 2, extract Y. Step 3: Combine to get answer.", + "context": null, + "images_used": ["image_1", "image_2"] + }}, + ... (One per Reasoning Type => Total 9) ] }} Return ONLY the JSON object. @@ -112,9 +164,10 @@ generate_synthetic_table: | 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:** - **ALL data cell values MUST be replaced with completely new synthetic values.** - **NEVER copy any original data values** - generate fresh, realistic alternatives. + - **NEVER use real hospital/drug/institution names**. Use fictional names like "A병원", "약물-X", "제약사-가". - For patient names/IDs: Generate DIFFERENT pseudonymized identifiers - For lab values: Generate DIFFERENT realistic values - - For diagnoses/medications: Generate DIFFERENT names and codes + - For diagnoses/medications: Generate DIFFERENT fictional names and codes - For dates: Generate DIFFERENT plausible dates 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:** - Look at the original image's color scheme and design @@ -151,9 +204,10 @@ generate_synthetic_table_from_image: | 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:** - **NEVER copy the data values from the image** - this is NOT an OCR task - **ALL cell content must be completely NEW and DIFFERENT** + - **NEVER use real hospital/drug/institution names**. Use fictional names like "A병원", "약물-X", "제약사-가". 
- For patient names/IDs: Generate DIFFERENT pseudonymized identifiers - For lab values: Generate DIFFERENT realistic values - - For diagnoses/medications: Generate DIFFERENT names + - For diagnoses/medications: Generate DIFFERENT fictional names 4. **Styling:** Use **Tailwind CSS** classes exclusively (NO inline styles). - ``: `class="w-full border-collapse text-sm"` - ``: `class="bg-gradient-to-r from-teal-700 to-teal-800 text-white"` @@ -170,3 +224,40 @@ generate_synthetic_table_from_image: | - Value in image: "수치A" → Generate: "수치B" ⚠️ If the generated content is identical or very similar to the image, the output is INVALID. + +generate_long_sequence: | + You are an 'AI Data Researcher' specialized in creating context-dependent QA pairs for medical/clinical tables. + Your mission is to generate a single high-quality "long_sequence" type QA pair that requires interpreting external context to answer questions about the table. + + **Input Table:** + {synthetic_html} + + ### [Instructions] + 1. **Generate ONE long_sequence QA pair** that requires reading and understanding a context paragraph to filter or interpret the table data. + 2. **Create a realistic clinical context** (e.g., "Clinical Protocol", "Exclusion Criteria", "Treatment Guidelines") that provides information needed to answer the question. + 3. **The question must be unanswerable without the context** - the context should contain key criteria or conditions. + 4. **Strict Constraints**: + - Answer must be derived from BOTH the table AND the context. Neither alone is sufficient. + - Questions and Answers MUST be written in Korean. + - reasoning_annotation MUST be written in English and MUST be a single string. + - Context must be written in Korean and be 2-4 sentences long. + - **DO NOT use real hospital/drug names**. Use fictional names like "A병원", "약물-X". 
+ + ### [Example Scenarios (Medical)] + - Context describes patient exclusion criteria (age, comorbidities) → Question asks which patients are eligible + - Context outlines dosage adjustment rules → Question asks which patients need dose modification + - Context specifies lab value thresholds for intervention → Question asks which patients require treatment + + ### [Output Format (JSON)] + {{ + "qa_pairs": [ + {{ + "question": "Question requiring context to answer", + "answer": "Answer derived from table + context", + "type": "long_sequence", + "reasoning_annotation": "Step 1: Extract key criteria from context. Step 2: Apply criteria to table. Step 3: Derive answer.", + "context": "임상 프로토콜에 따르면... (2-4 sentences of clinical context in Korean)" + }} + ] + }} + Return ONLY the JSON object. diff --git a/generate_synthetic_table/prompts/public.yaml b/generate_synthetic_table/prompts/public.yaml index b0c4099..4fc21e2 100644 --- a/generate_synthetic_table/prompts/public.yaml +++ b/generate_synthetic_table/prompts/public.yaml @@ -6,7 +6,7 @@ generate_qa: | {synthetic_html} ### [Instructions] - 1. **Candidate Generation & Filtering**: Internally generate 10 diverse QA pairs first. Then, for each Reasoning Type, select and output only the single most perfect QA pair that passes the [Validation Criteria]. (Total 10 pairs) + 1. **Candidate Generation & Filtering**: Internally generate 9 diverse QA pairs first. Then, for each Reasoning Type, select and output only the single most perfect QA pair that passes the [Validation Criteria]. (Total 9 pairs) 2. **Domain Suitability**: Questions must maintain a public/objective tone and accurately handle demographics, budgets, administrative region names, policy beneficiaries, and annual indicators. 3. **Strict Constraints**: - Answers must be derived ONLY from the table (and provided context). No external knowledge. @@ -14,6 +14,7 @@ generate_qa: | - Output format must strictly follow JSON. 
- Questions and Answers MUST be written in Korean. - reasoning_annotation MUST be written in English and MUST be a single string (not a list). + - **DO NOT use real place names** (e.g., Seoul, Busan, Gyeonggi). Use fictional names like "A시", "나구", "다군". ### [Validation Criteria] - Is the answer uniquely determined within the table? @@ -21,16 +22,15 @@ generate_qa: | - Is the question clear and unambiguous? (e.g., "Place with most people" -> "District with highest Population") ### [Reasoning Type Definitions (Public Domain)] - (1) lookup: Retrieve specific regional or annual statistics without condition/calculation. (e.g., "What is the total population of Seoul in 2023?") + (1) lookup: Retrieve specific regional or annual statistics without condition/calculation. (e.g., "What is the total population of Region A in 2023?") (2) filter: Select rows/columns meeting specific conditions (above/below value, specific region). (e.g., "List all departments with budget execution rate over 90%.") - (3) aggregate: Statistical aggregation of regional/annual data (Sum, Avg). (e.g., "What is the average number of births in the 17 provinces?") - (4) compare: Compare regional gaps or annual trends. (e.g., "Which has higher incoming migration, Gyeonggi or Seoul?") + (3) aggregate: Statistical aggregation of regional/annual data (Sum, Avg). (e.g., "What is the average number of births in all provinces?") + (4) compare: Compare regional gaps or annual trends. (e.g., "Which has higher incoming migration, Region A or Region B?") (5) arithmetic: Specific calculation beyond simple comparison (population density, YoY growth). (e.g., "What is the population density of District A?") (6) temporal: Deduce policy effects or long-term statistical trends. (e.g., "Which region shows a decreasing crime rate trend over the last 5 years?") (7) multi_hop: Multi-step inference finding a value first, then using it as a key. 
(e.g., "Who is the Mayor of the city with the highest financial independence?") (8) implicit_reference: Referring to specific metrics contextually without explicit column name. (e.g., "Which is the most aged city?" -> implies 65+ Population Ratio column) (9) ellipsis: Recovering omitted comparisons or criteria from table structure. (e.g., "What is next year's target?" -> implies column next to this year's value) - (10) long_sequence (Context-Dependent): Requires interpreting 'Policy Guidelines' or 'Legal Requirements' text (Context) to filter table data. **Requirement**: Must generate a hypothetical [Context] paragraph needed to solve the question. ### [Output Format (JSON)] {{ @@ -42,7 +42,7 @@ generate_qa: | "reasoning_annotation": "Step-by-step logic to derive answer (MUST be a string, not a list)", "context": null }}, - ... (One per Reasoning Type => Total 10) + ... (One per Reasoning Type => Total 9) ] }} @@ -51,7 +51,7 @@ generate_qa_from_image: | Your mission is to analyze the provided public/administrative table image and generate Question-Answer (QA) pairs that fit the specified Reasoning Type definitions. ### [Instructions] - 1. **Candidate Generation & Filtering**: Internally generate 10 diverse QA pairs first. Then, for each Reasoning Type, select and output only the single most perfect QA pair that passes the [Validation Criteria]. (Total 10 pairs) + 1. **Candidate Generation & Filtering**: Internally generate 9 diverse QA pairs first. Then, for each Reasoning Type, select and output only the single most perfect QA pair that passes the [Validation Criteria]. (Total 9 pairs) 2. **Domain Suitability**: Questions must maintain a public/objective tone and accurately handle demographics, budgets, administrative region names, policy beneficiaries, and annual indicators. 3. **Strict Constraints**: - Answers must be derived ONLY from the table (and provided context). No external knowledge. 
@@ -59,6 +59,7 @@ generate_qa_from_image: | - Output format must strictly follow JSON. - Questions and Answers MUST be written in Korean. - reasoning_annotation MUST be written in English. + - **DO NOT use real place names** (e.g., Seoul, Busan, Gyeonggi). Use fictional names like "A시", "나구", "다군". ### [Validation Criteria] - Is the answer uniquely determined within the table? @@ -66,16 +67,15 @@ generate_qa_from_image: | - Is the question clear and unambiguous? (e.g., "Place with most people" -> "District with highest Population") ### [Reasoning Type Definitions (Public Domain)] - (1) lookup: Retrieve specific regional or annual statistics without condition/calculation. (e.g., "What is the total population of Seoul in 2023?") + (1) lookup: Retrieve specific regional or annual statistics without condition/calculation. (e.g., "What is the total population of Region A in 2023?") (2) filter: Select rows/columns meeting specific conditions (above/below value, specific region). (e.g., "List all departments with budget execution rate over 90%.") - (3) aggregate: Statistical aggregation of regional/annual data (Sum, Avg). (e.g., "What is the average number of births in the 17 provinces?") - (4) compare: Compare regional gaps or annual trends. (e.g., "Which has higher incoming migration, Gyeonggi or Seoul?") + (3) aggregate: Statistical aggregation of regional/annual data (Sum, Avg). (e.g., "What is the average number of births in all provinces?") + (4) compare: Compare regional gaps or annual trends. (e.g., "Which has higher incoming migration, Region A or Region B?") (5) arithmetic: Specific calculation beyond simple comparison (population density, YoY growth). (e.g., "What is the population density of District A?") (6) temporal: Deduce policy effects or long-term statistical trends. (e.g., "Which region shows a decreasing crime rate trend over the last 5 years?") (7) multi_hop: Multi-step inference finding a value first, then using it as a key. 
(e.g., "Who is the Mayor of the city with the highest financial independence?") (8) implicit_reference: Referring to specific metrics contextually without explicit column name. (e.g., "Which is the most aged city?" -> implies 65+ Population Ratio column) (9) ellipsis: Recovering omitted comparisons or criteria from table structure. (e.g., "What is next year's target?" -> implies column next to this year's value) - (10) long_sequence (Context-Dependent): Requires interpreting 'Policy Guidelines' or 'Legal Requirements' text (Context) to filter table data. **Requirement**: Must generate a hypothetical [Context] paragraph needed to solve the question. ### [Output Format (JSON)] {{ @@ -87,7 +87,59 @@ generate_qa_from_image: | "reasoning_annotation": "Step-by-step logic to derive answer", "context": null }}, - ... (One per Reasoning Type => Total 10) + ... (One per Reasoning Type => Total 9) + ] + }} + Return ONLY the JSON object. + +generate_qa_from_multi_image: | + You are an 'AI Data Researcher' specialized in building high-quality QA datasets that require understanding MULTIPLE public/government table images together. + Your mission is to analyze ALL provided public data table images and generate Question-Answer (QA) pairs that REQUIRE information from MULTIPLE images to answer. + + **⚠️ CRITICAL REQUIREMENT: CROSS-IMAGE REASONING ⚠️** + - Every QA pair MUST require information from AT LEAST TWO images to answer correctly. + - Questions answerable from a single image are INVALID. + - Focus on comparisons, aggregations, or inferences that span multiple regional statistics, budget tables, or policy data. + + ### [Instructions] + 1. **Analyze All Images**: First, understand what data each image contains and how they relate (e.g., different fiscal years, different regions, budget vs expenditure). + 2. **Generate Cross-Image QA**: Create 9 diverse QA pairs where each question requires synthesizing information from multiple images. + 3. 
**Strict Constraints**: + - Answers must be derived from combining data across images. No external knowledge. + - Each QA pair must correspond to exactly one Reasoning Type. + - Output format must strictly follow JSON. + - Questions and Answers MUST be written in Korean. + - reasoning_annotation MUST be written in English, MUST be a single string, and MUST specify which images were used. + - **DO NOT use real place names** (e.g., Seoul, Busan, Gyeonggi). Use fictional names like "A시", "나구", "다군". + + ### [Validation Criteria] + - Does the answer REQUIRE data from multiple images? (Single-image answers are INVALID) + - Is the reasoning process logically flawless? + - Is the question clear about what public data is being compared or combined? + + ### [Cross-Image Reasoning Type Definitions (Public Domain)] + (1) cross_lookup: Retrieve and combine regional statistics from different data tables. (e.g., "What is the total population of A시 combining both census tables?") + (2) cross_filter: Filter regions across datasets based on conditions. (e.g., "Which districts appear in both tables with budget execution rate >80%?") + (3) cross_aggregate: Aggregate public data spanning multiple regions or years. (e.g., "What is the total government expenditure across all departments in both fiscal year reports?") + (4) cross_compare: Compare regional performance between different periods or metrics. (e.g., "Did the unemployment rate improve from the 2022 table to the 2023 table for Region A?") + (5) cross_arithmetic: Calculate public metrics using data from multiple sources. (e.g., "What is the year-over-year population growth rate using data from both census tables?") + (6) cross_temporal: Identify policy trends by combining multiple year data. (e.g., "Based on both annual reports, which region shows consistent improvement in education metrics?") + (7) cross_multi_hop: Multi-step inference across public data tables. 
(e.g., "Find the region with highest tax revenue in Image 1, then find its population density in Image 2.") + (8) cross_implicit: Answer questions requiring implicit understanding of relationships between datasets. (e.g., "Which region is most fiscally efficient?" requires combining budget and outcome data from multiple sources) + (9) cross_synthesis: Synthesize policy insights only possible by viewing all tables together. (e.g., "Based on both the budget allocation and service satisfaction tables, which programs are underperforming?") + + ### [Output Format (JSON)] + {{ + "qa_pairs": [ + {{ + "question": "Question requiring multiple public data images to answer", + "answer": "Answer derived from multiple images", + "type": "cross_lookup", + "reasoning_annotation": "Step 1: From Image 1, extract X. Step 2: From Image 2, extract Y. Step 3: Combine to get answer.", + "context": null, + "images_used": ["image_1", "image_2"] + }}, + ... (One per Reasoning Type => Total 9) ] }} Return ONLY the JSON object. @@ -111,7 +163,8 @@ generate_synthetic_table: | 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:** - **ALL data cell values MUST be replaced with completely new synthetic values.** - **NEVER copy any original data values** - generate fresh, realistic alternatives. - - For regions: Generate DIFFERENT administrative region names + - **NEVER use real place names** (Seoul, Busan, Gyeonggi, etc.). Use fictional names like "A시", "나구", "다군", "라도". + - For regions: Generate DIFFERENT fictional administrative region names - For departments: Generate DIFFERENT department names - For statistics: Generate DIFFERENT numbers (similar magnitude) - For dates: Generate DIFFERENT plausible dates @@ -150,7 +203,8 @@ generate_synthetic_table_from_image: | 3. 
**⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:** - **NEVER copy the data values from the image** - this is NOT an OCR task - **ALL cell content must be completely NEW and DIFFERENT** - - For regions: Generate DIFFERENT administrative region names + - **NEVER use real place names** (Seoul, Busan, Gyeonggi, etc.). Use fictional names like "A시", "나구", "다군", "라도". + - For regions: Generate DIFFERENT fictional administrative region names - For statistics: Generate DIFFERENT numbers - For departments: Generate DIFFERENT names 4. **Styling:** Use **Tailwind CSS** classes exclusively (NO inline styles). @@ -169,3 +223,40 @@ generate_synthetic_table_from_image: | - Statistic in image: "수치A" → Generate: "수치B" ⚠️ If the generated content is identical or very similar to the image, the output is INVALID. + +generate_long_sequence: | + You are an 'AI Data Researcher' specialized in creating context-dependent QA pairs for public/government data tables. + Your mission is to generate a single high-quality "long_sequence" type QA pair that requires interpreting external context to answer questions about the table. + + **Input Table:** + {synthetic_html} + + ### [Instructions] + 1. **Generate ONE long_sequence QA pair** that requires reading and understanding a context paragraph to filter or interpret the table data. + 2. **Create a realistic public policy context** (e.g., "Policy Guidelines", "Legal Requirements", "Budget Allocation Rules") that provides information needed to answer the question. + 3. **The question must be unanswerable without the context** - the context should contain key criteria or conditions. + 4. **Strict Constraints**: + - Answer must be derived from BOTH the table AND the context. Neither alone is sufficient. + - Questions and Answers MUST be written in Korean. + - reasoning_annotation MUST be written in English and MUST be a single string. + - Context must be written in Korean and be 2-4 sentences long. 
+ - **DO NOT use real place names** (e.g., Seoul, Busan, Gyeonggi). Use fictional names like "A시", "나구", "다군". + + ### [Example Scenarios (Public)] + - Context describes eligibility criteria for a subsidy → Question asks which regions qualify + - Context outlines budget allocation rules → Question asks which departments receive funding + - Context specifies demographic thresholds → Question asks which areas need intervention + + ### [Output Format (JSON)] + {{ + "qa_pairs": [ + {{ + "question": "Question requiring context to answer", + "answer": "Answer derived from table + context", + "type": "long_sequence", + "reasoning_annotation": "Step 1: Extract key criteria from context. Step 2: Apply criteria to table. Step 3: Derive answer.", + "context": "정책 지침에 따르면... (2-4 sentences of policy context in Korean)" + }} + ] + }} + Return ONLY the JSON object. diff --git a/multi_image_json_list/test_academic_input.json b/multi_image_json_list/test_academic_input.json new file mode 100755 index 0000000..478bf45 --- /dev/null +++ b/multi_image_json_list/test_academic_input.json @@ -0,0 +1,192 @@ +[ + { + "index": 0, + "pair_id": "A_origin_0_0", + "image_paths": [ + "data/Academic/Table/A_origin_0/A_table_0.png", + "data/Academic/Table/A_origin_0/A_table_1.png" + ], + "domain": "academic" + }, + { + "index": 1, + "pair_id": "A_origin_0_1", + "image_paths": [ + "data/Academic/Table/A_origin_0/A_table_2.png", + "data/Academic/Table/A_origin_0/A_table_3.png", + "data/Academic/Table/A_origin_0/A_table_4.png" + ], + "domain": "academic" + }, + { + "index": 2, + "pair_id": "A_origin_1_0", + "image_paths": [ + "data/Academic/Table/A_origin_1/A_table_5.png", + "data/Academic/Table/A_origin_1/A_table_6.png" + ], + "domain": "academic" + }, + { + "index": 3, + "pair_id": "A_origin_2_0", + "image_paths": [ + "data/Academic/Table/A_origin_2/A_table_8.png", + "data/Academic/Table/A_origin_2/A_table_9.png" + ], + "domain": "academic" + }, + { + "index": 4, + "pair_id": "A_origin_3_0", + "image_paths": [ +
"data/Academic/Table/A_origin_3/A_table_10.png", + "data/Academic/Table/A_origin_3/A_table_11.png" + ], + "domain": "academic" + }, + { + "index": 5, + "pair_id": "A_origin_4_0", + "image_paths": [ + "data/Academic/Table/A_origin_4/A_table_12.png", + "data/Academic/Table/A_origin_4/A_table_13.png" + ], + "domain": "academic" + }, + { + "index": 6, + "pair_id": "A_origin_5_0", + "image_paths": [ + "data/Academic/Table/A_origin_5/A_table_14.png", + "data/Academic/Table/A_origin_5/A_table_15.png", + "data/Academic/Table/A_origin_5/A_table_16.png" + ], + "domain": "academic" + }, + { + "index": 7, + "pair_id": "A_origin_6_0", + "image_paths": [ + "data/Academic/Table/A_origin_6/A_table_17.png", + "data/Academic/Table/A_origin_6/A_table_18.png", + "data/Academic/Table/A_origin_6/A_table_19.png" + ], + "domain": "academic" + }, + { + "index": 8, + "pair_id": "A_origin_7_0", + "image_paths": [ + "data/Academic/Table/A_origin_7/A_table_20.png", + "data/Academic/Table/A_origin_7/A_table_21.png" + ], + "domain": "academic" + }, + { + "index": 9, + "pair_id": "A_origin_8_0", + "image_paths": [ + "data/Academic/Table/A_origin_8/A_table_22.png", + "data/Academic/Table/A_origin_8/A_table_23.png", + "data/Academic/Table/A_origin_8/A_table_24.png" + ], + "domain": "academic" + }, + { + "index": 10, + "pair_id": "A_origin_9_0", + "image_paths": [ + "data/Academic/Table/A_origin_9/A_table_25.png", + "data/Academic/Table/A_origin_9/A_table_26.png", + "data/Academic/Table/A_origin_9/A_table_27.png" + ], + "domain": "academic" + }, + { + "index": 11, + "pair_id": "A_origin_10_0", + "image_paths": [ + "data/Academic/Table/A_origin_10/A_table_28.png", + "data/Academic/Table/A_origin_10/A_table_29.png" + ], + "domain": "academic" + }, + { + "index": 12, + "pair_id": "A_origin_11_0", + "image_paths": [ + "data/Academic/Table/A_origin_11/A_table_30.png", + "data/Academic/Table/A_origin_11/A_table_31.png" + ], + "domain": "academic" + }, + { + "index": 13, + "pair_id": "A_origin_12_0", + 
"image_paths": [ + "data/Academic/Table/A_origin_12/A_table_32.png", + "data/Academic/Table/A_origin_12/A_table_33.png", + "data/Academic/Table/A_origin_12/A_table_34.png" + ], + "domain": "academic" + }, + { + "index": 14, + "pair_id": "A_origin_14_0", + "image_paths": [ + "data/Academic/Table/A_origin_14/A_table_39.png", + "data/Academic/Table/A_origin_14/A_table_40.png" + ], + "domain": "academic" + }, + { + "index": 15, + "pair_id": "A_origin_18_0", + "image_paths": [ + "data/Academic/Table/A_origin_18/A_table_47.png", + "data/Academic/Table/A_origin_18/A_table_48.png", + "data/Academic/Table/A_origin_18/A_table_49.png" + ], + "domain": "academic" + }, + { + "index": 16, + "pair_id": "A_origin_26_0", + "image_paths": [ + "data/Academic/Table/A_origin_26/A_table_59_01.png", + "data/Academic/Table/A_origin_26/A_table_59_02.png", + "data/Academic/Table/A_origin_26/A_table_60.png", + "data/Academic/Table/A_origin_26/A_table_61.png" + ], + "domain": "academic" + }, + { + "index": 17, + "pair_id": "A_origin_28_0", + "image_paths": [ + "data/Academic/Table/A_origin_28/A_table_63_01.png", + "data/Academic/Table/A_origin_28/A_table_63_02.png", + "data/Academic/Table/A_origin_28/A_table_64.png" + ], + "domain": "academic" + }, + { + "index": 18, + "pair_id": "A_origin_36_0", + "image_paths": [ + "data/Academic/Table/A_origin_36/A_table_73.png", + "data/Academic/Table/A_origin_36/A_table_75.png" + ], + "domain": "academic" + }, + { + "index": 19, + "pair_id": "A_origin_43_0", + "image_paths": [ + "data/Academic/Table/A_origin_43/A_table_89.png", + "data/Academic/Table/A_origin_43/A_table_90.png" + ], + "domain": "academic" + } +] \ No newline at end of file diff --git a/multi_image_json_list/test_business_input.json b/multi_image_json_list/test_business_input.json new file mode 100755 index 0000000..ace8a1f --- /dev/null +++ b/multi_image_json_list/test_business_input.json @@ -0,0 +1,173 @@ +[ + { + "index": 0, + "pair_id": "B_origin_0_0_0", + "image_paths": [ + 
"data/Business/Table/B_origin_0/B_table_0_0.png", + "data/Business/Table/B_origin_0/B_table_1_0.png" + ], + "domain": "Business" + }, + { + "index": 1, + "pair_id": "B_origin_3_3_0", + "image_paths": [ + "data/Business/Table/B_origin_3/B_table_10_0.png", + "data/Business/Table/B_origin_3/B_table_11_0.png" + ], + "domain": "Business" + }, + { + "index": 2, + "pair_id": "B_origin_4_4_0", + "image_paths": [ + "data/Business/Table/B_origin_4/B_table_16_0.png", + "data/Business/Table/B_origin_4/B_table_16_1.png" + ], + "domain": "Business" + }, + { + "index": 3, + "pair_id": "B_origin_4_4_1", + "image_paths": [ + "data/Business/Table/B_origin_4/B_table_14_0.png", + "data/Business/Table/B_origin_4/B_table_15_0.png" + ], + "domain": "Business" + }, + { + "index": 4, + "pair_id": "B_origin_6_6_0", + "image_paths": [ + "data/Business/Table/B_origin_6/B_table_20_0.png", + "data/Business/Table/B_origin_6/B_table_21_0.png" + ], + "domain": "Business" + }, + { + "index": 5, + "pair_id": "B_origin_2_10_0", + "image_paths": [ + "data/Business/Table/B_origin_2/B_table_6_0.png", + "data/Business/Table/B_origin_6/B_table_23_0.png" + ], + "domain": "Business" + }, + { + "index": 7, + "pair_id": "B_origin_14_14_0", + "image_paths": [ + "data/Business/Table/B_origin_14/B_table_45_0.png", + "data/Business/Table/B_origin_14/B_table_45_1.png" + ], + "domain": "Business" + }, + { + "index": 8, + "pair_id": "B_origin_15_15_0", + "image_paths": [ + "data/Business/Table/B_origin_15/B_table_51_0.png", + "data/Business/Table/B_origin_15/B_table_51_1.png" + ], + "domain": "Business" + }, + { + "index": 9, + "pair_id": "B_origin_10_15_0", + "image_paths": [ + "data/Business/Table/B_origin_10/B_table_33_0.png", + "data/Business/Table/B_origin_15/B_table_52_0.png" + ], + "domain": "Business" + }, + { + "index": 10, + "pair_id": "B_origin_18_18_0", + "image_paths": [ + "data/Business/Table/B_origin_18/B_table_63_0.png", + "data/Business/Table/B_origin_18/B_table_63_1.png" + ], + "domain": "Business" 
+ }, + { + "index": 11, + "pair_id": "B_origin_18_18_1", + "image_paths": [ + "data/Business/Table/B_origin_18/B_table_61_0.png", + "data/Business/Table/B_origin_18/B_table_64_0.png" + ], + "domain": "Business" + }, + { + "index": 12, + "pair_id": "B_origin_20_20_0", + "image_paths": [ + "data/Business/Table/B_origin_20/B_table_68_0.png", + "data/Business/Table/B_origin_20/B_table_69_0.png" + ], + "domain": "Business" + }, + { + "index": 13, + "pair_id": "B_origin_21_21_0", + "image_paths": [ + "data/Business/Table/B_origin_21/B_table_70_0.png", + "data/Business/Table/B_origin_21/B_table_71_0.png" + ], + "domain": "Business" + }, + { + "index": 14, + "pair_id": "B_origin_21_21_1", + "image_paths": [ + "data/Business/Table/B_origin_21/B_table_72_0.png", + "data/Business/Table/B_origin_21/B_table_72_1.png" + ], + "domain": "Business" + }, + { + "index": 15, + "pair_id": "B_origin_23_23_0", + "image_paths": [ + "data/Business/Table/B_origin_23/B_table_80_0.png", + "data/Business/Table/B_origin_23/B_table_81_0.png" + ], + "domain": "Business" + }, + { + "index": 16, + "pair_id": "B_origin_23_23_0", + "image_paths": [ + "data/Business/Table/B_origin_23/B_table_80_0.png", + "data/Business/Table/B_origin_23/B_table_81_0.png" + ], + "domain": "Business" + }, + { + "index": 17, + "pair_id": "B_origin_24_24_0", + "image_paths": [ + "data/Business/Table/B_origin_24/B_table_83_0.png", + "data/Business/Table/B_origin_24/B_table_84_0.png" + ], + "domain": "Business" + }, + { + "index": 18, + "pair_id": "B_origin_32_32_0", + "image_paths": [ + "data/Business/Table/B_origin_32/B_table_110_0.png", + "data/Business/Table/B_origin_32/B_table_112_0.png" + ], + "domain": "Business" + }, + { + "index": 19, + "pair_id": "B_origin_37_37_0", + "image_paths": [ + "data/Business/Table/B_origin_37/B_table_132_0.png", + "data/Business/Table/B_origin_37/B_table_132_1.png" + ], + "domain": "Business" + } +] \ No newline at end of file diff --git a/multi_image_json_list/test_finance_input.json 
b/multi_image_json_list/test_finance_input.json new file mode 100755 index 0000000..98b7491 --- /dev/null +++ b/multi_image_json_list/test_finance_input.json @@ -0,0 +1,182 @@ +[ + { + "index": 0, + "pair_id": "F_table_0", + "image_paths": [ + "data/Finance/Table/F_origin_0/F_table_0_0.png", + "data/Finance/Table/F_origin_3/F_table_5_0.png" + ], + "domain": "finance" + }, + { + "index": 1, + "pair_id": "F_table_1", + "image_paths": [ + "data/Finance/Table/F_origin_4/F_table_6_0.png", + "data/Finance/Table/F_origin_6/F_table_8_0.png" + ], + "domain": "finance" + }, + { + "index": 2, + "pair_id": "F_table_2", + "image_paths": [ + "data/Finance/Table/F_origin_10/F_table_14_0.png", + "data/Finance/Table/F_origin_16/F_table_16_0.png" + ], + "domain": "finance" + }, + { + "index": 3, + "pair_id": "F_table_3", + "image_paths": [ + "data/Finance/Table/F_origin_21/F_table_71_0.png", + "data/Finance/Table/F_origin_23/F_table_74_0.png" + ], + "domain": "finance" + }, + { + "index": 4, + "pair_id": "F_table_4", + "image_paths": [ + "data/Finance/Table/F_origin_38/F_table_122_0.png", + "data/Finance/Table/F_origin_39/F_table_123_0.png" + ], + "domain": "finance" + }, + { + "index": 5, + "pair_id": "F_table_5", + "image_paths": [ + "data/Finance/Table/F_origin_41/F_table_129_0.png", + "data/Finance/Table/F_origin_42/F_table_130_0.png" + ], + "domain": "finance" + }, + { + "index": 6, + "pair_id": "F_table_6", + "image_paths": [ + "data/Finance/Table/F_origin_43/F_table_131_0.png", + "data/Finance/Table/F_origin_44/F_table_132_0.png" + ], + "domain": "finance" + }, + { + "index": 7, + "pair_id": "F_table_7", + "image_paths": [ + "data/Finance/Table/F_origin_45/F_table_136_0.png", + "data/Finance/Table/F_origin_49/F_table_146_0.png" + ], + "domain": "finance" + }, + { + "index": 8, + "pair_id": "F_table_8", + "image_paths": [ + "data/Finance/Table/F_origin_0/F_table_0_0.png", + "data/Finance/Table/F_origin_0/F_table_1_0.png" + ], + "domain": "finance" + }, + { + "index": 9, + 
"pair_id": "F_table_9", + "image_paths": [ + "data/Finance/Table/F_origin_2/F_table_3_1.png", + "data/Finance/Table/F_origin_2/F_table_4_0.png" + ], + "domain": "finance" + }, + { + "index": 10, + "pair_id": "F_table_10", + "image_paths": [ + "data/Finance/Table/F_origin_6/F_table_8_1.png", + "data/Finance/Table/F_origin_6/F_table_9_0.png" + ], + "domain": "finance" + }, + { + "index": 11, + "pair_id": "F_table_11", + "image_paths": [ + "data/Finance/Table/F_origin_16/F_table_39_0.png", + "data/Finance/Table/F_origin_16/F_table_45_0.png" + ], + "domain": "finance" + }, + { + "index": 12, + "pair_id": "F_table_12", + "image_paths": [ + "data/Finance/Table/F_origin_1/F_table_2_0.png", + "data/Finance/Table/F_origin_11/F_table_15_0.png" + ], + "domain": "finance" + }, + { + "index": 13, + "pair_id": "F_table_13", + "image_paths": [ + "data/Finance/Table/F_origin_3/F_table_5_0.png", + "data/Finance/Table/F_origin_13/F_table_21_0.png" + ], + "domain": "finance" + }, + { + "index": 14, + "pair_id": "F_table_14", + "image_paths": [ + "data/Finance/Table/F_origin_4/F_table_6_0.png", + "data/Finance/Table/F_origin_23/F_table_74_0.png" + ], + "domain": "finance" + }, + { + "index": 15, + "pair_id": "F_table_15", + "image_paths": [ + "data/Finance/Table/F_origin_10/F_table_14_0.png", + "data/Finance/Table/F_origin_21/F_table_71_0.png" + ], + "domain": "finance" + }, + { + "index": 16, + "pair_id": "F_table_16", + "image_paths": [ + "data/Finance/Table/F_origin_12/F_table_16_0.png", + "data/Finance/Table/F_origin_45/F_table_136_0.png" + ], + "domain": "finance" + }, + { + "index": 17, + "pair_id": "F_table_17", + "image_paths": [ + "data/Finance/Table/F_origin_17/F_table_49_0.png", + "data/Finance/Table/F_origin_48/F_table_143_0.png" + ], + "domain": "finance" + }, + { + "index": 18, + "pair_id": "F_table_18", + "image_paths": [ + "data/Finance/Table/F_origin_38/F_table_122_0.png", + "data/Finance/Table/F_origin_45/F_table_136_0.png" + ], + "domain": "finance" + }, + { + 
"index": 19, + "pair_id": "F_table_19", + "image_paths": [ + "data/Finance/Table/F_origin_43/F_table_131_0.png", + "data/Finance/Table/F_origin_49/F_table_146_0.png" + ], + "domain": "finance" + } +] \ No newline at end of file diff --git a/multi_image_json_list/test_medical_input.json b/multi_image_json_list/test_medical_input.json new file mode 100755 index 0000000..279ef70 --- /dev/null +++ b/multi_image_json_list/test_medical_input.json @@ -0,0 +1,164 @@ +[ + { + "index": 0, + "pair_id": "M_origin_0", + "image_paths": [ + "data/Medical/Table/M_table_0_0_0.png", + "data/Medical/Table/M_table_0_1_0.png" + ], + "domain": "medical" + }, + { + "index": 1, + "pair_id": "M_origin_2", + "image_paths": [ + "data/Medical/Table/M_table_2_0_0.png", + "data/Medical/Table/M_table_2_1_0.png" + ], + "domain": "medical" + }, + { + "index": 2, + "pair_id": "M_origin_3", + "image_paths": [ + "data/Medical/Table/M_table_3_0_1.png", + "data/Medical/Table/M_table_3_0_2.png" + ], + "domain": "medical" + }, + { + "index": 3, + "pair_id": "M_origin_4", + "image_paths": [ + "data/Medical/Table/M_table_4_0_0.png", + "data/Medical/Table/M_table_4_0_1.png" + ], + "domain": "medical" + }, + { + "index": 4, + "pair_id": "M_revised_2_2", + "image_paths": [ + "data/Medical/Table/M_table_2_2_0.png", + "data/Medical/Table/M_table_2_2_1.png" + ], + "domain": "medical" + }, + { + "index": 5, + "pair_id": "M_revised_3_0", + "image_paths": [ + "data/Medical/Table/M_table_3_0_0.png", + "data/Medical/Table/M_table_3_0_1.png" + ], + "domain": "medical" + }, + { + "index": 6, + "pair_id": "M_revised_4_0", + "image_paths": [ + "data/Medical/Table/M_table_4_0_0.png", + "data/Medical/Table/M_table_4_0_1.png" + ], + "domain": "medical" + }, + { + "index": 7, + "pair_id": "M_revised_6_3", + "image_paths": [ + "data/Medical/Table/M_table_6_3_0.png", + "data/Medical/Table/M_table_6_3_1.png" + ], + "domain": "medical" + }, + { + "index": 8, + "pair_id": "M_revised_8_0", + "image_paths": [ + 
"data/Medical/Table/M_table_8_0_0.png", + "data/Medical/Table/M_table_8_0_1.png" + ], + "domain": "medical" + }, + { + "index": 9, + "pair_id": "M_revised_9_0", + "image_paths": [ + "data/Medical/Table/M_table_9_0_0.png", + "data/Medical/Table/M_table_9_0_1.png" + ], + "domain": "medical" + }, + { + "index": 10, + "pair_id": "M_revised_10_0", + "image_paths": [ + "data/Medical/Table/M_table_10_0_0.png", + "data/Medical/Table/M_table_10_0_1.png" + ], + "domain": "medical" + }, + { + "index": 11, + "pair_id": "M_revised_11_0", + "image_paths": [ + "data/Medical/Table/M_table_11_0_0.png", + "data/Medical/Table/M_table_11_0_1.png" + ], + "domain": "medical" + }, + { + "index": 12, + "pair_id": "M_revised_13_0", + "image_paths": [ + "data/Medical/Table/M_table_13_0_0.png", + "data/Medical/Table/M_table_13_0_1.png" + ], + "domain": "medical" + }, + { + "index": 13, + "pair_id": "M_revised_14_0", + "image_paths": [ + "data/Medical/Table/M_table_14_0_0.png", + "data/Medical/Table/M_table_14_0_1.png" + ], + "domain": "medical" + }, + { + "index": 14, + "pair_id": "M_revised_15_0", + "image_paths": [ + "data/Medical/Table/M_table_15_0_0.png", + "data/Medical/Table/M_table_15_0_1.png" + ], + "domain": "medical" + }, + { + "index": 15, + "pair_id": "M_revised_16_0", + "image_paths": [ + "data/Medical/Table/M_table_16_0_0.png", + "data/Medical/Table/M_table_16_0_1.png" + ], + "domain": "medical" + }, + { + "index": 16, + "pair_id": "M_revised_2_3", + "image_paths": [ + "data/Medical/Table/M_table_2_3_0.png", + "data/Medical/Table/M_table_2_3_1.png" + ], + "domain": "medical" + }, + { + "index": 17, + "pair_id": "M_revised_10_1", + "image_paths": [ + "data/Medical/Table/M_table_10_1_0.png", + "data/Medical/Table/M_table_10_1_1.png" + ], + "domain": "medical" + } +] \ No newline at end of file diff --git a/multi_image_json_list/test_public_input.json b/multi_image_json_list/test_public_input.json new file mode 100755 index 0000000..832ef44 --- /dev/null +++ 
b/multi_image_json_list/test_public_input.json @@ -0,0 +1,182 @@ +[ + { + "index": 0, + "pair_id": "P_origin_0_1", + "image_paths": [ + "data/Public/Table/P_origin_0/P_origin_0_1_0.png", + "data/Public/Table/P_origin_0/P_origin_0_1_1.png" + ], + "domain": "public" + }, + { + "index": 1, + "pair_id": "P_origin_0_2", + "image_paths": [ + "data/Public/Table/P_origin_0/P_origin_0_2_1.png", + "data/Public/Table/P_origin_0/P_origin_0_2_2.png" + ], + "domain": "public" + }, + { + "index": 2, + "pair_id": "P_origin_1_9", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_9_0.png", + "data/Public/Table/P_origin_1/P_origin_1_9_1.png" + ], + "domain": "public" + }, + { + "index": 3, + "pair_id": "P_origin_1_10", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_10_0.png", + "data/Public/Table/P_origin_1/P_origin_1_10_1.png" + ], + "domain": "public" + }, + { + "index": 4, + "pair_id": "P_origin_1_12", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_12_0.png", + "data/Public/Table/P_origin_1/P_origin_1_12_1.png" + ], + "domain": "public" + }, + { + "index": 5, + "pair_id": "P_origin_1_16", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_16_0.png", + "data/Public/Table/P_origin_1/P_origin_1_16_1.png" + ], + "domain": "public" + }, + { + "index": 6, + "pair_id": "P_origin_1_17", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_17_0.png", + "data/Public/Table/P_origin_1/P_origin_1_17_1.png" + ], + "domain": "public" + }, + { + "index": 7, + "pair_id": "P_origin_1_18", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_18_0.png", + "data/Public/Table/P_origin_1/P_origin_1_18_1.png" + ], + "domain": "public" + }, + { + "index": 8, + "pair_id": "P_origin_1_23", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_23_0.png", + "data/Public/Table/P_origin_1/P_origin_1_23_1.png" + ], + "domain": "public" + }, + { + "index": 9, + "pair_id": "P_origin_2_0", + "image_paths": [ + 
"data/Public/Table/P_origin_2/P_origin_2_0_0.png", + "data/Public/Table/P_origin_2/P_origin_2_0_1.png" + ], + "domain": "public" + }, + { + "index": 10, + "pair_id": "P_origin_3_2", + "image_paths": [ + "data/Public/Table/P_origin_3/P_origin_3_2_0.png", + "data/Public/Table/P_origin_3/P_origin_3_2_1.png" + ], + "domain": "public" + }, + { + "index": 11, + "pair_id": "P_origin_4_9", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_9_0.png", + "data/Public/Table/P_origin_4/P_origin_4_9_1.png" + ], + "domain": "public" + }, + { + "index": 12, + "pair_id": "P_origin_4_11", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_11_0.png", + "data/Public/Table/P_origin_4/P_origin_4_11_1.png" + ], + "domain": "public" + }, + { + "index": 13, + "pair_id": "P_origin_5_1", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_1_0.png", + "data/Public/Table/P_origin_5/P_origin_5_1_1.png" + ], + "domain": "public" + }, + { + "index": 14, + "pair_id": "P_origin_5_17", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_17_0.png", + "data/Public/Table/P_origin_5/P_origin_5_17_1.png" + ], + "domain": "public" + }, + { + "index": 15, + "pair_id": "P_origin_6_12", + "image_paths": [ + "data/Public/Table/P_origin_6/P_origin_6_12_0.png", + "data/Public/Table/P_origin_6/P_origin_6_12_1.png" + ], + "domain": "public" + }, + { + "index": 16, + "pair_id": "P_origin_7_4", + "image_paths": [ + "data/Public/Table/P_origin_7/P_origin_7_4_0.png", + "data/Public/Table/P_origin_7/P_origin_7_4_1.png" + ], + "domain": "public" + }, + { + "index": 17, + "pair_id": "P_origin_7_8", + "image_paths": [ + "data/Public/Table/P_origin_7/P_origin_7_8_0.png", + "data/Public/Table/P_origin_7/P_origin_7_8_1.png" + ], + "domain": "public" + }, + { + "index": 18, + "pair_id": "P_origin_8_14", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_14_0.png", + "data/Public/Table/P_origin_8/P_origin_8_14_1.png" + ], + "domain": "public" + }, + { + "index": 19, + 
"pair_id": "P_origin_9_0", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_0_1.png", + "data/Public/Table/P_origin_9/P_origin_9_0_2.png" + ], + "domain": "public" + } +] \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 513cc81..8d5c460 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,4 +32,5 @@ dependencies = [ "pymongo>=4.6.1", "langgraph-checkpoint-sqlite>=3.0.1", "notion-client>=2.0.0", + "playwright>=1.57.0", ] diff --git a/regenerate_qa.py b/regenerate_qa.py new file mode 100755 index 0000000..008dac4 --- /dev/null +++ b/regenerate_qa.py @@ -0,0 +1,455 @@ +#!/usr/bin/env python3 +""" +기존 synthetic 테이블에서 QA를 재생성하는 스크립트. + +output_*/html/ 디렉토리의 HTML 파일을 직접 읽어서 새로운 QA pairs를 생성합니다. +pipeline_output.json은 entry 목록과 결과 저장에만 사용됩니다. + +Usage: + # 특정 도메인 재생성 + python regenerate_qa.py --domain business + + # 여러 도메인 재생성 + python regenerate_qa.py --domain business finance academic medical + + # 모든 도메인 재생성 (output_public 제외) + python regenerate_qa.py --all + + # 특정 provider/model 사용 + python regenerate_qa.py --domain business --provider openai --model gpt-4o +""" + +import argparse +import json +import logging +import os +import re +import sys +from pathlib import Path +from typing import List, Dict, Any, Optional +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import datetime + +from dotenv import load_dotenv + +load_dotenv() + +# Add project root to path +project_root = Path(__file__).parent +sys.path.insert(0, str(project_root)) + +from generate_synthetic_table.flow import ( + _load_prompt, + _call_llm, + robust_json_parse, +) + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + +# 도메인별 output 디렉토리 +DOMAIN_DIRS = { + "academic": "output_academic", + "business": "output_business", + "finance": "output_finance", + "medical": "output_medical", + # "public": "output_public", # 제외 +} + + +def 
get_llm_client(provider: str, model: str): + """LLM 클라이언트 생성""" + from langchain_openai import ChatOpenAI + from langchain_anthropic import ChatAnthropic + from langchain_google_genai import ChatGoogleGenerativeAI + + if provider == "openai": + return ChatOpenAI( + model=model, + temperature=0.7, + api_key=os.getenv("OPENAI_API_KEY"), + ) + elif provider in ["claude", "anthropic"]: + return ChatAnthropic( + model=model, + temperature=0.7, + api_key=os.getenv("ANTHROPIC_API_KEY"), + ) + elif provider in ["gemini", "google"]: + return ChatGoogleGenerativeAI( + model=model, + temperature=0.7, + google_api_key=os.getenv("GOOGLE_API_KEY"), + ) + else: + raise ValueError(f"Unknown provider: {provider}") + + +def find_html_files(output_dir: Path, pair_id: str) -> List[Path]: + """ + output_dir/html/ 디렉토리에서 pair_id에 해당하는 HTML 파일들을 찾습니다. + + 파일 패턴: {pair_id}_table_*.html + 예: B_origin_0_0_0_table_0.html, B_origin_0_0_0_table_1.html + """ + html_dir = output_dir / "html" + if not html_dir.exists(): + return [] + + # pair_id로 시작하는 HTML 파일 찾기 + pattern = f"{pair_id}_table_*.html" + html_files = sorted(html_dir.glob(pattern)) + + return html_files + + +def read_html_files(html_files: List[Path]) -> List[str]: + """ + HTML 파일들을 읽어서 내용을 반환합니다. 
+ """ + html_contents = [] + for html_file in html_files: + try: + with open(html_file, "r", encoding="utf-8") as f: + content = f.read().strip() + if content: + html_contents.append(content) + except Exception as e: + logger.warning(f"Failed to read {html_file}: {e}") + + return html_contents + + +def generate_qa_for_table( + llm, + synthetic_html: str, + domain: str, +) -> List[Dict[str, Any]]: + """단일 synthetic table에 대해 QA를 생성합니다.""" + try: + prompt_template = _load_prompt("generate_qa", domain) + prompt = prompt_template.format(synthetic_html=synthetic_html) + + response_text, _ = _call_llm(llm, prompt, return_token_usage=True) + response_json = robust_json_parse(response_text) + + if response_json and "qa_pairs" in response_json: + return response_json["qa_pairs"] + else: + logger.warning("QA generation did not return valid qa_pairs") + return [] + except Exception as e: + logger.error(f"Failed to generate QA: {e}") + return [] + + +def generate_long_sequence_for_table( + llm, + synthetic_html: str, + domain: str, +) -> List[Dict[str, Any]]: + """단일 synthetic table에 대해 long_sequence QA를 생성합니다.""" + try: + prompt_template = _load_prompt("generate_long_sequence", domain) + prompt = prompt_template.format(synthetic_html=synthetic_html) + + response_text, _ = _call_llm(llm, prompt, return_token_usage=True) + response_json = robust_json_parse(response_text) + + if response_json and "qa_pairs" in response_json: + return response_json["qa_pairs"] + else: + return [] + except ValueError: + # generate_long_sequence prompt not found + return [] + except Exception as e: + logger.warning(f"Failed to generate long_sequence QA: {e}") + return [] + + +def regenerate_qa_for_entry( + llm, + entry: Dict[str, Any], + output_dir: Path, + domain: str, + include_long_sequence: bool = True, +) -> Dict[str, Any]: + """ + 단일 entry에 대해 QA를 재생성합니다. + + html/ 디렉토리에서 HTML 파일을 직접 읽어서 QA를 생성합니다. + 여러 테이블이 있는 경우, 각 테이블에 대해 QA를 생성하고 합칩니다. 
+ """ + pair_id = entry.get("pair_id", entry.get("name", "unknown")) + + # HTML 파일 찾기 및 읽기 + html_files = find_html_files(output_dir, pair_id) + if not html_files: + logger.warning(f"No HTML files found for {pair_id} in {output_dir}/html/") + return entry + + synthetic_tables = read_html_files(html_files) + if not synthetic_tables: + logger.warning(f"Failed to read HTML files for {pair_id}") + return entry + + logger.info(f" Found {len(html_files)} HTML files: {[f.name for f in html_files]}") + + all_qa_results = [] + + # 각 테이블에 대해 QA 생성 + for idx, synthetic_html in enumerate(synthetic_tables): + logger.info(f" Generating QA for table {idx + 1}/{len(synthetic_tables)}") + + # 기본 QA 생성 (9개 타입) + qa_results = generate_qa_for_table(llm, synthetic_html, domain) + all_qa_results.extend(qa_results) + + # long_sequence QA 생성 (선택적) + if include_long_sequence: + long_seq_results = generate_long_sequence_for_table(llm, synthetic_html, domain) + all_qa_results.extend(long_seq_results) + + # 결과 업데이트 + updated_entry = entry.copy() + updated_entry["qa_results"] = all_qa_results + updated_entry["qa_regenerated_at"] = datetime.now().isoformat() + updated_entry["html_files_used"] = [f.name for f in html_files] + + return updated_entry + + +def regenerate_qa_for_domain( + domain: str, + provider: str = "claude", + model: str = "claude-sonnet-4-5", + include_long_sequence: bool = True, + limit: Optional[int] = None, + dry_run: bool = False, +) -> Dict[str, Any]: + """ + 특정 도메인의 모든 entry에 대해 QA를 재생성합니다. 
+ """ + domain_dir = DOMAIN_DIRS.get(domain) + if not domain_dir: + raise ValueError(f"Unknown domain: {domain}") + + output_dir = project_root / domain_dir + pipeline_output_path = output_dir / "pipeline_output.json" + + if not pipeline_output_path.exists(): + raise FileNotFoundError(f"pipeline_output.json not found: {pipeline_output_path}") + + # Load existing data + with open(pipeline_output_path, "r", encoding="utf-8") as f: + data = json.load(f) + + logger.info(f"Loaded {len(data)} entries from {pipeline_output_path}") + + if limit: + data = data[:limit] + logger.info(f"Limited to {limit} entries") + + if dry_run: + logger.info("Dry run mode - not regenerating QA") + # HTML 파일 존재 여부 확인 + html_dir = output_dir / "html" + if not html_dir.exists(): + logger.warning(f"HTML directory not found: {html_dir}") + return {"domain": domain, "entries": len(data), "dry_run": True, "html_dir_exists": False} + + # 각 entry에 대해 HTML 파일 수 확인 + entries_with_html = 0 + total_html_files = 0 + for entry in data: + pair_id = entry.get("pair_id", entry.get("name", "")) + html_files = find_html_files(output_dir, pair_id) + if html_files: + entries_with_html += 1 + total_html_files += len(html_files) + logger.info(f" {pair_id}: {len(html_files)} HTML files") + + logger.info(f"Summary: {entries_with_html}/{len(data)} entries have HTML files ({total_html_files} total)") + return { + "domain": domain, + "entries": len(data), + "entries_with_html": entries_with_html, + "total_html_files": total_html_files, + "dry_run": True, + } + + # Create LLM client + llm = get_llm_client(provider, model) + + # Regenerate QA for each entry + updated_data = [] + success_count = 0 + error_count = 0 + + for i, entry in enumerate(data): + pair_id = entry.get("pair_id", entry.get("name", f"entry_{i}")) + logger.info(f"[{i + 1}/{len(data)}] Processing: {pair_id}") + + try: + updated_entry = regenerate_qa_for_entry( + llm, + entry, + output_dir, + domain, + include_long_sequence=include_long_sequence, + ) + 
updated_data.append(updated_entry) + + qa_count = len(updated_entry.get("qa_results", [])) + logger.info(f" Generated {qa_count} QA pairs") + success_count += 1 + except Exception as e: + logger.error(f" Failed: {e}") + updated_data.append(entry) # Keep original + error_count += 1 + + # Backup original file + backup_path = output_dir / f"pipeline_output_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" + with open(backup_path, "w", encoding="utf-8") as f: + json.dump(data, f, ensure_ascii=False, indent=2) + logger.info(f"Backed up original to {backup_path}") + + # Save updated data + with open(pipeline_output_path, "w", encoding="utf-8") as f: + json.dump(updated_data, f, ensure_ascii=False, indent=2) + logger.info(f"Saved updated data to {pipeline_output_path}") + + return { + "domain": domain, + "total_entries": len(data), + "success": success_count, + "errors": error_count, + "backup": str(backup_path), + } + + +def main(): + parser = argparse.ArgumentParser( + description="기존 synthetic 테이블에서 QA를 재생성합니다.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # 특정 도메인 재생성 + python regenerate_qa.py --domain business + + # 여러 도메인 재생성 + python regenerate_qa.py --domain business finance + + # 모든 도메인 재생성 (output_public 제외) + python regenerate_qa.py --all + + # OpenAI 사용 + python regenerate_qa.py --domain business --provider openai --model gpt-4o + + # 테스트 (5개만) + python regenerate_qa.py --domain business --limit 5 + + # Dry run (실제 재생성 없이 확인만) + python regenerate_qa.py --domain business --dry-run + """ + ) + + parser.add_argument( + "--domain", + nargs="+", + choices=list(DOMAIN_DIRS.keys()), + help="재생성할 도메인(들)", + ) + parser.add_argument( + "--all", + action="store_true", + help="모든 도메인 재생성 (output_public 제외)", + ) + parser.add_argument( + "--provider", + default="claude", + choices=["claude", "anthropic", "openai", "gemini", "google"], + help="LLM 제공자 (default: claude)", + ) + parser.add_argument( + "--model", + 
default="claude-sonnet-4-5", + help="모델 이름 (default: claude-sonnet-4-5)", + ) + parser.add_argument( + "--no-long-sequence", + action="store_true", + help="long_sequence QA 생성 스킵", + ) + parser.add_argument( + "--limit", + type=int, + help="처리할 최대 entry 수 (테스트용)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="실제 재생성 없이 확인만", + ) + + args = parser.parse_args() + + # Determine domains to process + if args.all: + domains = list(DOMAIN_DIRS.keys()) + elif args.domain: + domains = args.domain + else: + parser.error("--domain 또는 --all을 지정해야 합니다.") + + logger.info(f"Domains to process: {domains}") + logger.info(f"Provider: {args.provider}, Model: {args.model}") + + # Process each domain + results = [] + for domain in domains: + logger.info(f"\n{'='*60}") + logger.info(f"Processing domain: {domain}") + logger.info(f"{'='*60}") + + try: + result = regenerate_qa_for_domain( + domain=domain, + provider=args.provider, + model=args.model, + include_long_sequence=not args.no_long_sequence, + limit=args.limit, + dry_run=args.dry_run, + ) + results.append(result) + logger.info(f"Completed: {result}") + except Exception as e: + logger.error(f"Failed to process {domain}: {e}") + results.append({"domain": domain, "error": str(e)}) + + # Summary + print("\n" + "=" * 60) + print(" QA Regeneration Summary") + print("=" * 60) + for result in results: + domain = result.get("domain", "unknown") + if "error" in result: + print(f" {domain}: ERROR - {result['error']}") + elif result.get("dry_run"): + html_info = "" + if "entries_with_html" in result: + html_info = f", {result['entries_with_html']}/{result['entries']} with HTML ({result['total_html_files']} files)" + elif result.get("html_dir_exists") is False: + html_info = ", NO html/ directory!" 
+ print(f" {domain}: {result.get('entries', 0)} entries (dry run){html_info}") + else: + print(f" {domain}: {result.get('success', 0)}/{result.get('total_entries', 0)} success, {result.get('errors', 0)} errors") + print("=" * 60) + + +if __name__ == "__main__": + main() diff --git a/run_all.sh b/run_all.sh new file mode 100755 index 0000000..87ed09f --- /dev/null +++ b/run_all.sh @@ -0,0 +1,536 @@ +#!/bin/bash + +# ============================================================================== +# TableMagnifier - Master Pipeline Script +# ============================================================================== +# +# 전체 파이프라인을 통합 실행합니다: +# 1. Synthetic Table 생성 (from JSON input) +# 2. HTML → Image 변환 +# 3. QA 재생성 (선택) +# 4. QA 난이도 필터링 (vLLM 필요) +# 5. 평가 (vLLM 필요) +# +# Usage: +# ./run_all.sh --input data.json --domain business [OPTIONS] +# +# Examples: +# # 기본 파이프라인 (테이블 생성 + 이미지 변환) +# ./run_all.sh --input test.json --domain business +# +# # 전체 파이프라인 (vLLM 평가 포함) +# ./run_all.sh --input test.json --domain business --with-eval --vllm-url http://localhost:8000/v1 +# +# # QA 재생성만 +# ./run_all.sh --domain business --regenerate-qa-only +# +# # 필터링 + 평가만 (이미 테이블/이미지가 있는 경우) +# ./run_all.sh --domain business --filter-only --with-eval +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +# ============================================================================== +# Configuration +# ============================================================================== + +# Default values +INPUT_JSON="" +DOMAIN="" +OUTPUT_DIR="" +PROVIDER="claude" +MODEL="claude-sonnet-4-5" +VLLM_URL="http://localhost:8000/v1" + +# Pipeline steps (default: generate + capture) +DO_GENERATE=true +DO_CAPTURE=true +DO_REGENERATE_QA=false +DO_FILTER=false +DO_EVAL=false + +# Options +LIMIT="" +DRY_RUN=false +SKIP_QA=false +FILTER_TRIALS=10 +FILTER_MIN_ACC=0.3 +FILTER_MAX_ACC=0.6 + +# Colors +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +RED='\033[0;31m' 
+BLUE='\033[0;34m' +CYAN='\033[0;36m' +BOLD='\033[1m' +NC='\033[0m' + +# ============================================================================== +# Helper Functions +# ============================================================================== + +echo_header() { + echo "" + echo -e "${BLUE}${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" + echo -e "${BLUE}${BOLD} $1${NC}" + echo -e "${BLUE}${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" +} + +echo_step() { + echo "" + echo -e "${CYAN}▶ STEP $1: $2${NC}" + echo -e "${CYAN}─────────────────────────────────────────────────────────────${NC}" +} + +echo_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +echo_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +echo_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +echo_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +show_help() { + cat << 'EOF' +Usage: ./run_all.sh [OPTIONS] + +TableMagnifier 전체 파이프라인을 통합 실행합니다. + +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + Required Options +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + --domain DOMAIN 도메인 (business, finance, academic, medical, public) + +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + Pipeline Steps +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + --input FILE 입력 JSON 파일 (테이블 생성 시 필수) + --regenerate-qa QA 재생성 포함 + --regenerate-qa-only QA 재생성만 실행 (테이블 생성 스킵) + --with-filter vLLM으로 QA 난이도 필터링 포함 + --filter-only 필터링만 실행 (테이블/이미지 생성 스킵) + --with-eval vLLM으로 평가 포함 + --eval-only 평가만 실행 + +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + Generation Options +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + --output-dir DIR 출력 디렉토리 (default: output_{domain}) + --provider PROVIDER LLM 제공자: claude, openai, gemini (default: claude) + --model MODEL 모델 이름 (default: claude-sonnet-4-5) + --skip-qa 테이블 생성 시 QA 생성 스킵 + 
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + vLLM Options (for filter/eval) +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + --vllm-url URL vLLM 서버 URL (default: http://localhost:8000/v1) + --filter-trials N 필터링 시 QA당 시도 횟수 (default: 10) + --filter-min-acc FLOAT 필터링 최소 정확도 (default: 0.3) + --filter-max-acc FLOAT 필터링 최대 정확도 (default: 0.6) + +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + Other Options +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + --limit N 처리할 최대 entry 수 (테스트용) + --dry-run 실제 실행 없이 확인만 + -h, --help 도움말 표시 + +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + Examples +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + + # 1. 기본 파이프라인 (테이블 생성 → 이미지 변환) + ./run_all.sh --input test.json --domain business + + # 2. 전체 파이프라인 (생성 → 이미지 → 필터링 → 평가) + ./run_all.sh --input test.json --domain business --with-filter --with-eval + + # 3. QA만 재생성 (기존 테이블 유지) + ./run_all.sh --domain business --regenerate-qa-only + + # 4. 필터링만 (이미 이미지가 있는 경우) + ./run_all.sh --domain business --filter-only + + # 5. 평가만 + ./run_all.sh --domain business --eval-only + + # 6. OpenAI 사용 + ./run_all.sh --input test.json --domain business --provider openai --model gpt-4o + + # 7. 
테스트 (3개만) + ./run_all.sh --input test.json --domain business --limit 3 --dry-run + +EOF +} + +# ============================================================================== +# Argument Parsing +# ============================================================================== + +# Parse first argument as JSON file if it ends with .json +if [[ "$1" == *.json ]]; then + INPUT_JSON="$1" + shift +fi + +while [[ $# -gt 0 ]]; do + case $1 in + --input) + INPUT_JSON="$2" + shift 2 + ;; + --domain) + DOMAIN="$2" + shift 2 + ;; + --output-dir) + OUTPUT_DIR="$2" + shift 2 + ;; + --provider) + PROVIDER="$2" + shift 2 + ;; + --model) + MODEL="$2" + shift 2 + ;; + --vllm-url) + VLLM_URL="$2" + shift 2 + ;; + --regenerate-qa) + DO_REGENERATE_QA=true + shift + ;; + --regenerate-qa-only) + DO_GENERATE=false + DO_CAPTURE=false + DO_REGENERATE_QA=true + shift + ;; + --with-filter) + DO_FILTER=true + shift + ;; + --filter-only) + DO_GENERATE=false + DO_CAPTURE=false + DO_FILTER=true + shift + ;; + --with-eval) + DO_EVAL=true + shift + ;; + --eval-only) + DO_GENERATE=false + DO_CAPTURE=false + DO_EVAL=true + shift + ;; + --filter-trials) + FILTER_TRIALS="$2" + shift 2 + ;; + --filter-min-acc) + FILTER_MIN_ACC="$2" + shift 2 + ;; + --filter-max-acc) + FILTER_MAX_ACC="$2" + shift 2 + ;; + --skip-qa) + SKIP_QA=true + shift + ;; + --limit) + LIMIT="$2" + shift 2 + ;; + --dry-run) + DRY_RUN=true + shift + ;; + -h|--help) + show_help + exit 0 + ;; + *) + echo_error "Unknown option: $1" + echo "Use -h or --help for usage information." + exit 1 + ;; + esac +done + +# ============================================================================== +# Validation +# ============================================================================== + +# Domain is always required +if [[ -z "$DOMAIN" ]]; then + echo_error "--domain is required" + echo "Use -h or --help for usage information." 
+ exit 1 +fi + +# Input JSON required for generation +if [[ "$DO_GENERATE" == true ]] && [[ -z "$INPUT_JSON" ]]; then + echo_error "--input is required for table generation" + echo "Use --regenerate-qa-only, --filter-only, or --eval-only to skip generation." + exit 1 +fi + +# Check input file exists +if [[ -n "$INPUT_JSON" ]] && [[ ! -f "$INPUT_JSON" ]]; then + echo_error "Input file not found: $INPUT_JSON" + exit 1 +fi + +# Set default output directory +if [[ -z "$OUTPUT_DIR" ]]; then + OUTPUT_DIR="output_${DOMAIN}" +fi + +# ============================================================================== +# Check Dependencies +# ============================================================================== + +check_vllm_connection() { + if curl -s --connect-timeout 5 "${VLLM_URL}/models" > /dev/null 2>&1; then + VLLM_MODEL=$(curl -s "${VLLM_URL}/models" | python3 -c "import sys, json; data = json.load(sys.stdin); print(data['data'][0]['id'] if data.get('data') else 'unknown')" 2>/dev/null || echo "unknown") + echo_info "vLLM connected: ${VLLM_MODEL}" + return 0 + else + return 1 + fi +} + +check_api_key() { + case $PROVIDER in + claude|anthropic) + if [[ -z "$ANTHROPIC_API_KEY" ]]; then + echo_warn "ANTHROPIC_API_KEY is not set" + fi + ;; + openai) + if [[ -z "$OPENAI_API_KEY" ]]; then + echo_warn "OPENAI_API_KEY is not set" + fi + ;; + gemini|google) + if [[ -z "$GOOGLE_API_KEY" ]]; then + echo_warn "GOOGLE_API_KEY is not set" + fi + ;; + esac +} + +# ============================================================================== +# Main Pipeline +# ============================================================================== + +echo_header "TableMagnifier - Master Pipeline" + +echo "" +echo "Configuration:" +echo " Domain: $DOMAIN" +echo " Output Dir: $OUTPUT_DIR" +echo " Provider: $PROVIDER" +echo " Model: $MODEL" +if [[ -n "$INPUT_JSON" ]]; then + echo " Input JSON: $INPUT_JSON" +fi +if [[ -n "$LIMIT" ]]; then + echo " Limit: $LIMIT entries" +fi +if [[ 
"$DRY_RUN" == true ]]; then + echo " Mode: DRY RUN" +fi +echo "" +echo "Pipeline Steps:" +echo " 1. Generate Tables: $([ "$DO_GENERATE" == true ] && echo "✓" || echo "✗")" +echo " 2. Capture Images: $([ "$DO_CAPTURE" == true ] && echo "✓" || echo "✗")" +echo " 3. Regenerate QA: $([ "$DO_REGENERATE_QA" == true ] && echo "✓" || echo "✗")" +echo " 4. Filter QA: $([ "$DO_FILTER" == true ] && echo "✓" || echo "✗")" +echo " 5. Evaluate: $([ "$DO_EVAL" == true ] && echo "✓" || echo "✗")" +echo "" + +# Check API key for generation steps +if [[ "$DO_GENERATE" == true ]] || [[ "$DO_REGENERATE_QA" == true ]]; then + check_api_key +fi + +# Check vLLM for filter/eval steps +if [[ "$DO_FILTER" == true ]] || [[ "$DO_EVAL" == true ]]; then + echo_info "Checking vLLM connection..." + if ! check_vllm_connection; then + echo_error "Cannot connect to vLLM server at ${VLLM_URL}" + echo_error "Please ensure vLLM server is running for filter/eval steps." + exit 1 + fi +fi + +STEP_NUM=0 + +# ------------------------------------------------------------------------------ +# Step 1: Generate Synthetic Tables +# ------------------------------------------------------------------------------ +if [[ "$DO_GENERATE" == true ]]; then + STEP_NUM=$((STEP_NUM + 1)) + echo_step $STEP_NUM "Generate Synthetic Tables" + + GENERATE_ARGS="--input \"$INPUT_JSON\" --output-dir \"$OUTPUT_DIR\" --provider \"$PROVIDER\" --model \"$MODEL\" --domain \"$DOMAIN\"" + + if [[ "$SKIP_QA" == true ]]; then + GENERATE_ARGS="$GENERATE_ARGS --skip-qa" + fi + + if [[ -n "$LIMIT" ]]; then + GENERATE_ARGS="$GENERATE_ARGS --limit $LIMIT" + fi + + if [[ "$DRY_RUN" == true ]]; then + echo_info "[DRY RUN] Would run: uv run python run_pipeline_json.py $GENERATE_ARGS" + else + eval "uv run python run_pipeline_json.py $GENERATE_ARGS" + echo_success "Table generation completed" + fi +fi + +# ------------------------------------------------------------------------------ +# Step 2: Capture HTML to Images +# 
------------------------------------------------------------------------------ +if [[ "$DO_CAPTURE" == true ]]; then + STEP_NUM=$((STEP_NUM + 1)) + echo_step $STEP_NUM "Capture HTML to Images" + + CAPTURE_ARGS="--output-dirs $OUTPUT_DIR" + + if [[ "$DRY_RUN" == true ]]; then + echo_info "[DRY RUN] Would run: uv run python capture_html_images.py $CAPTURE_ARGS" + else + uv run python capture_html_images.py $CAPTURE_ARGS + echo_success "Image capture completed" + fi +fi + +# ------------------------------------------------------------------------------ +# Step 3: Regenerate QA (Optional) +# ------------------------------------------------------------------------------ +if [[ "$DO_REGENERATE_QA" == true ]]; then + STEP_NUM=$((STEP_NUM + 1)) + echo_step $STEP_NUM "Regenerate QA" + + REGEN_ARGS="--domain $DOMAIN --provider $PROVIDER --model $MODEL" + + if [[ -n "$LIMIT" ]]; then + REGEN_ARGS="$REGEN_ARGS --limit $LIMIT" + fi + + if [[ "$DRY_RUN" == true ]]; then + REGEN_ARGS="$REGEN_ARGS --dry-run" + fi + + uv run python regenerate_qa.py $REGEN_ARGS + echo_success "QA regeneration completed" +fi + +# ------------------------------------------------------------------------------ +# Step 4: Filter QA by Difficulty (Optional) +# ------------------------------------------------------------------------------ +if [[ "$DO_FILTER" == true ]]; then + STEP_NUM=$((STEP_NUM + 1)) + echo_step $STEP_NUM "Filter QA by Difficulty" + + FILTER_ARGS="--domain $DOMAIN --vllm-url $VLLM_URL --trials $FILTER_TRIALS --min-acc $FILTER_MIN_ACC --max-acc $FILTER_MAX_ACC" + + if [[ -n "$LIMIT" ]]; then + FILTER_ARGS="$FILTER_ARGS --limit $LIMIT" + fi + + if [[ "$DRY_RUN" == true ]]; then + FILTER_ARGS="$FILTER_ARGS --dry-run" + fi + + uv run python filter_qa_by_difficulty.py $FILTER_ARGS + echo_success "QA filtering completed" +fi + +# ------------------------------------------------------------------------------ +# Step 5: Evaluate (Optional) +# 
------------------------------------------------------------------------------ +if [[ "$DO_EVAL" == true ]]; then + STEP_NUM=$((STEP_NUM + 1)) + echo_step $STEP_NUM "Evaluate with vLLM" + + EVAL_ARGS="--domain $DOMAIN --vllm-url $VLLM_URL" + + if [[ -n "$LIMIT" ]]; then + EVAL_ARGS="$EVAL_ARGS --limit $LIMIT" + fi + + if [[ "$DRY_RUN" == true ]]; then + EVAL_ARGS="$EVAL_ARGS --dry-run" + fi + + uv run python -m eval.evaluate_vllm $EVAL_ARGS + echo_success "Evaluation completed" +fi + +# ============================================================================== +# Summary +# ============================================================================== + +echo_header "Pipeline Completed" + +echo "" +echo "Output Directory: $OUTPUT_DIR/" +echo "" +echo "Generated Files:" + +if [[ -d "$OUTPUT_DIR" ]]; then + # Count files + JSON_COUNT=$(find "$OUTPUT_DIR" -maxdepth 1 -name "*.json" 2>/dev/null | wc -l) + HTML_COUNT=$(find "$OUTPUT_DIR/html" -name "*.html" 2>/dev/null | wc -l) + IMAGE_COUNT=$(find "$OUTPUT_DIR/images" -name "*.png" 2>/dev/null | wc -l) + + echo " - JSON files: $JSON_COUNT" + echo " - HTML files: $HTML_COUNT (in html/)" + echo " - Images: $IMAGE_COUNT (in images/)" + + if [[ "$DO_FILTER" == true ]] && [[ "$DRY_RUN" != true ]]; then + REVIEW_FILE=$(ls -t "$OUTPUT_DIR"/qa_for_review_*.json 2>/dev/null | head -1) + if [[ -n "$REVIEW_FILE" ]]; then + REVIEW_COUNT=$(python3 -c "import json; print(json.load(open('$REVIEW_FILE'))['count'])" 2>/dev/null || echo "?") + echo "" + echo " Review File: $(basename $REVIEW_FILE)" + echo " QA for Review: $REVIEW_COUNT items" + fi + fi + + if [[ "$DO_EVAL" == true ]] && [[ "$DRY_RUN" != true ]]; then + EVAL_FILE=$(ls -t "$OUTPUT_DIR"/eval_results_*.json 2>/dev/null | head -1) + if [[ -n "$EVAL_FILE" ]]; then + echo "" + echo " Eval Results: $(basename $EVAL_FILE)" + fi + fi +fi + +echo "" +echo -e "${GREEN}Done!${NC}" diff --git a/run_capture_html.sh b/run_capture_html.sh new file mode 100755 index 0000000..fb3a9df 
--- /dev/null +++ b/run_capture_html.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# ============================================================================== +# HTML to Image Capture Script +# Captures HTML files from output_*/html/ directories as PNG images +# ============================================================================== + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +echo "===================================" +echo " HTML to Image Capture" +echo "===================================" + +# Check if playwright is installed +if ! uv run python -c "import playwright" 2>/dev/null; then + echo "[INFO] Installing playwright..." + uv add playwright + uv run playwright install chromium +fi + +# Run the capture script +uv run python "$SCRIPT_DIR/capture_html_images.py" "$@" diff --git a/run_evaluate_vllm.sh b/run_evaluate_vllm.sh new file mode 100755 index 0000000..762a672 --- /dev/null +++ b/run_evaluate_vllm.sh @@ -0,0 +1,255 @@ +#!/bin/bash +# +# vLLM 서버를 사용한 Table QA 평가 스크립트 +# +# 사전 요구사항: +# 1. vLLM 서버가 실행 중이어야 합니다 +# 2. 
HTML 파일들이 이미지로 캡처되어 있어야 합니다 (./run_capture_html.sh 실행) +# +# Usage: +# ./run_evaluate_vllm.sh [OPTIONS] +# +# Examples: +# # 모든 도메인 평가 +# ./run_evaluate_vllm.sh --all-domains +# +# # 단일 도메인 평가 +# ./run_evaluate_vllm.sh --domain public +# +# # 커스텀 vLLM URL +# ./run_evaluate_vllm.sh --domain public --vllm-url http://gpu-server:8000/v1 +# +# # 특정 모델 사용 +# ./run_evaluate_vllm.sh --domain business --model Qwen/Qwen2.5-VL-7B-Instruct +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +# 기본 설정 +VLLM_URL="${VLLM_URL:-http://localhost:8000/v1}" +MODEL="${MODEL:-default}" +OUTPUT_DIR="${OUTPUT_DIR:-eval_results}" + +# 색상 정의 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +echo_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +echo_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 헬프 메시지 +show_help() { + cat << EOF +Usage: $0 [OPTIONS] + +vLLM 서버를 사용한 Table QA 평가 + +Options: + --domain DOMAIN 평가할 도메인 (academic, business, finance, medical, public) + --all-domains 모든 도메인 평가 + --vllm-url URL vLLM 서버 URL (default: $VLLM_URL) + --model MODEL 사용할 모델 이름 (default: $MODEL) + --output-dir DIR 결과 저장 디렉토리 (default: $OUTPUT_DIR) + --use-judge LLM-as-Judge 평가 사용 + --judge-model MODEL Judge 모델 (default: gpt-4o-mini) + --limit N 평가할 최대 샘플 수 (디버깅용) + --qa-types TYPES 특정 QA 타입만 평가 (예: "lookup compare") + --capture-html 평가 전 HTML을 이미지로 캡처 + -h, --help 이 도움말 표시 + +Environment Variables: + VLLM_URL vLLM 서버 URL + MODEL 사용할 모델 이름 + OUTPUT_DIR 결과 저장 디렉토리 + OPENAI_API_KEY LLM-as-Judge 사용 시 OpenAI API 키 + +Examples: + # 모든 도메인 평가 + $0 --all-domains + + # public 도메인만 평가 + $0 --domain public + + # LLM-as-Judge 포함 평가 + $0 --domain finance --use-judge --judge-model gpt-4o + + # 10개 샘플로 빠른 테스트 + $0 --domain public --limit 10 +EOF +} + +# vLLM 서버 연결 확인 +check_vllm_connection() { + local url=$1 + echo_info "vLLM 서버 연결 확인: $url" + + # /v1/models 엔드포인트로 연결 테스트 + if curl -s 
--connect-timeout 5 "$url/models" > /dev/null 2>&1; then + echo_info "vLLM 서버 연결 성공" + return 0 + else + echo_error "vLLM 서버에 연결할 수 없습니다: $url" + echo_error "vLLM 서버가 실행 중인지 확인하세요." + return 1 + fi +} + +# Check that every existing output_<domain>/ directory has a non-empty images/ subdirectory; only warns, never fails +check_images() { + local domains=("academic" "business" "finance" "medical" "public") + local missing=0 + + for domain in "${domains[@]}"; do + local output_dir="output_${domain}" + local images_dir="${output_dir}/images" + + if [[ -d "$output_dir" ]]; then + if [[ ! -d "$images_dir" ]] || [[ -z "$(ls -A "$images_dir" 2>/dev/null)" ]]; then + echo_warn "$domain: 이미지 디렉토리가 비어있거나 없습니다 ($images_dir)" + missing=$((missing + 1)) # plain assignment: ((missing++)) returns status 1 when missing is 0, which aborts the script under set -e + fi + fi + done + + if [[ $missing -gt 0 ]]; then + echo_warn "일부 도메인에 이미지가 없습니다. HTML 캡처가 필요할 수 있습니다." + echo_warn "실행: ./run_capture_html.sh" + fi +} + +# Argument parsing +DOMAIN="" +ALL_DOMAINS=false +CAPTURE_HTML=false +USE_JUDGE=false +JUDGE_MODEL="" +LIMIT="" +QA_TYPES="" + +while [[ $# -gt 0 ]]; do + case $1 in + --domain) + DOMAIN="$2" + shift 2 + ;; + --all-domains) + ALL_DOMAINS=true + shift + ;; + --vllm-url) + VLLM_URL="$2" + shift 2 + ;; + --model) + MODEL="$2" + shift 2 + ;; + --output-dir) + OUTPUT_DIR="$2" + shift 2 + ;; + --use-judge) + USE_JUDGE=true + shift + ;; + --judge-model) + JUDGE_MODEL="$2" + shift 2 + ;; + --limit) + LIMIT="$2" + shift 2 + ;; + --qa-types) + QA_TYPES="$2" + shift 2 + ;; + --capture-html) + CAPTURE_HTML=true + shift + ;; + -h|--help) + show_help + exit 0 + ;; + *) + echo_error "알 수 없는 옵션: $1" + show_help + exit 1 + ;; + esac +done + +# Argument validation: one of --domain / --all-domains is required +if [[ -z "$DOMAIN" ]] && [[ "$ALL_DOMAINS" != true ]]; then + echo_error "--domain 또는 --all-domains를 지정해야 합니다." + show_help + exit 1 +fi + +# HTML capture (optional) +if [[ "$CAPTURE_HTML" == true ]]; then + echo_info "HTML 파일을 이미지로 캡처합니다..." + if [[ -f "./run_capture_html.sh" ]]; then + ./run_capture_html.sh + else + echo_warn "run_capture_html.sh를 찾을 수 없습니다. 스킵합니다."
+ fi +fi + +# Check the vLLM connection +check_vllm_connection "$VLLM_URL" || exit 1 + +# Check the captured images +check_images + +# Build the evaluation command as an array so every value stays a single argument and nothing is re-parsed by the shell +CMD=(uv run python -m eval.evaluate_vllm) +CMD+=(--vllm-url "$VLLM_URL") +CMD+=(--model "$MODEL") +CMD+=(--output-dir "$OUTPUT_DIR") + +if [[ "$ALL_DOMAINS" == true ]]; then + CMD+=(--all-domains) +elif [[ -n "$DOMAIN" ]]; then + CMD+=(--domain "$DOMAIN") +fi + +if [[ "$USE_JUDGE" == true ]]; then + CMD+=(--use-judge) + if [[ -n "$JUDGE_MODEL" ]]; then + CMD+=(--judge-model "$JUDGE_MODEL") + fi +fi + +if [[ -n "$LIMIT" ]]; then + CMD+=(--limit "$LIMIT") +fi + +if [[ -n "$QA_TYPES" ]]; then + CMD+=(--qa-types $QA_TYPES) # intentionally unquoted: a value such as "lookup compare" must split into separate type arguments +fi + +# Run the evaluation +echo_info "평가 시작..." +echo_info "Command: ${CMD[*]}" +echo "" + +"${CMD[@]}" + +echo "" +echo_info "평가 완료. 결과: $OUTPUT_DIR/" diff --git a/run_filter_qa.sh b/run_filter_qa.sh new file mode 100755 index 0000000..8ed106b --- /dev/null +++ b/run_filter_qa.sh @@ -0,0 +1,137 @@ +#!/bin/bash + +# ============================================================================== +# TableMagnifier - QA Difficulty Filtering +# ============================================================================== +# +# Measures QA difficulty with a vLLM server and filters the items that need human review. +# Questions the model answers too easily (90%+) are excluded, +# and only QA of moderate difficulty (30-60%) is extracted into the review list. +# +# Usage: +# ./run_filter_qa.sh [OPTIONS] +# +# Examples: +# ./run_filter_qa.sh --domain business +# ./run_filter_qa.sh --all --trials 5 +# ./run_filter_qa.sh --domain business --vllm-url http://gpu-server:8000/v1 +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +# Colors +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' + +echo_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +echo_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +echo_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +show_help() { + cat << EOF +Usage: $0 [OPTIONS] + +vLLM 서버를 사용하여 QA 난이도를 측정하고 검수 대상을 필터링합니다. + +Options: + --domain DOMAIN [...]
필터링할 도메인(들) (business, finance, academic, medical, public) + --all 모든 도메인 필터링 + --vllm-url URL vLLM 서버 URL (default: http://localhost:8000/v1) + --model MODEL 모델 이름 (미지정시 자동 감지) + --trials N 각 QA당 시도 횟수 (default: 10) + --min-acc FLOAT 최소 정확도 (default: 0.3) + --max-acc FLOAT 최대 정확도 (default: 0.6) + --limit N 처리할 최대 entry 수 (테스트용) + --dry-run 실제 추론 없이 확인만 + -h, --help 도움말 표시 + +Difficulty Categories: + - too_easy: 90-100% (제외 - 모델이 다 맞춤) + - easy: 70-89% + - medium: 30-69% (검수 대상 ✓) + - hard: 1-29% + - very_hard: 0% + +Examples: + # business 도메인 필터링 + $0 --domain business + + # 빠른 테스트 (5회 시도, 2개 entry만) + $0 --domain business --trials 5 --limit 2 + + # 외부 vLLM 서버 사용 + $0 --domain business --vllm-url http://gpu-server:8000/v1 + +Output: + - qa_difficulty_analysis_*.json: 전체 분석 결과 + - qa_for_review_*.json: 검수용 필터링된 QA 리스트 +EOF +} + +# Check for help +for arg in "$@"; do + if [[ "$arg" == "-h" ]] || [[ "$arg" == "--help" ]]; then + show_help + exit 0 + fi +done + +# Check for required arguments +if [[ $# -eq 0 ]]; then + show_help + exit 1 +fi + +# Parse vllm-url for connection check +VLLM_URL="http://localhost:8000/v1" +for i in $(seq 1 $#); do + arg="${!i}" + if [[ "$arg" == "--vllm-url" ]]; then + next=$((i + 1)) + VLLM_URL="${!next}" + break + fi +done + +echo "==============================================" +echo " TableMagnifier - QA Difficulty Filtering" +echo "==============================================" +echo "" + +# Check vLLM connection +echo_info "Checking vLLM server connection..." +if curl -s --connect-timeout 5 "${VLLM_URL}/models" > /dev/null 2>&1; then + MODEL_INFO=$(curl -s "${VLLM_URL}/models" | python3 -c "import sys, json; data = json.load(sys.stdin); print(data['data'][0]['id'] if data.get('data') else 'unknown')" 2>/dev/null || echo "unknown") + echo_info "vLLM server connected. Model: ${MODEL_INFO}" +else + echo_error "Cannot connect to vLLM server at ${VLLM_URL}" + echo_error "Please ensure vLLM server is running." 
+ exit 1 +fi + +echo "" +echo_info "Starting QA difficulty filtering..." +echo "" + +# Run the filter script +uv run python filter_qa_by_difficulty.py "$@" + +echo "" +echo_info "Filtering completed!" +echo "" +echo "Generated files:" +echo " - qa_difficulty_analysis_*.json: Full analysis results" +echo " - qa_for_review_*.json: Filtered QA for human review" diff --git a/run_openai_public.sh b/run_openai_public.sh deleted file mode 100644 index d90e1f6..0000000 --- a/run_openai_public.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash - -# ============================================================================== -# TableMagnifier - JSON Pipeline (Public Domain) -# ============================================================================== - -# Default Configuration -INPUT_JSON="test_business.json" -OUTPUT_DIR="output_business" -DEFAULT_ARGS="--provider claude --model claude-sonnet-4-5 --domain business" - -# Check if the first argument is a JSON file path -if [[ "$1" == *.json ]]; then - INPUT_JSON="$1" - shift -fi - -echo "==============================================" -echo " TableMagnifier - JSON Pipeline (Public)" -echo "==============================================" -echo "Input JSON: $INPUT_JSON" -echo "Output Dir: $OUTPUT_DIR" -echo "Provider: claude" -echo "Model: claude-sonnet-4-5" -echo "Domain: business" -echo "" -echo "💡 Tip: To upload to Notion during pipeline execution:" -echo " Add --upload-to-notion flag to the command" -echo "" -echo "💡 To upload existing results later:" -echo " python upload_to_notion_from_json.py $OUTPUT_DIR" -echo "" - -# Check for ANTHROPIC_API_KEY -if [[ -z "$ANTHROPIC_API_KEY" ]]; then - echo "⚠️ Warning: ANTHROPIC_API_KEY is not set." - echo " Please set it in your environment or .env file." 
- echo "" -fi - -# Run the pipeline -# Note: "$@" appends any remaining arguments, allowing overrides of defaults -uv run python run_pipeline_json.py --input "$INPUT_JSON" --output-dir "$OUTPUT_DIR" $DEFAULT_ARGS "$@" \ No newline at end of file diff --git a/run_pipeline.sh b/run_pipeline.sh new file mode 100644 index 0000000..460929c --- /dev/null +++ b/run_pipeline.sh @@ -0,0 +1,171 @@ +#!/bin/bash + +# ============================================================================== +# TableMagnifier - JSON Pipeline +# ============================================================================== +# +# Usage: +# ./run_pipeline.sh [INPUT_JSON] [OPTIONS] +# +# Examples: +# ./run_pipeline.sh test_public.json --domain public +# ./run_pipeline.sh test_business.json --domain business --provider openai +# ./run_pipeline.sh --input data.json --output-dir output_custom +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +# Default Configuration +INPUT_JSON="" +OUTPUT_DIR="" +PROVIDER="claude" +MODEL="claude-sonnet-4-5" +DOMAIN="public" + +# Colors +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +show_help() { + cat << EOF +Usage: $0 [INPUT_JSON] [OPTIONS] + +TableMagnifier JSON Pipeline Runner + +Arguments: + INPUT_JSON Input JSON file path (optional, can use --input instead) + +Options: + --input FILE Input JSON file + --output-dir DIR Output directory (default: output_{domain}) + --provider PROVIDER LLM provider: claude, openai, gemini (default: claude) + --model MODEL Model name (default: claude-sonnet-4-5) + --domain DOMAIN Domain: public, business, finance, medical, academic (default: public) + --qa-only Generate QA only (skip table generation) + --skip-qa Skip QA generation (table only) + --upload-to-notion Upload results to Notion + -h, --help Show this help + +Examples: + # Public domain with Claude + $0 test_public.json --domain public + + # Business domain with OpenAI + $0 test_business.json --domain business 
--provider openai --model gpt-4o + + # Finance domain, QA only mode + $0 test_finance.json --domain finance --qa-only + + # Custom output directory + $0 data.json --output-dir my_output --domain medical +EOF +} + +# Parse first argument as JSON file if it ends with .json +if [[ "$1" == *.json ]]; then + INPUT_JSON="$1" + shift +fi + +# Parse remaining arguments +EXTRA_ARGS="" +while [[ $# -gt 0 ]]; do + case $1 in + --input) + INPUT_JSON="$2" + shift 2 + ;; + --output-dir) + OUTPUT_DIR="$2" + shift 2 + ;; + --provider) + PROVIDER="$2" + shift 2 + ;; + --model) + MODEL="$2" + shift 2 + ;; + --domain) + DOMAIN="$2" + shift 2 + ;; + -h|--help) + show_help + exit 0 + ;; + *) + EXTRA_ARGS="$EXTRA_ARGS $1" + shift + ;; + esac +done + +# Set default output directory based on domain +if [[ -z "$OUTPUT_DIR" ]]; then + OUTPUT_DIR="output_${DOMAIN}" +fi + +# Validate input +if [[ -z "$INPUT_JSON" ]]; then + echo -e "${YELLOW}[WARN]${NC} No input JSON specified." + show_help + exit 1 +fi + +if [[ ! -f "$INPUT_JSON" ]]; then + echo -e "${YELLOW}[ERROR]${NC} Input file not found: $INPUT_JSON" + exit 1 +fi + +echo "==============================================" +echo " TableMagnifier - JSON Pipeline" +echo "==============================================" +echo "Input JSON: $INPUT_JSON" +echo "Output Dir: $OUTPUT_DIR" +echo "Provider: $PROVIDER" +echo "Model: $MODEL" +echo "Domain: $DOMAIN" +echo "" + +# Check API keys based on provider +case $PROVIDER in + claude|anthropic) + if [[ -z "$ANTHROPIC_API_KEY" ]]; then + echo -e "${YELLOW}[WARN]${NC} ANTHROPIC_API_KEY is not set." + fi + ;; + openai) + if [[ -z "$OPENAI_API_KEY" ]]; then + echo -e "${YELLOW}[WARN]${NC} OPENAI_API_KEY is not set." + fi + ;; + gemini|google) + if [[ -z "$GOOGLE_API_KEY" ]]; then + echo -e "${YELLOW}[WARN]${NC} GOOGLE_API_KEY is not set." + fi + ;; +esac + +echo -e "${GREEN}[INFO]${NC} Starting pipeline..." 
+echo "" + +# Run the pipeline +uv run python run_pipeline_json.py \ + --input "$INPUT_JSON" \ + --output-dir "$OUTPUT_DIR" \ + --provider "$PROVIDER" \ + --model "$MODEL" \ + --domain "$DOMAIN" \ + $EXTRA_ARGS + +echo "" +echo -e "${GREEN}[INFO]${NC} Pipeline completed. Results saved to: $OUTPUT_DIR/" +echo "" +echo "To upload results to Notion:" +echo " python upload_to_notion_from_json.py $OUTPUT_DIR" diff --git a/run_pipeline_json.py b/run_pipeline_json.py index d8fef9c..b282e46 100644 --- a/run_pipeline_json.py +++ b/run_pipeline_json.py @@ -576,6 +576,7 @@ def main(): parser.add_argument("--max-workers", type=int, default=3, help="Maximum number of parallel workers (default: 3)") parser.add_argument("--randomize-style", action="store_true", default=True, help="Randomize HTML table styles (fonts, colors) for diversity (default: True)") parser.add_argument("--no-randomize-style", dest="randomize_style", action="store_false", help="Disable style randomization") + parser.add_argument("--limit", type=int, help="Limit number of entries to process (for testing)") args = parser.parse_args() @@ -596,6 +597,11 @@ def main(): print("Error: Input JSON must be a list of pairs.") return + # Apply limit if specified + if args.limit: + input_data = input_data[:args.limit] + print(f"Limited to {len(input_data)} entries") + data_root = Path(args.data_root) output_dir = Path(args.output_dir) diff --git a/run_regenerate_qa.sh b/run_regenerate_qa.sh new file mode 100755 index 0000000..58aa98d --- /dev/null +++ b/run_regenerate_qa.sh @@ -0,0 +1,130 @@ +#!/bin/bash + +# ============================================================================== +# TableMagnifier - QA Regeneration Script +# ============================================================================== +# +# 기존 synthetic 테이블에서 QA를 재생성합니다. +# output_public은 제외됩니다. 
+# +# Usage: +# ./run_regenerate_qa.sh [OPTIONS] +# +# Examples: +# ./run_regenerate_qa.sh --all # all domains +# ./run_regenerate_qa.sh --domain business # a specific domain +# ./run_regenerate_qa.sh --domain business finance # multiple domains +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +# Colors +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' + +echo_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +echo_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +echo_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +show_help() { + cat << EOF +Usage: $0 [OPTIONS] + +기존 synthetic 테이블에서 QA를 재생성합니다. + +Options: + --domain DOMAIN [DOMAIN ...] 재생성할 도메인(들) (business, finance, academic, medical) + --all 모든 도메인 재생성 (output_public 제외) + --provider PROVIDER LLM 제공자 (claude, openai, gemini) (default: claude) + --model MODEL 모델 이름 (default: claude-sonnet-4-5) + --no-long-sequence long_sequence QA 생성 스킵 + --limit N 처리할 최대 entry 수 (테스트용) + --dry-run 실제 재생성 없이 확인만 + -h, --help 도움말 표시 + +Examples: + # 모든 도메인 재생성 + $0 --all + + # business 도메인만 + $0 --domain business + + # OpenAI 사용 + $0 --domain business --provider openai --model gpt-4o + + # 테스트 (5개만) + $0 --domain business --limit 5 + + # Dry run + $0 --all --dry-run +EOF +} + +# Check for required arguments +if [[ $# -eq 0 ]]; then + show_help + exit 1 +fi + +# Check for help flag +for arg in "$@"; do + if [[ "$arg" == "-h" ]] || [[ "$arg" == "--help" ]]; then + show_help + exit 0 + fi +done + +# Warn (without failing) when the API key environment variable for the given provider is empty +check_api_keys() { + local provider="$1" + case $provider in + claude|anthropic) + if [[ -z "$ANTHROPIC_API_KEY" ]]; then + echo_warn "ANTHROPIC_API_KEY is not set" + fi + ;; + openai) + if [[ -z "$OPENAI_API_KEY" ]]; then + echo_warn "OPENAI_API_KEY is not set" + fi + ;; + gemini|google) + if [[ -z "$GOOGLE_API_KEY" ]]; then + echo_warn "GOOGLE_API_KEY is not set" + fi + ;; + esac +} + +# Parse provider from arguments by walking positional indices 1..$# ("${!@}" is indirect expansion through $@, not an index list) +PROVIDER="claude" +for i in $(seq 1 $#); do + if [[ "${!i}"
== "--provider" ]]; then + next=$((i + 1)) + PROVIDER="${!next}" + break + fi +done + +check_api_keys "$PROVIDER" + +echo "==============================================" +echo " TableMagnifier - QA Regeneration" +echo "==============================================" +echo "" + +# Run the regeneration script +uv run python regenerate_qa.py "$@" diff --git a/single_image_json_list/single_table_academic_input.json b/single_image_json_list/single_table_academic_input.json new file mode 100644 index 0000000..483318f --- /dev/null +++ b/single_image_json_list/single_table_academic_input.json @@ -0,0 +1,1602 @@ +[ + { + "index": 0, + "pair_id": "A_origin_0_A_table_0", + "image_paths": [ + "data/Academic/Table/A_origin_0/A_table_0.png" + ], + "domain": "Academic", + "origin": "A_origin_0", + "table_id": "A_table_0" + }, + { + "index": 1, + "pair_id": "A_origin_0_A_table_1", + "image_paths": [ + "data/Academic/Table/A_origin_0/A_table_1.png" + ], + "domain": "Academic", + "origin": "A_origin_0", + "table_id": "A_table_1" + }, + { + "index": 2, + "pair_id": "A_origin_0_A_table_2", + "image_paths": [ + "data/Academic/Table/A_origin_0/A_table_2.png" + ], + "domain": "Academic", + "origin": "A_origin_0", + "table_id": "A_table_2" + }, + { + "index": 3, + "pair_id": "A_origin_0_A_table_3", + "image_paths": [ + "data/Academic/Table/A_origin_0/A_table_3.png" + ], + "domain": "Academic", + "origin": "A_origin_0", + "table_id": "A_table_3" + }, + { + "index": 4, + "pair_id": "A_origin_0_A_table_4", + "image_paths": [ + "data/Academic/Table/A_origin_0/A_table_4.png" + ], + "domain": "Academic", + "origin": "A_origin_0", + "table_id": "A_table_4" + }, + { + "index": 5, + "pair_id": "A_origin_1_A_table_5_0", + "image_paths": [ + "data/Academic/Table/A_origin_1/A_table_5_0.png" + ], + "domain": "Academic", + "origin": "A_origin_1", + "table_id": "A_table_5_0" + }, + { + "index": 6, + "pair_id": "A_origin_1_A_table_5_1", + "image_paths": [ + 
"data/Academic/Table/A_origin_1/A_table_5_1.png" + ], + "domain": "Academic", + "origin": "A_origin_1", + "table_id": "A_table_5_1" + }, + { + "index": 7, + "pair_id": "A_origin_1_A_table_6_0", + "image_paths": [ + "data/Academic/Table/A_origin_1/A_table_6_0.png" + ], + "domain": "Academic", + "origin": "A_origin_1", + "table_id": "A_table_6_0" + }, + { + "index": 8, + "pair_id": "A_origin_1_A_table_6_1", + "image_paths": [ + "data/Academic/Table/A_origin_1/A_table_6_1.png" + ], + "domain": "Academic", + "origin": "A_origin_1", + "table_id": "A_table_6_1" + }, + { + "index": 9, + "pair_id": "A_origin_1_A_table_7_0", + "image_paths": [ + "data/Academic/Table/A_origin_1/A_table_7_0.png" + ], + "domain": "Academic", + "origin": "A_origin_1", + "table_id": "A_table_7_0" + }, + { + "index": 10, + "pair_id": "A_origin_1_A_table_7_1", + "image_paths": [ + "data/Academic/Table/A_origin_1/A_table_7_1.png" + ], + "domain": "Academic", + "origin": "A_origin_1", + "table_id": "A_table_7_1" + }, + { + "index": 11, + "pair_id": "A_origin_10_A_table_28", + "image_paths": [ + "data/Academic/Table/A_origin_10/A_table_28.png" + ], + "domain": "Academic", + "origin": "A_origin_10", + "table_id": "A_table_28" + }, + { + "index": 12, + "pair_id": "A_origin_10_A_table_29", + "image_paths": [ + "data/Academic/Table/A_origin_10/A_table_29.png" + ], + "domain": "Academic", + "origin": "A_origin_10", + "table_id": "A_table_29" + }, + { + "index": 13, + "pair_id": "A_origin_11_A_table_30", + "image_paths": [ + "data/Academic/Table/A_origin_11/A_table_30.png" + ], + "domain": "Academic", + "origin": "A_origin_11", + "table_id": "A_table_30" + }, + { + "index": 14, + "pair_id": "A_origin_11_A_table_31", + "image_paths": [ + "data/Academic/Table/A_origin_11/A_table_31.png" + ], + "domain": "Academic", + "origin": "A_origin_11", + "table_id": "A_table_31" + }, + { + "index": 15, + "pair_id": "A_origin_12_A_table_32", + "image_paths": [ + "data/Academic/Table/A_origin_12/A_table_32.png" + ], + 
"domain": "Academic", + "origin": "A_origin_12", + "table_id": "A_table_32" + }, + { + "index": 16, + "pair_id": "A_origin_12_A_table_33", + "image_paths": [ + "data/Academic/Table/A_origin_12/A_table_33.png" + ], + "domain": "Academic", + "origin": "A_origin_12", + "table_id": "A_table_33" + }, + { + "index": 17, + "pair_id": "A_origin_12_A_table_34", + "image_paths": [ + "data/Academic/Table/A_origin_12/A_table_34.png" + ], + "domain": "Academic", + "origin": "A_origin_12", + "table_id": "A_table_34" + }, + { + "index": 18, + "pair_id": "A_origin_13_A_table_35", + "image_paths": [ + "data/Academic/Table/A_origin_13/A_table_35.png" + ], + "domain": "Academic", + "origin": "A_origin_13", + "table_id": "A_table_35" + }, + { + "index": 19, + "pair_id": "A_origin_13_A_table_36", + "image_paths": [ + "data/Academic/Table/A_origin_13/A_table_36.png" + ], + "domain": "Academic", + "origin": "A_origin_13", + "table_id": "A_table_36" + }, + { + "index": 20, + "pair_id": "A_origin_13_A_table_37", + "image_paths": [ + "data/Academic/Table/A_origin_13/A_table_37.png" + ], + "domain": "Academic", + "origin": "A_origin_13", + "table_id": "A_table_37" + }, + { + "index": 21, + "pair_id": "A_origin_13_A_table_38", + "image_paths": [ + "data/Academic/Table/A_origin_13/A_table_38.png" + ], + "domain": "Academic", + "origin": "A_origin_13", + "table_id": "A_table_38" + }, + { + "index": 22, + "pair_id": "A_origin_14_A_table_39", + "image_paths": [ + "data/Academic/Table/A_origin_14/A_table_39.png" + ], + "domain": "Academic", + "origin": "A_origin_14", + "table_id": "A_table_39" + }, + { + "index": 23, + "pair_id": "A_origin_14_A_table_40", + "image_paths": [ + "data/Academic/Table/A_origin_14/A_table_40.png" + ], + "domain": "Academic", + "origin": "A_origin_14", + "table_id": "A_table_40" + }, + { + "index": 24, + "pair_id": "A_origin_15_A_table_41", + "image_paths": [ + "data/Academic/Table/A_origin_15/A_table_41.png" + ], + "domain": "Academic", + "origin": "A_origin_15", + 
"table_id": "A_table_41" + }, + { + "index": 25, + "pair_id": "A_origin_16_A_table_42", + "image_paths": [ + "data/Academic/Table/A_origin_16/A_table_42.png" + ], + "domain": "Academic", + "origin": "A_origin_16", + "table_id": "A_table_42" + }, + { + "index": 26, + "pair_id": "A_origin_16_A_table_43", + "image_paths": [ + "data/Academic/Table/A_origin_16/A_table_43.png" + ], + "domain": "Academic", + "origin": "A_origin_16", + "table_id": "A_table_43" + }, + { + "index": 27, + "pair_id": "A_origin_17_A_table_44", + "image_paths": [ + "data/Academic/Table/A_origin_17/A_table_44.png" + ], + "domain": "Academic", + "origin": "A_origin_17", + "table_id": "A_table_44" + }, + { + "index": 28, + "pair_id": "A_origin_17_A_table_45", + "image_paths": [ + "data/Academic/Table/A_origin_17/A_table_45.png" + ], + "domain": "Academic", + "origin": "A_origin_17", + "table_id": "A_table_45" + }, + { + "index": 29, + "pair_id": "A_origin_17_A_table_46", + "image_paths": [ + "data/Academic/Table/A_origin_17/A_table_46.png" + ], + "domain": "Academic", + "origin": "A_origin_17", + "table_id": "A_table_46" + }, + { + "index": 30, + "pair_id": "A_origin_18_A_table_47", + "image_paths": [ + "data/Academic/Table/A_origin_18/A_table_47.png" + ], + "domain": "Academic", + "origin": "A_origin_18", + "table_id": "A_table_47" + }, + { + "index": 31, + "pair_id": "A_origin_18_A_table_48", + "image_paths": [ + "data/Academic/Table/A_origin_18/A_table_48.png" + ], + "domain": "Academic", + "origin": "A_origin_18", + "table_id": "A_table_48" + }, + { + "index": 32, + "pair_id": "A_origin_18_A_table_49", + "image_paths": [ + "data/Academic/Table/A_origin_18/A_table_49.png" + ], + "domain": "Academic", + "origin": "A_origin_18", + "table_id": "A_table_49" + }, + { + "index": 33, + "pair_id": "A_origin_19_A_table_50", + "image_paths": [ + "data/Academic/Table/A_origin_19/A_table_50.png" + ], + "domain": "Academic", + "origin": "A_origin_19", + "table_id": "A_table_50" + }, + { + "index": 34, + 
"pair_id": "A_origin_2_A_table_8", + "image_paths": [ + "data/Academic/Table/A_origin_2/A_table_8.png" + ], + "domain": "Academic", + "origin": "A_origin_2", + "table_id": "A_table_8" + }, + { + "index": 35, + "pair_id": "A_origin_2_A_table_9", + "image_paths": [ + "data/Academic/Table/A_origin_2/A_table_9.png" + ], + "domain": "Academic", + "origin": "A_origin_2", + "table_id": "A_table_9" + }, + { + "index": 36, + "pair_id": "A_origin_20_A_table_51", + "image_paths": [ + "data/Academic/Table/A_origin_20/A_table_51.png" + ], + "domain": "Academic", + "origin": "A_origin_20", + "table_id": "A_table_51" + }, + { + "index": 37, + "pair_id": "A_origin_20_A_table_52", + "image_paths": [ + "data/Academic/Table/A_origin_20/A_table_52.png" + ], + "domain": "Academic", + "origin": "A_origin_20", + "table_id": "A_table_52" + }, + { + "index": 38, + "pair_id": "A_origin_21_A_table_53", + "image_paths": [ + "data/Academic/Table/A_origin_21/A_table_53.png" + ], + "domain": "Academic", + "origin": "A_origin_21", + "table_id": "A_table_53" + }, + { + "index": 39, + "pair_id": "A_origin_21_A_table_54", + "image_paths": [ + "data/Academic/Table/A_origin_21/A_table_54.png" + ], + "domain": "Academic", + "origin": "A_origin_21", + "table_id": "A_table_54" + }, + { + "index": 40, + "pair_id": "A_origin_22_A_table_55", + "image_paths": [ + "data/Academic/Table/A_origin_22/A_table_55.png" + ], + "domain": "Academic", + "origin": "A_origin_22", + "table_id": "A_table_55" + }, + { + "index": 41, + "pair_id": "A_origin_23_A_table_56", + "image_paths": [ + "data/Academic/Table/A_origin_23/A_table_56.png" + ], + "domain": "Academic", + "origin": "A_origin_23", + "table_id": "A_table_56" + }, + { + "index": 42, + "pair_id": "A_origin_24_A_table_57", + "image_paths": [ + "data/Academic/Table/A_origin_24/A_table_57.png" + ], + "domain": "Academic", + "origin": "A_origin_24", + "table_id": "A_table_57" + }, + { + "index": 43, + "pair_id": "A_origin_25_A_table_58", + "image_paths": [ + 
"data/Academic/Table/A_origin_25/A_table_58.png" + ], + "domain": "Academic", + "origin": "A_origin_25", + "table_id": "A_table_58" + }, + { + "index": 44, + "pair_id": "A_origin_26_A_table_59_01", + "image_paths": [ + "data/Academic/Table/A_origin_26/A_table_59_01.png" + ], + "domain": "Academic", + "origin": "A_origin_26", + "table_id": "A_table_59_01" + }, + { + "index": 45, + "pair_id": "A_origin_26_A_table_59_02", + "image_paths": [ + "data/Academic/Table/A_origin_26/A_table_59_02.png" + ], + "domain": "Academic", + "origin": "A_origin_26", + "table_id": "A_table_59_02" + }, + { + "index": 46, + "pair_id": "A_origin_26_A_table_60", + "image_paths": [ + "data/Academic/Table/A_origin_26/A_table_60.png" + ], + "domain": "Academic", + "origin": "A_origin_26", + "table_id": "A_table_60" + }, + { + "index": 47, + "pair_id": "A_origin_26_A_table_61", + "image_paths": [ + "data/Academic/Table/A_origin_26/A_table_61.png" + ], + "domain": "Academic", + "origin": "A_origin_26", + "table_id": "A_table_61" + }, + { + "index": 48, + "pair_id": "A_origin_27_A_table_62_01", + "image_paths": [ + "data/Academic/Table/A_origin_27/A_table_62_01.png" + ], + "domain": "Academic", + "origin": "A_origin_27", + "table_id": "A_table_62_01" + }, + { + "index": 49, + "pair_id": "A_origin_27_A_table_62_02", + "image_paths": [ + "data/Academic/Table/A_origin_27/A_table_62_02.png" + ], + "domain": "Academic", + "origin": "A_origin_27", + "table_id": "A_table_62_02" + }, + { + "index": 50, + "pair_id": "A_origin_28_A_table_63_01", + "image_paths": [ + "data/Academic/Table/A_origin_28/A_table_63_01.png" + ], + "domain": "Academic", + "origin": "A_origin_28", + "table_id": "A_table_63_01" + }, + { + "index": 51, + "pair_id": "A_origin_28_A_table_63_02", + "image_paths": [ + "data/Academic/Table/A_origin_28/A_table_63_02.png" + ], + "domain": "Academic", + "origin": "A_origin_28", + "table_id": "A_table_63_02" + }, + { + "index": 52, + "pair_id": "A_origin_28_A_table_64", + "image_paths": [ + 
"data/Academic/Table/A_origin_28/A_table_64.png" + ], + "domain": "Academic", + "origin": "A_origin_28", + "table_id": "A_table_64" + }, + { + "index": 53, + "pair_id": "A_origin_29_A_table_65", + "image_paths": [ + "data/Academic/Table/A_origin_29/A_table_65.png" + ], + "domain": "Academic", + "origin": "A_origin_29", + "table_id": "A_table_65" + }, + { + "index": 54, + "pair_id": "A_origin_3_A_table_10", + "image_paths": [ + "data/Academic/Table/A_origin_3/A_table_10.png" + ], + "domain": "Academic", + "origin": "A_origin_3", + "table_id": "A_table_10" + }, + { + "index": 55, + "pair_id": "A_origin_3_A_table_11", + "image_paths": [ + "data/Academic/Table/A_origin_3/A_table_11.png" + ], + "domain": "Academic", + "origin": "A_origin_3", + "table_id": "A_table_11" + }, + { + "index": 56, + "pair_id": "A_origin_30_A_table_66_01", + "image_paths": [ + "data/Academic/Table/A_origin_30/A_table_66_01.png" + ], + "domain": "Academic", + "origin": "A_origin_30", + "table_id": "A_table_66_01" + }, + { + "index": 57, + "pair_id": "A_origin_30_A_table_66_02", + "image_paths": [ + "data/Academic/Table/A_origin_30/A_table_66_02.png" + ], + "domain": "Academic", + "origin": "A_origin_30", + "table_id": "A_table_66_02" + }, + { + "index": 58, + "pair_id": "A_origin_31_A_table_67_01", + "image_paths": [ + "data/Academic/Table/A_origin_31/A_table_67_01.png" + ], + "domain": "Academic", + "origin": "A_origin_31", + "table_id": "A_table_67_01" + }, + { + "index": 59, + "pair_id": "A_origin_31_A_table_67_02", + "image_paths": [ + "data/Academic/Table/A_origin_31/A_table_67_02.png" + ], + "domain": "Academic", + "origin": "A_origin_31", + "table_id": "A_table_67_02" + }, + { + "index": 60, + "pair_id": "A_origin_32_A_table_68_01", + "image_paths": [ + "data/Academic/Table/A_origin_32/A_table_68_01.png" + ], + "domain": "Academic", + "origin": "A_origin_32", + "table_id": "A_table_68_01" + }, + { + "index": 61, + "pair_id": "A_origin_32_A_table_68_02", + "image_paths": [ + 
"data/Academic/Table/A_origin_32/A_table_68_02.png" + ], + "domain": "Academic", + "origin": "A_origin_32", + "table_id": "A_table_68_02" + }, + { + "index": 62, + "pair_id": "A_origin_32_A_table_69_01", + "image_paths": [ + "data/Academic/Table/A_origin_32/A_table_69_01.png" + ], + "domain": "Academic", + "origin": "A_origin_32", + "table_id": "A_table_69_01" + }, + { + "index": 63, + "pair_id": "A_origin_32_A_table_69_02", + "image_paths": [ + "data/Academic/Table/A_origin_32/A_table_69_02.png" + ], + "domain": "Academic", + "origin": "A_origin_32", + "table_id": "A_table_69_02" + }, + { + "index": 64, + "pair_id": "A_origin_33_A_table_70_01", + "image_paths": [ + "data/Academic/Table/A_origin_33/A_table_70_01.png" + ], + "domain": "Academic", + "origin": "A_origin_33", + "table_id": "A_table_70_01" + }, + { + "index": 65, + "pair_id": "A_origin_33_A_table_70_02", + "image_paths": [ + "data/Academic/Table/A_origin_33/A_table_70_02.png" + ], + "domain": "Academic", + "origin": "A_origin_33", + "table_id": "A_table_70_02" + }, + { + "index": 66, + "pair_id": "A_origin_34_A_table_71_01", + "image_paths": [ + "data/Academic/Table/A_origin_34/A_table_71_01.png" + ], + "domain": "Academic", + "origin": "A_origin_34", + "table_id": "A_table_71_01" + }, + { + "index": 67, + "pair_id": "A_origin_34_A_table_71_02", + "image_paths": [ + "data/Academic/Table/A_origin_34/A_table_71_02.png" + ], + "domain": "Academic", + "origin": "A_origin_34", + "table_id": "A_table_71_02" + }, + { + "index": 68, + "pair_id": "A_origin_35_A_table_72", + "image_paths": [ + "data/Academic/Table/A_origin_35/A_table_72.png" + ], + "domain": "Academic", + "origin": "A_origin_35", + "table_id": "A_table_72" + }, + { + "index": 69, + "pair_id": "A_origin_36_A_table_73", + "image_paths": [ + "data/Academic/Table/A_origin_36/A_table_73.png" + ], + "domain": "Academic", + "origin": "A_origin_36", + "table_id": "A_table_73" + }, + { + "index": 70, + "pair_id": "A_origin_36_A_table_74", + "image_paths": 
[ + "data/Academic/Table/A_origin_36/A_table_74.png" + ], + "domain": "Academic", + "origin": "A_origin_36", + "table_id": "A_table_74" + }, + { + "index": 71, + "pair_id": "A_origin_36_A_table_75", + "image_paths": [ + "data/Academic/Table/A_origin_36/A_table_75.png" + ], + "domain": "Academic", + "origin": "A_origin_36", + "table_id": "A_table_75" + }, + { + "index": 72, + "pair_id": "A_origin_37_A_table_76_01", + "image_paths": [ + "data/Academic/Table/A_origin_37/A_table_76_01.png" + ], + "domain": "Academic", + "origin": "A_origin_37", + "table_id": "A_table_76_01" + }, + { + "index": 73, + "pair_id": "A_origin_37_A_table_76_02", + "image_paths": [ + "data/Academic/Table/A_origin_37/A_table_76_02.png" + ], + "domain": "Academic", + "origin": "A_origin_37", + "table_id": "A_table_76_02" + }, + { + "index": 74, + "pair_id": "A_origin_38_A_table_77", + "image_paths": [ + "data/Academic/Table/A_origin_38/A_table_77.png" + ], + "domain": "Academic", + "origin": "A_origin_38", + "table_id": "A_table_77" + }, + { + "index": 75, + "pair_id": "A_origin_38_A_table_78", + "image_paths": [ + "data/Academic/Table/A_origin_38/A_table_78.png" + ], + "domain": "Academic", + "origin": "A_origin_38", + "table_id": "A_table_78" + }, + { + "index": 76, + "pair_id": "A_origin_39_A_table_79", + "image_paths": [ + "data/Academic/Table/A_origin_39/A_table_79.png" + ], + "domain": "Academic", + "origin": "A_origin_39", + "table_id": "A_table_79" + }, + { + "index": 77, + "pair_id": "A_origin_39_A_table_80_01", + "image_paths": [ + "data/Academic/Table/A_origin_39/A_table_80_01.png" + ], + "domain": "Academic", + "origin": "A_origin_39", + "table_id": "A_table_80_01" + }, + { + "index": 78, + "pair_id": "A_origin_39_A_table_80_02", + "image_paths": [ + "data/Academic/Table/A_origin_39/A_table_80_02.png" + ], + "domain": "Academic", + "origin": "A_origin_39", + "table_id": "A_table_80_02" + }, + { + "index": 79, + "pair_id": "A_origin_4_A_table_12", + "image_paths": [ + 
"data/Academic/Table/A_origin_4/A_table_12.png" + ], + "domain": "Academic", + "origin": "A_origin_4", + "table_id": "A_table_12" + }, + { + "index": 80, + "pair_id": "A_origin_4_A_table_13", + "image_paths": [ + "data/Academic/Table/A_origin_4/A_table_13.png" + ], + "domain": "Academic", + "origin": "A_origin_4", + "table_id": "A_table_13" + }, + { + "index": 81, + "pair_id": "A_origin_40_A_table_81", + "image_paths": [ + "data/Academic/Table/A_origin_40/A_table_81.png" + ], + "domain": "Academic", + "origin": "A_origin_40", + "table_id": "A_table_81" + }, + { + "index": 82, + "pair_id": "A_origin_40_A_table_82", + "image_paths": [ + "data/Academic/Table/A_origin_40/A_table_82.png" + ], + "domain": "Academic", + "origin": "A_origin_40", + "table_id": "A_table_82" + }, + { + "index": 83, + "pair_id": "A_origin_40_A_table_83", + "image_paths": [ + "data/Academic/Table/A_origin_40/A_table_83.png" + ], + "domain": "Academic", + "origin": "A_origin_40", + "table_id": "A_table_83" + }, + { + "index": 84, + "pair_id": "A_origin_41_A_table_84", + "image_paths": [ + "data/Academic/Table/A_origin_41/A_table_84.png" + ], + "domain": "Academic", + "origin": "A_origin_41", + "table_id": "A_table_84" + }, + { + "index": 85, + "pair_id": "A_origin_41_A_table_85", + "image_paths": [ + "data/Academic/Table/A_origin_41/A_table_85.png" + ], + "domain": "Academic", + "origin": "A_origin_41", + "table_id": "A_table_85" + }, + { + "index": 86, + "pair_id": "A_origin_42_A_table_86", + "image_paths": [ + "data/Academic/Table/A_origin_42/A_table_86.png" + ], + "domain": "Academic", + "origin": "A_origin_42", + "table_id": "A_table_86" + }, + { + "index": 87, + "pair_id": "A_origin_42_A_table_87", + "image_paths": [ + "data/Academic/Table/A_origin_42/A_table_87.png" + ], + "domain": "Academic", + "origin": "A_origin_42", + "table_id": "A_table_87" + }, + { + "index": 88, + "pair_id": "A_origin_43_A_table_88", + "image_paths": [ + "data/Academic/Table/A_origin_43/A_table_88.png" + ], + 
"domain": "Academic", + "origin": "A_origin_43", + "table_id": "A_table_88" + }, + { + "index": 89, + "pair_id": "A_origin_43_A_table_89", + "image_paths": [ + "data/Academic/Table/A_origin_43/A_table_89.png" + ], + "domain": "Academic", + "origin": "A_origin_43", + "table_id": "A_table_89" + }, + { + "index": 90, + "pair_id": "A_origin_43_A_table_90", + "image_paths": [ + "data/Academic/Table/A_origin_43/A_table_90.png" + ], + "domain": "Academic", + "origin": "A_origin_43", + "table_id": "A_table_90" + }, + { + "index": 91, + "pair_id": "A_origin_44_A_table_91", + "image_paths": [ + "data/Academic/Table/A_origin_44/A_table_91.png" + ], + "domain": "Academic", + "origin": "A_origin_44", + "table_id": "A_table_91" + }, + { + "index": 92, + "pair_id": "A_origin_45_A_table_92", + "image_paths": [ + "data/Academic/Table/A_origin_45/A_table_92.png" + ], + "domain": "Academic", + "origin": "A_origin_45", + "table_id": "A_table_92" + }, + { + "index": 93, + "pair_id": "A_origin_45_A_table_93", + "image_paths": [ + "data/Academic/Table/A_origin_45/A_table_93.png" + ], + "domain": "Academic", + "origin": "A_origin_45", + "table_id": "A_table_93" + }, + { + "index": 94, + "pair_id": "A_origin_45_A_table_94", + "image_paths": [ + "data/Academic/Table/A_origin_45/A_table_94.png" + ], + "domain": "Academic", + "origin": "A_origin_45", + "table_id": "A_table_94" + }, + { + "index": 95, + "pair_id": "A_origin_45_A_table_95", + "image_paths": [ + "data/Academic/Table/A_origin_45/A_table_95.png" + ], + "domain": "Academic", + "origin": "A_origin_45", + "table_id": "A_table_95" + }, + { + "index": 96, + "pair_id": "A_origin_46_A_table_96", + "image_paths": [ + "data/Academic/Table/A_origin_46/A_table_96.png" + ], + "domain": "Academic", + "origin": "A_origin_46", + "table_id": "A_table_96" + }, + { + "index": 97, + "pair_id": "A_origin_47_A_table_97", + "image_paths": [ + "data/Academic/Table/A_origin_47/A_table_97.png" + ], + "domain": "Academic", + "origin": "A_origin_47", + 
"table_id": "A_table_97" + }, + { + "index": 98, + "pair_id": "A_origin_47_A_table_98", + "image_paths": [ + "data/Academic/Table/A_origin_47/A_table_98.png" + ], + "domain": "Academic", + "origin": "A_origin_47", + "table_id": "A_table_98" + }, + { + "index": 99, + "pair_id": "A_origin_48_A_table_99", + "image_paths": [ + "data/Academic/Table/A_origin_48/A_table_99.png" + ], + "domain": "Academic", + "origin": "A_origin_48", + "table_id": "A_table_99" + }, + { + "index": 100, + "pair_id": "A_origin_49_A_table_100", + "image_paths": [ + "data/Academic/Table/A_origin_49/A_table_100.png" + ], + "domain": "Academic", + "origin": "A_origin_49", + "table_id": "A_table_100" + }, + { + "index": 101, + "pair_id": "A_origin_49_A_table_101", + "image_paths": [ + "data/Academic/Table/A_origin_49/A_table_101.png" + ], + "domain": "Academic", + "origin": "A_origin_49", + "table_id": "A_table_101" + }, + { + "index": 102, + "pair_id": "A_origin_5_A_table_14", + "image_paths": [ + "data/Academic/Table/A_origin_5/A_table_14.png" + ], + "domain": "Academic", + "origin": "A_origin_5", + "table_id": "A_table_14" + }, + { + "index": 103, + "pair_id": "A_origin_5_A_table_15", + "image_paths": [ + "data/Academic/Table/A_origin_5/A_table_15.png" + ], + "domain": "Academic", + "origin": "A_origin_5", + "table_id": "A_table_15" + }, + { + "index": 104, + "pair_id": "A_origin_5_A_table_16", + "image_paths": [ + "data/Academic/Table/A_origin_5/A_table_16.png" + ], + "domain": "Academic", + "origin": "A_origin_5", + "table_id": "A_table_16" + }, + { + "index": 105, + "pair_id": "A_origin_50_A_table_102", + "image_paths": [ + "data/Academic/Table/A_origin_50/A_table_102.png" + ], + "domain": "Academic", + "origin": "A_origin_50", + "table_id": "A_table_102" + }, + { + "index": 106, + "pair_id": "A_origin_51_A_table_103", + "image_paths": [ + "data/Academic/Table/A_origin_51/A_table_103.png" + ], + "domain": "Academic", + "origin": "A_origin_51", + "table_id": "A_table_103" + }, + { + "index": 
107, + "pair_id": "A_origin_51_A_table_104", + "image_paths": [ + "data/Academic/Table/A_origin_51/A_table_104.png" + ], + "domain": "Academic", + "origin": "A_origin_51", + "table_id": "A_table_104" + }, + { + "index": 108, + "pair_id": "A_origin_52_A_table_105", + "image_paths": [ + "data/Academic/Table/A_origin_52/A_table_105.png" + ], + "domain": "Academic", + "origin": "A_origin_52", + "table_id": "A_table_105" + }, + { + "index": 109, + "pair_id": "A_origin_53_A_table_106_01", + "image_paths": [ + "data/Academic/Table/A_origin_53/A_table_106_01.png" + ], + "domain": "Academic", + "origin": "A_origin_53", + "table_id": "A_table_106_01" + }, + { + "index": 110, + "pair_id": "A_origin_53_A_table_106_02", + "image_paths": [ + "data/Academic/Table/A_origin_53/A_table_106_02.png" + ], + "domain": "Academic", + "origin": "A_origin_53", + "table_id": "A_table_106_02" + }, + { + "index": 111, + "pair_id": "A_origin_53_A_table_107_01", + "image_paths": [ + "data/Academic/Table/A_origin_53/A_table_107_01.png" + ], + "domain": "Academic", + "origin": "A_origin_53", + "table_id": "A_table_107_01" + }, + { + "index": 112, + "pair_id": "A_origin_53_A_table_107_02", + "image_paths": [ + "data/Academic/Table/A_origin_53/A_table_107_02.png" + ], + "domain": "Academic", + "origin": "A_origin_53", + "table_id": "A_table_107_02" + }, + { + "index": 113, + "pair_id": "A_origin_53_A_table_108", + "image_paths": [ + "data/Academic/Table/A_origin_53/A_table_108.png" + ], + "domain": "Academic", + "origin": "A_origin_53", + "table_id": "A_table_108" + }, + { + "index": 114, + "pair_id": "A_origin_54_A_table_109", + "image_paths": [ + "data/Academic/Table/A_origin_54/A_table_109.png" + ], + "domain": "Academic", + "origin": "A_origin_54", + "table_id": "A_table_109" + }, + { + "index": 115, + "pair_id": "A_origin_54_A_table_110", + "image_paths": [ + "data/Academic/Table/A_origin_54/A_table_110.png" + ], + "domain": "Academic", + "origin": "A_origin_54", + "table_id": "A_table_110" + 
}, + { + "index": 116, + "pair_id": "A_origin_55_A_table_111", + "image_paths": [ + "data/Academic/Table/A_origin_55/A_table_111.png" + ], + "domain": "Academic", + "origin": "A_origin_55", + "table_id": "A_table_111" + }, + { + "index": 117, + "pair_id": "A_origin_55_A_table_112", + "image_paths": [ + "data/Academic/Table/A_origin_55/A_table_112.png" + ], + "domain": "Academic", + "origin": "A_origin_55", + "table_id": "A_table_112" + }, + { + "index": 118, + "pair_id": "A_origin_56_A_table_113", + "image_paths": [ + "data/Academic/Table/A_origin_56/A_table_113.png" + ], + "domain": "Academic", + "origin": "A_origin_56", + "table_id": "A_table_113" + }, + { + "index": 119, + "pair_id": "A_origin_56_A_table_114", + "image_paths": [ + "data/Academic/Table/A_origin_56/A_table_114.png" + ], + "domain": "Academic", + "origin": "A_origin_56", + "table_id": "A_table_114" + }, + { + "index": 120, + "pair_id": "A_origin_56_A_table_115", + "image_paths": [ + "data/Academic/Table/A_origin_56/A_table_115.png" + ], + "domain": "Academic", + "origin": "A_origin_56", + "table_id": "A_table_115" + }, + { + "index": 121, + "pair_id": "A_origin_57_A_table_116", + "image_paths": [ + "data/Academic/Table/A_origin_57/A_table_116.png" + ], + "domain": "Academic", + "origin": "A_origin_57", + "table_id": "A_table_116" + }, + { + "index": 122, + "pair_id": "A_origin_57_A_table_117", + "image_paths": [ + "data/Academic/Table/A_origin_57/A_table_117.png" + ], + "domain": "Academic", + "origin": "A_origin_57", + "table_id": "A_table_117" + }, + { + "index": 123, + "pair_id": "A_origin_57_A_table_118", + "image_paths": [ + "data/Academic/Table/A_origin_57/A_table_118.png" + ], + "domain": "Academic", + "origin": "A_origin_57", + "table_id": "A_table_118" + }, + { + "index": 124, + "pair_id": "A_origin_57_A_table_119", + "image_paths": [ + "data/Academic/Table/A_origin_57/A_table_119.png" + ], + "domain": "Academic", + "origin": "A_origin_57", + "table_id": "A_table_119" + }, + { + "index": 
125, + "pair_id": "A_origin_57_A_table_120", + "image_paths": [ + "data/Academic/Table/A_origin_57/A_table_120.png" + ], + "domain": "Academic", + "origin": "A_origin_57", + "table_id": "A_table_120" + }, + { + "index": 126, + "pair_id": "A_origin_58_A_table_121", + "image_paths": [ + "data/Academic/Table/A_origin_58/A_table_121.png" + ], + "domain": "Academic", + "origin": "A_origin_58", + "table_id": "A_table_121" + }, + { + "index": 127, + "pair_id": "A_origin_58_A_table_122", + "image_paths": [ + "data/Academic/Table/A_origin_58/A_table_122.png" + ], + "domain": "Academic", + "origin": "A_origin_58", + "table_id": "A_table_122" + }, + { + "index": 128, + "pair_id": "A_origin_58_A_table_123_01", + "image_paths": [ + "data/Academic/Table/A_origin_58/A_table_123_01.png" + ], + "domain": "Academic", + "origin": "A_origin_58", + "table_id": "A_table_123_01" + }, + { + "index": 129, + "pair_id": "A_origin_58_A_table_123_02", + "image_paths": [ + "data/Academic/Table/A_origin_58/A_table_123_02.png" + ], + "domain": "Academic", + "origin": "A_origin_58", + "table_id": "A_table_123_02" + }, + { + "index": 130, + "pair_id": "A_origin_59_A_table_124_01", + "image_paths": [ + "data/Academic/Table/A_origin_59/A_table_124_01.png" + ], + "domain": "Academic", + "origin": "A_origin_59", + "table_id": "A_table_124_01" + }, + { + "index": 131, + "pair_id": "A_origin_59_A_table_124_02", + "image_paths": [ + "data/Academic/Table/A_origin_59/A_table_124_02.png" + ], + "domain": "Academic", + "origin": "A_origin_59", + "table_id": "A_table_124_02" + }, + { + "index": 132, + "pair_id": "A_origin_6_A_table_18", + "image_paths": [ + "data/Academic/Table/A_origin_6/A_table_18.png" + ], + "domain": "Academic", + "origin": "A_origin_6", + "table_id": "A_table_18" + }, + { + "index": 133, + "pair_id": "A_origin_6_A_table_19", + "image_paths": [ + "data/Academic/Table/A_origin_6/A_table_19.png" + ], + "domain": "Academic", + "origin": "A_origin_6", + "table_id": "A_table_19" + }, + { + 
"index": 134, + "pair_id": "A_origin_6_A_table_8", + "image_paths": [ + "data/Academic/Table/A_origin_6/A_table_8.png" + ], + "domain": "Academic", + "origin": "A_origin_6", + "table_id": "A_table_8" + }, + { + "index": 135, + "pair_id": "A_origin_60_A_table_125_01", + "image_paths": [ + "data/Academic/Table/A_origin_60/A_table_125_01.png" + ], + "domain": "Academic", + "origin": "A_origin_60", + "table_id": "A_table_125_01" + }, + { + "index": 136, + "pair_id": "A_origin_60_A_table_125_02", + "image_paths": [ + "data/Academic/Table/A_origin_60/A_table_125_02.png" + ], + "domain": "Academic", + "origin": "A_origin_60", + "table_id": "A_table_125_02" + }, + { + "index": 137, + "pair_id": "A_origin_61_A_table_126", + "image_paths": [ + "data/Academic/Table/A_origin_61/A_table_126.png" + ], + "domain": "Academic", + "origin": "A_origin_61", + "table_id": "A_table_126" + }, + { + "index": 138, + "pair_id": "A_origin_62_A_table_127_01", + "image_paths": [ + "data/Academic/Table/A_origin_62/A_table_127_01.png" + ], + "domain": "Academic", + "origin": "A_origin_62", + "table_id": "A_table_127_01" + }, + { + "index": 139, + "pair_id": "A_origin_62_A_table_127_02", + "image_paths": [ + "data/Academic/Table/A_origin_62/A_table_127_02.png" + ], + "domain": "Academic", + "origin": "A_origin_62", + "table_id": "A_table_127_02" + }, + { + "index": 140, + "pair_id": "A_origin_63_A_table_128", + "image_paths": [ + "data/Academic/Table/A_origin_63/A_table_128.png" + ], + "domain": "Academic", + "origin": "A_origin_63", + "table_id": "A_table_128" + }, + { + "index": 141, + "pair_id": "A_origin_63_A_table_129", + "image_paths": [ + "data/Academic/Table/A_origin_63/A_table_129.png" + ], + "domain": "Academic", + "origin": "A_origin_63", + "table_id": "A_table_129" + }, + { + "index": 142, + "pair_id": "A_origin_64_A_table_130", + "image_paths": [ + "data/Academic/Table/A_origin_64/A_table_130.png" + ], + "domain": "Academic", + "origin": "A_origin_64", + "table_id": "A_table_130" + 
}, + { + "index": 143, + "pair_id": "A_origin_64_A_table_131", + "image_paths": [ + "data/Academic/Table/A_origin_64/A_table_131.png" + ], + "domain": "Academic", + "origin": "A_origin_64", + "table_id": "A_table_131" + }, + { + "index": 144, + "pair_id": "A_origin_64_A_table_132", + "image_paths": [ + "data/Academic/Table/A_origin_64/A_table_132.png" + ], + "domain": "Academic", + "origin": "A_origin_64", + "table_id": "A_table_132" + }, + { + "index": 145, + "pair_id": "A_origin_65_A_table_133", + "image_paths": [ + "data/Academic/Table/A_origin_65/A_table_133.png" + ], + "domain": "Academic", + "origin": "A_origin_65", + "table_id": "A_table_133" + }, + { + "index": 146, + "pair_id": "A_origin_65_A_table_134", + "image_paths": [ + "data/Academic/Table/A_origin_65/A_table_134.png" + ], + "domain": "Academic", + "origin": "A_origin_65", + "table_id": "A_table_134" + }, + { + "index": 147, + "pair_id": "A_origin_66_A_table_135", + "image_paths": [ + "data/Academic/Table/A_origin_66/A_table_135.png" + ], + "domain": "Academic", + "origin": "A_origin_66", + "table_id": "A_table_135" + }, + { + "index": 148, + "pair_id": "A_origin_66_A_table_136", + "image_paths": [ + "data/Academic/Table/A_origin_66/A_table_136.png" + ], + "domain": "Academic", + "origin": "A_origin_66", + "table_id": "A_table_136" + }, + { + "index": 149, + "pair_id": "A_origin_66_A_table_137", + "image_paths": [ + "data/Academic/Table/A_origin_66/A_table_137.png" + ], + "domain": "Academic", + "origin": "A_origin_66", + "table_id": "A_table_137" + }, + { + "index": 150, + "pair_id": "A_origin_66_A_table_138", + "image_paths": [ + "data/Academic/Table/A_origin_66/A_table_138.png" + ], + "domain": "Academic", + "origin": "A_origin_66", + "table_id": "A_table_138" + }, + { + "index": 151, + "pair_id": "A_origin_67_A_table_139", + "image_paths": [ + "data/Academic/Table/A_origin_67/A_table_139.png" + ], + "domain": "Academic", + "origin": "A_origin_67", + "table_id": "A_table_139" + }, + { + "index": 
152, + "pair_id": "A_origin_7_A_table_20", + "image_paths": [ + "data/Academic/Table/A_origin_7/A_table_20.png" + ], + "domain": "Academic", + "origin": "A_origin_7", + "table_id": "A_table_20" + }, + { + "index": 153, + "pair_id": "A_origin_7_A_table_21", + "image_paths": [ + "data/Academic/Table/A_origin_7/A_table_21.png" + ], + "domain": "Academic", + "origin": "A_origin_7", + "table_id": "A_table_21" + }, + { + "index": 154, + "pair_id": "A_origin_8_A_table_22", + "image_paths": [ + "data/Academic/Table/A_origin_8/A_table_22.png" + ], + "domain": "Academic", + "origin": "A_origin_8", + "table_id": "A_table_22" + }, + { + "index": 155, + "pair_id": "A_origin_8_A_table_23", + "image_paths": [ + "data/Academic/Table/A_origin_8/A_table_23.png" + ], + "domain": "Academic", + "origin": "A_origin_8", + "table_id": "A_table_23" + }, + { + "index": 156, + "pair_id": "A_origin_8_A_table_24", + "image_paths": [ + "data/Academic/Table/A_origin_8/A_table_24.png" + ], + "domain": "Academic", + "origin": "A_origin_8", + "table_id": "A_table_24" + }, + { + "index": 157, + "pair_id": "A_origin_9_A_table_25", + "image_paths": [ + "data/Academic/Table/A_origin_9/A_table_25.png" + ], + "domain": "Academic", + "origin": "A_origin_9", + "table_id": "A_table_25" + }, + { + "index": 158, + "pair_id": "A_origin_9_A_table_26", + "image_paths": [ + "data/Academic/Table/A_origin_9/A_table_26.png" + ], + "domain": "Academic", + "origin": "A_origin_9", + "table_id": "A_table_26" + }, + { + "index": 159, + "pair_id": "A_origin_9_A_table_27", + "image_paths": [ + "data/Academic/Table/A_origin_9/A_table_27.png" + ], + "domain": "Academic", + "origin": "A_origin_9", + "table_id": "A_table_27" + } +] diff --git a/single_image_json_list/single_table_business_input.json b/single_image_json_list/single_table_business_input.json new file mode 100644 index 0000000..7309657 --- /dev/null +++ b/single_image_json_list/single_table_business_input.json @@ -0,0 +1,1492 @@ +[ + { + "index": 0, + "pair_id": 
"B_origin_0_B_table_0_0", + "image_paths": [ + "data/Business/Table/B_origin_0/B_table_0_0.png" + ], + "domain": "Business", + "origin": "B_origin_0", + "table_id": "B_table_0_0" + }, + { + "index": 1, + "pair_id": "B_origin_0_B_table_1_0", + "image_paths": [ + "data/Business/Table/B_origin_0/B_table_1_0.png" + ], + "domain": "Business", + "origin": "B_origin_0", + "table_id": "B_table_1_0" + }, + { + "index": 2, + "pair_id": "B_origin_0_B_table_2_0", + "image_paths": [ + "data/Business/Table/B_origin_0/B_table_2_0.png" + ], + "domain": "Business", + "origin": "B_origin_0", + "table_id": "B_table_2_0" + }, + { + "index": 3, + "pair_id": "B_origin_1_B_table_3_0", + "image_paths": [ + "data/Business/Table/B_origin_1/B_table_3_0.png" + ], + "domain": "Business", + "origin": "B_origin_1", + "table_id": "B_table_3_0" + }, + { + "index": 4, + "pair_id": "B_origin_1_B_table_4_0", + "image_paths": [ + "data/Business/Table/B_origin_1/B_table_4_0.png" + ], + "domain": "Business", + "origin": "B_origin_1", + "table_id": "B_table_4_0" + }, + { + "index": 5, + "pair_id": "B_origin_10_B_table_31_0", + "image_paths": [ + "data/Business/Table/B_origin_10/B_table_31_0.png" + ], + "domain": "Business", + "origin": "B_origin_10", + "table_id": "B_table_31_0" + }, + { + "index": 6, + "pair_id": "B_origin_10_B_table_32_0", + "image_paths": [ + "data/Business/Table/B_origin_10/B_table_32_0.png" + ], + "domain": "Business", + "origin": "B_origin_10", + "table_id": "B_table_32_0" + }, + { + "index": 7, + "pair_id": "B_origin_10_B_table_33_0", + "image_paths": [ + "data/Business/Table/B_origin_10/B_table_33_0.png" + ], + "domain": "Business", + "origin": "B_origin_10", + "table_id": "B_table_33_0" + }, + { + "index": 8, + "pair_id": "B_origin_10_B_table_34_0", + "image_paths": [ + "data/Business/Table/B_origin_10/B_table_34_0.png" + ], + "domain": "Business", + "origin": "B_origin_10", + "table_id": "B_table_34_0" + }, + { + "index": 9, + "pair_id": "B_origin_11_B_table_35_0", + 
"image_paths": [ + "data/Business/Table/B_origin_11/B_table_35_0.png" + ], + "domain": "Business", + "origin": "B_origin_11", + "table_id": "B_table_35_0" + }, + { + "index": 10, + "pair_id": "B_origin_11_B_table_36_0", + "image_paths": [ + "data/Business/Table/B_origin_11/B_table_36_0.png" + ], + "domain": "Business", + "origin": "B_origin_11", + "table_id": "B_table_36_0" + }, + { + "index": 11, + "pair_id": "B_origin_11_B_table_37_0", + "image_paths": [ + "data/Business/Table/B_origin_11/B_table_37_0.png" + ], + "domain": "Business", + "origin": "B_origin_11", + "table_id": "B_table_37_0" + }, + { + "index": 12, + "pair_id": "B_origin_11_B_table_38_0", + "image_paths": [ + "data/Business/Table/B_origin_11/B_table_38_0.png" + ], + "domain": "Business", + "origin": "B_origin_11", + "table_id": "B_table_38_0" + }, + { + "index": 13, + "pair_id": "B_origin_12_B_table_39_0", + "image_paths": [ + "data/Business/Table/B_origin_12/B_table_39_0.png" + ], + "domain": "Business", + "origin": "B_origin_12", + "table_id": "B_table_39_0" + }, + { + "index": 14, + "pair_id": "B_origin_12_B_table_40_0", + "image_paths": [ + "data/Business/Table/B_origin_12/B_table_40_0.png" + ], + "domain": "Business", + "origin": "B_origin_12", + "table_id": "B_table_40_0" + }, + { + "index": 15, + "pair_id": "B_origin_13_B_table_41_0", + "image_paths": [ + "data/Business/Table/B_origin_13/B_table_41_0.png" + ], + "domain": "Business", + "origin": "B_origin_13", + "table_id": "B_table_41_0" + }, + { + "index": 16, + "pair_id": "B_origin_13_B_table_42_0", + "image_paths": [ + "data/Business/Table/B_origin_13/B_table_42_0.png" + ], + "domain": "Business", + "origin": "B_origin_13", + "table_id": "B_table_42_0" + }, + { + "index": 17, + "pair_id": "B_origin_13_B_table_43_0", + "image_paths": [ + "data/Business/Table/B_origin_13/B_table_43_0.png" + ], + "domain": "Business", + "origin": "B_origin_13", + "table_id": "B_table_43_0" + }, + { + "index": 18, + "pair_id": "B_origin_13_B_table_44_0", + 
"image_paths": [ + "data/Business/Table/B_origin_13/B_table_44_0.png" + ], + "domain": "Business", + "origin": "B_origin_13", + "table_id": "B_table_44_0" + }, + { + "index": 19, + "pair_id": "B_origin_14_B_table_45_0", + "image_paths": [ + "data/Business/Table/B_origin_14/B_table_45_0.png" + ], + "domain": "Business", + "origin": "B_origin_14", + "table_id": "B_table_45_0" + }, + { + "index": 20, + "pair_id": "B_origin_14_B_table_45_1", + "image_paths": [ + "data/Business/Table/B_origin_14/B_table_45_1.png" + ], + "domain": "Business", + "origin": "B_origin_14", + "table_id": "B_table_45_1" + }, + { + "index": 21, + "pair_id": "B_origin_14_B_table_45_2", + "image_paths": [ + "data/Business/Table/B_origin_14/B_table_45_2.png" + ], + "domain": "Business", + "origin": "B_origin_14", + "table_id": "B_table_45_2" + }, + { + "index": 22, + "pair_id": "B_origin_14_B_table_46_0", + "image_paths": [ + "data/Business/Table/B_origin_14/B_table_46_0.png" + ], + "domain": "Business", + "origin": "B_origin_14", + "table_id": "B_table_46_0" + }, + { + "index": 23, + "pair_id": "B_origin_14_B_table_47_0", + "image_paths": [ + "data/Business/Table/B_origin_14/B_table_47_0.png" + ], + "domain": "Business", + "origin": "B_origin_14", + "table_id": "B_table_47_0" + }, + { + "index": 24, + "pair_id": "B_origin_14_B_table_48_0", + "image_paths": [ + "data/Business/Table/B_origin_14/B_table_48_0.png" + ], + "domain": "Business", + "origin": "B_origin_14", + "table_id": "B_table_48_0" + }, + { + "index": 25, + "pair_id": "B_origin_15_B_table_49_0", + "image_paths": [ + "data/Business/Table/B_origin_15/B_table_49_0.png" + ], + "domain": "Business", + "origin": "B_origin_15", + "table_id": "B_table_49_0" + }, + { + "index": 26, + "pair_id": "B_origin_15_B_table_50_0", + "image_paths": [ + "data/Business/Table/B_origin_15/B_table_50_0.png" + ], + "domain": "Business", + "origin": "B_origin_15", + "table_id": "B_table_50_0" + }, + { + "index": 27, + "pair_id": "B_origin_15_B_table_51_0", + 
"image_paths": [ + "data/Business/Table/B_origin_15/B_table_51_0.png" + ], + "domain": "Business", + "origin": "B_origin_15", + "table_id": "B_table_51_0" + }, + { + "index": 28, + "pair_id": "B_origin_15_B_table_51_1", + "image_paths": [ + "data/Business/Table/B_origin_15/B_table_51_1.png" + ], + "domain": "Business", + "origin": "B_origin_15", + "table_id": "B_table_51_1" + }, + { + "index": 29, + "pair_id": "B_origin_15_B_table_52_0", + "image_paths": [ + "data/Business/Table/B_origin_15/B_table_52_0.png" + ], + "domain": "Business", + "origin": "B_origin_15", + "table_id": "B_table_52_0" + }, + { + "index": 30, + "pair_id": "B_origin_15_B_table_53_0", + "image_paths": [ + "data/Business/Table/B_origin_15/B_table_53_0.png" + ], + "domain": "Business", + "origin": "B_origin_15", + "table_id": "B_table_53_0" + }, + { + "index": 31, + "pair_id": "B_origin_16_B_table_54_0", + "image_paths": [ + "data/Business/Table/B_origin_16/B_table_54_0.png" + ], + "domain": "Business", + "origin": "B_origin_16", + "table_id": "B_table_54_0" + }, + { + "index": 32, + "pair_id": "B_origin_17_B_table_55_0", + "image_paths": [ + "data/Business/Table/B_origin_17/B_table_55_0.png" + ], + "domain": "Business", + "origin": "B_origin_17", + "table_id": "B_table_55_0" + }, + { + "index": 33, + "pair_id": "B_origin_17_B_table_56_0", + "image_paths": [ + "data/Business/Table/B_origin_17/B_table_56_0.png" + ], + "domain": "Business", + "origin": "B_origin_17", + "table_id": "B_table_56_0" + }, + { + "index": 34, + "pair_id": "B_origin_17_B_table_57_0", + "image_paths": [ + "data/Business/Table/B_origin_17/B_table_57_0.png" + ], + "domain": "Business", + "origin": "B_origin_17", + "table_id": "B_table_57_0" + }, + { + "index": 35, + "pair_id": "B_origin_17_B_table_58_0", + "image_paths": [ + "data/Business/Table/B_origin_17/B_table_58_0.png" + ], + "domain": "Business", + "origin": "B_origin_17", + "table_id": "B_table_58_0" + }, + { + "index": 36, + "pair_id": "B_origin_17_B_table_59_0", + 
"image_paths": [ + "data/Business/Table/B_origin_17/B_table_59_0.png" + ], + "domain": "Business", + "origin": "B_origin_17", + "table_id": "B_table_59_0" + }, + { + "index": 37, + "pair_id": "B_origin_17_B_table_60_0", + "image_paths": [ + "data/Business/Table/B_origin_17/B_table_60_0.png" + ], + "domain": "Business", + "origin": "B_origin_17", + "table_id": "B_table_60_0" + }, + { + "index": 38, + "pair_id": "B_origin_18_B_table_61_0", + "image_paths": [ + "data/Business/Table/B_origin_18/B_table_61_0.png" + ], + "domain": "Business", + "origin": "B_origin_18", + "table_id": "B_table_61_0" + }, + { + "index": 39, + "pair_id": "B_origin_18_B_table_62_0", + "image_paths": [ + "data/Business/Table/B_origin_18/B_table_62_0.png" + ], + "domain": "Business", + "origin": "B_origin_18", + "table_id": "B_table_62_0" + }, + { + "index": 40, + "pair_id": "B_origin_18_B_table_63_0", + "image_paths": [ + "data/Business/Table/B_origin_18/B_table_63_0.png" + ], + "domain": "Business", + "origin": "B_origin_18", + "table_id": "B_table_63_0" + }, + { + "index": 41, + "pair_id": "B_origin_18_B_table_63_1", + "image_paths": [ + "data/Business/Table/B_origin_18/B_table_63_1.png" + ], + "domain": "Business", + "origin": "B_origin_18", + "table_id": "B_table_63_1" + }, + { + "index": 42, + "pair_id": "B_origin_18_B_table_64_0", + "image_paths": [ + "data/Business/Table/B_origin_18/B_table_64_0.png" + ], + "domain": "Business", + "origin": "B_origin_18", + "table_id": "B_table_64_0" + }, + { + "index": 43, + "pair_id": "B_origin_18_B_table_65_0", + "image_paths": [ + "data/Business/Table/B_origin_18/B_table_65_0.png" + ], + "domain": "Business", + "origin": "B_origin_18", + "table_id": "B_table_65_0" + }, + { + "index": 44, + "pair_id": "B_origin_19_B_table_66_0", + "image_paths": [ + "data/Business/Table/B_origin_19/B_table_66_0.png" + ], + "domain": "Business", + "origin": "B_origin_19", + "table_id": "B_table_66_0" + }, + { + "index": 45, + "pair_id": "B_origin_2_B_table_5_0", + 
"image_paths": [ + "data/Business/Table/B_origin_2/B_table_5_0.png" + ], + "domain": "Business", + "origin": "B_origin_2", + "table_id": "B_table_5_0" + }, + { + "index": 46, + "pair_id": "B_origin_2_B_table_6_0", + "image_paths": [ + "data/Business/Table/B_origin_2/B_table_6_0.png" + ], + "domain": "Business", + "origin": "B_origin_2", + "table_id": "B_table_6_0" + }, + { + "index": 47, + "pair_id": "B_origin_2_B_table_7_0", + "image_paths": [ + "data/Business/Table/B_origin_2/B_table_7_0.png" + ], + "domain": "Business", + "origin": "B_origin_2", + "table_id": "B_table_7_0" + }, + { + "index": 48, + "pair_id": "B_origin_2_B_table_8_0", + "image_paths": [ + "data/Business/Table/B_origin_2/B_table_8_0.png" + ], + "domain": "Business", + "origin": "B_origin_2", + "table_id": "B_table_8_0" + }, + { + "index": 49, + "pair_id": "B_origin_2_B_table_9_0", + "image_paths": [ + "data/Business/Table/B_origin_2/B_table_9_0.png" + ], + "domain": "Business", + "origin": "B_origin_2", + "table_id": "B_table_9_0" + }, + { + "index": 50, + "pair_id": "B_origin_20_B_table_67_0", + "image_paths": [ + "data/Business/Table/B_origin_20/B_table_67_0.png" + ], + "domain": "Business", + "origin": "B_origin_20", + "table_id": "B_table_67_0" + }, + { + "index": 51, + "pair_id": "B_origin_20_B_table_68_0", + "image_paths": [ + "data/Business/Table/B_origin_20/B_table_68_0.png" + ], + "domain": "Business", + "origin": "B_origin_20", + "table_id": "B_table_68_0" + }, + { + "index": 52, + "pair_id": "B_origin_20_B_table_69_0", + "image_paths": [ + "data/Business/Table/B_origin_20/B_table_69_0.png" + ], + "domain": "Business", + "origin": "B_origin_20", + "table_id": "B_table_69_0" + }, + { + "index": 53, + "pair_id": "B_origin_21_B_table_70_0", + "image_paths": [ + "data/Business/Table/B_origin_21/B_table_70_0.png" + ], + "domain": "Business", + "origin": "B_origin_21", + "table_id": "B_table_70_0" + }, + { + "index": 54, + "pair_id": "B_origin_21_B_table_71_0", + "image_paths": [ + 
"data/Business/Table/B_origin_21/B_table_71_0.png" + ], + "domain": "Business", + "origin": "B_origin_21", + "table_id": "B_table_71_0" + }, + { + "index": 55, + "pair_id": "B_origin_21_B_table_72_0", + "image_paths": [ + "data/Business/Table/B_origin_21/B_table_72_0.png" + ], + "domain": "Business", + "origin": "B_origin_21", + "table_id": "B_table_72_0" + }, + { + "index": 56, + "pair_id": "B_origin_21_B_table_72_1", + "image_paths": [ + "data/Business/Table/B_origin_21/B_table_72_1.png" + ], + "domain": "Business", + "origin": "B_origin_21", + "table_id": "B_table_72_1" + }, + { + "index": 57, + "pair_id": "B_origin_21_B_table_73_0", + "image_paths": [ + "data/Business/Table/B_origin_21/B_table_73_0.png" + ], + "domain": "Business", + "origin": "B_origin_21", + "table_id": "B_table_73_0" + }, + { + "index": 58, + "pair_id": "B_origin_21_B_table_74_0", + "image_paths": [ + "data/Business/Table/B_origin_21/B_table_74_0.png" + ], + "domain": "Business", + "origin": "B_origin_21", + "table_id": "B_table_74_0" + }, + { + "index": 59, + "pair_id": "B_origin_22_B_table_75_0", + "image_paths": [ + "data/Business/Table/B_origin_22/B_table_75_0.png" + ], + "domain": "Business", + "origin": "B_origin_22", + "table_id": "B_table_75_0" + }, + { + "index": 60, + "pair_id": "B_origin_22_B_table_76_0", + "image_paths": [ + "data/Business/Table/B_origin_22/B_table_76_0.png" + ], + "domain": "Business", + "origin": "B_origin_22", + "table_id": "B_table_76_0" + }, + { + "index": 61, + "pair_id": "B_origin_22_B_table_77_0", + "image_paths": [ + "data/Business/Table/B_origin_22/B_table_77_0.png" + ], + "domain": "Business", + "origin": "B_origin_22", + "table_id": "B_table_77_0" + }, + { + "index": 62, + "pair_id": "B_origin_22_B_table_78_0", + "image_paths": [ + "data/Business/Table/B_origin_22/B_table_78_0.png" + ], + "domain": "Business", + "origin": "B_origin_22", + "table_id": "B_table_78_0" + }, + { + "index": 63, + "pair_id": "B_origin_22_B_table_79_0", + "image_paths": [ + 
"data/Business/Table/B_origin_22/B_table_79_0.png" + ], + "domain": "Business", + "origin": "B_origin_22", + "table_id": "B_table_79_0" + }, + { + "index": 64, + "pair_id": "B_origin_23_B_table_79_0", + "image_paths": [ + "data/Business/Table/B_origin_23/B_table_79_0.png" + ], + "domain": "Business", + "origin": "B_origin_23", + "table_id": "B_table_79_0" + }, + { + "index": 65, + "pair_id": "B_origin_23_B_table_80_0", + "image_paths": [ + "data/Business/Table/B_origin_23/B_table_80_0.png" + ], + "domain": "Business", + "origin": "B_origin_23", + "table_id": "B_table_80_0" + }, + { + "index": 66, + "pair_id": "B_origin_23_B_table_81_0", + "image_paths": [ + "data/Business/Table/B_origin_23/B_table_81_0.png" + ], + "domain": "Business", + "origin": "B_origin_23", + "table_id": "B_table_81_0" + }, + { + "index": 67, + "pair_id": "B_origin_24_B_table_82_0", + "image_paths": [ + "data/Business/Table/B_origin_24/B_table_82_0.png" + ], + "domain": "Business", + "origin": "B_origin_24", + "table_id": "B_table_82_0" + }, + { + "index": 68, + "pair_id": "B_origin_24_B_table_83_0", + "image_paths": [ + "data/Business/Table/B_origin_24/B_table_83_0.png" + ], + "domain": "Business", + "origin": "B_origin_24", + "table_id": "B_table_83_0" + }, + { + "index": 69, + "pair_id": "B_origin_24_B_table_84_0", + "image_paths": [ + "data/Business/Table/B_origin_24/B_table_84_0.png" + ], + "domain": "Business", + "origin": "B_origin_24", + "table_id": "B_table_84_0" + }, + { + "index": 70, + "pair_id": "B_origin_25_B_table_85_0", + "image_paths": [ + "data/Business/Table/B_origin_25/B_table_85_0.png" + ], + "domain": "Business", + "origin": "B_origin_25", + "table_id": "B_table_85_0" + }, + { + "index": 71, + "pair_id": "B_origin_25_B_table_86_0", + "image_paths": [ + "data/Business/Table/B_origin_25/B_table_86_0.png" + ], + "domain": "Business", + "origin": "B_origin_25", + "table_id": "B_table_86_0" + }, + { + "index": 72, + "pair_id": "B_origin_26_B_table_87_0", + "image_paths": [ + 
"data/Business/Table/B_origin_26/B_table_87_0.png" + ], + "domain": "Business", + "origin": "B_origin_26", + "table_id": "B_table_87_0" + }, + { + "index": 73, + "pair_id": "B_origin_26_B_table_88_0", + "image_paths": [ + "data/Business/Table/B_origin_26/B_table_88_0.png" + ], + "domain": "Business", + "origin": "B_origin_26", + "table_id": "B_table_88_0" + }, + { + "index": 74, + "pair_id": "B_origin_27_B_table_89_0", + "image_paths": [ + "data/Business/Table/B_origin_27/B_table_89_0.png" + ], + "domain": "Business", + "origin": "B_origin_27", + "table_id": "B_table_89_0" + }, + { + "index": 75, + "pair_id": "B_origin_27_B_table_90_0", + "image_paths": [ + "data/Business/Table/B_origin_27/B_table_90_0.png" + ], + "domain": "Business", + "origin": "B_origin_27", + "table_id": "B_table_90_0" + }, + { + "index": 76, + "pair_id": "B_origin_27_B_table_91_0", + "image_paths": [ + "data/Business/Table/B_origin_27/B_table_91_0.png" + ], + "domain": "Business", + "origin": "B_origin_27", + "table_id": "B_table_91_0" + }, + { + "index": 77, + "pair_id": "B_origin_28_B_table_92_0", + "image_paths": [ + "data/Business/Table/B_origin_28/B_table_92_0.png" + ], + "domain": "Business", + "origin": "B_origin_28", + "table_id": "B_table_92_0" + }, + { + "index": 78, + "pair_id": "B_origin_28_B_table_93_0", + "image_paths": [ + "data/Business/Table/B_origin_28/B_table_93_0.png" + ], + "domain": "Business", + "origin": "B_origin_28", + "table_id": "B_table_93_0" + }, + { + "index": 79, + "pair_id": "B_origin_28_B_table_94_0", + "image_paths": [ + "data/Business/Table/B_origin_28/B_table_94_0.png" + ], + "domain": "Business", + "origin": "B_origin_28", + "table_id": "B_table_94_0" + }, + { + "index": 80, + "pair_id": "B_origin_28_B_table_95_0", + "image_paths": [ + "data/Business/Table/B_origin_28/B_table_95_0.png" + ], + "domain": "Business", + "origin": "B_origin_28", + "table_id": "B_table_95_0" + }, + { + "index": 81, + "pair_id": "B_origin_28_B_table_96_0", + "image_paths": [ + 
"data/Business/Table/B_origin_28/B_table_96_0.png" + ], + "domain": "Business", + "origin": "B_origin_28", + "table_id": "B_table_96_0" + }, + { + "index": 82, + "pair_id": "B_origin_28_B_table_97_0", + "image_paths": [ + "data/Business/Table/B_origin_28/B_table_97_0.png" + ], + "domain": "Business", + "origin": "B_origin_28", + "table_id": "B_table_97_0" + }, + { + "index": 83, + "pair_id": "B_origin_29_B_table_100_0", + "image_paths": [ + "data/Business/Table/B_origin_29/B_table_100_0.png" + ], + "domain": "Business", + "origin": "B_origin_29", + "table_id": "B_table_100_0" + }, + { + "index": 84, + "pair_id": "B_origin_29_B_table_98_0", + "image_paths": [ + "data/Business/Table/B_origin_29/B_table_98_0.png" + ], + "domain": "Business", + "origin": "B_origin_29", + "table_id": "B_table_98_0" + }, + { + "index": 85, + "pair_id": "B_origin_29_B_table_99_0", + "image_paths": [ + "data/Business/Table/B_origin_29/B_table_99_0.png" + ], + "domain": "Business", + "origin": "B_origin_29", + "table_id": "B_table_99_0" + }, + { + "index": 86, + "pair_id": "B_origin_3_B_table_10_0", + "image_paths": [ + "data/Business/Table/B_origin_3/B_table_10_0.png" + ], + "domain": "Business", + "origin": "B_origin_3", + "table_id": "B_table_10_0" + }, + { + "index": 87, + "pair_id": "B_origin_3_B_table_11_0", + "image_paths": [ + "data/Business/Table/B_origin_3/B_table_11_0.png" + ], + "domain": "Business", + "origin": "B_origin_3", + "table_id": "B_table_11_0" + }, + { + "index": 88, + "pair_id": "B_origin_3_B_table_12_0", + "image_paths": [ + "data/Business/Table/B_origin_3/B_table_12_0.png" + ], + "domain": "Business", + "origin": "B_origin_3", + "table_id": "B_table_12_0" + }, + { + "index": 89, + "pair_id": "B_origin_3_B_table_13_0", + "image_paths": [ + "data/Business/Table/B_origin_3/B_table_13_0.png" + ], + "domain": "Business", + "origin": "B_origin_3", + "table_id": "B_table_13_0" + }, + { + "index": 90, + "pair_id": "B_origin_30_B_table_101_0", + "image_paths": [ + 
"data/Business/Table/B_origin_30/B_table_101_0.png" + ], + "domain": "Business", + "origin": "B_origin_30", + "table_id": "B_table_101_0" + }, + { + "index": 91, + "pair_id": "B_origin_30_B_table_102_0", + "image_paths": [ + "data/Business/Table/B_origin_30/B_table_102_0.png" + ], + "domain": "Business", + "origin": "B_origin_30", + "table_id": "B_table_102_0" + }, + { + "index": 92, + "pair_id": "B_origin_30_B_table_103_0", + "image_paths": [ + "data/Business/Table/B_origin_30/B_table_103_0.png" + ], + "domain": "Business", + "origin": "B_origin_30", + "table_id": "B_table_103_0" + }, + { + "index": 93, + "pair_id": "B_origin_31_B_table_104_0", + "image_paths": [ + "data/Business/Table/B_origin_31/B_table_104_0.png" + ], + "domain": "Business", + "origin": "B_origin_31", + "table_id": "B_table_104_0" + }, + { + "index": 94, + "pair_id": "B_origin_31_B_table_105_0", + "image_paths": [ + "data/Business/Table/B_origin_31/B_table_105_0.png" + ], + "domain": "Business", + "origin": "B_origin_31", + "table_id": "B_table_105_0" + }, + { + "index": 95, + "pair_id": "B_origin_31_B_table_106_0", + "image_paths": [ + "data/Business/Table/B_origin_31/B_table_106_0.png" + ], + "domain": "Business", + "origin": "B_origin_31", + "table_id": "B_table_106_0" + }, + { + "index": 96, + "pair_id": "B_origin_31_B_table_107_0", + "image_paths": [ + "data/Business/Table/B_origin_31/B_table_107_0.png" + ], + "domain": "Business", + "origin": "B_origin_31", + "table_id": "B_table_107_0" + }, + { + "index": 97, + "pair_id": "B_origin_31_B_table_108_0", + "image_paths": [ + "data/Business/Table/B_origin_31/B_table_108_0.png" + ], + "domain": "Business", + "origin": "B_origin_31", + "table_id": "B_table_108_0" + }, + { + "index": 98, + "pair_id": "B_origin_32_B_table_109_0", + "image_paths": [ + "data/Business/Table/B_origin_32/B_table_109_0.png" + ], + "domain": "Business", + "origin": "B_origin_32", + "table_id": "B_table_109_0" + }, + { + "index": 99, + "pair_id": 
"B_origin_32_B_table_110_0", + "image_paths": [ + "data/Business/Table/B_origin_32/B_table_110_0.png" + ], + "domain": "Business", + "origin": "B_origin_32", + "table_id": "B_table_110_0" + }, + { + "index": 100, + "pair_id": "B_origin_32_B_table_111_0", + "image_paths": [ + "data/Business/Table/B_origin_32/B_table_111_0.png" + ], + "domain": "Business", + "origin": "B_origin_32", + "table_id": "B_table_111_0" + }, + { + "index": 101, + "pair_id": "B_origin_32_B_table_112_0", + "image_paths": [ + "data/Business/Table/B_origin_32/B_table_112_0.png" + ], + "domain": "Business", + "origin": "B_origin_32", + "table_id": "B_table_112_0" + }, + { + "index": 102, + "pair_id": "B_origin_33_B_table_113_0", + "image_paths": [ + "data/Business/Table/B_origin_33/B_table_113_0.png" + ], + "domain": "Business", + "origin": "B_origin_33", + "table_id": "B_table_113_0" + }, + { + "index": 103, + "pair_id": "B_origin_33_B_table_114_0", + "image_paths": [ + "data/Business/Table/B_origin_33/B_table_114_0.png" + ], + "domain": "Business", + "origin": "B_origin_33", + "table_id": "B_table_114_0" + }, + { + "index": 104, + "pair_id": "B_origin_33_B_table_115_9", + "image_paths": [ + "data/Business/Table/B_origin_33/B_table_115_9.png" + ], + "domain": "Business", + "origin": "B_origin_33", + "table_id": "B_table_115_9" + }, + { + "index": 105, + "pair_id": "B_origin_33_B_table_116_0", + "image_paths": [ + "data/Business/Table/B_origin_33/B_table_116_0.png" + ], + "domain": "Business", + "origin": "B_origin_33", + "table_id": "B_table_116_0" + }, + { + "index": 106, + "pair_id": "B_origin_33_B_table_117_0", + "image_paths": [ + "data/Business/Table/B_origin_33/B_table_117_0.png" + ], + "domain": "Business", + "origin": "B_origin_33", + "table_id": "B_table_117_0" + }, + { + "index": 107, + "pair_id": "B_origin_34_B_table_118_0", + "image_paths": [ + "data/Business/Table/B_origin_34/B_table_118_0.png" + ], + "domain": "Business", + "origin": "B_origin_34", + "table_id": "B_table_118_0" + 
}, + { + "index": 108, + "pair_id": "B_origin_34_B_table_119_0", + "image_paths": [ + "data/Business/Table/B_origin_34/B_table_119_0.png" + ], + "domain": "Business", + "origin": "B_origin_34", + "table_id": "B_table_119_0" + }, + { + "index": 109, + "pair_id": "B_origin_34_B_table_120_0", + "image_paths": [ + "data/Business/Table/B_origin_34/B_table_120_0.png" + ], + "domain": "Business", + "origin": "B_origin_34", + "table_id": "B_table_120_0" + }, + { + "index": 110, + "pair_id": "B_origin_34_B_table_121_0", + "image_paths": [ + "data/Business/Table/B_origin_34/B_table_121_0.png" + ], + "domain": "Business", + "origin": "B_origin_34", + "table_id": "B_table_121_0" + }, + { + "index": 111, + "pair_id": "B_origin_35_B_table_122_0", + "image_paths": [ + "data/Business/Table/B_origin_35/B_table_122_0.png" + ], + "domain": "Business", + "origin": "B_origin_35", + "table_id": "B_table_122_0" + }, + { + "index": 112, + "pair_id": "B_origin_35_B_table_123_0", + "image_paths": [ + "data/Business/Table/B_origin_35/B_table_123_0.png" + ], + "domain": "Business", + "origin": "B_origin_35", + "table_id": "B_table_123_0" + }, + { + "index": 113, + "pair_id": "B_origin_35_B_table_124_0", + "image_paths": [ + "data/Business/Table/B_origin_35/B_table_124_0.png" + ], + "domain": "Business", + "origin": "B_origin_35", + "table_id": "B_table_124_0" + }, + { + "index": 114, + "pair_id": "B_origin_35_B_table_125_0", + "image_paths": [ + "data/Business/Table/B_origin_35/B_table_125_0.png" + ], + "domain": "Business", + "origin": "B_origin_35", + "table_id": "B_table_125_0" + }, + { + "index": 115, + "pair_id": "B_origin_35_B_table_126_0", + "image_paths": [ + "data/Business/Table/B_origin_35/B_table_126_0.png" + ], + "domain": "Business", + "origin": "B_origin_35", + "table_id": "B_table_126_0" + }, + { + "index": 116, + "pair_id": "B_origin_36_B_table_127_0", + "image_paths": [ + "data/Business/Table/B_origin_36/B_table_127_0.png" + ], + "domain": "Business", + "origin": 
"B_origin_36", + "table_id": "B_table_127_0" + }, + { + "index": 117, + "pair_id": "B_origin_36_B_table_128_0", + "image_paths": [ + "data/Business/Table/B_origin_36/B_table_128_0.png" + ], + "domain": "Business", + "origin": "B_origin_36", + "table_id": "B_table_128_0" + }, + { + "index": 118, + "pair_id": "B_origin_36_B_table_129_0", + "image_paths": [ + "data/Business/Table/B_origin_36/B_table_129_0.png" + ], + "domain": "Business", + "origin": "B_origin_36", + "table_id": "B_table_129_0" + }, + { + "index": 119, + "pair_id": "B_origin_36_B_table_130_0", + "image_paths": [ + "data/Business/Table/B_origin_36/B_table_130_0.png" + ], + "domain": "Business", + "origin": "B_origin_36", + "table_id": "B_table_130_0" + }, + { + "index": 120, + "pair_id": "B_origin_37_B_table_131_0", + "image_paths": [ + "data/Business/Table/B_origin_37/B_table_131_0.png" + ], + "domain": "Business", + "origin": "B_origin_37", + "table_id": "B_table_131_0" + }, + { + "index": 121, + "pair_id": "B_origin_37_B_table_132_0", + "image_paths": [ + "data/Business/Table/B_origin_37/B_table_132_0.png" + ], + "domain": "Business", + "origin": "B_origin_37", + "table_id": "B_table_132_0" + }, + { + "index": 122, + "pair_id": "B_origin_37_B_table_132_1", + "image_paths": [ + "data/Business/Table/B_origin_37/B_table_132_1.png" + ], + "domain": "Business", + "origin": "B_origin_37", + "table_id": "B_table_132_1" + }, + { + "index": 123, + "pair_id": "B_origin_37_B_table_133_0", + "image_paths": [ + "data/Business/Table/B_origin_37/B_table_133_0.png" + ], + "domain": "Business", + "origin": "B_origin_37", + "table_id": "B_table_133_0" + }, + { + "index": 124, + "pair_id": "B_origin_37_B_table_134_0", + "image_paths": [ + "data/Business/Table/B_origin_37/B_table_134_0.png" + ], + "domain": "Business", + "origin": "B_origin_37", + "table_id": "B_table_134_0" + }, + { + "index": 125, + "pair_id": "B_origin_38_B_table_135_0", + "image_paths": [ + "data/Business/Table/B_origin_38/B_table_135_0.png" + ], + 
"domain": "Business", + "origin": "B_origin_38", + "table_id": "B_table_135_0" + }, + { + "index": 126, + "pair_id": "B_origin_38_B_table_136_0", + "image_paths": [ + "data/Business/Table/B_origin_38/B_table_136_0.png" + ], + "domain": "Business", + "origin": "B_origin_38", + "table_id": "B_table_136_0" + }, + { + "index": 127, + "pair_id": "B_origin_38_B_table_137_0", + "image_paths": [ + "data/Business/Table/B_origin_38/B_table_137_0.png" + ], + "domain": "Business", + "origin": "B_origin_38", + "table_id": "B_table_137_0" + }, + { + "index": 128, + "pair_id": "B_origin_38_B_table_138_0", + "image_paths": [ + "data/Business/Table/B_origin_38/B_table_138_0.png" + ], + "domain": "Business", + "origin": "B_origin_38", + "table_id": "B_table_138_0" + }, + { + "index": 129, + "pair_id": "B_origin_39_B_table_139_0", + "image_paths": [ + "data/Business/Table/B_origin_39/B_table_139_0.png" + ], + "domain": "Business", + "origin": "B_origin_39", + "table_id": "B_table_139_0" + }, + { + "index": 130, + "pair_id": "B_origin_39_B_table_140_0", + "image_paths": [ + "data/Business/Table/B_origin_39/B_table_140_0.png" + ], + "domain": "Business", + "origin": "B_origin_39", + "table_id": "B_table_140_0" + }, + { + "index": 131, + "pair_id": "B_origin_4_B_table_14_0", + "image_paths": [ + "data/Business/Table/B_origin_4/B_table_14_0.png" + ], + "domain": "Business", + "origin": "B_origin_4", + "table_id": "B_table_14_0" + }, + { + "index": 132, + "pair_id": "B_origin_4_B_table_15_0", + "image_paths": [ + "data/Business/Table/B_origin_4/B_table_15_0.png" + ], + "domain": "Business", + "origin": "B_origin_4", + "table_id": "B_table_15_0" + }, + { + "index": 133, + "pair_id": "B_origin_4_B_table_16_0", + "image_paths": [ + "data/Business/Table/B_origin_4/B_table_16_0.png" + ], + "domain": "Business", + "origin": "B_origin_4", + "table_id": "B_table_16_0" + }, + { + "index": 134, + "pair_id": "B_origin_4_B_table_16_1", + "image_paths": [ + 
"data/Business/Table/B_origin_4/B_table_16_1.png" + ], + "domain": "Business", + "origin": "B_origin_4", + "table_id": "B_table_16_1" + }, + { + "index": 135, + "pair_id": "B_origin_5_B_table_17_0", + "image_paths": [ + "data/Business/Table/B_origin_5/B_table_17_0.png" + ], + "domain": "Business", + "origin": "B_origin_5", + "table_id": "B_table_17_0" + }, + { + "index": 136, + "pair_id": "B_origin_5_B_table_18_0", + "image_paths": [ + "data/Business/Table/B_origin_5/B_table_18_0.png" + ], + "domain": "Business", + "origin": "B_origin_5", + "table_id": "B_table_18_0" + }, + { + "index": 137, + "pair_id": "B_origin_6_B_table_19_0", + "image_paths": [ + "data/Business/Table/B_origin_6/B_table_19_0.png" + ], + "domain": "Business", + "origin": "B_origin_6", + "table_id": "B_table_19_0" + }, + { + "index": 138, + "pair_id": "B_origin_6_B_table_20_0", + "image_paths": [ + "data/Business/Table/B_origin_6/B_table_20_0.png" + ], + "domain": "Business", + "origin": "B_origin_6", + "table_id": "B_table_20_0" + }, + { + "index": 139, + "pair_id": "B_origin_6_B_table_21_0", + "image_paths": [ + "data/Business/Table/B_origin_6/B_table_21_0.png" + ], + "domain": "Business", + "origin": "B_origin_6", + "table_id": "B_table_21_0" + }, + { + "index": 140, + "pair_id": "B_origin_6_B_table_22_0", + "image_paths": [ + "data/Business/Table/B_origin_6/B_table_22_0.png" + ], + "domain": "Business", + "origin": "B_origin_6", + "table_id": "B_table_22_0" + }, + { + "index": 141, + "pair_id": "B_origin_7_B_table_23_0", + "image_paths": [ + "data/Business/Table/B_origin_7/B_table_23_0.png" + ], + "domain": "Business", + "origin": "B_origin_7", + "table_id": "B_table_23_0" + }, + { + "index": 142, + "pair_id": "B_origin_7_B_table_24_0", + "image_paths": [ + "data/Business/Table/B_origin_7/B_table_24_0.png" + ], + "domain": "Business", + "origin": "B_origin_7", + "table_id": "B_table_24_0" + }, + { + "index": 143, + "pair_id": "B_origin_7_B_table_25_0", + "image_paths": [ + 
"data/Business/Table/B_origin_7/B_table_25_0.png" + ], + "domain": "Business", + "origin": "B_origin_7", + "table_id": "B_table_25_0" + }, + { + "index": 144, + "pair_id": "B_origin_8_B_table_26_0", + "image_paths": [ + "data/Business/Table/B_origin_8/B_table_26_0.png" + ], + "domain": "Business", + "origin": "B_origin_8", + "table_id": "B_table_26_0" + }, + { + "index": 145, + "pair_id": "B_origin_8_B_table_27_0", + "image_paths": [ + "data/Business/Table/B_origin_8/B_table_27_0.png" + ], + "domain": "Business", + "origin": "B_origin_8", + "table_id": "B_table_27_0" + }, + { + "index": 146, + "pair_id": "B_origin_9_B_table_28_0", + "image_paths": [ + "data/Business/Table/B_origin_9/B_table_28_0.png" + ], + "domain": "Business", + "origin": "B_origin_9", + "table_id": "B_table_28_0" + }, + { + "index": 147, + "pair_id": "B_origin_9_B_table_29_0", + "image_paths": [ + "data/Business/Table/B_origin_9/B_table_29_0.png" + ], + "domain": "Business", + "origin": "B_origin_9", + "table_id": "B_table_29_0" + }, + { + "index": 148, + "pair_id": "B_origin_9_B_table_30_0", + "image_paths": [ + "data/Business/Table/B_origin_9/B_table_30_0.png" + ], + "domain": "Business", + "origin": "B_origin_9", + "table_id": "B_table_30_0" + } +] \ No newline at end of file diff --git a/single_image_json_list/single_table_finance_input.json b/single_image_json_list/single_table_finance_input.json new file mode 100644 index 0000000..c40cdac --- /dev/null +++ b/single_image_json_list/single_table_finance_input.json @@ -0,0 +1,3152 @@ +[ + { + "index": 0, + "pair_id": "F_origin_0_F_table_0_0", + "image_paths": [ + "data/Finance/Table/F_origin_0/F_table_0_0.png" + ], + "domain": "Finance", + "origin": "F_origin_0", + "table_id": "F_table_0_0" + }, + { + "index": 1, + "pair_id": "F_origin_0_F_table_0_1", + "image_paths": [ + "data/Finance/Table/F_origin_0/F_table_0_1.png" + ], + "domain": "Finance", + "origin": "F_origin_0", + "table_id": "F_table_0_1" + }, + { + "index": 2, + "pair_id": 
"F_origin_0_F_table_0_2", + "image_paths": [ + "data/Finance/Table/F_origin_0/F_table_0_2.png" + ], + "domain": "Finance", + "origin": "F_origin_0", + "table_id": "F_table_0_2" + }, + { + "index": 3, + "pair_id": "F_origin_0_F_table_0_3", + "image_paths": [ + "data/Finance/Table/F_origin_0/F_table_0_3.png" + ], + "domain": "Finance", + "origin": "F_origin_0", + "table_id": "F_table_0_3" + }, + { + "index": 4, + "pair_id": "F_origin_0_F_table_0_4", + "image_paths": [ + "data/Finance/Table/F_origin_0/F_table_0_4.png" + ], + "domain": "Finance", + "origin": "F_origin_0", + "table_id": "F_table_0_4" + }, + { + "index": 5, + "pair_id": "F_origin_0_F_table_1_0", + "image_paths": [ + "data/Finance/Table/F_origin_0/F_table_1_0.png" + ], + "domain": "Finance", + "origin": "F_origin_0", + "table_id": "F_table_1_0" + }, + { + "index": 6, + "pair_id": "F_origin_0_F_table_1_1", + "image_paths": [ + "data/Finance/Table/F_origin_0/F_table_1_1.png" + ], + "domain": "Finance", + "origin": "F_origin_0", + "table_id": "F_table_1_1" + }, + { + "index": 7, + "pair_id": "F_origin_1_F_table_2_0", + "image_paths": [ + "data/Finance/Table/F_origin_1/F_table_2_0.png" + ], + "domain": "Finance", + "origin": "F_origin_1", + "table_id": "F_table_2_0" + }, + { + "index": 8, + "pair_id": "F_origin_1_F_table_2_1", + "image_paths": [ + "data/Finance/Table/F_origin_1/F_table_2_1.png" + ], + "domain": "Finance", + "origin": "F_origin_1", + "table_id": "F_table_2_1" + }, + { + "index": 9, + "pair_id": "F_origin_1_F_table_2_2", + "image_paths": [ + "data/Finance/Table/F_origin_1/F_table_2_2.png" + ], + "domain": "Finance", + "origin": "F_origin_1", + "table_id": "F_table_2_2" + }, + { + "index": 10, + "pair_id": "F_origin_1_F_table_2_3", + "image_paths": [ + "data/Finance/Table/F_origin_1/F_table_2_3.png" + ], + "domain": "Finance", + "origin": "F_origin_1", + "table_id": "F_table_2_3" + }, + { + "index": 11, + "pair_id": "F_origin_1_F_table_2_4", + "image_paths": [ + 
"data/Finance/Table/F_origin_1/F_table_2_4.png" + ], + "domain": "Finance", + "origin": "F_origin_1", + "table_id": "F_table_2_4" + }, + { + "index": 12, + "pair_id": "F_origin_10_F_table_14_0", + "image_paths": [ + "data/Finance/Table/F_origin_10/F_table_14_0.png" + ], + "domain": "Finance", + "origin": "F_origin_10", + "table_id": "F_table_14_0" + }, + { + "index": 13, + "pair_id": "F_origin_10_F_table_14_1", + "image_paths": [ + "data/Finance/Table/F_origin_10/F_table_14_1.png" + ], + "domain": "Finance", + "origin": "F_origin_10", + "table_id": "F_table_14_1" + }, + { + "index": 14, + "pair_id": "F_origin_10_F_table_14_2", + "image_paths": [ + "data/Finance/Table/F_origin_10/F_table_14_2.png" + ], + "domain": "Finance", + "origin": "F_origin_10", + "table_id": "F_table_14_2" + }, + { + "index": 15, + "pair_id": "F_origin_10_F_table_14_3", + "image_paths": [ + "data/Finance/Table/F_origin_10/F_table_14_3.png" + ], + "domain": "Finance", + "origin": "F_origin_10", + "table_id": "F_table_14_3" + }, + { + "index": 16, + "pair_id": "F_origin_10_F_table_14_4", + "image_paths": [ + "data/Finance/Table/F_origin_10/F_table_14_4.png" + ], + "domain": "Finance", + "origin": "F_origin_10", + "table_id": "F_table_14_4" + }, + { + "index": 17, + "pair_id": "F_origin_11_F_table_15_0", + "image_paths": [ + "data/Finance/Table/F_origin_11/F_table_15_0.png" + ], + "domain": "Finance", + "origin": "F_origin_11", + "table_id": "F_table_15_0" + }, + { + "index": 18, + "pair_id": "F_origin_11_F_table_15_1", + "image_paths": [ + "data/Finance/Table/F_origin_11/F_table_15_1.png" + ], + "domain": "Finance", + "origin": "F_origin_11", + "table_id": "F_table_15_1" + }, + { + "index": 19, + "pair_id": "F_origin_11_F_table_15_2", + "image_paths": [ + "data/Finance/Table/F_origin_11/F_table_15_2.png" + ], + "domain": "Finance", + "origin": "F_origin_11", + "table_id": "F_table_15_2" + }, + { + "index": 20, + "pair_id": "F_origin_11_F_table_15_3", + "image_paths": [ + 
"data/Finance/Table/F_origin_11/F_table_15_3.png" + ], + "domain": "Finance", + "origin": "F_origin_11", + "table_id": "F_table_15_3" + }, + { + "index": 21, + "pair_id": "F_origin_11_F_table_15_4", + "image_paths": [ + "data/Finance/Table/F_origin_11/F_table_15_4.png" + ], + "domain": "Finance", + "origin": "F_origin_11", + "table_id": "F_table_15_4" + }, + { + "index": 22, + "pair_id": "F_origin_11_F_table_15_5", + "image_paths": [ + "data/Finance/Table/F_origin_11/F_table_15_5.png" + ], + "domain": "Finance", + "origin": "F_origin_11", + "table_id": "F_table_15_5" + }, + { + "index": 23, + "pair_id": "F_origin_12_F_table_16_0", + "image_paths": [ + "data/Finance/Table/F_origin_12/F_table_16_0.png" + ], + "domain": "Finance", + "origin": "F_origin_12", + "table_id": "F_table_16_0" + }, + { + "index": 24, + "pair_id": "F_origin_12_F_table_17_0", + "image_paths": [ + "data/Finance/Table/F_origin_12/F_table_17_0.png" + ], + "domain": "Finance", + "origin": "F_origin_12", + "table_id": "F_table_17_0" + }, + { + "index": 25, + "pair_id": "F_origin_12_F_table_18_0", + "image_paths": [ + "data/Finance/Table/F_origin_12/F_table_18_0.png" + ], + "domain": "Finance", + "origin": "F_origin_12", + "table_id": "F_table_18_0" + }, + { + "index": 26, + "pair_id": "F_origin_12_F_table_19_0", + "image_paths": [ + "data/Finance/Table/F_origin_12/F_table_19_0.png" + ], + "domain": "Finance", + "origin": "F_origin_12", + "table_id": "F_table_19_0" + }, + { + "index": 27, + "pair_id": "F_origin_12_F_table_20_0", + "image_paths": [ + "data/Finance/Table/F_origin_12/F_table_20_0.png" + ], + "domain": "Finance", + "origin": "F_origin_12", + "table_id": "F_table_20_0" + }, + { + "index": 28, + "pair_id": "F_origin_12_F_table_21_0", + "image_paths": [ + "data/Finance/Table/F_origin_12/F_table_21_0.png" + ], + "domain": "Finance", + "origin": "F_origin_12", + "table_id": "F_table_21_0" + }, + { + "index": 29, + "pair_id": "F_origin_13_F_table_21_0", + "image_paths": [ + 
"data/Finance/Table/F_origin_13/F_table_21_0.png" + ], + "domain": "Finance", + "origin": "F_origin_13", + "table_id": "F_table_21_0" + }, + { + "index": 30, + "pair_id": "F_origin_13_F_table_22_0", + "image_paths": [ + "data/Finance/Table/F_origin_13/F_table_22_0.png" + ], + "domain": "Finance", + "origin": "F_origin_13", + "table_id": "F_table_22_0" + }, + { + "index": 31, + "pair_id": "F_origin_13_F_table_23_0", + "image_paths": [ + "data/Finance/Table/F_origin_13/F_table_23_0.png" + ], + "domain": "Finance", + "origin": "F_origin_13", + "table_id": "F_table_23_0" + }, + { + "index": 32, + "pair_id": "F_origin_13_F_table_24_0", + "image_paths": [ + "data/Finance/Table/F_origin_13/F_table_24_0.png" + ], + "domain": "Finance", + "origin": "F_origin_13", + "table_id": "F_table_24_0" + }, + { + "index": 33, + "pair_id": "F_origin_13_F_table_25_0", + "image_paths": [ + "data/Finance/Table/F_origin_13/F_table_25_0.png" + ], + "domain": "Finance", + "origin": "F_origin_13", + "table_id": "F_table_25_0" + }, + { + "index": 34, + "pair_id": "F_origin_14_F_table_26_0", + "image_paths": [ + "data/Finance/Table/F_origin_14/F_table_26_0.png" + ], + "domain": "Finance", + "origin": "F_origin_14", + "table_id": "F_table_26_0" + }, + { + "index": 35, + "pair_id": "F_origin_14_F_table_27_0", + "image_paths": [ + "data/Finance/Table/F_origin_14/F_table_27_0.png" + ], + "domain": "Finance", + "origin": "F_origin_14", + "table_id": "F_table_27_0" + }, + { + "index": 36, + "pair_id": "F_origin_14_F_table_28_0", + "image_paths": [ + "data/Finance/Table/F_origin_14/F_table_28_0.png" + ], + "domain": "Finance", + "origin": "F_origin_14", + "table_id": "F_table_28_0" + }, + { + "index": 37, + "pair_id": "F_origin_14_F_table_29_0", + "image_paths": [ + "data/Finance/Table/F_origin_14/F_table_29_0.png" + ], + "domain": "Finance", + "origin": "F_origin_14", + "table_id": "F_table_29_0" + }, + { + "index": 38, + "pair_id": "F_origin_14_F_table_30_0", + "image_paths": [ + 
"data/Finance/Table/F_origin_14/F_table_30_0.png" + ], + "domain": "Finance", + "origin": "F_origin_14", + "table_id": "F_table_30_0" + }, + { + "index": 39, + "pair_id": "F_origin_14_F_table_31_0", + "image_paths": [ + "data/Finance/Table/F_origin_14/F_table_31_0.png" + ], + "domain": "Finance", + "origin": "F_origin_14", + "table_id": "F_table_31_0" + }, + { + "index": 40, + "pair_id": "F_origin_15_F_table_32_0", + "image_paths": [ + "data/Finance/Table/F_origin_15/F_table_32_0.png" + ], + "domain": "Finance", + "origin": "F_origin_15", + "table_id": "F_table_32_0" + }, + { + "index": 41, + "pair_id": "F_origin_15_F_table_33_0", + "image_paths": [ + "data/Finance/Table/F_origin_15/F_table_33_0.png" + ], + "domain": "Finance", + "origin": "F_origin_15", + "table_id": "F_table_33_0" + }, + { + "index": 42, + "pair_id": "F_origin_15_F_table_34_0", + "image_paths": [ + "data/Finance/Table/F_origin_15/F_table_34_0.png" + ], + "domain": "Finance", + "origin": "F_origin_15", + "table_id": "F_table_34_0" + }, + { + "index": 43, + "pair_id": "F_origin_15_F_table_35_0", + "image_paths": [ + "data/Finance/Table/F_origin_15/F_table_35_0.png" + ], + "domain": "Finance", + "origin": "F_origin_15", + "table_id": "F_table_35_0" + }, + { + "index": 44, + "pair_id": "F_origin_15_F_table_36_0", + "image_paths": [ + "data/Finance/Table/F_origin_15/F_table_36_0.png" + ], + "domain": "Finance", + "origin": "F_origin_15", + "table_id": "F_table_36_0" + }, + { + "index": 45, + "pair_id": "F_origin_16_F_table_37_0", + "image_paths": [ + "data/Finance/Table/F_origin_16/F_table_37_0.png" + ], + "domain": "Finance", + "origin": "F_origin_16", + "table_id": "F_table_37_0" + }, + { + "index": 46, + "pair_id": "F_origin_16_F_table_38_0", + "image_paths": [ + "data/Finance/Table/F_origin_16/F_table_38_0.png" + ], + "domain": "Finance", + "origin": "F_origin_16", + "table_id": "F_table_38_0" + }, + { + "index": 47, + "pair_id": "F_origin_16_F_table_39_0", + "image_paths": [ + 
"data/Finance/Table/F_origin_16/F_table_39_0.png" + ], + "domain": "Finance", + "origin": "F_origin_16", + "table_id": "F_table_39_0" + }, + { + "index": 48, + "pair_id": "F_origin_16_F_table_40_0", + "image_paths": [ + "data/Finance/Table/F_origin_16/F_table_40_0.png" + ], + "domain": "Finance", + "origin": "F_origin_16", + "table_id": "F_table_40_0" + }, + { + "index": 49, + "pair_id": "F_origin_16_F_table_41_0", + "image_paths": [ + "data/Finance/Table/F_origin_16/F_table_41_0.png" + ], + "domain": "Finance", + "origin": "F_origin_16", + "table_id": "F_table_41_0" + }, + { + "index": 50, + "pair_id": "F_origin_16_F_table_42_0", + "image_paths": [ + "data/Finance/Table/F_origin_16/F_table_42_0.png" + ], + "domain": "Finance", + "origin": "F_origin_16", + "table_id": "F_table_42_0" + }, + { + "index": 51, + "pair_id": "F_origin_16_F_table_43_0", + "image_paths": [ + "data/Finance/Table/F_origin_16/F_table_43_0.png" + ], + "domain": "Finance", + "origin": "F_origin_16", + "table_id": "F_table_43_0" + }, + { + "index": 52, + "pair_id": "F_origin_16_F_table_44_0", + "image_paths": [ + "data/Finance/Table/F_origin_16/F_table_44_0.png" + ], + "domain": "Finance", + "origin": "F_origin_16", + "table_id": "F_table_44_0" + }, + { + "index": 53, + "pair_id": "F_origin_16_F_table_45_0", + "image_paths": [ + "data/Finance/Table/F_origin_16/F_table_45_0.png" + ], + "domain": "Finance", + "origin": "F_origin_16", + "table_id": "F_table_45_0" + }, + { + "index": 54, + "pair_id": "F_origin_17_F_table_46_0", + "image_paths": [ + "data/Finance/Table/F_origin_17/F_table_46_0.png" + ], + "domain": "Finance", + "origin": "F_origin_17", + "table_id": "F_table_46_0" + }, + { + "index": 55, + "pair_id": "F_origin_17_F_table_47_0", + "image_paths": [ + "data/Finance/Table/F_origin_17/F_table_47_0.png" + ], + "domain": "Finance", + "origin": "F_origin_17", + "table_id": "F_table_47_0" + }, + { + "index": 56, + "pair_id": "F_origin_17_F_table_48_0", + "image_paths": [ + 
"data/Finance/Table/F_origin_17/F_table_48_0.png" + ], + "domain": "Finance", + "origin": "F_origin_17", + "table_id": "F_table_48_0" + }, + { + "index": 57, + "pair_id": "F_origin_17_F_table_49_0", + "image_paths": [ + "data/Finance/Table/F_origin_17/F_table_49_0.png" + ], + "domain": "Finance", + "origin": "F_origin_17", + "table_id": "F_table_49_0" + }, + { + "index": 58, + "pair_id": "F_origin_17_F_table_50_0", + "image_paths": [ + "data/Finance/Table/F_origin_17/F_table_50_0.png" + ], + "domain": "Finance", + "origin": "F_origin_17", + "table_id": "F_table_50_0" + }, + { + "index": 59, + "pair_id": "F_origin_17_F_table_51_0", + "image_paths": [ + "data/Finance/Table/F_origin_17/F_table_51_0.png" + ], + "domain": "Finance", + "origin": "F_origin_17", + "table_id": "F_table_51_0" + }, + { + "index": 60, + "pair_id": "F_origin_17_F_table_52_0", + "image_paths": [ + "data/Finance/Table/F_origin_17/F_table_52_0.png" + ], + "domain": "Finance", + "origin": "F_origin_17", + "table_id": "F_table_52_0" + }, + { + "index": 61, + "pair_id": "F_origin_17_F_table_53_0", + "image_paths": [ + "data/Finance/Table/F_origin_17/F_table_53_0.png" + ], + "domain": "Finance", + "origin": "F_origin_17", + "table_id": "F_table_53_0" + }, + { + "index": 62, + "pair_id": "F_origin_17_F_table_54_0", + "image_paths": [ + "data/Finance/Table/F_origin_17/F_table_54_0.png" + ], + "domain": "Finance", + "origin": "F_origin_17", + "table_id": "F_table_54_0" + }, + { + "index": 63, + "pair_id": "F_origin_18_F_table_55_0", + "image_paths": [ + "data/Finance/Table/F_origin_18/F_table_55_0.png" + ], + "domain": "Finance", + "origin": "F_origin_18", + "table_id": "F_table_55_0" + }, + { + "index": 64, + "pair_id": "F_origin_18_F_table_56_0", + "image_paths": [ + "data/Finance/Table/F_origin_18/F_table_56_0.png" + ], + "domain": "Finance", + "origin": "F_origin_18", + "table_id": "F_table_56_0" + }, + { + "index": 65, + "pair_id": "F_origin_18_F_table_57_0", + "image_paths": [ + 
"data/Finance/Table/F_origin_18/F_table_57_0.png" + ], + "domain": "Finance", + "origin": "F_origin_18", + "table_id": "F_table_57_0" + }, + { + "index": 66, + "pair_id": "F_origin_18_F_table_58_0", + "image_paths": [ + "data/Finance/Table/F_origin_18/F_table_58_0.png" + ], + "domain": "Finance", + "origin": "F_origin_18", + "table_id": "F_table_58_0" + }, + { + "index": 67, + "pair_id": "F_origin_19_F_table_59_0", + "image_paths": [ + "data/Finance/Table/F_origin_19/F_table_59_0.png" + ], + "domain": "Finance", + "origin": "F_origin_19", + "table_id": "F_table_59_0" + }, + { + "index": 68, + "pair_id": "F_origin_19_F_table_59_1", + "image_paths": [ + "data/Finance/Table/F_origin_19/F_table_59_1.png" + ], + "domain": "Finance", + "origin": "F_origin_19", + "table_id": "F_table_59_1" + }, + { + "index": 69, + "pair_id": "F_origin_19_F_table_60_0", + "image_paths": [ + "data/Finance/Table/F_origin_19/F_table_60_0.png" + ], + "domain": "Finance", + "origin": "F_origin_19", + "table_id": "F_table_60_0" + }, + { + "index": 70, + "pair_id": "F_origin_19_F_table_61_0", + "image_paths": [ + "data/Finance/Table/F_origin_19/F_table_61_0.png" + ], + "domain": "Finance", + "origin": "F_origin_19", + "table_id": "F_table_61_0" + }, + { + "index": 71, + "pair_id": "F_origin_19_F_table_62_0", + "image_paths": [ + "data/Finance/Table/F_origin_19/F_table_62_0.png" + ], + "domain": "Finance", + "origin": "F_origin_19", + "table_id": "F_table_62_0" + }, + { + "index": 72, + "pair_id": "F_origin_19_F_table_63_0", + "image_paths": [ + "data/Finance/Table/F_origin_19/F_table_63_0.png" + ], + "domain": "Finance", + "origin": "F_origin_19", + "table_id": "F_table_63_0" + }, + { + "index": 73, + "pair_id": "F_origin_19_F_table_64_0", + "image_paths": [ + "data/Finance/Table/F_origin_19/F_table_64_0.png" + ], + "domain": "Finance", + "origin": "F_origin_19", + "table_id": "F_table_64_0" + }, + { + "index": 74, + "pair_id": "F_origin_19_F_table_65_0", + "image_paths": [ + 
"data/Finance/Table/F_origin_19/F_table_65_0.png" + ], + "domain": "Finance", + "origin": "F_origin_19", + "table_id": "F_table_65_0" + }, + { + "index": 75, + "pair_id": "F_origin_19_F_table_66_0", + "image_paths": [ + "data/Finance/Table/F_origin_19/F_table_66_0.png" + ], + "domain": "Finance", + "origin": "F_origin_19", + "table_id": "F_table_66_0" + }, + { + "index": 76, + "pair_id": "F_origin_2_F_table_3_0", + "image_paths": [ + "data/Finance/Table/F_origin_2/F_table_3_0.png" + ], + "domain": "Finance", + "origin": "F_origin_2", + "table_id": "F_table_3_0" + }, + { + "index": 77, + "pair_id": "F_origin_2_F_table_3_1", + "image_paths": [ + "data/Finance/Table/F_origin_2/F_table_3_1.png" + ], + "domain": "Finance", + "origin": "F_origin_2", + "table_id": "F_table_3_1" + }, + { + "index": 78, + "pair_id": "F_origin_2_F_table_3_2", + "image_paths": [ + "data/Finance/Table/F_origin_2/F_table_3_2.png" + ], + "domain": "Finance", + "origin": "F_origin_2", + "table_id": "F_table_3_2" + }, + { + "index": 79, + "pair_id": "F_origin_2_F_table_3_3", + "image_paths": [ + "data/Finance/Table/F_origin_2/F_table_3_3.png" + ], + "domain": "Finance", + "origin": "F_origin_2", + "table_id": "F_table_3_3" + }, + { + "index": 80, + "pair_id": "F_origin_2_F_table_3_4", + "image_paths": [ + "data/Finance/Table/F_origin_2/F_table_3_4.png" + ], + "domain": "Finance", + "origin": "F_origin_2", + "table_id": "F_table_3_4" + }, + { + "index": 81, + "pair_id": "F_origin_2_F_table_3_5", + "image_paths": [ + "data/Finance/Table/F_origin_2/F_table_3_5.png" + ], + "domain": "Finance", + "origin": "F_origin_2", + "table_id": "F_table_3_5" + }, + { + "index": 82, + "pair_id": "F_origin_2_F_table_4_0", + "image_paths": [ + "data/Finance/Table/F_origin_2/F_table_4_0.png" + ], + "domain": "Finance", + "origin": "F_origin_2", + "table_id": "F_table_4_0" + }, + { + "index": 83, + "pair_id": "F_origin_2_F_table_4_1", + "image_paths": [ + "data/Finance/Table/F_origin_2/F_table_4_1.png" + ], + 
"domain": "Finance", + "origin": "F_origin_2", + "table_id": "F_table_4_1" + }, + { + "index": 84, + "pair_id": "F_origin_20_F_table_67_0", + "image_paths": [ + "data/Finance/Table/F_origin_20/F_table_67_0.png" + ], + "domain": "Finance", + "origin": "F_origin_20", + "table_id": "F_table_67_0" + }, + { + "index": 85, + "pair_id": "F_origin_20_F_table_67_1", + "image_paths": [ + "data/Finance/Table/F_origin_20/F_table_67_1.png" + ], + "domain": "Finance", + "origin": "F_origin_20", + "table_id": "F_table_67_1" + }, + { + "index": 86, + "pair_id": "F_origin_20_F_table_68_0", + "image_paths": [ + "data/Finance/Table/F_origin_20/F_table_68_0.png" + ], + "domain": "Finance", + "origin": "F_origin_20", + "table_id": "F_table_68_0" + }, + { + "index": 87, + "pair_id": "F_origin_20_F_table_68_1", + "image_paths": [ + "data/Finance/Table/F_origin_20/F_table_68_1.png" + ], + "domain": "Finance", + "origin": "F_origin_20", + "table_id": "F_table_68_1" + }, + { + "index": 88, + "pair_id": "F_origin_20_F_table_68_2", + "image_paths": [ + "data/Finance/Table/F_origin_20/F_table_68_2.png" + ], + "domain": "Finance", + "origin": "F_origin_20", + "table_id": "F_table_68_2" + }, + { + "index": 89, + "pair_id": "F_origin_20_F_table_68_3", + "image_paths": [ + "data/Finance/Table/F_origin_20/F_table_68_3.png" + ], + "domain": "Finance", + "origin": "F_origin_20", + "table_id": "F_table_68_3" + }, + { + "index": 90, + "pair_id": "F_origin_20_F_table_68_4", + "image_paths": [ + "data/Finance/Table/F_origin_20/F_table_68_4.png" + ], + "domain": "Finance", + "origin": "F_origin_20", + "table_id": "F_table_68_4" + }, + { + "index": 91, + "pair_id": "F_origin_20_F_table_68_5", + "image_paths": [ + "data/Finance/Table/F_origin_20/F_table_68_5.png" + ], + "domain": "Finance", + "origin": "F_origin_20", + "table_id": "F_table_68_5" + }, + { + "index": 92, + "pair_id": "F_origin_20_F_table_68_6", + "image_paths": [ + "data/Finance/Table/F_origin_20/F_table_68_6.png" + ], + "domain": "Finance", 
+ "origin": "F_origin_20", + "table_id": "F_table_68_6" + }, + { + "index": 93, + "pair_id": "F_origin_20_F_table_69_0", + "image_paths": [ + "data/Finance/Table/F_origin_20/F_table_69_0.png" + ], + "domain": "Finance", + "origin": "F_origin_20", + "table_id": "F_table_69_0" + }, + { + "index": 94, + "pair_id": "F_origin_21_F_table_70_0", + "image_paths": [ + "data/Finance/Table/F_origin_21/F_table_70_0.png" + ], + "domain": "Finance", + "origin": "F_origin_21", + "table_id": "F_table_70_0" + }, + { + "index": 95, + "pair_id": "F_origin_21_F_table_70_1", + "image_paths": [ + "data/Finance/Table/F_origin_21/F_table_70_1.png" + ], + "domain": "Finance", + "origin": "F_origin_21", + "table_id": "F_table_70_1" + }, + { + "index": 96, + "pair_id": "F_origin_21_F_table_70_2", + "image_paths": [ + "data/Finance/Table/F_origin_21/F_table_70_2.png" + ], + "domain": "Finance", + "origin": "F_origin_21", + "table_id": "F_table_70_2" + }, + { + "index": 97, + "pair_id": "F_origin_21_F_table_70_3", + "image_paths": [ + "data/Finance/Table/F_origin_21/F_table_70_3.png" + ], + "domain": "Finance", + "origin": "F_origin_21", + "table_id": "F_table_70_3" + }, + { + "index": 98, + "pair_id": "F_origin_21_F_table_71_0", + "image_paths": [ + "data/Finance/Table/F_origin_21/F_table_71_0.png" + ], + "domain": "Finance", + "origin": "F_origin_21", + "table_id": "F_table_71_0" + }, + { + "index": 99, + "pair_id": "F_origin_22_F_table_72_0", + "image_paths": [ + "data/Finance/Table/F_origin_22/F_table_72_0.png" + ], + "domain": "Finance", + "origin": "F_origin_22", + "table_id": "F_table_72_0" + }, + { + "index": 100, + "pair_id": "F_origin_22_F_table_72_1", + "image_paths": [ + "data/Finance/Table/F_origin_22/F_table_72_1.png" + ], + "domain": "Finance", + "origin": "F_origin_22", + "table_id": "F_table_72_1" + }, + { + "index": 101, + "pair_id": "F_origin_22_F_table_72_2", + "image_paths": [ + "data/Finance/Table/F_origin_22/F_table_72_2.png" + ], + "domain": "Finance", + "origin": 
"F_origin_22", + "table_id": "F_table_72_2" + }, + { + "index": 102, + "pair_id": "F_origin_22_F_table_72_3", + "image_paths": [ + "data/Finance/Table/F_origin_22/F_table_72_3.png" + ], + "domain": "Finance", + "origin": "F_origin_22", + "table_id": "F_table_72_3" + }, + { + "index": 103, + "pair_id": "F_origin_22_F_table_72_4", + "image_paths": [ + "data/Finance/Table/F_origin_22/F_table_72_4.png" + ], + "domain": "Finance", + "origin": "F_origin_22", + "table_id": "F_table_72_4" + }, + { + "index": 104, + "pair_id": "F_origin_22_F_table_72_5", + "image_paths": [ + "data/Finance/Table/F_origin_22/F_table_72_5.png" + ], + "domain": "Finance", + "origin": "F_origin_22", + "table_id": "F_table_72_5" + }, + { + "index": 105, + "pair_id": "F_origin_22_F_table_73_0", + "image_paths": [ + "data/Finance/Table/F_origin_22/F_table_73_0.png" + ], + "domain": "Finance", + "origin": "F_origin_22", + "table_id": "F_table_73_0" + }, + { + "index": 106, + "pair_id": "F_origin_23_F_table_74_0", + "image_paths": [ + "data/Finance/Table/F_origin_23/F_table_74_0.png" + ], + "domain": "Finance", + "origin": "F_origin_23", + "table_id": "F_table_74_0" + }, + { + "index": 107, + "pair_id": "F_origin_23_F_table_74_1", + "image_paths": [ + "data/Finance/Table/F_origin_23/F_table_74_1.png" + ], + "domain": "Finance", + "origin": "F_origin_23", + "table_id": "F_table_74_1" + }, + { + "index": 108, + "pair_id": "F_origin_23_F_table_74_2", + "image_paths": [ + "data/Finance/Table/F_origin_23/F_table_74_2.png" + ], + "domain": "Finance", + "origin": "F_origin_23", + "table_id": "F_table_74_2" + }, + { + "index": 109, + "pair_id": "F_origin_23_F_table_74_3", + "image_paths": [ + "data/Finance/Table/F_origin_23/F_table_74_3.png" + ], + "domain": "Finance", + "origin": "F_origin_23", + "table_id": "F_table_74_3" + }, + { + "index": 110, + "pair_id": "F_origin_23_F_table_74_4", + "image_paths": [ + "data/Finance/Table/F_origin_23/F_table_74_4.png" + ], + "domain": "Finance", + "origin": 
"F_origin_23", + "table_id": "F_table_74_4" + }, + { + "index": 111, + "pair_id": "F_origin_23_F_table_74_5", + "image_paths": [ + "data/Finance/Table/F_origin_23/F_table_74_5.png" + ], + "domain": "Finance", + "origin": "F_origin_23", + "table_id": "F_table_74_5" + }, + { + "index": 112, + "pair_id": "F_origin_23_F_table_75_0", + "image_paths": [ + "data/Finance/Table/F_origin_23/F_table_75_0.png" + ], + "domain": "Finance", + "origin": "F_origin_23", + "table_id": "F_table_75_0" + }, + { + "index": 113, + "pair_id": "F_origin_24_F_table_75_0", + "image_paths": [ + "data/Finance/Table/F_origin_24/F_table_75_0.png" + ], + "domain": "Finance", + "origin": "F_origin_24", + "table_id": "F_table_75_0" + }, + { + "index": 114, + "pair_id": "F_origin_24_F_table_75_1", + "image_paths": [ + "data/Finance/Table/F_origin_24/F_table_75_1.png" + ], + "domain": "Finance", + "origin": "F_origin_24", + "table_id": "F_table_75_1" + }, + { + "index": 115, + "pair_id": "F_origin_24_F_table_75_2", + "image_paths": [ + "data/Finance/Table/F_origin_24/F_table_75_2.png" + ], + "domain": "Finance", + "origin": "F_origin_24", + "table_id": "F_table_75_2" + }, + { + "index": 116, + "pair_id": "F_origin_24_F_table_75_3", + "image_paths": [ + "data/Finance/Table/F_origin_24/F_table_75_3.png" + ], + "domain": "Finance", + "origin": "F_origin_24", + "table_id": "F_table_75_3" + }, + { + "index": 117, + "pair_id": "F_origin_24_F_table_75_4", + "image_paths": [ + "data/Finance/Table/F_origin_24/F_table_75_4.png" + ], + "domain": "Finance", + "origin": "F_origin_24", + "table_id": "F_table_75_4" + }, + { + "index": 118, + "pair_id": "F_origin_24_F_table_75_5", + "image_paths": [ + "data/Finance/Table/F_origin_24/F_table_75_5.png" + ], + "domain": "Finance", + "origin": "F_origin_24", + "table_id": "F_table_75_5" + }, + { + "index": 119, + "pair_id": "F_origin_24_F_table_75_6", + "image_paths": [ + "data/Finance/Table/F_origin_24/F_table_75_6.png" + ], + "domain": "Finance", + "origin": 
"F_origin_24", + "table_id": "F_table_75_6" + }, + { + "index": 120, + "pair_id": "F_origin_24_F_table_75_7", + "image_paths": [ + "data/Finance/Table/F_origin_24/F_table_75_7.png" + ], + "domain": "Finance", + "origin": "F_origin_24", + "table_id": "F_table_75_7" + }, + { + "index": 121, + "pair_id": "F_origin_24_F_table_76_0", + "image_paths": [ + "data/Finance/Table/F_origin_24/F_table_76_0.png" + ], + "domain": "Finance", + "origin": "F_origin_24", + "table_id": "F_table_76_0" + }, + { + "index": 122, + "pair_id": "F_origin_24_F_table_76_1", + "image_paths": [ + "data/Finance/Table/F_origin_24/F_table_76_1.png" + ], + "domain": "Finance", + "origin": "F_origin_24", + "table_id": "F_table_76_1" + }, + { + "index": 123, + "pair_id": "F_origin_24_F_table_76_2", + "image_paths": [ + "data/Finance/Table/F_origin_24/F_table_76_2.png" + ], + "domain": "Finance", + "origin": "F_origin_24", + "table_id": "F_table_76_2" + }, + { + "index": 124, + "pair_id": "F_origin_24_F_table_76_3", + "image_paths": [ + "data/Finance/Table/F_origin_24/F_table_76_3.png" + ], + "domain": "Finance", + "origin": "F_origin_24", + "table_id": "F_table_76_3" + }, + { + "index": 125, + "pair_id": "F_origin_24_F_table_76_4", + "image_paths": [ + "data/Finance/Table/F_origin_24/F_table_76_4.png" + ], + "domain": "Finance", + "origin": "F_origin_24", + "table_id": "F_table_76_4" + }, + { + "index": 126, + "pair_id": "F_origin_24_F_table_77_0", + "image_paths": [ + "data/Finance/Table/F_origin_24/F_table_77_0.png" + ], + "domain": "Finance", + "origin": "F_origin_24", + "table_id": "F_table_77_0" + }, + { + "index": 127, + "pair_id": "F_origin_25_F_table_79_0", + "image_paths": [ + "data/Finance/Table/F_origin_25/F_table_79_0.png" + ], + "domain": "Finance", + "origin": "F_origin_25", + "table_id": "F_table_79_0" + }, + { + "index": 128, + "pair_id": "F_origin_25_F_table_80_0", + "image_paths": [ + "data/Finance/Table/F_origin_25/F_table_80_0.png" + ], + "domain": "Finance", + "origin": 
"F_origin_25", + "table_id": "F_table_80_0" + }, + { + "index": 129, + "pair_id": "F_origin_25_F_table_81_0", + "image_paths": [ + "data/Finance/Table/F_origin_25/F_table_81_0.png" + ], + "domain": "Finance", + "origin": "F_origin_25", + "table_id": "F_table_81_0" + }, + { + "index": 130, + "pair_id": "F_origin_25_F_table_82_0", + "image_paths": [ + "data/Finance/Table/F_origin_25/F_table_82_0.png" + ], + "domain": "Finance", + "origin": "F_origin_25", + "table_id": "F_table_82_0" + }, + { + "index": 131, + "pair_id": "F_origin_25_F_table_82_1", + "image_paths": [ + "data/Finance/Table/F_origin_25/F_table_82_1.png" + ], + "domain": "Finance", + "origin": "F_origin_25", + "table_id": "F_table_82_1" + }, + { + "index": 132, + "pair_id": "F_origin_25_F_table_82_2", + "image_paths": [ + "data/Finance/Table/F_origin_25/F_table_82_2.png" + ], + "domain": "Finance", + "origin": "F_origin_25", + "table_id": "F_table_82_2" + }, + { + "index": 133, + "pair_id": "F_origin_25_F_table_83_0", + "image_paths": [ + "data/Finance/Table/F_origin_25/F_table_83_0.png" + ], + "domain": "Finance", + "origin": "F_origin_25", + "table_id": "F_table_83_0" + }, + { + "index": 134, + "pair_id": "F_origin_26_F_table_84_0", + "image_paths": [ + "data/Finance/Table/F_origin_26/F_table_84_0.png" + ], + "domain": "Finance", + "origin": "F_origin_26", + "table_id": "F_table_84_0" + }, + { + "index": 135, + "pair_id": "F_origin_26_F_table_84_1", + "image_paths": [ + "data/Finance/Table/F_origin_26/F_table_84_1.png" + ], + "domain": "Finance", + "origin": "F_origin_26", + "table_id": "F_table_84_1" + }, + { + "index": 136, + "pair_id": "F_origin_26_F_table_85_0", + "image_paths": [ + "data/Finance/Table/F_origin_26/F_table_85_0.png" + ], + "domain": "Finance", + "origin": "F_origin_26", + "table_id": "F_table_85_0" + }, + { + "index": 137, + "pair_id": "F_origin_26_F_table_86_0", + "image_paths": [ + "data/Finance/Table/F_origin_26/F_table_86_0.png" + ], + "domain": "Finance", + "origin": 
"F_origin_26", + "table_id": "F_table_86_0" + }, + { + "index": 138, + "pair_id": "F_origin_26_F_table_87_0", + "image_paths": [ + "data/Finance/Table/F_origin_26/F_table_87_0.png" + ], + "domain": "Finance", + "origin": "F_origin_26", + "table_id": "F_table_87_0" + }, + { + "index": 139, + "pair_id": "F_origin_27_F_table_88_0", + "image_paths": [ + "data/Finance/Table/F_origin_27/F_table_88_0.png" + ], + "domain": "Finance", + "origin": "F_origin_27", + "table_id": "F_table_88_0" + }, + { + "index": 140, + "pair_id": "F_origin_27_F_table_88_1", + "image_paths": [ + "data/Finance/Table/F_origin_27/F_table_88_1.png" + ], + "domain": "Finance", + "origin": "F_origin_27", + "table_id": "F_table_88_1" + }, + { + "index": 141, + "pair_id": "F_origin_27_F_table_89_0", + "image_paths": [ + "data/Finance/Table/F_origin_27/F_table_89_0.png" + ], + "domain": "Finance", + "origin": "F_origin_27", + "table_id": "F_table_89_0" + }, + { + "index": 142, + "pair_id": "F_origin_27_F_table_89_1", + "image_paths": [ + "data/Finance/Table/F_origin_27/F_table_89_1.png" + ], + "domain": "Finance", + "origin": "F_origin_27", + "table_id": "F_table_89_1" + }, + { + "index": 143, + "pair_id": "F_origin_27_F_table_89_2", + "image_paths": [ + "data/Finance/Table/F_origin_27/F_table_89_2.png" + ], + "domain": "Finance", + "origin": "F_origin_27", + "table_id": "F_table_89_2" + }, + { + "index": 144, + "pair_id": "F_origin_27_F_table_90_0", + "image_paths": [ + "data/Finance/Table/F_origin_27/F_table_90_0.png" + ], + "domain": "Finance", + "origin": "F_origin_27", + "table_id": "F_table_90_0" + }, + { + "index": 145, + "pair_id": "F_origin_28_F_table_91_0", + "image_paths": [ + "data/Finance/Table/F_origin_28/F_table_91_0.png" + ], + "domain": "Finance", + "origin": "F_origin_28", + "table_id": "F_table_91_0" + }, + { + "index": 146, + "pair_id": "F_origin_28_F_table_91_1", + "image_paths": [ + "data/Finance/Table/F_origin_28/F_table_91_1.png" + ], + "domain": "Finance", + "origin": 
"F_origin_28", + "table_id": "F_table_91_1" + }, + { + "index": 147, + "pair_id": "F_origin_28_F_table_91_2", + "image_paths": [ + "data/Finance/Table/F_origin_28/F_table_91_2.png" + ], + "domain": "Finance", + "origin": "F_origin_28", + "table_id": "F_table_91_2" + }, + { + "index": 148, + "pair_id": "F_origin_28_F_table_91_3", + "image_paths": [ + "data/Finance/Table/F_origin_28/F_table_91_3.png" + ], + "domain": "Finance", + "origin": "F_origin_28", + "table_id": "F_table_91_3" + }, + { + "index": 149, + "pair_id": "F_origin_28_F_table_92_0", + "image_paths": [ + "data/Finance/Table/F_origin_28/F_table_92_0.png" + ], + "domain": "Finance", + "origin": "F_origin_28", + "table_id": "F_table_92_0" + }, + { + "index": 150, + "pair_id": "F_origin_29_F_table_93_0", + "image_paths": [ + "data/Finance/Table/F_origin_29/F_table_93_0.png" + ], + "domain": "Finance", + "origin": "F_origin_29", + "table_id": "F_table_93_0" + }, + { + "index": 151, + "pair_id": "F_origin_29_F_table_93_1", + "image_paths": [ + "data/Finance/Table/F_origin_29/F_table_93_1.png" + ], + "domain": "Finance", + "origin": "F_origin_29", + "table_id": "F_table_93_1" + }, + { + "index": 152, + "pair_id": "F_origin_29_F_table_93_2", + "image_paths": [ + "data/Finance/Table/F_origin_29/F_table_93_2.png" + ], + "domain": "Finance", + "origin": "F_origin_29", + "table_id": "F_table_93_2" + }, + { + "index": 153, + "pair_id": "F_origin_29_F_table_93_3", + "image_paths": [ + "data/Finance/Table/F_origin_29/F_table_93_3.png" + ], + "domain": "Finance", + "origin": "F_origin_29", + "table_id": "F_table_93_3" + }, + { + "index": 154, + "pair_id": "F_origin_29_F_table_93_4", + "image_paths": [ + "data/Finance/Table/F_origin_29/F_table_93_4.png" + ], + "domain": "Finance", + "origin": "F_origin_29", + "table_id": "F_table_93_4" + }, + { + "index": 155, + "pair_id": "F_origin_29_F_table_94_0", + "image_paths": [ + "data/Finance/Table/F_origin_29/F_table_94_0.png" + ], + "domain": "Finance", + "origin": 
"F_origin_29", + "table_id": "F_table_94_0" + }, + { + "index": 156, + "pair_id": "F_origin_3_F_table_5_0", + "image_paths": [ + "data/Finance/Table/F_origin_3/F_table_5_0.png" + ], + "domain": "Finance", + "origin": "F_origin_3", + "table_id": "F_table_5_0" + }, + { + "index": 157, + "pair_id": "F_origin_3_F_table_5_1", + "image_paths": [ + "data/Finance/Table/F_origin_3/F_table_5_1.png" + ], + "domain": "Finance", + "origin": "F_origin_3", + "table_id": "F_table_5_1" + }, + { + "index": 158, + "pair_id": "F_origin_3_F_table_5_10", + "image_paths": [ + "data/Finance/Table/F_origin_3/F_table_5_10.png" + ], + "domain": "Finance", + "origin": "F_origin_3", + "table_id": "F_table_5_10" + }, + { + "index": 159, + "pair_id": "F_origin_3_F_table_5_11", + "image_paths": [ + "data/Finance/Table/F_origin_3/F_table_5_11.png" + ], + "domain": "Finance", + "origin": "F_origin_3", + "table_id": "F_table_5_11" + }, + { + "index": 160, + "pair_id": "F_origin_3_F_table_5_2", + "image_paths": [ + "data/Finance/Table/F_origin_3/F_table_5_2.png" + ], + "domain": "Finance", + "origin": "F_origin_3", + "table_id": "F_table_5_2" + }, + { + "index": 161, + "pair_id": "F_origin_3_F_table_5_3", + "image_paths": [ + "data/Finance/Table/F_origin_3/F_table_5_3.png" + ], + "domain": "Finance", + "origin": "F_origin_3", + "table_id": "F_table_5_3" + }, + { + "index": 162, + "pair_id": "F_origin_3_F_table_5_4", + "image_paths": [ + "data/Finance/Table/F_origin_3/F_table_5_4.png" + ], + "domain": "Finance", + "origin": "F_origin_3", + "table_id": "F_table_5_4" + }, + { + "index": 163, + "pair_id": "F_origin_3_F_table_5_5", + "image_paths": [ + "data/Finance/Table/F_origin_3/F_table_5_5.png" + ], + "domain": "Finance", + "origin": "F_origin_3", + "table_id": "F_table_5_5" + }, + { + "index": 164, + "pair_id": "F_origin_3_F_table_5_6", + "image_paths": [ + "data/Finance/Table/F_origin_3/F_table_5_6.png" + ], + "domain": "Finance", + "origin": "F_origin_3", + "table_id": "F_table_5_6" + }, + { + 
"index": 165, + "pair_id": "F_origin_3_F_table_5_7", + "image_paths": [ + "data/Finance/Table/F_origin_3/F_table_5_7.png" + ], + "domain": "Finance", + "origin": "F_origin_3", + "table_id": "F_table_5_7" + }, + { + "index": 166, + "pair_id": "F_origin_3_F_table_5_8", + "image_paths": [ + "data/Finance/Table/F_origin_3/F_table_5_8.png" + ], + "domain": "Finance", + "origin": "F_origin_3", + "table_id": "F_table_5_8" + }, + { + "index": 167, + "pair_id": "F_origin_3_F_table_5_9", + "image_paths": [ + "data/Finance/Table/F_origin_3/F_table_5_9.png" + ], + "domain": "Finance", + "origin": "F_origin_3", + "table_id": "F_table_5_9" + }, + { + "index": 168, + "pair_id": "F_origin_30_F_table_95_0", + "image_paths": [ + "data/Finance/Table/F_origin_30/F_table_95_0.png" + ], + "domain": "Finance", + "origin": "F_origin_30", + "table_id": "F_table_95_0" + }, + { + "index": 169, + "pair_id": "F_origin_30_F_table_96_0", + "image_paths": [ + "data/Finance/Table/F_origin_30/F_table_96_0.png" + ], + "domain": "Finance", + "origin": "F_origin_30", + "table_id": "F_table_96_0" + }, + { + "index": 170, + "pair_id": "F_origin_30_F_table_97_0", + "image_paths": [ + "data/Finance/Table/F_origin_30/F_table_97_0.png" + ], + "domain": "Finance", + "origin": "F_origin_30", + "table_id": "F_table_97_0" + }, + { + "index": 171, + "pair_id": "F_origin_30_F_table_97_1", + "image_paths": [ + "data/Finance/Table/F_origin_30/F_table_97_1.png" + ], + "domain": "Finance", + "origin": "F_origin_30", + "table_id": "F_table_97_1" + }, + { + "index": 172, + "pair_id": "F_origin_30_F_table_97_2", + "image_paths": [ + "data/Finance/Table/F_origin_30/F_table_97_2.png" + ], + "domain": "Finance", + "origin": "F_origin_30", + "table_id": "F_table_97_2" + }, + { + "index": 173, + "pair_id": "F_origin_31_F_table_100_0", + "image_paths": [ + "data/Finance/Table/F_origin_31/F_table_100_0.png" + ], + "domain": "Finance", + "origin": "F_origin_31", + "table_id": "F_table_100_0" + }, + { + "index": 174, + 
"pair_id": "F_origin_31_F_table_100_1", + "image_paths": [ + "data/Finance/Table/F_origin_31/F_table_100_1.png" + ], + "domain": "Finance", + "origin": "F_origin_31", + "table_id": "F_table_100_1" + }, + { + "index": 175, + "pair_id": "F_origin_31_F_table_100_2", + "image_paths": [ + "data/Finance/Table/F_origin_31/F_table_100_2.png" + ], + "domain": "Finance", + "origin": "F_origin_31", + "table_id": "F_table_100_2" + }, + { + "index": 176, + "pair_id": "F_origin_31_F_table_98_0", + "image_paths": [ + "data/Finance/Table/F_origin_31/F_table_98_0.png" + ], + "domain": "Finance", + "origin": "F_origin_31", + "table_id": "F_table_98_0" + }, + { + "index": 177, + "pair_id": "F_origin_31_F_table_99_0", + "image_paths": [ + "data/Finance/Table/F_origin_31/F_table_99_0.png" + ], + "domain": "Finance", + "origin": "F_origin_31", + "table_id": "F_table_99_0" + }, + { + "index": 178, + "pair_id": "F_origin_32_F_table_101_0", + "image_paths": [ + "data/Finance/Table/F_origin_32/F_table_101_0.png" + ], + "domain": "Finance", + "origin": "F_origin_32", + "table_id": "F_table_101_0" + }, + { + "index": 179, + "pair_id": "F_origin_32_F_table_102_0", + "image_paths": [ + "data/Finance/Table/F_origin_32/F_table_102_0.png" + ], + "domain": "Finance", + "origin": "F_origin_32", + "table_id": "F_table_102_0" + }, + { + "index": 180, + "pair_id": "F_origin_32_F_table_103_0", + "image_paths": [ + "data/Finance/Table/F_origin_32/F_table_103_0.png" + ], + "domain": "Finance", + "origin": "F_origin_32", + "table_id": "F_table_103_0" + }, + { + "index": 181, + "pair_id": "F_origin_32_F_table_103_1", + "image_paths": [ + "data/Finance/Table/F_origin_32/F_table_103_1.png" + ], + "domain": "Finance", + "origin": "F_origin_32", + "table_id": "F_table_103_1" + }, + { + "index": 182, + "pair_id": "F_origin_32_F_table_103_2", + "image_paths": [ + "data/Finance/Table/F_origin_32/F_table_103_2.png" + ], + "domain": "Finance", + "origin": "F_origin_32", + "table_id": "F_table_103_2" + }, + { + 
"index": 183, + "pair_id": "F_origin_33_F_table_104_0", + "image_paths": [ + "data/Finance/Table/F_origin_33/F_table_104_0.png" + ], + "domain": "Finance", + "origin": "F_origin_33", + "table_id": "F_table_104_0" + }, + { + "index": 184, + "pair_id": "F_origin_33_F_table_105_0", + "image_paths": [ + "data/Finance/Table/F_origin_33/F_table_105_0.png" + ], + "domain": "Finance", + "origin": "F_origin_33", + "table_id": "F_table_105_0" + }, + { + "index": 185, + "pair_id": "F_origin_33_F_table_106_0", + "image_paths": [ + "data/Finance/Table/F_origin_33/F_table_106_0.png" + ], + "domain": "Finance", + "origin": "F_origin_33", + "table_id": "F_table_106_0" + }, + { + "index": 186, + "pair_id": "F_origin_33_F_table_106_1", + "image_paths": [ + "data/Finance/Table/F_origin_33/F_table_106_1.png" + ], + "domain": "Finance", + "origin": "F_origin_33", + "table_id": "F_table_106_1" + }, + { + "index": 187, + "pair_id": "F_origin_33_F_table_106_2", + "image_paths": [ + "data/Finance/Table/F_origin_33/F_table_106_2.png" + ], + "domain": "Finance", + "origin": "F_origin_33", + "table_id": "F_table_106_2" + }, + { + "index": 188, + "pair_id": "F_origin_34_F_table_107_0", + "image_paths": [ + "data/Finance/Table/F_origin_34/F_table_107_0.png" + ], + "domain": "Finance", + "origin": "F_origin_34", + "table_id": "F_table_107_0" + }, + { + "index": 189, + "pair_id": "F_origin_34_F_table_108_0", + "image_paths": [ + "data/Finance/Table/F_origin_34/F_table_108_0.png" + ], + "domain": "Finance", + "origin": "F_origin_34", + "table_id": "F_table_108_0" + }, + { + "index": 190, + "pair_id": "F_origin_34_F_table_109_0", + "image_paths": [ + "data/Finance/Table/F_origin_34/F_table_109_0.png" + ], + "domain": "Finance", + "origin": "F_origin_34", + "table_id": "F_table_109_0" + }, + { + "index": 191, + "pair_id": "F_origin_34_F_table_109_1", + "image_paths": [ + "data/Finance/Table/F_origin_34/F_table_109_1.png" + ], + "domain": "Finance", + "origin": "F_origin_34", + "table_id": 
"F_table_109_1" + }, + { + "index": 192, + "pair_id": "F_origin_34_F_table_109_2", + "image_paths": [ + "data/Finance/Table/F_origin_34/F_table_109_2.png" + ], + "domain": "Finance", + "origin": "F_origin_34", + "table_id": "F_table_109_2" + }, + { + "index": 193, + "pair_id": "F_origin_35_F_table_110_0", + "image_paths": [ + "data/Finance/Table/F_origin_35/F_table_110_0.png" + ], + "domain": "Finance", + "origin": "F_origin_35", + "table_id": "F_table_110_0" + }, + { + "index": 194, + "pair_id": "F_origin_35_F_table_111_0", + "image_paths": [ + "data/Finance/Table/F_origin_35/F_table_111_0.png" + ], + "domain": "Finance", + "origin": "F_origin_35", + "table_id": "F_table_111_0" + }, + { + "index": 195, + "pair_id": "F_origin_35_F_table_112_0", + "image_paths": [ + "data/Finance/Table/F_origin_35/F_table_112_0.png" + ], + "domain": "Finance", + "origin": "F_origin_35", + "table_id": "F_table_112_0" + }, + { + "index": 196, + "pair_id": "F_origin_35_F_table_112_1", + "image_paths": [ + "data/Finance/Table/F_origin_35/F_table_112_1.png" + ], + "domain": "Finance", + "origin": "F_origin_35", + "table_id": "F_table_112_1" + }, + { + "index": 197, + "pair_id": "F_origin_35_F_table_112_2", + "image_paths": [ + "data/Finance/Table/F_origin_35/F_table_112_2.png" + ], + "domain": "Finance", + "origin": "F_origin_35", + "table_id": "F_table_112_2" + }, + { + "index": 198, + "pair_id": "F_origin_36_F_table_113_0", + "image_paths": [ + "data/Finance/Table/F_origin_36/F_table_113_0.png" + ], + "domain": "Finance", + "origin": "F_origin_36", + "table_id": "F_table_113_0" + }, + { + "index": 199, + "pair_id": "F_origin_36_F_table_114_0", + "image_paths": [ + "data/Finance/Table/F_origin_36/F_table_114_0.png" + ], + "domain": "Finance", + "origin": "F_origin_36", + "table_id": "F_table_114_0" + }, + { + "index": 200, + "pair_id": "F_origin_36_F_table_115_0", + "image_paths": [ + "data/Finance/Table/F_origin_36/F_table_115_0.png" + ], + "domain": "Finance", + "origin": 
"F_origin_36", + "table_id": "F_table_115_0" + }, + { + "index": 201, + "pair_id": "F_origin_36_F_table_115_1", + "image_paths": [ + "data/Finance/Table/F_origin_36/F_table_115_1.png" + ], + "domain": "Finance", + "origin": "F_origin_36", + "table_id": "F_table_115_1" + }, + { + "index": 202, + "pair_id": "F_origin_36_F_table_115_2", + "image_paths": [ + "data/Finance/Table/F_origin_36/F_table_115_2.png" + ], + "domain": "Finance", + "origin": "F_origin_36", + "table_id": "F_table_115_2" + }, + { + "index": 203, + "pair_id": "F_origin_36_F_table_116_0", + "image_paths": [ + "data/Finance/Table/F_origin_36/F_table_116_0.png" + ], + "domain": "Finance", + "origin": "F_origin_36", + "table_id": "F_table_116_0" + }, + { + "index": 204, + "pair_id": "F_origin_37_F_table_117_0", + "image_paths": [ + "data/Finance/Table/F_origin_37/F_table_117_0.png" + ], + "domain": "Finance", + "origin": "F_origin_37", + "table_id": "F_table_117_0" + }, + { + "index": 205, + "pair_id": "F_origin_37_F_table_118_0", + "image_paths": [ + "data/Finance/Table/F_origin_37/F_table_118_0.png" + ], + "domain": "Finance", + "origin": "F_origin_37", + "table_id": "F_table_118_0" + }, + { + "index": 206, + "pair_id": "F_origin_37_F_table_119_0", + "image_paths": [ + "data/Finance/Table/F_origin_37/F_table_119_0.png" + ], + "domain": "Finance", + "origin": "F_origin_37", + "table_id": "F_table_119_0" + }, + { + "index": 207, + "pair_id": "F_origin_37_F_table_119_1", + "image_paths": [ + "data/Finance/Table/F_origin_37/F_table_119_1.png" + ], + "domain": "Finance", + "origin": "F_origin_37", + "table_id": "F_table_119_1" + }, + { + "index": 208, + "pair_id": "F_origin_37_F_table_119_2", + "image_paths": [ + "data/Finance/Table/F_origin_37/F_table_119_2.png" + ], + "domain": "Finance", + "origin": "F_origin_37", + "table_id": "F_table_119_2" + }, + { + "index": 209, + "pair_id": "F_origin_37_F_table_119_3", + "image_paths": [ + "data/Finance/Table/F_origin_37/F_table_119_3.png" + ], + "domain": 
"Finance", + "origin": "F_origin_37", + "table_id": "F_table_119_3" + }, + { + "index": 210, + "pair_id": "F_origin_37_F_table_119_4", + "image_paths": [ + "data/Finance/Table/F_origin_37/F_table_119_4.png" + ], + "domain": "Finance", + "origin": "F_origin_37", + "table_id": "F_table_119_4" + }, + { + "index": 211, + "pair_id": "F_origin_38_F_table_120_0", + "image_paths": [ + "data/Finance/Table/F_origin_38/F_table_120_0.png" + ], + "domain": "Finance", + "origin": "F_origin_38", + "table_id": "F_table_120_0" + }, + { + "index": 212, + "pair_id": "F_origin_38_F_table_121_0", + "image_paths": [ + "data/Finance/Table/F_origin_38/F_table_121_0.png" + ], + "domain": "Finance", + "origin": "F_origin_38", + "table_id": "F_table_121_0" + }, + { + "index": 213, + "pair_id": "F_origin_38_F_table_122_0", + "image_paths": [ + "data/Finance/Table/F_origin_38/F_table_122_0.png" + ], + "domain": "Finance", + "origin": "F_origin_38", + "table_id": "F_table_122_0" + }, + { + "index": 214, + "pair_id": "F_origin_38_F_table_122_1", + "image_paths": [ + "data/Finance/Table/F_origin_38/F_table_122_1.png" + ], + "domain": "Finance", + "origin": "F_origin_38", + "table_id": "F_table_122_1" + }, + { + "index": 215, + "pair_id": "F_origin_38_F_table_122_2", + "image_paths": [ + "data/Finance/Table/F_origin_38/F_table_122_2.png" + ], + "domain": "Finance", + "origin": "F_origin_38", + "table_id": "F_table_122_2" + }, + { + "index": 216, + "pair_id": "F_origin_39_F_table_123_0", + "image_paths": [ + "data/Finance/Table/F_origin_39/F_table_123_0.png" + ], + "domain": "Finance", + "origin": "F_origin_39", + "table_id": "F_table_123_0" + }, + { + "index": 217, + "pair_id": "F_origin_39_F_table_124_0", + "image_paths": [ + "data/Finance/Table/F_origin_39/F_table_124_0.png" + ], + "domain": "Finance", + "origin": "F_origin_39", + "table_id": "F_table_124_0" + }, + { + "index": 218, + "pair_id": "F_origin_39_F_table_125_0", + "image_paths": [ + "data/Finance/Table/F_origin_39/F_table_125_0.png" 
+ ], + "domain": "Finance", + "origin": "F_origin_39", + "table_id": "F_table_125_0" + }, + { + "index": 219, + "pair_id": "F_origin_39_F_table_125_1", + "image_paths": [ + "data/Finance/Table/F_origin_39/F_table_125_1.png" + ], + "domain": "Finance", + "origin": "F_origin_39", + "table_id": "F_table_125_1" + }, + { + "index": 220, + "pair_id": "F_origin_39_F_table_125_2", + "image_paths": [ + "data/Finance/Table/F_origin_39/F_table_125_2.png" + ], + "domain": "Finance", + "origin": "F_origin_39", + "table_id": "F_table_125_2" + }, + { + "index": 221, + "pair_id": "F_origin_4_F_table_6_0", + "image_paths": [ + "data/Finance/Table/F_origin_4/F_table_6_0.png" + ], + "domain": "Finance", + "origin": "F_origin_4", + "table_id": "F_table_6_0" + }, + { + "index": 222, + "pair_id": "F_origin_4_F_table_6_1", + "image_paths": [ + "data/Finance/Table/F_origin_4/F_table_6_1.png" + ], + "domain": "Finance", + "origin": "F_origin_4", + "table_id": "F_table_6_1" + }, + { + "index": 223, + "pair_id": "F_origin_4_F_table_6_2", + "image_paths": [ + "data/Finance/Table/F_origin_4/F_table_6_2.png" + ], + "domain": "Finance", + "origin": "F_origin_4", + "table_id": "F_table_6_2" + }, + { + "index": 224, + "pair_id": "F_origin_4_F_table_6_3", + "image_paths": [ + "data/Finance/Table/F_origin_4/F_table_6_3.png" + ], + "domain": "Finance", + "origin": "F_origin_4", + "table_id": "F_table_6_3" + }, + { + "index": 225, + "pair_id": "F_origin_40_F_table_126_0", + "image_paths": [ + "data/Finance/Table/F_origin_40/F_table_126_0.png" + ], + "domain": "Finance", + "origin": "F_origin_40", + "table_id": "F_table_126_0" + }, + { + "index": 226, + "pair_id": "F_origin_40_F_table_127_0", + "image_paths": [ + "data/Finance/Table/F_origin_40/F_table_127_0.png" + ], + "domain": "Finance", + "origin": "F_origin_40", + "table_id": "F_table_127_0" + }, + { + "index": 227, + "pair_id": "F_origin_40_F_table_128_0", + "image_paths": [ + "data/Finance/Table/F_origin_40/F_table_128_0.png" + ], + "domain": 
"Finance", + "origin": "F_origin_40", + "table_id": "F_table_128_0" + }, + { + "index": 228, + "pair_id": "F_origin_40_F_table_128_1", + "image_paths": [ + "data/Finance/Table/F_origin_40/F_table_128_1.png" + ], + "domain": "Finance", + "origin": "F_origin_40", + "table_id": "F_table_128_1" + }, + { + "index": 229, + "pair_id": "F_origin_40_F_table_128_2", + "image_paths": [ + "data/Finance/Table/F_origin_40/F_table_128_2.png" + ], + "domain": "Finance", + "origin": "F_origin_40", + "table_id": "F_table_128_2" + }, + { + "index": 230, + "pair_id": "F_origin_41_F_table_129_0", + "image_paths": [ + "data/Finance/Table/F_origin_41/F_table_129_0.png" + ], + "domain": "Finance", + "origin": "F_origin_41", + "table_id": "F_table_129_0" + }, + { + "index": 231, + "pair_id": "F_origin_41_F_table_129_1", + "image_paths": [ + "data/Finance/Table/F_origin_41/F_table_129_1.png" + ], + "domain": "Finance", + "origin": "F_origin_41", + "table_id": "F_table_129_1" + }, + { + "index": 232, + "pair_id": "F_origin_41_F_table_129_2", + "image_paths": [ + "data/Finance/Table/F_origin_41/F_table_129_2.png" + ], + "domain": "Finance", + "origin": "F_origin_41", + "table_id": "F_table_129_2" + }, + { + "index": 233, + "pair_id": "F_origin_41_F_table_129_3", + "image_paths": [ + "data/Finance/Table/F_origin_41/F_table_129_3.png" + ], + "domain": "Finance", + "origin": "F_origin_41", + "table_id": "F_table_129_3" + }, + { + "index": 234, + "pair_id": "F_origin_41_F_table_129_4", + "image_paths": [ + "data/Finance/Table/F_origin_41/F_table_129_4.png" + ], + "domain": "Finance", + "origin": "F_origin_41", + "table_id": "F_table_129_4" + }, + { + "index": 235, + "pair_id": "F_origin_41_F_table_129_5", + "image_paths": [ + "data/Finance/Table/F_origin_41/F_table_129_5.png" + ], + "domain": "Finance", + "origin": "F_origin_41", + "table_id": "F_table_129_5" + }, + { + "index": 236, + "pair_id": "F_origin_42_F_table_130_0", + "image_paths": [ + "data/Finance/Table/F_origin_42/F_table_130_0.png" 
+ ], + "domain": "Finance", + "origin": "F_origin_42", + "table_id": "F_table_130_0" + }, + { + "index": 237, + "pair_id": "F_origin_42_F_table_130_1", + "image_paths": [ + "data/Finance/Table/F_origin_42/F_table_130_1.png" + ], + "domain": "Finance", + "origin": "F_origin_42", + "table_id": "F_table_130_1" + }, + { + "index": 238, + "pair_id": "F_origin_42_F_table_130_2", + "image_paths": [ + "data/Finance/Table/F_origin_42/F_table_130_2.png" + ], + "domain": "Finance", + "origin": "F_origin_42", + "table_id": "F_table_130_2" + }, + { + "index": 239, + "pair_id": "F_origin_42_F_table_130_3", + "image_paths": [ + "data/Finance/Table/F_origin_42/F_table_130_3.png" + ], + "domain": "Finance", + "origin": "F_origin_42", + "table_id": "F_table_130_3" + }, + { + "index": 240, + "pair_id": "F_origin_42_F_table_130_4", + "image_paths": [ + "data/Finance/Table/F_origin_42/F_table_130_4.png" + ], + "domain": "Finance", + "origin": "F_origin_42", + "table_id": "F_table_130_4" + }, + { + "index": 241, + "pair_id": "F_origin_42_F_table_130_5", + "image_paths": [ + "data/Finance/Table/F_origin_42/F_table_130_5.png" + ], + "domain": "Finance", + "origin": "F_origin_42", + "table_id": "F_table_130_5" + }, + { + "index": 242, + "pair_id": "F_origin_43_F_table_131_0", + "image_paths": [ + "data/Finance/Table/F_origin_43/F_table_131_0.png" + ], + "domain": "Finance", + "origin": "F_origin_43", + "table_id": "F_table_131_0" + }, + { + "index": 243, + "pair_id": "F_origin_43_F_table_131_1", + "image_paths": [ + "data/Finance/Table/F_origin_43/F_table_131_1.png" + ], + "domain": "Finance", + "origin": "F_origin_43", + "table_id": "F_table_131_1" + }, + { + "index": 244, + "pair_id": "F_origin_43_F_table_131_2", + "image_paths": [ + "data/Finance/Table/F_origin_43/F_table_131_2.png" + ], + "domain": "Finance", + "origin": "F_origin_43", + "table_id": "F_table_131_2" + }, + { + "index": 245, + "pair_id": "F_origin_43_F_table_131_3", + "image_paths": [ + 
"data/Finance/Table/F_origin_43/F_table_131_3.png" + ], + "domain": "Finance", + "origin": "F_origin_43", + "table_id": "F_table_131_3" + }, + { + "index": 246, + "pair_id": "F_origin_43_F_table_131_4", + "image_paths": [ + "data/Finance/Table/F_origin_43/F_table_131_4.png" + ], + "domain": "Finance", + "origin": "F_origin_43", + "table_id": "F_table_131_4" + }, + { + "index": 247, + "pair_id": "F_origin_44_F_table_132_0", + "image_paths": [ + "data/Finance/Table/F_origin_44/F_table_132_0.png" + ], + "domain": "Finance", + "origin": "F_origin_44", + "table_id": "F_table_132_0" + }, + { + "index": 248, + "pair_id": "F_origin_44_F_table_132_1", + "image_paths": [ + "data/Finance/Table/F_origin_44/F_table_132_1.png" + ], + "domain": "Finance", + "origin": "F_origin_44", + "table_id": "F_table_132_1" + }, + { + "index": 249, + "pair_id": "F_origin_44_F_table_133_0", + "image_paths": [ + "data/Finance/Table/F_origin_44/F_table_133_0.png" + ], + "domain": "Finance", + "origin": "F_origin_44", + "table_id": "F_table_133_0" + }, + { + "index": 250, + "pair_id": "F_origin_44_F_table_133_1", + "image_paths": [ + "data/Finance/Table/F_origin_44/F_table_133_1.png" + ], + "domain": "Finance", + "origin": "F_origin_44", + "table_id": "F_table_133_1" + }, + { + "index": 251, + "pair_id": "F_origin_44_F_table_133_2", + "image_paths": [ + "data/Finance/Table/F_origin_44/F_table_133_2.png" + ], + "domain": "Finance", + "origin": "F_origin_44", + "table_id": "F_table_133_2" + }, + { + "index": 252, + "pair_id": "F_origin_44_F_table_134_0", + "image_paths": [ + "data/Finance/Table/F_origin_44/F_table_134_0.png" + ], + "domain": "Finance", + "origin": "F_origin_44", + "table_id": "F_table_134_0" + }, + { + "index": 253, + "pair_id": "F_origin_44_F_table_135_0", + "image_paths": [ + "data/Finance/Table/F_origin_44/F_table_135_0.png" + ], + "domain": "Finance", + "origin": "F_origin_44", + "table_id": "F_table_135_0" + }, + { + "index": 254, + "pair_id": "F_origin_44_F_table_135_1", + 
"image_paths": [ + "data/Finance/Table/F_origin_44/F_table_135_1.png" + ], + "domain": "Finance", + "origin": "F_origin_44", + "table_id": "F_table_135_1" + }, + { + "index": 255, + "pair_id": "F_origin_44_F_table_135_2", + "image_paths": [ + "data/Finance/Table/F_origin_44/F_table_135_2.png" + ], + "domain": "Finance", + "origin": "F_origin_44", + "table_id": "F_table_135_2" + }, + { + "index": 256, + "pair_id": "F_origin_45_F_table_136_0", + "image_paths": [ + "data/Finance/Table/F_origin_45/F_table_136_0.png" + ], + "domain": "Finance", + "origin": "F_origin_45", + "table_id": "F_table_136_0" + }, + { + "index": 257, + "pair_id": "F_origin_45_F_table_136_1", + "image_paths": [ + "data/Finance/Table/F_origin_45/F_table_136_1.png" + ], + "domain": "Finance", + "origin": "F_origin_45", + "table_id": "F_table_136_1" + }, + { + "index": 258, + "pair_id": "F_origin_45_F_table_136_2", + "image_paths": [ + "data/Finance/Table/F_origin_45/F_table_136_2.png" + ], + "domain": "Finance", + "origin": "F_origin_45", + "table_id": "F_table_136_2" + }, + { + "index": 259, + "pair_id": "F_origin_45_F_table_136_3", + "image_paths": [ + "data/Finance/Table/F_origin_45/F_table_136_3.png" + ], + "domain": "Finance", + "origin": "F_origin_45", + "table_id": "F_table_136_3" + }, + { + "index": 260, + "pair_id": "F_origin_45_F_table_136_4", + "image_paths": [ + "data/Finance/Table/F_origin_45/F_table_136_4.png" + ], + "domain": "Finance", + "origin": "F_origin_45", + "table_id": "F_table_136_4" + }, + { + "index": 261, + "pair_id": "F_origin_45_F_table_136_5", + "image_paths": [ + "data/Finance/Table/F_origin_45/F_table_136_5.png" + ], + "domain": "Finance", + "origin": "F_origin_45", + "table_id": "F_table_136_5" + }, + { + "index": 262, + "pair_id": "F_origin_46_F_table_137_0", + "image_paths": [ + "data/Finance/Table/F_origin_46/F_table_137_0.png" + ], + "domain": "Finance", + "origin": "F_origin_46", + "table_id": "F_table_137_0" + }, + { + "index": 263, + "pair_id": 
"F_origin_46_F_table_137_1", + "image_paths": [ + "data/Finance/Table/F_origin_46/F_table_137_1.png" + ], + "domain": "Finance", + "origin": "F_origin_46", + "table_id": "F_table_137_1" + }, + { + "index": 264, + "pair_id": "F_origin_46_F_table_138_0", + "image_paths": [ + "data/Finance/Table/F_origin_46/F_table_138_0.png" + ], + "domain": "Finance", + "origin": "F_origin_46", + "table_id": "F_table_138_0" + }, + { + "index": 265, + "pair_id": "F_origin_46_F_table_138_1", + "image_paths": [ + "data/Finance/Table/F_origin_46/F_table_138_1.png" + ], + "domain": "Finance", + "origin": "F_origin_46", + "table_id": "F_table_138_1" + }, + { + "index": 266, + "pair_id": "F_origin_46_F_table_139_0", + "image_paths": [ + "data/Finance/Table/F_origin_46/F_table_139_0.png" + ], + "domain": "Finance", + "origin": "F_origin_46", + "table_id": "F_table_139_0" + }, + { + "index": 267, + "pair_id": "F_origin_46_F_table_139_1", + "image_paths": [ + "data/Finance/Table/F_origin_46/F_table_139_1.png" + ], + "domain": "Finance", + "origin": "F_origin_46", + "table_id": "F_table_139_1" + }, + { + "index": 268, + "pair_id": "F_origin_47_F_table_140_0", + "image_paths": [ + "data/Finance/Table/F_origin_47/F_table_140_0.png" + ], + "domain": "Finance", + "origin": "F_origin_47", + "table_id": "F_table_140_0" + }, + { + "index": 269, + "pair_id": "F_origin_47_F_table_141_0", + "image_paths": [ + "data/Finance/Table/F_origin_47/F_table_141_0.png" + ], + "domain": "Finance", + "origin": "F_origin_47", + "table_id": "F_table_141_0" + }, + { + "index": 270, + "pair_id": "F_origin_47_F_table_141_1", + "image_paths": [ + "data/Finance/Table/F_origin_47/F_table_141_1.png" + ], + "domain": "Finance", + "origin": "F_origin_47", + "table_id": "F_table_141_1" + }, + { + "index": 271, + "pair_id": "F_origin_47_F_table_142_0", + "image_paths": [ + "data/Finance/Table/F_origin_47/F_table_142_0.png" + ], + "domain": "Finance", + "origin": "F_origin_47", + "table_id": "F_table_142_0" + }, + { + "index": 
272, + "pair_id": "F_origin_47_F_table_142_1", + "image_paths": [ + "data/Finance/Table/F_origin_47/F_table_142_1.png" + ], + "domain": "Finance", + "origin": "F_origin_47", + "table_id": "F_table_142_1" + }, + { + "index": 273, + "pair_id": "F_origin_47_F_table_142_2", + "image_paths": [ + "data/Finance/Table/F_origin_47/F_table_142_2.png" + ], + "domain": "Finance", + "origin": "F_origin_47", + "table_id": "F_table_142_2" + }, + { + "index": 274, + "pair_id": "F_origin_47_F_table_142_3", + "image_paths": [ + "data/Finance/Table/F_origin_47/F_table_142_3.png" + ], + "domain": "Finance", + "origin": "F_origin_47", + "table_id": "F_table_142_3" + }, + { + "index": 275, + "pair_id": "F_origin_48_F_table_143_0", + "image_paths": [ + "data/Finance/Table/F_origin_48/F_table_143_0.png" + ], + "domain": "Finance", + "origin": "F_origin_48", + "table_id": "F_table_143_0" + }, + { + "index": 276, + "pair_id": "F_origin_48_F_table_144_0", + "image_paths": [ + "data/Finance/Table/F_origin_48/F_table_144_0.png" + ], + "domain": "Finance", + "origin": "F_origin_48", + "table_id": "F_table_144_0" + }, + { + "index": 277, + "pair_id": "F_origin_48_F_table_144_1", + "image_paths": [ + "data/Finance/Table/F_origin_48/F_table_144_1.png" + ], + "domain": "Finance", + "origin": "F_origin_48", + "table_id": "F_table_144_1" + }, + { + "index": 278, + "pair_id": "F_origin_48_F_table_145_0", + "image_paths": [ + "data/Finance/Table/F_origin_48/F_table_145_0.png" + ], + "domain": "Finance", + "origin": "F_origin_48", + "table_id": "F_table_145_0" + }, + { + "index": 279, + "pair_id": "F_origin_48_F_table_145_1", + "image_paths": [ + "data/Finance/Table/F_origin_48/F_table_145_1.png" + ], + "domain": "Finance", + "origin": "F_origin_48", + "table_id": "F_table_145_1" + }, + { + "index": 280, + "pair_id": "F_origin_49_F_table_146_0", + "image_paths": [ + "data/Finance/Table/F_origin_49/F_table_146_0.png" + ], + "domain": "Finance", + "origin": "F_origin_49", + "table_id": "F_table_146_0" + 
}, + { + "index": 281, + "pair_id": "F_origin_49_F_table_147_0", + "image_paths": [ + "data/Finance/Table/F_origin_49/F_table_147_0.png" + ], + "domain": "Finance", + "origin": "F_origin_49", + "table_id": "F_table_147_0" + }, + { + "index": 282, + "pair_id": "F_origin_49_F_table_147_1", + "image_paths": [ + "data/Finance/Table/F_origin_49/F_table_147_1.png" + ], + "domain": "Finance", + "origin": "F_origin_49", + "table_id": "F_table_147_1" + }, + { + "index": 283, + "pair_id": "F_origin_49_F_table_147_2", + "image_paths": [ + "data/Finance/Table/F_origin_49/F_table_147_2.png" + ], + "domain": "Finance", + "origin": "F_origin_49", + "table_id": "F_table_147_2" + }, + { + "index": 284, + "pair_id": "F_origin_49_F_table_148_0", + "image_paths": [ + "data/Finance/Table/F_origin_49/F_table_148_0.png" + ], + "domain": "Finance", + "origin": "F_origin_49", + "table_id": "F_table_148_0" + }, + { + "index": 285, + "pair_id": "F_origin_49_F_table_148_1", + "image_paths": [ + "data/Finance/Table/F_origin_49/F_table_148_1.png" + ], + "domain": "Finance", + "origin": "F_origin_49", + "table_id": "F_table_148_1" + }, + { + "index": 286, + "pair_id": "F_origin_5_F_table_7_0", + "image_paths": [ + "data/Finance/Table/F_origin_5/F_table_7_0.png" + ], + "domain": "Finance", + "origin": "F_origin_5", + "table_id": "F_table_7_0" + }, + { + "index": 287, + "pair_id": "F_origin_5_F_table_7_1", + "image_paths": [ + "data/Finance/Table/F_origin_5/F_table_7_1.png" + ], + "domain": "Finance", + "origin": "F_origin_5", + "table_id": "F_table_7_1" + }, + { + "index": 288, + "pair_id": "F_origin_5_F_table_7_2", + "image_paths": [ + "data/Finance/Table/F_origin_5/F_table_7_2.png" + ], + "domain": "Finance", + "origin": "F_origin_5", + "table_id": "F_table_7_2" + }, + { + "index": 289, + "pair_id": "F_origin_5_F_table_7_3", + "image_paths": [ + "data/Finance/Table/F_origin_5/F_table_7_3.png" + ], + "domain": "Finance", + "origin": "F_origin_5", + "table_id": "F_table_7_3" + }, + { + "index": 
290, + "pair_id": "F_origin_5_F_table_7_4", + "image_paths": [ + "data/Finance/Table/F_origin_5/F_table_7_4.png" + ], + "domain": "Finance", + "origin": "F_origin_5", + "table_id": "F_table_7_4" + }, + { + "index": 291, + "pair_id": "F_origin_6_F_table_8_0", + "image_paths": [ + "data/Finance/Table/F_origin_6/F_table_8_0.png" + ], + "domain": "Finance", + "origin": "F_origin_6", + "table_id": "F_table_8_0" + }, + { + "index": 292, + "pair_id": "F_origin_6_F_table_8_1", + "image_paths": [ + "data/Finance/Table/F_origin_6/F_table_8_1.png" + ], + "domain": "Finance", + "origin": "F_origin_6", + "table_id": "F_table_8_1" + }, + { + "index": 293, + "pair_id": "F_origin_6_F_table_8_2", + "image_paths": [ + "data/Finance/Table/F_origin_6/F_table_8_2.png" + ], + "domain": "Finance", + "origin": "F_origin_6", + "table_id": "F_table_8_2" + }, + { + "index": 294, + "pair_id": "F_origin_6_F_table_8_3", + "image_paths": [ + "data/Finance/Table/F_origin_6/F_table_8_3.png" + ], + "domain": "Finance", + "origin": "F_origin_6", + "table_id": "F_table_8_3" + }, + { + "index": 295, + "pair_id": "F_origin_6_F_table_8_4", + "image_paths": [ + "data/Finance/Table/F_origin_6/F_table_8_4.png" + ], + "domain": "Finance", + "origin": "F_origin_6", + "table_id": "F_table_8_4" + }, + { + "index": 296, + "pair_id": "F_origin_6_F_table_9_0", + "image_paths": [ + "data/Finance/Table/F_origin_6/F_table_9_0.png" + ], + "domain": "Finance", + "origin": "F_origin_6", + "table_id": "F_table_9_0" + }, + { + "index": 297, + "pair_id": "F_origin_6_F_table_9_1", + "image_paths": [ + "data/Finance/Table/F_origin_6/F_table_9_1.png" + ], + "domain": "Finance", + "origin": "F_origin_6", + "table_id": "F_table_9_1" + }, + { + "index": 298, + "pair_id": "F_origin_7_F_table_10_0", + "image_paths": [ + "data/Finance/Table/F_origin_7/F_table_10_0.png" + ], + "domain": "Finance", + "origin": "F_origin_7", + "table_id": "F_table_10_0" + }, + { + "index": 299, + "pair_id": "F_origin_7_F_table_10_1", + "image_paths": 
[ + "data/Finance/Table/F_origin_7/F_table_10_1.png" + ], + "domain": "Finance", + "origin": "F_origin_7", + "table_id": "F_table_10_1" + }, + { + "index": 300, + "pair_id": "F_origin_7_F_table_10_2", + "image_paths": [ + "data/Finance/Table/F_origin_7/F_table_10_2.png" + ], + "domain": "Finance", + "origin": "F_origin_7", + "table_id": "F_table_10_2" + }, + { + "index": 301, + "pair_id": "F_origin_7_F_table_10_3", + "image_paths": [ + "data/Finance/Table/F_origin_7/F_table_10_3.png" + ], + "domain": "Finance", + "origin": "F_origin_7", + "table_id": "F_table_10_3" + }, + { + "index": 302, + "pair_id": "F_origin_7_F_table_10_4", + "image_paths": [ + "data/Finance/Table/F_origin_7/F_table_10_4.png" + ], + "domain": "Finance", + "origin": "F_origin_7", + "table_id": "F_table_10_4" + }, + { + "index": 303, + "pair_id": "F_origin_7_F_table_11_0", + "image_paths": [ + "data/Finance/Table/F_origin_7/F_table_11_0.png" + ], + "domain": "Finance", + "origin": "F_origin_7", + "table_id": "F_table_11_0" + }, + { + "index": 304, + "pair_id": "F_origin_7_F_table_11_1", + "image_paths": [ + "data/Finance/Table/F_origin_7/F_table_11_1.png" + ], + "domain": "Finance", + "origin": "F_origin_7", + "table_id": "F_table_11_1" + }, + { + "index": 305, + "pair_id": "F_origin_8_F_table_12_0", + "image_paths": [ + "data/Finance/Table/F_origin_8/F_table_12_0.png" + ], + "domain": "Finance", + "origin": "F_origin_8", + "table_id": "F_table_12_0" + }, + { + "index": 306, + "pair_id": "F_origin_8_F_table_12_1", + "image_paths": [ + "data/Finance/Table/F_origin_8/F_table_12_1.png" + ], + "domain": "Finance", + "origin": "F_origin_8", + "table_id": "F_table_12_1" + }, + { + "index": 307, + "pair_id": "F_origin_8_F_table_12_2", + "image_paths": [ + "data/Finance/Table/F_origin_8/F_table_12_2.png" + ], + "domain": "Finance", + "origin": "F_origin_8", + "table_id": "F_table_12_2" + }, + { + "index": 308, + "pair_id": "F_origin_8_F_table_12_3", + "image_paths": [ + 
"data/Finance/Table/F_origin_8/F_table_12_3.png" + ], + "domain": "Finance", + "origin": "F_origin_8", + "table_id": "F_table_12_3" + }, + { + "index": 309, + "pair_id": "F_origin_8_F_table_12_4", + "image_paths": [ + "data/Finance/Table/F_origin_8/F_table_12_4.png" + ], + "domain": "Finance", + "origin": "F_origin_8", + "table_id": "F_table_12_4" + }, + { + "index": 310, + "pair_id": "F_origin_9_F_table_13_0", + "image_paths": [ + "data/Finance/Table/F_origin_9/F_table_13_0.png" + ], + "domain": "Finance", + "origin": "F_origin_9", + "table_id": "F_table_13_0" + }, + { + "index": 311, + "pair_id": "F_origin_9_F_table_13_1", + "image_paths": [ + "data/Finance/Table/F_origin_9/F_table_13_1.png" + ], + "domain": "Finance", + "origin": "F_origin_9", + "table_id": "F_table_13_1" + }, + { + "index": 312, + "pair_id": "F_origin_9_F_table_13_2", + "image_paths": [ + "data/Finance/Table/F_origin_9/F_table_13_2.png" + ], + "domain": "Finance", + "origin": "F_origin_9", + "table_id": "F_table_13_2" + }, + { + "index": 313, + "pair_id": "F_origin_9_F_table_13_3", + "image_paths": [ + "data/Finance/Table/F_origin_9/F_table_13_3.png" + ], + "domain": "Finance", + "origin": "F_origin_9", + "table_id": "F_table_13_3" + }, + { + "index": 314, + "pair_id": "F_origin_9_F_table_13_4", + "image_paths": [ + "data/Finance/Table/F_origin_9/F_table_13_4.png" + ], + "domain": "Finance", + "origin": "F_origin_9", + "table_id": "F_table_13_4" + } +] \ No newline at end of file diff --git a/single_image_json_list/single_table_insurance_input.json b/single_image_json_list/single_table_insurance_input.json new file mode 100644 index 0000000..73dc561 --- /dev/null +++ b/single_image_json_list/single_table_insurance_input.json @@ -0,0 +1,1572 @@ +[ + { + "index": 0, + "pair_id": "I_origin_0_I_table_0", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_0.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_0" + }, + { + "index": 1, + "pair_id": 
"I_origin_0_I_table_1", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_1.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_1" + }, + { + "index": 2, + "pair_id": "I_origin_0_I_table_10", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_10.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_10" + }, + { + "index": 3, + "pair_id": "I_origin_0_I_table_11", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_11.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_11" + }, + { + "index": 4, + "pair_id": "I_origin_0_I_table_12_0", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_12_0.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_12_0" + }, + { + "index": 5, + "pair_id": "I_origin_0_I_table_12_1", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_12_1.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_12_1" + }, + { + "index": 6, + "pair_id": "I_origin_0_I_table_13", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_13.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_13" + }, + { + "index": 7, + "pair_id": "I_origin_0_I_table_14", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_14.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_14" + }, + { + "index": 8, + "pair_id": "I_origin_0_I_table_15", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_15.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_15" + }, + { + "index": 9, + "pair_id": "I_origin_0_I_table_16", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_16.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_16" + }, + { + "index": 10, + "pair_id": "I_origin_0_I_table_17", + "image_paths": [ + 
"data/Insurance/Table/I_origin_0/I_table_17.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_17" + }, + { + "index": 11, + "pair_id": "I_origin_0_I_table_18", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_18.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_18" + }, + { + "index": 12, + "pair_id": "I_origin_0_I_table_19_0", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_19_0.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_0" + }, + { + "index": 13, + "pair_id": "I_origin_0_I_table_19_1", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_19_1.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_1" + }, + { + "index": 14, + "pair_id": "I_origin_0_I_table_19_10", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_19_10.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_10" + }, + { + "index": 15, + "pair_id": "I_origin_0_I_table_19_11", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_19_11.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_11" + }, + { + "index": 16, + "pair_id": "I_origin_0_I_table_19_12", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_19_12.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_12" + }, + { + "index": 17, + "pair_id": "I_origin_0_I_table_19_13", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_19_13.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_13" + }, + { + "index": 18, + "pair_id": "I_origin_0_I_table_19_14", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_19_14.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_14" + }, + { + "index": 19, + "pair_id": "I_origin_0_I_table_19_15", + "image_paths": [ + 
"data/Insurance/Table/I_origin_0/I_table_19_15.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_15" + }, + { + "index": 20, + "pair_id": "I_origin_0_I_table_19_16", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_19_16.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_16" + }, + { + "index": 21, + "pair_id": "I_origin_0_I_table_19_17", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_19_17.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_17" + }, + { + "index": 22, + "pair_id": "I_origin_0_I_table_19_18", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_19_18.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_18" + }, + { + "index": 23, + "pair_id": "I_origin_0_I_table_19_19", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_19_19.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_19" + }, + { + "index": 24, + "pair_id": "I_origin_0_I_table_19_2", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_19_2.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_2" + }, + { + "index": 25, + "pair_id": "I_origin_0_I_table_19_20", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_19_20.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_20" + }, + { + "index": 26, + "pair_id": "I_origin_0_I_table_19_21", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_19_21.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_21" + }, + { + "index": 27, + "pair_id": "I_origin_0_I_table_19_3", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_19_3.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_3" + }, + { + "index": 28, + "pair_id": "I_origin_0_I_table_19_4", + 
"image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_19_4.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_4" + }, + { + "index": 29, + "pair_id": "I_origin_0_I_table_19_5", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_19_5.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_5" + }, + { + "index": 30, + "pair_id": "I_origin_0_I_table_19_6", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_19_6.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_6" + }, + { + "index": 31, + "pair_id": "I_origin_0_I_table_19_7", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_19_7.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_7" + }, + { + "index": 32, + "pair_id": "I_origin_0_I_table_19_8", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_19_8.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_8" + }, + { + "index": 33, + "pair_id": "I_origin_0_I_table_19_9", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_19_9.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_19_9" + }, + { + "index": 34, + "pair_id": "I_origin_0_I_table_2", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_2.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_2" + }, + { + "index": 35, + "pair_id": "I_origin_0_I_table_20_0", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_20_0.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_20_0" + }, + { + "index": 36, + "pair_id": "I_origin_0_I_table_20_1", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_20_1.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_20_1" + }, + { + "index": 37, + "pair_id": "I_origin_0_I_table_20_2", + "image_paths": [ + 
"data/Insurance/Table/I_origin_0/I_table_20_2.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_20_2" + }, + { + "index": 38, + "pair_id": "I_origin_0_I_table_20_3", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_20_3.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_20_3" + }, + { + "index": 39, + "pair_id": "I_origin_0_I_table_20_4", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_20_4.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_20_4" + }, + { + "index": 40, + "pair_id": "I_origin_0_I_table_20_5", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_20_5.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_20_5" + }, + { + "index": 41, + "pair_id": "I_origin_0_I_table_20_6", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_20_6.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_20_6" + }, + { + "index": 42, + "pair_id": "I_origin_0_I_table_20_7", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_20_7.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_20_7" + }, + { + "index": 43, + "pair_id": "I_origin_0_I_table_20_8", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_20_8.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_20_8" + }, + { + "index": 44, + "pair_id": "I_origin_0_I_table_20_9", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_20_9.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_20_9" + }, + { + "index": 45, + "pair_id": "I_origin_0_I_table_21", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_21.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_21" + }, + { + "index": 46, + "pair_id": "I_origin_0_I_table_22", + "image_paths": [ + 
"data/Insurance/Table/I_origin_0/I_table_22.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_22" + }, + { + "index": 47, + "pair_id": "I_origin_0_I_table_23", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_23.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_23" + }, + { + "index": 48, + "pair_id": "I_origin_0_I_table_24", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_24.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_24" + }, + { + "index": 49, + "pair_id": "I_origin_0_I_table_25", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_25.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_25" + }, + { + "index": 50, + "pair_id": "I_origin_0_I_table_26", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_26.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_26" + }, + { + "index": 51, + "pair_id": "I_origin_0_I_table_27", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_27.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_27" + }, + { + "index": 52, + "pair_id": "I_origin_0_I_table_28", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_28.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_28" + }, + { + "index": 53, + "pair_id": "I_origin_0_I_table_29", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_29.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_29" + }, + { + "index": 54, + "pair_id": "I_origin_0_I_table_3", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_3.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_3" + }, + { + "index": 55, + "pair_id": "I_origin_0_I_table_30", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_30.png" + ], + "domain": 
"Insurance", + "origin": "I_origin_0", + "table_id": "I_table_30" + }, + { + "index": 56, + "pair_id": "I_origin_0_I_table_31", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_31.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_31" + }, + { + "index": 57, + "pair_id": "I_origin_0_I_table_32", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_32.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_32" + }, + { + "index": 58, + "pair_id": "I_origin_0_I_table_33", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_33.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_33" + }, + { + "index": 59, + "pair_id": "I_origin_0_I_table_34", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_34.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_34" + }, + { + "index": 60, + "pair_id": "I_origin_0_I_table_35", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_35.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_35" + }, + { + "index": 61, + "pair_id": "I_origin_0_I_table_36", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_36.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_36" + }, + { + "index": 62, + "pair_id": "I_origin_0_I_table_37", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_37.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_37" + }, + { + "index": 63, + "pair_id": "I_origin_0_I_table_38", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_38.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_38" + }, + { + "index": 64, + "pair_id": "I_origin_0_I_table_39", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_39.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": 
"I_table_39" + }, + { + "index": 65, + "pair_id": "I_origin_0_I_table_4", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_4.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_4" + }, + { + "index": 66, + "pair_id": "I_origin_0_I_table_40_0", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_40_0.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_40_0" + }, + { + "index": 67, + "pair_id": "I_origin_0_I_table_40_1", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_40_1.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_40_1" + }, + { + "index": 68, + "pair_id": "I_origin_0_I_table_41", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_41.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_41" + }, + { + "index": 69, + "pair_id": "I_origin_0_I_table_42", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_42.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_42" + }, + { + "index": 70, + "pair_id": "I_origin_0_I_table_43", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_43.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_43" + }, + { + "index": 71, + "pair_id": "I_origin_0_I_table_44", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_44.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_44" + }, + { + "index": 72, + "pair_id": "I_origin_0_I_table_45", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_45.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_45" + }, + { + "index": 73, + "pair_id": "I_origin_0_I_table_46", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_46.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_46" + }, + { + "index": 74, + "pair_id": 
"I_origin_0_I_table_47", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_47.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_47" + }, + { + "index": 75, + "pair_id": "I_origin_0_I_table_48", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_48.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_48" + }, + { + "index": 76, + "pair_id": "I_origin_0_I_table_49", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_49.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_49" + }, + { + "index": 77, + "pair_id": "I_origin_0_I_table_5", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_5.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_5" + }, + { + "index": 78, + "pair_id": "I_origin_0_I_table_50", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_50.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_50" + }, + { + "index": 79, + "pair_id": "I_origin_0_I_table_51", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_51.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_51" + }, + { + "index": 80, + "pair_id": "I_origin_0_I_table_52", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_52.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_52" + }, + { + "index": 81, + "pair_id": "I_origin_0_I_table_53", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_53.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_53" + }, + { + "index": 82, + "pair_id": "I_origin_0_I_table_54", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_54.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_54" + }, + { + "index": 83, + "pair_id": "I_origin_0_I_table_55", + "image_paths": [ + 
"data/Insurance/Table/I_origin_0/I_table_55.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_55" + }, + { + "index": 84, + "pair_id": "I_origin_0_I_table_56", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_56.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_56" + }, + { + "index": 85, + "pair_id": "I_origin_0_I_table_57", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_57.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_57" + }, + { + "index": 86, + "pair_id": "I_origin_0_I_table_58", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_58.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_58" + }, + { + "index": 87, + "pair_id": "I_origin_0_I_table_59", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_59.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_59" + }, + { + "index": 88, + "pair_id": "I_origin_0_I_table_6", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_6.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_6" + }, + { + "index": 89, + "pair_id": "I_origin_0_I_table_60", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_60.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_60" + }, + { + "index": 90, + "pair_id": "I_origin_0_I_table_61", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_61.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_61" + }, + { + "index": 91, + "pair_id": "I_origin_0_I_table_62", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_62.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_62" + }, + { + "index": 92, + "pair_id": "I_origin_0_I_table_63", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_63.png" + ], + "domain": 
"Insurance", + "origin": "I_origin_0", + "table_id": "I_table_63" + }, + { + "index": 93, + "pair_id": "I_origin_0_I_table_64_0", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_0.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_0" + }, + { + "index": 94, + "pair_id": "I_origin_0_I_table_64_1", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_1.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_1" + }, + { + "index": 95, + "pair_id": "I_origin_0_I_table_64_10", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_10.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_10" + }, + { + "index": 96, + "pair_id": "I_origin_0_I_table_64_11", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_11.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_11" + }, + { + "index": 97, + "pair_id": "I_origin_0_I_table_64_12", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_12.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_12" + }, + { + "index": 98, + "pair_id": "I_origin_0_I_table_64_13", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_13.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_13" + }, + { + "index": 99, + "pair_id": "I_origin_0_I_table_64_14", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_14.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_14" + }, + { + "index": 100, + "pair_id": "I_origin_0_I_table_64_15", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_15.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_15" + }, + { + "index": 101, + "pair_id": "I_origin_0_I_table_64_16", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_16.png" + ], + 
"domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_16" + }, + { + "index": 102, + "pair_id": "I_origin_0_I_table_64_17", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_17.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_17" + }, + { + "index": 103, + "pair_id": "I_origin_0_I_table_64_18", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_18.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_18" + }, + { + "index": 104, + "pair_id": "I_origin_0_I_table_64_19", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_19.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_19" + }, + { + "index": 105, + "pair_id": "I_origin_0_I_table_64_2", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_2.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_2" + }, + { + "index": 106, + "pair_id": "I_origin_0_I_table_64_20", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_20.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_20" + }, + { + "index": 107, + "pair_id": "I_origin_0_I_table_64_21", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_21.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_21" + }, + { + "index": 108, + "pair_id": "I_origin_0_I_table_64_22", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_22.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_22" + }, + { + "index": 109, + "pair_id": "I_origin_0_I_table_64_23", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_23.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_23" + }, + { + "index": 110, + "pair_id": "I_origin_0_I_table_64_24", + "image_paths": [ + 
"data/Insurance/Table/I_origin_0/I_table_64_24.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_24" + }, + { + "index": 111, + "pair_id": "I_origin_0_I_table_64_25", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_25.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_25" + }, + { + "index": 112, + "pair_id": "I_origin_0_I_table_64_26", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_26.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_26" + }, + { + "index": 113, + "pair_id": "I_origin_0_I_table_64_27", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_27.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_27" + }, + { + "index": 114, + "pair_id": "I_origin_0_I_table_64_3", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_3.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_3" + }, + { + "index": 115, + "pair_id": "I_origin_0_I_table_64_4", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_4.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_4" + }, + { + "index": 116, + "pair_id": "I_origin_0_I_table_64_5", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_5.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_5" + }, + { + "index": 117, + "pair_id": "I_origin_0_I_table_64_6", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_6.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_6" + }, + { + "index": 118, + "pair_id": "I_origin_0_I_table_64_7", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_7.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_7" + }, + { + "index": 119, + "pair_id": "I_origin_0_I_table_64_8", + 
"image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_8.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_8" + }, + { + "index": 120, + "pair_id": "I_origin_0_I_table_64_9", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_64_9.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_64_9" + }, + { + "index": 121, + "pair_id": "I_origin_0_I_table_65_0", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_65_0.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_65_0" + }, + { + "index": 122, + "pair_id": "I_origin_0_I_table_65_1", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_65_1.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_65_1" + }, + { + "index": 123, + "pair_id": "I_origin_0_I_table_65_2", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_65_2.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_65_2" + }, + { + "index": 124, + "pair_id": "I_origin_0_I_table_66", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_66.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_66" + }, + { + "index": 125, + "pair_id": "I_origin_0_I_table_67", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_67.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_67" + }, + { + "index": 126, + "pair_id": "I_origin_0_I_table_68", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_68.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_68" + }, + { + "index": 127, + "pair_id": "I_origin_0_I_table_69", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_69.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_69" + }, + { + "index": 128, + "pair_id": "I_origin_0_I_table_7", + "image_paths": [ + 
"data/Insurance/Table/I_origin_0/I_table_7.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_7" + }, + { + "index": 129, + "pair_id": "I_origin_0_I_table_70", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_70.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_70" + }, + { + "index": 130, + "pair_id": "I_origin_0_I_table_71", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_71.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_71" + }, + { + "index": 131, + "pair_id": "I_origin_0_I_table_72", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_72.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_72" + }, + { + "index": 132, + "pair_id": "I_origin_0_I_table_73", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_73.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_73" + }, + { + "index": 133, + "pair_id": "I_origin_0_I_table_74", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_74.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_74" + }, + { + "index": 134, + "pair_id": "I_origin_0_I_table_75", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_75.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_75" + }, + { + "index": 135, + "pair_id": "I_origin_0_I_table_76", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_76.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_76" + }, + { + "index": 136, + "pair_id": "I_origin_0_I_table_77", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_77.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_77" + }, + { + "index": 137, + "pair_id": "I_origin_0_I_table_8", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_8.png" + ], + 
"domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_8" + }, + { + "index": 138, + "pair_id": "I_origin_0_I_table_9", + "image_paths": [ + "data/Insurance/Table/I_origin_0/I_table_9.png" + ], + "domain": "Insurance", + "origin": "I_origin_0", + "table_id": "I_table_9" + }, + { + "index": 139, + "pair_id": "I_origin_1_I_table_78", + "image_paths": [ + "data/Insurance/Table/I_origin_1/I_table_78.png" + ], + "domain": "Insurance", + "origin": "I_origin_1", + "table_id": "I_table_78" + }, + { + "index": 140, + "pair_id": "I_origin_2_I_table_79", + "image_paths": [ + "data/Insurance/Table/I_origin_2/I_table_79.png" + ], + "domain": "Insurance", + "origin": "I_origin_2", + "table_id": "I_table_79" + }, + { + "index": 141, + "pair_id": "I_origin_2_I_table_80", + "image_paths": [ + "data/Insurance/Table/I_origin_2/I_table_80.png" + ], + "domain": "Insurance", + "origin": "I_origin_2", + "table_id": "I_table_80" + }, + { + "index": 142, + "pair_id": "I_origin_2_I_table_81", + "image_paths": [ + "data/Insurance/Table/I_origin_2/I_table_81.png" + ], + "domain": "Insurance", + "origin": "I_origin_2", + "table_id": "I_table_81" + }, + { + "index": 143, + "pair_id": "I_origin_2_I_table_82", + "image_paths": [ + "data/Insurance/Table/I_origin_2/I_table_82.png" + ], + "domain": "Insurance", + "origin": "I_origin_2", + "table_id": "I_table_82" + }, + { + "index": 144, + "pair_id": "I_origin_2_I_table_83", + "image_paths": [ + "data/Insurance/Table/I_origin_2/I_table_83.png" + ], + "domain": "Insurance", + "origin": "I_origin_2", + "table_id": "I_table_83" + }, + { + "index": 145, + "pair_id": "I_origin_2_I_table_84", + "image_paths": [ + "data/Insurance/Table/I_origin_2/I_table_84.png" + ], + "domain": "Insurance", + "origin": "I_origin_2", + "table_id": "I_table_84" + }, + { + "index": 146, + "pair_id": "I_origin_2_I_table_85", + "image_paths": [ + "data/Insurance/Table/I_origin_2/I_table_85.png" + ], + "domain": "Insurance", + "origin": "I_origin_2", + 
"table_id": "I_table_85" + }, + { + "index": 147, + "pair_id": "I_origin_2_I_table_86", + "image_paths": [ + "data/Insurance/Table/I_origin_2/I_table_86.png" + ], + "domain": "Insurance", + "origin": "I_origin_2", + "table_id": "I_table_86" + }, + { + "index": 148, + "pair_id": "I_origin_2_I_table_87", + "image_paths": [ + "data/Insurance/Table/I_origin_2/I_table_87.png" + ], + "domain": "Insurance", + "origin": "I_origin_2", + "table_id": "I_table_87" + }, + { + "index": 149, + "pair_id": "I_origin_2_I_table_88", + "image_paths": [ + "data/Insurance/Table/I_origin_2/I_table_88.png" + ], + "domain": "Insurance", + "origin": "I_origin_2", + "table_id": "I_table_88" + }, + { + "index": 150, + "pair_id": "I_origin_2_I_table_89", + "image_paths": [ + "data/Insurance/Table/I_origin_2/I_table_89.png" + ], + "domain": "Insurance", + "origin": "I_origin_2", + "table_id": "I_table_89" + }, + { + "index": 151, + "pair_id": "I_origin_2_I_table_90", + "image_paths": [ + "data/Insurance/Table/I_origin_2/I_table_90.png" + ], + "domain": "Insurance", + "origin": "I_origin_2", + "table_id": "I_table_90" + }, + { + "index": 152, + "pair_id": "I_origin_2_I_table_91", + "image_paths": [ + "data/Insurance/Table/I_origin_2/I_table_91.png" + ], + "domain": "Insurance", + "origin": "I_origin_2", + "table_id": "I_table_91" + }, + { + "index": 153, + "pair_id": "I_origin_2_I_table_92", + "image_paths": [ + "data/Insurance/Table/I_origin_2/I_table_92.png" + ], + "domain": "Insurance", + "origin": "I_origin_2", + "table_id": "I_table_92" + }, + { + "index": 154, + "pair_id": "I_origin_2_I_table_93", + "image_paths": [ + "data/Insurance/Table/I_origin_2/I_table_93.png" + ], + "domain": "Insurance", + "origin": "I_origin_2", + "table_id": "I_table_93" + }, + { + "index": 155, + "pair_id": "I_origin_2_I_table_94", + "image_paths": [ + "data/Insurance/Table/I_origin_2/I_table_94.png" + ], + "domain": "Insurance", + "origin": "I_origin_2", + "table_id": "I_table_94" + }, + { + "index": 156, + 
"pair_id": "I_origin_2_I_table_95", + "image_paths": [ + "data/Insurance/Table/I_origin_2/I_table_95.png" + ], + "domain": "Insurance", + "origin": "I_origin_2", + "table_id": "I_table_95" + } +] \ No newline at end of file diff --git a/single_image_json_list/single_table_medical_input.json b/single_image_json_list/single_table_medical_input.json new file mode 100644 index 0000000..3978487 --- /dev/null +++ b/single_image_json_list/single_table_medical_input.json @@ -0,0 +1,1292 @@ +[ + { + "index": 0, + "pair_id": "Medical_M_table_0_0_0", + "image_paths": [ + "data/Medical/Table/M_table_0_0_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_0_0_0" + }, + { + "index": 1, + "pair_id": "Medical_M_table_0_1_0", + "image_paths": [ + "data/Medical/Table/M_table_0_1_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_0_1_0" + }, + { + "index": 2, + "pair_id": "Medical_M_table_10_0_0", + "image_paths": [ + "data/Medical/Table/M_table_10_0_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_0_0" + }, + { + "index": 3, + "pair_id": "Medical_M_table_10_0_1", + "image_paths": [ + "data/Medical/Table/M_table_10_0_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_0_1" + }, + { + "index": 4, + "pair_id": "Medical_M_table_10_0_2", + "image_paths": [ + "data/Medical/Table/M_table_10_0_2.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_0_2" + }, + { + "index": 5, + "pair_id": "Medical_M_table_10_1_0", + "image_paths": [ + "data/Medical/Table/M_table_10_1_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_1_0" + }, + { + "index": 6, + "pair_id": "Medical_M_table_10_1_1", + "image_paths": [ + "data/Medical/Table/M_table_10_1_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_1_1" + }, + { + "index": 7, + "pair_id": "Medical_M_table_10_1_2", + "image_paths": [ + 
"data/Medical/Table/M_table_10_1_2.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_1_2" + }, + { + "index": 8, + "pair_id": "Medical_M_table_10_1_3", + "image_paths": [ + "data/Medical/Table/M_table_10_1_3.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_1_3" + }, + { + "index": 9, + "pair_id": "Medical_M_table_10_2_0", + "image_paths": [ + "data/Medical/Table/M_table_10_2_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_2_0" + }, + { + "index": 10, + "pair_id": "Medical_M_table_10_2_1", + "image_paths": [ + "data/Medical/Table/M_table_10_2_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_2_1" + }, + { + "index": 11, + "pair_id": "Medical_M_table_10_2_2", + "image_paths": [ + "data/Medical/Table/M_table_10_2_2.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_2_2" + }, + { + "index": 12, + "pair_id": "Medical_M_table_10_2_3", + "image_paths": [ + "data/Medical/Table/M_table_10_2_3.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_2_3" + }, + { + "index": 13, + "pair_id": "Medical_M_table_10_2_4", + "image_paths": [ + "data/Medical/Table/M_table_10_2_4.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_2_4" + }, + { + "index": 14, + "pair_id": "Medical_M_table_10_2_5", + "image_paths": [ + "data/Medical/Table/M_table_10_2_5.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_2_5" + }, + { + "index": 15, + "pair_id": "Medical_M_table_10_3_0", + "image_paths": [ + "data/Medical/Table/M_table_10_3_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_3_0" + }, + { + "index": 16, + "pair_id": "Medical_M_table_10_3_1", + "image_paths": [ + "data/Medical/Table/M_table_10_3_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_3_1" + }, + { + "index": 17, + 
"pair_id": "Medical_M_table_10_3_2", + "image_paths": [ + "data/Medical/Table/M_table_10_3_2.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_3_2" + }, + { + "index": 18, + "pair_id": "Medical_M_table_10_3_3", + "image_paths": [ + "data/Medical/Table/M_table_10_3_3.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_3_3" + }, + { + "index": 19, + "pair_id": "Medical_M_table_10_3_4", + "image_paths": [ + "data/Medical/Table/M_table_10_3_4.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_3_4" + }, + { + "index": 20, + "pair_id": "Medical_M_table_10_4_0", + "image_paths": [ + "data/Medical/Table/M_table_10_4_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_4_0" + }, + { + "index": 21, + "pair_id": "Medical_M_table_10_4_1", + "image_paths": [ + "data/Medical/Table/M_table_10_4_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_4_1" + }, + { + "index": 22, + "pair_id": "Medical_M_table_10_4_2", + "image_paths": [ + "data/Medical/Table/M_table_10_4_2.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_4_2" + }, + { + "index": 23, + "pair_id": "Medical_M_table_10_4_3", + "image_paths": [ + "data/Medical/Table/M_table_10_4_3.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_4_3" + }, + { + "index": 24, + "pair_id": "Medical_M_table_10_4_4", + "image_paths": [ + "data/Medical/Table/M_table_10_4_4.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_4_4" + }, + { + "index": 25, + "pair_id": "Medical_M_table_10_5_0", + "image_paths": [ + "data/Medical/Table/M_table_10_5_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_10_5_0" + }, + { + "index": 26, + "pair_id": "Medical_M_table_11_0_0", + "image_paths": [ + "data/Medical/Table/M_table_11_0_0.png" + ], + "domain": "Medical", + "origin": 
"Medical", + "table_id": "M_table_11_0_0" + }, + { + "index": 27, + "pair_id": "Medical_M_table_11_0_1", + "image_paths": [ + "data/Medical/Table/M_table_11_0_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_11_0_1" + }, + { + "index": 28, + "pair_id": "Medical_M_table_11_0_2", + "image_paths": [ + "data/Medical/Table/M_table_11_0_2.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_11_0_2" + }, + { + "index": 29, + "pair_id": "Medical_M_table_11_0_3", + "image_paths": [ + "data/Medical/Table/M_table_11_0_3.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_11_0_3" + }, + { + "index": 30, + "pair_id": "Medical_M_table_11_0_4", + "image_paths": [ + "data/Medical/Table/M_table_11_0_4.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_11_0_4" + }, + { + "index": 31, + "pair_id": "Medical_M_table_12_0_0", + "image_paths": [ + "data/Medical/Table/M_table_12_0_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_12_0_0" + }, + { + "index": 32, + "pair_id": "Medical_M_table_13_0_0", + "image_paths": [ + "data/Medical/Table/M_table_13_0_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_13_0_0" + }, + { + "index": 33, + "pair_id": "Medical_M_table_13_0_1", + "image_paths": [ + "data/Medical/Table/M_table_13_0_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_13_0_1" + }, + { + "index": 34, + "pair_id": "Medical_M_table_13_0_2", + "image_paths": [ + "data/Medical/Table/M_table_13_0_2.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_13_0_2" + }, + { + "index": 35, + "pair_id": "Medical_M_table_13_0_3", + "image_paths": [ + "data/Medical/Table/M_table_13_0_3.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_13_0_3" + }, + { + "index": 36, + "pair_id": "Medical_M_table_13_1_0", + "image_paths": [ + 
"data/Medical/Table/M_table_13_1_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_13_1_0" + }, + { + "index": 37, + "pair_id": "Medical_M_table_13_1_1", + "image_paths": [ + "data/Medical/Table/M_table_13_1_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_13_1_1" + }, + { + "index": 38, + "pair_id": "Medical_M_table_13_1_10", + "image_paths": [ + "data/Medical/Table/M_table_13_1_10.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_13_1_10" + }, + { + "index": 39, + "pair_id": "Medical_M_table_13_1_11", + "image_paths": [ + "data/Medical/Table/M_table_13_1_11.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_13_1_11" + }, + { + "index": 40, + "pair_id": "Medical_M_table_13_1_2", + "image_paths": [ + "data/Medical/Table/M_table_13_1_2.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_13_1_2" + }, + { + "index": 41, + "pair_id": "Medical_M_table_13_1_6", + "image_paths": [ + "data/Medical/Table/M_table_13_1_6.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_13_1_6" + }, + { + "index": 42, + "pair_id": "Medical_M_table_13_1_8", + "image_paths": [ + "data/Medical/Table/M_table_13_1_8.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_13_1_8" + }, + { + "index": 43, + "pair_id": "Medical_M_table_13_2_0", + "image_paths": [ + "data/Medical/Table/M_table_13_2_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_13_2_0" + }, + { + "index": 44, + "pair_id": "Medical_M_table_13_2_1", + "image_paths": [ + "data/Medical/Table/M_table_13_2_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_13_2_1" + }, + { + "index": 45, + "pair_id": "Medical_M_table_14_0_0", + "image_paths": [ + "data/Medical/Table/M_table_14_0_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_14_0_0" + }, + { + 
"index": 46, + "pair_id": "Medical_M_table_14_0_1", + "image_paths": [ + "data/Medical/Table/M_table_14_0_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_14_0_1" + }, + { + "index": 47, + "pair_id": "Medical_M_table_14_1_0", + "image_paths": [ + "data/Medical/Table/M_table_14_1_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_14_1_0" + }, + { + "index": 48, + "pair_id": "Medical_M_table_14_2_0", + "image_paths": [ + "data/Medical/Table/M_table_14_2_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_14_2_0" + }, + { + "index": 49, + "pair_id": "Medical_M_table_14_3_0", + "image_paths": [ + "data/Medical/Table/M_table_14_3_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_14_3_0" + }, + { + "index": 50, + "pair_id": "Medical_M_table_14_3_1", + "image_paths": [ + "data/Medical/Table/M_table_14_3_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_14_3_1" + }, + { + "index": 51, + "pair_id": "Medical_M_table_14_3_2", + "image_paths": [ + "data/Medical/Table/M_table_14_3_2.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_14_3_2" + }, + { + "index": 52, + "pair_id": "Medical_M_table_14_3_3", + "image_paths": [ + "data/Medical/Table/M_table_14_3_3.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_14_3_3" + }, + { + "index": 53, + "pair_id": "Medical_M_table_14_3_4", + "image_paths": [ + "data/Medical/Table/M_table_14_3_4.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_14_3_4" + }, + { + "index": 54, + "pair_id": "Medical_M_table_15_0_0", + "image_paths": [ + "data/Medical/Table/M_table_15_0_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_15_0_0" + }, + { + "index": 55, + "pair_id": "Medical_M_table_15_0_1", + "image_paths": [ + "data/Medical/Table/M_table_15_0_1.png" + ], + "domain": "Medical", + 
"origin": "Medical", + "table_id": "M_table_15_0_1" + }, + { + "index": 56, + "pair_id": "Medical_M_table_15_0_2", + "image_paths": [ + "data/Medical/Table/M_table_15_0_2.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_15_0_2" + }, + { + "index": 57, + "pair_id": "Medical_M_table_15_10_0", + "image_paths": [ + "data/Medical/Table/M_table_15_10_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_15_10_0" + }, + { + "index": 58, + "pair_id": "Medical_M_table_15_10_1", + "image_paths": [ + "data/Medical/Table/M_table_15_10_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_15_10_1" + }, + { + "index": 59, + "pair_id": "Medical_M_table_15_11_0", + "image_paths": [ + "data/Medical/Table/M_table_15_11_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_15_11_0" + }, + { + "index": 60, + "pair_id": "Medical_M_table_15_12_0", + "image_paths": [ + "data/Medical/Table/M_table_15_12_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_15_12_0" + }, + { + "index": 61, + "pair_id": "Medical_M_table_15_13_0", + "image_paths": [ + "data/Medical/Table/M_table_15_13_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_15_13_0" + }, + { + "index": 62, + "pair_id": "Medical_M_table_15_1_0", + "image_paths": [ + "data/Medical/Table/M_table_15_1_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_15_1_0" + }, + { + "index": 63, + "pair_id": "Medical_M_table_15_2_0", + "image_paths": [ + "data/Medical/Table/M_table_15_2_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_15_2_0" + }, + { + "index": 64, + "pair_id": "Medical_M_table_15_3_0", + "image_paths": [ + "data/Medical/Table/M_table_15_3_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_15_3_0" + }, + { + "index": 65, + "pair_id": "Medical_M_table_15_4_0", + 
"image_paths": [ + "data/Medical/Table/M_table_15_4_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_15_4_0" + }, + { + "index": 66, + "pair_id": "Medical_M_table_15_5_0", + "image_paths": [ + "data/Medical/Table/M_table_15_5_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_15_5_0" + }, + { + "index": 67, + "pair_id": "Medical_M_table_15_5_1", + "image_paths": [ + "data/Medical/Table/M_table_15_5_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_15_5_1" + }, + { + "index": 68, + "pair_id": "Medical_M_table_15_5_2", + "image_paths": [ + "data/Medical/Table/M_table_15_5_2.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_15_5_2" + }, + { + "index": 69, + "pair_id": "Medical_M_table_15_6_0", + "image_paths": [ + "data/Medical/Table/M_table_15_6_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_15_6_0" + }, + { + "index": 70, + "pair_id": "Medical_M_table_15_7_0", + "image_paths": [ + "data/Medical/Table/M_table_15_7_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_15_7_0" + }, + { + "index": 71, + "pair_id": "Medical_M_table_15_8_0", + "image_paths": [ + "data/Medical/Table/M_table_15_8_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_15_8_0" + }, + { + "index": 72, + "pair_id": "Medical_M_table_15_9_0", + "image_paths": [ + "data/Medical/Table/M_table_15_9_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_15_9_0" + }, + { + "index": 73, + "pair_id": "Medical_M_table_16_0_0", + "image_paths": [ + "data/Medical/Table/M_table_16_0_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_16_0_0" + }, + { + "index": 74, + "pair_id": "Medical_M_table_16_0_1", + "image_paths": [ + "data/Medical/Table/M_table_16_0_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_16_0_1" + }, 
+ { + "index": 75, + "pair_id": "Medical_M_table_16_1_0", + "image_paths": [ + "data/Medical/Table/M_table_16_1_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_16_1_0" + }, + { + "index": 76, + "pair_id": "Medical_M_table_16_1_1", + "image_paths": [ + "data/Medical/Table/M_table_16_1_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_16_1_1" + }, + { + "index": 77, + "pair_id": "Medical_M_table_1_0_0", + "image_paths": [ + "data/Medical/Table/M_table_1_0_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_1_0_0" + }, + { + "index": 78, + "pair_id": "Medical_M_table_2_0_0", + "image_paths": [ + "data/Medical/Table/M_table_2_0_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_2_0_0" + }, + { + "index": 79, + "pair_id": "Medical_M_table_2_1_0", + "image_paths": [ + "data/Medical/Table/M_table_2_1_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_2_1_0" + }, + { + "index": 80, + "pair_id": "Medical_M_table_2_2_0", + "image_paths": [ + "data/Medical/Table/M_table_2_2_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_2_2_0" + }, + { + "index": 81, + "pair_id": "Medical_M_table_2_2_1", + "image_paths": [ + "data/Medical/Table/M_table_2_2_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_2_2_1" + }, + { + "index": 82, + "pair_id": "Medical_M_table_2_3_0", + "image_paths": [ + "data/Medical/Table/M_table_2_3_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_2_3_0" + }, + { + "index": 83, + "pair_id": "Medical_M_table_2_3_1", + "image_paths": [ + "data/Medical/Table/M_table_2_3_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_2_3_1" + }, + { + "index": 84, + "pair_id": "Medical_M_table_2_4_0", + "image_paths": [ + "data/Medical/Table/M_table_2_4_0.png" + ], + "domain": "Medical", + "origin": 
"Medical", + "table_id": "M_table_2_4_0" + }, + { + "index": 85, + "pair_id": "Medical_M_table_2_4_1", + "image_paths": [ + "data/Medical/Table/M_table_2_4_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_2_4_1" + }, + { + "index": 86, + "pair_id": "Medical_M_table_2_5_0", + "image_paths": [ + "data/Medical/Table/M_table_2_5_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_2_5_0" + }, + { + "index": 87, + "pair_id": "Medical_M_table_2_6_0", + "image_paths": [ + "data/Medical/Table/M_table_2_6_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_2_6_0" + }, + { + "index": 88, + "pair_id": "Medical_M_table_2_6_1", + "image_paths": [ + "data/Medical/Table/M_table_2_6_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_2_6_1" + }, + { + "index": 89, + "pair_id": "Medical_M_table_2_6_2", + "image_paths": [ + "data/Medical/Table/M_table_2_6_2.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_2_6_2" + }, + { + "index": 90, + "pair_id": "Medical_M_table_2_6_3", + "image_paths": [ + "data/Medical/Table/M_table_2_6_3.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_2_6_3" + }, + { + "index": 91, + "pair_id": "Medical_M_table_2_6_4", + "image_paths": [ + "data/Medical/Table/M_table_2_6_4.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_2_6_4" + }, + { + "index": 92, + "pair_id": "Medical_M_table_2_6_5", + "image_paths": [ + "data/Medical/Table/M_table_2_6_5.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_2_6_5" + }, + { + "index": 93, + "pair_id": "Medical_M_table_2_6_6", + "image_paths": [ + "data/Medical/Table/M_table_2_6_6.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_2_6_6" + }, + { + "index": 94, + "pair_id": "Medical_M_table_2_6_7", + "image_paths": [ + "data/Medical/Table/M_table_2_6_7.png" + ], + 
"domain": "Medical", + "origin": "Medical", + "table_id": "M_table_2_6_7" + }, + { + "index": 95, + "pair_id": "Medical_M_table_2_7_0", + "image_paths": [ + "data/Medical/Table/M_table_2_7_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_2_7_0" + }, + { + "index": 96, + "pair_id": "Medical_M_table_3_0_0", + "image_paths": [ + "data/Medical/Table/M_table_3_0_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_3_0_0" + }, + { + "index": 97, + "pair_id": "Medical_M_table_3_0_1", + "image_paths": [ + "data/Medical/Table/M_table_3_0_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_3_0_1" + }, + { + "index": 98, + "pair_id": "Medical_M_table_3_0_2", + "image_paths": [ + "data/Medical/Table/M_table_3_0_2.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_3_0_2" + }, + { + "index": 99, + "pair_id": "Medical_M_table_3_1_0", + "image_paths": [ + "data/Medical/Table/M_table_3_1_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_3_1_0" + }, + { + "index": 100, + "pair_id": "Medical_M_table_3_1_1", + "image_paths": [ + "data/Medical/Table/M_table_3_1_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_3_1_1" + }, + { + "index": 101, + "pair_id": "Medical_M_table_3_2_0", + "image_paths": [ + "data/Medical/Table/M_table_3_2_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_3_2_0" + }, + { + "index": 102, + "pair_id": "Medical_M_table_3_2_1", + "image_paths": [ + "data/Medical/Table/M_table_3_2_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_3_2_1" + }, + { + "index": 103, + "pair_id": "Medical_M_table_3_3_0", + "image_paths": [ + "data/Medical/Table/M_table_3_3_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_3_3_0" + }, + { + "index": 104, + "pair_id": "Medical_M_table_3_3_1", + "image_paths": [ + 
"data/Medical/Table/M_table_3_3_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_3_3_1" + }, + { + "index": 105, + "pair_id": "Medical_M_table_4_0_0", + "image_paths": [ + "data/Medical/Table/M_table_4_0_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_4_0_0" + }, + { + "index": 106, + "pair_id": "Medical_M_table_4_0_1", + "image_paths": [ + "data/Medical/Table/M_table_4_0_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_4_0_1" + }, + { + "index": 107, + "pair_id": "Medical_M_table_4_1_0", + "image_paths": [ + "data/Medical/Table/M_table_4_1_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_4_1_0" + }, + { + "index": 108, + "pair_id": "Medical_M_table_5_0_0", + "image_paths": [ + "data/Medical/Table/M_table_5_0_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_5_0_0" + }, + { + "index": 109, + "pair_id": "Medical_M_table_6_0_0", + "image_paths": [ + "data/Medical/Table/M_table_6_0_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_6_0_0" + }, + { + "index": 110, + "pair_id": "Medical_M_table_6_1_0", + "image_paths": [ + "data/Medical/Table/M_table_6_1_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_6_1_0" + }, + { + "index": 111, + "pair_id": "Medical_M_table_6_2_0", + "image_paths": [ + "data/Medical/Table/M_table_6_2_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_6_2_0" + }, + { + "index": 112, + "pair_id": "Medical_M_table_6_3_0", + "image_paths": [ + "data/Medical/Table/M_table_6_3_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_6_3_0" + }, + { + "index": 113, + "pair_id": "Medical_M_table_6_3_1", + "image_paths": [ + "data/Medical/Table/M_table_6_3_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_6_3_1" + }, + { + "index": 114, + "pair_id": 
"Medical_M_table_8_0_0", + "image_paths": [ + "data/Medical/Table/M_table_8_0_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_8_0_0" + }, + { + "index": 115, + "pair_id": "Medical_M_table_8_0_1", + "image_paths": [ + "data/Medical/Table/M_table_8_0_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_8_0_1" + }, + { + "index": 116, + "pair_id": "Medical_M_table_8_0_2", + "image_paths": [ + "data/Medical/Table/M_table_8_0_2.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_8_0_2" + }, + { + "index": 117, + "pair_id": "Medical_M_table_8_1_0", + "image_paths": [ + "data/Medical/Table/M_table_8_1_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_8_1_0" + }, + { + "index": 118, + "pair_id": "Medical_M_table_8_2_0", + "image_paths": [ + "data/Medical/Table/M_table_8_2_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_8_2_0" + }, + { + "index": 119, + "pair_id": "Medical_M_table_8_3_0", + "image_paths": [ + "data/Medical/Table/M_table_8_3_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_8_3_0" + }, + { + "index": 120, + "pair_id": "Medical_M_table_9_0_0", + "image_paths": [ + "data/Medical/Table/M_table_9_0_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_9_0_0" + }, + { + "index": 121, + "pair_id": "Medical_M_table_9_0_1", + "image_paths": [ + "data/Medical/Table/M_table_9_0_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_9_0_1" + }, + { + "index": 122, + "pair_id": "Medical_M_table_9_0_2", + "image_paths": [ + "data/Medical/Table/M_table_9_0_2.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_9_0_2" + }, + { + "index": 123, + "pair_id": "Medical_M_table_9_1_0", + "image_paths": [ + "data/Medical/Table/M_table_9_1_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": 
"M_table_9_1_0" + }, + { + "index": 124, + "pair_id": "Medical_M_table_9_1_1", + "image_paths": [ + "data/Medical/Table/M_table_9_1_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_9_1_1" + }, + { + "index": 125, + "pair_id": "Medical_M_table_9_1_2", + "image_paths": [ + "data/Medical/Table/M_table_9_1_2.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_9_1_2" + }, + { + "index": 126, + "pair_id": "Medical_M_table_9_2_0", + "image_paths": [ + "data/Medical/Table/M_table_9_2_0.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_9_2_0" + }, + { + "index": 127, + "pair_id": "Medical_M_table_9_2_1", + "image_paths": [ + "data/Medical/Table/M_table_9_2_1.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_9_2_1" + }, + { + "index": 128, + "pair_id": "Medical_M_table_9_2_2", + "image_paths": [ + "data/Medical/Table/M_table_9_2_2.png" + ], + "domain": "Medical", + "origin": "Medical", + "table_id": "M_table_9_2_2" + } +] \ No newline at end of file diff --git a/single_image_json_list/single_table_public_input.json b/single_image_json_list/single_table_public_input.json new file mode 100644 index 0000000..b7d33d5 --- /dev/null +++ b/single_image_json_list/single_table_public_input.json @@ -0,0 +1,2492 @@ +[ + { + "index": 0, + "pair_id": "P_origin_0_P_origin_0_0", + "image_paths": [ + "data/Public/Table/P_origin_0/P_origin_0_0.png" + ], + "domain": "Public", + "origin": "P_origin_0", + "table_id": "P_origin_0_0" + }, + { + "index": 1, + "pair_id": "P_origin_0_P_origin_0_1_0", + "image_paths": [ + "data/Public/Table/P_origin_0/P_origin_0_1_0.png" + ], + "domain": "Public", + "origin": "P_origin_0", + "table_id": "P_origin_0_1_0" + }, + { + "index": 2, + "pair_id": "P_origin_0_P_origin_0_1_1", + "image_paths": [ + "data/Public/Table/P_origin_0/P_origin_0_1_1.png" + ], + "domain": "Public", + "origin": "P_origin_0", + "table_id": "P_origin_0_1_1" + }, + { + 
"index": 3, + "pair_id": "P_origin_0_P_origin_0_1_2", + "image_paths": [ + "data/Public/Table/P_origin_0/P_origin_0_1_2.png" + ], + "domain": "Public", + "origin": "P_origin_0", + "table_id": "P_origin_0_1_2" + }, + { + "index": 4, + "pair_id": "P_origin_0_P_origin_0_1_3", + "image_paths": [ + "data/Public/Table/P_origin_0/P_origin_0_1_3.png" + ], + "domain": "Public", + "origin": "P_origin_0", + "table_id": "P_origin_0_1_3" + }, + { + "index": 5, + "pair_id": "P_origin_0_P_origin_0_1_4", + "image_paths": [ + "data/Public/Table/P_origin_0/P_origin_0_1_4.png" + ], + "domain": "Public", + "origin": "P_origin_0", + "table_id": "P_origin_0_1_4" + }, + { + "index": 6, + "pair_id": "P_origin_0_P_origin_0_1_5", + "image_paths": [ + "data/Public/Table/P_origin_0/P_origin_0_1_5.png" + ], + "domain": "Public", + "origin": "P_origin_0", + "table_id": "P_origin_0_1_5" + }, + { + "index": 7, + "pair_id": "P_origin_0_P_origin_0_1_6", + "image_paths": [ + "data/Public/Table/P_origin_0/P_origin_0_1_6.png" + ], + "domain": "Public", + "origin": "P_origin_0", + "table_id": "P_origin_0_1_6" + }, + { + "index": 8, + "pair_id": "P_origin_0_P_origin_0_2_0", + "image_paths": [ + "data/Public/Table/P_origin_0/P_origin_0_2_0.png" + ], + "domain": "Public", + "origin": "P_origin_0", + "table_id": "P_origin_0_2_0" + }, + { + "index": 9, + "pair_id": "P_origin_0_P_origin_0_2_1", + "image_paths": [ + "data/Public/Table/P_origin_0/P_origin_0_2_1.png" + ], + "domain": "Public", + "origin": "P_origin_0", + "table_id": "P_origin_0_2_1" + }, + { + "index": 10, + "pair_id": "P_origin_0_P_origin_0_2_2", + "image_paths": [ + "data/Public/Table/P_origin_0/P_origin_0_2_2.png" + ], + "domain": "Public", + "origin": "P_origin_0", + "table_id": "P_origin_0_2_2" + }, + { + "index": 11, + "pair_id": "P_origin_1_P_origin_1_0", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_0.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_0" + }, + { + "index": 12, + 
"pair_id": "P_origin_1_P_origin_1_1", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_1.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_1" + }, + { + "index": 13, + "pair_id": "P_origin_1_P_origin_1_10", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_10.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_10" + }, + { + "index": 14, + "pair_id": "P_origin_1_P_origin_1_11_0", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_11_0.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_11_0" + }, + { + "index": 15, + "pair_id": "P_origin_1_P_origin_1_11_1", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_11_1.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_11_1" + }, + { + "index": 16, + "pair_id": "P_origin_1_P_origin_1_11_2", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_11_2.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_11_2" + }, + { + "index": 17, + "pair_id": "P_origin_1_P_origin_1_12_0", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_12_0.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_12_0" + }, + { + "index": 18, + "pair_id": "P_origin_1_P_origin_1_12_1", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_12_1.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_12_1" + }, + { + "index": 19, + "pair_id": "P_origin_1_P_origin_1_12_2", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_12_2.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_12_2" + }, + { + "index": 20, + "pair_id": "P_origin_1_P_origin_1_13_0", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_13_0.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_13_0" + }, + { + "index": 21, 
+ "pair_id": "P_origin_1_P_origin_1_13_1", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_13_1.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_13_1" + }, + { + "index": 22, + "pair_id": "P_origin_1_P_origin_1_13_2", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_13_2.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_13_2" + }, + { + "index": 23, + "pair_id": "P_origin_1_P_origin_1_14_0", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_14_0.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_14_0" + }, + { + "index": 24, + "pair_id": "P_origin_1_P_origin_1_14_1", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_14_1.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_14_1" + }, + { + "index": 25, + "pair_id": "P_origin_1_P_origin_1_14_2", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_14_2.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_14_2" + }, + { + "index": 26, + "pair_id": "P_origin_1_P_origin_1_15_0", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_15_0.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_15_0" + }, + { + "index": 27, + "pair_id": "P_origin_1_P_origin_1_15_1", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_15_1.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_15_1" + }, + { + "index": 28, + "pair_id": "P_origin_1_P_origin_1_15_2", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_15_2.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_15_2" + }, + { + "index": 29, + "pair_id": "P_origin_1_P_origin_1_16_0", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_16_0.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_16_0" + }, + 
{ + "index": 30, + "pair_id": "P_origin_1_P_origin_1_16_1", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_16_1.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_16_1" + }, + { + "index": 31, + "pair_id": "P_origin_1_P_origin_1_16_2", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_16_2.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_16_2" + }, + { + "index": 32, + "pair_id": "P_origin_1_P_origin_1_17_0", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_17_0.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_17_0" + }, + { + "index": 33, + "pair_id": "P_origin_1_P_origin_1_17_1", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_17_1.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_17_1" + }, + { + "index": 34, + "pair_id": "P_origin_1_P_origin_1_17_2", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_17_2.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_17_2" + }, + { + "index": 35, + "pair_id": "P_origin_1_P_origin_1_18_0", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_18_0.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_18_0" + }, + { + "index": 36, + "pair_id": "P_origin_1_P_origin_1_18_1", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_18_1.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_18_1" + }, + { + "index": 37, + "pair_id": "P_origin_1_P_origin_1_19_0", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_19_0.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_19_0" + }, + { + "index": 38, + "pair_id": "P_origin_1_P_origin_1_19_1", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_19_1.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": 
"P_origin_1_19_1" + }, + { + "index": 39, + "pair_id": "P_origin_1_P_origin_1_2", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_2.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_2" + }, + { + "index": 40, + "pair_id": "P_origin_1_P_origin_1_20_0", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_20_0.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_20_0" + }, + { + "index": 41, + "pair_id": "P_origin_1_P_origin_1_20_1", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_20_1.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_20_1" + }, + { + "index": 42, + "pair_id": "P_origin_1_P_origin_1_20_2", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_20_2.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_20_2" + }, + { + "index": 43, + "pair_id": "P_origin_1_P_origin_1_21_0", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_21_0.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_21_0" + }, + { + "index": 44, + "pair_id": "P_origin_1_P_origin_1_21_1", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_21_1.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_21_1" + }, + { + "index": 45, + "pair_id": "P_origin_1_P_origin_1_21_2", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_21_2.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_21_2" + }, + { + "index": 46, + "pair_id": "P_origin_1_P_origin_1_22_0", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_22_0.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_22_0" + }, + { + "index": 47, + "pair_id": "P_origin_1_P_origin_1_22_1", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_22_1.png" + ], + "domain": "Public", + "origin": "P_origin_1", + 
"table_id": "P_origin_1_22_1" + }, + { + "index": 48, + "pair_id": "P_origin_1_P_origin_1_22_2", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_22_2.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_22_2" + }, + { + "index": 49, + "pair_id": "P_origin_1_P_origin_1_23_0", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_23_0.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_23_0" + }, + { + "index": 50, + "pair_id": "P_origin_1_P_origin_1_23_1", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_23_1.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_23_1" + }, + { + "index": 51, + "pair_id": "P_origin_1_P_origin_1_23_2", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_23_2.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_23_2" + }, + { + "index": 52, + "pair_id": "P_origin_1_P_origin_1_24", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_24.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_24" + }, + { + "index": 53, + "pair_id": "P_origin_1_P_origin_1_25", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_25.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_25" + }, + { + "index": 54, + "pair_id": "P_origin_1_P_origin_1_26", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_26.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_26" + }, + { + "index": 55, + "pair_id": "P_origin_1_P_origin_1_3", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_3.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_3" + }, + { + "index": 56, + "pair_id": "P_origin_1_P_origin_1_4", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_4.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": 
"P_origin_1_4" + }, + { + "index": 57, + "pair_id": "P_origin_1_P_origin_1_5", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_5.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_5" + }, + { + "index": 58, + "pair_id": "P_origin_1_P_origin_1_6", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_6.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_6" + }, + { + "index": 59, + "pair_id": "P_origin_1_P_origin_1_7", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_7.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_7" + }, + { + "index": 60, + "pair_id": "P_origin_1_P_origin_1_8_0", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_8_0.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_8_0" + }, + { + "index": 61, + "pair_id": "P_origin_1_P_origin_1_8_1", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_8_1.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_8_1" + }, + { + "index": 62, + "pair_id": "P_origin_1_P_origin_1_8_2", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_8_2.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_8_2" + }, + { + "index": 63, + "pair_id": "P_origin_1_P_origin_1_9_0", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_9_0.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_9_0" + }, + { + "index": 64, + "pair_id": "P_origin_1_P_origin_1_9_1", + "image_paths": [ + "data/Public/Table/P_origin_1/P_origin_1_9_1.png" + ], + "domain": "Public", + "origin": "P_origin_1", + "table_id": "P_origin_1_9_1" + }, + { + "index": 65, + "pair_id": "P_origin_10_P_origin_10_0_0", + "image_paths": [ + "data/Public/Table/P_origin_10/P_origin_10_0_0.png" + ], + "domain": "Public", + "origin": "P_origin_10", + "table_id": "P_origin_10_0_0" + }, + 
{ + "index": 66, + "pair_id": "P_origin_10_P_origin_10_0_1", + "image_paths": [ + "data/Public/Table/P_origin_10/P_origin_10_0_1.png" + ], + "domain": "Public", + "origin": "P_origin_10", + "table_id": "P_origin_10_0_1" + }, + { + "index": 67, + "pair_id": "P_origin_10_P_origin_10_0_2", + "image_paths": [ + "data/Public/Table/P_origin_10/P_origin_10_0_2.png" + ], + "domain": "Public", + "origin": "P_origin_10", + "table_id": "P_origin_10_0_2" + }, + { + "index": 68, + "pair_id": "P_origin_10_P_origin_10_0_3", + "image_paths": [ + "data/Public/Table/P_origin_10/P_origin_10_0_3.png" + ], + "domain": "Public", + "origin": "P_origin_10", + "table_id": "P_origin_10_0_3" + }, + { + "index": 69, + "pair_id": "P_origin_10_P_origin_10_0_4", + "image_paths": [ + "data/Public/Table/P_origin_10/P_origin_10_0_4.png" + ], + "domain": "Public", + "origin": "P_origin_10", + "table_id": "P_origin_10_0_4" + }, + { + "index": 70, + "pair_id": "P_origin_11_P_origin_11_0", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_0.png" + ], + "domain": "Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_0" + }, + { + "index": 71, + "pair_id": "P_origin_11_P_origin_11_10", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_10.png" + ], + "domain": "Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_10" + }, + { + "index": 72, + "pair_id": "P_origin_11_P_origin_11_11", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_11.png" + ], + "domain": "Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_11" + }, + { + "index": 73, + "pair_id": "P_origin_11_P_origin_11_12", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_12.png" + ], + "domain": "Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_12" + }, + { + "index": 74, + "pair_id": "P_origin_11_P_origin_11_13", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_13.png" + ], + "domain": "Public", + "origin": "P_origin_11", + 
"table_id": "P_origin_11_13" + }, + { + "index": 75, + "pair_id": "P_origin_11_P_origin_11_14", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_14.png" + ], + "domain": "Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_14" + }, + { + "index": 76, + "pair_id": "P_origin_11_P_origin_11_15", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_15.png" + ], + "domain": "Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_15" + }, + { + "index": 77, + "pair_id": "P_origin_11_P_origin_11_16", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_16.png" + ], + "domain": "Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_16" + }, + { + "index": 78, + "pair_id": "P_origin_11_P_origin_11_17", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_17.png" + ], + "domain": "Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_17" + }, + { + "index": 79, + "pair_id": "P_origin_11_P_origin_11_18", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_18.png" + ], + "domain": "Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_18" + }, + { + "index": 80, + "pair_id": "P_origin_11_P_origin_11_19", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_19.png" + ], + "domain": "Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_19" + }, + { + "index": 81, + "pair_id": "P_origin_11_P_origin_11_1_0", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_1_0.png" + ], + "domain": "Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_1_0" + }, + { + "index": 82, + "pair_id": "P_origin_11_P_origin_11_1_1", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_1_1.png" + ], + "domain": "Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_1_1" + }, + { + "index": 83, + "pair_id": "P_origin_11_P_origin_11_1_2", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_1_2.png" + ], + "domain": "Public", + 
"origin": "P_origin_11", + "table_id": "P_origin_11_1_2" + }, + { + "index": 84, + "pair_id": "P_origin_11_P_origin_11_20", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_20.png" + ], + "domain": "Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_20" + }, + { + "index": 85, + "pair_id": "P_origin_11_P_origin_11_2_0", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_2_0.png" + ], + "domain": "Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_2_0" + }, + { + "index": 86, + "pair_id": "P_origin_11_P_origin_11_2_1", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_2_1.png" + ], + "domain": "Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_2_1" + }, + { + "index": 87, + "pair_id": "P_origin_11_P_origin_11_3", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_3.png" + ], + "domain": "Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_3" + }, + { + "index": 88, + "pair_id": "P_origin_11_P_origin_11_4", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_4.png" + ], + "domain": "Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_4" + }, + { + "index": 89, + "pair_id": "P_origin_11_P_origin_11_5", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_5.png" + ], + "domain": "Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_5" + }, + { + "index": 90, + "pair_id": "P_origin_11_P_origin_11_6", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_6.png" + ], + "domain": "Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_6" + }, + { + "index": 91, + "pair_id": "P_origin_11_P_origin_11_7", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_7.png" + ], + "domain": "Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_7" + }, + { + "index": 92, + "pair_id": "P_origin_11_P_origin_11_8", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_8.png" + ], + "domain": 
"Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_8" + }, + { + "index": 93, + "pair_id": "P_origin_11_P_origin_11_9", + "image_paths": [ + "data/Public/Table/P_origin_11/P_origin_11_9.png" + ], + "domain": "Public", + "origin": "P_origin_11", + "table_id": "P_origin_11_9" + }, + { + "index": 94, + "pair_id": "P_origin_2_P_origin_2_0_0", + "image_paths": [ + "data/Public/Table/P_origin_2/P_origin_2_0_0.png" + ], + "domain": "Public", + "origin": "P_origin_2", + "table_id": "P_origin_2_0_0" + }, + { + "index": 95, + "pair_id": "P_origin_2_P_origin_2_0_1", + "image_paths": [ + "data/Public/Table/P_origin_2/P_origin_2_0_1.png" + ], + "domain": "Public", + "origin": "P_origin_2", + "table_id": "P_origin_2_0_1" + }, + { + "index": 96, + "pair_id": "P_origin_2_P_origin_2_0_2", + "image_paths": [ + "data/Public/Table/P_origin_2/P_origin_2_0_2.png" + ], + "domain": "Public", + "origin": "P_origin_2", + "table_id": "P_origin_2_0_2" + }, + { + "index": 97, + "pair_id": "P_origin_2_P_origin_2_0_3", + "image_paths": [ + "data/Public/Table/P_origin_2/P_origin_2_0_3.png" + ], + "domain": "Public", + "origin": "P_origin_2", + "table_id": "P_origin_2_0_3" + }, + { + "index": 98, + "pair_id": "P_origin_2_P_origin_2_1_0", + "image_paths": [ + "data/Public/Table/P_origin_2/P_origin_2_1_0.png" + ], + "domain": "Public", + "origin": "P_origin_2", + "table_id": "P_origin_2_1_0" + }, + { + "index": 99, + "pair_id": "P_origin_2_P_origin_2_1_1", + "image_paths": [ + "data/Public/Table/P_origin_2/P_origin_2_1_1.png" + ], + "domain": "Public", + "origin": "P_origin_2", + "table_id": "P_origin_2_1_1" + }, + { + "index": 100, + "pair_id": "P_origin_2_P_origin_2_1_2", + "image_paths": [ + "data/Public/Table/P_origin_2/P_origin_2_1_2.png" + ], + "domain": "Public", + "origin": "P_origin_2", + "table_id": "P_origin_2_1_2" + }, + { + "index": 101, + "pair_id": "P_origin_2_P_origin_2_2_0", + "image_paths": [ + "data/Public/Table/P_origin_2/P_origin_2_2_0.png" + ], + "domain": 
"Public", + "origin": "P_origin_2", + "table_id": "P_origin_2_2_0" + }, + { + "index": 102, + "pair_id": "P_origin_2_P_origin_2_2_1", + "image_paths": [ + "data/Public/Table/P_origin_2/P_origin_2_2_1.png" + ], + "domain": "Public", + "origin": "P_origin_2", + "table_id": "P_origin_2_2_1" + }, + { + "index": 103, + "pair_id": "P_origin_2_P_origin_2_2_2", + "image_paths": [ + "data/Public/Table/P_origin_2/P_origin_2_2_2.png" + ], + "domain": "Public", + "origin": "P_origin_2", + "table_id": "P_origin_2_2_2" + }, + { + "index": 104, + "pair_id": "P_origin_3_P_origin_3_0", + "image_paths": [ + "data/Public/Table/P_origin_3/P_origin_3_0.png" + ], + "domain": "Public", + "origin": "P_origin_3", + "table_id": "P_origin_3_0" + }, + { + "index": 105, + "pair_id": "P_origin_3_P_origin_3_1", + "image_paths": [ + "data/Public/Table/P_origin_3/P_origin_3_1.png" + ], + "domain": "Public", + "origin": "P_origin_3", + "table_id": "P_origin_3_1" + }, + { + "index": 106, + "pair_id": "P_origin_3_P_origin_3_2_0", + "image_paths": [ + "data/Public/Table/P_origin_3/P_origin_3_2_0.png" + ], + "domain": "Public", + "origin": "P_origin_3", + "table_id": "P_origin_3_2_0" + }, + { + "index": 107, + "pair_id": "P_origin_3_P_origin_3_2_1", + "image_paths": [ + "data/Public/Table/P_origin_3/P_origin_3_2_1.png" + ], + "domain": "Public", + "origin": "P_origin_3", + "table_id": "P_origin_3_2_1" + }, + { + "index": 108, + "pair_id": "P_origin_4_P_origin_4_0", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_0.png" + ], + "domain": "Public", + "origin": "P_origin_4", + "table_id": "P_origin_4_0" + }, + { + "index": 109, + "pair_id": "P_origin_4_P_origin_4_1", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_1.png" + ], + "domain": "Public", + "origin": "P_origin_4", + "table_id": "P_origin_4_1" + }, + { + "index": 110, + "pair_id": "P_origin_4_P_origin_4_10", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_10.png" + ], + "domain": "Public", + "origin": 
"P_origin_4", + "table_id": "P_origin_4_10" + }, + { + "index": 111, + "pair_id": "P_origin_4_P_origin_4_11_0", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_11_0.png" + ], + "domain": "Public", + "origin": "P_origin_4", + "table_id": "P_origin_4_11_0" + }, + { + "index": 112, + "pair_id": "P_origin_4_P_origin_4_11_1", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_11_1.png" + ], + "domain": "Public", + "origin": "P_origin_4", + "table_id": "P_origin_4_11_1" + }, + { + "index": 113, + "pair_id": "P_origin_4_P_origin_4_12", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_12.png" + ], + "domain": "Public", + "origin": "P_origin_4", + "table_id": "P_origin_4_12" + }, + { + "index": 114, + "pair_id": "P_origin_4_P_origin_4_13", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_13.png" + ], + "domain": "Public", + "origin": "P_origin_4", + "table_id": "P_origin_4_13" + }, + { + "index": 115, + "pair_id": "P_origin_4_P_origin_4_14", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_14.png" + ], + "domain": "Public", + "origin": "P_origin_4", + "table_id": "P_origin_4_14" + }, + { + "index": 116, + "pair_id": "P_origin_4_P_origin_4_15", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_15.png" + ], + "domain": "Public", + "origin": "P_origin_4", + "table_id": "P_origin_4_15" + }, + { + "index": 117, + "pair_id": "P_origin_4_P_origin_4_16", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_16.png" + ], + "domain": "Public", + "origin": "P_origin_4", + "table_id": "P_origin_4_16" + }, + { + "index": 118, + "pair_id": "P_origin_4_P_origin_4_17", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_17.png" + ], + "domain": "Public", + "origin": "P_origin_4", + "table_id": "P_origin_4_17" + }, + { + "index": 119, + "pair_id": "P_origin_4_P_origin_4_18", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_18.png" + ], + "domain": "Public", + "origin": "P_origin_4", + 
"table_id": "P_origin_4_18" + }, + { + "index": 120, + "pair_id": "P_origin_4_P_origin_4_19", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_19.png" + ], + "domain": "Public", + "origin": "P_origin_4", + "table_id": "P_origin_4_19" + }, + { + "index": 121, + "pair_id": "P_origin_4_P_origin_4_2", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_2.png" + ], + "domain": "Public", + "origin": "P_origin_4", + "table_id": "P_origin_4_2" + }, + { + "index": 122, + "pair_id": "P_origin_4_P_origin_4_3", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_3.png" + ], + "domain": "Public", + "origin": "P_origin_4", + "table_id": "P_origin_4_3" + }, + { + "index": 123, + "pair_id": "P_origin_4_P_origin_4_4", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_4.png" + ], + "domain": "Public", + "origin": "P_origin_4", + "table_id": "P_origin_4_4" + }, + { + "index": 124, + "pair_id": "P_origin_4_P_origin_4_5", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_5.png" + ], + "domain": "Public", + "origin": "P_origin_4", + "table_id": "P_origin_4_5" + }, + { + "index": 125, + "pair_id": "P_origin_4_P_origin_4_6", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_6.png" + ], + "domain": "Public", + "origin": "P_origin_4", + "table_id": "P_origin_4_6" + }, + { + "index": 126, + "pair_id": "P_origin_4_P_origin_4_7", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_7.png" + ], + "domain": "Public", + "origin": "P_origin_4", + "table_id": "P_origin_4_7" + }, + { + "index": 127, + "pair_id": "P_origin_4_P_origin_4_8", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_8.png" + ], + "domain": "Public", + "origin": "P_origin_4", + "table_id": "P_origin_4_8" + }, + { + "index": 128, + "pair_id": "P_origin_4_P_origin_4_9_0", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_9_0.png" + ], + "domain": "Public", + "origin": "P_origin_4", + "table_id": "P_origin_4_9_0" + }, + { + 
"index": 129, + "pair_id": "P_origin_4_P_origin_4_9_1", + "image_paths": [ + "data/Public/Table/P_origin_4/P_origin_4_9_1.png" + ], + "domain": "Public", + "origin": "P_origin_4", + "table_id": "P_origin_4_9_1" + }, + { + "index": 130, + "pair_id": "P_origin_5_P_origin_5_0_0", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_0_0.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_0_0" + }, + { + "index": 131, + "pair_id": "P_origin_5_P_origin_5_0_1", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_0_1.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_0_1" + }, + { + "index": 132, + "pair_id": "P_origin_5_P_origin_5_10", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_10.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_10" + }, + { + "index": 133, + "pair_id": "P_origin_5_P_origin_5_11", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_11.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_11" + }, + { + "index": 134, + "pair_id": "P_origin_5_P_origin_5_12", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_12.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_12" + }, + { + "index": 135, + "pair_id": "P_origin_5_P_origin_5_13", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_13.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_13" + }, + { + "index": 136, + "pair_id": "P_origin_5_P_origin_5_14", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_14.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_14" + }, + { + "index": 137, + "pair_id": "P_origin_5_P_origin_5_16", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_16.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_16" + }, + { + "index": 138, + 
"pair_id": "P_origin_5_P_origin_5_17_0", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_17_0.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_17_0" + }, + { + "index": 139, + "pair_id": "P_origin_5_P_origin_5_17_1", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_17_1.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_17_1" + }, + { + "index": 140, + "pair_id": "P_origin_5_P_origin_5_18", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_18.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_18" + }, + { + "index": 141, + "pair_id": "P_origin_5_P_origin_5_19", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_19.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_19" + }, + { + "index": 142, + "pair_id": "P_origin_5_P_origin_5_1_0", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_1_0.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_1_0" + }, + { + "index": 143, + "pair_id": "P_origin_5_P_origin_5_1_1", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_1_1.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_1_1" + }, + { + "index": 144, + "pair_id": "P_origin_5_P_origin_5_1_2", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_1_2.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_1_2" + }, + { + "index": 145, + "pair_id": "P_origin_5_P_origin_5_2", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_2.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_2" + }, + { + "index": 146, + "pair_id": "P_origin_5_P_origin_5_20", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_20.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_20" + }, + { + "index": 147, + "pair_id": 
"P_origin_5_P_origin_5_21", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_21.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_21" + }, + { + "index": 148, + "pair_id": "P_origin_5_P_origin_5_22", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_22.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_22" + }, + { + "index": 149, + "pair_id": "P_origin_5_P_origin_5_23", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_23.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_23" + }, + { + "index": 150, + "pair_id": "P_origin_5_P_origin_5_24", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_24.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_24" + }, + { + "index": 151, + "pair_id": "P_origin_5_P_origin_5_25", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_25.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_25" + }, + { + "index": 152, + "pair_id": "P_origin_5_P_origin_5_26_0", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_26_0.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_26_0" + }, + { + "index": 153, + "pair_id": "P_origin_5_P_origin_5_26_1", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_26_1.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_26_1" + }, + { + "index": 154, + "pair_id": "P_origin_5_P_origin_5_3", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_3.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_3" + }, + { + "index": 155, + "pair_id": "P_origin_5_P_origin_5_4", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_4.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_4" + }, + { + "index": 156, + "pair_id": 
"P_origin_5_P_origin_5_5", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_5.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_5" + }, + { + "index": 157, + "pair_id": "P_origin_5_P_origin_5_6", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_6.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_6" + }, + { + "index": 158, + "pair_id": "P_origin_5_P_origin_5_7", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_7.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_7" + }, + { + "index": 159, + "pair_id": "P_origin_5_P_origin_5_8", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_8.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_8" + }, + { + "index": 160, + "pair_id": "P_origin_5_P_origin_5_9", + "image_paths": [ + "data/Public/Table/P_origin_5/P_origin_5_9.png" + ], + "domain": "Public", + "origin": "P_origin_5", + "table_id": "P_origin_5_9" + }, + { + "index": 161, + "pair_id": "P_origin_6_P_origin_6_0", + "image_paths": [ + "data/Public/Table/P_origin_6/P_origin_6_0.png" + ], + "domain": "Public", + "origin": "P_origin_6", + "table_id": "P_origin_6_0" + }, + { + "index": 162, + "pair_id": "P_origin_6_P_origin_6_1", + "image_paths": [ + "data/Public/Table/P_origin_6/P_origin_6_1.png" + ], + "domain": "Public", + "origin": "P_origin_6", + "table_id": "P_origin_6_1" + }, + { + "index": 163, + "pair_id": "P_origin_6_P_origin_6_10", + "image_paths": [ + "data/Public/Table/P_origin_6/P_origin_6_10.png" + ], + "domain": "Public", + "origin": "P_origin_6", + "table_id": "P_origin_6_10" + }, + { + "index": 164, + "pair_id": "P_origin_6_P_origin_6_11", + "image_paths": [ + "data/Public/Table/P_origin_6/P_origin_6_11.png" + ], + "domain": "Public", + "origin": "P_origin_6", + "table_id": "P_origin_6_11" + }, + { + "index": 165, + "pair_id": "P_origin_6_P_origin_6_12_0", + "image_paths": [ 
+ "data/Public/Table/P_origin_6/P_origin_6_12_0.png" + ], + "domain": "Public", + "origin": "P_origin_6", + "table_id": "P_origin_6_12_0" + }, + { + "index": 166, + "pair_id": "P_origin_6_P_origin_6_12_1", + "image_paths": [ + "data/Public/Table/P_origin_6/P_origin_6_12_1.png" + ], + "domain": "Public", + "origin": "P_origin_6", + "table_id": "P_origin_6_12_1" + }, + { + "index": 167, + "pair_id": "P_origin_6_P_origin_6_2", + "image_paths": [ + "data/Public/Table/P_origin_6/P_origin_6_2.png" + ], + "domain": "Public", + "origin": "P_origin_6", + "table_id": "P_origin_6_2" + }, + { + "index": 168, + "pair_id": "P_origin_6_P_origin_6_3", + "image_paths": [ + "data/Public/Table/P_origin_6/P_origin_6_3.png" + ], + "domain": "Public", + "origin": "P_origin_6", + "table_id": "P_origin_6_3" + }, + { + "index": 169, + "pair_id": "P_origin_6_P_origin_6_4", + "image_paths": [ + "data/Public/Table/P_origin_6/P_origin_6_4.png" + ], + "domain": "Public", + "origin": "P_origin_6", + "table_id": "P_origin_6_4" + }, + { + "index": 170, + "pair_id": "P_origin_6_P_origin_6_5", + "image_paths": [ + "data/Public/Table/P_origin_6/P_origin_6_5.png" + ], + "domain": "Public", + "origin": "P_origin_6", + "table_id": "P_origin_6_5" + }, + { + "index": 171, + "pair_id": "P_origin_6_P_origin_6_6", + "image_paths": [ + "data/Public/Table/P_origin_6/P_origin_6_6.png" + ], + "domain": "Public", + "origin": "P_origin_6", + "table_id": "P_origin_6_6" + }, + { + "index": 172, + "pair_id": "P_origin_6_P_origin_6_7", + "image_paths": [ + "data/Public/Table/P_origin_6/P_origin_6_7.png" + ], + "domain": "Public", + "origin": "P_origin_6", + "table_id": "P_origin_6_7" + }, + { + "index": 173, + "pair_id": "P_origin_6_P_origin_6_8", + "image_paths": [ + "data/Public/Table/P_origin_6/P_origin_6_8.png" + ], + "domain": "Public", + "origin": "P_origin_6", + "table_id": "P_origin_6_8" + }, + { + "index": 174, + "pair_id": "P_origin_6_P_origin_6_9", + "image_paths": [ + 
"data/Public/Table/P_origin_6/P_origin_6_9.png" + ], + "domain": "Public", + "origin": "P_origin_6", + "table_id": "P_origin_6_9" + }, + { + "index": 175, + "pair_id": "P_origin_7_P_origin_7_0_0", + "image_paths": [ + "data/Public/Table/P_origin_7/P_origin_7_0_0.png" + ], + "domain": "Public", + "origin": "P_origin_7", + "table_id": "P_origin_7_0_0" + }, + { + "index": 176, + "pair_id": "P_origin_7_P_origin_7_0_1", + "image_paths": [ + "data/Public/Table/P_origin_7/P_origin_7_0_1.png" + ], + "domain": "Public", + "origin": "P_origin_7", + "table_id": "P_origin_7_0_1" + }, + { + "index": 177, + "pair_id": "P_origin_7_P_origin_7_1_0", + "image_paths": [ + "data/Public/Table/P_origin_7/P_origin_7_1_0.png" + ], + "domain": "Public", + "origin": "P_origin_7", + "table_id": "P_origin_7_1_0" + }, + { + "index": 178, + "pair_id": "P_origin_7_P_origin_7_1_1", + "image_paths": [ + "data/Public/Table/P_origin_7/P_origin_7_1_1.png" + ], + "domain": "Public", + "origin": "P_origin_7", + "table_id": "P_origin_7_1_1" + }, + { + "index": 179, + "pair_id": "P_origin_7_P_origin_7_2_0", + "image_paths": [ + "data/Public/Table/P_origin_7/P_origin_7_2_0.png" + ], + "domain": "Public", + "origin": "P_origin_7", + "table_id": "P_origin_7_2_0" + }, + { + "index": 180, + "pair_id": "P_origin_7_P_origin_7_2_1", + "image_paths": [ + "data/Public/Table/P_origin_7/P_origin_7_2_1.png" + ], + "domain": "Public", + "origin": "P_origin_7", + "table_id": "P_origin_7_2_1" + }, + { + "index": 181, + "pair_id": "P_origin_7_P_origin_7_3", + "image_paths": [ + "data/Public/Table/P_origin_7/P_origin_7_3.png" + ], + "domain": "Public", + "origin": "P_origin_7", + "table_id": "P_origin_7_3" + }, + { + "index": 182, + "pair_id": "P_origin_7_P_origin_7_4_0", + "image_paths": [ + "data/Public/Table/P_origin_7/P_origin_7_4_0.png" + ], + "domain": "Public", + "origin": "P_origin_7", + "table_id": "P_origin_7_4_0" + }, + { + "index": 183, + "pair_id": "P_origin_7_P_origin_7_4_1", + "image_paths": [ + 
"data/Public/Table/P_origin_7/P_origin_7_4_1.png" + ], + "domain": "Public", + "origin": "P_origin_7", + "table_id": "P_origin_7_4_1" + }, + { + "index": 184, + "pair_id": "P_origin_7_P_origin_7_5", + "image_paths": [ + "data/Public/Table/P_origin_7/P_origin_7_5.png" + ], + "domain": "Public", + "origin": "P_origin_7", + "table_id": "P_origin_7_5" + }, + { + "index": 185, + "pair_id": "P_origin_7_P_origin_7_6", + "image_paths": [ + "data/Public/Table/P_origin_7/P_origin_7_6.png" + ], + "domain": "Public", + "origin": "P_origin_7", + "table_id": "P_origin_7_6" + }, + { + "index": 186, + "pair_id": "P_origin_7_P_origin_7_7", + "image_paths": [ + "data/Public/Table/P_origin_7/P_origin_7_7.png" + ], + "domain": "Public", + "origin": "P_origin_7", + "table_id": "P_origin_7_7" + }, + { + "index": 187, + "pair_id": "P_origin_7_P_origin_7_8_0", + "image_paths": [ + "data/Public/Table/P_origin_7/P_origin_7_8_0.png" + ], + "domain": "Public", + "origin": "P_origin_7", + "table_id": "P_origin_7_8_0" + }, + { + "index": 188, + "pair_id": "P_origin_7_P_origin_7_8_1", + "image_paths": [ + "data/Public/Table/P_origin_7/P_origin_7_8_1.png" + ], + "domain": "Public", + "origin": "P_origin_7", + "table_id": "P_origin_7_8_1" + }, + { + "index": 189, + "pair_id": "P_origin_7_P_origin_7_8_2", + "image_paths": [ + "data/Public/Table/P_origin_7/P_origin_7_8_2.png" + ], + "domain": "Public", + "origin": "P_origin_7", + "table_id": "P_origin_7_8_2" + }, + { + "index": 190, + "pair_id": "P_origin_8_P_origin_8_0", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_0.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_0" + }, + { + "index": 191, + "pair_id": "P_origin_8_P_origin_8_1", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_1.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_1" + }, + { + "index": 192, + "pair_id": "P_origin_8_P_origin_8_10_0", + "image_paths": [ + 
"data/Public/Table/P_origin_8/P_origin_8_10_0.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_10_0" + }, + { + "index": 193, + "pair_id": "P_origin_8_P_origin_8_10_1", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_10_1.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_10_1" + }, + { + "index": 194, + "pair_id": "P_origin_8_P_origin_8_11", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_11.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_11" + }, + { + "index": 195, + "pair_id": "P_origin_8_P_origin_8_12_0", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_12_0.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_12_0" + }, + { + "index": 196, + "pair_id": "P_origin_8_P_origin_8_12_1", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_12_1.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_12_1" + }, + { + "index": 197, + "pair_id": "P_origin_8_P_origin_8_13", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_13.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_13" + }, + { + "index": 198, + "pair_id": "P_origin_8_P_origin_8_14_0", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_14_0.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_14_0" + }, + { + "index": 199, + "pair_id": "P_origin_8_P_origin_8_14_1", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_14_1.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_14_1" + }, + { + "index": 200, + "pair_id": "P_origin_8_P_origin_8_14_2", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_14_2.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_14_2" + }, + { + "index": 201, + "pair_id": "P_origin_8_P_origin_8_14_3", + 
"image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_14_3.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_14_3" + }, + { + "index": 202, + "pair_id": "P_origin_8_P_origin_8_14_4", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_14_4.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_14_4" + }, + { + "index": 203, + "pair_id": "P_origin_8_P_origin_8_14_5", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_14_5.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_14_5" + }, + { + "index": 204, + "pair_id": "P_origin_8_P_origin_8_14_6", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_14_6.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_14_6" + }, + { + "index": 205, + "pair_id": "P_origin_8_P_origin_8_14_7", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_14_7.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_14_7" + }, + { + "index": 206, + "pair_id": "P_origin_8_P_origin_8_15", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_15.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_15" + }, + { + "index": 207, + "pair_id": "P_origin_8_P_origin_8_16", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_16.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_16" + }, + { + "index": 208, + "pair_id": "P_origin_8_P_origin_8_17_0", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_17_0.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_17_0" + }, + { + "index": 209, + "pair_id": "P_origin_8_P_origin_8_17_1", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_17_1.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_17_1" + }, + { + "index": 210, + "pair_id": 
"P_origin_8_P_origin_8_17_2", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_17_2.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_17_2" + }, + { + "index": 211, + "pair_id": "P_origin_8_P_origin_8_17_3", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_17_3.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_17_3" + }, + { + "index": 212, + "pair_id": "P_origin_8_P_origin_8_17_4", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_17_4.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_17_4" + }, + { + "index": 213, + "pair_id": "P_origin_8_P_origin_8_17_5", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_17_5.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_17_5" + }, + { + "index": 214, + "pair_id": "P_origin_8_P_origin_8_17_6", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_17_6.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_17_6" + }, + { + "index": 215, + "pair_id": "P_origin_8_P_origin_8_17_7", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_17_7.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_17_7" + }, + { + "index": 216, + "pair_id": "P_origin_8_P_origin_8_18", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_18.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_18" + }, + { + "index": 217, + "pair_id": "P_origin_8_P_origin_8_19", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_19.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_19" + }, + { + "index": 218, + "pair_id": "P_origin_8_P_origin_8_2", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_2.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_2" + }, + { + "index": 219, + 
"pair_id": "P_origin_8_P_origin_8_3", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_3.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_3" + }, + { + "index": 220, + "pair_id": "P_origin_8_P_origin_8_4", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_4.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_4" + }, + { + "index": 221, + "pair_id": "P_origin_8_P_origin_8_5", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_5.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_5" + }, + { + "index": 222, + "pair_id": "P_origin_8_P_origin_8_6", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_6.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_6" + }, + { + "index": 223, + "pair_id": "P_origin_8_P_origin_8_7", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_7.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_7" + }, + { + "index": 224, + "pair_id": "P_origin_8_P_origin_8_8", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_8.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_8" + }, + { + "index": 225, + "pair_id": "P_origin_8_P_origin_8_9_0", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_9_0.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_9_0" + }, + { + "index": 226, + "pair_id": "P_origin_8_P_origin_8_9_1", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_9_1.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_9_1" + }, + { + "index": 227, + "pair_id": "P_origin_8_P_origin_8_9_2", + "image_paths": [ + "data/Public/Table/P_origin_8/P_origin_8_9_2.png" + ], + "domain": "Public", + "origin": "P_origin_8", + "table_id": "P_origin_8_9_2" + }, + { + "index": 228, + "pair_id": 
"P_origin_9_P_origin_9_0_0", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_0_0.png" + ], + "domain": "Public", + "origin": "P_origin_9", + "table_id": "P_origin_9_0_0" + }, + { + "index": 229, + "pair_id": "P_origin_9_P_origin_9_0_1", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_0_1.png" + ], + "domain": "Public", + "origin": "P_origin_9", + "table_id": "P_origin_9_0_1" + }, + { + "index": 230, + "pair_id": "P_origin_9_P_origin_9_0_2", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_0_2.png" + ], + "domain": "Public", + "origin": "P_origin_9", + "table_id": "P_origin_9_0_2" + }, + { + "index": 231, + "pair_id": "P_origin_9_P_origin_9_0_3", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_0_3.png" + ], + "domain": "Public", + "origin": "P_origin_9", + "table_id": "P_origin_9_0_3" + }, + { + "index": 232, + "pair_id": "P_origin_9_P_origin_9_10", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_10.png" + ], + "domain": "Public", + "origin": "P_origin_9", + "table_id": "P_origin_9_10" + }, + { + "index": 233, + "pair_id": "P_origin_9_P_origin_9_11", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_11.png" + ], + "domain": "Public", + "origin": "P_origin_9", + "table_id": "P_origin_9_11" + }, + { + "index": 234, + "pair_id": "P_origin_9_P_origin_9_1_0", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_1_0.png" + ], + "domain": "Public", + "origin": "P_origin_9", + "table_id": "P_origin_9_1_0" + }, + { + "index": 235, + "pair_id": "P_origin_9_P_origin_9_1_1", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_1_1.png" + ], + "domain": "Public", + "origin": "P_origin_9", + "table_id": "P_origin_9_1_1" + }, + { + "index": 236, + "pair_id": "P_origin_9_P_origin_9_1_2", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_1_2.png" + ], + "domain": "Public", + "origin": "P_origin_9", + "table_id": "P_origin_9_1_2" + }, + { + "index": 237, + "pair_id": 
"P_origin_9_P_origin_9_2_0", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_2_0.png" + ], + "domain": "Public", + "origin": "P_origin_9", + "table_id": "P_origin_9_2_0" + }, + { + "index": 238, + "pair_id": "P_origin_9_P_origin_9_2_1", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_2_1.png" + ], + "domain": "Public", + "origin": "P_origin_9", + "table_id": "P_origin_9_2_1" + }, + { + "index": 239, + "pair_id": "P_origin_9_P_origin_9_2_2", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_2_2.png" + ], + "domain": "Public", + "origin": "P_origin_9", + "table_id": "P_origin_9_2_2" + }, + { + "index": 240, + "pair_id": "P_origin_9_P_origin_9_3", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_3.png" + ], + "domain": "Public", + "origin": "P_origin_9", + "table_id": "P_origin_9_3" + }, + { + "index": 241, + "pair_id": "P_origin_9_P_origin_9_4", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_4.png" + ], + "domain": "Public", + "origin": "P_origin_9", + "table_id": "P_origin_9_4" + }, + { + "index": 242, + "pair_id": "P_origin_9_P_origin_9_5", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_5.png" + ], + "domain": "Public", + "origin": "P_origin_9", + "table_id": "P_origin_9_5" + }, + { + "index": 243, + "pair_id": "P_origin_9_P_origin_9_6_0", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_6_0.png" + ], + "domain": "Public", + "origin": "P_origin_9", + "table_id": "P_origin_9_6_0" + }, + { + "index": 244, + "pair_id": "P_origin_9_P_origin_9_6_1", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_6_1.png" + ], + "domain": "Public", + "origin": "P_origin_9", + "table_id": "P_origin_9_6_1" + }, + { + "index": 245, + "pair_id": "P_origin_9_P_origin_9_6_2", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_6_2.png" + ], + "domain": "Public", + "origin": "P_origin_9", + "table_id": "P_origin_9_6_2" + }, + { + "index": 246, + "pair_id": 
"P_origin_9_P_origin_9_7", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_7.png" + ], + "domain": "Public", + "origin": "P_origin_9", + "table_id": "P_origin_9_7" + }, + { + "index": 247, + "pair_id": "P_origin_9_P_origin_9_8", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_8.png" + ], + "domain": "Public", + "origin": "P_origin_9", + "table_id": "P_origin_9_8" + }, + { + "index": 248, + "pair_id": "P_origin_9_P_origin_9_9", + "image_paths": [ + "data/Public/Table/P_origin_9/P_origin_9_9.png" + ], + "domain": "Public", + "origin": "P_origin_9", + "table_id": "P_origin_9_9" + } +] \ No newline at end of file diff --git a/test_input.json b/test_input.json deleted file mode 100644 index f51ecd0..0000000 --- a/test_input.json +++ /dev/null @@ -1,119 +0,0 @@ -[ - { - "index": 0, - "pair_id": "P_origin_0_1", - "image_paths": [ - "data/Public/Table/P_origin_0/P_origin_0_1_0.png", - "data/Public/Table/P_origin_0/P_origin_0_1_1.png" - ], - "domain": "public" - }, - { - "index": 1, - "pair_id": "P_origin_0_2", - "image_paths": [ - "data/Public/Table/P_origin_0/P_origin_0_2_1.png", - "data/Public/Table/P_origin_0/P_origin_0_2_2.png" - ], - "domain": "public" - }, - { - "index": 2, - "pair_id": "P_origin_1_0", - "image_paths": [ - "data/Public/Table/P_origin_1/P_origin_1_0.png" - ], - "domain": "public" - }, - { - "index": 3, - "pair_id": "P_origin_1_2", - "image_paths": [ - "data/Public/Table/P_origin_1/P_origin_1_2.png" - ], - "domain": "public" - }, - { - "index": 4, - "pair_id": "P_origin_1_4", - "image_paths": [ - "data/Public/Table/P_origin_1/P_origin_1_4.png" - ], - "domain": "public" - }, - { - "index": 5, - "pair_id": "P_origin_1_5", - "image_paths": [ - "data/Public/Table/P_origin_1/P_origin_1_5.png" - ], - "domain": "public" - }, - { - "index": 6, - "pair_id": "P_origin_1_7", - "image_paths": [ - "data/Public/Table/P_origin_1/P_origin_1_7.png" - ], - "domain": "public" - }, - { - "index": 7, - "pair_id": "P_origin_1_9", - "image_paths": 
[ - "data/Public/Table/P_origin_1/P_origin_1_9_0.png", - "data/Public/Table/P_origin_1/P_origin_1_9_1.png" - ], - "domain": "public" - }, - { - "index": 8, - "pair_id": "P_origin_1_10", - "image_paths": [ - "data/Public/Table/P_origin_1/P_origin_1_10_0.png", - "data/Public/Table/P_origin_1/P_origin_1_10_1.png" - ], - "domain": "public" - }, - { - "index": 9, - "pair_id": "P_origin_1_12", - "image_paths": [ - "data/Public/Table/P_origin_1/P_origin_1_12_0.png", - "data/Public/Table/P_origin_1/P_origin_1_12_1.png" - ], - "domain": "public" - }, - { - "index": 10, - "pair_id": "P_origin_1_13", - "image_paths": [ - "data/Public/Table/P_origin_1/P_origin_1_13_0.png" - ], - "domain": "public" - }, - { - "index": 11, - "pair_id": "P_origin_1_14", - "image_paths": [ - "data/Public/Table/P_origin_1/P_origin_1_14_0.png" - ], - "domain": "public" - }, - { - "index": 12, - "pair_id": "P_origin_1_23", - "image_paths": [ - "data/Public/Table/P_origin_1/P_origin_1_23_0.png" - ], - "domain": "public" - }, - { - "index": 13, - "pair_id": "P_origin_4_6", - "image_paths": [ - "data/Public/Table/P_origin_4/P_origin_4_6.png" - ], - "domain": "public" - } -] \ No newline at end of file diff --git a/tests/choi/QA_example/README.md b/tests/choi/QA_example/README.md deleted file mode 100644 index 30f07f3..0000000 --- a/tests/choi/QA_example/README.md +++ /dev/null @@ -1,129 +0,0 @@ -# QA Dataset Generation Module - -보험 테이블 마크다운 데이터를 기반으로 고품질 QA(Question-Answer) 데이터셋을 생성하는 모듈입니다. - -## 주요 기능 - -### 1. 난이도별 QA 생성 -- **IR (Information Retrieval)**: 단순 정보 검색 (Level 1) -- **Analysis**: 분석적 질문 (Level 2) -- **Compare (Multi-hop)**: 비교 및 다중 추론 (Level 3) -- **Aggregation**: 집계 연산 (Level 4) -- **Reasoning**: 복합 추론 (Level 5) -- **Insight**: 통찰 도출 (Level 6) - -### 2. 다양한 답변 유형 -- **Exact Match**: 단답형 (숫자, 예/아니오) - 정확한 매칭 평가 -- **Descriptive**: 서술형 - LLM-as-Judge 평가 -- **Calculation**: 수치 계산 결과 - Python 코드로 검증 - -### 3. 
고급 기능 -- **Multi-Table QA**: 복수 테이블 참조 필요 질문 -- **Follow-up QA**: 꼬리 질문 체인 생성 -- **Evol-Instruct**: 질문 난이도 진화 -- **LLM-as-Judge**: 품질 평가 - -## 사용법 - -### 기본 사용 - -```python -from QA_example import InsuranceTableQAGenerator, QADifficulty - -# 테이블 데이터 준비 -tables = { - "table_1": "| 구분 | 값 |\n|---|---|\n| A | 100 |", - "table_2": "| 항목 | 금액 |\n|---|---|\n| B | 200 |" -} - -# Generator 초기화 -generator = InsuranceTableQAGenerator() - -# 특정 난이도 QA 생성 -ir_qa = generator.generate_qa_by_difficulty(tables, QADifficulty.IR, num_questions=3) - -# 종합 데이터셋 생성 -dataset = generator.generate_comprehensive_qa_dataset( - tables, - questions_per_difficulty=2, - include_followup=True, - include_evolution=True -) -``` - -### 간편 함수 사용 - -```python -from QA_example import generate_qa_from_tables - -# 모든 난이도 QA 생성 -all_qa = generate_qa_from_tables(tables, num_questions=2) - -# 특정 난이도만 생성 -ir_only = generate_qa_from_tables(tables, difficulty=QADifficulty.IR) -``` - -## 커버되는 QA 양상 - -| # | 양상 | 설명 | 구현 방식 | -|---|------|------|----------| -| 1 | Multi-table QA | 복수 테이블 참조 | `generate_multi_table_qa()` | -| 2 | 난이도별 QA | 6단계 난이도 체계 | `QADifficulty` Enum | -| 3 | 다양한 답변 유형 | Exact Match, Descriptive | `QAType` Enum | -| 4 | 수치 계산 QA | 집계, 비율 계산 | Aggregation 난이도 | -| 5 | 꼬리 질문 | Q-A 체인 | `generate_followup_qa()` | -| 6 | 셀 기반 측정 | 여러 셀 기반 | Compare, Aggregation | -| 7 | 특정 셀 Q-A | 단일 셀 검색 | IR 난이도 | -| 8 | 이미지 연관 QA | 테이블 구조 기반 | 테이블 마크다운 입력 | - -## 파일 구조 - -``` -QA_example/ -├── __init__.py # 모듈 초기화 -├── prompts.py # 프롬프트 템플릿 -├── qa_generator.py # QA 생성 핵심 로직 -├── qa_generation.ipynb # 사용 예제 노트북 -├── README.md # 이 문서 -└── output/ # 생성된 데이터셋 -``` - -## 프롬프트 전략 - -### Chain-of-Table -- 단계별 표 해석 과정 명시 -- 동적 계획법 기반 질문 생성 - -### Program-of-Thought (PoT) -- 수치 계산을 Python 코드로 생성 -- 계산 결과의 무결성 보장 - -### Tabular Chain-of-Thought -- 추론 과정을 표 형태로 구조화 -- Step → Sub-question → Evidence → Reasoning - -### Evol-Instruct -- 제약 조건 추가 (Adding Constraints) -- 심층 추론 (Deepening Reasoning) -- 구체화 (Concretizing) -- 
입력 복잡도 증가 (Complicating Input) - -## 품질 평가 (LLM-as-Judge) - -생성된 QA의 품질을 5가지 차원으로 평가: - -1. **정확성 (Correctness)**: 답변의 사실적 정확성 -2. **충실성 (Faithfulness)**: 테이블 데이터에 대한 충실도 -3. **관련성 (Relevance)**: 보험 도메인 실용성 -4. **난이도 적절성**: 표기 난이도와 실제 난이도 일치 -5. **명확성 (Clarity)**: 질문과 답변의 명확성 - -## 의존성 - -- `polling_gemini`: Gemini API 풀링 시스템 -- `google-generativeai`: Google Gemini API -- `pyyaml`: YAML 설정 파일 처리 - -## 라이센스 - -MIT License diff --git a/tests/choi/QA_example/__init__.py b/tests/choi/QA_example/__init__.py deleted file mode 100644 index 53e6823..0000000 --- a/tests/choi/QA_example/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -""" -QA Dataset Generation Module for Insurance Tables -보험 테이블 기반 QA 데이터셋 생성 모듈 -""" - -from .qa_generator import ( - InsuranceTableQAGenerator, - QADifficulty, - QAType, - generate_qa_from_tables, -) - -__all__ = [ - 'InsuranceTableQAGenerator', - 'QADifficulty', - 'QAType', - 'generate_qa_from_tables', -] - -__version__ = '0.1.0' diff --git a/tests/choi/QA_example/prompts.py b/tests/choi/QA_example/prompts.py deleted file mode 100644 index 9adfeb5..0000000 --- a/tests/choi/QA_example/prompts.py +++ /dev/null @@ -1,479 +0,0 @@ -""" -QA Generation Prompts for Insurance Table Data -보험 테이블 기반 QA 생성을 위한 프롬프트 템플릿 - -난이도별 QA 유형: -1. IR (Information Retrieval): 단순 정보 검색 -2. Analysis: 분석적 질문 -3. Compare (Multi-hop): 비교 및 다중 추론 -4. Aggregation: 집계 연산 -5. Reasoning: 복합 추론 -6. Insight: 통찰 도출 - -답변 유형: -- Exact Match: 단답형 (숫자, 예/아니오) -- Descriptive: 서술형 (LLM Judge 평가) -""" - -# ============================================================================= -# System Prompts -# ============================================================================= - -QA_GENERATOR_SYSTEM_PROMPT = """# Role Definition -당신은 보험 도메인 전문가이자 고품질 QA 데이터셋 구축 전문가입니다. -주어진 보험 테이블 데이터를 기반으로 다양한 난이도와 유형의 질문-답변 쌍을 생성해야 합니다. - -# Core Principles -1. **정확성(Accuracy):** 모든 답변은 주어진 테이블 데이터에 근거해야 합니다. 테이블에 없는 정보를 추측하지 마십시오. -2. **다양성(Diversity):** 단순 검색부터 복잡한 추론까지 다양한 난이도의 질문을 생성해야 합니다. -3. 
**실용성(Practicality):** 실제 보험 고객이 물어볼 수 있는 현실적인 질문을 생성해야 합니다. -4. **명확성(Clarity):** 질문과 답변 모두 명확하고 모호하지 않아야 합니다. - -# Difficulty Levels (난이도) -- **IR (Level 1):** 특정 셀의 값을 직접 찾는 단순 검색 -- **Analysis (Level 2):** 단일 테이블 내에서의 분석적 질문 -- **Compare (Level 3):** 여러 행/열 또는 복수 테이블 간 비교 -- **Aggregation (Level 4):** 합계, 평균, 최대/최소 등의 집계 연산 -- **Reasoning (Level 5):** 여러 정보를 종합한 복합 추론 -- **Insight (Level 6):** 데이터로부터 통찰이나 시사점 도출 - -# Answer Types (답변 유형) -- **exact_match:** 숫자, 예/아니오, 특정 텍스트 등 정확히 일치해야 하는 답변 -- **descriptive:** 설명이 필요한 서술형 답변 (LLM-as-Judge로 평가) -- **calculation:** 수치 계산 결과 (계산 과정 포함) -- **comparison:** 비교 결과 및 근거""" - -# ============================================================================= -# QA Generation Prompts by Difficulty -# ============================================================================= - -IR_QA_PROMPT = """## Task: Information Retrieval (IR) Level QA 생성 -단일 테이블에서 특정 셀의 값을 직접 검색하는 간단한 QA를 생성하세요. - -### Input Tables -{tables} - -### Requirements -1. 특정 행과 열이 교차하는 지점의 값을 묻는 질문 -2. 단답형으로 대답 가능한 질문 -3. 테이블에서 바로 찾을 수 있는 정보만 질문 - -### Output Format (JSON) -```json -{{ - "questions": [ - {{ - "id": "IR_001", - "difficulty": "IR", - "answer_type": "exact_match", - "question": "질문 내용", - "answer": "정확한 답변", - "evidence": {{ - "table_id": "table_1", - "row": "행 정보", - "column": "열 정보" - }}, - "tags": ["single_cell", "numeric"] - }} - ] -}} -``` - -### Generate {num_questions} IR-level QA pairs.""" - -ANALYSIS_QA_PROMPT = """## Task: Analysis Level QA 생성 -단일 테이블 내에서 데이터를 분석하는 질문을 생성하세요. - -### Input Tables -{tables} - -### Requirements -1. 특정 조건을 만족하는 행/열 찾기 -2. 최대값/최소값을 가진 항목 식별 -3. 특정 범위 내의 데이터 확인 -4. 
단답형 또는 짧은 설명형 답변 - -### Output Format (JSON) -```json -{{ - "questions": [ - {{ - "id": "ANALYSIS_001", - "difficulty": "Analysis", - "answer_type": "exact_match", - "question": "질문 내용", - "answer": "정확한 답변", - "reasoning": "답을 도출하는 과정 설명", - "evidence": {{ - "table_id": "table_1", - "relevant_cells": ["셀 위치1", "셀 위치2"] - }}, - "tags": ["conditional_search", "extrema"] - }} - ] -}} -``` - -### Generate {num_questions} Analysis-level QA pairs.""" - -COMPARE_QA_PROMPT = """## Task: Compare (Multi-hop) Level QA 생성 -여러 행, 열, 또는 복수 테이블 간 비교가 필요한 질문을 생성하세요. - -### Input Tables -{tables} - -### Requirements -1. 두 개 이상의 셀 값을 비교하는 질문 -2. 시계열 변화를 비교하는 질문 -3. 복수 테이블의 정보를 연결하는 질문 -4. 차이, 비율, 증감률 등을 묻는 질문 - -### Output Format (JSON) -```json -{{ - "questions": [ - {{ - "id": "COMPARE_001", - "difficulty": "Compare", - "answer_type": "calculation", - "question": "질문 내용", - "answer": "정확한 답변", - "calculation": "계산 과정", - "reasoning": "비교 논리 설명", - "evidence": {{ - "table_ids": ["table_1", "table_2"], - "compared_cells": [ - {{"table": "table_1", "row": "행1", "column": "열1", "value": "값1"}}, - {{"table": "table_1", "row": "행2", "column": "열2", "value": "값2"}} - ] - }}, - "tags": ["multi_hop", "comparison", "calculation"] - }} - ] -}} -``` - -### Generate {num_questions} Compare-level QA pairs.""" - -AGGREGATION_QA_PROMPT = """## Task: Aggregation Level QA 생성 -합계, 평균, 누적값 등 집계 연산이 필요한 질문을 생성하세요. - -### Input Tables -{tables} - -### Requirements -1. 특정 열/행의 합계를 구하는 질문 -2. 평균값을 계산하는 질문 -3. 누적 증가율을 구하는 질문 -4. 조건부 집계 (특정 조건을 만족하는 항목들의 합계 등) - -### Python Code for Verification -답변의 정확성 검증을 위해 Python 코드도 함께 생성하세요. 
- -### Output Format (JSON) -```json -{{ - "questions": [ - {{ - "id": "AGG_001", - "difficulty": "Aggregation", - "answer_type": "calculation", - "question": "질문 내용", - "answer": "정확한 수치 답변", - "calculation": "단계별 계산 과정", - "python_verification": "import pandas as pd\\n# 검증 코드", - "evidence": {{ - "table_id": "table_1", - "aggregated_cells": ["셀1", "셀2", "셀3"] - }}, - "tags": ["aggregation", "sum", "average"] - }} - ] -}} -``` - -### Generate {num_questions} Aggregation-level QA pairs.""" - -REASONING_QA_PROMPT = """## Task: Reasoning Level QA 생성 -여러 정보를 종합하여 복합적인 추론이 필요한 질문을 생성하세요. - -### Input Tables -{tables} - -### Requirements -1. 조건부 로직을 적용한 추론 질문 -2. 가정(Assumption)을 포함한 시나리오 기반 질문 -3. 인과관계를 파악하는 질문 -4. 여러 단계의 논리적 추론이 필요한 질문 - -### Chain-of-Thought Reasoning -답변 도출 과정을 단계별로 명시하세요. - -### Output Format (JSON) -```json -{{ - "questions": [ - {{ - "id": "REASON_001", - "difficulty": "Reasoning", - "answer_type": "descriptive", - "question": "질문 내용", - "answer": "답변", - "chain_of_thought": [ - "Step 1: ...", - "Step 2: ...", - "Step 3: ..." - ], - "assumptions": ["가정1", "가정2"], - "evidence": {{ - "table_ids": ["table_1"], - "relevant_data": ["관련 데이터 포인트"] - }}, - "tags": ["multi_step_reasoning", "conditional_logic"] - }} - ] -}} -``` - -### Generate {num_questions} Reasoning-level QA pairs.""" - -INSIGHT_QA_PROMPT = """## Task: Insight Level QA 생성 -데이터로부터 통찰이나 시사점을 도출하는 고난도 질문을 생성하세요. - -### Input Tables -{tables} - -### Requirements -1. 데이터 추세(Trend)를 파악하는 질문 -2. 이상치(Anomaly)나 특이 패턴을 발견하는 질문 -3. 데이터 기반 예측이나 권고를 요청하는 질문 -4. 
비즈니스적 함의를 도출하는 질문 - -### Output Format (JSON) -```json -{{ - "questions": [ - {{ - "id": "INSIGHT_001", - "difficulty": "Insight", - "answer_type": "descriptive", - "question": "질문 내용", - "answer": "통찰 및 답변", - "supporting_analysis": "분석 과정", - "key_findings": ["발견1", "발견2"], - "evidence": {{ - "table_ids": ["table_1", "table_2"], - "data_points": ["근거 데이터"] - }}, - "tags": ["trend_analysis", "insight", "recommendation"] - }} - ] -}} -``` - -### Generate {num_questions} Insight-level QA pairs.""" - -# ============================================================================= -# Follow-up Question Prompts (꼬리 질문) -# ============================================================================= - -FOLLOWUP_QA_PROMPT = """## Task: Follow-up Question (꼬리 질문) 생성 -주어진 초기 QA에 대해 연속적인 후속 질문을 생성하세요. - -### Original QA -{original_qa} - -### Input Tables -{tables} - -### Requirements -1. 원래 질문의 맥락을 유지하면서 심화된 질문 -2. 원래 답변에서 파생되는 추가 질문 -3. 관련된 다른 데이터 포인트를 탐색하는 질문 -4. 2-3개의 연속적인 후속 질문 체인 생성 - -### Output Format (JSON) -```json -{{ - "original_qa": {{ - "question": "원래 질문", - "answer": "원래 답변" - }}, - "followup_chain": [ - {{ - "id": "FOLLOWUP_001_1", - "question": "후속 질문 1", - "answer": "답변 1", - "reasoning": "이전 답변과의 연결고리" - }}, - {{ - "id": "FOLLOWUP_001_2", - "question": "후속 질문 2 (질문1 기반)", - "answer": "답변 2", - "reasoning": "이전 답변과의 연결고리" - }} - ] -}} -``` - -### Generate follow-up questions chain.""" - -# ============================================================================= -# Multi-Table QA Prompts -# ============================================================================= - -MULTI_TABLE_QA_PROMPT = """## Task: Multi-Table QA 생성 -복수의 테이블을 참조해야 답변 가능한 질문을 생성하세요. - -### Input Tables -{tables} - -### Requirements -1. 반드시 2개 이상의 테이블 정보를 조합해야 답변 가능한 질문 -2. 테이블 간 연결 키(Key)를 활용한 질문 -3. 서로 다른 테이블의 수치를 비교/연산하는 질문 -4. 
종합적인 분석이 필요한 질문 - -### Output Format (JSON) -```json -{{ - "questions": [ - {{ - "id": "MULTI_001", - "difficulty": "Compare", - "answer_type": "calculation", - "question": "질문 내용", - "answer": "답변", - "required_tables": ["table_1", "table_2"], - "join_logic": "테이블 연결 방법 설명", - "reasoning": "답변 도출 과정", - "tags": ["multi_table", "join", "cross_reference"] - }} - ] -}} -``` - -### Generate {num_questions} Multi-Table QA pairs.""" - -# ============================================================================= -# Evol-Instruct Prompts (난이도 진화) -# ============================================================================= - -EVOL_INSTRUCT_PROMPT = """## Task: Evol-Instruct - 질문 난이도 진화 -주어진 기본 질문을 더 복잡하고 도전적인 질문으로 진화시키세요. - -### Original Question -{original_question} - -### Evolution Strategies -다음 전략 중 하나 이상을 적용하여 질문을 진화시키세요: - -1. **제약 조건 추가 (Adding Constraints):** - - 특정 조건(나이, 기간, 금액 범위 등)을 추가 - -2. **심층 추론 (Deepening Reasoning):** - - 다단계 논리적 사고를 요구하도록 변환 - -3. **구체화 (Concretizing):** - - 추상적 질문을 구체적 시나리오로 대체 - -4. **입력 복잡도 증가 (Complicating Input):** - - 복수 테이블이나 추가 조건을 참조하도록 변환 - -### Input Tables -{tables} - -### Output Format (JSON) -```json -{{ - "original": {{ - "question": "원래 질문", - "difficulty": "원래 난이도" - }}, - "evolved": {{ - "question": "진화된 질문", - "difficulty": "새로운 난이도", - "evolution_strategy": "적용된 전략", - "answer": "새로운 답변", - "reasoning": "답변 도출 과정" - }} -}} -``` - -### Evolve the question.""" - -# ============================================================================= -# Quality Evaluation Prompts (LLM-as-Judge) -# ============================================================================= - -QA_EVALUATION_PROMPT = """## Task: QA 품질 평가 (LLM-as-Judge) -생성된 QA 쌍의 품질을 다면적으로 평가하세요. - -### QA to Evaluate -{qa_pair} - -### Reference Tables -{tables} - -### Evaluation Criteria (1-5점 척도) - -1. **정확성 (Correctness):** - - 답변이 테이블 데이터에 정확히 근거하는가? - - 수치 계산이 정확한가? - -2. **충실성 (Faithfulness):** - - 테이블에 없는 정보를 날조(Hallucination)하지 않았는가? 
- - 근거 데이터가 명확한가? - -3. **관련성 (Relevance):** - - 질문이 보험 도메인에서 실용적인가? - - 실제 고객이 물어볼 법한 질문인가? - -4. **난이도 적절성 (Difficulty Appropriateness):** - - 표기된 난이도와 실제 난이도가 일치하는가? - -5. **명확성 (Clarity):** - - 질문과 답변이 명확하고 모호하지 않은가? - -### Output Format (JSON) -```json -{{ - "evaluation": {{ - "correctness": {{"score": 5, "comment": "평가 코멘트"}}, - "faithfulness": {{"score": 5, "comment": "평가 코멘트"}}, - "relevance": {{"score": 5, "comment": "평가 코멘트"}}, - "difficulty_appropriateness": {{"score": 5, "comment": "평가 코멘트"}}, - "clarity": {{"score": 5, "comment": "평가 코멘트"}} - }}, - "overall_score": 5.0, - "pass": true, - "improvement_suggestions": ["개선 제안1", "개선 제안2"] -}} -``` - -### Evaluate the QA pair.""" - - -# ============================================================================= -# Helper Functions -# ============================================================================= - -def get_qa_prompt_by_difficulty(difficulty: str) -> str: - """난이도에 따른 프롬프트 반환""" - prompts = { - "IR": IR_QA_PROMPT, - "Analysis": ANALYSIS_QA_PROMPT, - "Compare": COMPARE_QA_PROMPT, - "Aggregation": AGGREGATION_QA_PROMPT, - "Reasoning": REASONING_QA_PROMPT, - "Insight": INSIGHT_QA_PROMPT, - } - return prompts.get(difficulty, IR_QA_PROMPT) - - -def format_tables_for_prompt(tables: dict) -> str: - """테이블 딕셔너리를 프롬프트용 문자열로 변환""" - formatted = [] - for table_id, table_content in tables.items(): - formatted.append(f"### {table_id}\n```markdown\n{table_content}\n```\n") - return "\n".join(formatted) diff --git a/tests/choi/QA_example/qa_generation.ipynb b/tests/choi/QA_example/qa_generation.ipynb deleted file mode 100644 index 34481fa..0000000 --- a/tests/choi/QA_example/qa_generation.ipynb +++ /dev/null @@ -1,1432 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "8386b6ee", - "metadata": {}, - "source": [ - "## 1. 
환경 설정" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "8972c3e2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "프로젝트 루트: /Users/jaehyeokchoi/Desktop/chois_toy/private/TableMagnifier\n", - "현재 작업 디렉토리: /Users/jaehyeokchoi/Desktop/chois_toy/private/TableMagnifier/QA_example\n" - ] - } - ], - "source": [ - "import sys\n", - "import json\n", - "import asyncio\n", - "from pathlib import Path\n", - "from datetime import datetime\n", - "\n", - "# 프로젝트 루트 경로 설정\n", - "project_root = Path.cwd().parent\n", - "if str(project_root) not in sys.path:\n", - " sys.path.insert(0, str(project_root))\n", - "\n", - "print(f\"프로젝트 루트: {project_root}\")\n", - "print(f\"현재 작업 디렉토리: {Path.cwd()}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "ca7782da", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ 모듈 임포트 완료 (리로드됨)\n" - ] - } - ], - "source": [ - "# QA Generator 모듈 임포트 (변경 시 리로드)\n", - "import importlib\n", - "import QA_example.qa_generator\n", - "importlib.reload(QA_example.qa_generator)\n", - "\n", - "from QA_example.qa_generator import (\n", - " InsuranceTableQAGenerator,\n", - " QADifficulty,\n", - " QAType,\n", - " generate_qa_from_tables,\n", - ")\n", - "\n", - "# Gemini API Pool 임포트\n", - "from polling_gemini import get_gemini_pool\n", - "\n", - "print(\"✅ 모듈 임포트 완료 (리로드됨)\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "68b1f92e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "📊 API Pool 상태:\n", - " 현재 키: key1\n", - " 총 키 수: 3\n" - ] - } - ], - "source": [ - "# API Pool 상태 확인\n", - "pool = get_gemini_pool()\n", - "print(\"📊 API Pool 상태:\")\n", - "print(f\" 현재 키: {pool.get_current_key_info()['name']}\")\n", - "print(f\" 총 키 수: {pool.get_current_key_info()['total_keys']}\")" - ] - }, - { - "cell_type": "markdown", - "id": "c38801d5", - "metadata": {}, - 
"source": [ - "## 2. 샘플 테이블 데이터 준비\n", - "\n", - "보험 도메인의 다양한 테이블 예시를 준비합니다." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "504b0780", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ 샘플 테이블 데이터 준비 완료\n", - " - TABLE_1: 보험료 산출 기초율\n", - " - TABLE_2: 해지환급금 예시\n", - " - TABLE_3: 보장 내역\n", - " - TABLE_4: 연령별 보험료\n" - ] - } - ], - "source": [ - "# 샘플 테이블 1: 보험료 산출 기초율 테이블 (실제 데이터 기반)\n", - "TABLE_1_PREMIUM_CALCULATION = \"\"\"\n", - "|구분|XX세|XX+1세|XX+2세|XX+3세|XX+4세|XX+5세|\n", - "|---|---|---|---|---|---|---|\n", - "|나이증가분(A)||1059|1357|1739|2229|2855|\n", - "|보험료 산출 기초율(위험률 등) 증가분(B=전년도 기준보험료의 최대 25% 가정)||10846|13897|17806|22815|29232|\n", - "|기준보험료(C=전년도 기준보험료+A+B)|42325|54321|69485|89030|114074|146161|\n", - "\"\"\".strip()\n", - "\n", - "# 샘플 테이블 2: 해지환급금 예시표\n", - "TABLE_2_SURRENDER_VALUE = \"\"\"\n", - "|경과기간|납입보험료 누계|해지환급금|환급률|\n", - "|---|---|---|---|\n", - "|1년|600000|0|0%|\n", - "|3년|1800000|540000|30%|\n", - "|5년|3000000|1650000|55%|\n", - "|10년|6000000|4800000|80%|\n", - "|15년|9000000|8550000|95%|\n", - "|20년(만기)|12000000|12000000|100%|\n", - "\"\"\".strip()\n", - "\n", - "# 샘플 테이블 3: 보장 내역표\n", - "TABLE_3_COVERAGE = \"\"\"\n", - "|보장항목|보장내용|지급금액|지급조건|\n", - "|---|---|---|---|\n", - "|사망보험금|일반사망|50000000|피보험자 사망시|\n", - "|사망보험금|재해사망|100000000|재해로 인한 사망시|\n", - "|암진단금|일반암|30000000|암 최초 진단시|\n", - "|암진단금|소액암|6000000|소액암 진단시|\n", - "|암진단금|유사암|3000000|유사암 진단시|\n", - "|입원비|일반입원|50000|1일당 (최대 180일)|\n", - "|입원비|암입원|100000|1일당 (최대 180일)|\n", - "|수술비|일반수술|500000|1회당|\n", - "|수술비|암수술|2000000|1회당|\n", - "\"\"\".strip()\n", - "\n", - "# 샘플 테이블 4: 연령별 보험료 예시\n", - "TABLE_4_AGE_PREMIUM = \"\"\"\n", - "|가입연령|성별|20년납_월보험료|전기납_월보험료|일시납_총보험료|\n", - "|---|---|---|---|---|\n", - "|30세|남|45000|38000|8500000|\n", - "|30세|여|42000|35000|7800000|\n", - "|40세|남|65000|52000|11500000|\n", - "|40세|여|58000|47000|10200000|\n", - "|50세|남|95000|75000|16000000|\n", - "|50세|여|82000|65000|14000000|\n", - 
"\"\"\".strip()\n", - "\n", - "print(\"✅ 샘플 테이블 데이터 준비 완료\")\n", - "print(f\" - TABLE_1: 보험료 산출 기초율\")\n", - "print(f\" - TABLE_2: 해지환급금 예시\")\n", - "print(f\" - TABLE_3: 보장 내역\")\n", - "print(f\" - TABLE_4: 연령별 보험료\")" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "c22033b5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "============================================================\n", - "📋 table_1_premium_calculation\n", - "============================================================\n" - ] - }, - { - "data": { - "text/markdown": [ - "|구분|XX세|XX+1세|XX+2세|XX+3세|XX+4세|XX+5세|\n", - "|---|---|---|---|---|---|---|\n", - "|나이증가분(A)||1059|1357|1739|2229|2855|\n", - "|보험료 산출 기초율(위험률 등) 증가분(B=전년도 기준보험료의 최대 25% 가정)||10846|13897|17806|22815|29232|\n", - "|기준보험료(C=전년도 기준보험료+A+B)|42325|54321|69485|89030|114074|146161|" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "============================================================\n", - "📋 table_2_surrender_value\n", - "============================================================\n" - ] - }, - { - "data": { - "text/markdown": [ - "|경과기간|납입보험료 누계|해지환급금|환급률|\n", - "|---|---|---|---|\n", - "|1년|600000|0|0%|\n", - "|3년|1800000|540000|30%|\n", - "|5년|3000000|1650000|55%|\n", - "|10년|6000000|4800000|80%|\n", - "|15년|9000000|8550000|95%|\n", - "|20년(만기)|12000000|12000000|100%|" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "============================================================\n", - "📋 table_3_coverage\n", - "============================================================\n" - ] - }, - { - "data": { - "text/markdown": [ - "|보장항목|보장내용|지급금액|지급조건|\n", - "|---|---|---|---|\n", - "|사망보험금|일반사망|50000000|피보험자 사망시|\n", - 
"|사망보험금|재해사망|100000000|재해로 인한 사망시|\n", - "|암진단금|일반암|30000000|암 최초 진단시|\n", - "|암진단금|소액암|6000000|소액암 진단시|\n", - "|암진단금|유사암|3000000|유사암 진단시|\n", - "|입원비|일반입원|50000|1일당 (최대 180일)|\n", - "|입원비|암입원|100000|1일당 (최대 180일)|\n", - "|수술비|일반수술|500000|1회당|\n", - "|수술비|암수술|2000000|1회당|" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "============================================================\n", - "📋 table_4_age_premium\n", - "============================================================\n" - ] - }, - { - "data": { - "text/markdown": [ - "|가입연령|성별|20년납_월보험료|전기납_월보험료|일시납_총보험료|\n", - "|---|---|---|---|---|\n", - "|30세|남|45000|38000|8500000|\n", - "|30세|여|42000|35000|7800000|\n", - "|40세|남|65000|52000|11500000|\n", - "|40세|여|58000|47000|10200000|\n", - "|50세|남|95000|75000|16000000|\n", - "|50세|여|82000|65000|14000000|" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# 테이블 딕셔너리 구성\n", - "tables = {\n", - " \"table_1_premium_calculation\": TABLE_1_PREMIUM_CALCULATION,\n", - " \"table_2_surrender_value\": TABLE_2_SURRENDER_VALUE,\n", - " \"table_3_coverage\": TABLE_3_COVERAGE,\n", - " \"table_4_age_premium\": TABLE_4_AGE_PREMIUM,\n", - "}\n", - "\n", - "# 테이블 미리보기\n", - "from IPython.display import display, Markdown\n", - "\n", - "for table_id, content in tables.items():\n", - " print(f\"\\n{'='*60}\")\n", - " print(f\"📋 {table_id}\")\n", - " print('='*60)\n", - " display(Markdown(content))" - ] - }, - { - "cell_type": "markdown", - "id": "0480f78c", - "metadata": {}, - "source": [ - "## 3. 
QA Generator 초기화" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "d4d5e0c6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ QA Generator 초기화 완료\n", - " 사용 모델: gemini-2.0-flash\n" - ] - } - ], - "source": [ - "# QA Generator 인스턴스 생성\n", - "qa_generator = InsuranceTableQAGenerator(\n", - " model_name=\"gemini-2.0-flash\" # 또는 \"gemini-1.5-flash\"\n", - ")\n", - "\n", - "print(\"✅ QA Generator 초기화 완료\")\n", - "print(f\" 사용 모델: gemini-2.0-flash\")" - ] - }, - { - "cell_type": "markdown", - "id": "f21cb567", - "metadata": {}, - "source": [ - "## 4. 난이도별 QA 생성\n", - "\n", - "### 4.1 IR (Information Retrieval) - 단순 정보 검색" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "11267214", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🔍 IR (Information Retrieval) QA 생성 중...\n", - "\n", - "✅ 3개의 IR QA 생성 완료\n", - "\n", - "📌 [IR_001] IR\n", - " Q: XX세의 기준보험료는 얼마인가요?\n", - " A: 42325\n", - " Evidence: {'table_id': 'table_1_premium_calculation', 'row': '기준보험료(C=전년도 기준보험료+A+B)', 'column': 'XX세'}\n", - "\n", - "📌 [IR_002] IR\n", - " Q: 경과기간이 1년일 때 해지환급금은 얼마인가요?\n", - " A: 0\n", - " Evidence: {'table_id': 'table_2_surrender_value', 'row': '1년', 'column': '해지환급금'}\n", - "\n", - "📌 [IR_003] IR\n", - " Q: 30세 남성의 20년납 월보험료는 얼마인가요?\n", - " A: 45000\n", - " Evidence: {'table_id': 'table_4_age_premium', 'row': '30세 (남)', 'column': '20년납_월보험료'}\n", - "\n", - "✅ 3개의 IR QA 생성 완료\n", - "\n", - "📌 [IR_001] IR\n", - " Q: XX세의 기준보험료는 얼마인가요?\n", - " A: 42325\n", - " Evidence: {'table_id': 'table_1_premium_calculation', 'row': '기준보험료(C=전년도 기준보험료+A+B)', 'column': 'XX세'}\n", - "\n", - "📌 [IR_002] IR\n", - " Q: 경과기간이 1년일 때 해지환급금은 얼마인가요?\n", - " A: 0\n", - " Evidence: {'table_id': 'table_2_surrender_value', 'row': '1년', 'column': '해지환급금'}\n", - "\n", - "📌 [IR_003] IR\n", - " Q: 30세 남성의 20년납 월보험료는 얼마인가요?\n", - " A: 45000\n", - " Evidence: {'table_id': 
'table_4_age_premium', 'row': '30세 (남)', 'column': '20년납_월보험료'}\n" - ] - } - ], - "source": [ - "# IR 난이도 QA 생성 (단답형, 특정 셀 검색)\n", - "print(\"🔍 IR (Information Retrieval) QA 생성 중...\")\n", - "\n", - "ir_qa_pairs = qa_generator.generate_qa_by_difficulty(\n", - " tables=tables,\n", - " difficulty=QADifficulty.IR,\n", - " num_questions=3\n", - ")\n", - "\n", - "print(f\"\\n✅ {len(ir_qa_pairs)}개의 IR QA 생성 완료\")\n", - "for qa in ir_qa_pairs:\n", - " print(f\"\\n📌 [{qa.id}] {qa.difficulty}\")\n", - " print(f\" Q: {qa.question}\")\n", - " print(f\" A: {qa.answer}\")\n", - " if qa.evidence:\n", - " print(f\" Evidence: {qa.evidence}\")" - ] - }, - { - "cell_type": "markdown", - "id": "6c4f477c", - "metadata": {}, - "source": [ - "### 4.2 Analysis - 분석적 질문" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "647c157f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "📊 Analysis QA 생성 중...\n", - "\n", - "✅ 3개의 Analysis QA 생성 완료\n", - "\n", - "📌 [ANALYSIS_001] Analysis\n", - " Q: 암진단금 보장항목 중에서 가장 높은 지급금액을 제공하는 보장내용은 무엇인가요?\n", - " A: 일반암\n", - " Reasoning: table_3_coverage에서 '보장항목'이 '암진단금'인 행들을 확인하고, 그 중 '지급금액'이 가장 높은 '보장내용'을 찾습니다. 일반암은 30,000,000원, 소액암은 6,000,000원, 유사암은 3,000,000원이므로 일반암이 가장 높습니다.\n", - "\n", - "📌 [ANALYSIS_002] Analysis\n", - " Q: 해지환급률이 50% 이상이 되려면 최소 몇 년의 경과기간이 필요합니까?\n", - " A: 5년\n", - " Reasoning: table_2_surrender_value에서 '환급률'이 50% 이상인 행들을 찾고, 그 중 가장 작은 '경과기간'을 확인합니다. 3년 경과 시 환급률은 30%이고, 5년 경과 시 환급률은 55%이므로 최소 5년이 필요합니다.\n", - "\n", - "📌 [ANALYSIS_003] Analysis\n", - " Q: 40세 가입자의 경우, 남성과 여성 중 누가 20년납_월보험료가 더 높은가요?\n", - " A: 남성\n", - " Reasoning: table_4_age_premium에서 '가입연령'이 40세인 행들을 찾고, 해당 행들의 '20년납_월보험료'를 비교합니다. 
40세 남성의 20년납_월보험료는 65,000원이고, 40세 여성의 20년납_월보험료는 58,000원이므로 남성이 더 높습니다.\n", - "\n", - "✅ 3개의 Analysis QA 생성 완료\n", - "\n", - "📌 [ANALYSIS_001] Analysis\n", - " Q: 암진단금 보장항목 중에서 가장 높은 지급금액을 제공하는 보장내용은 무엇인가요?\n", - " A: 일반암\n", - " Reasoning: table_3_coverage에서 '보장항목'이 '암진단금'인 행들을 확인하고, 그 중 '지급금액'이 가장 높은 '보장내용'을 찾습니다. 일반암은 30,000,000원, 소액암은 6,000,000원, 유사암은 3,000,000원이므로 일반암이 가장 높습니다.\n", - "\n", - "📌 [ANALYSIS_002] Analysis\n", - " Q: 해지환급률이 50% 이상이 되려면 최소 몇 년의 경과기간이 필요합니까?\n", - " A: 5년\n", - " Reasoning: table_2_surrender_value에서 '환급률'이 50% 이상인 행들을 찾고, 그 중 가장 작은 '경과기간'을 확인합니다. 3년 경과 시 환급률은 30%이고, 5년 경과 시 환급률은 55%이므로 최소 5년이 필요합니다.\n", - "\n", - "📌 [ANALYSIS_003] Analysis\n", - " Q: 40세 가입자의 경우, 남성과 여성 중 누가 20년납_월보험료가 더 높은가요?\n", - " A: 남성\n", - " Reasoning: table_4_age_premium에서 '가입연령'이 40세인 행들을 찾고, 해당 행들의 '20년납_월보험료'를 비교합니다. 40세 남성의 20년납_월보험료는 65,000원이고, 40세 여성의 20년납_월보험료는 58,000원이므로 남성이 더 높습니다.\n" - ] - } - ], - "source": [ - "# Analysis 난이도 QA 생성\n", - "print(\"📊 Analysis QA 생성 중...\")\n", - "\n", - "analysis_qa_pairs = qa_generator.generate_qa_by_difficulty(\n", - " tables=tables,\n", - " difficulty=QADifficulty.ANALYSIS,\n", - " num_questions=3\n", - ")\n", - "\n", - "print(f\"\\n✅ {len(analysis_qa_pairs)}개의 Analysis QA 생성 완료\")\n", - "for qa in analysis_qa_pairs:\n", - " print(f\"\\n📌 [{qa.id}] {qa.difficulty}\")\n", - " print(f\" Q: {qa.question}\")\n", - " print(f\" A: {qa.answer}\")\n", - " if qa.reasoning:\n", - " print(f\" Reasoning: {qa.reasoning}\")" - ] - }, - { - "cell_type": "markdown", - "id": "7c744604", - "metadata": {}, - "source": [ - "### 4.3 Compare (Multi-hop) - 비교 및 다중 추론" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "597806d6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "⚖️ Compare (Multi-hop) QA 생성 중...\n", - "\n", - "✅ 3개의 Compare QA 생성 완료\n", - "\n", - "📌 [COMPARE_001] Compare\n", - " Q: table_1_premium_calculation에서 XX+1세의 경우, 보험료 산출 기초율 증가분(B)은 나이증가분(A)보다 
얼마나 더 큰가요?\n", - " A: 9,787원 더 큽니다.\n", - " Reasoning: XX+1세의 보험료 산출 기초율 증가분(B) 값에서 나이증가분(A) 값을 차감하여 그 차이를 계산합니다.\n", - "\n", - "📌 [COMPARE_002] Compare\n", - " Q: table_2_surrender_value에서 보험 가입 후 5년 경과 시점과 10년 경과 시점의 해지환급금은 얼마나 차이가 나나요?\n", - " A: 3,150,000원 차이가 납니다.\n", - " Reasoning: 10년 경과 시점의 해지환급금에서 5년 경과 시점의 해지환급금을 차감하여 차이를 계산합니다.\n", - "\n", - "📌 [COMPARE_003] Compare\n", - " Q: table_4_age_premium에서 30세 남성이 20년납 월보험료로 가입했을 경우, 20년 만기 시점의 총 납입보험료와 table_2_surrender_value의 20년 경과 시점 해지환급금은 얼마나 차이가 나나요?\n", - " A: 1,200,000원 차이가 납니다.\n", - " Reasoning: table_4에서 30세 남성의 20년납 월보험료를 찾아 20년간의 총 납입보험료를 계산하고, table_2에서 20년 경과 시점의 해지환급금을 찾아 두 값의 차이를 계산합니다.\n", - "\n", - "✅ 3개의 Compare QA 생성 완료\n", - "\n", - "📌 [COMPARE_001] Compare\n", - " Q: table_1_premium_calculation에서 XX+1세의 경우, 보험료 산출 기초율 증가분(B)은 나이증가분(A)보다 얼마나 더 큰가요?\n", - " A: 9,787원 더 큽니다.\n", - " Reasoning: XX+1세의 보험료 산출 기초율 증가분(B) 값에서 나이증가분(A) 값을 차감하여 그 차이를 계산합니다.\n", - "\n", - "📌 [COMPARE_002] Compare\n", - " Q: table_2_surrender_value에서 보험 가입 후 5년 경과 시점과 10년 경과 시점의 해지환급금은 얼마나 차이가 나나요?\n", - " A: 3,150,000원 차이가 납니다.\n", - " Reasoning: 10년 경과 시점의 해지환급금에서 5년 경과 시점의 해지환급금을 차감하여 차이를 계산합니다.\n", - "\n", - "📌 [COMPARE_003] Compare\n", - " Q: table_4_age_premium에서 30세 남성이 20년납 월보험료로 가입했을 경우, 20년 만기 시점의 총 납입보험료와 table_2_surrender_value의 20년 경과 시점 해지환급금은 얼마나 차이가 나나요?\n", - " A: 1,200,000원 차이가 납니다.\n", - " Reasoning: table_4에서 30세 남성의 20년납 월보험료를 찾아 20년간의 총 납입보험료를 계산하고, table_2에서 20년 경과 시점의 해지환급금을 찾아 두 값의 차이를 계산합니다.\n" - ] - } - ], - "source": [ - "# Compare 난이도 QA 생성 (Multi-hop 추론)\n", - "print(\"⚖️ Compare (Multi-hop) QA 생성 중...\")\n", - "\n", - "compare_qa_pairs = qa_generator.generate_qa_by_difficulty(\n", - " tables=tables,\n", - " difficulty=QADifficulty.COMPARE,\n", - " num_questions=3\n", - ")\n", - "\n", - "print(f\"\\n✅ {len(compare_qa_pairs)}개의 Compare QA 생성 완료\")\n", - "for qa in compare_qa_pairs:\n", - " print(f\"\\n📌 [{qa.id}] {qa.difficulty}\")\n", - " print(f\" Q: {qa.question}\")\n", - " print(f\" 
A: {qa.answer}\")\n", - " if qa.reasoning:\n", - " print(f\" Reasoning: {qa.reasoning}\")" - ] - }, - { - "cell_type": "markdown", - "id": "c91c5b2e", - "metadata": {}, - "source": [ - "### 4.4 Aggregation - 집계 연산" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "c8a555ce", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "➕ Aggregation QA 생성 중...\n", - "\n", - "✅ 3개의 Aggregation QA 생성 완료\n", - "\n", - "📌 [AGG_001] Aggregation\n", - " Q: table_1_premium_calculation에서 XX+1세부터 XX+5세까지 '나이증가분(A)'의 총합은 얼마인가요?\n", - " A: 9239\n", - " Python 검증 코드 포함: ✅\n", - "\n", - "📌 [AGG_002] Aggregation\n", - " Q: table_2_surrender_value에서 경과기간 1년, 3년, 5년, 10년, 15년, 20년 시점의 '해지환급금'의 평균은 얼마인가요?\n", - " A: 4590000\n", - " Python 검증 코드 포함: ✅\n", - "\n", - "📌 [AGG_003] Aggregation\n", - " Q: table_4_age_premium에서 30세 남성이 20년납으로 가입했을 경우, 20년 동안 총 납입해야 할 보험료는 얼마인가요?\n", - " A: 10800000\n", - " Python 검증 코드 포함: ✅\n", - "\n", - "✅ 3개의 Aggregation QA 생성 완료\n", - "\n", - "📌 [AGG_001] Aggregation\n", - " Q: table_1_premium_calculation에서 XX+1세부터 XX+5세까지 '나이증가분(A)'의 총합은 얼마인가요?\n", - " A: 9239\n", - " Python 검증 코드 포함: ✅\n", - "\n", - "📌 [AGG_002] Aggregation\n", - " Q: table_2_surrender_value에서 경과기간 1년, 3년, 5년, 10년, 15년, 20년 시점의 '해지환급금'의 평균은 얼마인가요?\n", - " A: 4590000\n", - " Python 검증 코드 포함: ✅\n", - "\n", - "📌 [AGG_003] Aggregation\n", - " Q: table_4_age_premium에서 30세 남성이 20년납으로 가입했을 경우, 20년 동안 총 납입해야 할 보험료는 얼마인가요?\n", - " A: 10800000\n", - " Python 검증 코드 포함: ✅\n" - ] - } - ], - "source": [ - "# Aggregation 난이도 QA 생성 (수치 계산)\n", - "print(\"➕ Aggregation QA 생성 중...\")\n", - "\n", - "agg_qa_pairs = qa_generator.generate_qa_by_difficulty(\n", - " tables=tables,\n", - " difficulty=QADifficulty.AGGREGATION,\n", - " num_questions=3\n", - ")\n", - "\n", - "print(f\"\\n✅ {len(agg_qa_pairs)}개의 Aggregation QA 생성 완료\")\n", - "for qa in agg_qa_pairs:\n", - " print(f\"\\n📌 [{qa.id}] {qa.difficulty}\")\n", - " print(f\" Q: 
{qa.question}\")\n", - " print(f\" A: {qa.answer}\")\n", - " if qa.python_verification:\n", - " print(f\" Python 검증 코드 포함: ✅\")" - ] - }, - { - "cell_type": "markdown", - "id": "05f6c85e", - "metadata": {}, - "source": [ - "### 4.5 Reasoning - 복합 추론" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "0a3ce609", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🧠 Reasoning QA 생성 중...\n", - "\n", - "✅ 3개의 Reasoning QA 생성 완료\n", - "\n", - "📌 [REASON_001] Reasoning\n", - " Q: 30세 남성이 20년납 월보험료로 가입했을 때, `table_1`의 '기준보험료' 증가율이 '월보험료'에 동일하게 적용된다고 가정하면, 가입 후 3년이 지난 시점(즉, 33세)의 월보험료는 얼마가 될까요?\n", - " A: 약 94,658원\n", - " Chain of Thought:\n", - " Step 1: Step 1: `table_4`에서 30세 남성의 20년납 월보험료를 확인합니다: 45,000원.\n", - " Step 2: Step 2: `table_1`에서 'XX세'를 30세로 가정하고, 'XX세' (30세)의 기준보험료(C)는 42,325원, 'XX+3세' (33세)의 기준보험료(C)는 89,030원임을 확인합니다.\n", - " Step 3: Step 3: 30세 대비 33세의 기준보험료 증가율을 계산합니다: (89,030 / 42,325).\n", - " Step 4: Step 4: 이 증가율을 30세 남성의 초기 월보험료에 적용하여 33세의 월보험료를 계산합니다: 45,000원 * (89,030 / 42,325) = 94,657.76...\n", - " Step 5: Step 5: 계산된 값을 반올림하여 최종 월보험료를 도출합니다.\n", - "\n", - "📌 [REASON_002] Reasoning\n", - " Q: 3년 동안 보험료를 납입한 계약자가 일반암 진단을 받고, 진단금을 수령한 직후 보험을 해지한다면, 이 계약자가 총 얼마의 금액을 받게 될까요? (단, 납입보험료는 해지환급금 계산에만 영향을 미치며, 진단금 수령 시점까지의 납입보험료는 해지환급금 누계에 포함된다고 가정합니다.)\n", - " A: 30,540,000원\n", - " Chain of Thought:\n", - " Step 1: Step 1: `table_3`에서 '일반암' 진단금을 확인합니다: 30,000,000원.\n", - " Step 2: Step 2: `table_2`에서 '경과기간' 3년 시점의 '해지환급금'을 확인합니다: 540,000원.\n", - " Step 3: Step 3: 계약자가 받게 될 총 금액은 '일반암 진단금'과 '해지환급금'의 합계입니다.\n", - " Step 4: Step 4: 총 수령액 = 30,000,000원 + 540,000원 = 30,540,000원.\n", - "\n", - "📌 [REASON_003] Reasoning\n", - " Q: 40세 남성과 40세 여성이 각각 20년납으로 보험에 가입했다고 가정합니다. 두 사람 모두 가입 후 5년이 지난 시점에 일반암 진단을 받고, 10일간 암으로 입원했으며, 1회의 암수술을 받았을 경우, 다음 두 가지 질문에 답하세요.\n", - "1. 두 사람이 5년간 납입한 총 보험료의 차이는 얼마입니까?\n", - "2. 두 사람이 받게 되는 총 보장금액의 차이는 얼마입니까?\n", - " A: 1. 5년간 납입 총 보험료 차이: 420,000원\n", - "2. 
총 보장금액 차이: 0원\n", - " Chain of Thought:\n", - " Step 1: Step 1: `table_4`에서 40세 남성과 40세 여성의 20년납 월보험료를 확인합니다.\n", - " Step 2: - 40세 남성: 65,000원\n", - " Step 3: - 40세 여성: 58,000원\n", - " Step 4: Step 2: 각 성별로 5년간 납입할 총 보험료를 계산합니다 (월보험료 * 12개월 * 5년).\n", - " Step 5: - 남성 총 납입 보험료: 65,000원 * 12 * 5 = 3,900,000원\n", - " Step 6: - 여성 총 납입 보험료: 58,000원 * 12 * 5 = 3,480,000원\n", - " Step 7: Step 3: 두 사람의 5년간 납입 총 보험료 차이를 계산합니다: 3,900,000원 - 3,480,000원 = 420,000원.\n", - " Step 8: Step 4: `table_3`에서 일반암 진단금, 암입원비 (10일), 암수술비 (1회)를 확인합니다.\n", - " Step 9: - 일반암 진단금: 30,000,000원\n", - " Step 10: - 암입원비: 100,000원 (1일당) * 10일 = 1,000,000원\n", - " Step 11: - 암수술비: 2,000,000원 (1회당)\n", - " Step 12: Step 5: 각 사람이 받게 되는 총 보장금액을 계산합니다 (성별에 따른 보장금액 차이는 `table_3`에 명시되어 있지 않으므로 동일하다고 가정).\n", - " Step 13: - 총 보장금액 = 30,000,000원 + 1,000,000원 + 2,000,000원 = 33,000,000원.\n", - " Step 14: Step 6: 두 사람의 총 보장금액 차이를 계산합니다: 33,000,000원 - 33,000,000원 = 0원.\n", - "\n", - "✅ 3개의 Reasoning QA 생성 완료\n", - "\n", - "📌 [REASON_001] Reasoning\n", - " Q: 30세 남성이 20년납 월보험료로 가입했을 때, `table_1`의 '기준보험료' 증가율이 '월보험료'에 동일하게 적용된다고 가정하면, 가입 후 3년이 지난 시점(즉, 33세)의 월보험료는 얼마가 될까요?\n", - " A: 약 94,658원\n", - " Chain of Thought:\n", - " Step 1: Step 1: `table_4`에서 30세 남성의 20년납 월보험료를 확인합니다: 45,000원.\n", - " Step 2: Step 2: `table_1`에서 'XX세'를 30세로 가정하고, 'XX세' (30세)의 기준보험료(C)는 42,325원, 'XX+3세' (33세)의 기준보험료(C)는 89,030원임을 확인합니다.\n", - " Step 3: Step 3: 30세 대비 33세의 기준보험료 증가율을 계산합니다: (89,030 / 42,325).\n", - " Step 4: Step 4: 이 증가율을 30세 남성의 초기 월보험료에 적용하여 33세의 월보험료를 계산합니다: 45,000원 * (89,030 / 42,325) = 94,657.76...\n", - " Step 5: Step 5: 계산된 값을 반올림하여 최종 월보험료를 도출합니다.\n", - "\n", - "📌 [REASON_002] Reasoning\n", - " Q: 3년 동안 보험료를 납입한 계약자가 일반암 진단을 받고, 진단금을 수령한 직후 보험을 해지한다면, 이 계약자가 총 얼마의 금액을 받게 될까요? 
(단, 납입보험료는 해지환급금 계산에만 영향을 미치며, 진단금 수령 시점까지의 납입보험료는 해지환급금 누계에 포함된다고 가정합니다.)\n", - " A: 30,540,000원\n", - " Chain of Thought:\n", - " Step 1: Step 1: `table_3`에서 '일반암' 진단금을 확인합니다: 30,000,000원.\n", - " Step 2: Step 2: `table_2`에서 '경과기간' 3년 시점의 '해지환급금'을 확인합니다: 540,000원.\n", - " Step 3: Step 3: 계약자가 받게 될 총 금액은 '일반암 진단금'과 '해지환급금'의 합계입니다.\n", - " Step 4: Step 4: 총 수령액 = 30,000,000원 + 540,000원 = 30,540,000원.\n", - "\n", - "📌 [REASON_003] Reasoning\n", - " Q: 40세 남성과 40세 여성이 각각 20년납으로 보험에 가입했다고 가정합니다. 두 사람 모두 가입 후 5년이 지난 시점에 일반암 진단을 받고, 10일간 암으로 입원했으며, 1회의 암수술을 받았을 경우, 다음 두 가지 질문에 답하세요.\n", - "1. 두 사람이 5년간 납입한 총 보험료의 차이는 얼마입니까?\n", - "2. 두 사람이 받게 되는 총 보장금액의 차이는 얼마입니까?\n", - " A: 1. 5년간 납입 총 보험료 차이: 420,000원\n", - "2. 총 보장금액 차이: 0원\n", - " Chain of Thought:\n", - " Step 1: Step 1: `table_4`에서 40세 남성과 40세 여성의 20년납 월보험료를 확인합니다.\n", - " Step 2: - 40세 남성: 65,000원\n", - " Step 3: - 40세 여성: 58,000원\n", - " Step 4: Step 2: 각 성별로 5년간 납입할 총 보험료를 계산합니다 (월보험료 * 12개월 * 5년).\n", - " Step 5: - 남성 총 납입 보험료: 65,000원 * 12 * 5 = 3,900,000원\n", - " Step 6: - 여성 총 납입 보험료: 58,000원 * 12 * 5 = 3,480,000원\n", - " Step 7: Step 3: 두 사람의 5년간 납입 총 보험료 차이를 계산합니다: 3,900,000원 - 3,480,000원 = 420,000원.\n", - " Step 8: Step 4: `table_3`에서 일반암 진단금, 암입원비 (10일), 암수술비 (1회)를 확인합니다.\n", - " Step 9: - 일반암 진단금: 30,000,000원\n", - " Step 10: - 암입원비: 100,000원 (1일당) * 10일 = 1,000,000원\n", - " Step 11: - 암수술비: 2,000,000원 (1회당)\n", - " Step 12: Step 5: 각 사람이 받게 되는 총 보장금액을 계산합니다 (성별에 따른 보장금액 차이는 `table_3`에 명시되어 있지 않으므로 동일하다고 가정).\n", - " Step 13: - 총 보장금액 = 30,000,000원 + 1,000,000원 + 2,000,000원 = 33,000,000원.\n", - " Step 14: Step 6: 두 사람의 총 보장금액 차이를 계산합니다: 33,000,000원 - 33,000,000원 = 0원.\n" - ] - } - ], - "source": [ - "# Reasoning 난이도 QA 생성 (Chain-of-Thought)\n", - "print(\"🧠 Reasoning QA 생성 중...\")\n", - "\n", - "reasoning_qa_pairs = qa_generator.generate_qa_by_difficulty(\n", - " tables=tables,\n", - " difficulty=QADifficulty.REASONING,\n", - " num_questions=3\n", - ")\n", - "\n", - "print(f\"\\n✅ 
{len(reasoning_qa_pairs)}개의 Reasoning QA 생성 완료\")\n", - "for qa in reasoning_qa_pairs:\n", - " print(f\"\\n📌 [{qa.id}] {qa.difficulty}\")\n", - " print(f\" Q: {qa.question}\")\n", - " print(f\" A: {qa.answer}\")\n", - " if qa.chain_of_thought:\n", - " print(f\" Chain of Thought:\")\n", - " for i, step in enumerate(qa.chain_of_thought, 1):\n", - " print(f\" Step {i}: {step}\")" - ] - }, - { - "cell_type": "markdown", - "id": "8c33297d", - "metadata": {}, - "source": [ - "### 4.6 Insight - 통찰 도출" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "72b940c2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "💡 Insight QA 생성 중...\n", - "\n", - "✅ 2개의 Insight QA 생성 완료\n", - "\n", - "📌 [INSIGHT_001] Insight\n", - " Q: 이 보험 상품의 해지환급금 데이터를 분석했을 때, 조기 해지가 고객에게 미치는 재정적 영향은 무엇이며, 이는 상품 설계 철학에 대해 어떤 시사점을 제공합니까?\n", - " A: 이 보험 상품을 조기에 해지할 경우, 고객은 상당한 재정적 손실을 입게 됩니다. 예를 들어, 1년 경과 후 해지 시 납입보험료 누계의 0%만 환급되며, 3년 후에도 30%만 돌려받습니다. 납입한 원금(환급률 100%)을 온전히 돌려받으려면 20년 만기까지 유지해야 합니다. 이는 이 상품이 장기적인 보장과 유지를 전제로 설계되었으며, 단기 해지에 대해 높은 ...\n", - "\n", - "📌 [INSIGHT_002] Insight\n", - " Q: 보험료 산출 구성 요소와 연령 및 성별에 따른 월 보험료 데이터를 종합적으로 고려할 때, 이 보험 상품이 위험을 평가하고 보험료를 책정하는 방식에 대해 어떤 통찰을 얻을 수 있으며, 특히 연령이 증가함에 따라 위험 평가가 어떻게 변화하는 것으로 보입니까?\n", - " A: 이 보험 상품은 연령이 증가함에 따라 위험 관련 요소를 보험료 산출에 매우 중요하게 반영하고 있습니다. `table_1`에서 '나이증가분(A)'보다 '보험료 산출 기초율(위험률 등) 증가분(B)'이 훨씬 더 큰 폭으로 보험료를 상승시키는 주요 요인임을 알 수 있습니다. 이는 연령이 높아질수록 질병 발생률이나 사고 위험 등 보험사가 인지하는 위험도가 급격히 ...\n", - "\n", - "✅ 2개의 Insight QA 생성 완료\n", - "\n", - "📌 [INSIGHT_001] Insight\n", - " Q: 이 보험 상품의 해지환급금 데이터를 분석했을 때, 조기 해지가 고객에게 미치는 재정적 영향은 무엇이며, 이는 상품 설계 철학에 대해 어떤 시사점을 제공합니까?\n", - " A: 이 보험 상품을 조기에 해지할 경우, 고객은 상당한 재정적 손실을 입게 됩니다. 예를 들어, 1년 경과 후 해지 시 납입보험료 누계의 0%만 환급되며, 3년 후에도 30%만 돌려받습니다. 납입한 원금(환급률 100%)을 온전히 돌려받으려면 20년 만기까지 유지해야 합니다. 
이는 이 상품이 장기적인 보장과 유지를 전제로 설계되었으며, 단기 해지에 대해 높은 ...\n", - "\n", - "📌 [INSIGHT_002] Insight\n", - " Q: 보험료 산출 구성 요소와 연령 및 성별에 따른 월 보험료 데이터를 종합적으로 고려할 때, 이 보험 상품이 위험을 평가하고 보험료를 책정하는 방식에 대해 어떤 통찰을 얻을 수 있으며, 특히 연령이 증가함에 따라 위험 평가가 어떻게 변화하는 것으로 보입니까?\n", - " A: 이 보험 상품은 연령이 증가함에 따라 위험 관련 요소를 보험료 산출에 매우 중요하게 반영하고 있습니다. `table_1`에서 '나이증가분(A)'보다 '보험료 산출 기초율(위험률 등) 증가분(B)'이 훨씬 더 큰 폭으로 보험료를 상승시키는 주요 요인임을 알 수 있습니다. 이는 연령이 높아질수록 질병 발생률이나 사고 위험 등 보험사가 인지하는 위험도가 급격히 ...\n" - ] - } - ], - "source": [ - "# Insight 난이도 QA 생성 (서술형)\n", - "print(\"💡 Insight QA 생성 중...\")\n", - "\n", - "insight_qa_pairs = qa_generator.generate_qa_by_difficulty(\n", - " tables=tables,\n", - " difficulty=QADifficulty.INSIGHT,\n", - " num_questions=2\n", - ")\n", - "\n", - "print(f\"\\n✅ {len(insight_qa_pairs)}개의 Insight QA 생성 완료\")\n", - "for qa in insight_qa_pairs:\n", - " print(f\"\\n📌 [{qa.id}] {qa.difficulty}\")\n", - " print(f\" Q: {qa.question}\")\n", - " print(f\" A: {qa.answer[:200]}...\" if len(qa.answer) > 200 else f\" A: {qa.answer}\")" - ] - }, - { - "cell_type": "markdown", - "id": "15f40c4d", - "metadata": {}, - "source": [ - "## 5. Multi-Table QA 생성\n", - "\n", - "복수의 테이블을 참조해야 답변 가능한 질문을 생성합니다." - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "2dbbbad2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🔗 Multi-Table QA 생성 중...\n", - "\n", - "✅ 3개의 Multi-Table QA 생성 완료\n", - "\n", - "📌 [MULTI_001] Reasoning\n", - " Q: table_2에서 10년 경과 시점의 납입보험료 누계가 6,000,000원일 때, 만약 이 보험이 20년납 상품이었다면 월 평균 보험료는 얼마였을까요? 그리고 이 월 평균 보험료는 table_4의 30세 남성 20년납 월보험료(45,000원)와 비교했을 때 얼마나 차이가 나나요?\n", - " A: table_2의 10년 경과 시점 납입보험료 누계(6,000,000원)를 기준으로 20년납 상품의 월 평균 보험료를 계산하면 50,000원입니다. 이는 table_4의 30세 남성 20년납 월보험료(45,000원)보다 5,000원 더 높은 금액입니다.\n", - "\n", - "📌 [MULTI_002] Calculation\n", - " Q: table_4에서 40세 여성이 전기납 월보험료 상품에 가입했다고 할 때, 이 보험료가 table_1의 'XX세' 기준보험료(C)에 해당한다고 가정합니다. 이 경우, 'XX+2세' 시점의 기준보험료는 얼마로 예상할 수 있을까요? 
(단, table_1의 나이증가분(A)와 보험료 산출 기초율 증가분(B)는 table_1의 해당 값을 따릅니다.)\n", - " A: table_4의 40세 여성 전기납 월보험료 47,000원을 'XX세' 기준보험료로 가정하면, 'XX+1세' 시점의 기준보험료는 47,000 + 1,059 + 10,846 = 58,905원입니다. 이어서 'XX+2세' 시점의 기준보험료는 58,905 + 1,357 + 13,897 = 74,159원입니다.\n", - "\n", - "📌 [MULTI_003] Reasoning\n", - " Q: 40세 남성이 20년납 월보험료 상품에 가입하여 5년 동안 보험료를 납입했을 경우, 총 납입한 보험료는 얼마인가요? 만약 이 시점에 일반암 진단을 받았다면, 총 납입 보험료 대비 암진단금은 몇 배에 해당하나요? 또한, 만약 이 보험의 해지환급금이 table_2의 '5년 경과' 시점의 납입보험료 누계(3,000,000원)에 해당하는 해지환급금과 동일하다고 가정할 때, 암진단금은 해지환급금보다 얼마나 더 많은 금액인가요?\n", - " A: 40세 남성이 5년 동안 납입한 총 보험료는 3,900,000원입니다. 이 시점에 일반암 진단을 받았다면, 총 납입 보험료 대비 암진단금(30,000,000원)은 약 7.69배에 해당합니다. 또한, table_2의 5년 경과 시점 해지환급금(1,650,000원)과 비교했을 때, 암진단금은 해지환급금보다 28,350,000원 더 많은 금액입니다.\n", - "\n", - "✅ 3개의 Multi-Table QA 생성 완료\n", - "\n", - "📌 [MULTI_001] Reasoning\n", - " Q: table_2에서 10년 경과 시점의 납입보험료 누계가 6,000,000원일 때, 만약 이 보험이 20년납 상품이었다면 월 평균 보험료는 얼마였을까요? 그리고 이 월 평균 보험료는 table_4의 30세 남성 20년납 월보험료(45,000원)와 비교했을 때 얼마나 차이가 나나요?\n", - " A: table_2의 10년 경과 시점 납입보험료 누계(6,000,000원)를 기준으로 20년납 상품의 월 평균 보험료를 계산하면 50,000원입니다. 이는 table_4의 30세 남성 20년납 월보험료(45,000원)보다 5,000원 더 높은 금액입니다.\n", - "\n", - "📌 [MULTI_002] Calculation\n", - " Q: table_4에서 40세 여성이 전기납 월보험료 상품에 가입했다고 할 때, 이 보험료가 table_1의 'XX세' 기준보험료(C)에 해당한다고 가정합니다. 이 경우, 'XX+2세' 시점의 기준보험료는 얼마로 예상할 수 있을까요? (단, table_1의 나이증가분(A)와 보험료 산출 기초율 증가분(B)는 table_1의 해당 값을 따릅니다.)\n", - " A: table_4의 40세 여성 전기납 월보험료 47,000원을 'XX세' 기준보험료로 가정하면, 'XX+1세' 시점의 기준보험료는 47,000 + 1,059 + 10,846 = 58,905원입니다. 이어서 'XX+2세' 시점의 기준보험료는 58,905 + 1,357 + 13,897 = 74,159원입니다.\n", - "\n", - "📌 [MULTI_003] Reasoning\n", - " Q: 40세 남성이 20년납 월보험료 상품에 가입하여 5년 동안 보험료를 납입했을 경우, 총 납입한 보험료는 얼마인가요? 만약 이 시점에 일반암 진단을 받았다면, 총 납입 보험료 대비 암진단금은 몇 배에 해당하나요? 또한, 만약 이 보험의 해지환급금이 table_2의 '5년 경과' 시점의 납입보험료 누계(3,000,000원)에 해당하는 해지환급금과 동일하다고 가정할 때, 암진단금은 해지환급금보다 얼마나 더 많은 금액인가요?\n", - " A: 40세 남성이 5년 동안 납입한 총 보험료는 3,900,000원입니다. 이 시점에 일반암 진단을 받았다면, 총 납입 보험료 대비 암진단금(30,000,000원)은 약 7.69배에 해당합니다. 
또한, table_2의 5년 경과 시점 해지환급금(1,650,000원)과 비교했을 때, 암진단금은 해지환급금보다 28,350,000원 더 많은 금액입니다.\n" - ] - } - ], - "source": [ - "# Multi-Table QA 생성\n", - "print(\"🔗 Multi-Table QA 생성 중...\")\n", - "\n", - "multi_table_qa = qa_generator.generate_multi_table_qa(\n", - " tables=tables,\n", - " num_questions=3\n", - ")\n", - "\n", - "print(f\"\\n✅ {len(multi_table_qa)}개의 Multi-Table QA 생성 완료\")\n", - "for qa in multi_table_qa:\n", - " print(f\"\\n📌 [{qa.id}] {qa.difficulty}\")\n", - " print(f\" Q: {qa.question}\")\n", - " print(f\" A: {qa.answer}\")\n", - " if qa.evidence and 'required_tables' in str(qa.evidence):\n", - " print(f\" 참조 테이블: {qa.evidence}\")" - ] - }, - { - "cell_type": "markdown", - "id": "cc4df731", - "metadata": {}, - "source": [ - "## 6. 꼬리 질문 (Follow-up) 생성\n", - "\n", - "특정 QA에 대해 연속적인 후속 질문 체인을 생성합니다." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "d2c2b8a1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🔄 Follow-up QA 생성 중...\n", - "\n", - "원본 QA:\n", - " Q: XX세의 기준보험료는 얼마인가요?\n", - " A: 42325\n", - "\n", - "✅ 3개의 Follow-up QA 생성 완료\n", - "\n", - " 🔹 Follow-up 1:\n", - " Q: XX세에서 XX+1세로 나이가 증가할 때, 기준보험료는 얼마나 증가하나요?\n", - " A: XX세의 기준보험료는 42325원이고, XX+1세의 기준보험료는 54321원이므로, 54321 - 42325 = 11996원 증가합니다.\n", - "\n", - " 🔹 Follow-up 2:\n", - " Q: 이 11996원의 기준보험료 증가분은 주로 '나이증가분(A)'과 '보험료 산출 기초율(위험률 등) 증가분(B)' 중 어떤 요인에 의해 발생한 것인가요?\n", - " A: XX+1세의 '나이증가분(A)'은 1059원이고, '보험료 산출 기초율(위험률 등) 증가분(B)'은 10846원입니다. 두 요인 중 '보험료 산출 기초율(위험률 등) 증가분(B)'이 훨씬 크므로, 주로 이 요인에 의해 기준보험료가 증가했습니다.\n", - "\n", - " 🔹 Follow-up 3:\n", - " Q: '보험료 산출 기초율(위험률 등) 증가분(B)'이 '전년도 기준보험료의 최대 25% 가정'이라는 설명에 따르면, XX+1세의 B값(10846원)은 XX세의 기준보험료(42325원)의 25%를 초과하나요?\n", - " A: XX세의 기준보험료 42325원의 25%는 42325 * 0.25 = 10581.25원입니다. 
XX+1세의 '보험료 산출 기초율 증가분(B)'은 10846원이므로, 10581.25원을 초과합니다.\n", - "\n", - "✅ 3개의 Follow-up QA 생성 완료\n", - "\n", - " 🔹 Follow-up 1:\n", - " Q: XX세에서 XX+1세로 나이가 증가할 때, 기준보험료는 얼마나 증가하나요?\n", - " A: XX세의 기준보험료는 42325원이고, XX+1세의 기준보험료는 54321원이므로, 54321 - 42325 = 11996원 증가합니다.\n", - "\n", - " 🔹 Follow-up 2:\n", - " Q: 이 11996원의 기준보험료 증가분은 주로 '나이증가분(A)'과 '보험료 산출 기초율(위험률 등) 증가분(B)' 중 어떤 요인에 의해 발생한 것인가요?\n", - " A: XX+1세의 '나이증가분(A)'은 1059원이고, '보험료 산출 기초율(위험률 등) 증가분(B)'은 10846원입니다. 두 요인 중 '보험료 산출 기초율(위험률 등) 증가분(B)'이 훨씬 크므로, 주로 이 요인에 의해 기준보험료가 증가했습니다.\n", - "\n", - " 🔹 Follow-up 3:\n", - " Q: '보험료 산출 기초율(위험률 등) 증가분(B)'이 '전년도 기준보험료의 최대 25% 가정'이라는 설명에 따르면, XX+1세의 B값(10846원)은 XX세의 기준보험료(42325원)의 25%를 초과하나요?\n", - " A: XX세의 기준보험료 42325원의 25%는 42325 * 0.25 = 10581.25원입니다. XX+1세의 '보험료 산출 기초율 증가분(B)'은 10846원이므로, 10581.25원을 초과합니다.\n" - ] - } - ], - "source": [ - "# 꼬리 질문 생성 (IR QA 기반)\n", - "if ir_qa_pairs:\n", - " print(\"🔄 Follow-up QA 생성 중...\")\n", - " \n", - " original_qa = ir_qa_pairs[0]\n", - " print(f\"\\n원본 QA:\")\n", - " print(f\" Q: {original_qa.question}\")\n", - " print(f\" A: {original_qa.answer}\")\n", - " \n", - " followup_result = qa_generator.generate_followup_qa(\n", - " tables=tables,\n", - " original_qa=original_qa\n", - " )\n", - " \n", - " if followup_result and 'followup_chain' in followup_result:\n", - " print(f\"\\n✅ {len(followup_result['followup_chain'])}개의 Follow-up QA 생성 완료\")\n", - " for i, followup in enumerate(followup_result['followup_chain'], 1):\n", - " print(f\"\\n 🔹 Follow-up {i}:\")\n", - " print(f\" Q: {followup.get('question', 'N/A')}\")\n", - " print(f\" A: {followup.get('answer', 'N/A')}\")\n", - " else:\n", - " print(f\"\\n결과: {followup_result}\")" - ] - }, - { - "cell_type": "markdown", - "id": "c1b07346", - "metadata": {}, - "source": [ - "## 7. Evol-Instruct: 질문 난이도 진화\n", - "\n", - "기본 질문을 더 복잡한 질문으로 진화시킵니다." 
- ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "1ea8a3b0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🧬 Evol-Instruct 적용 중...\n", - "\n", - "원본 질문: 40세 남성의 20년납 월보험료는 얼마인가?\n", - "\n", - "✅ 진화 완료\n", - " 진화 전략: 제약 조건 추가 (Adding Constraints), 심층 추론 (Deepening Reasoning), 입력 복잡도 증가 (Complicating Input)\n", - " 진화된 질문: 40세 남성이 20년납으로 보험에 가입할 경우 월보험료는 65,000원이다. 만약 이 남성이 10년 뒤인 50세에 동일한 조건으로 가입한다면 월보험료는 얼마로 예상되는가? 그리고 table_1_premium_calculation 데이터를 참조하여 나이가 증가함에 따라 보험료가 상승하는 주요 원인 두 가지를 설명하시오.\n", - " 새로운 난이도: Reasoning (Level 5)\n", - " 답변: 50세에 동일한 조건으로 가입한다면 월보험료는 95,000원으로 예상됩니다. 보험료가 나이가 증가함에 따라 상승하는 주요 원인은 '나이증가분(A)'과 '보험료 산출 기초율(위험률 등) 증가분(B)'입니다. 이 두 가지 요소가 전년도 기준보험료에 더해져 새로운 기준보험료(C)를 형성하기 때문입니다.\n", - "\n", - "✅ 진화 완료\n", - " 진화 전략: 제약 조건 추가 (Adding Constraints), 심층 추론 (Deepening Reasoning), 입력 복잡도 증가 (Complicating Input)\n", - " 진화된 질문: 40세 남성이 20년납으로 보험에 가입할 경우 월보험료는 65,000원이다. 만약 이 남성이 10년 뒤인 50세에 동일한 조건으로 가입한다면 월보험료는 얼마로 예상되는가? 그리고 table_1_premium_calculation 데이터를 참조하여 나이가 증가함에 따라 보험료가 상승하는 주요 원인 두 가지를 설명하시오.\n", - " 새로운 난이도: Reasoning (Level 5)\n", - " 답변: 50세에 동일한 조건으로 가입한다면 월보험료는 95,000원으로 예상됩니다. 보험료가 나이가 증가함에 따라 상승하는 주요 원인은 '나이증가분(A)'과 '보험료 산출 기초율(위험률 등) 증가분(B)'입니다. 
이 두 가지 요소가 전년도 기준보험료에 더해져 새로운 기준보험료(C)를 형성하기 때문입니다.\n" - ] - } - ], - "source": [ - "# Evol-Instruct 적용\n", - "print(\"🧬 Evol-Instruct 적용 중...\")\n", - "\n", - "# 간단한 질문에서 시작\n", - "simple_question = \"40세 남성의 20년납 월보험료는 얼마인가?\"\n", - "print(f\"\\n원본 질문: {simple_question}\")\n", - "\n", - "evolved_result = qa_generator.evolve_question(\n", - " tables=tables,\n", - " original_question=simple_question\n", - ")\n", - "\n", - "if evolved_result and 'evolved' in evolved_result:\n", - " evolved = evolved_result['evolved']\n", - " print(f\"\\n✅ 진화 완료\")\n", - " print(f\" 진화 전략: {evolved.get('evolution_strategy', 'N/A')}\")\n", - " print(f\" 진화된 질문: {evolved.get('question', 'N/A')}\")\n", - " print(f\" 새로운 난이도: {evolved.get('difficulty', 'N/A')}\")\n", - " print(f\" 답변: {evolved.get('answer', 'N/A')}\")\n", - "else:\n", - " print(f\"\\n결과: {evolved_result}\")" - ] - }, - { - "cell_type": "markdown", - "id": "4ca79e89", - "metadata": {}, - "source": [ - "## 8. LLM-as-Judge: 품질 평가" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "287c2e34", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "⚖️ QA 품질 평가 중...\n", - "\n", - "평가 대상 QA:\n", - " Q: XX세의 기준보험료는 얼마인가요?\n", - " A: 42325\n", - "\n", - "✅ 평가 완료\n", - " 📊 Overall Score: 5.0/5.0\n", - " ✓ Pass: 예\n", - "\n", - " 세부 점수:\n", - " - 정확성: 5\n", - " - 충실성: 5\n", - " - 관련성: 5\n", - " - 난이도 적절성: 5\n", - " - 명확성: 5\n", - "\n", - "✅ 평가 완료\n", - " 📊 Overall Score: 5.0/5.0\n", - " ✓ Pass: 예\n", - "\n", - " 세부 점수:\n", - " - 정확성: 5\n", - " - 충실성: 5\n", - " - 관련성: 5\n", - " - 난이도 적절성: 5\n", - " - 명확성: 5\n" - ] - } - ], - "source": [ - "# QA 품질 평가\n", - "if ir_qa_pairs:\n", - " print(\"⚖️ QA 품질 평가 중...\")\n", - " \n", - " qa_to_evaluate = ir_qa_pairs[0]\n", - " print(f\"\\n평가 대상 QA:\")\n", - " print(f\" Q: {qa_to_evaluate.question}\")\n", - " print(f\" A: {qa_to_evaluate.answer}\")\n", - " \n", - " evaluation = qa_generator.evaluate_qa(\n", - " tables=tables,\n", - " 
qa_pair=qa_to_evaluate\n", - " )\n", - " \n", - " if evaluation:\n", - " print(f\"\\n✅ 평가 완료\")\n", - " print(f\" 📊 Overall Score: {evaluation.overall_score}/5.0\")\n", - " print(f\" ✓ Pass: {'예' if evaluation.passed else '아니오'}\")\n", - " print(f\"\\n 세부 점수:\")\n", - " print(f\" - 정확성: {evaluation.correctness.get('score', 'N/A')}\")\n", - " print(f\" - 충실성: {evaluation.faithfulness.get('score', 'N/A')}\")\n", - " print(f\" - 관련성: {evaluation.relevance.get('score', 'N/A')}\")\n", - " print(f\" - 난이도 적절성: {evaluation.difficulty_appropriateness.get('score', 'N/A')}\")\n", - " print(f\" - 명확성: {evaluation.clarity.get('score', 'N/A')}\")\n", - " \n", - " if evaluation.improvement_suggestions:\n", - " print(f\"\\n 💡 개선 제안:\")\n", - " for suggestion in evaluation.improvement_suggestions:\n", - " print(f\" - {suggestion}\")" - ] - }, - { - "cell_type": "markdown", - "id": "5dcbcf93", - "metadata": {}, - "source": [ - "## 9. 종합 QA 데이터셋 생성\n", - "\n", - "모든 난이도의 QA를 한 번에 생성하고 데이터셋으로 저장합니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "342690ff", - "metadata": {}, - "outputs": [], - "source": [ - "# 종합 QA 데이터셋 생성\n", - "print(\"📦 종합 QA 데이터셋 생성 중...\")\n", - "print(\" (모든 난이도 + Follow-up + Evol-Instruct)\")\n", - "print(\" ⏳ 약 2-3분 소요될 수 있습니다...\\n\")\n", - "\n", - "comprehensive_dataset = qa_generator.generate_comprehensive_qa_dataset(\n", - " tables=tables,\n", - " questions_per_difficulty=2,\n", - " include_followup=True,\n", - " include_evolution=True,\n", - " evaluate_quality=False # 평가는 시간이 오래 걸려서 선택적으로\n", - ")\n", - "\n", - "print(\"\\n\" + \"=\"*60)\n", - "print(\"📊 데이터셋 생성 결과\")\n", - "print(\"=\"*60)\n", - "print(f\" 총 QA 쌍: {comprehensive_dataset['metadata'].get('total_qa_pairs', 0)}개\")\n", - "print(f\" Follow-up 체인: {comprehensive_dataset['metadata'].get('total_followups', 0)}개\")\n", - "print(f\" 진화된 질문: {comprehensive_dataset['metadata'].get('total_evolved', 0)}개\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"id": "c24832a3", - "metadata": {}, - "outputs": [], - "source": [ - "# 생성된 QA 미리보기\n", - "print(\"\\n📋 생성된 QA 샘플 (난이도별 1개씩):\")\n", - "print(\"=\"*60)\n", - "\n", - "shown_difficulties = set()\n", - "for qa in comprehensive_dataset['qa_pairs']:\n", - " difficulty = qa.get('difficulty', 'Unknown')\n", - " if difficulty not in shown_difficulties:\n", - " shown_difficulties.add(difficulty)\n", - " print(f\"\\n🔹 [{difficulty}]\")\n", - " print(f\" Q: {qa.get('question', 'N/A')}\")\n", - " answer = qa.get('answer', 'N/A')\n", - " if len(str(answer)) > 150:\n", - " print(f\" A: {str(answer)[:150]}...\")\n", - " else:\n", - " print(f\" A: {answer}\")" - ] - }, - { - "cell_type": "markdown", - "id": "48fda52a", - "metadata": {}, - "source": [ - "## 10. 데이터셋 저장" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7a74e734", - "metadata": {}, - "outputs": [], - "source": [ - "# 출력 디렉토리 생성\n", - "output_dir = Path.cwd() / \"output\"\n", - "output_dir.mkdir(exist_ok=True)\n", - "\n", - "# 타임스탬프 생성\n", - "timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n", - "\n", - "# JSON 파일로 저장\n", - "output_file = output_dir / f\"insurance_qa_dataset_{timestamp}.json\"\n", - "\n", - "with open(output_file, 'w', encoding='utf-8') as f:\n", - " json.dump(comprehensive_dataset, f, ensure_ascii=False, indent=2)\n", - "\n", - "print(f\"✅ 데이터셋 저장 완료: {output_file}\")\n", - "print(f\" 파일 크기: {output_file.stat().st_size / 1024:.1f} KB\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2669b33d", - "metadata": {}, - "outputs": [], - "source": [ - "# 간단한 통계 출력용 함수\n", - "def print_dataset_statistics(dataset: dict):\n", - " \"\"\"데이터셋 통계 출력\"\"\"\n", - " print(\"\\n\" + \"=\"*60)\n", - " print(\"📈 QA 데이터셋 통계\")\n", - " print(\"=\"*60)\n", - " \n", - " # 난이도별 카운트\n", - " difficulty_counts = {}\n", - " answer_type_counts = {}\n", - " \n", - " for qa in dataset.get('qa_pairs', []):\n", - " diff = qa.get('difficulty', 'Unknown')\n", - " atype = 
qa.get('answer_type', 'Unknown')\n", - " \n", - " difficulty_counts[diff] = difficulty_counts.get(diff, 0) + 1\n", - " answer_type_counts[atype] = answer_type_counts.get(atype, 0) + 1\n", - " \n", - " print(\"\\n📊 난이도별 분포:\")\n", - " for diff, count in sorted(difficulty_counts.items()):\n", - " bar = \"█\" * count\n", - " print(f\" {diff:15} | {bar} ({count})\")\n", - " \n", - " print(\"\\n📊 답변 유형별 분포:\")\n", - " for atype, count in sorted(answer_type_counts.items()):\n", - " bar = \"█\" * count\n", - " print(f\" {atype:15} | {bar} ({count})\")\n", - " \n", - " print(f\"\\n📊 총계:\")\n", - " print(f\" 총 QA 쌍: {len(dataset.get('qa_pairs', []))}개\")\n", - " print(f\" Follow-up 체인: {len(dataset.get('followup_chains', []))}개\")\n", - " print(f\" 진화된 질문: {len(dataset.get('evolved_questions', []))}개\")\n", - "\n", - "# 통계 출력\n", - "print_dataset_statistics(comprehensive_dataset)" - ] - }, - { - "cell_type": "markdown", - "id": "a8e41aa7", - "metadata": {}, - "source": [ - "## 11. 비동기 대량 생성 (선택사항)\n", - "\n", - "더 빠른 생성을 위해 비동기 방식을 사용할 수 있습니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05cf2fa2", - "metadata": {}, - "outputs": [], - "source": [ - "# 비동기 QA 생성 (선택사항)\n", - "async def generate_async_dataset():\n", - " \"\"\"비동기 방식으로 QA 데이터셋 생성\"\"\"\n", - " print(\"🚀 비동기 QA 데이터셋 생성 중...\")\n", - " \n", - " dataset = await qa_generator.agenerate_comprehensive_qa_dataset(\n", - " tables=tables,\n", - " questions_per_difficulty=2\n", - " )\n", - " \n", - " return dataset\n", - "\n", - "# Jupyter에서 비동기 실행\n", - "# async_dataset = await generate_async_dataset()\n", - "# print(f\"비동기 생성 완료: {len(async_dataset['qa_pairs'])}개 QA\")" - ] - }, - { - "cell_type": "markdown", - "id": "3e43bab8", - "metadata": {}, - "source": [ - "## 12. 커스텀 프롬프트로 특화 QA 생성\n", - "\n", - "특정 요구사항에 맞는 커스텀 QA를 생성합니다." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4b0fbe57", - "metadata": {}, - "outputs": [], - "source": [ - "# 커스텀 프롬프트 예시: 수치 계산 중심 QA\n", - "CUSTOM_CALCULATION_PROMPT = \"\"\"\n", - "## Task: 수치 계산 중심 QA 생성\n", - "보험 테이블에서 수치 계산이 필요한 질문을 생성하세요.\n", - "\n", - "### Input Tables\n", - "{tables}\n", - "\n", - "### Requirements\n", - "1. 반드시 수치 계산(사칙연산, 비율, 증감률 등)이 필요한 질문\n", - "2. 답변은 정확한 숫자로 제공\n", - "3. 계산 과정(Python 코드) 포함\n", - "4. 검증 가능한 형태로 출력\n", - "\n", - "### Output Format (JSON)\n", - "```json\n", - "{{\n", - " \"questions\": [\n", - " {{\n", - " \"id\": \"CALC_001\",\n", - " \"difficulty\": \"Aggregation\",\n", - " \"answer_type\": \"calculation\",\n", - " \"question\": \"질문\",\n", - " \"answer\": \"수치 답변\",\n", - " \"calculation_steps\": [\"단계1\", \"단계2\"],\n", - " \"python_code\": \"검증용 Python 코드\"\n", - " }}\n", - " ]\n", - "}}\n", - "```\n", - "\n", - "### Generate 3 calculation-focused QA pairs.\n", - "\"\"\"\n", - "\n", - "# 커스텀 프롬프트로 생성\n", - "from QA_example.prompts import format_tables_for_prompt\n", - "\n", - "formatted_tables = format_tables_for_prompt(tables)\n", - "custom_prompt = CUSTOM_CALCULATION_PROMPT.format(tables=formatted_tables)\n", - "\n", - "print(\"🔢 수치 계산 중심 QA 생성 중...\")\n", - "response = qa_generator.pool.generate_content(custom_prompt)\n", - "\n", - "# 결과 파싱\n", - "result = qa_generator._parse_json_response(response)\n", - "\n", - "if 'questions' in result:\n", - " print(f\"\\n✅ {len(result['questions'])}개의 계산 QA 생성 완료\")\n", - " for qa in result['questions']:\n", - " print(f\"\\n📌 [{qa.get('id', 'N/A')}]\")\n", - " print(f\" Q: {qa.get('question', 'N/A')}\")\n", - " print(f\" A: {qa.get('answer', 'N/A')}\")\n", - " if 'python_code' in qa:\n", - " print(f\" Python Code: 포함됨 ✅\")\n", - "else:\n", - " print(f\"결과: {result}\")" - ] - }, - { - "cell_type": "markdown", - "id": "484134d3", - "metadata": {}, - "source": [ - "## 13. 
요약 및 결론\n", - "\n", - "### 생성된 QA 데이터셋의 특징\n", - "\n", - "| 특징 | 설명 | 커버되는 양상 |\n", - "|------|------|-------------|\n", - "| 난이도 다양성 | IR부터 Insight까지 6단계 | #2 |\n", - "| Multi-Table | 복수 테이블 참조 필요 | #1 |\n", - "| 답변 유형 | Exact Match, Descriptive, Calculation | #3 |\n", - "| 수치 계산 | 집계, 비교, 증감률 계산 | #4, #6 |\n", - "| 꼬리 질문 | Follow-up 체인 생성 | #5 |\n", - "| 특정 셀 QA | 단일 셀 기반 Q-A | #7 |\n", - "| Evol-Instruct | 질문 난이도 진화 | #2, #4 |\n", - "| LLM Judge | 품질 평가 | #3 |" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tests/choi/QA_example/qa_generator.py b/tests/choi/QA_example/qa_generator.py deleted file mode 100644 index b1ab1fe..0000000 --- a/tests/choi/QA_example/qa_generator.py +++ /dev/null @@ -1,556 +0,0 @@ -""" -QA Generator for Insurance Table Data -보험 테이블 기반 QA 데이터셋 생성기 - -이 모듈은 Gemini API를 활용하여 보험 테이블 마크다운 데이터로부터 -다양한 난이도와 유형의 QA 데이터셋을 생성합니다. - -주요 기능: -1. 난이도별 QA 생성 (IR, Analysis, Compare, Aggregation, Reasoning, Insight) -2. Multi-table QA 생성 -3. 꼬리 질문 (Follow-up) 생성 -4. Evol-Instruct 기반 난이도 진화 -5. 
LLM-as-Judge 품질 평가 -""" - -import json -import asyncio -import logging -from pathlib import Path -from typing import Optional, Dict, Any, List, Union -from dataclasses import dataclass, asdict -from enum import Enum -import sys -from json_repair import repair_json - -# 프로젝트 루트 추가 -project_root = Path(__file__).parent.parent -sys.path.insert(0, str(project_root)) - -from polling_gemini import get_gemini_pool, GeminiAPIPool - -from .prompts import ( - QA_GENERATOR_SYSTEM_PROMPT, - IR_QA_PROMPT, - ANALYSIS_QA_PROMPT, - COMPARE_QA_PROMPT, - AGGREGATION_QA_PROMPT, - REASONING_QA_PROMPT, - INSIGHT_QA_PROMPT, - FOLLOWUP_QA_PROMPT, - MULTI_TABLE_QA_PROMPT, - EVOL_INSTRUCT_PROMPT, - QA_EVALUATION_PROMPT, - get_qa_prompt_by_difficulty, - format_tables_for_prompt, -) - -# 로깅 설정 -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - - -class QADifficulty(Enum): - """QA 난이도 레벨""" - IR = "IR" # Level 1: 단순 정보 검색 - ANALYSIS = "Analysis" # Level 2: 분석적 질문 - COMPARE = "Compare" # Level 3: 비교/Multi-hop - AGGREGATION = "Aggregation" # Level 4: 집계 연산 - REASONING = "Reasoning" # Level 5: 복합 추론 - INSIGHT = "Insight" # Level 6: 통찰 도출 - - -class QAType(Enum): - """QA 답변 유형""" - EXACT_MATCH = "exact_match" # 단답형 (정확히 일치) - DESCRIPTIVE = "descriptive" # 서술형 (LLM Judge 평가) - CALCULATION = "calculation" # 계산형 (수치 결과) - COMPARISON = "comparison" # 비교형 (비교 결과 및 근거) - - -@dataclass -class QAPair: - """QA 쌍 데이터 클래스""" - id: str - difficulty: str - answer_type: str - question: str - answer: str - reasoning: Optional[str] = None - evidence: Optional[Dict[str, Any]] = None - tags: Optional[List[str]] = None - python_verification: Optional[str] = None - chain_of_thought: Optional[List[str]] = None - # 추가 필드들 (LLM이 생성할 수 있는 다양한 필드) - calculation: Optional[str] = None - calculation_steps: Optional[List[str]] = None - assumptions: Optional[List[str]] = None - supporting_analysis: Optional[str] = None - 
key_findings: Optional[List[str]] = None - required_tables: Optional[List[str]] = None - join_logic: Optional[str] = None - python_code: Optional[str] = None - extra: Optional[Dict[str, Any]] = None # 기타 알 수 없는 필드용 - - def to_dict(self) -> Dict[str, Any]: - """딕셔너리로 변환""" - return {k: v for k, v in asdict(self).items() if v is not None} - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> 'QAPair': - """딕셔너리에서 QAPair 생성 (알 수 없는 필드는 extra에 저장)""" - # QAPair의 필드 목록 - known_fields = { - 'id', 'difficulty', 'answer_type', 'question', 'answer', - 'reasoning', 'evidence', 'tags', 'python_verification', - 'chain_of_thought', 'calculation', 'calculation_steps', - 'assumptions', 'supporting_analysis', 'key_findings', - 'required_tables', 'join_logic', 'python_code', 'extra' - } - - # 알려진 필드와 알 수 없는 필드 분리 - known_data = {} - extra_data = {} - - for key, value in data.items(): - if key in known_fields: - known_data[key] = value - else: - extra_data[key] = value - - # 필수 필드 기본값 설정 - known_data.setdefault('id', 'UNKNOWN') - known_data.setdefault('difficulty', 'Unknown') - known_data.setdefault('answer_type', 'unknown') - known_data.setdefault('question', '') - known_data.setdefault('answer', '') - - # extra 필드가 있으면 추가 - if extra_data: - known_data['extra'] = extra_data - - return cls(**known_data) - - -@dataclass -class EvaluationResult: - """QA 평가 결과 데이터 클래스""" - correctness: Dict[str, Any] - faithfulness: Dict[str, Any] - relevance: Dict[str, Any] - difficulty_appropriateness: Dict[str, Any] - clarity: Dict[str, Any] - overall_score: float - passed: bool - improvement_suggestions: List[str] - - -class InsuranceTableQAGenerator: - """ - 보험 테이블 기반 QA 데이터셋 생성기 - - Gemini API Pool을 활용하여 자동 키 로테이션을 지원하며, - 다양한 난이도와 유형의 QA를 생성합니다. 
- """ - - def __init__( - self, - config_path: Optional[str] = None, - model_name: str = "gemini-2.0-flash", - ): - """ - Args: - config_path: API 키 설정 파일 경로 - model_name: 사용할 Gemini 모델 - """ - self.pool = get_gemini_pool(config_path) - self.model_name = model_name - self.system_prompt = QA_GENERATOR_SYSTEM_PROMPT - - def _parse_json_response(self, response: str) -> Dict[str, Any]: - """LLM 응답에서 JSON 추출 및 파싱 (agentjson 사용)""" - - try: - result = repair_json(response, return_objects=True) - - if isinstance(result, dict): - logger.debug(f"JSON 파싱 성공") - return result - elif isinstance(result, list): - logger.warning(f"JSON 파싱 결과가 리스트입니다. 딕셔너리로 변환을 시도합니다.") - # 질문 리스트라고 가정 - return {"questions": result} - else: - logger.warning(f"JSON 파싱 실패 (예상치 못한 타입): {type(result)}") - return {"error": "parsing_failed", "raw_response": response} - - except Exception as e: - logger.error(f"JSON 파싱 실패: {e}") - logger.debug(f"원본 응답: {response}") - return {"error": str(e), "raw_response": response} - - def _build_prompt( - self, - base_prompt: str, - tables: Dict[str, str], - **kwargs - ) -> str: - """프롬프트 구성""" - formatted_tables = format_tables_for_prompt(tables) - - prompt = f"{self.system_prompt}\n\n{base_prompt}" - prompt = prompt.format(tables=formatted_tables, **kwargs) - - return prompt - - def generate_qa_by_difficulty( - self, - tables: Dict[str, str], - difficulty: QADifficulty, - num_questions: int = 3, - **kwargs - ) -> List[QAPair]: - """ - 특정 난이도의 QA 생성 - - Args: - tables: 테이블 딕셔너리 {table_id: markdown_content} - difficulty: QA 난이도 - num_questions: 생성할 질문 수 - - Returns: - 생성된 QA 쌍 리스트 - """ - base_prompt = get_qa_prompt_by_difficulty(difficulty.value) - prompt = self._build_prompt( - base_prompt, - tables, - num_questions=num_questions - ) - - try: - response = self.pool.generate_content(prompt) - result = self._parse_json_response(response) - - if "questions" in result: - return [QAPair.from_dict(q) for q in result["questions"]] - else: - logger.warning(f"예상치 못한 응답 형식: 
{result}") - return [] - - except Exception as e: - logger.error(f"QA 생성 실패: {e}") - return [] - - async def agenerate_qa_by_difficulty( - self, - tables: Dict[str, str], - difficulty: QADifficulty, - num_questions: int = 3, - **kwargs - ) -> List[QAPair]: - """난이도별 QA 생성 (비동기)""" - base_prompt = get_qa_prompt_by_difficulty(difficulty.value) - prompt = self._build_prompt( - base_prompt, - tables, - num_questions=num_questions - ) - - try: - response = await self.pool.agenerate_content(prompt) - result = self._parse_json_response(response) - - if "questions" in result: - return [QAPair.from_dict(q) for q in result["questions"]] - else: - return [] - - except Exception as e: - logger.error(f"비동기 QA 생성 실패: {e}") - return [] - - def generate_multi_table_qa( - self, - tables: Dict[str, str], - num_questions: int = 3, - ) -> List[QAPair]: - """ - Multi-table QA 생성 - - 복수의 테이블을 참조해야 답변 가능한 질문 생성 - """ - if len(tables) < 2: - logger.warning("Multi-table QA는 최소 2개의 테이블이 필요합니다.") - # 단일 테이블이라도 시도 - - prompt = self._build_prompt( - MULTI_TABLE_QA_PROMPT, - tables, - num_questions=num_questions - ) - - try: - response = self.pool.generate_content(prompt) - result = self._parse_json_response(response) - - if "questions" in result: - return [QAPair.from_dict(q) for q in result["questions"]] - return [] - - except Exception as e: - logger.error(f"Multi-table QA 생성 실패: {e}") - return [] - - def generate_followup_qa( - self, - tables: Dict[str, str], - original_qa: QAPair, - ) -> Dict[str, Any]: - """ - 꼬리 질문 (Follow-up) 생성 - - 원래 QA를 기반으로 연속적인 후속 질문 체인 생성 - """ - original_qa_str = json.dumps(original_qa.to_dict(), ensure_ascii=False, indent=2) - - prompt = self._build_prompt( - FOLLOWUP_QA_PROMPT, - tables, - original_qa=original_qa_str - ) - - try: - response = self.pool.generate_content(prompt) - return self._parse_json_response(response) - - except Exception as e: - logger.error(f"Follow-up QA 생성 실패: {e}") - return {} - - def evolve_question( - self, - tables: Dict[str, str], - 
original_question: str, - ) -> Dict[str, Any]: - """ - Evol-Instruct: 질문 난이도 진화 - - 기본 질문을 더 복잡한 질문으로 진화 - """ - prompt = self._build_prompt( - EVOL_INSTRUCT_PROMPT, - tables, - original_question=original_question - ) - - try: - response = self.pool.generate_content(prompt) - return self._parse_json_response(response) - - except Exception as e: - logger.error(f"Evol-Instruct 실패: {e}") - return {} - - def evaluate_qa( - self, - tables: Dict[str, str], - qa_pair: QAPair, - ) -> EvaluationResult: - """ - LLM-as-Judge: QA 품질 평가 - - 생성된 QA 쌍의 품질을 다면적으로 평가 - """ - qa_str = json.dumps(qa_pair.to_dict(), ensure_ascii=False, indent=2) - - prompt = self._build_prompt( - QA_EVALUATION_PROMPT, - tables, - qa_pair=qa_str - ) - - try: - response = self.pool.generate_content(prompt) - result = self._parse_json_response(response) - - if "evaluation" in result: - eval_data = result["evaluation"] - return EvaluationResult( - correctness=eval_data.get("correctness", {}), - faithfulness=eval_data.get("faithfulness", {}), - relevance=eval_data.get("relevance", {}), - difficulty_appropriateness=eval_data.get("difficulty_appropriateness", {}), - clarity=eval_data.get("clarity", {}), - overall_score=result.get("overall_score", 0.0), - passed=result.get("pass", False), - improvement_suggestions=result.get("improvement_suggestions", []) - ) - else: - logger.warning(f"평가 결과 파싱 실패: {result}") - return None - - except Exception as e: - logger.error(f"QA 평가 실패: {e}") - return None - - def generate_comprehensive_qa_dataset( - self, - tables: Dict[str, str], - questions_per_difficulty: int = 2, - include_followup: bool = True, - include_evolution: bool = True, - evaluate_quality: bool = False, - ) -> Dict[str, Any]: - """ - 종합적인 QA 데이터셋 생성 - - 모든 난이도의 QA를 생성하고 선택적으로 꼬리질문, 진화, 평가를 수행 - - Args: - tables: 테이블 딕셔너리 - questions_per_difficulty: 난이도별 질문 수 - include_followup: 꼬리질문 포함 여부 - include_evolution: Evol-Instruct 포함 여부 - evaluate_quality: 품질 평가 수행 여부 - - Returns: - 종합 QA 데이터셋 - """ - dataset = { 
- "metadata": { - "tables_count": len(tables), - "questions_per_difficulty": questions_per_difficulty, - }, - "qa_pairs": [], - "followup_chains": [], - "evolved_questions": [], - "evaluations": [], - } - - # 1. 각 난이도별 QA 생성 - for difficulty in QADifficulty: - logger.info(f"Generating {difficulty.value} level QA...") - qa_pairs = self.generate_qa_by_difficulty( - tables, difficulty, questions_per_difficulty - ) - - for qa in qa_pairs: - qa_dict = qa.to_dict() - dataset["qa_pairs"].append(qa_dict) - - # 2. 꼬리질문 생성 (선택적) - if include_followup and difficulty in [QADifficulty.IR, QADifficulty.ANALYSIS]: - followup = self.generate_followup_qa(tables, qa) - if followup: - dataset["followup_chains"].append(followup) - - # 3. 질문 진화 (선택적) - if include_evolution and difficulty in [QADifficulty.IR, QADifficulty.ANALYSIS]: - evolved = self.evolve_question(tables, qa.question) - if evolved: - dataset["evolved_questions"].append(evolved) - - # 4. 품질 평가 (선택적) - if evaluate_quality: - evaluation = self.evaluate_qa(tables, qa) - if evaluation: - dataset["evaluations"].append({ - "qa_id": qa.id, - "evaluation": asdict(evaluation) - }) - - # 5. 
Multi-table QA (테이블이 2개 이상인 경우) - if len(tables) >= 2: - logger.info("Generating Multi-table QA...") - multi_qa = self.generate_multi_table_qa(tables, questions_per_difficulty) - for qa in multi_qa: - dataset["qa_pairs"].append(qa.to_dict()) - - # 메타데이터 업데이트 - dataset["metadata"]["total_qa_pairs"] = len(dataset["qa_pairs"]) - dataset["metadata"]["total_followups"] = len(dataset["followup_chains"]) - dataset["metadata"]["total_evolved"] = len(dataset["evolved_questions"]) - - return dataset - - async def agenerate_comprehensive_qa_dataset( - self, - tables: Dict[str, str], - questions_per_difficulty: int = 2, - ) -> Dict[str, Any]: - """종합 QA 데이터셋 생성 (비동기)""" - dataset = { - "metadata": { - "tables_count": len(tables), - "questions_per_difficulty": questions_per_difficulty, - }, - "qa_pairs": [], - } - - # 모든 난이도에 대해 병렬로 QA 생성 - tasks = [ - self.agenerate_qa_by_difficulty(tables, difficulty, questions_per_difficulty) - for difficulty in QADifficulty - ] - - results = await asyncio.gather(*tasks) - - for qa_list in results: - for qa in qa_list: - dataset["qa_pairs"].append(qa.to_dict()) - - dataset["metadata"]["total_qa_pairs"] = len(dataset["qa_pairs"]) - - return dataset - - -# ============================================================================= -# Convenience Functions -# ============================================================================= - -def generate_qa_from_tables( - tables: Dict[str, str], - difficulty: Optional[QADifficulty] = None, - num_questions: int = 3, - config_path: Optional[str] = None, -) -> List[Dict[str, Any]]: - """ - 테이블에서 QA 생성 (간편 함수) - - Args: - tables: 테이블 딕셔너리 {table_id: markdown_content} - difficulty: 난이도 (None이면 모든 난이도) - num_questions: 난이도별 질문 수 - config_path: API 설정 파일 경로 - - Returns: - 생성된 QA 리스트 - """ - generator = InsuranceTableQAGenerator(config_path=config_path) - - if difficulty: - qa_pairs = generator.generate_qa_by_difficulty(tables, difficulty, num_questions) - return [qa.to_dict() for qa in qa_pairs] - 
else: - dataset = generator.generate_comprehensive_qa_dataset( - tables, - questions_per_difficulty=num_questions, - include_followup=False, - include_evolution=False, - evaluate_quality=False, - ) - return dataset["qa_pairs"] - - -async def agenerate_qa_from_tables( - tables: Dict[str, str], - num_questions: int = 3, - config_path: Optional[str] = None, -) -> Dict[str, Any]: - """테이블에서 QA 생성 (비동기 간편 함수)""" - generator = InsuranceTableQAGenerator(config_path=config_path) - return await generator.agenerate_comprehensive_qa_dataset(tables, num_questions) diff --git a/tests/choi/Table_example/README.md b/tests/choi/Table_example/README.md deleted file mode 100644 index f0cc165..0000000 --- a/tests/choi/Table_example/README.md +++ /dev/null @@ -1,168 +0,0 @@ -# Table Example - 보험 테이블 추출기 - -`polling_gemini` 패키지를 활용하여 보험 문서 이미지에서 테이블을 Markdown 형식으로 추출하는 예제입니다. - -## 주요 특징 - -- **VLM + OCR 하이브리드 접근**: Gemini의 시각적 추론 능력과 OCR 텍스트를 결합 -- **보험 도메인 특화 프롬프트**: 계층적 헤더, 셀 병합, 복합 데이터 처리에 최적화 -- **자동 API 키 로테이션**: `polling_gemini` 패키지를 통한 API 키 관리 -- **동기/비동기 지원**: 대용량 처리를 위한 비동기 API 지원 - -## 설치 - -프로젝트 루트에서: - -```bash -uv sync -``` - -## 사용법 - -### 1. 기본 사용 - -```python -from Table_example import extract_table_from_image - -# 이미지에서 테이블 추출 -result = extract_table_from_image("insurance_table.png") -print(result) -``` - -### 2. OCR 참조 텍스트와 함께 사용 - -```python -from Table_example import extract_table_from_image - -# OCR로 먼저 추출한 텍스트가 있는 경우 -ocr_text = """| 구분 | 보험기간 | 납입기간 | -| 상해사망 | 80세 | 20년 |""" - -result = extract_table_from_image( - "insurance_table.png", - ocr_markdown=ocr_text # OCR 결과를 참조로 제공 -) -``` - -### 3. 
비동기 처리 - -```python -import asyncio -from Table_example import aextract_table_from_image - -async def process_multiple_images(): - images = ["table1.png", "table2.png", "table3.png"] - - # 동시에 여러 이미지 처리 - tasks = [aextract_table_from_image(img) for img in images] - results = await asyncio.gather(*tasks) - - return results - -results = asyncio.run(process_multiple_images()) -``` - -### 4. InsuranceTableExtractor 클래스 직접 사용 - -```python -from Table_example import InsuranceTableExtractor - -# 추출기 인스턴스 생성 -extractor = InsuranceTableExtractor( - config_path="apis/gemini_keys.yaml", # 커스텀 설정 경로 - model_name="gemini-2.5-flash" # 모델 지정 -) - -# 테이블 추출 -result = extractor.extract("insurance_table.png") - -# API Pool 상태 확인 -status = extractor.get_pool_status() -print(f"현재 API 키: {status['current_key']['name']}") -``` - -## 프롬프트 설계 - -### System Prompt 핵심 원칙 - -1. **구조적 완전성**: 병합된 셀을 비정규화하여 모든 행에 값 채우기 -2. **헤더 평탄화**: 다중 행 헤더를 언더스코어(_)로 연결하여 단일 행으로 변환 -3. **데이터 무결성**: 금액 포맷 정리, 퍼센트 유지, 부가 정보 제거 -4. **Hybrid Reference**: OCR 텍스트 참조로 오타 교정 - -### User Prompt Chain-of-Table 단계 - -1. **구조 분석**: 레이아웃, 헤더 행 수, 병합 셀 식별 -2. **헤더 처리**: 다중 행 헤더 → 단일 행 변환 -3. **데이터 추출**: 값 추출 및 병합 셀 반복 입력 -4. 
**포맷팅 검증**: 금액 정리, 컬럼 개수 확인 - -## 테스트 - -```bash -cd TableMagnifier -python -m Table_example.test_extraction -``` - -### 테스트 이미지 준비 - -`Table_example/sample_images/` 폴더에 테스트할 보험 테이블 이미지를 추가하세요: -- 지원 형식: PNG, JPG, JPEG, GIF, WebP, BMP -- 권장: 해상도가 높은 테이블 이미지 - -## 출력 예시 - -입력 이미지: -``` -┌─────────────┬────────────────────────────┐ -│ │ 해지환급금 │ -│ 구분 ├──────────────┬─────────────┤ -│ │ 금액 │ 환급률 │ -├─────────────┼──────────────┼─────────────┤ -│ 1년 │ 100,000원 │ 10% │ -│ 5년 │ 500,000원 │ 50% │ -│ 10년 │ 1,000,000원 │ 100% │ -└─────────────┴──────────────┴─────────────┘ -``` - -출력 Markdown: -```markdown -| 구분 | 해지환급금_금액 | 해지환급금_환급률 | -| :--- | :--- | :--- | -| 1년 | 100000 | 10% | -| 5년 | 500000 | 50% | -| 10년 | 1000000 | 100% | -``` - -## 파일 구조 - -``` -Table_example/ -├── __init__.py # 패키지 초기화 -├── prompts.py # 시스템/사용자 프롬프트 정의 -├── table_extractor.py # 테이블 추출기 클래스 -├── test_extraction.py # 테스트 코드 -├── sample_images/ # 테스트 이미지 폴더 -└── README.md # 이 문서 -``` - -## API 키 설정 - -`apis/gemini_keys.yaml` 파일에 Gemini API 키를 설정하세요: - -```yaml -api_keys: - - key: "YOUR_GEMINI_API_KEY" - name: "key1" - enabled: true - -settings: - model: "gemini-2.5-flash" - temperature: 0.1 - max_retries: 3 - retry_delay: 2 -``` - -## 라이센스 - -MIT License diff --git a/tests/choi/Table_example/__init__.py b/tests/choi/Table_example/__init__.py deleted file mode 100644 index 30e122d..0000000 --- a/tests/choi/Table_example/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -""" -Table Extraction Example -보험 도메인 특화 테이블 추출 예제 -polling_gemini 패키지를 사용하여 이미지에서 Markdown 테이블 추출 -""" - -from .table_extractor import ( - InsuranceTableExtractor, - extract_table_from_image, - aextract_table_from_image, -) -from .prompts import SYSTEM_PROMPT, USER_PROMPT_TEMPLATE - -__all__ = [ - 'InsuranceTableExtractor', - 'extract_table_from_image', - 'aextract_table_from_image', - 'SYSTEM_PROMPT', - 'USER_PROMPT_TEMPLATE', -] diff --git a/tests/choi/Table_example/example_batch.py b/tests/choi/Table_example/example_batch.py deleted file 
mode 100644 index 2f348ff..0000000 --- a/tests/choi/Table_example/example_batch.py +++ /dev/null @@ -1,167 +0,0 @@ -""" -비동기 배치 테이블 추출 예제 -여러 이미지를 동시에 처리하여 효율적으로 테이블을 추출합니다. -""" - -import sys -import asyncio -import time -from pathlib import Path -from typing import List, Tuple, Optional - -# 프로젝트 루트를 Python 경로에 추가 -project_root = Path(__file__).parent.parent -sys.path.insert(0, str(project_root)) - -from Table_example import aextract_table_from_image, InsuranceTableExtractor - - -async def process_single_image( - image_path: Path, - ocr_markdown: str = "N/A" -) -> Tuple[Path, Optional[str], Optional[str]]: - """ - 단일 이미지 처리 (비동기) - - Returns: - (이미지 경로, 결과 또는 None, 에러 메시지 또는 None) - """ - try: - result = await aextract_table_from_image( - image_path=image_path, - ocr_markdown=ocr_markdown - ) - return (image_path, result, None) - except Exception as e: - return (image_path, None, str(e)) - - -async def batch_extract_tables( - image_paths: List[Path], - max_concurrent: int = 3 -) -> List[Tuple[Path, Optional[str], Optional[str]]]: - """ - 여러 이미지를 배치로 처리 (동시성 제한) - - Args: - image_paths: 처리할 이미지 경로 리스트 - max_concurrent: 동시 처리 최대 개수 - - Returns: - (이미지 경로, 결과, 에러) 튜플 리스트 - """ - # 세마포어로 동시 실행 제한 - semaphore = asyncio.Semaphore(max_concurrent) - - async def process_with_semaphore(image_path: Path): - async with semaphore: - return await process_single_image(image_path) - - # 모든 이미지 동시 처리 (세마포어로 제한) - tasks = [process_with_semaphore(path) for path in image_paths] - results = await asyncio.gather(*tasks) - - return results - - -def find_all_images(directory: Path) -> List[Path]: - """디렉토리에서 모든 이미지 파일 찾기""" - extensions = ["*.png", "*.jpg", "*.jpeg", "*.gif", "*.webp", "*.bmp"] - images = [] - for ext in extensions: - images.extend(directory.glob(ext)) - return sorted(images) - - -async def main(): - """ - 배치 처리 예제 - sample_images 폴더의 모든 이미지를 처리합니다. 
- """ - print("🏥 보험 테이블 배치 추출 시작") - print("=" * 70) - - # ============================================ - # 📌 설정 - # ============================================ - sample_dir = Path(__file__).parent / "sample_images" - output_dir = Path(__file__).parent / "output" - max_concurrent = 3 # 동시 처리 최대 개수 - - # 출력 디렉토리 생성 - output_dir.mkdir(exist_ok=True) - - # ============================================ - # 📌 이미지 파일 수집 - # ============================================ - image_paths = find_all_images(sample_dir) - - if not image_paths: - print(f"❌ 이미지 파일이 없습니다: {sample_dir}") - print("\n📝 sample_images 폴더에 테이블 이미지를 추가하세요.") - return - - print(f"📁 처리할 이미지: {len(image_paths)}개") - for i, path in enumerate(image_paths, 1): - print(f" {i}. {path.name}") - print(f"⚡ 동시 처리 수: {max_concurrent}") - print("=" * 70) - - # ============================================ - # 🚀 배치 처리 실행 - # ============================================ - start_time = time.time() - - results = await batch_extract_tables( - image_paths=image_paths, - max_concurrent=max_concurrent - ) - - elapsed = time.time() - start_time - - # ============================================ - # 📊 결과 처리 - # ============================================ - success_count = 0 - failed_count = 0 - - print("\n📊 처리 결과:") - print("-" * 70) - - for image_path, result, error in results: - if result: - success_count += 1 - status = "✅ 성공" - - # 결과 파일 저장 - output_path = output_dir / f"{image_path.stem}_table.md" - with open(output_path, "w", encoding="utf-8") as f: - f.write(f"# {image_path.name}\n\n") - f.write(result) - - print(f"\n{status}: {image_path.name}") - print(f" 💾 저장: {output_path.name}") - print(f" 📝 미리보기: {result[:100]}...") - else: - failed_count += 1 - status = "❌ 실패" - print(f"\n{status}: {image_path.name}") - print(f" ⚠️ 에러: {error}") - - # ============================================ - # 📈 최종 요약 - # ============================================ - print("\n" + "=" * 70) - print("📈 처리 완료!") - print(f" ⏱️ 총 소요 시간: 
{elapsed:.2f}초") - print(f" ✅ 성공: {success_count}개") - print(f" ❌ 실패: {failed_count}개") - print(f" 📁 출력 폴더: {output_dir}") - - if success_count > 0: - avg_time = elapsed / success_count - print(f" ⚡ 평균 처리 시간: {avg_time:.2f}초/이미지") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/tests/choi/Table_example/example_simple.py b/tests/choi/Table_example/example_simple.py deleted file mode 100644 index c0bdbbc..0000000 --- a/tests/choi/Table_example/example_simple.py +++ /dev/null @@ -1,97 +0,0 @@ -""" -간단한 테이블 추출 예제 -실제 이미지 파일 경로를 지정하여 테이블을 추출합니다. -""" - -import sys -from pathlib import Path - -# 프로젝트 루트를 Python 경로에 추가 -project_root = Path(__file__).parent.parent -sys.path.insert(0, str(project_root)) - -from Table_example import extract_table_from_image, InsuranceTableExtractor - - -def main(): - """ - 사용 예제: - 1. sample_images 폴더에 테이블 이미지를 추가하세요 - 2. 아래 IMAGE_PATH를 해당 이미지 경로로 수정하세요 - 3. 스크립트를 실행하세요 - """ - - # ============================================ - # 📌 이미지 경로 설정 - # ============================================ - # 방법 1: sample_images 폴더의 이미지 사용 - sample_dir = Path(__file__).parent / "sample_images" - - # sample_images 폴더의 첫 번째 이미지 자동 선택 - image_files = list(sample_dir.glob("*.png")) + \ - list(sample_dir.glob("*.jpg")) + \ - list(sample_dir.glob("*.jpeg")) - - if not image_files: - print("❌ sample_images 폴더에 이미지가 없습니다.") - print(f" 경로: {sample_dir}") - print("\n📝 테이블 이미지를 추가한 후 다시 실행하세요.") - return - - IMAGE_PATH = image_files[0] - - # 방법 2: 직접 경로 지정 (주석 해제하여 사용) - # IMAGE_PATH = "/path/to/your/table_image.png" - - # ============================================ - # 📌 OCR 참조 텍스트 (선택적) - # ============================================ - # OCR로 먼저 추출한 텍스트가 있으면 여기에 입력 - # 이미지가 흐릿할 때 숫자 정확도 향상에 도움됩니다 - OCR_MARKDOWN = "N/A" # 없으면 "N/A" - - # 예시: - # OCR_MARKDOWN = """ - # | 구분 | 보험료 | - # | 상해 | 10000 | - # | 질병 | 15000 | - # """ - - # ============================================ - # 🚀 테이블 추출 실행 - # ============================================ - 
print("🏥 보험 테이블 추출 시작") - print("=" * 60) - print(f"📁 이미지: {IMAGE_PATH}") - print(f"📝 OCR 참조: {'있음' if OCR_MARKDOWN != 'N/A' else '없음'}") - print("=" * 60) - - try: - # 테이블 추출 - result = extract_table_from_image( - image_path=IMAGE_PATH, - ocr_markdown=OCR_MARKDOWN - ) - - print("\n✅ 추출 완료!") - print("\n📊 결과 (Markdown Table):") - print("-" * 60) - print(result) - print("-" * 60) - - # 결과를 파일로 저장 - output_path = Path(IMAGE_PATH).with_suffix(".md") - with open(output_path, "w", encoding="utf-8") as f: - f.write(result) - print(f"\n💾 결과 저장됨: {output_path}") - - except FileNotFoundError as e: - print(f"❌ 파일을 찾을 수 없습니다: {e}") - except Exception as e: - print(f"❌ 추출 실패: {e}") - import traceback - traceback.print_exc() - - -if __name__ == "__main__": - main() diff --git a/tests/choi/Table_example/output/.gitkeep b/tests/choi/Table_example/output/.gitkeep deleted file mode 100644 index bc83f8d..0000000 --- a/tests/choi/Table_example/output/.gitkeep +++ /dev/null @@ -1,3 +0,0 @@ -# Output - -이 폴더에 추출된 테이블 결과가 저장됩니다. diff --git a/tests/choi/Table_example/prompts.py b/tests/choi/Table_example/prompts.py deleted file mode 100644 index e928665..0000000 --- a/tests/choi/Table_example/prompts.py +++ /dev/null @@ -1,69 +0,0 @@ - -# System Prompt - 보험 데이터 엔지니어 역할 정의 -SYSTEM_PROMPT = """# Role Definition -당신은 20년 경력의 '수석 보험 데이터 엔지니어'이자 'OCR 후처리 전문가'입니다. -당신의 임무는 제공된 [보험 문서 이미지]와 선택적으로 제공되는 [기초 OCR 텍스트]를 분석하여, 데이터베이스 적재가 가능한 완벽한 형태의 'Standardized Markdown Table'로 변환하는 것입니다. - -# Core Principles -1. **구조적 완전성(Structural Integrity):** 시각적으로 병합(Merge)된 셀은 반드시 비정규화(Denormalization)하여 모든 행에 값을 채워야 합니다. 빈칸이나 " 상동" 등의 표현은 금지됩니다. -2. **헤더 평탄화(Header Flattening):** 2행 이상의 계층적 헤더(Multi-row Headers)는 상위 헤더와 하위 헤더를 언더스코어(_)로 연결하여 단일 행(Single-row) 헤더로 변환합니다. (예: '보장내용' 하위에 '지급금액'이 있다면 -> '보장내용_지급금액') -3. **데이터 무결성(Data Integrity):** - - 금액의 천 단위 구분자(,)는 제거합니다. (예: 1,000,000 -> 1000000) - - 퍼센트(%)는 기호를 포함한 문자열로 유지합니다. (예: 98.5%) - - 괄호 안의 보조 정보(예: 전년 대비 증감액)는 무시하고 핵심 수치만 추출합니다. -4. 
**Hybrid Reference:** [기초 OCR 텍스트]가 제공될 경우, 이미지 내 텍스트가 흐릿하거나 불분명할 때 해당 텍스트를 참조하여 오타를 교정하십시오. 단, 표의 구조(행/열 위치) 판단은 반드시 [원본 이미지]를 기준으로 합니다.""" - -# User Prompt Template - 단계별 지시사항 포함 -USER_PROMPT_TEMPLATE = """# Task Description -아래 제공된 입력을 바탕으로 보험 테이블 데이터를 추출하십시오. - -# Input Data -1. **Target Image:** [첨부된 이미지] -2. **Reference OCR Markdown (Optional):** -\"\"\" -{ocr_markdown} -\"\"\" - -# Step-by-Step Instructions (Chain-of-Table) -단계별로 생각하고 실행하십시오: - -**Step 1. 구조 분석 (Structure Analysis)** -- 이미지를 보고 표의 전체적인 레이아웃을 파악하십시오. -- 헤더가 몇 개의 행(Row)으로 구성되어 있는지 확인하십시오. -- 세로로 병합된(Vertically Merged) '구분'이나 '기간' 컬럼이 있는지 식별하십시오. - -**Step 2. 헤더 처리 (Header Processing)** -- 다중 행 헤더를 단일 행 키(Unique Key)로 변환하십시오. -- 예: - | 구분 | 해지환급금 | - | | 금액 | 환급률 | - -> | 구분 | 해지환급금_금액 | 해지환급금_환급률 | - -**Step 3. 데이터 추출 및 채우기 (Extraction & Filling)** -- 각 행(Row)의 데이터를 추출하십시오. -- **중요:** 병합된 셀은 해당 범위에 속하는 모든 행에 동일한 값을 반복 입력(Repeat Value)하십시오. 절대 빈 칸으로 두지 마십시오. -- OCR 참고용 텍스트가 있다면, 숫자의 정확성을 검증하는 데 사용하십시오. - -**Step 4. 포맷팅 및 검증 (Formatting & Verification)** -- 금액에서 '원', ',' 제거 / 정수형 변환. -- 출력 전, 헤더의 컬럼 개수와 데이터 행의 컬럼 개수가 일치하는지 확인하십시오. - -# Output Format -설명이나 사족 없이 오직 **Markdown Table** 만 출력하십시오. - -| 헤더1 | 헤더2_서브1 | 헤더2_서브2 | ... | -| :--- | :--- | :--- | ... | -| 값1 | 값2 | 값3 | ... |""" - - -def get_user_prompt(ocr_markdown: str = "N/A") -> str: - """ - OCR 마크다운 데이터를 포함한 사용자 프롬프트 생성 - - Args: - ocr_markdown: OCR로 추출된 마크다운 텍스트 (없으면 "N/A") - - Returns: - 완성된 사용자 프롬프트 문자열 - """ - return USER_PROMPT_TEMPLATE.format(ocr_markdown=ocr_markdown) diff --git a/tests/choi/Table_example/sample_images/.gitkeep b/tests/choi/Table_example/sample_images/.gitkeep deleted file mode 100644 index eede5be..0000000 --- a/tests/choi/Table_example/sample_images/.gitkeep +++ /dev/null @@ -1,15 +0,0 @@ -# Sample Images - -이 폴더에 테스트할 보험 테이블 이미지를 추가하세요. 
- -## 지원 형식 -- PNG -- JPG / JPEG -- GIF -- WebP -- BMP - -## 권장 이미지 -- 해상도가 높은 테이블 이미지 -- 보험 약관, 보험료표, 해지환급금표 등 -- 계층적 헤더가 있는 복잡한 테이블 diff --git a/tests/choi/Table_example/sample_images/I_table_78.md b/tests/choi/Table_example/sample_images/I_table_78.md deleted file mode 100644 index 727197b..0000000 --- a/tests/choi/Table_example/sample_images/I_table_78.md +++ /dev/null @@ -1,6 +0,0 @@ -|구분|XX세|XX+1세|XX+2세|XX+3세|XX+4세|XX+5세| -|---|---|---|---|---|---|---| -|나이증가분(A)||1,059|1,357|1,739|2,229|2,855| -|보험료 산출 기초율
(위험률 등) 증가분
(B=전년도
기준보험료의 최대
25% 가정)||10,846|13,897|17,806|22,815|29,232| -|기준보험료
(C=전년도
기준보험료+A+B)|42,325|54,321|69,485|89,030|114,074|146,161| - diff --git a/tests/choi/Table_example/sample_images/I_table_78.png b/tests/choi/Table_example/sample_images/I_table_78.png deleted file mode 100644 index 3237899..0000000 Binary files a/tests/choi/Table_example/sample_images/I_table_78.png and /dev/null differ diff --git a/tests/choi/Table_example/sample_images/qa_output/I_table_78_pair_0_qa.json b/tests/choi/Table_example/sample_images/qa_output/I_table_78_pair_0_qa.json deleted file mode 100644 index f4d70a8..0000000 --- a/tests/choi/Table_example/sample_images/qa_output/I_table_78_pair_0_qa.json +++ /dev/null @@ -1,80 +0,0 @@ -{ - "name": "I_table_78_pair_0", - "image_paths": [ - "tests/choi/Table_example/sample_images/I_table_78.png" - ], - "qa_results": [ - { - "question": "XX+2세의 기준보험료(C)는 얼마입니까?", - "answer": "69,485", - "type": "lookup", - "reasoning_annotation": "Directly retrieve the value from the '기준보험료 (C)' row and 'XX+2세' column.", - "context": null - }, - { - "question": "기준보험료(C)가 70,000 미만인 연령대는 어디입니까?", - "answer": "XX세, XX+1세, XX+2세", - "type": "filter", - "reasoning_annotation": "Filter the '기준보험료 (C)' row for values less than 70,000 and list the corresponding age categories.", - "context": null - }, - { - "question": "XX+1세부터 XX+5세까지의 나이증가분(A)의 총합은 얼마입니까?", - "answer": "9,239", - "type": "aggregate", - "reasoning_annotation": "Sum the values in the '나이증가분(A)' row for age categories XX+1세 (1,059), XX+2세 (1,357), XX+3세 (1,739), XX+4세 (2,229), and XX+5세 (2,855). (1,059 + 1,357 + 1,739 + 2,229 + 2,855 = 9,239).", - "context": null - }, - { - "question": "XX+4세와 XX+5세 중 나이증가분(A)이 더 높은 연령대는 어디입니까?", - "answer": "XX+5세", - "type": "compare", - "reasoning_annotation": "Compare the '나이증가분(A)' values for XX+4세 (2,229) and XX+5세 (2,855). 
XX+5세 has a higher value.", - "context": null - }, - { - "question": "XX+5세의 기준보험료(C)와 XX세의 기준보험료(C)의 차이는 얼마입니까?", - "answer": "103,836", - "type": "arithmetic", - "reasoning_annotation": "Subtract the '기준보험료 (C)' value for XX세 (42,325) from the '기준보험료 (C)' value for XX+5세 (146,161). (146,161 - 42,325 = 103,836).", - "context": null - }, - { - "question": "XX세부터 XX+5세까지 기준보험료(C)의 전반적인 추세는 어떻습니까?", - "answer": "연령이 증가함에 따라 기준보험료(C)가 지속적으로 증가하는 상승 추세를 보입니다.", - "type": "temporal", - "reasoning_annotation": "Observe the values in the '기준보험료 (C)' row across all age categories (42,325, 54,321, 69,485, 89,030, 114,074, 146,161). All values show a consistent increase.", - "context": null - }, - { - "question": "나이증가분(A)이 1,739인 연령대의 기준보험료(C)는 얼마입니까?", - "answer": "89,030", - "type": "multi_hop", - "reasoning_annotation": "First, find the age category where '나이증가분(A)' is 1,739, which is 'XX+3세'. Then, retrieve the '기준보험료 (C)' value for 'XX+3세'.", - "context": null - }, - { - "question": "위험률 등 기초율 변화로 인한 추가 비용이 가장 높은 연령대는 어디입니까?", - "answer": "XX+5세", - "type": "implicit_reference", - "reasoning_annotation": "The question implicitly refers to '보험료 산출 기초율 (위험률 등) 증가분 (B)'. Find the maximum value in this row, which is 29,232 for 'XX+5세'.", - "context": null - }, - { - "question": "XX+4세 바로 다음 연령대의 기준보험료는 얼마입니까?", - "answer": "146,161", - "type": "ellipsis", - "reasoning_annotation": "The question implicitly asks for the '기준보험료 (C)' of the age category immediately following 'XX+4세', which is 'XX+5세'. Retrieve the value from the '기준보험료 (C)' row and 'XX+5세' column.", - "context": null - }, - { - "question": "정부의 보험료 안정화 방안에 따라, 보험료 인상률을 재조정해야 하는 연령대는 어디입니까?", - "answer": "XX+4세, XX+5세", - "type": "long_sequence", - "reasoning_annotation": "According to the provided context, identify age categories where '보험료 산출 기초율 증가분(B)' exceeds 20,000. 
These are XX+4세 (22,815) and XX+5세 (29,232).", - "context": "정부의 보험료 안정화 방안에 따라, '보험료 산출 기초율 증가분(B)'이 20,000을 초과하는 연령대에 대해서는 보험료 인상률을 재조정해야 한다." - } - ], - "token_usage": 8086, - "errors": [] -} \ No newline at end of file diff --git a/tests/choi/Table_example/sample_images/qa_output/_summary.json b/tests/choi/Table_example/sample_images/qa_output/_summary.json deleted file mode 100644 index 1d6400d..0000000 --- a/tests/choi/Table_example/sample_images/qa_output/_summary.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "folder": "tests/choi/Table_example/sample_images", - "total": 1, - "success": 1, - "failed": 0, - "qa_only": true, - "provider": "gemini_pool", - "model": "gpt-4o-mini", - "checkpointing_enabled": false, - "checkpoint_dir": null, - "results": [ - { - "name": "I_table_78_pair_0", - "status": "success", - "qa_count": 10, - "output_file": "tests/choi/Table_example/sample_images/qa_output/I_table_78_pair_0_qa.json", - "errors": [] - } - ] -} \ No newline at end of file diff --git a/tests/choi/Table_example/table_extractor.py b/tests/choi/Table_example/table_extractor.py deleted file mode 100644 index 4024079..0000000 --- a/tests/choi/Table_example/table_extractor.py +++ /dev/null @@ -1,295 +0,0 @@ -""" -보험 테이블 추출기 -polling_gemini 패키지를 사용하여 이미지에서 테이블을 Markdown으로 변환 -""" - -import os -import asyncio -import base64 -import logging -from pathlib import Path -from typing import Optional, Dict, Any, Union -import google.generativeai as genai -from google.api_core import exceptions as google_exceptions - -from .prompts import SYSTEM_PROMPT, get_user_prompt - -# polling_gemini 패키지에서 API Pool 가져오기 -import sys -project_root = Path(__file__).parent.parent -sys.path.insert(0, str(project_root)) - -from polling_gemini import get_gemini_pool, GeminiAPIPool - -# 로깅 설정 -logger = logging.getLogger(__name__) - - -def load_image_as_base64(image_path: Union[str, Path]) -> str: - """이미지 파일을 Base64로 인코딩""" - image_path = Path(image_path) - if not image_path.exists(): - raise 
FileNotFoundError(f"이미지 파일을 찾을 수 없습니다: {image_path}") - - with open(image_path, "rb") as f: - return base64.b64encode(f.read()).decode("utf-8") - - -def get_image_mime_type(image_path: Union[str, Path]) -> str: - """이미지 파일의 MIME 타입 반환""" - image_path = Path(image_path) - suffix = image_path.suffix.lower() - - mime_types = { - ".jpg": "image/jpeg", - ".jpeg": "image/jpeg", - ".png": "image/png", - ".gif": "image/gif", - ".webp": "image/webp", - ".bmp": "image/bmp", - } - - return mime_types.get(suffix, "image/jpeg") - - -class InsuranceTableExtractor: - """ - 보험 문서 이미지에서 테이블을 추출하는 클래스 - - polling_gemini의 GeminiAPIPool을 활용하여 자동 API 키 로테이션을 지원합니다. - """ - - def __init__( - self, - config_path: Optional[str] = None, - model_name: str = "gemini-2.5-flash", - ): - """ - Args: - config_path: API 키 설정 파일 경로 (None이면 기본 경로 사용) - model_name: 사용할 Gemini 모델 이름 - """ - self.pool = get_gemini_pool(config_path) - self.model_name = model_name - self.system_prompt = SYSTEM_PROMPT - - def _create_multimodal_content( - self, - image_path: Union[str, Path], - ocr_markdown: str = "N/A" - ) -> list: - """ - 멀티모달 콘텐츠 생성 (이미지 + 텍스트) - - Args: - image_path: 이미지 파일 경로 - ocr_markdown: 참조용 OCR 마크다운 텍스트 - - Returns: - Gemini API에 전달할 콘텐츠 리스트 - """ - # 이미지 로드 - image_data = load_image_as_base64(image_path) - mime_type = get_image_mime_type(image_path) - - # 사용자 프롬프트 생성 - user_prompt = get_user_prompt(ocr_markdown) - - # 멀티모달 콘텐츠 구성 - content = [ - # 시스템 프롬프트를 포함한 전체 프롬프트 - f"{self.system_prompt}\n\n{user_prompt}", - # 이미지 데이터 - { - "mime_type": mime_type, - "data": image_data - } - ] - - return content - - def extract( - self, - image_path: Union[str, Path], - ocr_markdown: str = "N/A", - **kwargs - ) -> str: - """ - 이미지에서 테이블을 추출하여 Markdown으로 반환 (동기) - - Args: - image_path: 테이블이 포함된 이미지 파일 경로 - ocr_markdown: 참조용 OCR 마크다운 텍스트 (선택적) - **kwargs: 추가 생성 파라미터 - - Returns: - 추출된 Markdown 테이블 문자열 - """ - content = self._create_multimodal_content(image_path, ocr_markdown) - - # API Pool의 현재 설정된 모델 사용 - 
# 멀티모달 요청을 위해 직접 genai 호출 - max_retries = self.pool.settings.get('max_retries', 3) - retry_delay = self.pool.settings.get('retry_delay', 2) - - last_error = None - attempts = 0 - max_attempts = len(self.pool.api_keys) * max_retries - - while attempts < max_attempts: - current_key = self.pool.api_keys[self.pool.current_key_index] - - try: - # 현재 키로 Gemini 설정 - genai.configure(api_key=current_key.key) - model = genai.GenerativeModel(self.model_name) - - # 생성 설정 - generation_config = { - 'temperature': self.pool.settings.get('temperature', 0.1), - } - generation_config.update(kwargs.get('generation_config', {})) - - # API 호출 - response = model.generate_content( - content, - generation_config=generation_config - ) - - # 성공 시 실패 카운트 리셋 - current_key.failed_count = 0 - current_key.last_error = None - - return response.text - - except google_exceptions.ResourceExhausted as e: - logger.warning(f"API 키 '{current_key.name}' 할당량 초과. 다음 키로 전환합니다.") - current_key.failed_count += 1 - current_key.last_error = str(e) - last_error = e - - if not self.pool._rotate_key(): - break - - except Exception as e: - logger.warning(f"API 호출 실패 (키: {current_key.name}): {e}") - current_key.failed_count += 1 - current_key.last_error = str(e) - last_error = e - - if self.pool._is_quota_error(e): - if not self.pool._rotate_key(): - break - else: - import time - time.sleep(retry_delay) - - attempts += 1 - - error_msg = f"모든 API 키로 시도했으나 실패했습니다. 
마지막 에러: {last_error}" - logger.error(error_msg) - raise Exception(error_msg) - - async def aextract( - self, - image_path: Union[str, Path], - ocr_markdown: str = "N/A", - **kwargs - ) -> str: - """ - 이미지에서 테이블을 추출하여 Markdown으로 반환 (비동기) - - Args: - image_path: 테이블이 포함된 이미지 파일 경로 - ocr_markdown: 참조용 OCR 마크다운 텍스트 (선택적) - **kwargs: 추가 생성 파라미터 - - Returns: - 추출된 Markdown 테이블 문자열 - """ - # 동기 메서드를 비동기로 래핑 - loop = asyncio.get_event_loop() - return await loop.run_in_executor( - None, - lambda: self.extract(image_path, ocr_markdown, **kwargs) - ) - - def get_pool_status(self) -> Dict[str, Any]: - """현재 API Pool 상태 반환""" - return { - 'current_key': self.pool.get_current_key_info(), - 'all_keys': self.pool.get_all_keys_status() - } - - -# 편의 함수들 - -def extract_table_from_image( - image_path: Union[str, Path], - ocr_markdown: str = "N/A", - config_path: Optional[str] = None, - **kwargs -) -> str: - """ - 이미지에서 보험 테이블을 추출하는 간편 함수 (동기) - - Args: - image_path: 테이블이 포함된 이미지 파일 경로 - ocr_markdown: 참조용 OCR 마크다운 텍스트 (선택적) - config_path: API 키 설정 파일 경로 - **kwargs: 추가 생성 파라미터 - - Returns: - 추출된 Markdown 테이블 문자열 - - Example: - ```python - from Table_example import extract_table_from_image - - # 기본 사용 - result = extract_table_from_image("insurance_table.png") - print(result) - - # OCR 참조 텍스트와 함께 사용 - result = extract_table_from_image( - "insurance_table.png", - ocr_markdown="| 구분 | 금액 |\\n| 보험료 | 10000 |" - ) - ``` - """ - extractor = InsuranceTableExtractor(config_path=config_path) - return extractor.extract(image_path, ocr_markdown, **kwargs) - - -async def aextract_table_from_image( - image_path: Union[str, Path], - ocr_markdown: str = "N/A", - config_path: Optional[str] = None, - **kwargs -) -> str: - """ - 이미지에서 보험 테이블을 추출하는 간편 함수 (비동기) - - Args: - image_path: 테이블이 포함된 이미지 파일 경로 - ocr_markdown: 참조용 OCR 마크다운 텍스트 (선택적) - config_path: API 키 설정 파일 경로 - **kwargs: 추가 생성 파라미터 - - Returns: - 추출된 Markdown 테이블 문자열 - - Example: - ```python - import asyncio - from Table_example import 
aextract_table_from_image - - async def main(): - result = await aextract_table_from_image("insurance_table.png") - print(result) - - asyncio.run(main()) - ``` - """ - extractor = InsuranceTableExtractor(config_path=config_path) - return await extractor.aextract(image_path, ocr_markdown, **kwargs) diff --git a/tests/choi/Table_example/test_extraction.ipynb b/tests/choi/Table_example/test_extraction.ipynb deleted file mode 100644 index fc7a143..0000000 --- a/tests/choi/Table_example/test_extraction.ipynb +++ /dev/null @@ -1,610 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "03a4228b", - "metadata": {}, - "source": [ - "## 1. 환경 설정" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "ac52abd7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "프로젝트 루트: /Users/jaehyeokchoi/Desktop/chois_toy/private/TableMagnifier\n" - ] - } - ], - "source": [ - "import sys\n", - "from pathlib import Path\n", - "\n", - "# 프로젝트 루트를 Python 경로에 추가\n", - "project_root = Path.cwd().parent\n", - "if str(project_root) not in sys.path:\n", - " sys.path.insert(0, str(project_root))\n", - "\n", - "print(f\"프로젝트 루트: {project_root}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "3f5862d6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Table_example 패키지 로드 완료!\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jaehyeokchoi/Desktop/chois_toy/private/TableMagnifier/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "# Table_example 패키지 임포트\n", - "from Table_example import (\n", - " InsuranceTableExtractor,\n", - " extract_table_from_image,\n", - " aextract_table_from_image,\n", - " SYSTEM_PROMPT,\n", - " USER_PROMPT_TEMPLATE,\n", - ")\n", - "from Table_example.prompts import get_user_prompt\n", - "\n", - "print(\"✅ Table_example 패키지 로드 완료!\")" - ] - }, - { - "cell_type": "markdown", - "id": "7e2f995a", - "metadata": {}, - "source": [ - "## 2. 프롬프트 확인\n", - "\n", - "보험 도메인 특화 프롬프트를 확인합니다." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "95f59ce1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "📋 System Prompt:\n", - "======================================================================\n", - "# Role Definition\n", - "당신은 20년 경력의 '수석 보험 데이터 엔지니어'이자 'OCR 후처리 전문가'입니다.\n", - "당신의 임무는 제공된 [보험 문서 이미지]와 선택적으로 제공되는 [기초 OCR 텍스트]를 분석하여, 데이터베이스 적재가 가능한 완벽한 형태의 'Standardized Markdown Table'로 변환하는 것입니다.\n", - "\n", - "# Core Principles\n", - "1. **구조적 완전성(Structural Integrity):** 시각적으로 병합(Merge)된 셀은 반드시 비정규화(Denormalization)하여 모든 행에 값을 채워야 합니다. 빈칸이나 \" 상동\" 등의 표현은 금지됩니다.\n", - "2. **헤더 평탄화(Header Flattening):** 2행 이상의 계층적 헤더(Multi-row Headers)는 상위 헤더와 하위 헤더를 언더스코어(_)로 연결하여 단일 행(Single-row) 헤더로 변환합니다. (예: '보장내용' 하위에 '지급금액'이 있다면 -> '보장내용_지급금액')\n", - "3. **데이터 무결성(Data Integrity):**\n", - " - 금액의 천 단위 구분자(,)는 제거합니다. (예: 1,000,000 -> 1000000)\n", - " - 퍼센트(%)는 기호를 포함한 문자열로 유지합니다. (예: 98.5%)\n", - " - 괄호 안의 보조 정보(예: 전년 대비 증감액)는 무시하고 핵심 수치만 추출합니다.\n", - "4. **Hybrid Reference:** [기초 OCR 텍스트]가 제공될 경우, 이미지 내 텍스트가 흐릿하거나 불분명할 때 해당 텍스트를 참조하여 오타를 교정하십시오. 
단, 표의 구조(행/열 위치) 판단은 반드시 [원본 이미지]를 기준으로 합니다.\n", - "======================================================================\n" - ] - } - ], - "source": [ - "# System Prompt 확인\n", - "print(\"📋 System Prompt:\")\n", - "print(\"=\" * 70)\n", - "print(SYSTEM_PROMPT)\n", - "print(\"=\" * 70)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "3bd4aaa6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "📋 User Prompt Template:\n", - "======================================================================\n", - "# Task Description\n", - "아래 제공된 입력을 바탕으로 보험 테이블 데이터를 추출하십시오.\n", - "\n", - "# Input Data\n", - "1. **Target Image:** [첨부된 이미지]\n", - "2. **Reference OCR Markdown (Optional):**\n", - "\"\"\"\n", - "{ocr_markdown}\n", - "\"\"\"\n", - "\n", - "# Step-by-Step Instructions (Chain-of-Table)\n", - "단계별로 생각하고 실행하십시오:\n", - "\n", - "**Step 1. 구조 분석 (Structure Analysis)**\n", - "- 이미지를 보고 표의 전체적인 레이아웃을 파악하십시오.\n", - "- 헤더가 몇 개의 행(Row)으로 구성되어 있는지 확인하십시오.\n", - "- 세로로 병합된(Vertically Merged) '구분'이나 '기간' 컬럼이 있는지 식별하십시오.\n", - "\n", - "**Step 2. 헤더 처리 (Header Processing)**\n", - "- 다중 행 헤더를 단일 행 키(Unique Key)로 변환하십시오.\n", - "- 예:\n", - " | 구분 | 해지환급금 |\n", - " | | 금액 | 환급률 |\n", - " -> | 구분 | 해지환급금_금액 | 해지환급금_환급률 |\n", - "\n", - "**Step 3. 데이터 추출 및 채우기 (Extraction & Filling)**\n", - "- 각 행(Row)의 데이터를 추출하십시오.\n", - "- **중요:** 병합된 셀은 해당 범위에 속하는 모든 행에 동일한 값을 반복 입력(Repeat Value)하십시오. 절대 빈 칸으로 두지 마십시오.\n", - "- OCR 참고용 텍스트가 있다면, 숫자의 정확성을 검증하는 데 사용하십시오.\n", - "\n", - "**Step 4. 포맷팅 및 검증 (Formatting & Verification)**\n", - "- 금액에서 '원', ',' 제거 / 정수형 변환.\n", - "- 출력 전, 헤더의 컬럼 개수와 데이터 행의 컬럼 개수가 일치하는지 확인하십시오.\n", - "\n", - "# Output Format\n", - "설명이나 사족 없이 오직 **Markdown Table** 만 출력하십시오.\n", - "\n", - "| 헤더1 | 헤더2_서브1 | 헤더2_서브2 | ... |\n", - "| :--- | :--- | :--- | ... |\n", - "| 값1 | 값2 | 값3 | ... 
|\n", - "======================================================================\n" - ] - } - ], - "source": [ - "# User Prompt Template 확인\n", - "print(\"📋 User Prompt Template:\")\n", - "print(\"=\" * 70)\n", - "print(USER_PROMPT_TEMPLATE)\n", - "print(\"=\" * 70)" - ] - }, - { - "cell_type": "markdown", - "id": "efe2d4d4", - "metadata": {}, - "source": [ - "## 3. 추출기 초기화\n", - "\n", - "API 키 설정 확인 및 추출기를 초기화합니다." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "4d6f9bcb", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ API 키 설정 파일 확인됨: /Users/jaehyeokchoi/Desktop/chois_toy/private/TableMagnifier/apis/gemini_keys.yaml\n" - ] - } - ], - "source": [ - "# API 키 설정 파일 확인\n", - "config_path = project_root / \"apis\" / \"gemini_keys.yaml\"\n", - "\n", - "if not config_path.exists():\n", - " print(f\"❌ API 키 설정 파일이 없습니다: {config_path}\")\n", - " print(\"\\n다음 단계를 수행하세요:\")\n", - " print(\"1. apis/gemini_keys-example.yaml을 apis/gemini_keys.yaml로 복사\")\n", - " print(\"2. 실제 Gemini API 키를 입력\")\n", - " print(\"3. 
Google AI Studio에서 무료 API 키 발급:\")\n", - " print(\" https://makersuite.google.com/app/apikey\")\n", - "else:\n", - " print(f\"✅ API 키 설정 파일 확인됨: {config_path}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "08823fe6", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-12-02 14:27:15,381 - polling_gemini.api_pool - INFO - 총 3개의 API 키를 로드했습니다.\n", - "2025-12-02 14:27:15,382 - polling_gemini.api_pool - INFO - API 키 'key1' 사용 중 (모델: gemini-2.5-flash)\n", - "2025-12-02 14:27:15,382 - polling_gemini.api_pool - INFO - API 키 'key1' 사용 중 (모델: gemini-2.5-flash)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ InsuranceTableExtractor 초기화 성공!\n", - "\n", - "현재 사용 중인 API 키: key1\n", - "총 API 키 수: 3\n", - "\n", - "전체 API 키 상태:\n", - " - key1: 활성화=True, 실패횟수=0\n", - " - key2: 활성화=True, 실패횟수=0\n", - " - key3: 활성화=True, 실패횟수=0\n" - ] - } - ], - "source": [ - "# InsuranceTableExtractor 초기화\n", - "try:\n", - " extractor = InsuranceTableExtractor()\n", - " \n", - " # API Pool 상태 확인\n", - " status = extractor.get_pool_status()\n", - " \n", - " print(\"✅ InsuranceTableExtractor 초기화 성공!\")\n", - " print(f\"\\n현재 사용 중인 API 키: {status['current_key']['name']}\")\n", - " print(f\"총 API 키 수: {status['current_key']['total_keys']}\")\n", - " \n", - " print(\"\\n전체 API 키 상태:\")\n", - " for key in status['all_keys']:\n", - " print(f\" - {key['name']}: 활성화={key['enabled']}, 실패횟수={key['failed_count']}\")\n", - " \n", - "except FileNotFoundError as e:\n", - " print(f\"❌ API 키 파일 오류: {e}\")\n", - "except Exception as e:\n", - " print(f\"❌ 초기화 실패: {e}\")" - ] - }, - { - "cell_type": "markdown", - "id": "41a30785", - "metadata": {}, - "source": [ - "## 4. 샘플 이미지 확인\n", - "\n", - "테스트할 이미지 파일을 확인합니다." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "dd007a21", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "📁 샘플 이미지 디렉토리: /Users/jaehyeokchoi/Desktop/chois_toy/private/TableMagnifier/Table_example/sample_images\n", - "📷 발견된 이미지: 1개\n", - " 1. I_table_78.png\n" - ] - } - ], - "source": [ - "# 샘플 이미지 디렉토리 확인\n", - "sample_images_dir = Path.cwd() / \"sample_images\"\n", - "\n", - "if not sample_images_dir.exists():\n", - " sample_images_dir.mkdir(parents=True, exist_ok=True)\n", - " print(f\"📁 샘플 이미지 디렉토리 생성됨: {sample_images_dir}\")\n", - "\n", - "# 이미지 파일 검색\n", - "image_extensions = [\"*.png\", \"*.jpg\", \"*.jpeg\", \"*.gif\", \"*.webp\", \"*.bmp\"]\n", - "image_files = []\n", - "for ext in image_extensions:\n", - " image_files.extend(sample_images_dir.glob(ext))\n", - "\n", - "print(f\"\\n📁 샘플 이미지 디렉토리: {sample_images_dir}\")\n", - "print(f\"📷 발견된 이미지: {len(image_files)}개\")\n", - "\n", - "if image_files:\n", - " for i, img in enumerate(image_files, 1):\n", - " print(f\" {i}. {img.name}\")\n", - "else:\n", - " print(\"\\n⚠️ 테스트할 이미지를 sample_images 폴더에 추가하세요!\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "7f0f1733", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "📷 첫 번째 이미지 미리보기:\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "" - ] - }, - "metadata": { - "image/png": { - "width": 600 - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# 이미지 미리보기 (IPython 사용)\n", - "if image_files:\n", - " from IPython.display import Image, display\n", - " \n", - " print(\"📷 첫 번째 이미지 미리보기:\")\n", - " display(Image(filename=str(image_files[0]), width=600))" - ] - }, - { - "cell_type": "markdown", - "id": "43bd0f11", - "metadata": {}, - "source": [ - "## 5. 기본 테이블 추출\n", - "\n", - "이미지에서 테이블을 추출합니다." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "101765ab", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🎯 추출 대상 이미지: I_table_78.png\n" - ] - } - ], - "source": [ - "# 추출할 이미지 선택\n", - "if image_files:\n", - " # 첫 번째 이미지 사용\n", - " target_image = image_files[0]\n", - " print(f\"🎯 추출 대상 이미지: {target_image.name}\")\n", - "else:\n", - " print(\"❌ 이미지가 없습니다. sample_images 폴더에 이미지를 추가하세요.\")\n", - " target_image = None" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "8b4d4f8a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🚀 테이블 추출 시작: I_table_78.png\n", - "======================================================================\n", - "\n", - "✅ 추출 완료!\n", - "\n", - "📊 결과 (Markdown Table):\n", - "----------------------------------------------------------------------\n", - "| 구분 | XX세 | XX+1세 | XX+2세 | XX+3세 | XX+4세 | XX+5세 |\n", - "| :--- | :--- | :--- | :--- | :--- | :--- | :--- |\n", - "| 나이증가분(A) | | 1059 | 1357 | 1739 | 2229 | 2855 |\n", - "| 보험료 산출 기초율 (위험률 등) 증가분 (B=전년도 기준보험료의 최대 25% 가정) | | 10846 | 13897 | 17806 | 22815 | 29232 |\n", - "| 기준보험료 (C=전년도 기준보험료+A+B) | 42325 | 54321 | 69485 | 89030 | 114074 | 146161 |\n", - "----------------------------------------------------------------------\n", - "CPU times: user 21.5 ms, sys: 14.8 ms, total: 36.3 ms\n", - "Wall time: 14.7 s\n", - "\n", - "✅ 추출 완료!\n", - "\n", - "📊 결과 (Markdown Table):\n", - "----------------------------------------------------------------------\n", - "| 구분 | XX세 | XX+1세 | XX+2세 | XX+3세 | XX+4세 | XX+5세 |\n", - "| :--- | :--- | :--- | :--- | :--- | :--- | :--- |\n", - "| 나이증가분(A) | | 1059 | 1357 | 1739 | 2229 | 2855 |\n", - "| 보험료 산출 기초율 (위험률 등) 증가분 (B=전년도 기준보험료의 최대 25% 가정) | | 10846 | 13897 | 17806 | 22815 | 29232 |\n", - "| 기준보험료 (C=전년도 기준보험료+A+B) | 42325 | 54321 | 69485 | 89030 | 114074 | 146161 |\n", - 
"----------------------------------------------------------------------\n", - "CPU times: user 21.5 ms, sys: 14.8 ms, total: 36.3 ms\n", - "Wall time: 14.7 s\n" - ] - } - ], - "source": [ - "%%time\n", - "# 테이블 추출 실행\n", - "if target_image:\n", - " print(f\"🚀 테이블 추출 시작: {target_image.name}\")\n", - " print(\"=\" * 70)\n", - " \n", - " try:\n", - " result = extract_table_from_image(target_image)\n", - " \n", - " print(\"\\n✅ 추출 완료!\")\n", - " print(\"\\n📊 결과 (Markdown Table):\")\n", - " print(\"-\" * 70)\n", - " print(result)\n", - " print(\"-\" * 70)\n", - " \n", - " except Exception as e:\n", - " print(f\"❌ 추출 실패: {e}\")\n", - " import traceback\n", - " traceback.print_exc()" - ] - }, - { - "cell_type": "markdown", - "id": "bc284731", - "metadata": {}, - "source": [ - "## 6. OCR 참조 텍스트와 함께 추출\n", - "\n", - "OCR로 먼저 추출한 텍스트를 참조로 제공하여 정확도를 높입니다." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "361949ff", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ OCR 참조 파일 로드됨: I_table_78.md\n", - "\n", - "📝 OCR 참조 텍스트:\n", - "----------------------------------------------------------------------\n", - "|구분|XX세|XX+1세|XX+2세|XX+3세|XX+4세|XX+5세|\n", - "|---|---|---|---|---|---|---|\n", - "|나이증가분(A)||1,059|1,357|1,739|2,229|2,855|\n", - "|보험료 산출 기초율
(위험률 등) 증가분
(B=전년도
기준보험료의 최대
25% 가정)||10,846|13,897|17,806|22,815|29,232|\n", - "|기준보험료
(C=전년도
기준보험료+A+B)|42,325|54,321|69,485|89,030|114,074|146,161|\n", - "----------------------------------------------------------------------\n" - ] - } - ], - "source": [ - "# OCR 참조 텍스트 로드 (이미지와 같은 이름의 .md 파일에서)\n", - "# 예: I_table_78.png → I_table_78.md\n", - "\n", - "if target_image:\n", - " # 이미지와 같은 이름의 .md 파일 찾기\n", - " ocr_md_path = target_image.with_suffix(\".md\")\n", - " \n", - " if ocr_md_path.exists():\n", - " with open(ocr_md_path, \"r\", encoding=\"utf-8\") as f:\n", - " sample_ocr_markdown = f.read().strip()\n", - " print(f\"✅ OCR 참조 파일 로드됨: {ocr_md_path.name}\")\n", - " print(\"\\n📝 OCR 참조 텍스트:\")\n", - " print(\"-\" * 70)\n", - " print(sample_ocr_markdown)\n", - " print(\"-\" * 70)\n", - " else:\n", - " sample_ocr_markdown = None\n", - " print(f\"⚠️ OCR 참조 파일이 없습니다: {ocr_md_path.name}\")\n", - " print(\" 이미지와 같은 이름의 .md 파일을 sample_images 폴더에 추가하세요.\")\n", - "else:\n", - " sample_ocr_markdown = None\n", - " print(\"❌ 대상 이미지가 없습니다.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "8495b93f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🚀 OCR 참조와 함께 테이블 추출: I_table_78.png\n", - "======================================================================\n", - "\n", - "✅ 추출 완료!\n", - "\n", - "📊 결과 (OCR 참조 사용):\n", - "----------------------------------------------------------------------\n", - "|구분|XX세|XX+1세|XX+2세|XX+3세|XX+4세|XX+5세|\n", - "|:---|:---|:---|:---|:---|:---|:---|:---|\n", - "|나이증가분(A)||1059|1357|1739|2229|2855|\n", - "|보험료 산출 기초율 (위험률 등) 증가분 (B=전년도 기준보험료의 최대 25% 가정)||10846|13897|17806|22815|29232|\n", - "|기준보험료 (C=전년도 기준보험료+A+B)|42325|54321|69485|89030|114074|146161|\n", - "----------------------------------------------------------------------\n", - "CPU times: user 17.5 ms, sys: 17.2 ms, total: 34.7 ms\n", - "Wall time: 20.8 s\n", - "\n", - "✅ 추출 완료!\n", - "\n", - "📊 결과 (OCR 참조 사용):\n", - "----------------------------------------------------------------------\n", - 
"|구분|XX세|XX+1세|XX+2세|XX+3세|XX+4세|XX+5세|\n", - "|:---|:---|:---|:---|:---|:---|:---|:---|\n", - "|나이증가분(A)||1059|1357|1739|2229|2855|\n", - "|보험료 산출 기초율 (위험률 등) 증가분 (B=전년도 기준보험료의 최대 25% 가정)||10846|13897|17806|22815|29232|\n", - "|기준보험료 (C=전년도 기준보험료+A+B)|42325|54321|69485|89030|114074|146161|\n", - "----------------------------------------------------------------------\n", - "CPU times: user 17.5 ms, sys: 17.2 ms, total: 34.7 ms\n", - "Wall time: 20.8 s\n" - ] - } - ], - "source": [ - "%%time\n", - "# OCR 참조와 함께 테이블 추출\n", - "if target_image and sample_ocr_markdown:\n", - " print(f\"🚀 OCR 참조와 함께 테이블 추출: {target_image.name}\")\n", - " print(\"=\" * 70)\n", - " \n", - " try:\n", - " result_with_ocr = extract_table_from_image(\n", - " target_image,\n", - " ocr_markdown=sample_ocr_markdown\n", - " )\n", - " \n", - " print(\"\\n✅ 추출 완료!\")\n", - " print(\"\\n📊 결과 (OCR 참조 사용):\")\n", - " print(\"-\" * 70)\n", - " print(result_with_ocr)\n", - " print(\"-\" * 70)\n", - " \n", - " except Exception as e:\n", - " print(f\"❌ 추출 실패: {e}\")\n", - "elif target_image and not sample_ocr_markdown:\n", - " print(\"⚠️ OCR 참조 파일이 없어서 이 섹션을 건너뜁니다.\")\n", - " print(f\" {target_image.stem}.md 파일을 sample_images 폴더에 추가하세요.\")\n", - "else:\n", - " print(\"❌ 이미지가 없습니다.\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tests/choi/Table_example/test_extraction.py b/tests/choi/Table_example/test_extraction.py deleted file mode 100644 index fb4e153..0000000 --- a/tests/choi/Table_example/test_extraction.py +++ /dev/null @@ -1,346 +0,0 @@ -""" -보험 테이블 추출 테스트 코드 -Table_example 패키지를 사용하여 이미지에서 테이블을 추출하는 예제 -""" - 
-import sys -import asyncio -from pathlib import Path - -# 프로젝트 루트를 Python 경로에 추가 -project_root = Path(__file__).parent.parent -sys.path.insert(0, str(project_root)) - -from Table_example import ( - InsuranceTableExtractor, - extract_table_from_image, - aextract_table_from_image, - SYSTEM_PROMPT, - USER_PROMPT_TEMPLATE, -) - - -def test_prompt_display(): - """1. 프롬프트 내용 확인""" - print("\n" + "="*70) - print("테스트 1: 프롬프트 내용 확인") - print("="*70) - - print("\n📋 System Prompt:") - print("-"*50) - print(SYSTEM_PROMPT[:500] + "...") - - print("\n📋 User Prompt Template (일부):") - print("-"*50) - print(USER_PROMPT_TEMPLATE[:500] + "...") - - print("\n✅ 프롬프트 로드 성공!") - return True - - -def test_extractor_initialization(): - """2. 추출기 초기화 테스트""" - print("\n" + "="*70) - print("테스트 2: InsuranceTableExtractor 초기화") - print("="*70) - - try: - extractor = InsuranceTableExtractor() - - status = extractor.get_pool_status() - print(f"\n현재 사용 중인 키: {status['current_key']['name']}") - print(f"총 API 키 수: {status['current_key']['total_keys']}") - - print("\n✅ 추출기 초기화 성공!") - return True - - except FileNotFoundError as e: - print(f"\n⚠️ API 키 파일이 없습니다: {e}") - print("apis/gemini_keys.yaml 파일을 생성하세요.") - return False - except Exception as e: - print(f"\n❌ 초기화 실패: {e}") - return False - - -def test_extract_from_sample_image(): - """3. 
샘플 이미지에서 테이블 추출 테스트""" - print("\n" + "="*70) - print("테스트 3: 샘플 이미지에서 테이블 추출") - print("="*70) - - # 샘플 이미지 경로 확인 - sample_images_dir = project_root / "Table_example" / "sample_images" - - if not sample_images_dir.exists(): - print(f"\n⚠️ 샘플 이미지 디렉토리가 없습니다: {sample_images_dir}") - print("sample_images 폴더에 테스트 이미지를 추가하세요.") - return None - - # 샘플 이미지 찾기 - image_files = list(sample_images_dir.glob("*.png")) + \ - list(sample_images_dir.glob("*.jpg")) + \ - list(sample_images_dir.glob("*.jpeg")) - - if not image_files: - print(f"\n⚠️ 샘플 이미지가 없습니다: {sample_images_dir}") - print("PNG, JPG, JPEG 형식의 이미지를 추가하세요.") - return None - - # 첫 번째 이미지로 테스트 - image_path = image_files[0] - print(f"\n테스트 이미지: {image_path.name}") - - try: - # 테이블 추출 - result = extract_table_from_image(image_path) - - print("\n📊 추출 결과:") - print("-"*50) - print(result) - print("-"*50) - - print("\n✅ 테이블 추출 성공!") - return result - - except Exception as e: - print(f"\n❌ 추출 실패: {e}") - import traceback - traceback.print_exc() - return None - - -def test_extract_with_ocr_reference(): - """4. 
OCR 참조 텍스트와 함께 추출 테스트""" - print("\n" + "="*70) - print("테스트 4: OCR 참조 텍스트와 함께 추출") - print("="*70) - - sample_images_dir = project_root / "Table_example" / "sample_images" - - if not sample_images_dir.exists(): - print(f"\n⚠️ 샘플 이미지 디렉토리가 없습니다.") - return None - - image_files = list(sample_images_dir.glob("*.png")) + \ - list(sample_images_dir.glob("*.jpg")) - - if not image_files: - print(f"\n⚠️ 샘플 이미지가 없습니다.") - return None - - image_path = image_files[0] - print(f"\n테스트 이미지: {image_path.name}") - - # OCR 참조 텍스트 예시 (실제로는 OCR 결과를 사용) - sample_ocr_markdown = """| 구분 | 보험기간 | 납입기간 | 가입금액 | -| 상해사망 | 80세 | 20년 | 1억원 | -| 질병사망 | 80세 | 20년 | 5천만원 |""" - - try: - result = extract_table_from_image( - image_path, - ocr_markdown=sample_ocr_markdown - ) - - print("\n📊 추출 결과 (OCR 참조 사용):") - print("-"*50) - print(result) - print("-"*50) - - print("\n✅ OCR 참조 추출 성공!") - return result - - except Exception as e: - print(f"\n❌ 추출 실패: {e}") - import traceback - traceback.print_exc() - return None - - -def test_async_extraction(): - """5. 비동기 테이블 추출 테스트""" - print("\n" + "="*70) - print("테스트 5: 비동기 테이블 추출") - print("="*70) - - async def async_test(): - sample_images_dir = project_root / "Table_example" / "sample_images" - - if not sample_images_dir.exists(): - print(f"\n⚠️ 샘플 이미지 디렉토리가 없습니다.") - return None - - image_files = list(sample_images_dir.glob("*.png")) + \ - list(sample_images_dir.glob("*.jpg")) - - if not image_files: - print(f"\n⚠️ 샘플 이미지가 없습니다.") - return None - - image_path = image_files[0] - print(f"\n테스트 이미지: {image_path.name}") - - try: - import time - start_time = time.time() - - result = await aextract_table_from_image(image_path) - - elapsed = time.time() - start_time - - print(f"\n⏱️ 소요 시간: {elapsed:.2f}초") - print("\n📊 추출 결과:") - print("-"*50) - print(result[:500] + "..." 
if len(result) > 500 else result) - print("-"*50) - - print("\n✅ 비동기 추출 성공!") - return result - - except Exception as e: - print(f"\n❌ 추출 실패: {e}") - import traceback - traceback.print_exc() - return None - - return asyncio.run(async_test()) - - -def test_multiple_images(): - """6. 여러 이미지 동시 처리 테스트""" - print("\n" + "="*70) - print("테스트 6: 여러 이미지 동시 처리 (비동기)") - print("="*70) - - async def async_test(): - sample_images_dir = project_root / "Table_example" / "sample_images" - - if not sample_images_dir.exists(): - print(f"\n⚠️ 샘플 이미지 디렉토리가 없습니다.") - return None - - image_files = list(sample_images_dir.glob("*.png")) + \ - list(sample_images_dir.glob("*.jpg")) + \ - list(sample_images_dir.glob("*.jpeg")) - - if len(image_files) < 2: - print(f"\n⚠️ 2개 이상의 샘플 이미지가 필요합니다. 현재: {len(image_files)}개") - return None - - # 최대 3개 이미지 처리 - images_to_process = image_files[:3] - - print(f"\n처리할 이미지 {len(images_to_process)}개:") - for img in images_to_process: - print(f" - {img.name}") - - try: - import time - start_time = time.time() - - # 동시에 여러 이미지 처리 - tasks = [aextract_table_from_image(img) for img in images_to_process] - results = await asyncio.gather(*tasks, return_exceptions=True) - - elapsed = time.time() - start_time - - print(f"\n⏱️ 총 소요 시간: {elapsed:.2f}초") - - for i, (img, result) in enumerate(zip(images_to_process, results), 1): - print(f"\n📊 이미지 {i} ({img.name}) 결과:") - print("-"*40) - if isinstance(result, Exception): - print(f"❌ 에러: {result}") - else: - print(result[:300] + "..." 
if len(result) > 300 else result) - - success_count = sum(1 for r in results if not isinstance(r, Exception)) - print(f"\n✅ {len(images_to_process)}개 중 {success_count}개 성공!") - return results - - except Exception as e: - print(f"\n❌ 처리 실패: {e}") - import traceback - traceback.print_exc() - return None - - return asyncio.run(async_test()) - - -def main(): - """모든 테스트 실행""" - print("\n" + "🏥 " * 20) - print("보험 테이블 추출 테스트 시작") - print("🏥 " * 20) - - # API 키 설정 확인 - config_path = project_root / "apis" / "gemini_keys.yaml" - if not config_path.exists(): - print(f"\n❌ 오류: API 키 설정 파일이 없습니다: {config_path}") - print("\n다음 단계를 수행하세요:") - print("1. apis/gemini_keys-example.yaml을 apis/gemini_keys.yaml로 복사") - print("2. 실제 Gemini API 키를 입력") - print("3. Google AI Studio에서 무료 API 키 발급:") - print(" https://makersuite.google.com/app/apikey") - return - - # 샘플 이미지 디렉토리 확인/생성 - sample_images_dir = project_root / "Table_example" / "sample_images" - if not sample_images_dir.exists(): - sample_images_dir.mkdir(parents=True, exist_ok=True) - print(f"\n📁 샘플 이미지 디렉토리를 생성했습니다: {sample_images_dir}") - print("⚠️ 테스트할 보험 테이블 이미지를 이 폴더에 추가하세요.") - - # 테스트 실행 - tests = [ - ("프롬프트 확인", test_prompt_display), - ("추출기 초기화", test_extractor_initialization), - ("이미지 추출", test_extract_from_sample_image), - ("OCR 참조 추출", test_extract_with_ocr_reference), - ("비동기 추출", test_async_extraction), - ("다중 이미지", test_multiple_images), - ] - - results = [] - for name, test_func in tests: - try: - result = test_func() - # None은 이미지가 없어서 스킵된 경우 - if result is None: - results.append((name, "skipped")) - else: - results.append((name, "success")) - except KeyboardInterrupt: - print("\n\n⚠️ 사용자가 테스트를 중단했습니다.") - break - except Exception as e: - print(f"\n❌ 예상치 못한 오류: {e}") - results.append((name, "failed")) - - # 결과 요약 - print("\n" + "=" * 70) - print("테스트 결과 요약") - print("=" * 70) - - for name, status in results: - if status == "success": - icon = "✅" - elif status == "skipped": - icon = "⏭️ " - else: - icon = "❌" - 
print(f"{icon} {name}: {status}") - - success_count = sum(1 for _, s in results if s == "success") - skipped_count = sum(1 for _, s in results if s == "skipped") - total_count = len(results) - - print(f"\n총 {total_count}개 테스트 중 {success_count}개 성공, {skipped_count}개 스킵") - - if skipped_count > 0: - print("\n💡 팁: sample_images 폴더에 보험 테이블 이미지를 추가하면 더 많은 테스트가 실행됩니다.") - - -if __name__ == "__main__": - main() diff --git a/tests/choi/upload_fin.txt b/tests/choi/upload_fin.txt deleted file mode 100644 index da757cb..0000000 --- a/tests/choi/upload_fin.txt +++ /dev/null @@ -1 +0,0 @@ -I_origin_0/I_table_1.png diff --git a/tests/test_flow.py b/tests/test_flow.py deleted file mode 100644 index 417c125..0000000 --- a/tests/test_flow.py +++ /dev/null @@ -1,19 +0,0 @@ -import pytest -from unittest.mock import MagicMock -from langchain_core.messages import AIMessage -from generate_synthetic_table.flow import build_synthetic_table_graph, TableState - -@pytest.fixture -def mock_llm(): - llm = MagicMock() - llm.invoke.return_value = AIMessage(content="Mock response") - return llm - -def test_graph_compilation(mock_llm): - graph = build_synthetic_table_graph(mock_llm) - assert graph is not None - -def test_image_to_html_node(mock_llm): - # This is a bit harder to test without mocking the file system or _load_prompt - # For now, we just check if the graph builds and runs with a mock - pass diff --git a/tests/test_html_to_image.py b/tests/test_html_to_image.py deleted file mode 100644 index e90c721..0000000 --- a/tests/test_html_to_image.py +++ /dev/null @@ -1,41 +0,0 @@ - -from pathlib import Path -import pytest -from generate_synthetic_table.html_to_image import capture_html_as_image - -def test_capture_html_as_image(tmp_path): - html = """ - - - - - -

Test Table

-
- - - - - - - - - - - - -
NameAge
Alice30
Bob25
- - - """ - - output_path = tmp_path / "test_table.png" - capture_html_as_image(html, output_path) - - assert output_path.exists() - assert output_path.stat().st_size > 0 - print(f"Image saved to {output_path}") diff --git a/tests/test_utils.py b/tests/test_utils.py deleted file mode 100644 index d78f755..0000000 --- a/tests/test_utils.py +++ /dev/null @@ -1,39 +0,0 @@ -import pytest -from generate_synthetic_table.validators import robust_json_parse, validate_html - -def test_robust_json_parse_valid(): - assert robust_json_parse('{"a": 1}') == {"a": 1} - -def test_robust_json_parse_markdown(): - text = """ - Here is the json: - ```json - { - "key": "value" - } - ``` - """ - assert robust_json_parse(text) == {"key": "value"} - -def test_robust_json_parse_markdown_no_lang(): - text = """ - ``` - {"x": [1, 2]} - ``` - """ - assert robust_json_parse(text) == {"x": [1, 2]} - -def test_robust_json_parse_dirty(): - text = "Sure! {\"a\": 1} is the answer." - assert robust_json_parse(text) == {"a": 1} - -def test_validate_html_valid(): - html = "
Cell
" - assert validate_html(html) is True - -def test_validate_html_invalid_structure(): - html = "
Not a table
" - assert validate_html(html) is False - -def test_validate_html_empty(): - assert validate_html("") is False diff --git a/tests/test_validation.py b/tests/test_validation.py deleted file mode 100644 index 3b72a1b..0000000 --- a/tests/test_validation.py +++ /dev/null @@ -1,354 +0,0 @@ -"""3단계 검증 체인 테스트 스크립트 (standalone)""" - -import re -import logging -from typing import Optional, Any, List, Tuple -from bs4 import BeautifulSoup - -# pandas/pandasql import -try: - import pandas as pd - from pandasql import sqldf - PANDAS_SQL_AVAILABLE = True -except ImportError: - PANDAS_SQL_AVAILABLE = False - pd = None - sqldf = None - -# REPL import -try: - from langchain_experimental.tools import PythonREPLTool - REPL_AVAILABLE = True -except ImportError: - REPL_AVAILABLE = False - PythonREPLTool = None - -logger = logging.getLogger(__name__) - - -def parse_html_table_to_json(html: str) -> Optional[dict]: - """HTML 테이블을 JSON으로 파싱""" - if not html: - return None - - try: - soup = BeautifulSoup(html, "html.parser") - table = soup.find("table") - if not table: - return None - - rows = table.find_all("tr") - if not rows: - return None - - max_cols = 0 - for row in rows: - cols = sum(int(cell.get("colspan", 1)) for cell in row.find_all(["td", "th"])) - max_cols = max(max_cols, cols) - - if max_cols == 0: - return None - - grid = [] - for row_idx, row in enumerate(rows): - while len(grid) <= row_idx: - grid.append([None] * max_cols) - - col_idx = 0 - for cell in row.find_all(["td", "th"]): - while col_idx < max_cols and grid[row_idx][col_idx] is not None: - col_idx += 1 - - if col_idx >= max_cols: - break - - cell_text = cell.get_text(strip=True) - colspan = int(cell.get("colspan", 1)) - rowspan = int(cell.get("rowspan", 1)) - - for r in range(rowspan): - for c in range(colspan): - target_row = row_idx + r - target_col = col_idx + c - while len(grid) <= target_row: - grid.append([None] * max_cols) - if target_col < max_cols: - grid[target_row][target_col] = cell_text - - col_idx += 
colspan - - raw_rows = [[cell if cell is not None else "" for cell in row] for row in grid] - if not raw_rows: - return None - - has_header = bool(rows[0].find_all("th")) - - if has_header and len(raw_rows) > 1: - headers = raw_rows[0] - data = raw_rows[1:] - else: - headers = [f"col_{i+1}" for i in range(len(raw_rows[0]))] - data = raw_rows - - return {"headers": headers, "data": data, "raw_rows": raw_rows} - except Exception as e: - logger.error(f"HTML parsing failed: {e}") - return None - - -def html_table_to_dataframe(html: str) -> Optional[Any]: - """HTML 테이블을 DataFrame으로 변환""" - if not PANDAS_SQL_AVAILABLE: - return None - - parsed = parse_html_table_to_json(html) - if not parsed: - return None - - try: - headers = parsed["headers"] - data = parsed["data"] - - clean_headers = [] - for i, h in enumerate(headers): - clean = re.sub(r'[^\w가-힣]', '_', str(h).strip()) - clean = re.sub(r'_+', '_', clean).strip('_') - if not clean or clean[0].isdigit(): - clean = f"col_{i}" - clean_headers.append(clean) - - seen = {} - final_headers = [] - for h in clean_headers: - if h in seen: - seen[h] += 1 - final_headers.append(f"{h}_{seen[h]}") - else: - seen[h] = 0 - final_headers.append(h) - - return pd.DataFrame(data, columns=final_headers) - except Exception as e: - logger.error(f"DataFrame conversion failed: {e}") - return None - - -def compare_tables_with_sql(original_html: str, synthetic_html: str) -> Tuple[bool, List[str]]: - """pandasql로 두 테이블 비교""" - if not PANDAS_SQL_AVAILABLE: - return True, ["pandas/pandasql not available"] - - issues = [] - - df_original = html_table_to_dataframe(original_html) - df_synthetic = html_table_to_dataframe(synthetic_html) - - if df_original is None: - return False, ["원본 테이블 변환 실패"] - if df_synthetic is None: - return False, ["합성 테이블 변환 실패"] - - orig_shape = df_original.shape - synth_shape = df_synthetic.shape - - if orig_shape[1] != synth_shape[1]: - issues.append(f"열 수 불일치: 원본={orig_shape[1]}, 합성={synth_shape[1]}") - - if 
orig_shape[0] != synth_shape[0]: - issues.append(f"행 수 불일치: 원본={orig_shape[0]}, 합성={synth_shape[0]}") - - if orig_shape[1] != synth_shape[1]: - return False, issues - - try: - common_cols = [f"c{i}" for i in range(orig_shape[1])] - df_original.columns = common_cols - df_synthetic.columns = common_cols - - env = {"df_original": df_original, "df_synthetic": df_synthetic} - - numeric_cols = df_original.select_dtypes(include=['number']).columns.tolist() - - for col in numeric_cols: - try: - sum_query = f""" - SELECT - (SELECT COALESCE(SUM(CAST({col} AS REAL)), 0) FROM df_original) as orig_sum, - (SELECT COALESCE(SUM(CAST({col} AS REAL)), 0) FROM df_synthetic) as synth_sum - """ - sum_result = sqldf(sum_query, env) - orig_sum = sum_result.iloc[0]['orig_sum'] - synth_sum = sum_result.iloc[0]['synth_sum'] - - if abs(orig_sum - synth_sum) > 0.01: - issues.append(f"열 '{col}' 합계 불일치: 원본={orig_sum:.2f}, 합성={synth_sum:.2f}") - except: - pass - - except Exception as e: - issues.append(f"SQL 비교 오류: {e}") - - passed = len([i for i in issues if not i.startswith(" -")]) == 0 - return passed, issues - - -def compare_tables_with_repl(original_html: str, synthetic_html: str) -> Tuple[bool, List[str]]: - """REPL로 두 테이블 비교""" - if not REPL_AVAILABLE or not PANDAS_SQL_AVAILABLE: - return True, ["REPL 사용 불가"] - - df_original = html_table_to_dataframe(original_html) - df_synthetic = html_table_to_dataframe(synthetic_html) - - if df_original is None or df_synthetic is None: - return False, ["DataFrame 변환 실패"] - - issues = [] - - try: - # Shape 비교 - if df_original.shape != df_synthetic.shape: - issues.append(f"Shape 불일치: 원본={df_original.shape}, 합성={df_synthetic.shape}") - - # 행 수 비교 - if len(df_original) != len(df_synthetic): - issues.append(f"행 수 불일치: 원본={len(df_original)}, 합성={len(df_synthetic)}") - - # 숫자 컬럼 합계 비교 - for col in df_original.columns: - if col in df_synthetic.columns: - try: - orig_sum = pd.to_numeric(df_original[col], errors='coerce').sum() - synth_sum = 
pd.to_numeric(df_synthetic[col], errors='coerce').sum() - if pd.notna(orig_sum) and pd.notna(synth_sum): - if abs(orig_sum - synth_sum) > 0.01: - issues.append(f"컬럼 '{col}' 합계 불일치: 원본={orig_sum:.2f}, 합성={synth_sum:.2f}") - except: - pass - - except Exception as e: - issues.append(f"REPL 비교 오류: {e}") - - return len(issues) == 0, issues - -print("=" * 60) -print("3단계 검증 체인 테스트") -print("=" * 60) - -# 패키지 상태 확인 -print(f"\n📦 패키지 상태:") -print(f" - pandas/pandasql: {'✅ 사용 가능' if PANDAS_SQL_AVAILABLE else '❌ 없음'}") -print(f" - langchain REPL: {'✅ 사용 가능' if REPL_AVAILABLE else '❌ 없음'}") - -# 테스트용 HTML 테이블 -original_html = """ - - - - - - - - - -
이름나이점수
김철수2585
이영희3092
박민수2878
-""" - -# 테스트 1: 동일한 테이블 비교 (PASS 예상) -print("\n" + "=" * 60) -print("테스트 1: 동일한 테이블 비교 (PASS 예상)") -print("=" * 60) - -identical_synthetic = """ - - - - - -
이름나이점수
김철수2585
이영희3092
박민수2878
-""" - -print("\n[1/3] pandasql 검증...") -passed, issues = compare_tables_with_sql(original_html, identical_synthetic) -print(f" 결과: {'✅ PASS' if passed else '❌ FAIL'}") -if issues: - print(f" 이슈: {issues}") - -print("\n[2/3] REPL 검증...") -passed, issues = compare_tables_with_repl(original_html, identical_synthetic) -print(f" 결과: {'✅ PASS' if passed else '❌ FAIL'}") -if issues: - print(f" 이슈: {issues}") - -# 테스트 2: 행 수가 다른 테이블 (FAIL 예상) -print("\n" + "=" * 60) -print("테스트 2: 행 수가 다른 테이블 (FAIL 예상)") -print("=" * 60) - -different_rows = """ - - - - -
이름나이점수
김철수2585
이영희3092
-""" - -print("\n[1/3] pandasql 검증...") -passed, issues = compare_tables_with_sql(original_html, different_rows) -print(f" 결과: {'✅ PASS' if passed else '❌ FAIL'}") -if issues: - for issue in issues[:5]: - print(f" - {issue}") - -print("\n[2/3] REPL 검증...") -passed, issues = compare_tables_with_repl(original_html, different_rows) -print(f" 결과: {'✅ PASS' if passed else '❌ FAIL'}") -if issues: - for issue in issues[:5]: - print(f" - {issue}") - -# 테스트 3: 숫자 값이 다른 테이블 (FAIL 예상) -print("\n" + "=" * 60) -print("테스트 3: 숫자 값이 다른 테이블 (FAIL 예상)") -print("=" * 60) - -different_values = """ - - - - - -
이름나이점수
김철수2590
이영희3095
박민수2880
-""" - -print("\n[1/3] pandasql 검증...") -passed, issues = compare_tables_with_sql(original_html, different_values) -print(f" 결과: {'✅ PASS' if passed else '❌ FAIL'}") -if issues: - for issue in issues[:5]: - print(f" - {issue}") - -print("\n[2/3] REPL 검증...") -passed, issues = compare_tables_with_repl(original_html, different_values) -print(f" 결과: {'✅ PASS' if passed else '❌ FAIL'}") -if issues: - for issue in issues[:5]: - print(f" - {issue}") - -# DataFrame 변환 테스트 -print("\n" + "=" * 60) -print("DataFrame 변환 테스트") -print("=" * 60) - -df = html_table_to_dataframe(original_html) -if df is not None: - print(f"\n변환된 DataFrame:") - print(df) - print(f"\nShape: {df.shape}") - print(f"Columns: {list(df.columns)}") -else: - print("❌ DataFrame 변환 실패") - -print("\n" + "=" * 60) -print("테스트 완료!") -print("=" * 60) diff --git a/uv.lock b/uv.lock index 9e69bc9..6daa4bd 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.12" resolution-markers = [ "python_full_version >= '3.14'", @@ -2425,6 +2425,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/73/cb/ac7874b3e5d58441674fb70742e6c374b28b0c7cb988d37d991cde47166c/platformdirs-4.5.0-py3-none-any.whl", hash = "sha256:e578a81bb873cbb89a41fcc904c7ef523cc18284b7e3b3ccf06aca1403b7ebd3", size = 18651, upload-time = "2025-10-08T17:44:47.223Z" }, ] +[[package]] +name = "playwright" +version = "1.57.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "greenlet" }, + { name = "pyee" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/ed/b6/e17543cea8290ae4dced10be21d5a43c360096aa2cce0aa7039e60c50df3/playwright-1.57.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:9351c1ac3dfd9b3820fe7fc4340d96c0d3736bb68097b9b7a69bd45d25e9370c", size = 41985039, upload-time = "2025-12-09T08:06:18.408Z" }, + { url = 
"https://files.pythonhosted.org/packages/8b/04/ef95b67e1ff59c080b2effd1a9a96984d6953f667c91dfe9d77c838fc956/playwright-1.57.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:a4a9d65027bce48eeba842408bcc1421502dfd7e41e28d207e94260fa93ca67e", size = 40775575, upload-time = "2025-12-09T08:06:22.105Z" }, + { url = "https://files.pythonhosted.org/packages/60/bd/5563850322a663956c927eefcf1457d12917e8f118c214410e815f2147d1/playwright-1.57.0-py3-none-macosx_11_0_universal2.whl", hash = "sha256:99104771abc4eafee48f47dac2369e0015516dc1ce8c409807d2dd440828b9a4", size = 41985042, upload-time = "2025-12-09T08:06:25.357Z" }, + { url = "https://files.pythonhosted.org/packages/56/61/3a803cb5ae0321715bfd5247ea871d25b32c8f372aeb70550a90c5f586df/playwright-1.57.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:284ed5a706b7c389a06caa431b2f0ba9ac4130113c3a779767dda758c2497bb1", size = 45975252, upload-time = "2025-12-09T08:06:29.186Z" }, + { url = "https://files.pythonhosted.org/packages/83/d7/b72eb59dfbea0013a7f9731878df8c670f5f35318cedb010c8a30292c118/playwright-1.57.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38a1bae6c0a07839cdeaddbc0756b3b2b85e476c07945f64ece08f1f956a86f1", size = 45706917, upload-time = "2025-12-09T08:06:32.549Z" }, + { url = "https://files.pythonhosted.org/packages/e4/09/3fc9ebd7c95ee54ba6a68d5c0bc23e449f7235f4603fc60534a364934c16/playwright-1.57.0-py3-none-win32.whl", hash = "sha256:1dd93b265688da46e91ecb0606d36f777f8eadcf7fbef12f6426b20bf0c9137c", size = 36553860, upload-time = "2025-12-09T08:06:35.864Z" }, + { url = "https://files.pythonhosted.org/packages/58/d4/dcdfd2a33096aeda6ca0d15584800443dd2be64becca8f315634044b135b/playwright-1.57.0-py3-none-win_amd64.whl", hash = "sha256:6caefb08ed2c6f29d33b8088d05d09376946e49a73be19271c8cd5384b82b14c", size = 36553864, upload-time = "2025-12-09T08:06:38.915Z" }, + { url = 
"https://files.pythonhosted.org/packages/6a/60/fe31d7e6b8907789dcb0584f88be741ba388413e4fbce35f1eba4e3073de/playwright-1.57.0-py3-none-win_arm64.whl", hash = "sha256:5f065f5a133dbc15e6e7c71e7bc04f258195755b1c32a432b792e28338c8335e", size = 32837940, upload-time = "2025-12-09T08:06:42.268Z" }, +] + [[package]] name = "prompt-toolkit" version = "3.0.52" @@ -2721,6 +2740,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/60/5d4751ba3f4a40a6891f24eec885f51afd78d208498268c734e256fb13c4/pydantic_settings-2.12.0-py3-none-any.whl", hash = "sha256:fddb9fd99a5b18da837b29710391e945b1e30c135477f484084ee513adb93809", size = 51880, upload-time = "2025-11-10T14:25:45.546Z" }, ] +[[package]] +name = "pyee" +version = "13.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/95/03/1fd98d5841cd7964a27d729ccf2199602fe05eb7a405c1462eb7277945ed/pyee-13.0.0.tar.gz", hash = "sha256:b391e3c5a434d1f5118a25615001dbc8f669cf410ab67d04c4d4e07c55481c37", size = 31250, upload-time = "2025-03-17T18:53:15.955Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9b/4d/b9add7c84060d4c1906abe9a7e5359f2a60f7a9a4f67268b2766673427d8/pyee-13.0.0-py3-none-any.whl", hash = "sha256:48195a3cddb3b1515ce0695ed76036b5ccc2ef3a9f963ff9f77aec0139845498", size = 15730, upload-time = "2025-03-17T18:53:14.532Z" }, +] + [[package]] name = "pygments" version = "2.19.2" @@ -3374,6 +3405,7 @@ dependencies = [ { name = "notion-client" }, { name = "pandas" }, { name = "pandasql" }, + { name = "playwright" }, { name = "pydantic" }, { name = "pymongo" }, { name = "pymupdf" }, @@ -3405,6 +3437,7 @@ requires-dist = [ { name = "notion-client", specifier = ">=2.0.0" }, { name = "pandas", specifier = ">=2.3.3" }, { name = "pandasql", specifier = ">=0.7.3" }, + { name = "playwright", specifier = ">=1.57.0" }, { name = "pydantic", specifier = ">=2.12.4" }, { name = "pymongo", specifier = 
">=4.6.1" }, { name = "pymupdf", specifier = ">=1.26.7" },