diff --git a/generate_synthetic_table/flow.py b/generate_synthetic_table/flow.py index 0177147..8be68dd 100644 --- a/generate_synthetic_table/flow.py +++ b/generate_synthetic_table/flow.py @@ -765,6 +765,7 @@ def build_synthetic_table_graph( llm: ChatOpenAI, provider: str = "openai", qa_only: bool = False, + skip_qa: bool = False, ) -> StateGraph: """ Assemble the LangGraph pipeline. @@ -773,6 +774,7 @@ def build_synthetic_table_graph( llm: LLM instance provider: LLM provider name qa_only: If True, generate QA directly from image without synthetic data generation + skip_qa: If True, skip QA generation after table generation (table only mode) """ graph = StateGraph(TableState) @@ -783,7 +785,7 @@ def build_synthetic_table_graph( graph.add_edge(START, "generate_qa_from_image") graph.add_edge("generate_qa_from_image", END) else: - # Full pipeline mode + # Full pipeline mode (or table-only mode if skip_qa=True) graph.add_node("image_to_html", image_to_html_node(llm)) graph.add_node("pymupdf_parse", pymupdf_parse_node) graph.add_node("validate_parsed_table", validate_parsed_table_node(llm)) @@ -795,7 +797,9 @@ def build_synthetic_table_graph( graph.add_node("self_reflection", self_reflection_node(llm)) graph.add_node("revise_synthetic_table", revise_synthetic_table_node(llm)) graph.add_node("parse_synthetic_table", parse_synthetic_table_node(llm)) - graph.add_node("generate_qa", generate_qa_node(llm)) + + if not skip_qa: + graph.add_node("generate_qa", generate_qa_node(llm)) # Routing based on provider and input type def route_start(state: TableState) -> str: @@ -842,8 +846,13 @@ def route_start(state: TableState) -> str: ) graph.add_edge("revise_synthetic_table", "self_reflection") - graph.add_edge("parse_synthetic_table", "generate_qa") - graph.add_edge("generate_qa", END) + + # Final edge: skip QA if requested + if skip_qa: + graph.add_edge("parse_synthetic_table", END) + else: + graph.add_edge("parse_synthetic_table", "generate_qa") + graph.add_edge("generate_qa", 
END) return graph @@ -914,6 +923,7 @@ def run_synthetic_table_flow( azure_deployment: str | None = None, azure_endpoint: str | None = None, qa_only: bool = False, + skip_qa: bool = False, image_paths: List[str] | None = None, domain: str | None = None, # 체크포인팅 옵션 @@ -935,6 +945,7 @@ def run_synthetic_table_flow( azure_deployment: Azure OpenAI deployment name azure_endpoint: Azure OpenAI endpoint URL qa_only: If True, skip synthetic data generation and only generate QA from image + skip_qa: If True, generate table only without QA generation image_paths: Optional list of image paths for multi-image processing domain: Optional domain for prompt customization (e.g. 'public') enable_checkpointing: 체크포인팅 활성화 여부 @@ -955,7 +966,7 @@ def run_synthetic_table_flow( config_path=config_path, ) - graph = build_synthetic_table_graph(llm, provider=provider, qa_only=qa_only) + graph = build_synthetic_table_graph(llm, provider=provider, qa_only=qa_only, skip_qa=skip_qa) # 체크포인팅 설정 if enable_checkpointing: diff --git a/generate_synthetic_table/prompts/academic.yaml b/generate_synthetic_table/prompts/academic.yaml index bbb19d5..87543eb 100644 --- a/generate_synthetic_table/prompts/academic.yaml +++ b/generate_synthetic_table/prompts/academic.yaml @@ -94,37 +94,79 @@ generate_qa_from_image: | generate_synthetic_table: | You are a Synthetic Data Generator specializing in Academic Data. - Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic academic data. + + **⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️** + Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT academic data values. **Inputs:** - 1. **Original Table Structure:** + 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):** {html} - 2. **Table Summary:** + 2. 
**Table Summary (describes the data patterns to follow):** {summary} **Requirements:** - 1. **Structure:** Keep the exact same HTML structure. - 2. **Data:** Replace ALL cell values with new, synthetic academic data. - - Use realistic Korean student names, university names, course titles, and grades. - - Contexts: Transcripts, Research Papers, Enrollment Stats, Faculty Lists. - - Do NOT use real private data. - 3. **Consistency:** Ensure mathematical consistency (e.g., sum of credits, correct GPA calculations if visible). - 4. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
`. + 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan). + 2. **Headers:** Keep header text the same (column names, category labels). + 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:** + - **ALL data cell values MUST be replaced with completely new synthetic values.** + - **NEVER copy any original data values** - generate fresh, realistic alternatives. + - For student/model names: Generate DIFFERENT names + - For university names: Generate DIFFERENT names + - For grades/scores: Generate DIFFERENT realistic values + - For course/research topics: Generate DIFFERENT titles + - For dates: Generate DIFFERENT plausible dates + 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:** + - Look at the original image's color scheme and design + - Use appropriate Tailwind color classes to match the original style + - Basic structure: `` + - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors + - Lists: `class="list-disc ml-5 space-y-1"` + - **DO NOT use inline style attributes** + 5. **Domain Consistency:** Ensure academic logic (credits sum correctly, GPA valid) + 6. **Output:** Return ONLY the raw HTML string starting with `
<table>` and ending with `</table>
`. No markdown code blocks. + + **Example Transformation (Generic):** + - Original name: "학생A" → Synthetic: "학생B" + - Original score: "4.0" → Synthetic: "3.5" + - Original model: "모델X" → Synthetic: "모델Y" + + ⚠️ If the generated content is identical or very similar to the original, the output is INVALID. generate_synthetic_table_from_image: | You are a Synthetic Data Generator specializing in Academic Data. - Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic academic data. + + **⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️** + Your task is NOT to OCR/transcribe the image. Instead, you must: + 1. Understand the table's STRUCTURE from the image + 2. Understand it's an ACADEMIC table + 3. Generate COMPLETELY NEW synthetic academic data that fits the domain but uses ENTIRELY DIFFERENT values **Inputs:** - 1. **Image:** An image of an academic table. + 1. **Image:** An image of an academic table. Use this to understand structure and domain ONLY. **Requirements:** - 1. **Structure Preservation:** Accurately reconstruct the table structure. - 2. **Data Generation:** Replace ALL cell values with new, synthetic academic data. - - Use realistic Korean student names, course titles, grades, research topics. - 3. **Styling:** Use **Tailwind CSS** classes (same as default). - - `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on ``. - - `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `
<th>`. - - `class="border border-slate-300 p-2"` on `<td>`. - 4. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
`. + 1. **Structure Preservation:** Accurately reconstruct the table structure, including rowspan/colspan. + 2. **Headers:** Keep header text the same as in the image. + 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:** + - **NEVER copy the data values from the image** - this is NOT an OCR task + - **ALL cell content must be completely NEW and DIFFERENT** + - For student/model names: Generate DIFFERENT names + - For grades/scores: Generate DIFFERENT values + - For course/research topics: Generate DIFFERENT titles + 4. **Styling:** Use **Tailwind CSS** classes exclusively (NO inline styles). + - ``: `class="w-full border-collapse text-sm"` + - ``: `class="bg-gradient-to-r from-indigo-700 to-indigo-800 text-white"` + - ``: `class="divide-y divide-slate-200"` + - `` (body rows): `class="hover:bg-indigo-50 transition-colors"` + - `
<th>`: `class="border border-indigo-300 px-4 py-3 font-semibold text-left"` + - `
<td>`: `class="border border-slate-200 px-4 py-3 text-slate-700"` + - `
    <ul>`: `class="list-disc ml-5 space-y-1 text-slate-600"` + - **DO NOT use inline style attributes** + 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
    `. No markdown code blocks. + + **Example (Generic):** + - Name in image: "이름X" → Generate: "이름Y" + - Score in image: "점수A" → Generate: "점수B" + + ⚠️ If the generated content is identical or very similar to the image, the output is INVALID. diff --git a/generate_synthetic_table/prompts/business.yaml b/generate_synthetic_table/prompts/business.yaml index 9a3efe5..18ebc27 100644 --- a/generate_synthetic_table/prompts/business.yaml +++ b/generate_synthetic_table/prompts/business.yaml @@ -94,37 +94,90 @@ generate_qa_from_image: | generate_synthetic_table: | You are a Synthetic Data Generator specializing in Business Data. - Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic business data. + + **⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️** + Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT business data values. + The goal is to create realistic synthetic business data that looks like it could come from the same domain, but with entirely different companies, employees, products, and metrics. **Inputs:** - 1. **Original Table Structure:** + 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):** {html} - 2. **Table Summary:** + 2. **Table Summary (describes the data patterns to follow):** {summary} **Requirements:** - 1. **Structure:** Keep the exact same HTML structure. - 2. **Data:** Replace ALL cell values with new, synthetic business data. - - Use realistic Korean company names, department names, product lines, and financial metrics. - - Contexts: Sales Reports, Inventory, HR Employee Lists, Marketing Campaigns. - - Do NOT use real private data. - 3. **Consistency:** Ensure mathematical consistency (e.g., Q1 + Q2 + Q3 + Q4 = Total). - 4. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
    `. + 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan). + 2. **Headers:** Keep header text the same (column names, category labels like 기업경쟁력, 시장경쟁력). + 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:** + - **ALL data cell values MUST be replaced with completely new synthetic values.** + - **NEVER copy any original data values** - generate fresh, realistic alternatives. + - For company/team names: Generate DIFFERENT names (e.g., "A팀" → "B팀") + - For employee names: Generate DIFFERENT Korean names (e.g., "김OO" → "박OO") + - For business metrics: Generate DIFFERENT numbers (e.g., "100억" → "150억") + - For strategy/description text: Write DIFFERENT content with similar structure + - For bullet point items: Create DIFFERENT but domain-appropriate content + 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:** + - Look at the original image's color scheme and design + - Use appropriate Tailwind color classes to match the original style + - Basic structure: `` + - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors + - Lists: `class="list-disc ml-5 space-y-1"` + - **DO NOT use inline style attributes** + 5. **Domain Consistency:** + - Ensure business logic (Q1+Q2+Q3+Q4=Total, percentages add up) + - Use realistic Korean business terminology + - Contexts: Sales Reports, Inventory, HR Employee Lists, Marketing Campaigns + 6. **Output:** Return ONLY the raw HTML string starting with `
    <table>` and ending with `</table>
    `. No markdown code blocks. + + **Example Transformation (Generic):** + - Original name: "A팀" → Synthetic: "B팀" + - Original amount: "5억원" → Synthetic: "7.3억원" + - Original description: "신규 사업 추진" → Synthetic: "해외 시장 진출" + + ⚠️ If the generated content is identical or very similar to the original, the output is INVALID. + Remember: The synthetic table should look like a completely different business dataset from the same domain. generate_synthetic_table_from_image: | You are a Synthetic Data Generator specializing in Business Data. - Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic business data. + + **⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️** + Your task is NOT to OCR/transcribe the image. Instead, you must: + 1. Understand the table's STRUCTURE from the image (rows, columns, merged cells, nested structures) + 2. Understand it's a BUSINESS table (기업경쟁력, 시장경쟁력, 매출, 실적 등) + 3. Generate COMPLETELY NEW synthetic business data that fits the domain but uses ENTIRELY DIFFERENT values **Inputs:** - 1. **Image:** An image of a business table. + 1. **Image:** An image of a business table. Use this to understand structure and domain ONLY. **Requirements:** - 1. **Structure Preservation:** Accurately reconstruct the table structure. - 2. **Data Generation:** Replace ALL cell values with new, synthetic business data. - - Use realistic Korean company names, products, sales figures. - 3. **Styling:** Use **Tailwind CSS** classes (same as default). - - `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on ``. - - `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `
    <th>`. - - `class="border border-slate-300 p-2"` on `<td>`. - 4. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
    `. + 1. **Structure Preservation:** Accurately reconstruct the table structure, including `rowspan` and `colspan` for merged cells. + 2. **Headers:** Keep header text (column names, category labels like 기업경쟁력, 차별화 요소) the same as in the image. + 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:** + - **NEVER copy the data values from the image** - this is NOT an OCR task + - **ALL cell content must be completely NEW and DIFFERENT from the original** + - Generate COMPLETELY NEW synthetic business values for all data cells: + * For company/team names: Generate DIFFERENT names (e.g., "A팀" → "B팀") + * For business metrics: Generate DIFFERENT numbers (e.g., "100억" → "150억") + * For strategy/description text: Write DIFFERENT content with similar structure + * For bullet point items: Create DIFFERENT but domain-appropriate items + * For employee names: Generate DIFFERENT Korean names (e.g., "김OO" → "박OO") + - The synthetic table should look like a COMPLETELY DIFFERENT business report from the same industry + 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:** + - Look at the original image's color scheme and design + - Use appropriate Tailwind color classes to match the original style + - Basic structure: `` + - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors + - Lists: `class="list-disc ml-5 space-y-1"` + - **DO NOT use inline style attributes** + 5. **Output Format:** Return ONLY the raw HTML string starting with `
    <table>` and ending with `</table>
    `. No markdown code blocks. + + **Example of Expected Behavior (Generic):** + If the image shows a business table with: + - Team name: "영업팀" → Generate different: "마케팅팀" + - Revenue: "10억원" → Generate different: "15억원" + - Strategy: "시장 확대" → Generate different: "신규 진출" + - Bullet point items → Generate completely different items + + ⚠️ If the generated content is identical or very similar to the image, the output is INVALID. + Remember: The output should be a new synthetic business dataset, not a transcription of the original. diff --git a/generate_synthetic_table/prompts/default.yaml b/generate_synthetic_table/prompts/default.yaml index e877e0a..0ca2645 100644 --- a/generate_synthetic_table/prompts/default.yaml +++ b/generate_synthetic_table/prompts/default.yaml @@ -78,43 +78,94 @@ generate_qa_from_image: | Return ONLY the JSON object, no additional text. generate_synthetic_table: | - You are a Synthetic Data Generator. - Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic data. + You are a Synthetic Data Generator specialized in creating completely NEW data while preserving table structure. + + **⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️** + Your task is to generate a new HTML table that has the SAME STRUCTURE as the original but with COMPLETELY DIFFERENT, newly generated data values. + The goal is to create realistic synthetic data that looks like it could come from the same domain, but with entirely different entities, names, numbers, and values. **Inputs:** - 1. **Original Table Structure:** + 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):** {html} - 2. **Table Summary:** + 2. **Table Summary (describes the data patterns to follow):** {summary} **Requirements:** - 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges) as the original table. - 2. 
**Data:** Replace ALL cell values with new, synthetic data. - - Use realistic Korean names, organizations, and values suitable for the context. - - Ensure the data is consistent with the column types and patterns described in the summary. - - Do NOT use real private data. - 3. **Consistency:** Ensure mathematical consistency if applicable (e.g., sums, percentages). - 4. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
    `. Do not include markdown code blocks. + 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, rowspan, colspan, merges) as the original. + 2. **Headers:** Keep header text the same (column names, row labels that describe categories). + 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:** + - **ALL data cell values MUST be replaced with completely new synthetic values.** + - **NEVER copy any original data values** - generate fresh, realistic alternatives. + - For names: Generate DIFFERENT Korean names (e.g., "김OO" → "박OO") + - For organizations: Generate DIFFERENT names (e.g., "A회사" → "B회사") + - For numbers: Generate DIFFERENT numbers in similar ranges + - For descriptions/text: Write DIFFERENT content with similar structure + - For dates: Generate DIFFERENT plausible dates + - For addresses: Generate DIFFERENT realistic Korean addresses + 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). Preserve the original table's visual style: + - Basic structure: `` + - Headers/cells: Include `border`, `px-4 py-3`, `font-semibold` as appropriate + - Lists: `class="list-disc ml-5 space-y-1"` + - **DO NOT use inline style attributes** + 5. **Domain Consistency:** + - Analyze the summary to understand the domain context + - Generate data that is realistic for that specific domain + - Maintain internal consistency (e.g., totals should sum correctly) + 6. **Output:** Return ONLY the raw HTML string starting with `
    <table>` and ending with `</table>
    `. No markdown code blocks. + + **Example Transformation:** + - Original name: "A회사" → Synthetic: "B회사" + - Original number: "1,500,000" → Synthetic: "2,340,000" + - Original text: "[어떤 내용]" → Synthetic: "[다른 내용]" + + ⚠️ If the generated content is identical or very similar to the original, the output is INVALID. + Remember: The synthetic table should look like a completely different dataset from the same domain. generate_synthetic_table_from_image: | - You are a Synthetic Data Generator specialized in Korean documents. - Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic data. + You are a Synthetic Data Generator specialized in creating completely NEW data from Korean document images. + + **⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️** + Your task is NOT to OCR/transcribe the image. Instead, you must: + 1. Understand the table's STRUCTURE from the image (rows, columns, merged cells, nested structures) + 2. Understand the DOMAIN and data patterns from the image + 3. Generate COMPLETELY NEW synthetic data that fits the same domain but uses ENTIRELY DIFFERENT values **Inputs:** - 1. **Image:** An image of a table containing Korean text. + 1. **Image:** An image of a table containing Korean text. Use this to understand structure and domain ONLY. **Requirements:** 1. **Structure Preservation:** Accurately reconstruct the table structure, including `rowspan` and `colspan` attributes for merged cells. - 2. **Data Generation:** Replace ALL cell values with new, synthetic data. - - Use realistic Korean names, organizations, and values suitable for the context of the table in the image. - - Ensure the data is consistent with the column types (e.g., dates, numbers, text). - - Do NOT use real private data. - 3. **Styling:** Use **Tailwind CSS** classes to style the table. 
- - Add `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` to the `<table>` tag. - - Add `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` to `
    <th>` tags. - - Add `class="border border-slate-300 p-2"` to `<td>` tags. - 4. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
    `. Do not include markdown code blocks. + 2. **Headers:** Keep header text (column names, category labels) the same as in the image. + 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:** + - **NEVER copy the data values from the image** - this is NOT an OCR task + - **ALL cell content must be completely NEW and DIFFERENT from the original** + - Generate COMPLETELY NEW synthetic values for all data cells: + * For names: Generate DIFFERENT Korean names (e.g., "김OO" → "박OO") + * For organizations/teams: Generate DIFFERENT names (e.g., "A팀" → "B팀") + * For numbers/amounts: Generate DIFFERENT numbers in similar ranges + * For descriptions/text: Write DIFFERENT content with similar structure + * For dates: Generate DIFFERENT plausible dates + - The synthetic table should look like a COMPLETELY DIFFERENT dataset from the same domain + 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe the original image's visual style and mimic it:** + - Look at the original image's color scheme (header background, borders, text colors) + - Match the visual design as closely as possible using appropriate Tailwind color classes + - If the original has colored headers, use similar colors (e.g., blue → `bg-blue-600`, gray → `bg-slate-600`) + - Basic structure classes: + * ``: `class="w-full border-collapse text-sm"` + * `
    <th>`: Include `border`, `px-4 py-3`, `font-semibold` and appropriate background color + * `<td>`: Include `border`, `px-4 py-3` + * `<ul>
      `: `class="list-disc ml-5 space-y-1"` + - **DO NOT use inline style attributes** (e.g., style="...") + 5. **Output Format:** Return ONLY the raw HTML string starting with `` and ending with `
      `. No markdown code blocks. + + **Example of Expected Behavior:** + - Name in image: "홍길동" → Generate: "김영수" + - Amount in image: "1,500만원" → Generate: "2,300만원" + - Style: If original has blue header → use blue Tailwind classes + + ⚠️ If the generated content is identical or very similar to the image, the output is INVALID. + Remember: The output should be a new synthetic dataset with similar visual style but different data. image_to_html: | You are an AI assistant specialized in OCR and HTML generation. @@ -124,11 +175,12 @@ image_to_html: | 1. **Structure Preservation:** Accurately reconstruct the table structure, including `rowspan` and `colspan` attributes for merged cells. 2. **Text Fidelity:** Transcribe the text within the cells exactly as it appears in the image, preserving the Korean language. 3. **Output Format:** Return ONLY the raw HTML string starting with `` and ending with `
      `. Do not include markdown code blocks (```html ... ```) or any other text. - 4. **Styling:** Use **Tailwind CSS** classes to style the table. - - Add `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` to the `` tag. - - Add `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` to `
      ` tags. - - Add `class="border border-slate-300 p-2"` to `` tags. - - Ensure the design looks clean and professional, similar to a standard document table. + 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Mimic the original image's visual style:** + - Observe the original image's colors (header background, borders, text) + - Use appropriate Tailwind color classes to match the original style + - Basic structure: `` + - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors + - Lists: `class="list-disc ml-5 space-y-1"` parse_contents: | Analyze the following HTML table and summarize its schema and content patterns. @@ -194,12 +246,19 @@ self_reflection: | You are a strict QA reviewer for synthetic HTML tables. Check: - 1) table structure matches original HTML + 1) table structure matches original HTML (rows, columns, merges) 2) column count / header hierarchy 3) row consistency 4) cell type realism (numbers vs text) 5) validity of HTML tags 6) no missing/extra columns + 7) **⚠️ DATA ORIGINALITY CHECK (CRITICAL) ⚠️:** + - The synthetic data MUST be DIFFERENT from the original + - If data values are identical or very similar to the original, mark as FAILED + - Names, numbers, descriptions should all be different + 8) **Styling check:** + - Should use Tailwind CSS classes, NOT inline styles + - Check for style="..." attributes and flag them as issues Output ONLY valid JSON with this schema: @@ -207,7 +266,7 @@ self_reflection: | "passed": true/false, "score": 0-100, "issues": [ - {{"type": "structure|header|row_count|data_type|html_validity|other", + {{"type": "structure|header|row_count|data_type|html_validity|data_originality|styling|other", "detail": "..."}} ], "revision_instructions": "Concrete step-by-step instructions to fix the table." @@ -224,12 +283,19 @@ self_reflection_from_image: | You are a strict QA reviewer for synthetic HTML tables. 
Check: - 1) table structure matches the original table in the image + 1) table structure matches the original table in the image (rows, columns, merges) 2) column count / header hierarchy 3) row consistency 4) cell type realism (numbers vs text) 5) validity of HTML tags 6) no missing/extra columns + 7) **⚠️ DATA ORIGINALITY CHECK (CRITICAL) ⚠️:** + - The synthetic data MUST be DIFFERENT from what appears in the image + - If data values appear to be copied/transcribed from the image, mark as FAILED + - This is NOT an OCR task - data should be newly generated + 8) **Styling check:** + - Should use Tailwind CSS classes, NOT inline styles + - Check for style="..." attributes and flag them as issues Output ONLY valid JSON with this schema: @@ -237,7 +303,7 @@ self_reflection_from_image: | "passed": true/false, "score": 0-100, "issues": [ - {{"type": "structure|header|row_count|data_type|html_validity|other", + {{"type": "structure|header|row_count|data_type|html_validity|data_originality|styling|other", "detail": "..."}} ], "revision_instructions": "Concrete step-by-step instructions to fix the table." diff --git a/generate_synthetic_table/prompts/finance.yaml b/generate_synthetic_table/prompts/finance.yaml index d610704..77d9927 100644 --- a/generate_synthetic_table/prompts/finance.yaml +++ b/generate_synthetic_table/prompts/finance.yaml @@ -94,37 +94,78 @@ generate_qa_from_image: | generate_synthetic_table: | You are a Synthetic Data Generator specializing in Financial Data. - Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic financial data. + + **⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️** + Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT financial data values. **Inputs:** - 1. **Original Table Structure:** + 1. 
**Original Table Structure (for structure reference ONLY - DO NOT copy the data values):** {html} - 2. **Table Summary:** + 2. **Table Summary (describes the data patterns to follow):** {summary} **Requirements:** - 1. **Structure:** Keep the exact same HTML structure. - 2. **Data:** Replace ALL cell values with new, synthetic financial data. - - Use realistic Korean company names (e.g., KOSPI listed mock names), stock tickers, and realistic currency values (KRW, USD). - - Contexts: Balance Sheets, Income Statements, Stock Portfolios, Tax Records. - - Do NOT use real private data. - 3. **Consistency:** Ensure mathematical consistency (e.g., Assets = Liabilities + Equity). - 4. **Output:** Return ONLY the raw HTML string starting with `
      <table>` and ending with `</table>
      `. + 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan). + 2. **Headers:** Keep header text the same (column names, category labels). + 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:** + - **ALL data cell values MUST be replaced with completely new synthetic values.** + - **NEVER copy any original data values** - generate fresh, realistic alternatives. + - For company names: Generate DIFFERENT names (e.g., "A회사" → "B회사") + - For financial figures: Generate DIFFERENT amounts (similar magnitude, different values) + - For percentages/ratios: Generate DIFFERENT metrics + - For dates: Generate DIFFERENT plausible dates + 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:** + - Look at the original image's color scheme and design + - Use appropriate Tailwind color classes to match the original style + - Basic structure: `` + - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors + - Lists: `class="list-disc ml-5 space-y-1"` + - **DO NOT use inline style attributes** + 5. **Domain Consistency:** Ensure financial logic (Assets = Liabilities + Equity, Totals match) + 6. **Output:** Return ONLY the raw HTML string starting with `
      <table>` and ending with `</table>
      `. No markdown code blocks. + + **Example Transformation (Generic):** + - Original company: "A기업" → Synthetic: "B기업" + - Original amount: "100억원" → Synthetic: "150억원" + - Original ratio: "PER 10" → Synthetic: "PER 8" + + ⚠️ If the generated content is identical or very similar to the original, the output is INVALID. generate_synthetic_table_from_image: | You are a Synthetic Data Generator specializing in Financial Data. - Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic financial data. + + **⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️** + Your task is NOT to OCR/transcribe the image. Instead, you must: + 1. Understand the table's STRUCTURE from the image + 2. Understand it's a FINANCIAL table + 3. Generate COMPLETELY NEW synthetic financial data that fits the domain but uses ENTIRELY DIFFERENT values **Inputs:** - 1. **Image:** An image of a financial table. + 1. **Image:** An image of a financial table. Use this to understand structure and domain ONLY. **Requirements:** - 1. **Structure Preservation:** Accurately reconstruct the table structure. - 2. **Data Generation:** Replace ALL cell values with new, synthetic financial data. - - Use realistic Korean company names, financial metrics. - 3. **Styling:** Use **Tailwind CSS** classes (same as default). - - `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on ``. - - `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `
      `. - - `class="border border-slate-300 p-2"` on ``. - 4. **Output Format:** Return ONLY the raw HTML string starting with `` and ending with `
      `. + 1. **Structure Preservation:** Accurately reconstruct the table structure, including rowspan/colspan. + 2. **Headers:** Keep header text the same as in the image. + 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:** + - **NEVER copy the data values from the image** - this is NOT an OCR task + - **ALL cell content must be completely NEW and DIFFERENT** + - For company names: Generate DIFFERENT names + - For financial figures: Generate DIFFERENT amounts + - For percentages/ratios: Generate DIFFERENT metrics + 4. **Styling:** Use **Tailwind CSS** classes exclusively (NO inline styles). + - ``: `class="w-full border-collapse text-sm"` + - ``: `class="bg-gradient-to-r from-green-700 to-green-800 text-white"` + - ``: `class="divide-y divide-slate-200"` + - `` (body rows): `class="hover:bg-green-50 transition-colors"` + - `
      `: `class="border border-green-300 px-4 py-3 font-semibold text-left"` + - `
      `: `class="border border-slate-200 px-4 py-3 text-slate-700"` + - `
        `: `class="list-disc ml-5 space-y-1 text-slate-600"` + - **DO NOT use inline style attributes** + 5. **Output Format:** Return ONLY the raw HTML string starting with `` and ending with `
        `. No markdown code blocks. + + **Example (Generic):** + - Company in image: "X회사" → Generate: "Y회사" + - Amount in image: "50억" → Generate: "80억" + + ⚠️ If the generated content is identical or very similar to the image, the output is INVALID. diff --git a/generate_synthetic_table/prompts/insurance.yaml b/generate_synthetic_table/prompts/insurance.yaml index ce0c759..4521288 100644 --- a/generate_synthetic_table/prompts/insurance.yaml +++ b/generate_synthetic_table/prompts/insurance.yaml @@ -65,38 +65,79 @@ generate_qa_from_image: | generate_synthetic_table: | You are a Synthetic Data Generator specializing in Insurance Data. - Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic insurance data. + + **⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️** + Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT insurance data values. **Inputs:** - 1. **Original Table Structure:** + 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):** {html} - 2. **Table Summary:** + 2. **Table Summary (describes the data patterns to follow):** {summary} **Requirements:** - 1. **Structure:** Keep the exact same HTML structure. - 2. **Data:** Replace ALL cell values with new, synthetic insurance data. - - Use realistic Korean insurance plan names (e.g., "SafeLife Plus", "Family Care"), coverage types, premiums (KRW), and terms. - - Ensure consistency: e.g., higher premiums for better coverage. - - Do NOT use real private data. - 3. **Consistency:** Ensure mathematical consistency (e.g., monthly premium * 12 = annual). - 4. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
        `. + 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan). + 2. **Headers:** Keep header text the same (column names, category labels). + 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:** + - **ALL data cell values MUST be replaced with completely new synthetic values.** + - **NEVER copy any original data values** - generate fresh, realistic alternatives. + - For plan names: Generate DIFFERENT insurance plan names + - For coverage amounts: Generate DIFFERENT amounts + - For premiums: Generate DIFFERENT premium amounts + - For terms/conditions: Generate DIFFERENT coverage terms + - For dates: Generate DIFFERENT plausible dates + 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:** + - Look at the original image's color scheme and design + - Use appropriate Tailwind color classes to match the original style + - Basic structure: `` + - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors + - Lists: `class="list-disc ml-5 space-y-1"` + - **DO NOT use inline style attributes** + 5. **Domain Consistency:** Ensure insurance logic (higher premiums for better coverage) + 6. **Output:** Return ONLY the raw HTML string starting with `
        ` and ending with `
        `. No markdown code blocks. + + **Example Transformation (Generic):** + - Original plan: "보험A" → Synthetic: "보험B" + - Original coverage: "1억원" → Synthetic: "5천만원" + - Original premium: "월 3만원" → Synthetic: "월 5만원" + + ⚠️ If the generated content is identical or very similar to the original, the output is INVALID. generate_synthetic_table_from_image: | You are a Synthetic Data Generator specializing in Insurance Data. - Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic insurance data. + + **⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️** + Your task is NOT to OCR/transcribe the image. Instead, you must: + 1. Understand the table's STRUCTURE from the image + 2. Understand it's an INSURANCE table + 3. Generate COMPLETELY NEW synthetic insurance data that fits the domain but uses ENTIRELY DIFFERENT values **Inputs:** - 1. **Image:** An image of an insurance table. + 1. **Image:** An image of an insurance table. Use this to understand structure and domain ONLY. **Requirements:** - 1. **Structure Preservation:** Accurately reconstruct the table structure. - 2. **Data Generation:** Replace ALL cell values with new, synthetic insurance data. - - Use realistic Korean insurance terms, plan names, coverage amounts (KRW). - - Contexts: Life insurance, Health insurance, Auto insurance, etc. - 3. **Styling:** Use **Tailwind CSS** classes (same as default). - - `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on ``. - - `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `
        `. - - `class="border border-slate-300 p-2"` on ``. - 4. **Output Format:** Return ONLY the raw HTML string starting with `` and ending with `
        `. + 1. **Structure Preservation:** Accurately reconstruct the table structure, including rowspan/colspan. + 2. **Headers:** Keep header text the same as in the image. + 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:** + - **NEVER copy the data values from the image** - this is NOT an OCR task + - **ALL cell content must be completely NEW and DIFFERENT** + - For plan names: Generate DIFFERENT insurance plan names + - For coverage/premiums: Generate DIFFERENT amounts + - For terms: Generate DIFFERENT coverage terms + 4. **Styling:** Use **Tailwind CSS** classes exclusively (NO inline styles). + - ``: `class="w-full border-collapse text-sm"` + - ``: `class="bg-gradient-to-r from-amber-600 to-amber-700 text-white"` + - ``: `class="divide-y divide-slate-200"` + - `` (body rows): `class="hover:bg-amber-50 transition-colors"` + - `
        `: `class="border border-amber-300 px-4 py-3 font-semibold text-left"` + - `
        `: `class="border border-slate-200 px-4 py-3 text-slate-700"` + - `
          `: `class="list-disc ml-5 space-y-1 text-slate-600"` + - **DO NOT use inline style attributes** + 5. **Output Format:** Return ONLY the raw HTML string starting with `` and ending with `
          `. No markdown code blocks. + + **Example (Generic):** + - Plan name in image: "보험X" → Generate: "보험Y" + - Amount in image: "금액A" → Generate: "금액B" + + ⚠️ If the generated content is identical or very similar to the image, the output is INVALID. diff --git a/generate_synthetic_table/prompts/medical.yaml b/generate_synthetic_table/prompts/medical.yaml index d11f8f8..7bf995c 100644 --- a/generate_synthetic_table/prompts/medical.yaml +++ b/generate_synthetic_table/prompts/medical.yaml @@ -95,37 +95,78 @@ generate_qa_from_image: | generate_synthetic_table: | You are a Synthetic Data Generator specializing in Medical Data. - Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic medical data. + + **⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️** + Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT medical data values. **Inputs:** - 1. **Original Table Structure:** + 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):** {html} - 2. **Table Summary:** + 2. **Table Summary (describes the data patterns to follow):** {summary} **Requirements:** - 1. **Structure:** Keep the exact same HTML structure. - 2. **Data:** Replace ALL cell values with new, synthetic medical data. - - Use realistic Korean patient names (pseudonymized), diagnosis codes (ICD-10 style but synthetic), medication names, and lab values. - - Contexts: Patient Charts, Lab Reports, Prescription Lists, Clinical Trials. - - Do NOT use real private data. - 3. **Consistency:** Ensure medical consistency (e.g., proper units for blood pressure, temperature). - 4. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
          `. + 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan). + 2. **Headers:** Keep header text the same (column names, category labels). + 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:** + - **ALL data cell values MUST be replaced with completely new synthetic values.** + - **NEVER copy any original data values** - generate fresh, realistic alternatives. + - For patient names/IDs: Generate DIFFERENT pseudonymized identifiers + - For lab values: Generate DIFFERENT realistic values + - For diagnoses/medications: Generate DIFFERENT names and codes + - For dates: Generate DIFFERENT plausible dates + 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:** + - Look at the original image's color scheme and design + - Use appropriate Tailwind color classes to match the original style + - Basic structure: `` + - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors + - Lists: `class="list-disc ml-5 space-y-1"` + - **DO NOT use inline style attributes** + 5. **Domain Consistency:** Ensure medical logic (proper units for BP, temperature, lab values) + 6. **Output:** Return ONLY the raw HTML string starting with `
          ` and ending with `
          `. No markdown code blocks. + + **Example Transformation (Generic):** + - Original ID: "환자A" → Synthetic: "환자B" + - Original value: "120/80" → Synthetic: "135/85" + - Original medication: "약물X" → Synthetic: "약물Y" + + ⚠️ If the generated content is identical or very similar to the original, the output is INVALID. generate_synthetic_table_from_image: | You are a Synthetic Data Generator specializing in Medical Data. - Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic medical data. + + **⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️** + Your task is NOT to OCR/transcribe the image. Instead, you must: + 1. Understand the table's STRUCTURE from the image + 2. Understand it's a MEDICAL table + 3. Generate COMPLETELY NEW synthetic medical data that fits the domain but uses ENTIRELY DIFFERENT values **Inputs:** - 1. **Image:** An image of a medical table. + 1. **Image:** An image of a medical table. Use this to understand structure and domain ONLY. **Requirements:** - 1. **Structure Preservation:** Accurately reconstruct the table structure. - 2. **Data Generation:** Replace ALL cell values with new, synthetic medical data. - - Use realistic Korean medical terms, patient info (synthetic). - 3. **Styling:** Use **Tailwind CSS** classes (same as default). - - `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on ``. - - `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `
          `. - - `class="border border-slate-300 p-2"` on ``. - 4. **Output Format:** Return ONLY the raw HTML string starting with `` and ending with `
          `. + 1. **Structure Preservation:** Accurately reconstruct the table structure, including rowspan/colspan. + 2. **Headers:** Keep header text the same as in the image. + 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:** + - **NEVER copy the data values from the image** - this is NOT an OCR task + - **ALL cell content must be completely NEW and DIFFERENT** + - For patient names/IDs: Generate DIFFERENT pseudonymized identifiers + - For lab values: Generate DIFFERENT realistic values + - For diagnoses/medications: Generate DIFFERENT names + 4. **Styling:** Use **Tailwind CSS** classes exclusively (NO inline styles). + - ``: `class="w-full border-collapse text-sm"` + - ``: `class="bg-gradient-to-r from-teal-700 to-teal-800 text-white"` + - ``: `class="divide-y divide-slate-200"` + - `` (body rows): `class="hover:bg-teal-50 transition-colors"` + - `
          `: `class="border border-teal-300 px-4 py-3 font-semibold text-left"` + - `
          `: `class="border border-slate-200 px-4 py-3 text-slate-700"` + - `
            `: `class="list-disc ml-5 space-y-1 text-slate-600"` + - **DO NOT use inline style attributes** + 5. **Output Format:** Return ONLY the raw HTML string starting with `` and ending with `
            `. No markdown code blocks. + + **Example (Generic):** + - Patient ID in image: "환자X" → Generate: "환자Y" + - Value in image: "수치A" → Generate: "수치B" + + ⚠️ If the generated content is identical or very similar to the image, the output is INVALID. diff --git a/generate_synthetic_table/prompts/public.yaml b/generate_synthetic_table/prompts/public.yaml index 2850db7..b0c4099 100644 --- a/generate_synthetic_table/prompts/public.yaml +++ b/generate_synthetic_table/prompts/public.yaml @@ -94,37 +94,78 @@ generate_qa_from_image: | generate_synthetic_table: | You are a Synthetic Data Generator specializing in Public Sector/Government Data. - Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic public data. + + **⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️** + Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT public sector data values. **Inputs:** - 1. **Original Table Structure:** + 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):** {html} - 2. **Table Summary:** + 2. **Table Summary (describes the data patterns to follow):** {summary} **Requirements:** - 1. **Structure:** Keep the exact same HTML structure. - 2. **Data:** Replace ALL cell values with new, synthetic public data. - - Use realistic Korean administrative region names (e.g., Sejong-si, Mapo-gu), government department names, and statistical values. - - Contexts: Census Data, Budget Reports, Public Facility Status, Regional Statistics. - - Do NOT use real private data. - 3. **Consistency:** Ensure mathematical consistency (e.g., Subtotals match Grand Total). - 4. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
            `. + 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan). + 2. **Headers:** Keep header text the same (column names, category labels). + 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:** + - **ALL data cell values MUST be replaced with completely new synthetic values.** + - **NEVER copy any original data values** - generate fresh, realistic alternatives. + - For regions: Generate DIFFERENT administrative region names + - For departments: Generate DIFFERENT department names + - For statistics: Generate DIFFERENT numbers (similar magnitude) + - For dates: Generate DIFFERENT plausible dates + 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:** + - Look at the original image's color scheme and design + - Use appropriate Tailwind color classes to match the original style + - Basic structure: `` + - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors + - Lists: `class="list-disc ml-5 space-y-1"` + - **DO NOT use inline style attributes** + 5. **Domain Consistency:** Ensure statistical logic (Subtotals match Grand Total) + 6. **Output:** Return ONLY the raw HTML string starting with `
            ` and ending with `
            `. No markdown code blocks. + + **Example Transformation (Generic):** + - Original region: "A시" → Synthetic: "B시" + - Original statistic: "인구 100만" → Synthetic: "인구 150만" + - Original rate: "집행률 80%" → Synthetic: "집행률 90%" + + ⚠️ If the generated content is identical or very similar to the original, the output is INVALID. generate_synthetic_table_from_image: | You are a Synthetic Data Generator specializing in Public Sector/Government Data. - Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic public data. + + **⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️** + Your task is NOT to OCR/transcribe the image. Instead, you must: + 1. Understand the table's STRUCTURE from the image + 2. Understand it's a PUBLIC SECTOR/GOVERNMENT table + 3. Generate COMPLETELY NEW synthetic public data that fits the domain but uses ENTIRELY DIFFERENT values **Inputs:** - 1. **Image:** An image of a public data table. + 1. **Image:** An image of a public data table. Use this to understand structure and domain ONLY. **Requirements:** - 1. **Structure Preservation:** Accurately reconstruct the table structure. - 2. **Data Generation:** Replace ALL cell values with new, synthetic public data. - - Use realistic Korean region names, stats. - 3. **Styling:** Use **Tailwind CSS** classes (same as default). - - `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on ``. - - `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `
            `. - - `class="border border-slate-300 p-2"` on ``. - 4. **Output Format:** Return ONLY the raw HTML string starting with `` and ending with `
            `. + 1. **Structure Preservation:** Accurately reconstruct the table structure, including rowspan/colspan. + 2. **Headers:** Keep header text the same as in the image. + 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:** + - **NEVER copy the data values from the image** - this is NOT an OCR task + - **ALL cell content must be completely NEW and DIFFERENT** + - For regions: Generate DIFFERENT administrative region names + - For statistics: Generate DIFFERENT numbers + - For departments: Generate DIFFERENT names + 4. **Styling:** Use **Tailwind CSS** classes exclusively (NO inline styles). + - ``: `class="w-full border-collapse text-sm"` + - ``: `class="bg-gradient-to-r from-purple-700 to-purple-800 text-white"` + - ``: `class="divide-y divide-slate-200"` + - `` (body rows): `class="hover:bg-purple-50 transition-colors"` + - ` str: + """Build complete HTML document with fonts and styles.""" + + # Google Fonts link + font_link = "" + if style["font_url"]: + font_link = f'\n \n ' + + # Font family CSS + font_css = f""" + """ + + return f""" + + + + + + {font_link} + {font_css} + + +{table_html} + +""" + # Add parent directory to path to allow imports if running from root sys.path.append(str(Path(__file__).parent)) @@ -15,6 +202,72 @@ from generate_synthetic_table.flow import TableState from generate_synthetic_table.notion_uploader import NotionUploader + +def save_synthetic_table_as_html( + synthetic_table: str, + output_path: Path, + pair_id: str, + table_index: int, + randomize_style: bool = True +) -> Tuple[Optional[str], Optional[Dict[str, Any]]]: + """Save synthetic table as HTML file with optional style randomization. 
+ + Args: + synthetic_table: The HTML table string + output_path: Directory to save the file + pair_id: Identifier for the pair + table_index: Index of the table within the pair + randomize_style: Whether to apply random style variations + + Returns: + Tuple of (html_filepath, style_info) or (None, None) if failed + """ + if not synthetic_table: + return None, None + + # Clean up markdown code blocks if present + table_html = synthetic_table + if table_html.startswith("```html"): + table_html = table_html[7:] + if table_html.startswith("```"): + table_html = table_html[3:] + if table_html.endswith("```"): + table_html = table_html[:-3] + table_html = table_html.strip() + + # Apply style randomization if enabled + style_info = None + if randomize_style: + style_info = get_random_style() + table_html = apply_style_to_html(table_html, style_info) + full_html = build_html_document(table_html, style_info) + else: + # Basic HTML without style randomization + full_html = f""" + + + + + + + +{table_html} + +""" + + # Create html subdirectory + html_dir = output_path / "html" + html_dir.mkdir(parents=True, exist_ok=True) + + # Save file + safe_pair_id = "".join([c for c in pair_id if c.isalnum() or c in ('-', '_')]) + html_filename = f"{safe_pair_id}_table_{table_index}.html" + html_filepath = html_dir / html_filename + + html_filepath.write_text(full_html, encoding="utf-8") + + return str(html_filepath), style_info + def resolve_paths(pair: List[str], data_root: Path) -> List[Path]: """Resolves a list of relative paths to absolute Paths.""" paths = [] @@ -39,12 +292,14 @@ def process_single_pair( index: int, total_count: int, data_root: Path, + output_dir: Path, provider: str, model: str, config_path: str, arg_domain: str, qa_only: bool, - notion_uploader: Any + notion_uploader: Any, + randomize_style: bool = True ) -> Dict: """Process a single pair of images.""" @@ -132,6 +387,7 @@ def process_single_pair( model=model, config_path=config_path, qa_only=False, # We want the 
table + skip_qa=True, # Skip QA here - we'll generate QA for the pair later domain=domain ) @@ -139,12 +395,29 @@ def process_single_pair( if table_state.get("errors"): print(f" [Pair {index+1}] Error generating table: {table_state['errors']}") + # Save synthetic table as HTML file with style randomization + html_path = None + style_info = None + if table_state.get("synthetic_table"): + html_path, style_info = save_synthetic_table_as_html( + synthetic_table=table_state.get("synthetic_table"), + output_path=output_dir, + pair_id=pair_id, + table_index=len(temp_tables), + randomize_style=randomize_style + ) + if html_path: + style_desc = f" (font: {style_info['font_name']}, color: {style_info['color_name']})" if style_info else "" + print(f" [Pair {index+1}] Saved HTML: {html_path}{style_desc}") + # Filter state safe_state = { "image_path": str(path), "synthetic_table": table_state.get("synthetic_table"), "synthetic_json": table_state.get("synthetic_json"), "table_summary": table_state.get("table_summary"), + "html_path": html_path, + "style_info": style_info, # Store applied style for reference } temp_tables.append(safe_state) @@ -221,7 +494,8 @@ def run_pipeline( arg_domain: str = None, qa_only: bool = False, upload_to_notion: bool = False, - max_workers: int = 3 + max_workers: int = 3, + randomize_style: bool = True ): output_dir.mkdir(parents=True, exist_ok=True) @@ -249,12 +523,14 @@ def run_pipeline( i, total_count, data_root, + output_dir, provider, model, config_path, arg_domain, qa_only, - notion_uploader + notion_uploader, + randomize_style ): i for i, item in enumerate(json_input) } @@ -298,6 +574,8 @@ def main(): parser.add_argument("--qa-only", action="store_true", help="Skip table generation, only generate QA (applies to all domains)") parser.add_argument("--upload-to-notion", action="store_true", help="Upload QA results to Notion database") parser.add_argument("--max-workers", type=int, default=3, help="Maximum number of parallel workers (default: 3)") 
+ parser.add_argument("--randomize-style", action="store_true", default=True, help="Randomize HTML table styles (fonts, colors) for diversity (default: True)") + parser.add_argument("--no-randomize-style", dest="randomize_style", action="store_false", help="Disable style randomization") args = parser.parse_args() @@ -331,7 +609,8 @@ def main(): arg_domain=args.domain, qa_only=args.qa_only, upload_to_notion=args.upload_to_notion, - max_workers=args.max_workers + max_workers=args.max_workers, + randomize_style=args.randomize_style ) if __name__ == "__main__":
            `: `class="border border-purple-300 px-4 py-3 font-semibold text-left"` + - `
            `: `class="border border-slate-200 px-4 py-3 text-slate-700"` + - `
              `: `class="list-disc ml-5 space-y-1 text-slate-600"` + - **DO NOT use inline style attributes** + 5. **Output Format:** Return ONLY the raw HTML string starting with `` and ending with `
              `. No markdown code blocks. + + **Example (Generic):** + - Region in image: "X지역" → Generate: "Y지역" + - Statistic in image: "수치A" → Generate: "수치B" + + ⚠️ If the generated content is identical or very similar to the image, the output is INVALID. diff --git a/run_openai_public.sh b/run_openai_public.sh index 92bf58d..d90e1f6 100644 --- a/run_openai_public.sh +++ b/run_openai_public.sh @@ -5,9 +5,9 @@ # ============================================================================== # Default Configuration -INPUT_JSON="test_business_input.json" +INPUT_JSON="test_business.json" OUTPUT_DIR="output_business" -DEFAULT_ARGS="--provider openai --model gpt-5-mini --domain business" +DEFAULT_ARGS="--provider claude --model claude-sonnet-4-5 --domain business" # Check if the first argument is a JSON file path if [[ "$1" == *.json ]]; then @@ -20,8 +20,8 @@ echo " TableMagnifier - JSON Pipeline (Public)" echo "==============================================" echo "Input JSON: $INPUT_JSON" echo "Output Dir: $OUTPUT_DIR" -echo "Provider: openai" -echo "Model: gpt-5-mini" +echo "Provider: claude" +echo "Model: claude-sonnet-4-5" echo "Domain: business" echo "" echo "💡 Tip: To upload to Notion during pipeline execution:" @@ -31,13 +31,13 @@ echo "💡 To upload existing results later:" echo " python upload_to_notion_from_json.py $OUTPUT_DIR" echo "" -# Check for OPENAI_API_KEY -if [[ -z "$OPENAI_API_KEY" ]]; then - echo "⚠️ Warning: OPENAI_API_KEY is not set." +# Check for ANTHROPIC_API_KEY +if [[ -z "$ANTHROPIC_API_KEY" ]]; then + echo "⚠️ Warning: ANTHROPIC_API_KEY is not set." echo " Please set it in your environment or .env file." 
echo "" fi # Run the pipeline # Note: "$@" appends any remaining arguments, allowing overrides of defaults -uv run python run_pipeline_json.py --input "$INPUT_JSON" --output-dir "$OUTPUT_DIR" $DEFAULT_ARGS "$@" +uv run python run_pipeline_json.py --input "$INPUT_JSON" --output-dir "$OUTPUT_DIR" $DEFAULT_ARGS "$@" \ No newline at end of file diff --git a/run_pipeline_json.py b/run_pipeline_json.py index 0ff7276..d8fef9c 100644 --- a/run_pipeline_json.py +++ b/run_pipeline_json.py @@ -1,13 +1,200 @@ import argparse import json import os +import re +import random import sys from pathlib import Path -from typing import List, Dict, Any, Tuple +from typing import List, Dict, Any, Tuple, Optional from concurrent.futures import ThreadPoolExecutor, as_completed from dotenv import load_dotenv + +# ============================================================ +# Style Variation Configuration +# ============================================================ + +GOOGLE_FONTS = [ + ("Noto Sans KR", "Noto+Sans+KR:wght@400;500;600;700"), + ("Pretendard", None), # Self-hosted or system font + ("IBM Plex Sans KR", "IBM+Plex+Sans+KR:wght@400;500;600;700"), + ("Nanum Gothic", "Nanum+Gothic:wght@400;700"), + ("Nanum Myeongjo", "Nanum+Myeongjo:wght@400;700"), + ("Gothic A1", "Gothic+A1:wght@400;500;600;700"), + ("Do Hyeon", "Do+Hyeon"), + ("Jua", "Jua"), + ("Gowun Dodum", "Gowun+Dodum"), + ("Gowun Batang", "Gowun+Batang:wght@400;700"), +] + +COLOR_SCHEMES = [ + # (name, header_bg_from, header_bg_to, header_text, header_border, body_hover, body_border, text_color) + ("indigo", "indigo-600", "indigo-700", "white", "indigo-400", "indigo-50", "slate-200", "slate-700"), + ("slate", "slate-600", "slate-700", "white", "slate-500", "slate-50", "slate-300", "slate-700"), + ("emerald", "emerald-600", "emerald-700", "white", "emerald-400", "emerald-50", "slate-200", "slate-700"), + ("blue", "blue-600", "blue-700", "white", "blue-400", "blue-50", "slate-200", "slate-700"), + ("purple", "purple-600", 
"purple-700", "white", "purple-400", "purple-50", "slate-200", "slate-700"), + ("teal", "teal-600", "teal-700", "white", "teal-400", "teal-50", "slate-200", "slate-700"), + ("amber", "amber-600", "amber-700", "white", "amber-400", "amber-50", "slate-200", "slate-800"), + ("rose", "rose-600", "rose-700", "white", "rose-400", "rose-50", "slate-200", "slate-700"), + ("cyan", "cyan-600", "cyan-700", "white", "cyan-400", "cyan-50", "slate-200", "slate-700"), + ("stone", "stone-600", "stone-700", "white", "stone-500", "stone-50", "stone-300", "stone-700"), + # Light header variants + ("light-blue", "blue-100", "blue-200", "blue-900", "blue-300", "blue-50", "blue-200", "slate-700"), + ("light-gray", "gray-100", "gray-200", "gray-800", "gray-300", "gray-50", "gray-200", "gray-700"), + ("light-green", "green-100", "green-200", "green-900", "green-300", "green-50", "green-200", "slate-700"), +] + +TABLE_STYLES = [ + # (name, table_extra_classes, has_shadow, has_rounded, stripe_odd) + ("default", "", False, False, False), + ("shadow", "shadow-lg", True, False, False), + ("rounded", "rounded-lg overflow-hidden", False, True, False), + ("shadow-rounded", "shadow-lg rounded-lg overflow-hidden", True, True, False), + ("striped", "", False, False, True), + ("striped-rounded", "rounded-lg overflow-hidden", False, True, True), +] + +FONT_SIZES = ["text-xs", "text-sm", "text-base"] + + +def get_random_style() -> Dict[str, Any]: + """Generate a random style configuration.""" + font_name, font_url = random.choice(GOOGLE_FONTS) + color = random.choice(COLOR_SCHEMES) + table_style = random.choice(TABLE_STYLES) + font_size = random.choice(FONT_SIZES) + + return { + "font_name": font_name, + "font_url": font_url, + "color_name": color[0], + "header_bg_from": color[1], + "header_bg_to": color[2], + "header_text": color[3], + "header_border": color[4], + "body_hover": color[5], + "body_border": color[6], + "text_color": color[7], + "table_style_name": table_style[0], + "table_extra_classes": 
table_style[1], + "has_shadow": table_style[2], + "has_rounded": table_style[3], + "stripe_odd": table_style[4], + "font_size": font_size, + } + + +def apply_style_to_html(table_html: str, style: Dict[str, Any]) -> str: + """Apply style variations to the table HTML by replacing Tailwind classes.""" + html = table_html + + # Replace header gradient colors + # Pattern: bg-gradient-to-r from-{color}-{shade} to-{color}-{shade} + html = re.sub( + r'from-\w+-\d+\s+to-\w+-\d+', + f'from-{style["header_bg_from"]} to-{style["header_bg_to"]}', + html + ) + + # Replace header text color + html = re.sub( + r'(]*class="[^"]*?)text-white', + f'\\1text-{style["header_text"]}', + html + ) + + # Replace header border color + html = re.sub( + r'border-\w+-300(?=\s|")', + f'border-{style["header_border"]}', + html + ) + + # Replace hover color + html = re.sub( + r'hover:bg-\w+-50', + f'hover:bg-{style["body_hover"]}', + html + ) + + # Replace body border color + html = re.sub( + r'border-slate-200', + f'border-{style["body_border"]}', + html + ) + + # Replace text color + html = re.sub( + r'text-slate-700', + f'text-{style["text_color"]}', + html + ) + html = re.sub( + r'text-slate-600', + f'text-{style["text_color"]}', + html + ) + + # Replace font size in table tag + html = re.sub( + r'(]*class="[^"]*?)text-(?:xs|sm|base)', + f'\\1{style["font_size"]}', + html + ) + + # Add table extra classes (shadow, rounded) + if style["table_extra_classes"]: + html = re.sub( + r']*class="[^"]*hover:bg-)', + f'