diff --git a/generate_synthetic_table/flow.py b/generate_synthetic_table/flow.py
index 0177147..8be68dd 100644
--- a/generate_synthetic_table/flow.py
+++ b/generate_synthetic_table/flow.py
@@ -765,6 +765,7 @@ def build_synthetic_table_graph(
llm: ChatOpenAI,
provider: str = "openai",
qa_only: bool = False,
+ skip_qa: bool = False,
) -> StateGraph:
"""
Assemble the LangGraph pipeline.
@@ -773,6 +774,7 @@ def build_synthetic_table_graph(
llm: LLM instance
provider: LLM provider name
qa_only: If True, generate QA directly from image without synthetic data generation
+ skip_qa: If True, skip QA generation after table generation (table only mode)
"""
graph = StateGraph(TableState)
@@ -783,7 +785,7 @@ def build_synthetic_table_graph(
graph.add_edge(START, "generate_qa_from_image")
graph.add_edge("generate_qa_from_image", END)
else:
- # Full pipeline mode
+ # Full pipeline mode (or table-only mode if skip_qa=True)
graph.add_node("image_to_html", image_to_html_node(llm))
graph.add_node("pymupdf_parse", pymupdf_parse_node)
graph.add_node("validate_parsed_table", validate_parsed_table_node(llm))
@@ -795,7 +797,9 @@ def build_synthetic_table_graph(
graph.add_node("self_reflection", self_reflection_node(llm))
graph.add_node("revise_synthetic_table", revise_synthetic_table_node(llm))
graph.add_node("parse_synthetic_table", parse_synthetic_table_node(llm))
- graph.add_node("generate_qa", generate_qa_node(llm))
+
+ if not skip_qa:
+ graph.add_node("generate_qa", generate_qa_node(llm))
# Routing based on provider and input type
def route_start(state: TableState) -> str:
@@ -842,8 +846,13 @@ def route_start(state: TableState) -> str:
)
graph.add_edge("revise_synthetic_table", "self_reflection")
- graph.add_edge("parse_synthetic_table", "generate_qa")
- graph.add_edge("generate_qa", END)
+
+ # Final edge: skip QA if requested
+ if skip_qa:
+ graph.add_edge("parse_synthetic_table", END)
+ else:
+ graph.add_edge("parse_synthetic_table", "generate_qa")
+ graph.add_edge("generate_qa", END)
return graph
@@ -914,6 +923,7 @@ def run_synthetic_table_flow(
azure_deployment: str | None = None,
azure_endpoint: str | None = None,
qa_only: bool = False,
+ skip_qa: bool = False,
image_paths: List[str] | None = None,
domain: str | None = None,
# 체크포인팅 옵션
@@ -935,6 +945,7 @@ def run_synthetic_table_flow(
azure_deployment: Azure OpenAI deployment name
azure_endpoint: Azure OpenAI endpoint URL
qa_only: If True, skip synthetic data generation and only generate QA from image
+ skip_qa: If True, generate table only without QA generation
image_paths: Optional list of image paths for multi-image processing
domain: Optional domain for prompt customization (e.g. 'public')
enable_checkpointing: 체크포인팅 활성화 여부
@@ -955,7 +966,7 @@ def run_synthetic_table_flow(
config_path=config_path,
)
- graph = build_synthetic_table_graph(llm, provider=provider, qa_only=qa_only)
+ graph = build_synthetic_table_graph(llm, provider=provider, qa_only=qa_only, skip_qa=skip_qa)
# 체크포인팅 설정
if enable_checkpointing:
diff --git a/generate_synthetic_table/prompts/academic.yaml b/generate_synthetic_table/prompts/academic.yaml
index bbb19d5..87543eb 100644
--- a/generate_synthetic_table/prompts/academic.yaml
+++ b/generate_synthetic_table/prompts/academic.yaml
@@ -94,37 +94,79 @@ generate_qa_from_image: |
generate_synthetic_table: |
You are a Synthetic Data Generator specializing in Academic Data.
- Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic academic data.
+
+ **⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️**
+ Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT academic data values.
**Inputs:**
- 1. **Original Table Structure:**
+ 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):**
{html}
- 2. **Table Summary:**
+ 2. **Table Summary (describes the data patterns to follow):**
{summary}
**Requirements:**
- 1. **Structure:** Keep the exact same HTML structure.
- 2. **Data:** Replace ALL cell values with new, synthetic academic data.
- - Use realistic Korean student names, university names, course titles, and grades.
- - Contexts: Transcripts, Research Papers, Enrollment Stats, Faculty Lists.
- - Do NOT use real private data.
- 3. **Consistency:** Ensure mathematical consistency (e.g., sum of credits, correct GPA calculations if visible).
- 4. **Output:** Return ONLY the raw HTML string starting with `
` and ending with `
`.
+ 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan).
+ 2. **Headers:** Keep header text the same (column names, category labels).
+ 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:**
+ - **ALL data cell values MUST be replaced with completely new synthetic values.**
+ - **NEVER copy any original data values** - generate fresh, realistic alternatives.
+ - For student/model names: Generate DIFFERENT names
+ - For university names: Generate DIFFERENT names
+ - For grades/scores: Generate DIFFERENT realistic values
+ - For course/research topics: Generate DIFFERENT titles
+ - For dates: Generate DIFFERENT plausible dates
+ 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original table's visual style:**
+ - Look at the original table's color scheme and design
+ - Use appropriate Tailwind color classes to match the original style
+ - Basic structure: `<table class="w-full border-collapse text-sm">`
+ - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors
+ - Lists: `class="list-disc ml-5 space-y-1"`
+ - **DO NOT use inline style attributes**
+ 5. **Domain Consistency:** Ensure academic logic (credits sum correctly, GPA valid)
+ 6. **Output:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
+
+ **Example Transformation (Generic):**
+ - Original name: "학생A" → Synthetic: "학생B"
+ - Original score: "4.0" → Synthetic: "3.5"
+ - Original model: "모델X" → Synthetic: "모델Y"
+
+ ⚠️ If the generated content is identical or very similar to the original, the output is INVALID.
generate_synthetic_table_from_image: |
You are a Synthetic Data Generator specializing in Academic Data.
- Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic academic data.
+
+ **⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️**
+ Your task is NOT to OCR/transcribe the image. Instead, you must:
+ 1. Understand the table's STRUCTURE from the image
+ 2. Understand it's an ACADEMIC table
+ 3. Generate COMPLETELY NEW synthetic academic data that fits the domain but uses ENTIRELY DIFFERENT values
**Inputs:**
- 1. **Image:** An image of an academic table.
+ 1. **Image:** An image of an academic table. Use this to understand structure and domain ONLY.
**Requirements:**
- 1. **Structure Preservation:** Accurately reconstruct the table structure.
- 2. **Data Generation:** Replace ALL cell values with new, synthetic academic data.
- - Use realistic Korean student names, course titles, grades, research topics.
- 3. **Styling:** Use **Tailwind CSS** classes (same as default).
- - `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on `
`.
- - `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `
`.
- - `class="border border-slate-300 p-2"` on `
`.
- 4. **Output Format:** Return ONLY the raw HTML string starting with `
` and ending with `
`.
+ 1. **Structure Preservation:** Accurately reconstruct the table structure, including rowspan/colspan.
+ 2. **Headers:** Keep header text the same as in the image.
+ 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:**
+ - **NEVER copy the data values from the image** - this is NOT an OCR task
+ - **ALL cell content must be completely NEW and DIFFERENT**
+ - For student/model names: Generate DIFFERENT names
+ - For grades/scores: Generate DIFFERENT values
+ - For course/research topics: Generate DIFFERENT titles
+ 4. **Styling:** Use **Tailwind CSS** classes exclusively (NO inline styles).
+ - `<ul>`: `class="list-disc ml-5 space-y-1 text-slate-600"`
+ - **DO NOT use inline style attributes**
+ 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
+
+ **Example (Generic):**
+ - Name in image: "이름X" → Generate: "이름Y"
+ - Score in image: "점수A" → Generate: "점수B"
+
+ ⚠️ If the generated content is identical or very similar to the image, the output is INVALID.
diff --git a/generate_synthetic_table/prompts/business.yaml b/generate_synthetic_table/prompts/business.yaml
index 9a3efe5..18ebc27 100644
--- a/generate_synthetic_table/prompts/business.yaml
+++ b/generate_synthetic_table/prompts/business.yaml
@@ -94,37 +94,90 @@ generate_qa_from_image: |
generate_synthetic_table: |
You are a Synthetic Data Generator specializing in Business Data.
- Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic business data.
+
+ **⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️**
+ Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT business data values.
+ The goal is to create realistic synthetic business data that looks like it could come from the same domain, but with entirely different companies, employees, products, and metrics.
**Inputs:**
- 1. **Original Table Structure:**
+ 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):**
{html}
- 2. **Table Summary:**
+ 2. **Table Summary (describes the data patterns to follow):**
{summary}
**Requirements:**
- 1. **Structure:** Keep the exact same HTML structure.
- 2. **Data:** Replace ALL cell values with new, synthetic business data.
- - Use realistic Korean company names, department names, product lines, and financial metrics.
- - Contexts: Sales Reports, Inventory, HR Employee Lists, Marketing Campaigns.
- - Do NOT use real private data.
- 3. **Consistency:** Ensure mathematical consistency (e.g., Q1 + Q2 + Q3 + Q4 = Total).
- 4. **Output:** Return ONLY the raw HTML string starting with `
` and ending with `
`.
+ 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan).
+ 2. **Headers:** Keep header text the same (column names, category labels like 기업경쟁력, 시장경쟁력).
+ 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:**
+ - **ALL data cell values MUST be replaced with completely new synthetic values.**
+ - **NEVER copy any original data values** - generate fresh, realistic alternatives.
+ - For company/team names: Generate DIFFERENT names (e.g., "A팀" → "B팀")
+ - For employee names: Generate DIFFERENT Korean names (e.g., "김OO" → "박OO")
+ - For business metrics: Generate DIFFERENT numbers (e.g., "100억" → "150억")
+ - For strategy/description text: Write DIFFERENT content with similar structure
+ - For bullet point items: Create DIFFERENT but domain-appropriate content
+ 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original table's visual style:**
+ - Look at the original table's color scheme and design
+ - Use appropriate Tailwind color classes to match the original style
+ - Basic structure: `<table class="w-full border-collapse text-sm">`
+ - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors
+ - Lists: `class="list-disc ml-5 space-y-1"`
+ - **DO NOT use inline style attributes**
+ 5. **Domain Consistency:**
+ - Ensure business logic (Q1+Q2+Q3+Q4=Total, percentages add up)
+ - Use realistic Korean business terminology
+ - Contexts: Sales Reports, Inventory, HR Employee Lists, Marketing Campaigns
+ 6. **Output:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
+
+ **Example Transformation (Generic):**
+ - Original name: "A팀" → Synthetic: "B팀"
+ - Original amount: "5억원" → Synthetic: "7.3억원"
+ - Original description: "신규 사업 추진" → Synthetic: "해외 시장 진출"
+
+ ⚠️ If the generated content is identical or very similar to the original, the output is INVALID.
+ Remember: The synthetic table should look like a completely different business dataset from the same domain.
generate_synthetic_table_from_image: |
You are a Synthetic Data Generator specializing in Business Data.
- Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic business data.
+
+ **⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️**
+ Your task is NOT to OCR/transcribe the image. Instead, you must:
+ 1. Understand the table's STRUCTURE from the image (rows, columns, merged cells, nested structures)
+ 2. Understand it's a BUSINESS table (기업경쟁력, 시장경쟁력, 매출, 실적 등)
+ 3. Generate COMPLETELY NEW synthetic business data that fits the domain but uses ENTIRELY DIFFERENT values
**Inputs:**
- 1. **Image:** An image of a business table.
+ 1. **Image:** An image of a business table. Use this to understand structure and domain ONLY.
**Requirements:**
- 1. **Structure Preservation:** Accurately reconstruct the table structure.
- 2. **Data Generation:** Replace ALL cell values with new, synthetic business data.
- - Use realistic Korean company names, products, sales figures.
- 3. **Styling:** Use **Tailwind CSS** classes (same as default).
- - `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on `
`.
- - `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `
`.
- - `class="border border-slate-300 p-2"` on `
`.
- 4. **Output Format:** Return ONLY the raw HTML string starting with `
` and ending with `
`.
+ 1. **Structure Preservation:** Accurately reconstruct the table structure, including `rowspan` and `colspan` for merged cells.
+ 2. **Headers:** Keep header text (column names, category labels like 기업경쟁력, 차별화 요소) the same as in the image.
+ 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:**
+ - **NEVER copy the data values from the image** - this is NOT an OCR task
+ - **ALL cell content must be completely NEW and DIFFERENT from the original**
+ - Generate COMPLETELY NEW synthetic business values for all data cells:
+ * For company/team names: Generate DIFFERENT names (e.g., "A팀" → "B팀")
+ * For business metrics: Generate DIFFERENT numbers (e.g., "100억" → "150억")
+ * For strategy/description text: Write DIFFERENT content with similar structure
+ * For bullet point items: Create DIFFERENT but domain-appropriate items
+ * For employee names: Generate DIFFERENT Korean names (e.g., "김OO" → "박OO")
+ - The synthetic table should look like a COMPLETELY DIFFERENT business report from the same industry
+ 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:**
+ - Look at the original image's color scheme and design
+ - Use appropriate Tailwind color classes to match the original style
+ - Basic structure: `<table class="w-full border-collapse text-sm">`
+ - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors
+ - Lists: `class="list-disc ml-5 space-y-1"`
+ - **DO NOT use inline style attributes**
+ 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
+
+ **Example of Expected Behavior (Generic):**
+ If the image shows a business table with:
+ - Team name: "영업팀" → Generate different: "마케팅팀"
+ - Revenue: "10억원" → Generate different: "15억원"
+ - Strategy: "시장 확대" → Generate different: "신규 진출"
+ - Bullet point items → Generate completely different items
+
+ ⚠️ If the generated content is identical or very similar to the image, the output is INVALID.
+ Remember: The output should be a new synthetic business dataset, not a transcription of the original.
diff --git a/generate_synthetic_table/prompts/default.yaml b/generate_synthetic_table/prompts/default.yaml
index e877e0a..0ca2645 100644
--- a/generate_synthetic_table/prompts/default.yaml
+++ b/generate_synthetic_table/prompts/default.yaml
@@ -78,43 +78,94 @@ generate_qa_from_image: |
Return ONLY the JSON object, no additional text.
generate_synthetic_table: |
- You are a Synthetic Data Generator.
- Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic data.
+ You are a Synthetic Data Generator specialized in creating completely NEW data while preserving table structure.
+
+ **⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️**
+ Your task is to generate a new HTML table that has the SAME STRUCTURE as the original but with COMPLETELY DIFFERENT, newly generated data values.
+ The goal is to create realistic synthetic data that looks like it could come from the same domain, but with entirely different entities, names, numbers, and values.
**Inputs:**
- 1. **Original Table Structure:**
+ 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):**
{html}
- 2. **Table Summary:**
+ 2. **Table Summary (describes the data patterns to follow):**
{summary}
**Requirements:**
- 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges) as the original table.
- 2. **Data:** Replace ALL cell values with new, synthetic data.
- - Use realistic Korean names, organizations, and values suitable for the context.
- - Ensure the data is consistent with the column types and patterns described in the summary.
- - Do NOT use real private data.
- 3. **Consistency:** Ensure mathematical consistency if applicable (e.g., sums, percentages).
- 4. **Output:** Return ONLY the raw HTML string starting with `
` and ending with `
`. Do not include markdown code blocks.
+ 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, rowspan, colspan, merges) as the original.
+ 2. **Headers:** Keep header text the same (column names, row labels that describe categories).
+ 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:**
+ - **ALL data cell values MUST be replaced with completely new synthetic values.**
+ - **NEVER copy any original data values** - generate fresh, realistic alternatives.
+ - For names: Generate DIFFERENT Korean names (e.g., "김OO" → "박OO")
+ - For organizations: Generate DIFFERENT names (e.g., "A회사" → "B회사")
+ - For numbers: Generate DIFFERENT numbers in similar ranges
+ - For descriptions/text: Write DIFFERENT content with similar structure
+ - For dates: Generate DIFFERENT plausible dates
+ - For addresses: Generate DIFFERENT realistic Korean addresses
+ 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). Preserve the original table's visual style:
+ - Basic structure: `<table class="w-full border-collapse text-sm">`
+ - Headers/cells: Include `border`, `px-4 py-3`, `font-semibold` as appropriate
+ - Lists: `class="list-disc ml-5 space-y-1"`
+ - **DO NOT use inline style attributes**
+ 5. **Domain Consistency:**
+ - Analyze the summary to understand the domain context
+ - Generate data that is realistic for that specific domain
+ - Maintain internal consistency (e.g., totals should sum correctly)
+ 6. **Output:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
+
+ **Example Transformation:**
+ - Original name: "A회사" → Synthetic: "B회사"
+ - Original number: "1,500,000" → Synthetic: "2,340,000"
+ - Original text: "[어떤 내용]" → Synthetic: "[다른 내용]"
+
+ ⚠️ If the generated content is identical or very similar to the original, the output is INVALID.
+ Remember: The synthetic table should look like a completely different dataset from the same domain.
generate_synthetic_table_from_image: |
- You are a Synthetic Data Generator specialized in Korean documents.
- Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic data.
+ You are a Synthetic Data Generator specialized in creating completely NEW data from Korean document images.
+
+ **⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️**
+ Your task is NOT to OCR/transcribe the image. Instead, you must:
+ 1. Understand the table's STRUCTURE from the image (rows, columns, merged cells, nested structures)
+ 2. Understand the DOMAIN and data patterns from the image
+ 3. Generate COMPLETELY NEW synthetic data that fits the same domain but uses ENTIRELY DIFFERENT values
**Inputs:**
- 1. **Image:** An image of a table containing Korean text.
+ 1. **Image:** An image of a table containing Korean text. Use this to understand structure and domain ONLY.
**Requirements:**
1. **Structure Preservation:** Accurately reconstruct the table structure, including `rowspan` and `colspan` attributes for merged cells.
- 2. **Data Generation:** Replace ALL cell values with new, synthetic data.
- - Use realistic Korean names, organizations, and values suitable for the context of the table in the image.
- - Ensure the data is consistent with the column types (e.g., dates, numbers, text).
- - Do NOT use real private data.
- 3. **Styling:** Use **Tailwind CSS** classes to style the table.
- - Add `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` to the `
` tags.
- - Add `class="border border-slate-300 p-2"` to `
` tags.
- 4. **Output Format:** Return ONLY the raw HTML string starting with `
` and ending with `
`. Do not include markdown code blocks.
+ 2. **Headers:** Keep header text (column names, category labels) the same as in the image.
+ 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:**
+ - **NEVER copy the data values from the image** - this is NOT an OCR task
+ - **ALL cell content must be completely NEW and DIFFERENT from the original**
+ - Generate COMPLETELY NEW synthetic values for all data cells:
+ * For names: Generate DIFFERENT Korean names (e.g., "김OO" → "박OO")
+ * For organizations/teams: Generate DIFFERENT names (e.g., "A팀" → "B팀")
+ * For numbers/amounts: Generate DIFFERENT numbers in similar ranges
+ * For descriptions/text: Write DIFFERENT content with similar structure
+ * For dates: Generate DIFFERENT plausible dates
+ - The synthetic table should look like a COMPLETELY DIFFERENT dataset from the same domain
+ 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe the original image's visual style and mimic it:**
+ - Look at the original image's color scheme (header background, borders, text colors)
+ - Match the visual design as closely as possible using appropriate Tailwind color classes
+ - If the original has colored headers, use similar colors (e.g., blue → `bg-blue-600`, gray → `bg-slate-600`)
+ - Basic structure classes:
+ * `<table>`: `class="w-full border-collapse text-sm"`
+ * `<th>`: Include `border`, `px-4 py-3`, `font-semibold` and appropriate background color
+ * `<td>`: Include `border`, `px-4 py-3`
+ * `<ul>`: `class="list-disc ml-5 space-y-1"`
+ - **DO NOT use inline style attributes** (e.g., style="...")
+ 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
+
+ **Example of Expected Behavior:**
+ - Name in image: "홍길동" → Generate: "김영수"
+ - Amount in image: "1,500만원" → Generate: "2,300만원"
+ - Style: If original has blue header → use blue Tailwind classes
+
+ ⚠️ If the generated content is identical or very similar to the image, the output is INVALID.
+ Remember: The output should be a new synthetic dataset with similar visual style but different data.
image_to_html: |
You are an AI assistant specialized in OCR and HTML generation.
@@ -124,11 +175,12 @@ image_to_html: |
1. **Structure Preservation:** Accurately reconstruct the table structure, including `rowspan` and `colspan` attributes for merged cells.
2. **Text Fidelity:** Transcribe the text within the cells exactly as it appears in the image, preserving the Korean language.
3. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. Do not include markdown code blocks (```html ... ```) or any other text.
- 4. **Styling:** Use **Tailwind CSS** classes to style the table.
- - Add `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` to the `
` tags.
- - Add `class="border border-slate-300 p-2"` to `
` tags.
- - Ensure the design looks clean and professional, similar to a standard document table.
+ 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Mimic the original image's visual style:**
+ - Observe the original image's colors (header background, borders, text)
+ - Use appropriate Tailwind color classes to match the original style
+ - Basic structure: `<table class="w-full border-collapse text-sm">`
+ - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors
+ - Lists: `class="list-disc ml-5 space-y-1"`
parse_contents: |
Analyze the following HTML table and summarize its schema and content patterns.
@@ -194,12 +246,19 @@ self_reflection: |
You are a strict QA reviewer for synthetic HTML tables.
Check:
- 1) table structure matches original HTML
+ 1) table structure matches original HTML (rows, columns, merges)
2) column count / header hierarchy
3) row consistency
4) cell type realism (numbers vs text)
5) validity of HTML tags
6) no missing/extra columns
+ 7) **⚠️ DATA ORIGINALITY CHECK (CRITICAL) ⚠️:**
+ - The synthetic data MUST be DIFFERENT from the original
+ - If data values are identical or very similar to the original, mark as FAILED
+ - Names, numbers, descriptions should all be different
+ 8) **Styling check:**
+ - Should use Tailwind CSS classes, NOT inline styles
+ - Check for style="..." attributes and flag them as issues
Output ONLY valid JSON with this schema:
@@ -207,7 +266,7 @@ self_reflection: |
"passed": true/false,
"score": 0-100,
"issues": [
- {{"type": "structure|header|row_count|data_type|html_validity|other",
+ {{"type": "structure|header|row_count|data_type|html_validity|data_originality|styling|other",
"detail": "..."}}
],
"revision_instructions": "Concrete step-by-step instructions to fix the table."
@@ -224,12 +283,19 @@ self_reflection_from_image: |
You are a strict QA reviewer for synthetic HTML tables.
Check:
- 1) table structure matches the original table in the image
+ 1) table structure matches the original table in the image (rows, columns, merges)
2) column count / header hierarchy
3) row consistency
4) cell type realism (numbers vs text)
5) validity of HTML tags
6) no missing/extra columns
+ 7) **⚠️ DATA ORIGINALITY CHECK (CRITICAL) ⚠️:**
+ - The synthetic data MUST be DIFFERENT from what appears in the image
+ - If data values appear to be copied/transcribed from the image, mark as FAILED
+ - This is NOT an OCR task - data should be newly generated
+ 8) **Styling check:**
+ - Should use Tailwind CSS classes, NOT inline styles
+ - Check for style="..." attributes and flag them as issues
Output ONLY valid JSON with this schema:
@@ -237,7 +303,7 @@ self_reflection_from_image: |
"passed": true/false,
"score": 0-100,
"issues": [
- {{"type": "structure|header|row_count|data_type|html_validity|other",
+ {{"type": "structure|header|row_count|data_type|html_validity|data_originality|styling|other",
"detail": "..."}}
],
"revision_instructions": "Concrete step-by-step instructions to fix the table."
diff --git a/generate_synthetic_table/prompts/finance.yaml b/generate_synthetic_table/prompts/finance.yaml
index d610704..77d9927 100644
--- a/generate_synthetic_table/prompts/finance.yaml
+++ b/generate_synthetic_table/prompts/finance.yaml
@@ -94,37 +94,78 @@ generate_qa_from_image: |
generate_synthetic_table: |
You are a Synthetic Data Generator specializing in Financial Data.
- Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic financial data.
+
+ **⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️**
+ Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT financial data values.
**Inputs:**
- 1. **Original Table Structure:**
+ 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):**
{html}
- 2. **Table Summary:**
+ 2. **Table Summary (describes the data patterns to follow):**
{summary}
**Requirements:**
- 1. **Structure:** Keep the exact same HTML structure.
- 2. **Data:** Replace ALL cell values with new, synthetic financial data.
- - Use realistic Korean company names (e.g., KOSPI listed mock names), stock tickers, and realistic currency values (KRW, USD).
- - Contexts: Balance Sheets, Income Statements, Stock Portfolios, Tax Records.
- - Do NOT use real private data.
- 3. **Consistency:** Ensure mathematical consistency (e.g., Assets = Liabilities + Equity).
- 4. **Output:** Return ONLY the raw HTML string starting with `
` and ending with `
`.
+ 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan).
+ 2. **Headers:** Keep header text the same (column names, category labels).
+ 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:**
+ - **ALL data cell values MUST be replaced with completely new synthetic values.**
+ - **NEVER copy any original data values** - generate fresh, realistic alternatives.
+ - For company names: Generate DIFFERENT names (e.g., "A회사" → "B회사")
+ - For financial figures: Generate DIFFERENT amounts (similar magnitude, different values)
+ - For percentages/ratios: Generate DIFFERENT metrics
+ - For dates: Generate DIFFERENT plausible dates
+ 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original table's visual style:**
+ - Look at the original table's color scheme and design
+ - Use appropriate Tailwind color classes to match the original style
+ - Basic structure: `<table class="w-full border-collapse text-sm">`
+ - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors
+ - Lists: `class="list-disc ml-5 space-y-1"`
+ - **DO NOT use inline style attributes**
+ 5. **Domain Consistency:** Ensure financial logic (Assets = Liabilities + Equity, Totals match)
+ 6. **Output:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
+
+ **Example Transformation (Generic):**
+ - Original company: "A기업" → Synthetic: "B기업"
+ - Original amount: "100억원" → Synthetic: "150억원"
+ - Original ratio: "PER 10" → Synthetic: "PER 8"
+
+ ⚠️ If the generated content is identical or very similar to the original, the output is INVALID.
generate_synthetic_table_from_image: |
You are a Synthetic Data Generator specializing in Financial Data.
- Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic financial data.
+
+ **⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️**
+ Your task is NOT to OCR/transcribe the image. Instead, you must:
+ 1. Understand the table's STRUCTURE from the image
+ 2. Understand it's a FINANCIAL table
+ 3. Generate COMPLETELY NEW synthetic financial data that fits the domain but uses ENTIRELY DIFFERENT values
**Inputs:**
- 1. **Image:** An image of a financial table.
+ 1. **Image:** An image of a financial table. Use this to understand structure and domain ONLY.
**Requirements:**
- 1. **Structure Preservation:** Accurately reconstruct the table structure.
- 2. **Data Generation:** Replace ALL cell values with new, synthetic financial data.
- - Use realistic Korean company names, financial metrics.
- 3. **Styling:** Use **Tailwind CSS** classes (same as default).
- - `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on `
`.
- - `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `
`.
- - `class="border border-slate-300 p-2"` on `
`.
- 4. **Output Format:** Return ONLY the raw HTML string starting with `
` and ending with `
`.
+ 1. **Structure Preservation:** Accurately reconstruct the table structure, including rowspan/colspan.
+ 2. **Headers:** Keep header text the same as in the image.
+ 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:**
+ - **NEVER copy the data values from the image** - this is NOT an OCR task
+ - **ALL cell content must be completely NEW and DIFFERENT**
+ - For company names: Generate DIFFERENT names
+ - For financial figures: Generate DIFFERENT amounts
+ - For percentages/ratios: Generate DIFFERENT metrics
+ 4. **Styling:** Use **Tailwind CSS** classes exclusively (NO inline styles).
+ - `<ul>`: `class="list-disc ml-5 space-y-1 text-slate-600"`
+ - **DO NOT use inline style attributes**
+ 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
+
+ **Example (Generic):**
+ - Company in image: "X회사" → Generate: "Y회사"
+ - Amount in image: "50억" → Generate: "80억"
+
+ ⚠️ If the generated content is identical or very similar to the image, the output is INVALID.
diff --git a/generate_synthetic_table/prompts/insurance.yaml b/generate_synthetic_table/prompts/insurance.yaml
index ce0c759..4521288 100644
--- a/generate_synthetic_table/prompts/insurance.yaml
+++ b/generate_synthetic_table/prompts/insurance.yaml
@@ -65,38 +65,79 @@ generate_qa_from_image: |
generate_synthetic_table: |
You are a Synthetic Data Generator specializing in Insurance Data.
- Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic insurance data.
+
+ **⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️**
+ Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT insurance data values.
**Inputs:**
- 1. **Original Table Structure:**
+ 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):**
{html}
- 2. **Table Summary:**
+ 2. **Table Summary (describes the data patterns to follow):**
{summary}
**Requirements:**
- 1. **Structure:** Keep the exact same HTML structure.
- 2. **Data:** Replace ALL cell values with new, synthetic insurance data.
- - Use realistic Korean insurance plan names (e.g., "SafeLife Plus", "Family Care"), coverage types, premiums (KRW), and terms.
- - Ensure consistency: e.g., higher premiums for better coverage.
- - Do NOT use real private data.
- 3. **Consistency:** Ensure mathematical consistency (e.g., monthly premium * 12 = annual).
- 4. **Output:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`.
+ 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan).
+ 2. **Headers:** Keep header text the same (column names, category labels).
+ 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:**
+ - **ALL data cell values MUST be replaced with completely new synthetic values.**
+ - **NEVER copy any original data values** - generate fresh, realistic alternatives.
+ - For plan names: Generate DIFFERENT insurance plan names
+ - For coverage amounts: Generate DIFFERENT amounts
+ - For premiums: Generate DIFFERENT premium amounts
+ - For terms/conditions: Generate DIFFERENT coverage terms
+ - For dates: Generate DIFFERENT plausible dates
+ 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:**
+ - Look at the original image's color scheme and design
+ - Use appropriate Tailwind color classes to match the original style
+ - Basic structure: `
`
+ - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors
+ - Lists: `class="list-disc ml-5 space-y-1"`
+ - **DO NOT use inline style attributes**
+ 5. **Domain Consistency:** Ensure insurance logic (higher premiums for better coverage)
+ 6. **Output:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
+
+ **Example Transformation (Generic):**
+ - Original plan: "보험A" → Synthetic: "보험B"
+ - Original coverage: "1억원" → Synthetic: "5천만원"
+ - Original premium: "월 3만원" → Synthetic: "월 5만원"
+
+ ⚠️ If the generated content is identical or very similar to the original, the output is INVALID.
generate_synthetic_table_from_image: |
You are a Synthetic Data Generator specializing in Insurance Data.
- Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic insurance data.
+
+ **⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️**
+ Your task is NOT to OCR/transcribe the image. Instead, you must:
+ 1. Understand the table's STRUCTURE from the image
+ 2. Understand it's an INSURANCE table
+ 3. Generate COMPLETELY NEW synthetic insurance data that fits the domain but uses ENTIRELY DIFFERENT values
**Inputs:**
- 1. **Image:** An image of an insurance table.
+ 1. **Image:** An image of an insurance table. Use this to understand structure and domain ONLY.
**Requirements:**
- 1. **Structure Preservation:** Accurately reconstruct the table structure.
- 2. **Data Generation:** Replace ALL cell values with new, synthetic insurance data.
- - Use realistic Korean insurance terms, plan names, coverage amounts (KRW).
- - Contexts: Life insurance, Health insurance, Auto insurance, etc.
- 3. **Styling:** Use **Tailwind CSS** classes (same as default).
- - `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on `<table>`.
- - `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `<th>`.
- - `class="border border-slate-300 p-2"` on `<td>`.
- 4. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`.
+ 1. **Structure Preservation:** Accurately reconstruct the table structure, including rowspan/colspan.
+ 2. **Headers:** Keep header text the same as in the image.
+ 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:**
+ - **NEVER copy the data values from the image** - this is NOT an OCR task
+ - **ALL cell content must be completely NEW and DIFFERENT**
+ - For plan names: Generate DIFFERENT insurance plan names
+ - For coverage/premiums: Generate DIFFERENT amounts
+ - For terms: Generate DIFFERENT coverage terms
+ 4. **Styling:** Use **Tailwind CSS** classes exclusively (NO inline styles).
+ - `<ul>`: `class="list-disc ml-5 space-y-1 text-slate-600"`
+ - **DO NOT use inline style attributes**
+ 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
+
+ **Example (Generic):**
+ - Plan name in image: "보험X" → Generate: "보험Y"
+ - Amount in image: "금액A" → Generate: "금액B"
+
+ ⚠️ If the generated content is identical or very similar to the image, the output is INVALID.
diff --git a/generate_synthetic_table/prompts/medical.yaml b/generate_synthetic_table/prompts/medical.yaml
index d11f8f8..7bf995c 100644
--- a/generate_synthetic_table/prompts/medical.yaml
+++ b/generate_synthetic_table/prompts/medical.yaml
@@ -95,37 +95,78 @@ generate_qa_from_image: |
generate_synthetic_table: |
You are a Synthetic Data Generator specializing in Medical Data.
- Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic medical data.
+
+ **⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️**
+ Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT medical data values.
**Inputs:**
- 1. **Original Table Structure:**
+ 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):**
{html}
- 2. **Table Summary:**
+ 2. **Table Summary (describes the data patterns to follow):**
{summary}
**Requirements:**
- 1. **Structure:** Keep the exact same HTML structure.
- 2. **Data:** Replace ALL cell values with new, synthetic medical data.
- - Use realistic Korean patient names (pseudonymized), diagnosis codes (ICD-10 style but synthetic), medication names, and lab values.
- - Contexts: Patient Charts, Lab Reports, Prescription Lists, Clinical Trials.
- - Do NOT use real private data.
- 3. **Consistency:** Ensure medical consistency (e.g., proper units for blood pressure, temperature).
- 4. **Output:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`.
+ 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan).
+ 2. **Headers:** Keep header text the same (column names, category labels).
+ 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:**
+ - **ALL data cell values MUST be replaced with completely new synthetic values.**
+ - **NEVER copy any original data values** - generate fresh, realistic alternatives.
+ - For patient names/IDs: Generate DIFFERENT pseudonymized identifiers
+ - For lab values: Generate DIFFERENT realistic values
+ - For diagnoses/medications: Generate DIFFERENT names and codes
+ - For dates: Generate DIFFERENT plausible dates
+ 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:**
+ - Look at the original image's color scheme and design
+ - Use appropriate Tailwind color classes to match the original style
+ - Basic structure: `
`
+ - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors
+ - Lists: `class="list-disc ml-5 space-y-1"`
+ - **DO NOT use inline style attributes**
+ 5. **Domain Consistency:** Ensure medical logic (proper units for BP, temperature, lab values)
+ 6. **Output:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
+
+ **Example Transformation (Generic):**
+ - Original ID: "환자A" → Synthetic: "환자B"
+ - Original value: "120/80" → Synthetic: "135/85"
+ - Original medication: "약물X" → Synthetic: "약물Y"
+
+ ⚠️ If the generated content is identical or very similar to the original, the output is INVALID.
generate_synthetic_table_from_image: |
You are a Synthetic Data Generator specializing in Medical Data.
- Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic medical data.
+
+ **⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️**
+ Your task is NOT to OCR/transcribe the image. Instead, you must:
+ 1. Understand the table's STRUCTURE from the image
+ 2. Understand it's a MEDICAL table
+ 3. Generate COMPLETELY NEW synthetic medical data that fits the domain but uses ENTIRELY DIFFERENT values
**Inputs:**
- 1. **Image:** An image of a medical table.
+ 1. **Image:** An image of a medical table. Use this to understand structure and domain ONLY.
**Requirements:**
- 1. **Structure Preservation:** Accurately reconstruct the table structure.
- 2. **Data Generation:** Replace ALL cell values with new, synthetic medical data.
- - Use realistic Korean medical terms, patient info (synthetic).
- 3. **Styling:** Use **Tailwind CSS** classes (same as default).
- - `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on `<table>`.
- - `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `<th>`.
- - `class="border border-slate-300 p-2"` on `<td>`.
- 4. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`.
+ 1. **Structure Preservation:** Accurately reconstruct the table structure, including rowspan/colspan.
+ 2. **Headers:** Keep header text the same as in the image.
+ 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:**
+ - **NEVER copy the data values from the image** - this is NOT an OCR task
+ - **ALL cell content must be completely NEW and DIFFERENT**
+ - For patient names/IDs: Generate DIFFERENT pseudonymized identifiers
+ - For lab values: Generate DIFFERENT realistic values
+ - For diagnoses/medications: Generate DIFFERENT names
+ 4. **Styling:** Use **Tailwind CSS** classes exclusively (NO inline styles).
+ - `<ul>`: `class="list-disc ml-5 space-y-1 text-slate-600"`
+ - **DO NOT use inline style attributes**
+ 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
+
+ **Example (Generic):**
+ - Patient ID in image: "환자X" → Generate: "환자Y"
+ - Value in image: "수치A" → Generate: "수치B"
+
+ ⚠️ If the generated content is identical or very similar to the image, the output is INVALID.
diff --git a/generate_synthetic_table/prompts/public.yaml b/generate_synthetic_table/prompts/public.yaml
index 2850db7..b0c4099 100644
--- a/generate_synthetic_table/prompts/public.yaml
+++ b/generate_synthetic_table/prompts/public.yaml
@@ -94,37 +94,78 @@ generate_qa_from_image: |
generate_synthetic_table: |
You are a Synthetic Data Generator specializing in Public Sector/Government Data.
- Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic public data.
+
+ **⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️**
+ Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT public sector data values.
**Inputs:**
- 1. **Original Table Structure:**
+ 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):**
{html}
- 2. **Table Summary:**
+ 2. **Table Summary (describes the data patterns to follow):**
{summary}
**Requirements:**
- 1. **Structure:** Keep the exact same HTML structure.
- 2. **Data:** Replace ALL cell values with new, synthetic public data.
- - Use realistic Korean administrative region names (e.g., Sejong-si, Mapo-gu), government department names, and statistical values.
- - Contexts: Census Data, Budget Reports, Public Facility Status, Regional Statistics.
- - Do NOT use real private data.
- 3. **Consistency:** Ensure mathematical consistency (e.g., Subtotals match Grand Total).
- 4. **Output:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`.
+ 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan).
+ 2. **Headers:** Keep header text the same (column names, category labels).
+ 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:**
+ - **ALL data cell values MUST be replaced with completely new synthetic values.**
+ - **NEVER copy any original data values** - generate fresh, realistic alternatives.
+ - For regions: Generate DIFFERENT administrative region names
+ - For departments: Generate DIFFERENT department names
+ - For statistics: Generate DIFFERENT numbers (similar magnitude)
+ - For dates: Generate DIFFERENT plausible dates
+ 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:**
+ - Look at the original image's color scheme and design
+ - Use appropriate Tailwind color classes to match the original style
+ - Basic structure: `
`
+ - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors
+ - Lists: `class="list-disc ml-5 space-y-1"`
+ - **DO NOT use inline style attributes**
+ 5. **Domain Consistency:** Ensure statistical logic (Subtotals match Grand Total)
+ 6. **Output:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
+
+ **Example Transformation (Generic):**
+ - Original region: "A시" → Synthetic: "B시"
+ - Original statistic: "인구 100만" → Synthetic: "인구 150만"
+ - Original rate: "집행률 80%" → Synthetic: "집행률 90%"
+
+ ⚠️ If the generated content is identical or very similar to the original, the output is INVALID.
generate_synthetic_table_from_image: |
You are a Synthetic Data Generator specializing in Public Sector/Government Data.
- Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic public data.
+
+ **⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️**
+ Your task is NOT to OCR/transcribe the image. Instead, you must:
+ 1. Understand the table's STRUCTURE from the image
+ 2. Understand it's a PUBLIC SECTOR/GOVERNMENT table
+ 3. Generate COMPLETELY NEW synthetic public data that fits the domain but uses ENTIRELY DIFFERENT values
**Inputs:**
- 1. **Image:** An image of a public data table.
+ 1. **Image:** An image of a public data table. Use this to understand structure and domain ONLY.
**Requirements:**
- 1. **Structure Preservation:** Accurately reconstruct the table structure.
- 2. **Data Generation:** Replace ALL cell values with new, synthetic public data.
- - Use realistic Korean region names, stats.
- 3. **Styling:** Use **Tailwind CSS** classes (same as default).
- - `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on `<table>`.
- - `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `<th>`.
- - `class="border border-slate-300 p-2"` on `<td>`.
- 4. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`.
+ 1. **Structure Preservation:** Accurately reconstruct the table structure, including rowspan/colspan.
+ 2. **Headers:** Keep header text the same as in the image.
+ 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:**
+ - **NEVER copy the data values from the image** - this is NOT an OCR task
+ - **ALL cell content must be completely NEW and DIFFERENT**
+ - For regions: Generate DIFFERENT administrative region names
+ - For statistics: Generate DIFFERENT numbers
+ - For departments: Generate DIFFERENT names
+ 4. **Styling:** Use **Tailwind CSS** classes exclusively (NO inline styles).
+ - `<ul>`: `class="list-disc ml-5 space-y-1 text-slate-600"`
+ - **DO NOT use inline style attributes**
+ 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
+
+ **Example (Generic):**
+ - Region in image: "X지역" → Generate: "Y지역"
+ - Statistic in image: "수치A" → Generate: "수치B"
+
+ ⚠️ If the generated content is identical or very similar to the image, the output is INVALID.
diff --git a/run_openai_public.sh b/run_openai_public.sh
index 92bf58d..d90e1f6 100644
--- a/run_openai_public.sh
+++ b/run_openai_public.sh
@@ -5,9 +5,9 @@
# ==============================================================================
# Default Configuration
-INPUT_JSON="test_business_input.json"
+INPUT_JSON="test_business.json"
OUTPUT_DIR="output_business"
-DEFAULT_ARGS="--provider openai --model gpt-5-mini --domain business"
+DEFAULT_ARGS="--provider claude --model claude-sonnet-4-5 --domain business"
# Check if the first argument is a JSON file path
if [[ "$1" == *.json ]]; then
@@ -20,8 +20,8 @@ echo " TableMagnifier - JSON Pipeline (Public)"
echo "=============================================="
echo "Input JSON: $INPUT_JSON"
echo "Output Dir: $OUTPUT_DIR"
-echo "Provider: openai"
-echo "Model: gpt-5-mini"
+echo "Provider: claude"
+echo "Model: claude-sonnet-4-5"
echo "Domain: business"
echo ""
echo "💡 Tip: To upload to Notion during pipeline execution:"
@@ -31,13 +31,13 @@ echo "💡 To upload existing results later:"
echo " python upload_to_notion_from_json.py $OUTPUT_DIR"
echo ""
-# Check for OPENAI_API_KEY
-if [[ -z "$OPENAI_API_KEY" ]]; then
- echo "⚠️ Warning: OPENAI_API_KEY is not set."
+# Check for ANTHROPIC_API_KEY
+if [[ -z "$ANTHROPIC_API_KEY" ]]; then
+ echo "⚠️ Warning: ANTHROPIC_API_KEY is not set."
echo " Please set it in your environment or .env file."
echo ""
fi
# Run the pipeline
# Note: "$@" appends any remaining arguments, allowing overrides of defaults
-uv run python run_pipeline_json.py --input "$INPUT_JSON" --output-dir "$OUTPUT_DIR" $DEFAULT_ARGS "$@"
+uv run python run_pipeline_json.py --input "$INPUT_JSON" --output-dir "$OUTPUT_DIR" $DEFAULT_ARGS "$@"
\ No newline at end of file
diff --git a/run_pipeline_json.py b/run_pipeline_json.py
index 0ff7276..d8fef9c 100644
--- a/run_pipeline_json.py
+++ b/run_pipeline_json.py
@@ -1,13 +1,200 @@
import argparse
import json
import os
+import re
+import random
import sys
from pathlib import Path
-from typing import List, Dict, Any, Tuple
+from typing import List, Dict, Any, Tuple, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
from dotenv import load_dotenv
+
+# ============================================================
+# Style Variation Configuration
+# ============================================================
+
+# Candidate fonts, one of which is drawn per generated table.
+# Each entry is (font_name, font_url); get_random_style() unpacks it in that
+# order. A font_url of None leaves the font link empty in build_html_document().
+GOOGLE_FONTS = [
+ ("Noto Sans KR", "Noto+Sans+KR:wght@400;500;600;700"),
+ ("Pretendard", None), # Self-hosted or system font
+ ("IBM Plex Sans KR", "IBM+Plex+Sans+KR:wght@400;500;600;700"),
+ ("Nanum Gothic", "Nanum+Gothic:wght@400;700"),
+ ("Nanum Myeongjo", "Nanum+Myeongjo:wght@400;700"),
+ ("Gothic A1", "Gothic+A1:wght@400;500;600;700"),
+ ("Do Hyeon", "Do+Hyeon"),
+ ("Jua", "Jua"),
+ ("Gowun Dodum", "Gowun+Dodum"),
+ ("Gowun Batang", "Gowun+Batang:wght@400;700"),
+]
+
+# Color palettes expressed as Tailwind color tokens (without the utility
+# prefix such as "bg-" or "text-"). Fields are read positionally (index 0-7)
+# by get_random_style(), so the tuple layout below must not be reordered.
+COLOR_SCHEMES = [
+ # (name, header_bg_from, header_bg_to, header_text, header_border, body_hover, body_border, text_color)
+ ("indigo", "indigo-600", "indigo-700", "white", "indigo-400", "indigo-50", "slate-200", "slate-700"),
+ ("slate", "slate-600", "slate-700", "white", "slate-500", "slate-50", "slate-300", "slate-700"),
+ ("emerald", "emerald-600", "emerald-700", "white", "emerald-400", "emerald-50", "slate-200", "slate-700"),
+ ("blue", "blue-600", "blue-700", "white", "blue-400", "blue-50", "slate-200", "slate-700"),
+ ("purple", "purple-600", "purple-700", "white", "purple-400", "purple-50", "slate-200", "slate-700"),
+ ("teal", "teal-600", "teal-700", "white", "teal-400", "teal-50", "slate-200", "slate-700"),
+ ("amber", "amber-600", "amber-700", "white", "amber-400", "amber-50", "slate-200", "slate-800"),
+ ("rose", "rose-600", "rose-700", "white", "rose-400", "rose-50", "slate-200", "slate-700"),
+ ("cyan", "cyan-600", "cyan-700", "white", "cyan-400", "cyan-50", "slate-200", "slate-700"),
+ ("stone", "stone-600", "stone-700", "white", "stone-500", "stone-50", "stone-300", "stone-700"),
+ # Light header variants
+ ("light-blue", "blue-100", "blue-200", "blue-900", "blue-300", "blue-50", "blue-200", "slate-700"),
+ ("light-gray", "gray-100", "gray-200", "gray-800", "gray-300", "gray-50", "gray-200", "gray-700"),
+ ("light-green", "green-100", "green-200", "green-900", "green-300", "green-50", "green-200", "slate-700"),
+]
+
+# Table-level decoration variants. Fields are read positionally (index 0-4)
+# by get_random_style(); table_extra_classes is the class string that
+# apply_style_to_html() adds to the table when it is non-empty.
+TABLE_STYLES = [
+ # (name, table_extra_classes, has_shadow, has_rounded, stripe_odd)
+ ("default", "", False, False, False),
+ ("shadow", "shadow-lg", True, False, False),
+ ("rounded", "rounded-lg overflow-hidden", False, True, False),
+ ("shadow-rounded", "shadow-lg rounded-lg overflow-hidden", True, True, False),
+ ("striped", "", False, False, True),
+ ("striped-rounded", "rounded-lg overflow-hidden", False, True, True),
+]
+
+# Tailwind text-size utilities; one is drawn per table by get_random_style().
+FONT_SIZES = ["text-xs", "text-sm", "text-base"]
+
+
+def get_random_style() -> Dict[str, Any]:
+ """Generate a random style configuration."""
+ font_name, font_url = random.choice(GOOGLE_FONTS)
+ color = random.choice(COLOR_SCHEMES)
+ table_style = random.choice(TABLE_STYLES)
+ font_size = random.choice(FONT_SIZES)
+
+ return {
+ "font_name": font_name,
+ "font_url": font_url,
+ "color_name": color[0],
+ "header_bg_from": color[1],
+ "header_bg_to": color[2],
+ "header_text": color[3],
+ "header_border": color[4],
+ "body_hover": color[5],
+ "body_border": color[6],
+ "text_color": color[7],
+ "table_style_name": table_style[0],
+ "table_extra_classes": table_style[1],
+ "has_shadow": table_style[2],
+ "has_rounded": table_style[3],
+ "stripe_odd": table_style[4],
+ "font_size": font_size,
+ }
+
+
+def apply_style_to_html(table_html: str, style: Dict[str, Any]) -> str:
+ """Apply style variations to the table HTML by replacing Tailwind classes."""
+ html = table_html
+
+ # Replace header gradient colors
+ # Pattern: bg-gradient-to-r from-{color}-{shade} to-{color}-{shade}
+ html = re.sub(
+ r'from-\w+-\d+\s+to-\w+-\d+',
+ f'from-{style["header_bg_from"]} to-{style["header_bg_to"]}',
+ html
+ )
+
+ # Replace header text color
+ html = re.sub(
+ r'(]*class="[^"]*?)text-white',
+ f'\\1text-{style["header_text"]}',
+ html
+ )
+
+ # Replace header border color
+ html = re.sub(
+ r'border-\w+-300(?=\s|")',
+ f'border-{style["header_border"]}',
+ html
+ )
+
+ # Replace hover color
+ html = re.sub(
+ r'hover:bg-\w+-50',
+ f'hover:bg-{style["body_hover"]}',
+ html
+ )
+
+ # Replace body border color
+ html = re.sub(
+ r'border-slate-200',
+ f'border-{style["body_border"]}',
+ html
+ )
+
+ # Replace text color
+ html = re.sub(
+ r'text-slate-700',
+ f'text-{style["text_color"]}',
+ html
+ )
+ html = re.sub(
+ r'text-slate-600',
+ f'text-{style["text_color"]}',
+ html
+ )
+
+ # Replace font size in table tag
+ html = re.sub(
+ r'(
]*class="[^"]*?)text-(?:xs|sm|base)',
+ f'\\1{style["font_size"]}',
+ html
+ )
+
+ # Add table extra classes (shadow, rounded)
+ if style["table_extra_classes"]:
+ html = re.sub(
+ r'
]*class="[^"]*hover:bg-)',
+ f'
str:
+ """Build complete HTML document with fonts and styles."""
+
+ # Google Fonts link
+ font_link = ""
+ if style["font_url"]:
+ font_link = f'\n \n '
+
+ # Font family CSS
+ font_css = f"""
+ """
+
+ return f"""
+
+
+
+
+
+ {font_link}
+ {font_css}
+
+
+{table_html}
+
+"""
+
# Add parent directory to path to allow imports if running from root
sys.path.append(str(Path(__file__).parent))
@@ -15,6 +202,72 @@
from generate_synthetic_table.flow import TableState
from generate_synthetic_table.notion_uploader import NotionUploader
+
+def save_synthetic_table_as_html(
+ synthetic_table: str,
+ output_path: Path,
+ pair_id: str,
+ table_index: int,
+ randomize_style: bool = True
+) -> Tuple[Optional[str], Optional[Dict[str, Any]]]:
+ """Save synthetic table as HTML file with optional style randomization.
+
+ Args:
+ synthetic_table: The HTML table string
+ output_path: Directory to save the file
+ pair_id: Identifier for the pair
+ table_index: Index of the table within the pair
+ randomize_style: Whether to apply random style variations
+
+ Returns:
+ Tuple of (html_filepath, style_info) or (None, None) if failed
+ """
+ if not synthetic_table:
+ return None, None
+
+ # Clean up markdown code blocks if present
+ table_html = synthetic_table
+ if table_html.startswith("```html"):
+ table_html = table_html[7:]
+ if table_html.startswith("```"):
+ table_html = table_html[3:]
+ if table_html.endswith("```"):
+ table_html = table_html[:-3]
+ table_html = table_html.strip()
+
+ # Apply style randomization if enabled
+ style_info = None
+ if randomize_style:
+ style_info = get_random_style()
+ table_html = apply_style_to_html(table_html, style_info)
+ full_html = build_html_document(table_html, style_info)
+ else:
+ # Basic HTML without style randomization
+ full_html = f"""
+
+
+
+
+
+
+
+{table_html}
+
+"""
+
+ # Create html subdirectory
+ html_dir = output_path / "html"
+ html_dir.mkdir(parents=True, exist_ok=True)
+
+ # Save file
+ safe_pair_id = "".join([c for c in pair_id if c.isalnum() or c in ('-', '_')])
+ html_filename = f"{safe_pair_id}_table_{table_index}.html"
+ html_filepath = html_dir / html_filename
+
+ html_filepath.write_text(full_html, encoding="utf-8")
+
+ return str(html_filepath), style_info
+
def resolve_paths(pair: List[str], data_root: Path) -> List[Path]:
"""Resolves a list of relative paths to absolute Paths."""
paths = []
@@ -39,12 +292,14 @@ def process_single_pair(
index: int,
total_count: int,
data_root: Path,
+ output_dir: Path,
provider: str,
model: str,
config_path: str,
arg_domain: str,
qa_only: bool,
- notion_uploader: Any
+ notion_uploader: Any,
+ randomize_style: bool = True
) -> Dict:
"""Process a single pair of images."""
@@ -132,6 +387,7 @@ def process_single_pair(
model=model,
config_path=config_path,
qa_only=False, # We want the table
+ skip_qa=True, # Skip QA here - we'll generate QA for the pair later
domain=domain
)
@@ -139,12 +395,29 @@ def process_single_pair(
if table_state.get("errors"):
print(f" [Pair {index+1}] Error generating table: {table_state['errors']}")
+ # Save synthetic table as HTML file with style randomization
+ html_path = None
+ style_info = None
+ if table_state.get("synthetic_table"):
+ html_path, style_info = save_synthetic_table_as_html(
+ synthetic_table=table_state.get("synthetic_table"),
+ output_path=output_dir,
+ pair_id=pair_id,
+ table_index=len(temp_tables),
+ randomize_style=randomize_style
+ )
+ if html_path:
+ style_desc = f" (font: {style_info['font_name']}, color: {style_info['color_name']})" if style_info else ""
+ print(f" [Pair {index+1}] Saved HTML: {html_path}{style_desc}")
+
# Filter state
safe_state = {
"image_path": str(path),
"synthetic_table": table_state.get("synthetic_table"),
"synthetic_json": table_state.get("synthetic_json"),
"table_summary": table_state.get("table_summary"),
+ "html_path": html_path,
+ "style_info": style_info, # Store applied style for reference
}
temp_tables.append(safe_state)
@@ -221,7 +494,8 @@ def run_pipeline(
arg_domain: str = None,
qa_only: bool = False,
upload_to_notion: bool = False,
- max_workers: int = 3
+ max_workers: int = 3,
+ randomize_style: bool = True
):
output_dir.mkdir(parents=True, exist_ok=True)
@@ -249,12 +523,14 @@ def run_pipeline(
i,
total_count,
data_root,
+ output_dir,
provider,
model,
config_path,
arg_domain,
qa_only,
- notion_uploader
+ notion_uploader,
+ randomize_style
): i for i, item in enumerate(json_input)
}
@@ -298,6 +574,8 @@ def main():
parser.add_argument("--qa-only", action="store_true", help="Skip table generation, only generate QA (applies to all domains)")
parser.add_argument("--upload-to-notion", action="store_true", help="Upload QA results to Notion database")
parser.add_argument("--max-workers", type=int, default=3, help="Maximum number of parallel workers (default: 3)")
+ parser.add_argument("--randomize-style", action="store_true", default=True, help="Randomize HTML table styles (fonts, colors) for diversity (default: True)")
+ parser.add_argument("--no-randomize-style", dest="randomize_style", action="store_false", help="Disable style randomization")
args = parser.parse_args()
@@ -331,7 +609,8 @@ def main():
arg_domain=args.domain,
qa_only=args.qa_only,
upload_to_notion=args.upload_to_notion,
- max_workers=args.max_workers
+ max_workers=args.max_workers,
+ randomize_style=args.randomize_style
)
if __name__ == "__main__":