From a65e9b180f6d032b5694f0acd38df334b809d4d2 Mon Sep 17 00:00:00 2001 From: Seokhyun Seo Date: Wed, 21 Jan 2026 12:00:49 +0000 Subject: [PATCH 1/2] some fixes --- .../prompts/academic.yaml | 63 +++++++++++---- .../prompts/business.yaml | 62 +++++++++++---- generate_synthetic_table/prompts/default.yaml | 79 ++++++++++++++----- generate_synthetic_table/prompts/finance.yaml | 62 +++++++++++---- .../prompts/insurance.yaml | 63 +++++++++++---- generate_synthetic_table/prompts/medical.yaml | 62 +++++++++++---- generate_synthetic_table/prompts/public.yaml | 62 +++++++++++---- 7 files changed, 336 insertions(+), 117 deletions(-) diff --git a/generate_synthetic_table/prompts/academic.yaml b/generate_synthetic_table/prompts/academic.yaml index bbb19d5..47fafc2 100644 --- a/generate_synthetic_table/prompts/academic.yaml +++ b/generate_synthetic_table/prompts/academic.yaml @@ -94,37 +94,68 @@ generate_qa_from_image: | generate_synthetic_table: | You are a Synthetic Data Generator specializing in Academic Data. - Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic academic data. + + **CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA** + Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT academic data values. + The goal is to create realistic synthetic academic data that looks like it could come from the same domain, but with entirely different students, courses, and metrics. **Inputs:** - 1. **Original Table Structure:** + 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):** {html} - 2. **Table Summary:** + 2. **Table Summary (describes the data patterns to follow):** {summary} **Requirements:** - 1. **Structure:** Keep the exact same HTML structure. - 2. **Data:** Replace ALL cell values with new, synthetic academic data. 
- - Use realistic Korean student names, university names, course titles, and grades. - - Contexts: Transcripts, Research Papers, Enrollment Stats, Faculty Lists. - - Do NOT use real private data. - 3. **Consistency:** Ensure mathematical consistency (e.g., sum of credits, correct GPA calculations if visible). - 4. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
`. + 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges). + 2. **Headers:** Keep header text the same (column names, category labels). + 3. **Data Transformation - MANDATORY:** + - **ALL data cell values MUST be replaced with completely new synthetic values.** + - **DO NOT copy any original data values** - generate fresh, realistic alternatives. + - For student names: Generate new Korean student names (e.g., "김철수" → "이영희", "학생A" → "학생B") + - For university names: Generate new Korean university names + - For course titles: Generate new course names + - For grades/scores: Generate new realistic values + - For model names (if research table): Generate new model/method names + - For dates: Generate new plausible dates + 4. **Domain Consistency:** + - Ensure academic logic (credits sum correctly, GPA calculations valid) + - Use realistic Korean academic terminology + - Contexts: Transcripts, Research Papers, Enrollment Stats, Faculty Lists + 5. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
`. + + **Example Transformation:** + - Original: "서울대학교" → Synthetic: "고려대학교" + - Original: "학점 4.2" → Synthetic: "학점 3.8" + - Original: "BERT-Large" → Synthetic: "RoBERTa-Base" + + Remember: The synthetic table should look like a completely different academic dataset from the same domain. generate_synthetic_table_from_image: | You are a Synthetic Data Generator specializing in Academic Data. - Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic academic data. + + **CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA** + Your task is NOT to OCR/transcribe the image. Instead, you must: + 1. Understand the table's STRUCTURE from the image + 2. Understand it's an ACADEMIC table + 3. Generate COMPLETELY NEW synthetic academic data that fits the domain but uses different values **Inputs:** - 1. **Image:** An image of an academic table. + 1. **Image:** An image of an academic table. Use this to understand structure and domain ONLY. **Requirements:** 1. **Structure Preservation:** Accurately reconstruct the table structure. - 2. **Data Generation:** Replace ALL cell values with new, synthetic academic data. - - Use realistic Korean student names, course titles, grades, research topics. - 3. **Styling:** Use **Tailwind CSS** classes (same as default). + 2. **Headers:** Keep header text (column names, category labels) the same as in the image. + 3. **Data Generation - CRITICAL:** + - **DO NOT copy the data values from the image** - this is NOT an OCR task + - Generate COMPLETELY NEW synthetic academic values for all data cells + - For student/model names: Generate new names (different from what you see) + - For grades/scores: Generate new realistic values + - For course/research topics: Generate new titles + 4. **Styling:** Use **Tailwind CSS** classes (same as default). - `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on ``. 
- `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `<th>`. - `class="border border-slate-300 p-2"` on `<td>`. - 4. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. + 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
`. + + Remember: The output should be a new synthetic academic dataset, not a transcription of the original. diff --git a/generate_synthetic_table/prompts/business.yaml b/generate_synthetic_table/prompts/business.yaml index 9a3efe5..c57b506 100644 --- a/generate_synthetic_table/prompts/business.yaml +++ b/generate_synthetic_table/prompts/business.yaml @@ -94,37 +94,67 @@ generate_qa_from_image: | generate_synthetic_table: | You are a Synthetic Data Generator specializing in Business Data. - Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic business data. + + **CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA** + Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT business data values. + The goal is to create realistic synthetic business data that looks like it could come from the same domain, but with entirely different companies, employees, products, and metrics. **Inputs:** - 1. **Original Table Structure:** + 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):** {html} - 2. **Table Summary:** + 2. **Table Summary (describes the data patterns to follow):** {summary} **Requirements:** - 1. **Structure:** Keep the exact same HTML structure. - 2. **Data:** Replace ALL cell values with new, synthetic business data. - - Use realistic Korean company names, department names, product lines, and financial metrics. - - Contexts: Sales Reports, Inventory, HR Employee Lists, Marketing Campaigns. - - Do NOT use real private data. - 3. **Consistency:** Ensure mathematical consistency (e.g., Q1 + Q2 + Q3 + Q4 = Total). - 4. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
`. + 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges). + 2. **Headers:** Keep header text the same (column names, category labels). + 3. **Data Transformation - MANDATORY:** + - **ALL data cell values MUST be replaced with completely new synthetic values.** + - **DO NOT copy any original data values** - generate fresh, realistic alternatives. + - For company names: Generate new Korean company names (e.g., "삼성물산" → "현대상사", "A팀" → "B팀") + - For employee names: Generate new Korean names + - For product names: Generate new product line names + - For revenue/sales figures: Generate new realistic amounts (different values) + - For dates: Generate new plausible dates + 4. **Domain Consistency:** + - Ensure business logic (Q1+Q2+Q3+Q4=Total, percentages add up) + - Use realistic Korean business terminology + - Contexts: Sales Reports, Inventory, HR Employee Lists, Marketing Campaigns + 5. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
`. + + **Example Transformation:** + - Original: "영업1팀" → Synthetic: "마케팅2팀" + - Original: "매출 5억원" → Synthetic: "매출 7.3억원" + - Original: "김부장" → Synthetic: "박과장" + + Remember: The synthetic table should look like a completely different business dataset from the same domain. generate_synthetic_table_from_image: | You are a Synthetic Data Generator specializing in Business Data. - Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic business data. + + **CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA** + Your task is NOT to OCR/transcribe the image. Instead, you must: + 1. Understand the table's STRUCTURE from the image + 2. Understand it's a BUSINESS table + 3. Generate COMPLETELY NEW synthetic business data that fits the domain but uses different values **Inputs:** - 1. **Image:** An image of a business table. + 1. **Image:** An image of a business table. Use this to understand structure and domain ONLY. **Requirements:** 1. **Structure Preservation:** Accurately reconstruct the table structure. - 2. **Data Generation:** Replace ALL cell values with new, synthetic business data. - - Use realistic Korean company names, products, sales figures. - 3. **Styling:** Use **Tailwind CSS** classes (same as default). + 2. **Headers:** Keep header text (column names, category labels) the same as in the image. + 3. **Data Generation - CRITICAL:** + - **DO NOT copy the data values from the image** - this is NOT an OCR task + - Generate COMPLETELY NEW synthetic business values for all data cells + - For company/team names: Generate new names (different from what you see) + - For sales/revenue figures: Generate new realistic amounts + - For employee names: Generate new Korean names + 4. **Styling:** Use **Tailwind CSS** classes (same as default). - `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on ``. 
- `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `<th>`. - `class="border border-slate-300 p-2"` on `<td>`. - 4. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. + 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
`. + + Remember: The output should be a new synthetic business dataset, not a transcription of the original. diff --git a/generate_synthetic_table/prompts/default.yaml b/generate_synthetic_table/prompts/default.yaml index e877e0a..a75c978 100644 --- a/generate_synthetic_table/prompts/default.yaml +++ b/generate_synthetic_table/prompts/default.yaml @@ -78,43 +78,82 @@ generate_qa_from_image: | Return ONLY the JSON object, no additional text. generate_synthetic_table: | - You are a Synthetic Data Generator. - Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic data. + You are a Synthetic Data Generator specialized in creating completely NEW data while preserving table structure. + + **CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA** + Your task is to generate a new HTML table that has the SAME STRUCTURE as the original but with COMPLETELY DIFFERENT, newly generated data values. + The goal is to create realistic synthetic data that looks like it could come from the same domain, but with entirely different entities, names, numbers, and values. **Inputs:** - 1. **Original Table Structure:** + 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):** {html} - 2. **Table Summary:** + 2. **Table Summary (describes the data patterns to follow):** {summary} **Requirements:** - 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges) as the original table. - 2. **Data:** Replace ALL cell values with new, synthetic data. - - Use realistic Korean names, organizations, and values suitable for the context. - - Ensure the data is consistent with the column types and patterns described in the summary. - - Do NOT use real private data. - 3. **Consistency:** Ensure mathematical consistency if applicable (e.g., sums, percentages). - 4. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
`. Do not include markdown code blocks. + 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, rowspan, colspan, merges) as the original. + 2. **Headers:** Keep header text the same (column names, row labels that describe categories). + 3. **Data Transformation - MANDATORY:** + - **ALL data cell values MUST be replaced with completely new synthetic values.** + - **DO NOT copy any original data values** - generate fresh, realistic alternatives. + - For names: Generate new Korean names (e.g., 김철수 → 이영희, 박민수 → 정하늘) + - For organizations: Generate new realistic Korean organization names + - For numbers: Generate new realistic numbers that follow the same pattern/range but are different values + - For dates: Generate new plausible dates + - For addresses: Generate new realistic Korean addresses + - For any other text: Generate semantically similar but different content + 4. **Domain Consistency:** + - Analyze the summary to understand the domain context (finance, medical, public, etc.) + - Generate data that is realistic for that specific domain + - Maintain internal consistency (e.g., totals should sum correctly, percentages should add up) + 5. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
`. No markdown code blocks. + + **Example Transformation (showing the expected behavior):** + - Original: "삼성전자" → Synthetic: "한국테크" + - Original: "1,500,000" → Synthetic: "2,340,000" + - Original: "2024-01-15" → Synthetic: "2024-03-22" + - Original: "서울시 강남구" → Synthetic: "부산시 해운대구" + + Remember: The synthetic table should look like a completely different dataset from the same domain, not a copy of the original. generate_synthetic_table_from_image: | - You are a Synthetic Data Generator specialized in Korean documents. - Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic data. + You are a Synthetic Data Generator specialized in creating completely NEW data from Korean document images. + + **CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA** + Your task is NOT to OCR/transcribe the image. Instead, you must: + 1. Understand the table's STRUCTURE from the image + 2. Understand the DOMAIN and data patterns from the image + 3. Generate COMPLETELY NEW synthetic data that fits the same domain but uses different values **Inputs:** - 1. **Image:** An image of a table containing Korean text. + 1. **Image:** An image of a table containing Korean text. Use this to understand structure and domain ONLY. **Requirements:** 1. **Structure Preservation:** Accurately reconstruct the table structure, including `rowspan` and `colspan` attributes for merged cells. - 2. **Data Generation:** Replace ALL cell values with new, synthetic data. - - Use realistic Korean names, organizations, and values suitable for the context of the table in the image. - - Ensure the data is consistent with the column types (e.g., dates, numbers, text). - - Do NOT use real private data. - 3. **Styling:** Use **Tailwind CSS** classes to style the table. + 2. **Headers:** Keep header text (column names, category labels) the same as in the image. + 3. 
**Data Generation - CRITICAL:** + - **DO NOT copy the data values from the image** - this is NOT an OCR task + - Generate COMPLETELY NEW synthetic values for all data cells + - Analyze the domain from the image (finance, medical, public, etc.) and generate appropriate data + - For names: Generate new Korean names different from what you see + - For organizations: Generate new realistic Korean organization names + - For numbers: Generate new realistic numbers in similar ranges but different values + - For dates: Generate new plausible dates + - For addresses: Generate new realistic Korean addresses + - The synthetic table should look like a DIFFERENT dataset from the same domain + 4. **Styling:** Use **Tailwind CSS** classes to style the table. - Add `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` to the `` tag. - Add `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` to `
<th>` tags. - Add `class="border border-slate-300 p-2"` to `<td>` tags. - 4. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. Do not include markdown code blocks. + 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
`. No markdown code blocks. + + **Example of Expected Behavior:** + If the image shows a table with employee "김철수" and salary "3,500,000": + - WRONG (OCR copy): "김철수", "3,500,000" + - CORRECT (synthetic): "이영희", "4,200,000" + + Remember: The output should be a new synthetic dataset, not a transcription of the original. image_to_html: | You are an AI assistant specialized in OCR and HTML generation. diff --git a/generate_synthetic_table/prompts/finance.yaml b/generate_synthetic_table/prompts/finance.yaml index d610704..c17aaf8 100644 --- a/generate_synthetic_table/prompts/finance.yaml +++ b/generate_synthetic_table/prompts/finance.yaml @@ -94,37 +94,67 @@ generate_qa_from_image: | generate_synthetic_table: | You are a Synthetic Data Generator specializing in Financial Data. - Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic financial data. + + **CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA** + Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT financial data values. + The goal is to create realistic synthetic financial data that looks like it could come from the same domain, but with entirely different companies, amounts, and metrics. **Inputs:** - 1. **Original Table Structure:** + 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):** {html} - 2. **Table Summary:** + 2. **Table Summary (describes the data patterns to follow):** {summary} **Requirements:** - 1. **Structure:** Keep the exact same HTML structure. - 2. **Data:** Replace ALL cell values with new, synthetic financial data. - - Use realistic Korean company names (e.g., KOSPI listed mock names), stock tickers, and realistic currency values (KRW, USD). - - Contexts: Balance Sheets, Income Statements, Stock Portfolios, Tax Records. - - Do NOT use real private data. - 3. 
**Consistency:** Ensure mathematical consistency (e.g., Assets = Liabilities + Equity). - 4. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
`. + 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges). + 2. **Headers:** Keep header text the same (column names, category labels). + 3. **Data Transformation - MANDATORY:** + - **ALL data cell values MUST be replaced with completely new synthetic values.** + - **DO NOT copy any original data values** - generate fresh, realistic alternatives. + - For company names: Generate new Korean company names (e.g., "삼성전자" → "한국반도체", "현대자동차" → "동아모터스") + - For stock tickers: Generate new realistic tickers + - For financial figures: Generate new realistic amounts in KRW/USD (different values, same magnitude range) + - For percentages: Generate new realistic percentages + - For dates: Generate new plausible dates + 4. **Domain Consistency:** + - Ensure financial logic (Assets = Liabilities + Equity, Totals match sum of items) + - Use realistic Korean KOSPI/KOSDAQ style mock company names + - Contexts: Balance Sheets, Income Statements, Stock Portfolios, Tax Records + 5. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
`. + + **Example Transformation:** + - Original: "삼성전자" → Synthetic: "한국테크놀로지" + - Original: "15,340,000,000원" → Synthetic: "23,890,000,000원" + - Original: "PER 12.5" → Synthetic: "PER 8.7" + + Remember: The synthetic table should look like a completely different financial dataset from the same domain. generate_synthetic_table_from_image: | You are a Synthetic Data Generator specializing in Financial Data. - Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic financial data. + + **CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA** + Your task is NOT to OCR/transcribe the image. Instead, you must: + 1. Understand the table's STRUCTURE from the image + 2. Understand it's a FINANCIAL table + 3. Generate COMPLETELY NEW synthetic financial data that fits the domain but uses different values **Inputs:** - 1. **Image:** An image of a financial table. + 1. **Image:** An image of a financial table. Use this to understand structure and domain ONLY. **Requirements:** 1. **Structure Preservation:** Accurately reconstruct the table structure. - 2. **Data Generation:** Replace ALL cell values with new, synthetic financial data. - - Use realistic Korean company names, financial metrics. - 3. **Styling:** Use **Tailwind CSS** classes (same as default). + 2. **Headers:** Keep header text (column names, category labels) the same as in the image. + 3. **Data Generation - CRITICAL:** + - **DO NOT copy the data values from the image** - this is NOT an OCR task + - Generate COMPLETELY NEW synthetic financial values for all data cells + - For company names: Generate new Korean company names (different from what you see) + - For financial figures: Generate new realistic amounts (different values) + - For percentages/ratios: Generate new realistic metrics + 4. **Styling:** Use **Tailwind CSS** classes (same as default). 
- `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on `<table>`. - `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `<th>`. - `class="border border-slate-300 p-2"` on `<td>`. - 4. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. + 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
`. + + Remember: The output should be a new synthetic financial dataset, not a transcription of the original. diff --git a/generate_synthetic_table/prompts/insurance.yaml b/generate_synthetic_table/prompts/insurance.yaml index ce0c759..100f5c9 100644 --- a/generate_synthetic_table/prompts/insurance.yaml +++ b/generate_synthetic_table/prompts/insurance.yaml @@ -65,38 +65,67 @@ generate_qa_from_image: | generate_synthetic_table: | You are a Synthetic Data Generator specializing in Insurance Data. - Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic insurance data. + + **CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA** + Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT insurance data values. + The goal is to create realistic synthetic insurance data that looks like it could come from the same domain, but with entirely different plans, coverage amounts, and terms. **Inputs:** - 1. **Original Table Structure:** + 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):** {html} - 2. **Table Summary:** + 2. **Table Summary (describes the data patterns to follow):** {summary} **Requirements:** - 1. **Structure:** Keep the exact same HTML structure. - 2. **Data:** Replace ALL cell values with new, synthetic insurance data. - - Use realistic Korean insurance plan names (e.g., "SafeLife Plus", "Family Care"), coverage types, premiums (KRW), and terms. - - Ensure consistency: e.g., higher premiums for better coverage. - - Do NOT use real private data. - 3. **Consistency:** Ensure mathematical consistency (e.g., monthly premium * 12 = annual). - 4. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
`. + 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges). + 2. **Headers:** Keep header text the same (column names, category labels). + 3. **Data Transformation - MANDATORY:** + - **ALL data cell values MUST be replaced with completely new synthetic values.** + - **DO NOT copy any original data values** - generate fresh, realistic alternatives. + - For plan names: Generate new Korean insurance plan names (e.g., "안심보장플러스" → "가족사랑보험", "SafeLife" → "행복지킴이") + - For coverage amounts: Generate new realistic amounts in KRW (different values) + - For premiums: Generate new realistic premium amounts + - For terms/conditions: Generate new coverage terms + - For dates: Generate new plausible dates + 4. **Domain Consistency:** + - Ensure insurance logic (higher premiums for better coverage, monthly*12=annual) + - Use realistic Korean insurance terminology + - Contexts: Life insurance, Health insurance, Auto insurance + 5. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
`. + + **Example Transformation:** + - Original: "무배당 건강보험" → Synthetic: "실속형 의료보험" + - Original: "보장한도 1억원" → Synthetic: "보장한도 5천만원" + - Original: "월 보험료 35,000원" → Synthetic: "월 보험료 52,000원" + + Remember: The synthetic table should look like a completely different insurance dataset from the same domain. generate_synthetic_table_from_image: | You are a Synthetic Data Generator specializing in Insurance Data. - Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic insurance data. + + **CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA** + Your task is NOT to OCR/transcribe the image. Instead, you must: + 1. Understand the table's STRUCTURE from the image + 2. Understand it's an INSURANCE table + 3. Generate COMPLETELY NEW synthetic insurance data that fits the domain but uses different values **Inputs:** - 1. **Image:** An image of an insurance table. + 1. **Image:** An image of an insurance table. Use this to understand structure and domain ONLY. **Requirements:** 1. **Structure Preservation:** Accurately reconstruct the table structure. - 2. **Data Generation:** Replace ALL cell values with new, synthetic insurance data. - - Use realistic Korean insurance terms, plan names, coverage amounts (KRW). - - Contexts: Life insurance, Health insurance, Auto insurance, etc. - 3. **Styling:** Use **Tailwind CSS** classes (same as default). + 2. **Headers:** Keep header text (column names, category labels) the same as in the image. + 3. **Data Generation - CRITICAL:** + - **DO NOT copy the data values from the image** - this is NOT an OCR task + - Generate COMPLETELY NEW synthetic insurance values for all data cells + - For plan names: Generate new insurance plan names (different from what you see) + - For coverage/premiums: Generate new realistic amounts + - For terms: Generate new coverage terms + 4. **Styling:** Use **Tailwind CSS** classes (same as default). 
- `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on `<table>`. - `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `<th>`. - `class="border border-slate-300 p-2"` on `<td>`. - 4. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. + 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
`. + + Remember: The output should be a new synthetic insurance dataset, not a transcription of the original. diff --git a/generate_synthetic_table/prompts/medical.yaml b/generate_synthetic_table/prompts/medical.yaml index d11f8f8..edaaed4 100644 --- a/generate_synthetic_table/prompts/medical.yaml +++ b/generate_synthetic_table/prompts/medical.yaml @@ -95,37 +95,67 @@ generate_qa_from_image: | generate_synthetic_table: | You are a Synthetic Data Generator specializing in Medical Data. - Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic medical data. + + **CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA** + Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT medical data values. + The goal is to create realistic synthetic medical data that looks like it could come from the same domain, but with entirely different patients, diagnoses, and values. **Inputs:** - 1. **Original Table Structure:** + 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):** {html} - 2. **Table Summary:** + 2. **Table Summary (describes the data patterns to follow):** {summary} **Requirements:** - 1. **Structure:** Keep the exact same HTML structure. - 2. **Data:** Replace ALL cell values with new, synthetic medical data. - - Use realistic Korean patient names (pseudonymized), diagnosis codes (ICD-10 style but synthetic), medication names, and lab values. - - Contexts: Patient Charts, Lab Reports, Prescription Lists, Clinical Trials. - - Do NOT use real private data. - 3. **Consistency:** Ensure medical consistency (e.g., proper units for blood pressure, temperature). - 4. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
`. + 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges). + 2. **Headers:** Keep header text the same (column names, category labels). + 3. **Data Transformation - MANDATORY:** + - **ALL data cell values MUST be replaced with completely new synthetic values.** + - **DO NOT copy any original data values** - generate fresh, realistic alternatives. + - For patient names: Generate new Korean pseudonymized names (e.g., "홍길동" → "김영수", "환자A" → "환자B") + - For diagnosis codes: Generate new ICD-10 style codes (synthetic) + - For lab values: Generate new realistic values within normal/abnormal ranges + - For medications: Generate new realistic medication names and dosages + - For dates: Generate new plausible dates + 4. **Domain Consistency:** + - Ensure medical logic (proper units for BP, temperature, lab values) + - Use realistic Korean medical terminology + - Contexts: Patient Charts, Lab Reports, Prescription Lists, Clinical Trials + 5. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
`. + + **Example Transformation:** + - Original: "환자ID-001" → Synthetic: "환자ID-078" + - Original: "혈압 120/80" → Synthetic: "혈압 135/85" + - Original: "아스피린 100mg" → Synthetic: "타이레놀 500mg" + + Remember: The synthetic table should look like a completely different medical dataset from the same domain. generate_synthetic_table_from_image: | You are a Synthetic Data Generator specializing in Medical Data. - Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic medical data. + + **CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA** + Your task is NOT to OCR/transcribe the image. Instead, you must: + 1. Understand the table's STRUCTURE from the image + 2. Understand it's a MEDICAL table + 3. Generate COMPLETELY NEW synthetic medical data that fits the domain but uses different values **Inputs:** - 1. **Image:** An image of a medical table. + 1. **Image:** An image of a medical table. Use this to understand structure and domain ONLY. **Requirements:** 1. **Structure Preservation:** Accurately reconstruct the table structure. - 2. **Data Generation:** Replace ALL cell values with new, synthetic medical data. - - Use realistic Korean medical terms, patient info (synthetic). - 3. **Styling:** Use **Tailwind CSS** classes (same as default). + 2. **Headers:** Keep header text (column names, category labels) the same as in the image. + 3. **Data Generation - CRITICAL:** + - **DO NOT copy the data values from the image** - this is NOT an OCR task + - Generate COMPLETELY NEW synthetic medical values for all data cells + - For patient names/IDs: Generate new pseudonymized identifiers + - For lab values: Generate new realistic values + - For diagnoses/medications: Generate new names and codes + 4. **Styling:** Use **Tailwind CSS** classes (same as default). - `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on ``. 
- `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `<th>
`. - `class="border border-slate-300 p-2"` on `<td>`. - 4. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
`. + 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
`. + + Remember: The output should be a new synthetic medical dataset, not a transcription of the original. diff --git a/generate_synthetic_table/prompts/public.yaml b/generate_synthetic_table/prompts/public.yaml index 2850db7..57827d1 100644 --- a/generate_synthetic_table/prompts/public.yaml +++ b/generate_synthetic_table/prompts/public.yaml @@ -94,37 +94,67 @@ generate_qa_from_image: | generate_synthetic_table: | You are a Synthetic Data Generator specializing in Public Sector/Government Data. - Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic public data. + + **CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA** + Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT public sector data values. + The goal is to create realistic synthetic government/public data that looks like it could come from the same domain, but with entirely different regions, departments, and statistics. **Inputs:** - 1. **Original Table Structure:** + 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):** {html} - 2. **Table Summary:** + 2. **Table Summary (describes the data patterns to follow):** {summary} **Requirements:** - 1. **Structure:** Keep the exact same HTML structure. - 2. **Data:** Replace ALL cell values with new, synthetic public data. - - Use realistic Korean administrative region names (e.g., Sejong-si, Mapo-gu), government department names, and statistical values. - - Contexts: Census Data, Budget Reports, Public Facility Status, Regional Statistics. - - Do NOT use real private data. - 3. **Consistency:** Ensure mathematical consistency (e.g., Subtotals match Grand Total). - 4. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
`. + 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges). + 2. **Headers:** Keep header text the same (column names, category labels). + 3. **Data Transformation - MANDATORY:** + - **ALL data cell values MUST be replaced with completely new synthetic values.** + - **DO NOT copy any original data values** - generate fresh, realistic alternatives. + - For regions: Generate new Korean administrative regions (e.g., "서울특별시" → "부산광역시", "강남구" → "해운대구") + - For departments: Generate new government department names + - For statistics: Generate new realistic numbers (different values, similar magnitude) + - For population/budget figures: Generate new plausible values + - For dates: Generate new plausible dates + 4. **Domain Consistency:** + - Ensure statistical logic (Subtotals match Grand Total, percentages add up) + - Use realistic Korean administrative region names + - Contexts: Census Data, Budget Reports, Public Facility Status, Regional Statistics + 5. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
`. + + **Example Transformation:** + - Original: "서울특별시" → Synthetic: "대전광역시" + - Original: "인구 9,776,000명" → Synthetic: "인구 1,489,000명" + - Original: "예산집행률 87.5%" → Synthetic: "예산집행률 92.3%" + + Remember: The synthetic table should look like a completely different public sector dataset from the same domain. generate_synthetic_table_from_image: | You are a Synthetic Data Generator specializing in Public Sector/Government Data. - Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic public data. + + **CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA** + Your task is NOT to OCR/transcribe the image. Instead, you must: + 1. Understand the table's STRUCTURE from the image + 2. Understand it's a PUBLIC SECTOR/GOVERNMENT table + 3. Generate COMPLETELY NEW synthetic public data that fits the domain but uses different values **Inputs:** - 1. **Image:** An image of a public data table. + 1. **Image:** An image of a public data table. Use this to understand structure and domain ONLY. **Requirements:** 1. **Structure Preservation:** Accurately reconstruct the table structure. - 2. **Data Generation:** Replace ALL cell values with new, synthetic public data. - - Use realistic Korean region names, stats. - 3. **Styling:** Use **Tailwind CSS** classes (same as default). + 2. **Headers:** Keep header text (column names, category labels) the same as in the image. + 3. **Data Generation - CRITICAL:** + - **DO NOT copy the data values from the image** - this is NOT an OCR task + - Generate COMPLETELY NEW synthetic public sector values for all data cells + - For regions: Generate new Korean administrative regions (different from what you see) + - For statistics: Generate new realistic numbers (different values) + - For departments/organizations: Generate new names + 4. **Styling:** Use **Tailwind CSS** classes (same as default). 
- `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on `<table>`. - `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `<th>
`. - `class="border border-slate-300 p-2"` on `<td>`. - 4. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
`. + 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
`. + + Remember: The output should be a new synthetic public sector dataset, not a transcription of the original. From b56cb6982519ba75fb0fc0ee961bf6df83f6ec3e Mon Sep 17 00:00:00 2001 From: Seokhyun Seo Date: Wed, 21 Jan 2026 13:36:14 +0000 Subject: [PATCH 2/2] Update run_openai_public.sh and run_pipeline_json.py for new provider and model configurations; enhance style randomization in HTML output generation. --- generate_synthetic_table/flow.py | 21 +- .../prompts/academic.yaml | 91 +++--- .../prompts/business.yaml | 91 +++--- generate_synthetic_table/prompts/default.yaml | 125 +++++--- generate_synthetic_table/prompts/finance.yaml | 89 +++--- .../prompts/insurance.yaml | 90 +++--- generate_synthetic_table/prompts/medical.yaml | 89 +++--- generate_synthetic_table/prompts/public.yaml | 89 +++--- run_openai_public.sh | 16 +- run_pipeline_json.py | 289 +++++++++++++++++- 10 files changed, 693 insertions(+), 297 deletions(-) diff --git a/generate_synthetic_table/flow.py b/generate_synthetic_table/flow.py index 0177147..8be68dd 100644 --- a/generate_synthetic_table/flow.py +++ b/generate_synthetic_table/flow.py @@ -765,6 +765,7 @@ def build_synthetic_table_graph( llm: ChatOpenAI, provider: str = "openai", qa_only: bool = False, + skip_qa: bool = False, ) -> StateGraph: """ Assemble the LangGraph pipeline. 
@@ -773,6 +774,7 @@ def build_synthetic_table_graph( llm: LLM instance provider: LLM provider name qa_only: If True, generate QA directly from image without synthetic data generation + skip_qa: If True, skip QA generation after table generation (table only mode) """ graph = StateGraph(TableState) @@ -783,7 +785,7 @@ def build_synthetic_table_graph( graph.add_edge(START, "generate_qa_from_image") graph.add_edge("generate_qa_from_image", END) else: - # Full pipeline mode + # Full pipeline mode (or table-only mode if skip_qa=True) graph.add_node("image_to_html", image_to_html_node(llm)) graph.add_node("pymupdf_parse", pymupdf_parse_node) graph.add_node("validate_parsed_table", validate_parsed_table_node(llm)) @@ -795,7 +797,9 @@ def build_synthetic_table_graph( graph.add_node("self_reflection", self_reflection_node(llm)) graph.add_node("revise_synthetic_table", revise_synthetic_table_node(llm)) graph.add_node("parse_synthetic_table", parse_synthetic_table_node(llm)) - graph.add_node("generate_qa", generate_qa_node(llm)) + + if not skip_qa: + graph.add_node("generate_qa", generate_qa_node(llm)) # Routing based on provider and input type def route_start(state: TableState) -> str: @@ -842,8 +846,13 @@ def route_start(state: TableState) -> str: ) graph.add_edge("revise_synthetic_table", "self_reflection") - graph.add_edge("parse_synthetic_table", "generate_qa") - graph.add_edge("generate_qa", END) + + # Final edge: skip QA if requested + if skip_qa: + graph.add_edge("parse_synthetic_table", END) + else: + graph.add_edge("parse_synthetic_table", "generate_qa") + graph.add_edge("generate_qa", END) return graph @@ -914,6 +923,7 @@ def run_synthetic_table_flow( azure_deployment: str | None = None, azure_endpoint: str | None = None, qa_only: bool = False, + skip_qa: bool = False, image_paths: List[str] | None = None, domain: str | None = None, # 체크포인팅 옵션 @@ -935,6 +945,7 @@ def run_synthetic_table_flow( azure_deployment: Azure OpenAI deployment name azure_endpoint: Azure 
OpenAI endpoint URL qa_only: If True, skip synthetic data generation and only generate QA from image + skip_qa: If True, generate table only without QA generation image_paths: Optional list of image paths for multi-image processing domain: Optional domain for prompt customization (e.g. 'public') enable_checkpointing: 체크포인팅 활성화 여부 @@ -955,7 +966,7 @@ def run_synthetic_table_flow( config_path=config_path, ) - graph = build_synthetic_table_graph(llm, provider=provider, qa_only=qa_only) + graph = build_synthetic_table_graph(llm, provider=provider, qa_only=qa_only, skip_qa=skip_qa) # 체크포인팅 설정 if enable_checkpointing: diff --git a/generate_synthetic_table/prompts/academic.yaml b/generate_synthetic_table/prompts/academic.yaml index 47fafc2..87543eb 100644 --- a/generate_synthetic_table/prompts/academic.yaml +++ b/generate_synthetic_table/prompts/academic.yaml @@ -95,9 +95,8 @@ generate_qa_from_image: | generate_synthetic_table: | You are a Synthetic Data Generator specializing in Academic Data. - **CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA** + **⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️** Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT academic data values. - The goal is to create realistic synthetic academic data that looks like it could come from the same domain, but with entirely different students, courses, and metrics. **Inputs:** 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):** @@ -107,55 +106,67 @@ generate_synthetic_table: | {summary} **Requirements:** - 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges). + 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan). 2. **Headers:** Keep header text the same (column names, category labels). - 3. **Data Transformation - MANDATORY:** + 3. 
**⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:** - **ALL data cell values MUST be replaced with completely new synthetic values.** - - **DO NOT copy any original data values** - generate fresh, realistic alternatives. - - For student names: Generate new Korean student names (e.g., "김철수" → "이영희", "학생A" → "학생B") - - For university names: Generate new Korean university names - - For course titles: Generate new course names - - For grades/scores: Generate new realistic values - - For model names (if research table): Generate new model/method names - - For dates: Generate new plausible dates - 4. **Domain Consistency:** - - Ensure academic logic (credits sum correctly, GPA calculations valid) - - Use realistic Korean academic terminology - - Contexts: Transcripts, Research Papers, Enrollment Stats, Faculty Lists - 5. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
`. - - **Example Transformation:** - - Original: "서울대학교" → Synthetic: "고려대학교" - - Original: "학점 4.2" → Synthetic: "학점 3.8" - - Original: "BERT-Large" → Synthetic: "RoBERTa-Base" - - Remember: The synthetic table should look like a completely different academic dataset from the same domain. + - **NEVER copy any original data values** - generate fresh, realistic alternatives. + - For student/model names: Generate DIFFERENT names + - For university names: Generate DIFFERENT names + - For grades/scores: Generate DIFFERENT realistic values + - For course/research topics: Generate DIFFERENT titles + - For dates: Generate DIFFERENT plausible dates + 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:** + - Look at the original image's color scheme and design + - Use appropriate Tailwind color classes to match the original style + - Basic structure: `` + - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors + - Lists: `class="list-disc ml-5 space-y-1"` + - **DO NOT use inline style attributes** + 5. **Domain Consistency:** Ensure academic logic (credits sum correctly, GPA valid) + 6. **Output:** Return ONLY the raw HTML string starting with `
` and ending with `
`. No markdown code blocks. + + **Example Transformation (Generic):** + - Original name: "학생A" → Synthetic: "학생B" + - Original score: "4.0" → Synthetic: "3.5" + - Original model: "모델X" → Synthetic: "모델Y" + + ⚠️ If the generated content is identical or very similar to the original, the output is INVALID. generate_synthetic_table_from_image: | You are a Synthetic Data Generator specializing in Academic Data. - **CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA** + **⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️** Your task is NOT to OCR/transcribe the image. Instead, you must: 1. Understand the table's STRUCTURE from the image 2. Understand it's an ACADEMIC table - 3. Generate COMPLETELY NEW synthetic academic data that fits the domain but uses different values + 3. Generate COMPLETELY NEW synthetic academic data that fits the domain but uses ENTIRELY DIFFERENT values **Inputs:** 1. **Image:** An image of an academic table. Use this to understand structure and domain ONLY. **Requirements:** - 1. **Structure Preservation:** Accurately reconstruct the table structure. - 2. **Headers:** Keep header text (column names, category labels) the same as in the image. - 3. **Data Generation - CRITICAL:** - - **DO NOT copy the data values from the image** - this is NOT an OCR task - - Generate COMPLETELY NEW synthetic academic values for all data cells - - For student/model names: Generate new names (different from what you see) - - For grades/scores: Generate new realistic values - - For course/research topics: Generate new titles - 4. **Styling:** Use **Tailwind CSS** classes (same as default). - - `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on ``. - - `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `
<th>`. - - `class="border border-slate-300 p-2"` on `<td>`. - 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
`. - - Remember: The output should be a new synthetic academic dataset, not a transcription of the original. + 1. **Structure Preservation:** Accurately reconstruct the table structure, including rowspan/colspan. + 2. **Headers:** Keep header text the same as in the image. + 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:** + - **NEVER copy the data values from the image** - this is NOT an OCR task + - **ALL cell content must be completely NEW and DIFFERENT** + - For student/model names: Generate DIFFERENT names + - For grades/scores: Generate DIFFERENT values + - For course/research topics: Generate DIFFERENT titles + 4. **Styling:** Use **Tailwind CSS** classes exclusively (NO inline styles). + - ``: `class="w-full border-collapse text-sm"` + - ``: `class="bg-gradient-to-r from-indigo-700 to-indigo-800 text-white"` + - ``: `class="divide-y divide-slate-200"` + - `` (body rows): `class="hover:bg-indigo-50 transition-colors"` + - `
<th>`: `class="border border-indigo-300 px-4 py-3 font-semibold text-left"` + - `<td>
`: `class="border border-slate-200 px-4 py-3 text-slate-700"` + - `<ul>
    `: `class="list-disc ml-5 space-y-1 text-slate-600"` + - **DO NOT use inline style attributes** + 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
    `. No markdown code blocks. + + **Example (Generic):** + - Name in image: "이름X" → Generate: "이름Y" + - Score in image: "점수A" → Generate: "점수B" + + ⚠️ If the generated content is identical or very similar to the image, the output is INVALID. diff --git a/generate_synthetic_table/prompts/business.yaml b/generate_synthetic_table/prompts/business.yaml index c57b506..18ebc27 100644 --- a/generate_synthetic_table/prompts/business.yaml +++ b/generate_synthetic_table/prompts/business.yaml @@ -95,7 +95,7 @@ generate_qa_from_image: | generate_synthetic_table: | You are a Synthetic Data Generator specializing in Business Data. - **CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA** + **⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️** Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT business data values. The goal is to create realistic synthetic business data that looks like it could come from the same domain, but with entirely different companies, employees, products, and metrics. @@ -107,54 +107,77 @@ generate_synthetic_table: | {summary} **Requirements:** - 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges). - 2. **Headers:** Keep header text the same (column names, category labels). - 3. **Data Transformation - MANDATORY:** + 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan). + 2. **Headers:** Keep header text the same (column names, category labels like 기업경쟁력, 시장경쟁력). + 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:** - **ALL data cell values MUST be replaced with completely new synthetic values.** - - **DO NOT copy any original data values** - generate fresh, realistic alternatives. 
- - For company names: Generate new Korean company names (e.g., "삼성물산" → "현대상사", "A팀" → "B팀") - - For employee names: Generate new Korean names - - For product names: Generate new product line names - - For revenue/sales figures: Generate new realistic amounts (different values) - - For dates: Generate new plausible dates - 4. **Domain Consistency:** + - **NEVER copy any original data values** - generate fresh, realistic alternatives. + - For company/team names: Generate DIFFERENT names (e.g., "A팀" → "B팀") + - For employee names: Generate DIFFERENT Korean names (e.g., "김OO" → "박OO") + - For business metrics: Generate DIFFERENT numbers (e.g., "100억" → "150억") + - For strategy/description text: Write DIFFERENT content with similar structure + - For bullet point items: Create DIFFERENT but domain-appropriate content + 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:** + - Look at the original image's color scheme and design + - Use appropriate Tailwind color classes to match the original style + - Basic structure: `` + - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors + - Lists: `class="list-disc ml-5 space-y-1"` + - **DO NOT use inline style attributes** + 5. **Domain Consistency:** - Ensure business logic (Q1+Q2+Q3+Q4=Total, percentages add up) - Use realistic Korean business terminology - Contexts: Sales Reports, Inventory, HR Employee Lists, Marketing Campaigns - 5. **Output:** Return ONLY the raw HTML string starting with `
    <table>` and ending with `</table>
    `. + 6. **Output:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
    `. No markdown code blocks. - **Example Transformation:** - - Original: "영업1팀" → Synthetic: "마케팅2팀" - - Original: "매출 5억원" → Synthetic: "매출 7.3억원" - - Original: "김부장" → Synthetic: "박과장" + **Example Transformation (Generic):** + - Original name: "A팀" → Synthetic: "B팀" + - Original amount: "5억원" → Synthetic: "7.3억원" + - Original description: "신규 사업 추진" → Synthetic: "해외 시장 진출" + ⚠️ If the generated content is identical or very similar to the original, the output is INVALID. Remember: The synthetic table should look like a completely different business dataset from the same domain. generate_synthetic_table_from_image: | You are a Synthetic Data Generator specializing in Business Data. - **CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA** + **⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️** Your task is NOT to OCR/transcribe the image. Instead, you must: - 1. Understand the table's STRUCTURE from the image - 2. Understand it's a BUSINESS table - 3. Generate COMPLETELY NEW synthetic business data that fits the domain but uses different values + 1. Understand the table's STRUCTURE from the image (rows, columns, merged cells, nested structures) + 2. Understand it's a BUSINESS table (기업경쟁력, 시장경쟁력, 매출, 실적 등) + 3. Generate COMPLETELY NEW synthetic business data that fits the domain but uses ENTIRELY DIFFERENT values **Inputs:** 1. **Image:** An image of a business table. Use this to understand structure and domain ONLY. **Requirements:** - 1. **Structure Preservation:** Accurately reconstruct the table structure. - 2. **Headers:** Keep header text (column names, category labels) the same as in the image. - 3. 
**Data Generation - CRITICAL:** - - **DO NOT copy the data values from the image** - this is NOT an OCR task - - Generate COMPLETELY NEW synthetic business values for all data cells - - For company/team names: Generate new names (different from what you see) - - For sales/revenue figures: Generate new realistic amounts - - For employee names: Generate new Korean names - 4. **Styling:** Use **Tailwind CSS** classes (same as default). - - `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on ``. - - `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `
    <th>`. - `class="border border-slate-300 p-2"` on `<td>`. - 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
    `. - + 1. **Structure Preservation:** Accurately reconstruct the table structure, including `rowspan` and `colspan` for merged cells. + 2. **Headers:** Keep header text (column names, category labels like 기업경쟁력, 차별화 요소) the same as in the image. + 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:** + - **NEVER copy the data values from the image** - this is NOT an OCR task + - **ALL cell content must be completely NEW and DIFFERENT from the original** + - Generate COMPLETELY NEW synthetic business values for all data cells: + * For company/team names: Generate DIFFERENT names (e.g., "A팀" → "B팀") + * For business metrics: Generate DIFFERENT numbers (e.g., "100억" → "150억") + * For strategy/description text: Write DIFFERENT content with similar structure + * For bullet point items: Create DIFFERENT but domain-appropriate items + * For employee names: Generate DIFFERENT Korean names (e.g., "김OO" → "박OO") + - The synthetic table should look like a COMPLETELY DIFFERENT business report from the same industry + 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:** + - Look at the original image's color scheme and design + - Use appropriate Tailwind color classes to match the original style + - Basic structure: `` + - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors + - Lists: `class="list-disc ml-5 space-y-1"` + - **DO NOT use inline style attributes** + 5. **Output Format:** Return ONLY the raw HTML string starting with `
    <table>` and ending with `</table>
    `. No markdown code blocks. + + **Example of Expected Behavior (Generic):** + If the image shows a business table with: + - Team name: "영업팀" → Generate different: "마케팅팀" + - Revenue: "10억원" → Generate different: "15억원" + - Strategy: "시장 확대" → Generate different: "신규 진출" + - Bullet point items → Generate completely different items + + ⚠️ If the generated content is identical or very similar to the image, the output is INVALID. Remember: The output should be a new synthetic business dataset, not a transcription of the original. diff --git a/generate_synthetic_table/prompts/default.yaml b/generate_synthetic_table/prompts/default.yaml index a75c978..0ca2645 100644 --- a/generate_synthetic_table/prompts/default.yaml +++ b/generate_synthetic_table/prompts/default.yaml @@ -80,7 +80,7 @@ generate_qa_from_image: | generate_synthetic_table: | You are a Synthetic Data Generator specialized in creating completely NEW data while preserving table structure. - **CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA** + **⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️** Your task is to generate a new HTML table that has the SAME STRUCTURE as the original but with COMPLETELY DIFFERENT, newly generated data values. The goal is to create realistic synthetic data that looks like it could come from the same domain, but with entirely different entities, names, numbers, and values. @@ -94,37 +94,42 @@ generate_synthetic_table: | **Requirements:** 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, rowspan, colspan, merges) as the original. 2. **Headers:** Keep header text the same (column names, row labels that describe categories). - 3. **Data Transformation - MANDATORY:** + 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:** - **ALL data cell values MUST be replaced with completely new synthetic values.** - - **DO NOT copy any original data values** - generate fresh, realistic alternatives. 
- - For names: Generate new Korean names (e.g., 김철수 → 이영희, 박민수 → 정하늘) - - For organizations: Generate new realistic Korean organization names - - For numbers: Generate new realistic numbers that follow the same pattern/range but are different values - - For dates: Generate new plausible dates - - For addresses: Generate new realistic Korean addresses - - For any other text: Generate semantically similar but different content - 4. **Domain Consistency:** - - Analyze the summary to understand the domain context (finance, medical, public, etc.) + - **NEVER copy any original data values** - generate fresh, realistic alternatives. + - For names: Generate DIFFERENT Korean names (e.g., "김OO" → "박OO") + - For organizations: Generate DIFFERENT names (e.g., "A회사" → "B회사") + - For numbers: Generate DIFFERENT numbers in similar ranges + - For descriptions/text: Write DIFFERENT content with similar structure + - For dates: Generate DIFFERENT plausible dates + - For addresses: Generate DIFFERENT realistic Korean addresses + 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). Preserve the original table's visual style: + - Basic structure: `` + - Headers/cells: Include `border`, `px-4 py-3`, `font-semibold` as appropriate + - Lists: `class="list-disc ml-5 space-y-1"` + - **DO NOT use inline style attributes** + 5. **Domain Consistency:** + - Analyze the summary to understand the domain context - Generate data that is realistic for that specific domain - - Maintain internal consistency (e.g., totals should sum correctly, percentages should add up) - 5. **Output:** Return ONLY the raw HTML string starting with `
    <table>` and ending with `</table>
    `. No markdown code blocks. + - Maintain internal consistency (e.g., totals should sum correctly) + 6. **Output:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
    `. No markdown code blocks. - **Example Transformation (showing the expected behavior):** - - Original: "삼성전자" → Synthetic: "한국테크" - - Original: "1,500,000" → Synthetic: "2,340,000" - - Original: "2024-01-15" → Synthetic: "2024-03-22" - - Original: "서울시 강남구" → Synthetic: "부산시 해운대구" + **Example Transformation:** + - Original name: "A회사" → Synthetic: "B회사" + - Original number: "1,500,000" → Synthetic: "2,340,000" + - Original text: "[어떤 내용]" → Synthetic: "[다른 내용]" - Remember: The synthetic table should look like a completely different dataset from the same domain, not a copy of the original. + ⚠️ If the generated content is identical or very similar to the original, the output is INVALID. + Remember: The synthetic table should look like a completely different dataset from the same domain. generate_synthetic_table_from_image: | You are a Synthetic Data Generator specialized in creating completely NEW data from Korean document images. - **CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA** + **⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️** Your task is NOT to OCR/transcribe the image. Instead, you must: - 1. Understand the table's STRUCTURE from the image + 1. Understand the table's STRUCTURE from the image (rows, columns, merged cells, nested structures) 2. Understand the DOMAIN and data patterns from the image - 3. Generate COMPLETELY NEW synthetic data that fits the same domain but uses different values + 3. Generate COMPLETELY NEW synthetic data that fits the same domain but uses ENTIRELY DIFFERENT values **Inputs:** 1. **Image:** An image of a table containing Korean text. Use this to understand structure and domain ONLY. @@ -132,28 +137,35 @@ generate_synthetic_table_from_image: | **Requirements:** 1. **Structure Preservation:** Accurately reconstruct the table structure, including `rowspan` and `colspan` attributes for merged cells. 2. **Headers:** Keep header text (column names, category labels) the same as in the image. - 3. 
**Data Generation - CRITICAL:** - - **DO NOT copy the data values from the image** - this is NOT an OCR task - - Generate COMPLETELY NEW synthetic values for all data cells - - Analyze the domain from the image (finance, medical, public, etc.) and generate appropriate data - - For names: Generate new Korean names different from what you see - - For organizations: Generate new realistic Korean organization names - - For numbers: Generate new realistic numbers in similar ranges but different values - - For dates: Generate new plausible dates - - For addresses: Generate new realistic Korean addresses - - The synthetic table should look like a DIFFERENT dataset from the same domain - 4. **Styling:** Use **Tailwind CSS** classes to style the table. - - Add `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` to the `` tag. - - Add `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` to `
    ` tags. - - Add `class="border border-slate-300 p-2"` to `` tags. + 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:** + - **NEVER copy the data values from the image** - this is NOT an OCR task + - **ALL cell content must be completely NEW and DIFFERENT from the original** + - Generate COMPLETELY NEW synthetic values for all data cells: + * For names: Generate DIFFERENT Korean names (e.g., "김OO" → "박OO") + * For organizations/teams: Generate DIFFERENT names (e.g., "A팀" → "B팀") + * For numbers/amounts: Generate DIFFERENT numbers in similar ranges + * For descriptions/text: Write DIFFERENT content with similar structure + * For dates: Generate DIFFERENT plausible dates + - The synthetic table should look like a COMPLETELY DIFFERENT dataset from the same domain + 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe the original image's visual style and mimic it:** + - Look at the original image's color scheme (header background, borders, text colors) + - Match the visual design as closely as possible using appropriate Tailwind color classes + - If the original has colored headers, use similar colors (e.g., blue → `bg-blue-600`, gray → `bg-slate-600`) + - Basic structure classes: + * ``: `class="w-full border-collapse text-sm"` + * `
    <th>`: Include `border`, `px-4 py-3`, `font-semibold` and appropriate background color + * `<td>`: Include `border`, `px-4 py-3` + * `<ul>
      `: `class="list-disc ml-5 space-y-1"` + - **DO NOT use inline style attributes** (e.g., style="...") 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
      `. No markdown code blocks. **Example of Expected Behavior:** - If the image shows a table with employee "김철수" and salary "3,500,000": - - WRONG (OCR copy): "김철수", "3,500,000" - - CORRECT (synthetic): "이영희", "4,200,000" + - Name in image: "홍길동" → Generate: "김영수" + - Amount in image: "1,500만원" → Generate: "2,300만원" + - Style: If original has blue header → use blue Tailwind classes - Remember: The output should be a new synthetic dataset, not a transcription of the original. + ⚠️ If the generated content is identical or very similar to the image, the output is INVALID. + Remember: The output should be a new synthetic dataset with similar visual style but different data. image_to_html: | You are an AI assistant specialized in OCR and HTML generation. @@ -163,11 +175,12 @@ image_to_html: | 1. **Structure Preservation:** Accurately reconstruct the table structure, including `rowspan` and `colspan` attributes for merged cells. 2. **Text Fidelity:** Transcribe the text within the cells exactly as it appears in the image, preserving the Korean language. 3. **Output Format:** Return ONLY the raw HTML string starting with `` and ending with `
      </table>`. Do not include markdown code blocks (```html ... ```) or any other text. - 4. **Styling:** Use **Tailwind CSS** classes to style the table. - - Add `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` to the `<table>` tag. - - Add `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` to `
      ` tags. - - Add `class="border border-slate-300 p-2"` to `` tags. - - Ensure the design looks clean and professional, similar to a standard document table. + 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Mimic the original image's visual style:** + - Observe the original image's colors (header background, borders, text) + - Use appropriate Tailwind color classes to match the original style + - Basic structure: `` + - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors + - Lists: `class="list-disc ml-5 space-y-1"` parse_contents: | Analyze the following HTML table and summarize its schema and content patterns. @@ -233,12 +246,19 @@ self_reflection: | You are a strict QA reviewer for synthetic HTML tables. Check: - 1) table structure matches original HTML + 1) table structure matches original HTML (rows, columns, merges) 2) column count / header hierarchy 3) row consistency 4) cell type realism (numbers vs text) 5) validity of HTML tags 6) no missing/extra columns + 7) **⚠️ DATA ORIGINALITY CHECK (CRITICAL) ⚠️:** + - The synthetic data MUST be DIFFERENT from the original + - If data values are identical or very similar to the original, mark as FAILED + - Names, numbers, descriptions should all be different + 8) **Styling check:** + - Should use Tailwind CSS classes, NOT inline styles + - Check for style="..." attributes and flag them as issues Output ONLY valid JSON with this schema: @@ -246,7 +266,7 @@ self_reflection: | "passed": true/false, "score": 0-100, "issues": [ - {{"type": "structure|header|row_count|data_type|html_validity|other", + {{"type": "structure|header|row_count|data_type|html_validity|data_originality|styling|other", "detail": "..."}} ], "revision_instructions": "Concrete step-by-step instructions to fix the table." @@ -263,12 +283,19 @@ self_reflection_from_image: | You are a strict QA reviewer for synthetic HTML tables. 
Check: - 1) table structure matches the original table in the image + 1) table structure matches the original table in the image (rows, columns, merges) 2) column count / header hierarchy 3) row consistency 4) cell type realism (numbers vs text) 5) validity of HTML tags 6) no missing/extra columns + 7) **⚠️ DATA ORIGINALITY CHECK (CRITICAL) ⚠️:** + - The synthetic data MUST be DIFFERENT from what appears in the image + - If data values appear to be copied/transcribed from the image, mark as FAILED + - This is NOT an OCR task - data should be newly generated + 8) **Styling check:** + - Should use Tailwind CSS classes, NOT inline styles + - Check for style="..." attributes and flag them as issues Output ONLY valid JSON with this schema: @@ -276,7 +303,7 @@ self_reflection_from_image: | "passed": true/false, "score": 0-100, "issues": [ - {{"type": "structure|header|row_count|data_type|html_validity|other", + {{"type": "structure|header|row_count|data_type|html_validity|data_originality|styling|other", "detail": "..."}} ], "revision_instructions": "Concrete step-by-step instructions to fix the table." diff --git a/generate_synthetic_table/prompts/finance.yaml b/generate_synthetic_table/prompts/finance.yaml index c17aaf8..77d9927 100644 --- a/generate_synthetic_table/prompts/finance.yaml +++ b/generate_synthetic_table/prompts/finance.yaml @@ -95,9 +95,8 @@ generate_qa_from_image: | generate_synthetic_table: | You are a Synthetic Data Generator specializing in Financial Data. - **CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA** + **⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️** Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT financial data values. - The goal is to create realistic synthetic financial data that looks like it could come from the same domain, but with entirely different companies, amounts, and metrics. **Inputs:** 1. 
**Original Table Structure (for structure reference ONLY - DO NOT copy the data values):** @@ -107,54 +106,66 @@ generate_synthetic_table: | {summary} **Requirements:** - 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges). + 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan). 2. **Headers:** Keep header text the same (column names, category labels). - 3. **Data Transformation - MANDATORY:** + 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:** - **ALL data cell values MUST be replaced with completely new synthetic values.** - - **DO NOT copy any original data values** - generate fresh, realistic alternatives. - - For company names: Generate new Korean company names (e.g., "삼성전자" → "한국반도체", "현대자동차" → "동아모터스") - - For stock tickers: Generate new realistic tickers - - For financial figures: Generate new realistic amounts in KRW/USD (different values, same magnitude range) - - For percentages: Generate new realistic percentages - - For dates: Generate new plausible dates - 4. **Domain Consistency:** - - Ensure financial logic (Assets = Liabilities + Equity, Totals match sum of items) - - Use realistic Korean KOSPI/KOSDAQ style mock company names - - Contexts: Balance Sheets, Income Statements, Stock Portfolios, Tax Records - 5. **Output:** Return ONLY the raw HTML string starting with `
      <table>` and ending with `</table>
      `. - - **Example Transformation:** - - Original: "삼성전자" → Synthetic: "한국테크놀로지" - - Original: "15,340,000,000원" → Synthetic: "23,890,000,000원" - - Original: "PER 12.5" → Synthetic: "PER 8.7" - - Remember: The synthetic table should look like a completely different financial dataset from the same domain. + - **NEVER copy any original data values** - generate fresh, realistic alternatives. + - For company names: Generate DIFFERENT names (e.g., "A회사" → "B회사") + - For financial figures: Generate DIFFERENT amounts (similar magnitude, different values) + - For percentages/ratios: Generate DIFFERENT metrics + - For dates: Generate DIFFERENT plausible dates + 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:** + - Look at the original image's color scheme and design + - Use appropriate Tailwind color classes to match the original style + - Basic structure: `` + - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors + - Lists: `class="list-disc ml-5 space-y-1"` + - **DO NOT use inline style attributes** + 5. **Domain Consistency:** Ensure financial logic (Assets = Liabilities + Equity, Totals match) + 6. **Output:** Return ONLY the raw HTML string starting with `
      <table>` and ending with `</table>
      `. No markdown code blocks. + + **Example Transformation (Generic):** + - Original company: "A기업" → Synthetic: "B기업" + - Original amount: "100억원" → Synthetic: "150억원" + - Original ratio: "PER 10" → Synthetic: "PER 8" + + ⚠️ If the generated content is identical or very similar to the original, the output is INVALID. generate_synthetic_table_from_image: | You are a Synthetic Data Generator specializing in Financial Data. - **CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA** + **⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️** Your task is NOT to OCR/transcribe the image. Instead, you must: 1. Understand the table's STRUCTURE from the image 2. Understand it's a FINANCIAL table - 3. Generate COMPLETELY NEW synthetic financial data that fits the domain but uses different values + 3. Generate COMPLETELY NEW synthetic financial data that fits the domain but uses ENTIRELY DIFFERENT values **Inputs:** 1. **Image:** An image of a financial table. Use this to understand structure and domain ONLY. **Requirements:** - 1. **Structure Preservation:** Accurately reconstruct the table structure. - 2. **Headers:** Keep header text (column names, category labels) the same as in the image. - 3. **Data Generation - CRITICAL:** - - **DO NOT copy the data values from the image** - this is NOT an OCR task - - Generate COMPLETELY NEW synthetic financial values for all data cells - - For company names: Generate new Korean company names (different from what you see) - - For financial figures: Generate new realistic amounts (different values) - - For percentages/ratios: Generate new realistic metrics - 4. **Styling:** Use **Tailwind CSS** classes (same as default). - - `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on ``. - - `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `
      <th>`. - - `class="border border-slate-300 p-2"` on `<td>`. - 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
      `. - - Remember: The output should be a new synthetic financial dataset, not a transcription of the original. + 1. **Structure Preservation:** Accurately reconstruct the table structure, including rowspan/colspan. + 2. **Headers:** Keep header text the same as in the image. + 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:** + - **NEVER copy the data values from the image** - this is NOT an OCR task + - **ALL cell content must be completely NEW and DIFFERENT** + - For company names: Generate DIFFERENT names + - For financial figures: Generate DIFFERENT amounts + - For percentages/ratios: Generate DIFFERENT metrics + 4. **Styling:** Use **Tailwind CSS** classes exclusively (NO inline styles). + - ``: `class="w-full border-collapse text-sm"` + - ``: `class="bg-gradient-to-r from-green-700 to-green-800 text-white"` + - ``: `class="divide-y divide-slate-200"` + - `` (body rows): `class="hover:bg-green-50 transition-colors"` + - `
      <th>`: `class="border border-green-300 px-4 py-3 font-semibold text-left"` + - `
      <td>`: `class="border border-slate-200 px-4 py-3 text-slate-700"` + - `
        <ul>`: `class="list-disc ml-5 space-y-1 text-slate-600"` + - **DO NOT use inline style attributes** + 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
        `. No markdown code blocks. + + **Example (Generic):** + - Company in image: "X회사" → Generate: "Y회사" + - Amount in image: "50억" → Generate: "80억" + + ⚠️ If the generated content is identical or very similar to the image, the output is INVALID. diff --git a/generate_synthetic_table/prompts/insurance.yaml b/generate_synthetic_table/prompts/insurance.yaml index 100f5c9..4521288 100644 --- a/generate_synthetic_table/prompts/insurance.yaml +++ b/generate_synthetic_table/prompts/insurance.yaml @@ -66,9 +66,8 @@ generate_qa_from_image: | generate_synthetic_table: | You are a Synthetic Data Generator specializing in Insurance Data. - **CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA** + **⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️** Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT insurance data values. - The goal is to create realistic synthetic insurance data that looks like it could come from the same domain, but with entirely different plans, coverage amounts, and terms. **Inputs:** 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):** @@ -78,54 +77,67 @@ generate_synthetic_table: | {summary} **Requirements:** - 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges). + 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan). 2. **Headers:** Keep header text the same (column names, category labels). - 3. **Data Transformation - MANDATORY:** + 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:** - **ALL data cell values MUST be replaced with completely new synthetic values.** - - **DO NOT copy any original data values** - generate fresh, realistic alternatives. 
- - For plan names: Generate new Korean insurance plan names (e.g., "안심보장플러스" → "가족사랑보험", "SafeLife" → "행복지킴이") - - For coverage amounts: Generate new realistic amounts in KRW (different values) - - For premiums: Generate new realistic premium amounts - - For terms/conditions: Generate new coverage terms - - For dates: Generate new plausible dates - 4. **Domain Consistency:** - - Ensure insurance logic (higher premiums for better coverage, monthly*12=annual) - - Use realistic Korean insurance terminology - - Contexts: Life insurance, Health insurance, Auto insurance - 5. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
        `. - - **Example Transformation:** - - Original: "무배당 건강보험" → Synthetic: "실속형 의료보험" - - Original: "보장한도 1억원" → Synthetic: "보장한도 5천만원" - - Original: "월 보험료 35,000원" → Synthetic: "월 보험료 52,000원" - - Remember: The synthetic table should look like a completely different insurance dataset from the same domain. + - **NEVER copy any original data values** - generate fresh, realistic alternatives. + - For plan names: Generate DIFFERENT insurance plan names + - For coverage amounts: Generate DIFFERENT amounts + - For premiums: Generate DIFFERENT premium amounts + - For terms/conditions: Generate DIFFERENT coverage terms + - For dates: Generate DIFFERENT plausible dates + 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:** + - Look at the original image's color scheme and design + - Use appropriate Tailwind color classes to match the original style + - Basic structure: `` + - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors + - Lists: `class="list-disc ml-5 space-y-1"` + - **DO NOT use inline style attributes** + 5. **Domain Consistency:** Ensure insurance logic (higher premiums for better coverage) + 6. **Output:** Return ONLY the raw HTML string starting with `
        <table>` and ending with `</table>
        `. No markdown code blocks. + + **Example Transformation (Generic):** + - Original plan: "보험A" → Synthetic: "보험B" + - Original coverage: "1억원" → Synthetic: "5천만원" + - Original premium: "월 3만원" → Synthetic: "월 5만원" + + ⚠️ If the generated content is identical or very similar to the original, the output is INVALID. generate_synthetic_table_from_image: | You are a Synthetic Data Generator specializing in Insurance Data. - **CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA** + **⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️** Your task is NOT to OCR/transcribe the image. Instead, you must: 1. Understand the table's STRUCTURE from the image 2. Understand it's an INSURANCE table - 3. Generate COMPLETELY NEW synthetic insurance data that fits the domain but uses different values + 3. Generate COMPLETELY NEW synthetic insurance data that fits the domain but uses ENTIRELY DIFFERENT values **Inputs:** 1. **Image:** An image of an insurance table. Use this to understand structure and domain ONLY. **Requirements:** - 1. **Structure Preservation:** Accurately reconstruct the table structure. - 2. **Headers:** Keep header text (column names, category labels) the same as in the image. - 3. **Data Generation - CRITICAL:** - - **DO NOT copy the data values from the image** - this is NOT an OCR task - - Generate COMPLETELY NEW synthetic insurance values for all data cells - - For plan names: Generate new insurance plan names (different from what you see) - - For coverage/premiums: Generate new realistic amounts - - For terms: Generate new coverage terms - 4. **Styling:** Use **Tailwind CSS** classes (same as default). - - `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on ``. - - `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `
        <th>`. - - `class="border border-slate-300 p-2"` on `<td>`. - 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
        `. - - Remember: The output should be a new synthetic insurance dataset, not a transcription of the original. + 1. **Structure Preservation:** Accurately reconstruct the table structure, including rowspan/colspan. + 2. **Headers:** Keep header text the same as in the image. + 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:** + - **NEVER copy the data values from the image** - this is NOT an OCR task + - **ALL cell content must be completely NEW and DIFFERENT** + - For plan names: Generate DIFFERENT insurance plan names + - For coverage/premiums: Generate DIFFERENT amounts + - For terms: Generate DIFFERENT coverage terms + 4. **Styling:** Use **Tailwind CSS** classes exclusively (NO inline styles). + - ``: `class="w-full border-collapse text-sm"` + - ``: `class="bg-gradient-to-r from-amber-600 to-amber-700 text-white"` + - ``: `class="divide-y divide-slate-200"` + - `` (body rows): `class="hover:bg-amber-50 transition-colors"` + - `
        <th>`: `class="border border-amber-300 px-4 py-3 font-semibold text-left"` + - `
        <td>`: `class="border border-slate-200 px-4 py-3 text-slate-700"` + - `
          <ul>`: `class="list-disc ml-5 space-y-1 text-slate-600"` + - **DO NOT use inline style attributes** + 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
          `. No markdown code blocks. + + **Example (Generic):** + - Plan name in image: "보험X" → Generate: "보험Y" + - Amount in image: "금액A" → Generate: "금액B" + + ⚠️ If the generated content is identical or very similar to the image, the output is INVALID. diff --git a/generate_synthetic_table/prompts/medical.yaml b/generate_synthetic_table/prompts/medical.yaml index edaaed4..7bf995c 100644 --- a/generate_synthetic_table/prompts/medical.yaml +++ b/generate_synthetic_table/prompts/medical.yaml @@ -96,9 +96,8 @@ generate_qa_from_image: | generate_synthetic_table: | You are a Synthetic Data Generator specializing in Medical Data. - **CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA** + **⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️** Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT medical data values. - The goal is to create realistic synthetic medical data that looks like it could come from the same domain, but with entirely different patients, diagnoses, and values. **Inputs:** 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):** @@ -108,54 +107,66 @@ generate_synthetic_table: | {summary} **Requirements:** - 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges). + 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan). 2. **Headers:** Keep header text the same (column names, category labels). - 3. **Data Transformation - MANDATORY:** + 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:** - **ALL data cell values MUST be replaced with completely new synthetic values.** - - **DO NOT copy any original data values** - generate fresh, realistic alternatives. 
- - For patient names: Generate new Korean pseudonymized names (e.g., "홍길동" → "김영수", "환자A" → "환자B") - - For diagnosis codes: Generate new ICD-10 style codes (synthetic) - - For lab values: Generate new realistic values within normal/abnormal ranges - - For medications: Generate new realistic medication names and dosages - - For dates: Generate new plausible dates - 4. **Domain Consistency:** - - Ensure medical logic (proper units for BP, temperature, lab values) - - Use realistic Korean medical terminology - - Contexts: Patient Charts, Lab Reports, Prescription Lists, Clinical Trials - 5. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
          `. - - **Example Transformation:** - - Original: "환자ID-001" → Synthetic: "환자ID-078" - - Original: "혈압 120/80" → Synthetic: "혈압 135/85" - - Original: "아스피린 100mg" → Synthetic: "타이레놀 500mg" - - Remember: The synthetic table should look like a completely different medical dataset from the same domain. + - **NEVER copy any original data values** - generate fresh, realistic alternatives. + - For patient names/IDs: Generate DIFFERENT pseudonymized identifiers + - For lab values: Generate DIFFERENT realistic values + - For diagnoses/medications: Generate DIFFERENT names and codes + - For dates: Generate DIFFERENT plausible dates + 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:** + - Look at the original image's color scheme and design + - Use appropriate Tailwind color classes to match the original style + - Basic structure: `` + - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors + - Lists: `class="list-disc ml-5 space-y-1"` + - **DO NOT use inline style attributes** + 5. **Domain Consistency:** Ensure medical logic (proper units for BP, temperature, lab values) + 6. **Output:** Return ONLY the raw HTML string starting with `
          <table>` and ending with `</table>
          `. No markdown code blocks. + + **Example Transformation (Generic):** + - Original ID: "환자A" → Synthetic: "환자B" + - Original value: "120/80" → Synthetic: "135/85" + - Original medication: "약물X" → Synthetic: "약물Y" + + ⚠️ If the generated content is identical or very similar to the original, the output is INVALID. generate_synthetic_table_from_image: | You are a Synthetic Data Generator specializing in Medical Data. - **CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA** + **⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️** Your task is NOT to OCR/transcribe the image. Instead, you must: 1. Understand the table's STRUCTURE from the image 2. Understand it's a MEDICAL table - 3. Generate COMPLETELY NEW synthetic medical data that fits the domain but uses different values + 3. Generate COMPLETELY NEW synthetic medical data that fits the domain but uses ENTIRELY DIFFERENT values **Inputs:** 1. **Image:** An image of a medical table. Use this to understand structure and domain ONLY. **Requirements:** - 1. **Structure Preservation:** Accurately reconstruct the table structure. - 2. **Headers:** Keep header text (column names, category labels) the same as in the image. - 3. **Data Generation - CRITICAL:** - - **DO NOT copy the data values from the image** - this is NOT an OCR task - - Generate COMPLETELY NEW synthetic medical values for all data cells - - For patient names/IDs: Generate new pseudonymized identifiers - - For lab values: Generate new realistic values - - For diagnoses/medications: Generate new names and codes - 4. **Styling:** Use **Tailwind CSS** classes (same as default). - - `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on ``. - - `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `
          <th>`. - - `class="border border-slate-300 p-2"` on `<td>`. - 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
          `. - - Remember: The output should be a new synthetic medical dataset, not a transcription of the original. + 1. **Structure Preservation:** Accurately reconstruct the table structure, including rowspan/colspan. + 2. **Headers:** Keep header text the same as in the image. + 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:** + - **NEVER copy the data values from the image** - this is NOT an OCR task + - **ALL cell content must be completely NEW and DIFFERENT** + - For patient names/IDs: Generate DIFFERENT pseudonymized identifiers + - For lab values: Generate DIFFERENT realistic values + - For diagnoses/medications: Generate DIFFERENT names + 4. **Styling:** Use **Tailwind CSS** classes exclusively (NO inline styles). + - ``: `class="w-full border-collapse text-sm"` + - ``: `class="bg-gradient-to-r from-teal-700 to-teal-800 text-white"` + - ``: `class="divide-y divide-slate-200"` + - `` (body rows): `class="hover:bg-teal-50 transition-colors"` + - `
          <th>`: `class="border border-teal-300 px-4 py-3 font-semibold text-left"` + - `
          <td>`: `class="border border-slate-200 px-4 py-3 text-slate-700"` + - `
            <ul>`: `class="list-disc ml-5 space-y-1 text-slate-600"` + - **DO NOT use inline style attributes** + 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
            `. No markdown code blocks. + + **Example (Generic):** + - Patient ID in image: "환자X" → Generate: "환자Y" + - Value in image: "수치A" → Generate: "수치B" + + ⚠️ If the generated content is identical or very similar to the image, the output is INVALID. diff --git a/generate_synthetic_table/prompts/public.yaml b/generate_synthetic_table/prompts/public.yaml index 57827d1..b0c4099 100644 --- a/generate_synthetic_table/prompts/public.yaml +++ b/generate_synthetic_table/prompts/public.yaml @@ -95,9 +95,8 @@ generate_qa_from_image: | generate_synthetic_table: | You are a Synthetic Data Generator specializing in Public Sector/Government Data. - **CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA** + **⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️** Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT public sector data values. - The goal is to create realistic synthetic government/public data that looks like it could come from the same domain, but with entirely different regions, departments, and statistics. **Inputs:** 1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):** @@ -107,54 +106,66 @@ generate_synthetic_table: | {summary} **Requirements:** - 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges). + 1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan). 2. **Headers:** Keep header text the same (column names, category labels). - 3. **Data Transformation - MANDATORY:** + 3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:** - **ALL data cell values MUST be replaced with completely new synthetic values.** - - **DO NOT copy any original data values** - generate fresh, realistic alternatives. 
- - For regions: Generate new Korean administrative regions (e.g., "서울특별시" → "부산광역시", "강남구" → "해운대구") - - For departments: Generate new government department names - - For statistics: Generate new realistic numbers (different values, similar magnitude) - - For population/budget figures: Generate new plausible values - - For dates: Generate new plausible dates - 4. **Domain Consistency:** - - Ensure statistical logic (Subtotals match Grand Total, percentages add up) - - Use realistic Korean administrative region names - - Contexts: Census Data, Budget Reports, Public Facility Status, Regional Statistics - 5. **Output:** Return ONLY the raw HTML string starting with `` and ending with `
            `. - - **Example Transformation:** - - Original: "서울특별시" → Synthetic: "대전광역시" - - Original: "인구 9,776,000명" → Synthetic: "인구 1,489,000명" - - Original: "예산집행률 87.5%" → Synthetic: "예산집행률 92.3%" - - Remember: The synthetic table should look like a completely different public sector dataset from the same domain. + - **NEVER copy any original data values** - generate fresh, realistic alternatives. + - For regions: Generate DIFFERENT administrative region names + - For departments: Generate DIFFERENT department names + - For statistics: Generate DIFFERENT numbers (similar magnitude) + - For dates: Generate DIFFERENT plausible dates + 4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:** + - Look at the original image's color scheme and design + - Use appropriate Tailwind color classes to match the original style + - Basic structure: `` + - Headers/cells: Include `border`, `px-4 py-3`, appropriate colors + - Lists: `class="list-disc ml-5 space-y-1"` + - **DO NOT use inline style attributes** + 5. **Domain Consistency:** Ensure statistical logic (Subtotals match Grand Total) + 6. **Output:** Return ONLY the raw HTML string starting with `
            <table>` and ending with `</table>
            `. No markdown code blocks. + + **Example Transformation (Generic):** + - Original region: "A시" → Synthetic: "B시" + - Original statistic: "인구 100만" → Synthetic: "인구 150만" + - Original rate: "집행률 80%" → Synthetic: "집행률 90%" + + ⚠️ If the generated content is identical or very similar to the original, the output is INVALID. generate_synthetic_table_from_image: | You are a Synthetic Data Generator specializing in Public Sector/Government Data. - **CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA** + **⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️** Your task is NOT to OCR/transcribe the image. Instead, you must: 1. Understand the table's STRUCTURE from the image 2. Understand it's a PUBLIC SECTOR/GOVERNMENT table - 3. Generate COMPLETELY NEW synthetic public data that fits the domain but uses different values + 3. Generate COMPLETELY NEW synthetic public data that fits the domain but uses ENTIRELY DIFFERENT values **Inputs:** 1. **Image:** An image of a public data table. Use this to understand structure and domain ONLY. **Requirements:** - 1. **Structure Preservation:** Accurately reconstruct the table structure. - 2. **Headers:** Keep header text (column names, category labels) the same as in the image. - 3. **Data Generation - CRITICAL:** - - **DO NOT copy the data values from the image** - this is NOT an OCR task - - Generate COMPLETELY NEW synthetic public sector values for all data cells - - For regions: Generate new Korean administrative regions (different from what you see) - - For statistics: Generate new realistic numbers (different values) - - For departments/organizations: Generate new names - 4. **Styling:** Use **Tailwind CSS** classes (same as default). - - `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on ``. - - `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `
            <th>`. - - `class="border border-slate-300 p-2"` on `<td>`. - 5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>
            `. - - Remember: The output should be a new synthetic public sector dataset, not a transcription of the original. + 1. **Structure Preservation:** Accurately reconstruct the table structure, including rowspan/colspan. + 2. **Headers:** Keep header text the same as in the image. + 3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:** + - **NEVER copy the data values from the image** - this is NOT an OCR task + - **ALL cell content must be completely NEW and DIFFERENT** + - For regions: Generate DIFFERENT administrative region names + - For statistics: Generate DIFFERENT numbers + - For departments: Generate DIFFERENT names + 4. **Styling:** Use **Tailwind CSS** classes exclusively (NO inline styles). + - ``: `class="w-full border-collapse text-sm"` + - ``: `class="bg-gradient-to-r from-purple-700 to-purple-800 text-white"` + - ``: `class="divide-y divide-slate-200"` + - `` (body rows): `class="hover:bg-purple-50 transition-colors"` + - ` str: + """Build complete HTML document with fonts and styles.""" + + # Google Fonts link + font_link = "" + if style["font_url"]: + font_link = f'\n \n ' + + # Font family CSS + font_css = f""" + """ + + return f""" + + + + + + {font_link} + {font_css} + + +{table_html} + +""" + # Add parent directory to path to allow imports if running from root sys.path.append(str(Path(__file__).parent)) @@ -15,6 +202,72 @@ from generate_synthetic_table.flow import TableState from generate_synthetic_table.notion_uploader import NotionUploader + +def save_synthetic_table_as_html( + synthetic_table: str, + output_path: Path, + pair_id: str, + table_index: int, + randomize_style: bool = True +) -> Tuple[Optional[str], Optional[Dict[str, Any]]]: + """Save synthetic table as HTML file with optional style randomization. 
+ + Args: + synthetic_table: The HTML table string + output_path: Directory to save the file + pair_id: Identifier for the pair + table_index: Index of the table within the pair + randomize_style: Whether to apply random style variations + + Returns: + Tuple of (html_filepath, style_info) or (None, None) if failed + """ + if not synthetic_table: + return None, None + + # Clean up markdown code blocks if present + table_html = synthetic_table + if table_html.startswith("```html"): + table_html = table_html[7:] + if table_html.startswith("```"): + table_html = table_html[3:] + if table_html.endswith("```"): + table_html = table_html[:-3] + table_html = table_html.strip() + + # Apply style randomization if enabled + style_info = None + if randomize_style: + style_info = get_random_style() + table_html = apply_style_to_html(table_html, style_info) + full_html = build_html_document(table_html, style_info) + else: + # Basic HTML without style randomization + full_html = f""" + + + + + + + +{table_html} + +""" + + # Create html subdirectory + html_dir = output_path / "html" + html_dir.mkdir(parents=True, exist_ok=True) + + # Save file + safe_pair_id = "".join([c for c in pair_id if c.isalnum() or c in ('-', '_')]) + html_filename = f"{safe_pair_id}_table_{table_index}.html" + html_filepath = html_dir / html_filename + + html_filepath.write_text(full_html, encoding="utf-8") + + return str(html_filepath), style_info + def resolve_paths(pair: List[str], data_root: Path) -> List[Path]: """Resolves a list of relative paths to absolute Paths.""" paths = [] @@ -39,12 +292,14 @@ def process_single_pair( index: int, total_count: int, data_root: Path, + output_dir: Path, provider: str, model: str, config_path: str, arg_domain: str, qa_only: bool, - notion_uploader: Any + notion_uploader: Any, + randomize_style: bool = True ) -> Dict: """Process a single pair of images.""" @@ -132,6 +387,7 @@ def process_single_pair( model=model, config_path=config_path, qa_only=False, # We want the 
table + skip_qa=True, # Skip QA here - we'll generate QA for the pair later domain=domain ) @@ -139,12 +395,29 @@ def process_single_pair( if table_state.get("errors"): print(f" [Pair {index+1}] Error generating table: {table_state['errors']}") + # Save synthetic table as HTML file with style randomization + html_path = None + style_info = None + if table_state.get("synthetic_table"): + html_path, style_info = save_synthetic_table_as_html( + synthetic_table=table_state.get("synthetic_table"), + output_path=output_dir, + pair_id=pair_id, + table_index=len(temp_tables), + randomize_style=randomize_style + ) + if html_path: + style_desc = f" (font: {style_info['font_name']}, color: {style_info['color_name']})" if style_info else "" + print(f" [Pair {index+1}] Saved HTML: {html_path}{style_desc}") + # Filter state safe_state = { "image_path": str(path), "synthetic_table": table_state.get("synthetic_table"), "synthetic_json": table_state.get("synthetic_json"), "table_summary": table_state.get("table_summary"), + "html_path": html_path, + "style_info": style_info, # Store applied style for reference } temp_tables.append(safe_state) @@ -221,7 +494,8 @@ def run_pipeline( arg_domain: str = None, qa_only: bool = False, upload_to_notion: bool = False, - max_workers: int = 3 + max_workers: int = 3, + randomize_style: bool = True ): output_dir.mkdir(parents=True, exist_ok=True) @@ -249,12 +523,14 @@ def run_pipeline( i, total_count, data_root, + output_dir, provider, model, config_path, arg_domain, qa_only, - notion_uploader + notion_uploader, + randomize_style ): i for i, item in enumerate(json_input) } @@ -298,6 +574,8 @@ def main(): parser.add_argument("--qa-only", action="store_true", help="Skip table generation, only generate QA (applies to all domains)") parser.add_argument("--upload-to-notion", action="store_true", help="Upload QA results to Notion database") parser.add_argument("--max-workers", type=int, default=3, help="Maximum number of parallel workers (default: 3)") 
+ parser.add_argument("--randomize-style", action="store_true", default=True, help="Randomize HTML table styles (fonts, colors) for diversity (default: True)") + parser.add_argument("--no-randomize-style", dest="randomize_style", action="store_false", help="Disable style randomization") args = parser.parse_args() @@ -331,7 +609,8 @@ def main(): arg_domain=args.domain, qa_only=args.qa_only, upload_to_notion=args.upload_to_notion, - max_workers=args.max_workers + max_workers=args.max_workers, + randomize_style=args.randomize_style ) if __name__ == "__main__":
            `: `class="border border-purple-300 px-4 py-3 font-semibold text-left"` + - `
            `: `class="border border-slate-200 px-4 py-3 text-slate-700"` + - `
              `: `class="list-disc ml-5 space-y-1 text-slate-600"` + - **DO NOT use inline style attributes** + 5. **Output Format:** Return ONLY the raw HTML string starting with `` and ending with `
              `. No markdown code blocks. + + **Example (Generic):** + - Region in image: "X지역" → Generate: "Y지역" + - Statistic in image: "수치A" → Generate: "수치B" + + ⚠️ If the generated content is identical or very similar to the image, the output is INVALID. diff --git a/run_openai_public.sh b/run_openai_public.sh index 92bf58d..d90e1f6 100644 --- a/run_openai_public.sh +++ b/run_openai_public.sh @@ -5,9 +5,9 @@ # ============================================================================== # Default Configuration -INPUT_JSON="test_business_input.json" +INPUT_JSON="test_business.json" OUTPUT_DIR="output_business" -DEFAULT_ARGS="--provider openai --model gpt-5-mini --domain business" +DEFAULT_ARGS="--provider claude --model claude-sonnet-4-5 --domain business" # Check if the first argument is a JSON file path if [[ "$1" == *.json ]]; then @@ -20,8 +20,8 @@ echo " TableMagnifier - JSON Pipeline (Public)" echo "==============================================" echo "Input JSON: $INPUT_JSON" echo "Output Dir: $OUTPUT_DIR" -echo "Provider: openai" -echo "Model: gpt-5-mini" +echo "Provider: claude" +echo "Model: claude-sonnet-4-5" echo "Domain: business" echo "" echo "💡 Tip: To upload to Notion during pipeline execution:" @@ -31,13 +31,13 @@ echo "💡 To upload existing results later:" echo " python upload_to_notion_from_json.py $OUTPUT_DIR" echo "" -# Check for OPENAI_API_KEY -if [[ -z "$OPENAI_API_KEY" ]]; then - echo "⚠️ Warning: OPENAI_API_KEY is not set." +# Check for ANTHROPIC_API_KEY +if [[ -z "$ANTHROPIC_API_KEY" ]]; then + echo "⚠️ Warning: ANTHROPIC_API_KEY is not set." echo " Please set it in your environment or .env file." 
echo "" fi # Run the pipeline # Note: "$@" appends any remaining arguments, allowing overrides of defaults -uv run python run_pipeline_json.py --input "$INPUT_JSON" --output-dir "$OUTPUT_DIR" $DEFAULT_ARGS "$@" +uv run python run_pipeline_json.py --input "$INPUT_JSON" --output-dir "$OUTPUT_DIR" $DEFAULT_ARGS "$@" \ No newline at end of file diff --git a/run_pipeline_json.py b/run_pipeline_json.py index 0ff7276..d8fef9c 100644 --- a/run_pipeline_json.py +++ b/run_pipeline_json.py @@ -1,13 +1,200 @@ import argparse import json import os +import re +import random import sys from pathlib import Path -from typing import List, Dict, Any, Tuple +from typing import List, Dict, Any, Tuple, Optional from concurrent.futures import ThreadPoolExecutor, as_completed from dotenv import load_dotenv + +# ============================================================ +# Style Variation Configuration +# ============================================================ + +GOOGLE_FONTS = [ + ("Noto Sans KR", "Noto+Sans+KR:wght@400;500;600;700"), + ("Pretendard", None), # Self-hosted or system font + ("IBM Plex Sans KR", "IBM+Plex+Sans+KR:wght@400;500;600;700"), + ("Nanum Gothic", "Nanum+Gothic:wght@400;700"), + ("Nanum Myeongjo", "Nanum+Myeongjo:wght@400;700"), + ("Gothic A1", "Gothic+A1:wght@400;500;600;700"), + ("Do Hyeon", "Do+Hyeon"), + ("Jua", "Jua"), + ("Gowun Dodum", "Gowun+Dodum"), + ("Gowun Batang", "Gowun+Batang:wght@400;700"), +] + +COLOR_SCHEMES = [ + # (name, header_bg_from, header_bg_to, header_text, header_border, body_hover, body_border, text_color) + ("indigo", "indigo-600", "indigo-700", "white", "indigo-400", "indigo-50", "slate-200", "slate-700"), + ("slate", "slate-600", "slate-700", "white", "slate-500", "slate-50", "slate-300", "slate-700"), + ("emerald", "emerald-600", "emerald-700", "white", "emerald-400", "emerald-50", "slate-200", "slate-700"), + ("blue", "blue-600", "blue-700", "white", "blue-400", "blue-50", "slate-200", "slate-700"), + ("purple", "purple-600", 
"purple-700", "white", "purple-400", "purple-50", "slate-200", "slate-700"), + ("teal", "teal-600", "teal-700", "white", "teal-400", "teal-50", "slate-200", "slate-700"), + ("amber", "amber-600", "amber-700", "white", "amber-400", "amber-50", "slate-200", "slate-800"), + ("rose", "rose-600", "rose-700", "white", "rose-400", "rose-50", "slate-200", "slate-700"), + ("cyan", "cyan-600", "cyan-700", "white", "cyan-400", "cyan-50", "slate-200", "slate-700"), + ("stone", "stone-600", "stone-700", "white", "stone-500", "stone-50", "stone-300", "stone-700"), + # Light header variants + ("light-blue", "blue-100", "blue-200", "blue-900", "blue-300", "blue-50", "blue-200", "slate-700"), + ("light-gray", "gray-100", "gray-200", "gray-800", "gray-300", "gray-50", "gray-200", "gray-700"), + ("light-green", "green-100", "green-200", "green-900", "green-300", "green-50", "green-200", "slate-700"), +] + +TABLE_STYLES = [ + # (name, table_extra_classes, has_shadow, has_rounded, stripe_odd) + ("default", "", False, False, False), + ("shadow", "shadow-lg", True, False, False), + ("rounded", "rounded-lg overflow-hidden", False, True, False), + ("shadow-rounded", "shadow-lg rounded-lg overflow-hidden", True, True, False), + ("striped", "", False, False, True), + ("striped-rounded", "rounded-lg overflow-hidden", False, True, True), +] + +FONT_SIZES = ["text-xs", "text-sm", "text-base"] + + +def get_random_style() -> Dict[str, Any]: + """Generate a random style configuration.""" + font_name, font_url = random.choice(GOOGLE_FONTS) + color = random.choice(COLOR_SCHEMES) + table_style = random.choice(TABLE_STYLES) + font_size = random.choice(FONT_SIZES) + + return { + "font_name": font_name, + "font_url": font_url, + "color_name": color[0], + "header_bg_from": color[1], + "header_bg_to": color[2], + "header_text": color[3], + "header_border": color[4], + "body_hover": color[5], + "body_border": color[6], + "text_color": color[7], + "table_style_name": table_style[0], + "table_extra_classes": 
table_style[1], + "has_shadow": table_style[2], + "has_rounded": table_style[3], + "stripe_odd": table_style[4], + "font_size": font_size, + } + + +def apply_style_to_html(table_html: str, style: Dict[str, Any]) -> str: + """Apply style variations to the table HTML by replacing Tailwind classes.""" + html = table_html + + # Replace header gradient colors + # Pattern: bg-gradient-to-r from-{color}-{shade} to-{color}-{shade} + html = re.sub( + r'from-\w+-\d+\s+to-\w+-\d+', + f'from-{style["header_bg_from"]} to-{style["header_bg_to"]}', + html + ) + + # Replace header text color + html = re.sub( + r'(]*class="[^"]*?)text-white', + f'\\1text-{style["header_text"]}', + html + ) + + # Replace header border color + html = re.sub( + r'border-\w+-300(?=\s|")', + f'border-{style["header_border"]}', + html + ) + + # Replace hover color + html = re.sub( + r'hover:bg-\w+-50', + f'hover:bg-{style["body_hover"]}', + html + ) + + # Replace body border color + html = re.sub( + r'border-slate-200', + f'border-{style["body_border"]}', + html + ) + + # Replace text color + html = re.sub( + r'text-slate-700', + f'text-{style["text_color"]}', + html + ) + html = re.sub( + r'text-slate-600', + f'text-{style["text_color"]}', + html + ) + + # Replace font size in table tag + html = re.sub( + r'(]*class="[^"]*?)text-(?:xs|sm|base)', + f'\\1{style["font_size"]}', + html + ) + + # Add table extra classes (shadow, rounded) + if style["table_extra_classes"]: + html = re.sub( + r']*class="[^"]*hover:bg-)', + f'