Skip to content

Commit 266b919

Browse files
Merge pull request #44 from Pseudo-Lab/feature/batch-처리-token-문제-수정-issue
Feature/batch 처리 token 문제 수정 issue
2 parents 6d978de + e07cac3 commit 266b919

6 files changed

Lines changed: 268 additions & 67 deletions

File tree

.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,3 +73,10 @@ pipeline_ui/backend/uploads/*
7373
# Frontend (Node.js)
7474
pipeline_ui/frontend/node_modules/*
7575
pipeline_ui/frontend/package-lock.json
76+
77+
78+
I_origin_0/*
79+
I_origin_1/*
80+
I_origin_2/*
81+
82+
output/*

generate_synthetic_table/flow.py

Lines changed: 53 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -99,23 +99,47 @@ def _call_llm(
9999
if return_token_usage:
100100
# Extract token usage from response metadata
101101
token_usage = 0
102-
if hasattr(response, 'response_metadata'):
103-
usage_metadata = response.response_metadata.get('usage', {})
104-
# OpenAI/Gemini format
102+
103+
logger.info(f"=== TOKEN DEBUG START ===")
104+
logger.info(f"Response type: {type(response)}")
105+
logger.info(f"Has usage_metadata: {hasattr(response, 'usage_metadata')}")
106+
logger.info(f"Has response_metadata: {hasattr(response, 'response_metadata')}")
107+
108+
# Try response.usage_metadata first (Gemini pool format)
109+
if hasattr(response, 'usage_metadata') and response.usage_metadata:
110+
usage = response.usage_metadata
111+
logger.info(f"usage_metadata type: {type(usage)}")
112+
logger.info(f"usage_metadata value: {usage}")
113+
114+
if isinstance(usage, dict):
115+
token_usage = usage.get('total_tokens', 0)
116+
if not token_usage:
117+
token_usage = usage.get('input_tokens', 0) + usage.get('output_tokens', 0)
118+
else:
119+
token_usage = getattr(usage, 'total_tokens', 0)
120+
if not token_usage:
121+
token_usage = getattr(usage, 'input_tokens', 0) + getattr(usage, 'output_tokens', 0)
122+
123+
logger.info(f"Extracted token_usage from usage_metadata: {token_usage}")
124+
125+
# Fallback: response.response_metadata (OpenAI format)
126+
if not token_usage and hasattr(response, 'response_metadata'):
127+
metadata = response.response_metadata
128+
logger.info(f"response_metadata: {metadata}")
129+
usage_metadata = metadata.get('usage', {})
130+
logger.info(f"usage from response_metadata: {usage_metadata}")
131+
105132
token_usage = usage_metadata.get('total_tokens', 0)
106-
# Fallback: prompt_tokens + completion_tokens
107133
if not token_usage:
108134
token_usage = usage_metadata.get('prompt_tokens', 0) + usage_metadata.get('completion_tokens', 0)
109-
# Fallback: input_tokens + output_tokens
110135
if not token_usage:
111136
token_usage = usage_metadata.get('input_tokens', 0) + usage_metadata.get('output_tokens', 0)
112-
# Alternative: usage_metadata attribute (dict or object)
113-
if not token_usage and hasattr(response, 'usage_metadata') and response.usage_metadata:
114-
usage = response.usage_metadata
115-
if isinstance(usage, dict):
116-
token_usage = usage.get('total_tokens', 0) or (usage.get('input_tokens', 0) + usage.get('output_tokens', 0))
117-
else:
118-
token_usage = getattr(usage, 'total_tokens', 0) or (getattr(usage, 'input_tokens', 0) + getattr(usage, 'output_tokens', 0))
137+
138+
logger.info(f"Extracted token_usage from response_metadata: {token_usage}")
139+
140+
logger.info(f"Final token_usage: {token_usage}")
141+
logger.info(f"=== TOKEN DEBUG END ===")
142+
119143
return response_content, token_usage
120144

121145
return response_content
@@ -641,7 +665,11 @@ def _node(state: TableState) -> TableState:
641665
errors.append(f"QA prompt missing placeholder: {e}")
642666
return {**state, "errors": errors}
643667

644-
response_text = _call_llm(llm, prompt)
668+
response_text, token_usage = _call_llm(llm, prompt, return_token_usage=True)
669+
670+
# Debug log for token usage
671+
logger.info(f"QA generation token usage: {token_usage}")
672+
645673
response_json = robust_json_parse(response_text)
646674

647675
qa_results = []
@@ -650,7 +678,8 @@ def _node(state: TableState) -> TableState:
650678
else:
651679
logger.warning("QA generation did not return valid JSON or 'qa_pairs' key.")
652680

653-
return {**state, "qa_results": qa_results}
681+
logger.info(f"Returning token_usage: {token_usage}")
682+
return {**state, "qa_results": qa_results, "token_usage": token_usage}
654683

655684
return _node
656685

@@ -685,6 +714,10 @@ def _node(state: TableState) -> TableState:
685714
prompt = prompt_template
686715

687716
response_text, token_usage = _call_llm(llm, prompt, image_urls=image_data_urls, return_token_usage=True)
717+
718+
# Debug log for token usage
719+
logger.info(f"QA generation token usage: {token_usage}")
720+
688721
response_json = robust_json_parse(response_text)
689722

690723
qa_results = []
@@ -693,6 +726,7 @@ def _node(state: TableState) -> TableState:
693726
else:
694727
logger.warning("QA generation from image did not return valid JSON or 'qa_pairs' key.")
695728

729+
logger.info(f"Returning token_usage: {token_usage}")
696730
return {**state, "qa_results": qa_results, "token_usage": token_usage}
697731

698732
return _node
@@ -877,6 +911,8 @@ def run_synthetic_table_flow(
877911
temperature: float = 0.2,
878912
base_url: str | None = None,
879913
config_path: str | None = None,
914+
azure_deployment: str | None = None,
915+
azure_endpoint: str | None = None,
880916
qa_only: bool = False,
881917
image_paths: List[str] | None = None,
882918
domain: str | None = None,
@@ -891,11 +927,13 @@ def run_synthetic_table_flow(
891927
892928
Args:
893929
image_path: Path to the input image or HTML file
894-
provider: LLM provider (openai, gemini, gemini_pool, claude, vllm)
930+
provider: LLM provider (openai, azure, gemini, gemini_pool, claude, vllm)
895931
model: Model name
896932
temperature: Sampling temperature
897933
base_url: Custom base URL for vLLM
898934
config_path: Config path for gemini_pool
935+
azure_deployment: Azure OpenAI deployment name
936+
azure_endpoint: Azure OpenAI endpoint URL
899937
qa_only: If True, skip synthetic data generation and only generate QA from image
900938
image_paths: Optional list of image paths for multi-image processing
901939
domain: Optional domain for prompt customization (e.g. 'public')

generate_synthetic_table/llm_factory.py

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,64 @@
11
from __future__ import annotations
22

33
import os
4+
import yaml
5+
from pathlib import Path
46
from typing import Optional
57

68
from langchain_core.language_models import BaseChatModel
7-
from langchain_openai import ChatOpenAI
9+
from langchain_openai import ChatOpenAI, AzureChatOpenAI
810
from langchain_google_genai import ChatGoogleGenerativeAI
911
from langchain_anthropic import ChatAnthropic
1012

1113
# polling_gemini 모듈에서 GeminiPoolChatModel 임포트
1214
from polling_gemini import GeminiPoolChatModel, create_gemini_chat_model
1315

1416

17+
def _load_azure_config_from_yaml(config_path: Optional[str] = None) -> dict:
18+
"""Load Azure OpenAI configuration from gemini_keys.yaml file."""
19+
if not config_path:
20+
config_path = "apis/gemini_keys.yaml"
21+
22+
config_file = Path(config_path)
23+
if not config_file.exists():
24+
return {}
25+
26+
try:
27+
with open(config_file, 'r', encoding='utf-8') as f:
28+
config = yaml.safe_load(f)
29+
30+
return {
31+
'api_key': config.get('AZURE_OPENAI_API_KEY'),
32+
'endpoint': config.get('AZURE_OPENAI_ENDPOINT'),
33+
'api_version': config.get('AZURE_OPENAI_API_VERSION'),
34+
'deployment': config.get('AZURE_OPENAI_DEPLOYMENT_NAME'),
35+
}
36+
except Exception:
37+
return {}
38+
39+
1540
def get_llm(
1641
provider: str,
1742
model: str,
1843
temperature: float = 0.2,
1944
base_url: Optional[str] = None,
2045
api_key: Optional[str] = None,
2146
config_path: Optional[str] = None,
47+
azure_deployment: Optional[str] = None,
48+
azure_endpoint: Optional[str] = None,
2249
) -> BaseChatModel:
2350
"""
2451
Factory to create a Chat Model based on the provider.
2552
2653
Args:
27-
provider: 'openai', 'gemini', 'gemini_pool', 'claude', or 'vllm'
54+
provider: 'openai', 'azure', 'gemini', 'gemini_pool', 'claude', or 'vllm'
2855
model: Model name (e.g., 'gpt-4', 'gemini-1.5-flash', 'claude-sonnet-4-20250514')
2956
temperature: Sampling temperature
3057
base_url: Optional base URL for vLLM or custom OpenAI endpoints
3158
api_key: Optional API key override
3259
config_path: Optional config path for gemini_pool (apis/gemini_keys.yaml)
60+
azure_deployment: Azure OpenAI deployment name (required for azure provider)
61+
azure_endpoint: Azure OpenAI endpoint URL (required for azure provider)
3362
3463
Returns:
3564
A configured LangChain Chat Model
@@ -66,6 +95,30 @@ def get_llm(
6695
temperature=temperature,
6796
)
6897

98+
elif provider == "azure":
99+
# Azure OpenAI
100+
# 우선순위: CLI 파라미터 > config_path의 yaml 파일 > 환경변수
101+
yaml_config = _load_azure_config_from_yaml(config_path)
102+
103+
azure_key = api_key or yaml_config.get('api_key') or os.getenv("AZURE_OPENAI_API_KEY")
104+
azure_ep = azure_endpoint or yaml_config.get('endpoint') or os.getenv("AZURE_OPENAI_ENDPOINT")
105+
azure_dep = azure_deployment or yaml_config.get('deployment') or model
106+
azure_ver = yaml_config.get('api_version') or os.getenv("AZURE_OPENAI_API_VERSION") or "2024-02-15-preview"
107+
108+
if not azure_key or not azure_ep:
109+
raise ValueError(
110+
"Azure OpenAI requires AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT. "
111+
"Set via environment variables, --azure-* arguments, or in apis/gemini_keys.yaml."
112+
)
113+
114+
return AzureChatOpenAI(
115+
azure_deployment=azure_dep,
116+
azure_endpoint=azure_ep,
117+
api_key=azure_key,
118+
api_version=azure_ver,
119+
temperature=temperature,
120+
)
121+
69122
elif provider == "claude":
70123
# Anthropic Claude API
71124
# ANTHROPIC_API_KEY 환경변수 또는 api_key 파라미터 사용

generate_synthetic_table/prompts/insurance.yaml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,17 @@ generate_qa: |
1515
3. **Language:** The questions and answers MUST be in Korean.
1616
4. **Reasoning Language:** reasoning_annotation MUST be written in English and MUST be a single string (not a list).
1717
5. **Accuracy:** Ensure all answers are factually correct based on the data in the table.
18+
6. **Context:** Include the specific table cells or rows that were used to answer the question.
1819
1920
**Output Format (JSON):**
2021
{{
2122
"qa_pairs": [
2223
{{
2324
"question": "...",
2425
"answer": "...",
25-
"type": "lookup"
26+
"type": "lookup",
27+
"reasoning_annotation": "Detailed explanation of how the answer was derived (in English, single string)",
28+
"context": "Specific table cells/rows used (e.g., 'Row 2, Column 3: Premium amount')"
2629
}},
2730
...
2831
]
@@ -43,14 +46,17 @@ generate_qa_from_image: |
4346
4. **Language:** The questions and answers MUST be in Korean.
4447
5. **Reasoning Language:** reasoning_annotation MUST be written in English and MUST be a single string (not a list).
4548
6. **Accuracy:** Ensure 100% factual correctness.
49+
7. **Context:** Include specific cell references or table sections used to derive the answer.
4650
4751
**Output Format (JSON):**
4852
{{
4953
"qa_pairs": [
5054
{{
5155
"question": "...",
5256
"answer": "...",
53-
"type": "lookup"
57+
"type": "lookup",
58+
"reasoning_annotation": "Step-by-step reasoning process in English (single string)",
59+
"context": "Table location used (e.g., 'Premium column, Row 3')"
5460
}},
5561
...
5662
]

0 commit comments

Comments
 (0)