-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
134 lines (108 loc) · 5.13 KB
/
main.py
File metadata and controls
134 lines (108 loc) · 5.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import os
from summarizer import summarize_doc_sections
from template_sections import extract_template_sections, create_template_sections
from dotenv import load_dotenv
#NEW
from docx import Document
#NEW
from docx.shared import RGBColor, Pt, Inches
#NEW
from docx.enum.text import WD_ALIGN_PARAGRAPH
import logging
# Load environment variables from the .env file. override=True is passed so
# that values from .env replace variables already set in the environment.
load_dotenv(override=True)

# Configure logging: INFO and above, each record prefixed with time and level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Model backend, read case-insensitively from the MODEL_TYPE variable.
# Options: "openai" (used when the variable is unset), "azure", "llama".
MODEL_TYPE = os.getenv("MODEL_TYPE", "openai").lower()

# Settings passed on to the summarizer. "model_type" is always present; the
# remaining keys depend on the selected backend.
model_config = {"model_type": MODEL_TYPE}

# Reject unknown backends before reading any backend-specific settings.
if MODEL_TYPE not in ("openai", "azure", "llama"):
    raise ValueError(f"Unsupported model type: {MODEL_TYPE}. Choose from 'openai', 'azure', or 'llama'.")

if MODEL_TYPE == "openai":
    # OpenAI: only an API key is required.
    model_config.update(api_key=os.getenv("OPENAI_API_KEY"))
    if not model_config["api_key"]:
        raise ValueError("Please set the OPENAI_API_KEY environment variable.")
elif MODEL_TYPE == "azure":
    # Azure OpenAI: key, endpoint and deployment name must all be provided.
    model_config.update(
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        deployment_name=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
    )
    if not (model_config["api_key"] and model_config["endpoint"] and model_config["deployment_name"]):
        raise ValueError("Please set all required Azure OpenAI environment variables: "
                         "AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_DEPLOYMENT_NAME")
elif MODEL_TYPE == "llama":
    # Llama: a model path must be configured and must exist on disk.
    model_config.update(model_path=os.getenv("LLAMA_MODEL_PATH"))
    if not model_config["model_path"]:
        raise ValueError("Please set the LLAMA_MODEL_PATH environment variable.")
    if not os.path.exists(model_config["model_path"]):
        raise FileNotFoundError(f"Llama model not found at '{model_config['model_path']}'")
# Directory that contains this script; the Word template is located under it.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Input/output folders and the PDF template are relative paths, so they are
# resolved against the working directory the script is started from.
INPUT_DIR = "input_doc"
OUTPUT_DIR = "output_doc"
TEMPLATE_PATH = "template_doc/1759_Prospect Curator_Template.pdf"
TEMPLATE_DOC_PATH = os.path.join(current_dir, "template_doc", "template.docx")

# Stop immediately if the Word template is missing. This check runs
# regardless of which template source is selected below.
if not os.path.exists(TEMPLATE_DOC_PATH):
    raise FileNotFoundError(f"Template file not found at '{TEMPLATE_DOC_PATH}'")

# Make sure the output folder exists before any summary is written.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Template source switch: True reads the summarization instructions from the
# PDF template, False builds them from the Word template instead.
USE_PDF_TEMPLATE = False

instructions = (
    extract_template_sections(TEMPLATE_PATH)
    if USE_PDF_TEMPLATE
    else create_template_sections(TEMPLATE_DOC_PATH)
)

# Order in which sections are written to the output document.
SECTION_ORDER = [
    "Prospect Curator",
    "Back of the Napkin",
    "The Pitch",
    "Rewards",
    "Client Needs",
    "The Elixir",
    "Insurance+",
    "Landmines",
    "Path to Close",
    "What Will Clients Ask?",
    "Sustain Success",
    "Look Ahead",
]
def create_formatted_docx(summaries, output_path):
    """Write the section summaries to a formatted Word document.

    Sections are written in the order given by SECTION_ORDER; sections that
    are not present in ``summaries`` are skipped. Each section gets a bold,
    dark-blue 16 pt heading followed by its text, which is split into
    paragraphs on blank lines ("\\n\\n").

    Args:
        summaries: Mapping of section name to that section's summary text.
        output_path: Path the resulting .docx file is saved to.
    """
    doc = Document()

    # Set default paragraph spacing and font
    normal_style = doc.styles['Normal']
    normal_style.font.name = 'Calibri'
    normal_style.font.size = Pt(11)
    normal_style.paragraph_format.space_after = Pt(12)

    def add_section_header(text):
        """Append a left-aligned section heading containing ``text``."""
        header = doc.add_paragraph()
        header.alignment = WD_ALIGN_PARAGRAPH.LEFT
        # Spacing must be set through paragraph_format. Assigning
        # space_before/space_after directly on the Paragraph object only
        # creates a plain Python attribute and never reaches the document.
        header.paragraph_format.space_before = Pt(24)
        header.paragraph_format.space_after = Pt(12)
        run = header.add_run(text)
        run.bold = True
        run.font.size = Pt(16)
        run.font.color.rgb = RGBColor(0, 51, 102)  # Dark blue color

    # Add each section
    for section in SECTION_ORDER:
        if section not in summaries:
            continue
        add_section_header(section)

        # Drop blank chunks before numbering the paragraphs so that "last"
        # means the last paragraph actually written, even when the text ends
        # with trailing blank lines.
        paragraphs = [
            chunk.strip()
            for chunk in summaries[section].split('\n\n')
            if chunk.strip()
        ]
        last_index = len(paragraphs) - 1
        for index, text in enumerate(paragraphs):
            content = doc.add_paragraph()
            # Justify all paragraphs except the last one
            if index < last_index:
                content.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
            else:
                content.alignment = WD_ALIGN_PARAGRAPH.LEFT
            content.add_run(text)

    # Save the document
    doc.save(output_path)
# Summarize every PDF/DOCX file found in INPUT_DIR and write one formatted
# Word summary per input file into OUTPUT_DIR.
for filename in os.listdir(INPUT_DIR):
    # Only .pdf and .docx inputs are handled; the extension check ignores case.
    if not filename.lower().endswith((".pdf", ".docx")):
        continue

    input_path = os.path.join(INPUT_DIR, filename)
    # The output keeps the input's base name and gains a "_summary.docx" suffix.
    output_path = os.path.join(OUTPUT_DIR, os.path.splitext(filename)[0] + "_summary.docx")

    logging.info(f"Using model type: {MODEL_TYPE}")

    # Get summaries for all sections, then render them into the Word document.
    summaries = summarize_doc_sections(input_path, model_config, instructions)
    create_formatted_docx(summaries, output_path)

    logging.info(f"Successfully created summary at {output_path}")