diff --git a/app/agents/file_analyzer_agent.rb b/app/agents/file_analyzer_agent.rb index 6984058..2f3487f 100644 --- a/app/agents/file_analyzer_agent.rb +++ b/app/agents/file_analyzer_agent.rb @@ -15,9 +15,19 @@ class FileAnalyzerAgent < ApplicationAgent def analyze_pdf @file_path = params[:file_path] - # Read PDF content (would need pdf-reader gem) - @content = extract_pdf_content(@file_path) if @file_path + @analysis_type = params[:analysis_type] || "general" + + if @file_path + result = PdfTextExtractor.new(@file_path).extract + @content = result[:pages].values.join("\n\n---\n\n") + @page_count = result[:page_count] + @metadata = result[:metadata] + end + setup_context_and_prompt + rescue PdfTextExtractor::ExtractionError => e + Rails.logger.error "[FileAnalyzer] PDF extraction failed: #{e.message}" + @content = "Error extracting PDF: #{e.message}" setup_context_and_prompt end @@ -47,14 +57,14 @@ def extract_image_text def extract_text @file_path = params[:file_path] if params[:file_path] - @content = extract_file_content(@file_path) if @file_path + @content = extract_document_content(@file_path) if @file_path setup_context_and_prompt end def summarize_document @file_path = params[:file_path] - @content = extract_file_content(@file_path) if @file_path + @content = extract_document_content(@file_path) if @file_path setup_context_and_prompt end @@ -172,16 +182,19 @@ def detect_content_type(file_path) end end - def extract_pdf_content(file_path) - # This would require pdf-reader gem - # For now, returning placeholder - "PDF content extraction would go here" - end + # Extract content using format-aware extractor when possible, falling back to raw read + def extract_document_content(file_path) + return nil unless file_path - def extract_file_content(file_path) + extractor = DocumentTextExtractor.new(file_path) + result = extractor.extract + result[:pages].values.join("\n\n") + rescue DocumentTextExtractor::UnsupportedFormatError + # Fall back to raw file read for unsupported formats + File.read(file_path) + rescue => e + Rails.logger.error "[FileAnalyzer] Content extraction failed: #{e.message}" File.read(file_path) - rescue - "Unable to read file content" end def broadcast_chunk(chunk) diff --git a/app/views/file_analyzer_agent/analyze_pdf.text.erb b/app/views/file_analyzer_agent/analyze_pdf.text.erb index 99dba24..c05bad8 100644 --- a/app/views/file_analyzer_agent/analyze_pdf.text.erb +++ b/app/views/file_analyzer_agent/analyze_pdf.text.erb @@ -1,3 +1,27 @@ -FileAnalyzer#analyze_pdf +Analyze the following PDF document and provide a comprehensive review. -<%= @message %>, find me in app/views/file_analyzer_agent/analyze_pdf.text.erb +<% if @metadata.present? %> +Document metadata: +<% if @metadata[:title].present? %>- Title: <%= @metadata[:title] %><% end %> +<% if @metadata[:author].present? %>- Author: <%= @metadata[:author] %><% end %> +<% if @metadata[:subject].present? %>- Subject: <%= @metadata[:subject] %><% end %> +<% if @metadata[:creation_date].present? %>- Created: <%= @metadata[:creation_date] %><% end %> +<% end %> +<% if @page_count.present? %> +Pages: <%= @page_count %> +<% end %> +<% if @analysis_type.present? && @analysis_type != "general" %> +Analysis focus: <%= @analysis_type %> +<% end %> + +--- DOCUMENT CONTENT --- +<%= @content %> +--- END DOCUMENT CONTENT --- + +Provide: +1. A brief summary of what this document is about +2. Key findings, data points, or arguments presented +3. Notable sections or topics covered +4. Any concerns, gaps, or areas that warrant further investigation + +Format your response in clear markdown with appropriate headings. diff --git a/app/views/file_analyzer_agent/extract_text.text.erb b/app/views/file_analyzer_agent/extract_text.text.erb index 5a156c4..4d4b157 100644 --- a/app/views/file_analyzer_agent/extract_text.text.erb +++ b/app/views/file_analyzer_agent/extract_text.text.erb @@ -1,3 +1,16 @@ -FileAnalyzer#extract_text +Extract and present the text content from this document in a clean, well-structured format. -<%= @message %>, find me in app/views/file_analyzer_agent/extract_text.text.erb +<% if @file_path.present? %> +File: <%= File.basename(@file_path) %> +<% end %> + +--- DOCUMENT CONTENT --- +<%= @content %> +--- END DOCUMENT CONTENT --- + +Instructions: +- Clean up any extraction artifacts (broken words, stray characters, garbled encoding) +- Preserve the document's logical structure (headings, sections, lists, tables) +- Format the output as readable markdown +- If tables are present, render them as markdown tables +- Indicate any sections where text was unreadable with [unclear] markers diff --git a/app/views/file_analyzer_agent/summarize_document.text.erb b/app/views/file_analyzer_agent/summarize_document.text.erb index ae7be12..9cfdb1c 100644 --- a/app/views/file_analyzer_agent/summarize_document.text.erb +++ b/app/views/file_analyzer_agent/summarize_document.text.erb @@ -1,3 +1,16 @@ -FileAnalyzer#summarize_document +Summarize the following document concisely for a due diligence review. -<%= @message %>, find me in app/views/file_analyzer_agent/summarize_document.text.erb +<% if @file_path.present? %> +File: <%= File.basename(@file_path) %> +<% end %> + +--- DOCUMENT CONTENT --- +<%= @content %> +--- END DOCUMENT CONTENT --- + +Provide: +1. **Executive Summary** - 2-3 sentence overview of the document +2. **Key Points** - Bulleted list of the most important facts, figures, and conclusions +3. **Relevance** - How this document may be relevant to an investment due diligence process + +Keep the summary concise and actionable. Focus on facts and data rather than filler. diff --git a/test/agents/file_analyzer_agent_test.rb b/test/agents/file_analyzer_agent_test.rb index 04bcb27..7fd3d5f 100644 --- a/test/agents/file_analyzer_agent_test.rb +++ b/test/agents/file_analyzer_agent_test.rb @@ -103,21 +103,21 @@ class FileAnalyzerAgentTest < ActiveSupport::TestCase assert_nil image_data end - test "extract_file_content reads file content" do - temp_file = @temp_dir.join('test.txt') + test "extract_document_content reads plain text files" do + temp_file = @temp_dir.join("test.txt") File.write(temp_file, "Hello, World!") agent = FileAnalyzerAgent.new - content = agent.send(:extract_file_content, temp_file.to_s) + content = agent.send(:extract_document_content, temp_file.to_s) assert_equal "Hello, World!", content end - test "extract_file_content returns error message when file cannot be read" do + test "extract_document_content returns nil for nil path" do agent = FileAnalyzerAgent.new - content = agent.send(:extract_file_content, '/nonexistent/file.txt') + content = agent.send(:extract_document_content, nil) - assert_equal "Unable to read file content", content + assert_nil content end test "encode_image returns nil for non-existent file" do