Skip to content
Merged

Dev #2401

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
using System.IO;
using System.Threading.Tasks;

namespace Unity.AI.Extraction
{
/// <summary>
/// Contract for turning an attachment (file name, content, declared content type) into text.
/// </summary>
/// <remarks>
/// NOTE(review): this listing is taken from a rendered diff, so the byte[] and Stream signatures
/// below may be the before/after forms of one method rather than two coexisting overloads —
/// confirm against the checked-in file.
/// </remarks>
public interface ITextExtractionService
{
/// <summary>Returns a task yielding a string for a file whose content is supplied as a byte array.</summary>
Task<string> ExtractTextAsync(string fileName, byte[] fileContent, string contentType);
/// <summary>Returns a task yielding a string for a file whose content is supplied as a stream.</summary>
Task<string> ExtractTextAsync(string fileName, Stream fileContent, string contentType);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@ public class AttachmentSummaryRequest
[JsonPropertyName("fileName")]
public string FileName { get; set; } = string.Empty;

[JsonPropertyName("fileContent")]
public byte[] FileContent { get; set; } = System.Array.Empty<byte>();

[JsonPropertyName("contentType")]
public string ContentType { get; set; } = "application/octet-stream";

[JsonPropertyName("extractedText")]
public string? ExtractedText { get; set; }

[JsonPropertyName("promptVersion")]
public string? PromptVersion { get; set; }
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,30 +26,17 @@ public partial class TextExtractionService : ITextExtractionService, ITransientD
private const int MaxDocxTableCellsPerRow = 50;
private const int MaxPowerPointSlides = 200;
private readonly ILogger<TextExtractionService> _logger;
private readonly Dictionary<string, Func<string, byte[], string>> _extractorsByExtension;

/// <summary>
/// Stores the logger and builds the lookup from file extension to extractor delegate.
/// </summary>
/// <param name="logger">Logger assigned to <c>_logger</c>.</param>
public TextExtractionService(ILogger<TextExtractionService> logger)
{
_logger = logger;
// Keys are compared with OrdinalIgnoreCase, so extension casing does not affect lookups.
// NOTE(review): in this rendered diff a switch expression in ExtractTextAsync covers the same
// extensions, so this table may be removed-side content — confirm against the checked-in file.
_extractorsByExtension = new Dictionary<string, Func<string, byte[], string>>(StringComparer.OrdinalIgnoreCase)
{
// The four plain-text formats share one extractor; the file-name argument is discarded.
[".txt"] = (_, content) => ExtractTextFromTextFile(content),
[".csv"] = (_, content) => ExtractTextFromTextFile(content),
[".json"] = (_, content) => ExtractTextFromTextFile(content),
[".xml"] = (_, content) => ExtractTextFromTextFile(content),
// The remaining formats map to method groups that receive both file name and content.
[".pdf"] = ExtractTextFromPdfFile,
[".docx"] = ExtractTextFromWordDocx,
[".xls"] = ExtractTextFromExcelFile,
[".xlsx"] = ExtractTextFromExcelFile,
[".pptx"] = ExtractTextFromPowerPointFile
};
}

public Task<string> ExtractTextAsync(string fileName, byte[] fileContent, string contentType)
public Task<string> ExtractTextAsync(string fileName, Stream fileContent, string contentType)
{
if (fileContent == null || fileContent.Length == 0)
if (fileContent == null)
{
_logger.LogDebug("File content is empty for {FileName}", fileName);
_logger.LogDebug("File content stream is null for {FileName}", fileName);
return Task.FromResult(string.Empty);
}

Expand All @@ -64,48 +51,23 @@ public Task<string> ExtractTextAsync(string fileName, byte[] fileContent, string
return Task.FromResult(string.Empty);
}

if (_extractorsByExtension.TryGetValue(extension, out var extractor))
var rawText = extension switch
{
var rawText = extractor(fileName, fileContent);
return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
}

if (normalizedContentType.Contains("text/"))
{
var rawText = ExtractTextFromTextFile(fileContent);
return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
}

if (normalizedContentType.Contains("pdf"))
{
var rawText = ExtractTextFromPdfFile(fileName, fileContent);
return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
}

if (normalizedContentType.Contains("word") ||
normalizedContentType.Contains("msword") ||
normalizedContentType.Contains("officedocument.wordprocessingml"))
{
var rawText = ExtractTextFromWordDocx(fileName, fileContent);
return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
}

if (normalizedContentType.Contains("excel") || normalizedContentType.Contains("spreadsheet"))
".txt" or ".csv" or ".json" or ".xml" => ExtractTextFromTextFile(fileContent),
".pdf" => ExtractTextFromPdfFile(fileName, fileContent),
".docx" => ExtractTextFromWordDocx(fileName, fileContent),
".xls" or ".xlsx" => ExtractTextFromExcelFile(fileName, fileContent),
".pptx" => ExtractTextFromPowerPointFile(fileName, fileContent),
_ => ExtractByContentType(fileName, fileContent, normalizedContentType)
};

if (string.IsNullOrEmpty(rawText))
{
var rawText = ExtractTextFromExcelFile(fileName, fileContent);
return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
_logger.LogDebug("No text extraction available for content type {ContentType} with extension {Extension}",
contentType, extension);
}

if (normalizedContentType.Contains("presentation") ||
normalizedContentType.Contains("powerpoint"))
{
var rawText = ExtractTextFromPowerPointFile(fileName, fileContent);
return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
}

_logger.LogDebug("No text extraction available for content type {ContentType} with extension {Extension}",
contentType, extension);
return Task.FromResult(string.Empty);
return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
}
catch (Exception ex)
{
Expand All @@ -114,25 +76,64 @@ public Task<string> ExtractTextAsync(string fileName, byte[] fileContent, string
}
}

private string ExtractTextFromTextFile(byte[] fileContent)
/// <summary>
/// Chooses an extractor from substrings of <paramref name="normalizedContentType"/>, checked in the
/// order text/, pdf, word, excel/spreadsheet, presentation/powerpoint.
/// </summary>
/// <param name="fileName">Passed through to the format-specific extractors that accept it.</param>
/// <param name="fileContent">Stream handed to whichever extractor is selected.</param>
/// <param name="normalizedContentType">Content type string that the substring checks run against.</param>
/// <returns>The selected extractor's result, or <c>string.Empty</c> when no check matches.</returns>
private string ExtractByContentType(string fileName, Stream fileContent, string normalizedContentType)
{
// NOTE(review): the bare `try` below has no block of its own here; it looks like a
// removed-side line from the rendered diff — confirm against the checked-in file.
try
if (normalizedContentType.Contains("text/"))
{
// NOTE(review): the GetString call below is given fileContent, which is a Stream in this
// method; it also looks like a removed-side diff line — confirm.
var text = Encoding.UTF8.GetString(fileContent);
return ExtractTextFromTextFile(fileContent);
}
if (normalizedContentType.Contains("pdf"))
{
return ExtractTextFromPdfFile(fileName, fileContent);
}
// "msword" already contains "word", so the second check cannot match on its own.
if (normalizedContentType.Contains("word") ||
normalizedContentType.Contains("msword") ||
normalizedContentType.Contains("officedocument.wordprocessingml"))
{
return ExtractTextFromWordDocx(fileName, fileContent);
}
if (normalizedContentType.Contains("excel") || normalizedContentType.Contains("spreadsheet"))
{
return ExtractTextFromExcelFile(fileName, fileContent);
}
if (normalizedContentType.Contains("presentation") || normalizedContentType.Contains("powerpoint"))
{
return ExtractTextFromPowerPointFile(fileName, fileContent);
}
// No content-type hint matched: signal "nothing extracted" with an empty string.
return string.Empty;
}

if (text.Contains('\uFFFD'))
{
text = Encoding.ASCII.GetString(fileContent);
}
/// <summary>
/// Moves <paramref name="stream"/> back to its start when it reports that it can seek;
/// streams that cannot seek are left untouched.
/// </summary>
/// <param name="stream">Stream whose position is reset to zero when seeking is supported.</param>
private static void RewindIfPossible(Stream stream)
{
if (!stream.CanSeek)
{
// Forward-only stream: nothing to reset.
return;
}

stream.Position = 0;
}

if (text.Length > MaxExtractedTextLength)
private string ExtractTextFromTextFile(Stream fileContent)
{
try
{
RewindIfPossible(fileContent);
using var reader = new StreamReader(fileContent, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, bufferSize: 4096, leaveOpen: true);
var buffer = new char[Math.Min(MaxExtractedTextLength, 8192)];
var builder = new StringBuilder(capacity: Math.Min(MaxExtractedTextLength, 8192));
int read;
while ((read = reader.Read(buffer, 0, buffer.Length)) > 0)
{
text = text.Substring(0, MaxExtractedTextLength);
_logger.LogDebug("Truncated text content to {MaxLength} characters", MaxExtractedTextLength);
var remaining = MaxExtractedTextLength - builder.Length;
if (remaining <= 0) break;
builder.Append(buffer, 0, Math.Min(read, remaining));
if (builder.Length >= MaxExtractedTextLength)
{
_logger.LogDebug("Truncated text content to {MaxLength} characters", MaxExtractedTextLength);
break;
}
}

_logger.LogDebug("Extracted {CharacterCount} characters from text-based content.", text.Length);
return text;
_logger.LogDebug("Extracted {CharacterCount} characters from text-based content.", builder.Length);
return builder.ToString();
}
catch (Exception ex)
{
Expand All @@ -141,12 +142,12 @@ private string ExtractTextFromTextFile(byte[] fileContent)
}
}

private string ExtractTextFromPdfFile(string fileName, byte[] fileContent)
private string ExtractTextFromPdfFile(string fileName, Stream fileContent)
{
try
{
using var stream = new MemoryStream(fileContent, writable: false);
using var document = PdfDocument.Open(stream);
RewindIfPossible(fileContent);
using var document = PdfDocument.Open(fileContent);
var builder = new StringBuilder();
var processedPageCount = 0;
var pageTexts = document.GetPages()
Expand Down Expand Up @@ -177,12 +178,12 @@ private string ExtractTextFromPdfFile(string fileName, byte[] fileContent)
}
}

private string ExtractTextFromWordDocx(string fileName, byte[] fileContent)
private string ExtractTextFromWordDocx(string fileName, Stream fileContent)
{
try
{
using var stream = new MemoryStream(fileContent, writable: false);
using var document = new XWPFDocument(stream);
RewindIfPossible(fileContent);
using var document = new XWPFDocument(fileContent);
var builder = new StringBuilder();
var processedParagraphCount = AppendDocxParagraphText(document, builder);
var processedTableRowCount = AppendDocxTableText(document, builder);
Expand Down Expand Up @@ -268,12 +269,12 @@ private static int AppendDocxTableText(XWPFDocument document, StringBuilder buil
return processedTableRowCount;
}

private string ExtractTextFromExcelFile(string fileName, byte[] fileContent)
private string ExtractTextFromExcelFile(string fileName, Stream fileContent)
{
try
{
using var stream = new MemoryStream(fileContent, writable: false);
using var workbook = WorkbookFactory.Create(stream);
RewindIfPossible(fileContent);
using var workbook = WorkbookFactory.Create(fileContent);
var builder = new StringBuilder();
var sheetCount = Math.Min(workbook.NumberOfSheets, MaxExcelSheets);
var processedSheetCount = 0;
Expand Down Expand Up @@ -314,12 +315,12 @@ private string ExtractTextFromExcelFile(string fileName, byte[] fileContent)
}
}

private string ExtractTextFromPowerPointFile(string fileName, byte[] fileContent)
private string ExtractTextFromPowerPointFile(string fileName, Stream fileContent)
{
try
{
using var stream = new MemoryStream(fileContent, writable: false);
using var archive = new ZipArchive(stream, ZipArchiveMode.Read, leaveOpen: false);
RewindIfPossible(fileContent);
using var archive = new ZipArchive(fileContent, ZipArchiveMode.Read, leaveOpen: true);
var builder = new StringBuilder();
var slideEntries = GetOrderedPowerPointSlideEntries(archive)
.Take(MaxPowerPointSlides);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Unity.AI.Extraction;
using Unity.AI.Requests;
using Unity.GrantManager.Applications;
using Unity.GrantManager.Intakes;
Expand All @@ -12,24 +13,26 @@ namespace Unity.AI.Operations;

public class AttachmentSummaryService(
IApplicationChefsFileAttachmentRepository applicationChefsFileAttachmentRepository,
ISubmissionAppService submissionAppService,
IChefsFileAttachmentStreamProvider chefsFileAttachmentStreamProvider,
ITextExtractionService textExtractionService,
IAIService aiService,
ILogger<AttachmentSummaryService> logger) : IAttachmentSummaryService, ITransientDependency
{
private const string DefaultContentType = "application/octet-stream";
private const string SummaryGenerationFailedMessage = "AI summary generation failed.";

public async Task<string> GenerateAndSaveAsync(Guid attachmentId, string? promptVersion = null)
{
var attachment = await applicationChefsFileAttachmentRepository.GetAsync(attachmentId);
var fileName = string.IsNullOrWhiteSpace(attachment.FileName) ? "unknown" : attachment.FileName;
var (fileContent, contentType) = await GetAttachmentContentForSummaryAsync(attachment, fileName);

await using var attachmentStream = await OpenAttachmentStreamAsync(attachment, fileName);
var extractedText = await textExtractionService.ExtractTextAsync(fileName, attachmentStream.Content, attachmentStream.ContentType);

var summaryResponse = await aiService.GenerateAttachmentSummaryAsync(new AttachmentSummaryRequest
{
FileName = fileName,
FileContent = fileContent,
ContentType = contentType,
ContentType = attachmentStream.ContentType,
ExtractedText = extractedText,
PromptVersion = promptVersion,
});

Expand Down Expand Up @@ -68,37 +71,29 @@ public async Task<List<string>> GenerateForApplicationAsync(Guid applicationId,
return await GenerateAndSaveAsync(attachmentIds, promptVersion);
}

private async Task<(byte[] Content, string ContentType)> GetAttachmentContentForSummaryAsync(ApplicationChefsFileAttachment attachment, string fileName)
private async Task<ChefsFileAttachmentStream> OpenAttachmentStreamAsync(ApplicationChefsFileAttachment attachment, string fileName)
{
if (!Guid.TryParse(attachment.ChefsSubmissionId, out var submissionId) ||
!Guid.TryParse(attachment.ChefsFileId, out var fileId))
{
logger.LogWarning(
"Attachment {AttachmentId} has invalid CHEFS IDs. Falling back to metadata-only summary generation.",
attachment.Id);
return (Array.Empty<byte>(), DefaultContentType);
return ChefsFileAttachmentStream.Empty;
}

try
{
var fileDto = await submissionAppService.GetChefsFileAttachment(submissionId, fileId, fileName);
if (fileDto?.Content == null)
{
logger.LogWarning(
"Attachment {AttachmentId} has no retrievable content. Falling back to metadata-only summary generation.",
attachment.Id);
return (Array.Empty<byte>(), DefaultContentType);
}

return (fileDto.Content, string.IsNullOrWhiteSpace(fileDto.ContentType) ? DefaultContentType : fileDto.ContentType);
var stream = await chefsFileAttachmentStreamProvider.OpenAsync(submissionId, fileId, fileName);
return stream ?? ChefsFileAttachmentStream.Empty;
}
catch (Exception ex)
{
logger.LogWarning(
ex,
"Failed retrieving CHEFS content for attachment {AttachmentId}. Falling back to metadata-only summary generation.",
attachment.Id);
return (Array.Empty<byte>(), DefaultContentType);
return ChefsFileAttachmentStream.Empty;
}
}
}
Loading
Loading