bcgov · JamesPasta · May 6, 2026 · Apr 30, 2026 · May 6, 2026 · May 6, 2026
diff --git a/...dules/Unity.AI/src/Unity.AI.Application.Contracts/AI/Extraction/ITextExtractionService.cs b/...dules/Unity.AI/src/Unity.AI.Application.Contracts/AI/Extraction/ITextExtractionService.cs
@@ -1,9 +1,25 @@
+using System.IO;
 using System.Threading.Tasks;
 
 namespace Unity.AI.Extraction
 {
+    /// <summary>
+    /// Extracts text from an attachment's content.
+    /// </summary>
     public interface ITextExtractionService
     {
-        Task<string> ExtractTextAsync(string fileName, byte[] fileContent, string contentType);
+        /// <summary>
+        /// Extracts text from an attachment supplied as a stream.
+        /// </summary>
+        /// <param name="fileName">File name of the attachment.</param>
+        /// <param name="fileContent">
+        /// Attachment content. TextExtractionService returns an empty string for a null stream and
+        /// rewinds seekable streams before reading them.
+        /// NOTE(review): ownership is not spelled out by the signature; AttachmentSummaryService disposes the
+        /// stream itself, so implementations should not be relied on to close it - confirm for other callers.
+        /// </param>
+        /// <param name="contentType">Content type reported for the attachment.</param>
+        /// <returns>A task whose result is the extracted text.</returns>
+        Task<string> ExtractTextAsync(string fileName, Stream fileContent, string contentType);
     }
 }
diff --git a/...dules/Unity.AI/src/Unity.AI.Application.Contracts/AI/Requests/AttachmentSummaryRequest.cs b/...dules/Unity.AI/src/Unity.AI.Application.Contracts/AI/Requests/AttachmentSummaryRequest.cs
@@ -7,12 +7,16 @@ public class AttachmentSummaryRequest
         [JsonPropertyName("fileName")]
         public string FileName { get; set; } = string.Empty;
 
-        [JsonPropertyName("fileContent")]
-        public byte[] FileContent { get; set; } = System.Array.Empty<byte>();
-
         [JsonPropertyName("contentType")]
         public string ContentType { get; set; } = "application/octet-stream";
 
+        /// <summary>
+        /// Text extracted from the attachment before the request is built; it takes the place of the raw
+        /// file bytes this request previously carried. May be null.
+        /// </summary>
+        [JsonPropertyName("extractedText")]
+        public string? ExtractedText { get; set; }
+
         [JsonPropertyName("promptVersion")]
         public string? PromptVersion { get; set; }
     }

diff --git a/...tManager/modules/Unity.AI/src/Unity.AI.Application/AI/Extraction/TextExtractionService.cs b/...tManager/modules/Unity.AI/src/Unity.AI.Application/AI/Extraction/TextExtractionService.cs
@@ -26,30 +26,19 @@ public partial class TextExtractionService : ITextExtractionService, ITransientD
         private const int MaxDocxTableCellsPerRow = 50;
         private const int MaxPowerPointSlides = 200;
         private readonly ILogger<TextExtractionService> _logger;
-        private readonly Dictionary<string, Func<string, byte[], string>> _extractorsByExtension;
 
         public TextExtractionService(ILogger<TextExtractionService> logger)
         {
             _logger = logger;
-            _extractorsByExtension = new Dictionary<string, Func<string, byte[], string>>(StringComparer.OrdinalIgnoreCase)
-            {
-                [".txt"] = (_, content) => ExtractTextFromTextFile(content),
-                [".csv"] = (_, content) => ExtractTextFromTextFile(content),
-                [".json"] = (_, content) => ExtractTextFromTextFile(content),
-                [".xml"] = (_, content) => ExtractTextFromTextFile(content),
-                [".pdf"] = ExtractTextFromPdfFile,
-                [".docx"] = ExtractTextFromWordDocx,
-                [".xls"] = ExtractTextFromExcelFile,
-                [".xlsx"] = ExtractTextFromExcelFile,
-                [".pptx"] = ExtractTextFromPowerPointFile
-            };
         }
 
-        public Task<string> ExtractTextAsync(string fileName, byte[] fileContent, string contentType)
+        public Task<string> ExtractTextAsync(string fileName, Stream fileContent, string contentType)
         {
-            if (fileContent == null || fileContent.Length == 0)
+            // A seekable stream reports its length up front, so empty content (e.g. Stream.Null) is
+            // short-circuited here like the old byte[] check instead of being handed to a document parser.
+            if (fileContent == null || (fileContent.CanSeek && fileContent.Length == 0))
             {
-                _logger.LogDebug("File content is empty for {FileName}", fileName);
+                _logger.LogDebug("File content stream is null or empty for {FileName}", fileName);
                 return Task.FromResult(string.Empty);
             }
 
@@ -64,48 +51,24 @@ public Task<string> ExtractTextAsync(string fileName, byte[] fileContent, string
                     return Task.FromResult(string.Empty);
                 }
 
-                if (_extractorsByExtension.TryGetValue(extension, out var extractor))
+                var rawText = extension switch
                 {
-                    var rawText = extractor(fileName, fileContent);
-                    return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
-                }
-
-                if (normalizedContentType.Contains("text/"))
-                {
-                    var rawText = ExtractTextFromTextFile(fileContent);
-                    return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
-                }
-
-                if (normalizedContentType.Contains("pdf"))
-                {
-                    var rawText = ExtractTextFromPdfFile(fileName, fileContent);
-                    return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
-                }
-
-                if (normalizedContentType.Contains("word") ||
-                    normalizedContentType.Contains("msword") ||
-                    normalizedContentType.Contains("officedocument.wordprocessingml"))
-                {
-                    var rawText = ExtractTextFromWordDocx(fileName, fileContent);
-                    return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
-                }
-
-                if (normalizedContentType.Contains("excel") || normalizedContentType.Contains("spreadsheet"))
+                    ".txt" or ".csv" or ".json" or ".xml" => ExtractTextFromTextFile(fileContent),
+                    ".pdf" => ExtractTextFromPdfFile(fileName, fileContent),
+                    ".docx" => ExtractTextFromWordDocx(fileName, fileContent),
+                    ".xls" or ".xlsx" => ExtractTextFromExcelFile(fileName, fileContent),
+                    ".pptx" => ExtractTextFromPowerPointFile(fileName, fileContent),
+                    _ => ExtractByContentType(fileName, fileContent, normalizedContentType)
+                };
+
+                // Reached when no extractor matched and also when a matching extractor produced no text.
+                if (string.IsNullOrEmpty(rawText))
                 {
-                    var rawText = ExtractTextFromExcelFile(fileName, fileContent);
-                    return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
+                    _logger.LogDebug("No text extracted for content type {ContentType} with extension {Extension}",
+                        contentType, extension);
                 }
 
-                if (normalizedContentType.Contains("presentation") ||
-                    normalizedContentType.Contains("powerpoint"))
-                {
-                    var rawText = ExtractTextFromPowerPointFile(fileName, fileContent);
-                    return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
-                }
-
-                _logger.LogDebug("No text extraction available for content type {ContentType} with extension {Extension}",
-                    contentType, extension);
-                return Task.FromResult(string.Empty);
+                return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
             }
             catch (Exception ex)
             {
@@ -114,25 +76,85 @@ public Task<string> ExtractTextAsync(string fileName, byte[] fileContent, string
             }
         }
 
-        private string ExtractTextFromTextFile(byte[] fileContent)
+        /// <summary>
+        /// Fallback for extensions without a dedicated extractor: picks one from substrings of the normalized content type.
+        /// </summary>
+        /// <returns>The chosen extractor's output, or an empty string when no content-type rule matches.</returns>
+        private string ExtractByContentType(string fileName, Stream fileContent, string normalizedContentType)
         {
-            try
+            if (normalizedContentType.Contains("text/"))
             {
-                var text = Encoding.UTF8.GetString(fileContent);
+                return ExtractTextFromTextFile(fileContent);
+            }
+            if (normalizedContentType.Contains("pdf"))
+            {
+                return ExtractTextFromPdfFile(fileName, fileContent);
+            }
+            if (normalizedContentType.Contains("word") ||
+                normalizedContentType.Contains("msword") ||
+                normalizedContentType.Contains("officedocument.wordprocessingml"))
+            {
+                return ExtractTextFromWordDocx(fileName, fileContent);
+            }
+            if (normalizedContentType.Contains("excel") || normalizedContentType.Contains("spreadsheet"))
+            {
+                return ExtractTextFromExcelFile(fileName, fileContent);
+            }
+            if (normalizedContentType.Contains("presentation") || normalizedContentType.Contains("powerpoint"))
+            {
+                return ExtractTextFromPowerPointFile(fileName, fileContent);
+            }
+            return string.Empty;
+        }
 
-                if (text.Contains('\uFFFD'))
-                {
-                    text = Encoding.ASCII.GetString(fileContent);
-                }
+        /// <summary>
+        /// Moves a seekable stream back to its start; streams that cannot seek are left where they are.
+        /// </summary>
+        private static void RewindIfPossible(Stream stream)
+        {
+            if (stream.CanSeek)
+            {
+                stream.Position = 0;
+            }
+        }
 
-                if (text.Length > MaxExtractedTextLength)
+        /// <summary>
+        /// Reads the stream as text (UTF-8 unless a byte order mark indicates otherwise), keeping at most MaxExtractedTextLength characters.
+        /// </summary>
+        private string ExtractTextFromTextFile(Stream fileContent)
+        {
+            try
+            {
+                RewindIfPossible(fileContent);
+                // NOTE(review): the byte[] version re-decoded as ASCII when UTF-8 decoding produced U+FFFD;
+                // this reader keeps the replacement characters instead - confirm that is intended.
+                using var reader = new StreamReader(fileContent, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, bufferSize: 4096, leaveOpen: true);
+                var buffer = new char[Math.Min(MaxExtractedTextLength, 8192)];
+                var builder = new StringBuilder(capacity: Math.Min(MaxExtractedTextLength, 8192));
+                var truncated = false;
+                int read;
+                while ((read = reader.Read(buffer, 0, buffer.Length)) > 0)
                 {
-                    text = text.Substring(0, MaxExtractedTextLength);
-                    _logger.LogDebug("Truncated text content to {MaxLength} characters", MaxExtractedTextLength);
+                    var remaining = MaxExtractedTextLength - builder.Length;
+                    if (read > remaining)
+                    {
+                        // More input than fits: keep the part that fits and stop reading.
+                        builder.Append(buffer, 0, remaining);
+                        truncated = true;
+                        break;
+                    }
+
+                    builder.Append(buffer, 0, read);
                 }
 
+                // Only report truncation when input was actually dropped, not when it merely reached the limit.
+                if (truncated)
+                {
+                    _logger.LogDebug("Truncated text content to {MaxLength} characters", MaxExtractedTextLength);
+                }
+
-                _logger.LogDebug("Extracted {CharacterCount} characters from text-based content.", text.Length);
-                return text;
+                _logger.LogDebug("Extracted {CharacterCount} characters from text-based content.", builder.Length);
+                return builder.ToString();
             }
             catch (Exception ex)
             {
@@ -141,12 +142,12 @@ private string ExtractTextFromTextFile(byte[] fileContent)
             }
         }
 
-        private string ExtractTextFromPdfFile(string fileName, byte[] fileContent)
+        private string ExtractTextFromPdfFile(string fileName, Stream fileContent)
         {
             try
             {
-                using var stream = new MemoryStream(fileContent, writable: false);
-                using var document = PdfDocument.Open(stream);
+                RewindIfPossible(fileContent);
+                using var document = PdfDocument.Open(fileContent);
                 var builder = new StringBuilder();
                 var processedPageCount = 0;
                 var pageTexts = document.GetPages()
@@ -177,12 +178,12 @@ private string ExtractTextFromPdfFile(string fileName, byte[] fileContent)
             }
         }
 
-        private string ExtractTextFromWordDocx(string fileName, byte[] fileContent)
+        private string ExtractTextFromWordDocx(string fileName, Stream fileContent)
         {
             try
             {
-                using var stream = new MemoryStream(fileContent, writable: false);
-                using var document = new XWPFDocument(stream);
+                RewindIfPossible(fileContent);
+                using var document = new XWPFDocument(fileContent);
                 var builder = new StringBuilder();
                 var processedParagraphCount = AppendDocxParagraphText(document, builder);
                 var processedTableRowCount = AppendDocxTableText(document, builder);
@@ -268,12 +269,12 @@ private static int AppendDocxTableText(XWPFDocument document, StringBuilder buil
             return processedTableRowCount;
         }
 
-        private string ExtractTextFromExcelFile(string fileName, byte[] fileContent)
+        private string ExtractTextFromExcelFile(string fileName, Stream fileContent)
         {
             try
             {
-                using var stream = new MemoryStream(fileContent, writable: false);
-                using var workbook = WorkbookFactory.Create(stream);
+                RewindIfPossible(fileContent);
+                using var workbook = WorkbookFactory.Create(fileContent);
                 var builder = new StringBuilder();
                 var sheetCount = Math.Min(workbook.NumberOfSheets, MaxExcelSheets);
                 var processedSheetCount = 0;
@@ -314,12 +315,12 @@ private string ExtractTextFromExcelFile(string fileName, byte[] fileContent)
             }
         }
 
-        private string ExtractTextFromPowerPointFile(string fileName, byte[] fileContent)
+        private string ExtractTextFromPowerPointFile(string fileName, Stream fileContent)
         {
             try
             {
-                using var stream = new MemoryStream(fileContent, writable: false);
-                using var archive = new ZipArchive(stream, ZipArchiveMode.Read, leaveOpen: false);
+                RewindIfPossible(fileContent);
+                using var archive = new ZipArchive(fileContent, ZipArchiveMode.Read, leaveOpen: true);
                 var builder = new StringBuilder();
                 var slideEntries = GetOrderedPowerPointSlideEntries(archive)
                     .Take(MaxPowerPointSlides);

diff --git a/...nager/modules/Unity.AI/src/Unity.AI.Application/AI/Operations/AttachmentSummaryService.cs b/...nager/modules/Unity.AI/src/Unity.AI.Application/AI/Operations/AttachmentSummaryService.cs
@@ -3,6 +3,7 @@
 using System.Collections.Generic;
 using System.Linq;
 using System.Threading.Tasks;
+using Unity.AI.Extraction;
 using Unity.AI.Requests;
 using Unity.GrantManager.Applications;
 using Unity.GrantManager.Intakes;
@@ -12,24 +13,37 @@ namespace Unity.AI.Operations;
 
 public class AttachmentSummaryService(
     IApplicationChefsFileAttachmentRepository applicationChefsFileAttachmentRepository,
-    ISubmissionAppService submissionAppService,
+    IChefsFileAttachmentStreamProvider chefsFileAttachmentStreamProvider,
+    ITextExtractionService textExtractionService,
     IAIService aiService,
     ILogger<AttachmentSummaryService> logger) : IAttachmentSummaryService, ITransientDependency
 {
     private const string DefaultContentType = "application/octet-stream";
     private const string SummaryGenerationFailedMessage = "AI summary generation failed.";
 
     public async Task<string> GenerateAndSaveAsync(Guid attachmentId, string? promptVersion = null)
     {
         var attachment = await applicationChefsFileAttachmentRepository.GetAsync(attachmentId);
         var fileName = string.IsNullOrWhiteSpace(attachment.FileName) ? "unknown" : attachment.FileName;
-        var (fileContent, contentType) = await GetAttachmentContentForSummaryAsync(attachment, fileName);
+
+        string contentType;
+        string extractedText;
+
+        // Scope the stream to the extraction step so it is released before the AI call is awaited.
+        await using (var attachmentStream = await OpenAttachmentStreamAsync(attachment, fileName))
+        {
+            // The previous implementation never passed a blank content type downstream; keep that guarantee.
+            contentType = string.IsNullOrWhiteSpace(attachmentStream.ContentType)
+                ? DefaultContentType
+                : attachmentStream.ContentType;
+            extractedText = await textExtractionService.ExtractTextAsync(fileName, attachmentStream.Content, contentType);
+        }
 
         var summaryResponse = await aiService.GenerateAttachmentSummaryAsync(new AttachmentSummaryRequest
         {
             FileName = fileName,
-            FileContent = fileContent,
             ContentType = contentType,
+            ExtractedText = extractedText,
             PromptVersion = promptVersion,
         });
 
@@ -68,37 +71,41 @@ public async Task<List<string>> GenerateForApplicationAsync(Guid applicationId,
         return await GenerateAndSaveAsync(attachmentIds, promptVersion);
     }
 
-    private async Task<(byte[] Content, string ContentType)> GetAttachmentContentForSummaryAsync(ApplicationChefsFileAttachment attachment, string fileName)
+    /// <summary>
+    /// Opens the CHEFS file stream for an attachment. Invalid CHEFS IDs, a null result from the provider, or a
+    /// retrieval exception are logged as warnings and yield <see cref="ChefsFileAttachmentStream.Empty"/>.
+    /// </summary>
+    private async Task<ChefsFileAttachmentStream> OpenAttachmentStreamAsync(ApplicationChefsFileAttachment attachment, string fileName)
     {
         if (!Guid.TryParse(attachment.ChefsSubmissionId, out var submissionId) ||
             !Guid.TryParse(attachment.ChefsFileId, out var fileId))
         {
             logger.LogWarning(
                 "Attachment {AttachmentId} has invalid CHEFS IDs. Falling back to metadata-only summary generation.",
                 attachment.Id);
-            return (Array.Empty<byte>(), DefaultContentType);
+            return ChefsFileAttachmentStream.Empty;
         }
 
         try
         {
-            var fileDto = await submissionAppService.GetChefsFileAttachment(submissionId, fileId, fileName);
+            var stream = await chefsFileAttachmentStreamProvider.OpenAsync(submissionId, fileId, fileName);
-            if (fileDto?.Content == null)
+            if (stream is null)
             {
                 logger.LogWarning(
                     "Attachment {AttachmentId} has no retrievable content. Falling back to metadata-only summary generation.",
                     attachment.Id);
-                return (Array.Empty<byte>(), DefaultContentType);
+                return ChefsFileAttachmentStream.Empty;
             }
 
-            return (fileDto.Content, string.IsNullOrWhiteSpace(fileDto.ContentType) ? DefaultContentType : fileDto.ContentType);
+            return stream;
         }
         catch (Exception ex)
         {
             logger.LogWarning(
                 ex,
                 "Failed retrieving CHEFS content for attachment {AttachmentId}. Falling back to metadata-only summary generation.",
                 attachment.Id);
-            return (Array.Empty<byte>(), DefaultContentType);
+            return ChefsFileAttachmentStream.Empty;
         }
     }
 }