Skip to content

Commit f683528

Browse files
Merge pull request #2048 from bcgov/feature/AB#32008-AddOfficeExtractionSupport
Feature/ab#32008 add office extraction support
2 parents aed4e2f + b16d19d commit f683528

2 files changed

Lines changed: 210 additions & 26 deletions

File tree

applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs

Lines changed: 209 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
using Microsoft.Extensions.Logging;
2+
using NPOI.SS.UserModel;
3+
using NPOI.XWPF.UserModel;
24
using System;
5+
using System.Collections.Generic;
36
using System.IO;
47
using System.Linq;
58
using System.Text;
@@ -13,104 +16,104 @@ namespace Unity.GrantManager.AI
1316
public partial class TextExtractionService : ITextExtractionService, ITransientDependency
1417
{
1518
private const int MaxExtractedTextLength = 50000;
19+
private const int MaxExcelSheets = 10;
20+
private const int MaxExcelRowsPerSheet = 2000;
21+
private const int MaxExcelCellsPerRow = 50;
22+
private const int MaxDocxParagraphs = 2000;
23+
private const int MaxDocxTableRows = 2000;
24+
private const int MaxDocxTableCellsPerRow = 50;
1625
private readonly ILogger<TextExtractionService> _logger;
1726

1827
/// <summary>
/// Creates the service with the logger used to report extraction diagnostics.
/// </summary>
/// <param name="logger">Logger stored for use by the extraction methods.</param>
public TextExtractionService(ILogger<TextExtractionService> logger) => _logger = logger;
2231

23-
/// <summary>
/// Extracts plain text from an uploaded file, choosing the extractor from the
/// content type and/or the file extension.
/// </summary>
/// <param name="fileName">File name; only its extension is inspected, the full name is used in log messages.</param>
/// <param name="fileContent">Raw file bytes. Null or empty content yields an empty result.</param>
/// <param name="contentType">MIME type reported for the file; may be null.</param>
/// <returns>
/// A completed task holding the normalized, length-limited text, or an empty string when the
/// file is empty, the format is unsupported, or extraction throws.
/// </returns>
public Task<string> ExtractTextAsync(string fileName, byte[] fileContent, string contentType)
{
    if (fileContent is null || fileContent.Length == 0)
    {
        _logger.LogDebug("File content is empty for {FileName}", fileName);
        return Task.FromResult(string.Empty);
    }

    string extracted;

    try
    {
        var mime = contentType?.ToLowerInvariant() ?? string.Empty;
        var ext = Path.GetExtension(fileName)?.ToLowerInvariant() ?? string.Empty;

        // Small predicates keep the format checks below readable.
        bool MimeHas(string fragment) => mime.Contains(fragment);
        bool ExtIs(params string[] candidates) => Array.IndexOf(candidates, ext) >= 0;

        if (MimeHas("text/") || ExtIs(".txt", ".csv", ".json", ".xml"))
        {
            // Plain-text formats are decoded directly.
            extracted = NormalizeAndLimitText(ExtractTextFromTextFile(fileContent), fileName);
        }
        else if (MimeHas("pdf") || ExtIs(".pdf"))
        {
            extracted = NormalizeAndLimitText(ExtractTextFromPdfFile(fileName, fileContent), fileName);
        }
        else if (MimeHas("word") ||
                 MimeHas("msword") ||
                 MimeHas("officedocument.wordprocessingml") ||
                 ExtIs(".doc", ".docx"))
        {
            if (ext == ".docx" || MimeHas("officedocument.wordprocessingml"))
            {
                extracted = NormalizeAndLimitText(ExtractTextFromWordDocx(fileName, fileContent), fileName);
            }
            else
            {
                // Only the OOXML (.docx) flavour is parsed; binary .doc is skipped.
                _logger.LogDebug("Legacy .doc extraction is not supported for {FileName}", fileName);
                extracted = string.Empty;
            }
        }
        else if (MimeHas("excel") || MimeHas("spreadsheet") || ExtIs(".xls", ".xlsx"))
        {
            extracted = NormalizeAndLimitText(ExtractTextFromExcelFile(fileName, fileContent), fileName);
        }
        else
        {
            _logger.LogDebug("No text extraction available for content type {ContentType} with extension {Extension}",
                contentType, ext);
            extracted = string.Empty;
        }
    }
    catch (Exception ex)
    {
        // Extraction is best-effort: log and fall back to an empty result.
        _logger.LogError(ex, "Error extracting text from {FileName}", fileName);
        extracted = string.Empty;
    }

    return Task.FromResult(extracted);
}
9198

92-
private async Task<string> ExtractTextFromTextFileAsync(byte[] fileContent)
99+
private string ExtractTextFromTextFile(byte[] fileContent)
93100
{
94101
try
95102
{
96-
// Try UTF-8 first
97103
var text = Encoding.UTF8.GetString(fileContent);
98104

99-
// Check if the decoded text contains replacement characters (indicates encoding issue)
100105
if (text.Contains('\uFFFD'))
101106
{
102-
// Try other encodings
103107
text = Encoding.ASCII.GetString(fileContent);
104108
}
105109

106-
// Limit the extracted text to a reasonable size.
107110
if (text.Length > MaxExtractedTextLength)
108111
{
109112
text = text.Substring(0, MaxExtractedTextLength);
110113
_logger.LogDebug("Truncated text content to {MaxLength} characters", MaxExtractedTextLength);
111114
}
112115

113-
return await Task.FromResult(text);
116+
return text;
114117
}
115118
catch (Exception ex)
116119
{
@@ -155,6 +158,186 @@ private string ExtractTextFromPdfFile(string fileName, byte[] fileContent)
155158
}
156159
}
157160

161+
/// <summary>
/// Reads the text of a .docx document: body paragraphs first, then the cells of every table,
/// each piece separated by a new line and the total capped at <c>MaxExtractedTextLength</c>.
/// </summary>
/// <param name="fileName">File name, used only in the warning logged on failure.</param>
/// <param name="fileContent">Raw bytes of the .docx file.</param>
/// <returns>The collected text, or an empty string if the document cannot be read.</returns>
private string ExtractTextFromWordDocx(string fileName, byte[] fileContent)
{
    try
    {
        using var input = new MemoryStream(fileContent, writable: false);
        using var docx = new XWPFDocument(input);
        var text = new StringBuilder();

        // Paragraphs, capped at MaxDocxParagraphs.
        foreach (var paragraph in docx.Paragraphs.Take(MaxDocxParagraphs))
        {
            if (AppendWithLimit(text, paragraph.ParagraphText, MaxExtractedTextLength, Environment.NewLine))
            {
                break;
            }
        }

        // Skip the tables entirely when the paragraphs already filled the budget.
        if (text.Length >= MaxExtractedTextLength)
        {
            return text.ToString();
        }

        // Lazily walk table -> row -> cell with the per-table row cap and per-row cell cap.
        var cellTexts =
            from table in docx.Tables
            from row in table.Rows.Take(MaxDocxTableRows)
            from cell in row.GetTableCells().Take(MaxDocxTableCellsPerRow)
            select cell.GetText();

        foreach (var cellText in cellTexts)
        {
            // AppendWithLimit reports true exactly when the builder is full, so a single
            // break here stops the whole table walk.
            if (AppendWithLimit(text, cellText, MaxExtractedTextLength, Environment.NewLine))
            {
                break;
            }
        }

        return text.ToString();
    }
    catch (Exception ex)
    {
        _logger.LogWarning(ex, "Word (.docx) text extraction failed for {FileName}", fileName);
        return string.Empty;
    }
}
214+
215+
/// <summary>
/// Reads cell text from a workbook. Non-blank cells of a row are joined with " | ",
/// rows are separated by a new line, and the total is capped at <c>MaxExtractedTextLength</c>.
/// At most <c>MaxExcelSheets</c> sheets, <c>MaxExcelRowsPerSheet</c> rows per sheet and
/// <c>MaxExcelCellsPerRow</c> cells per row are visited.
/// </summary>
/// <param name="fileName">File name, used only in the warning logged on failure.</param>
/// <param name="fileContent">Raw bytes of the workbook.</param>
/// <returns>The collected text, or an empty string if the workbook cannot be read.</returns>
private string ExtractTextFromExcelFile(string fileName, byte[] fileContent)
{
    try
    {
        using var input = new MemoryStream(fileContent, writable: false);
        using var book = WorkbookFactory.Create(input);
        var text = new StringBuilder();
        var sheetsToRead = Math.Min(book.NumberOfSheets, MaxExcelSheets);
        var full = false;
        var index = 0;

        while (index < sheetsToRead && !full && text.Length < MaxExtractedTextLength)
        {
            var sheet = book.GetSheetAt(index);
            index++;
            if (sheet == null)
            {
                continue;
            }

            var rowsSeen = 0;
            foreach (IRow row in sheet)
            {
                if (rowsSeen >= MaxExcelRowsPerSheet || text.Length >= MaxExtractedTextLength)
                {
                    break;
                }

                var wroteCell = false;
                foreach (var cell in row.Cells.Take(MaxExcelCellsPerRow))
                {
                    var cellText = GetCellText(cell);
                    if (string.IsNullOrWhiteSpace(cellText))
                    {
                        continue;
                    }

                    // Later cells of a row get a pipe; the first cell of a row starts a new
                    // line unless nothing has been written yet.
                    string? separator = null;
                    if (wroteCell)
                    {
                        separator = " | ";
                    }
                    else if (text.Length > 0)
                    {
                        separator = Environment.NewLine;
                    }

                    wroteCell = true;
                    full = AppendWithLimit(text, cellText, MaxExtractedTextLength, separator);
                    if (full)
                    {
                        break;
                    }
                }

                // Rows count toward the cap even when they contributed no text.
                rowsSeen++;
                if (full)
                {
                    break;
                }
            }
        }

        return text.ToString();
    }
    catch (Exception ex)
    {
        _logger.LogWarning(ex, "Excel text extraction failed for {FileName}", fileName);
        return string.Empty;
    }
}
280+
281+
/// <summary>
/// Appends <paramref name="value"/> to <paramref name="builder"/>, optionally preceded by
/// <paramref name="separator"/>, without letting the builder grow past
/// <paramref name="maxLength"/> characters. Text that does not fit is truncated.
/// </summary>
/// <param name="builder">Destination buffer.</param>
/// <param name="value">Text to append; null or whitespace-only values are ignored.</param>
/// <param name="maxLength">Maximum total length allowed for the builder.</param>
/// <param name="separator">Written before the value only when the builder is not empty.</param>
/// <returns><c>true</c> when the builder has reached <paramref name="maxLength"/>; otherwise <c>false</c>.</returns>
private static bool AppendWithLimit(StringBuilder builder, string? value, int maxLength, string? separator = null)
{
    var alreadyFull = builder.Length >= maxLength;
    if (alreadyFull || string.IsNullOrWhiteSpace(value))
    {
        // Nothing is written; just report whether the cap has been hit.
        return alreadyFull;
    }

    var room = maxLength - builder.Length;

    if (builder.Length > 0 && !string.IsNullOrEmpty(separator))
    {
        if (separator.Length >= room)
        {
            // Even the separator does not fit: write what fits and stop.
            builder.Append(separator, 0, room);
            return true;
        }

        builder.Append(separator);
        room -= separator.Length;
    }

    if (value.Length < room)
    {
        builder.Append(value);
        return false;
    }

    // The value fills (or overflows) the remaining room: truncate to fit.
    builder.Append(value, 0, room);
    return true;
}
320+
321+
/// <summary>
/// Converts a spreadsheet cell to display text for extraction.
/// </summary>
/// <remarks>
/// Formula cells are rendered from their cached (last calculated) result rather than the
/// formula source, so extracted text contains values such as totals instead of
/// expressions like <c>SUM(A1:A3)</c>. Numbers and dates are formatted with the invariant
/// culture so the output does not depend on the server's locale.
/// </remarks>
/// <param name="cell">The cell to read; may be null.</param>
/// <returns>The cell's text, or an empty string for null or blank cells.</returns>
private static string GetCellText(NPOI.SS.UserModel.ICell cell)
{
    if (cell is null)
    {
        return string.Empty;
    }

    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // For a formula cell, dispatch on the type of its cached result instead of on Formula.
    var effectiveType = cell.CellType == CellType.Formula
        ? cell.CachedFormulaResultType
        : cell.CellType;

    switch (effectiveType)
    {
        case CellType.String:
            return cell.StringCellValue ?? string.Empty;

        case CellType.Numeric:
            // Convert.ToString handles both DateTime and DateTime? return types and
            // yields an empty string for a null date.
            return DateUtil.IsCellDateFormatted(cell)
                ? Convert.ToString(cell.DateCellValue, invariant) ?? string.Empty
                : cell.NumericCellValue.ToString(invariant);

        case CellType.Boolean:
            return cell.BooleanCellValue ? "true" : "false";

        case CellType.Blank:
            return string.Empty;

        default:
            // Error cells and anything unrecognised fall back to the library's own rendering.
            return cell.ToString() ?? string.Empty;
    }
}
340+
158341
private string NormalizeAndLimitText(string text, string fileName)
159342
{
160343
var normalized = NormalizeExtractedText(text);

applications/Unity.GrantManager/src/Unity.GrantManager.Application/Unity.GrantManager.Application.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
<PackageReference Include="Quartz.Serialization.Json" Version="3.14.0" />
3434
<PackageReference Include="RestSharp" Version="112.1.0" />
3535
<PackageReference Include="PdfPig" Version="0.1.13" />
36+
<PackageReference Include="NPOI" Version="2.7.5" />
3637
<PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
3738
<PackageReference Include="Volo.Abp.BackgroundWorkers.Quartz" Version="9.1.3" />
3839
<PackageReference Include="Volo.Abp.BlobStoring" Version="9.1.3" />

0 commit comments

Comments
 (0)