11using Microsoft . Extensions . Logging ;
2+ using NPOI . SS . UserModel ;
3+ using NPOI . XWPF . UserModel ;
24using System ;
5+ using System . Collections . Generic ;
36using System . IO ;
47using System . Linq ;
58using System . Text ;
@@ -13,104 +16,104 @@ namespace Unity.GrantManager.AI
1316 public partial class TextExtractionService : ITextExtractionService , ITransientDependency
1417 {
// Upper bound, in characters, on extracted text: plain-text content is truncated to this length,
// and it is the limit handed to AppendWithLimit by the Word and Excel extractors.
1518 private const int MaxExtractedTextLength = 50000 ;
// Only the first MaxExcelSheets sheets of a workbook are read.
19+ private const int MaxExcelSheets = 10 ;
// Row iteration on a sheet stops once this many rows have been processed.
20+ private const int MaxExcelRowsPerSheet = 2000 ;
// Only the first MaxExcelCellsPerRow cells of each spreadsheet row are read.
21+ private const int MaxExcelCellsPerRow = 50 ;
// Only the first MaxDocxParagraphs body paragraphs of a .docx document are read.
22+ private const int MaxDocxParagraphs = 2000 ;
// Applied per table: only the first MaxDocxTableRows rows of each .docx table are read.
23+ private const int MaxDocxTableRows = 2000 ;
// Only the first MaxDocxTableCellsPerRow cells of each .docx table row are read.
24+ private const int MaxDocxTableCellsPerRow = 50 ;
// Logger supplied through the constructor; used for debug, warning and error diagnostics.
1625 private readonly ILogger < TextExtractionService > _logger ;
1726
/// <summary>
/// Creates the service and stores the logger used for extraction diagnostics.
/// </summary>
/// <param name="logger">Logger that receives debug, warning and error messages.</param>
public TextExtractionService(ILogger<TextExtractionService> logger) => _logger = logger;
2231
/// <summary>
/// Extracts plain text from an uploaded file, choosing the extractor from the content type
/// and the file extension. The work is done synchronously and returned as a completed task.
/// </summary>
/// <param name="fileName">File name; its extension is used to pick an extractor and it appears in log messages.</param>
/// <param name="fileContent">Raw file bytes. Null or empty content yields an empty string.</param>
/// <param name="contentType">MIME content type of the file; may be null.</param>
/// <returns>
/// A completed task holding the normalized, length-limited text, or an empty string when the
/// file is empty, the format is unsupported, or extraction throws.
/// </returns>
public Task<string> ExtractTextAsync(string fileName, byte[] fileContent, string contentType)
{
    return Task.FromResult(ExtractText());

    // Synchronous core, so every outcome is wrapped into a task in exactly one place.
    string ExtractText()
    {
        if (fileContent == null || fileContent.Length == 0)
        {
            _logger.LogDebug("File content is empty for {FileName}", fileName);
            return string.Empty;
        }

        try
        {
            var mime = contentType?.ToLowerInvariant() ?? string.Empty;
            var ext = Path.GetExtension(fileName)?.ToLowerInvariant() ?? string.Empty;

            // Text-like formats are decoded directly.
            var isPlainText = mime.Contains("text/")
                || ext == ".txt"
                || ext == ".csv"
                || ext == ".json"
                || ext == ".xml";
            if (isPlainText)
            {
                return NormalizeAndLimitText(ExtractTextFromTextFile(fileContent), fileName);
            }

            var isPdf = mime.Contains("pdf") || ext == ".pdf";
            if (isPdf)
            {
                return NormalizeAndLimitText(ExtractTextFromPdfFile(fileName, fileContent), fileName);
            }

            // Word family: only the OOXML (.docx) flavour can be parsed; legacy .doc is skipped.
            var isDocx = ext == ".docx" || mime.Contains("officedocument.wordprocessingml");
            var isWordFamily = isDocx
                || mime.Contains("word")
                || mime.Contains("msword")
                || ext == ".doc";
            if (isWordFamily)
            {
                if (!isDocx)
                {
                    _logger.LogDebug("Legacy .doc extraction is not supported for {FileName}", fileName);
                    return string.Empty;
                }

                return NormalizeAndLimitText(ExtractTextFromWordDocx(fileName, fileContent), fileName);
            }

            var isSpreadsheet = mime.Contains("excel")
                || mime.Contains("spreadsheet")
                || ext == ".xls"
                || ext == ".xlsx";
            if (isSpreadsheet)
            {
                return NormalizeAndLimitText(ExtractTextFromExcelFile(fileName, fileContent), fileName);
            }

            _logger.LogDebug("No text extraction available for content type {ContentType} with extension {Extension}",
                contentType, ext);
            return string.Empty;
        }
        catch (Exception ex)
        {
            // Best effort: a failing extractor must not fail the caller.
            _logger.LogError(ex, "Error extracting text from {FileName}", fileName);
            return string.Empty;
        }
    }
}
9198
92- private async Task < string > ExtractTextFromTextFileAsync ( byte [ ] fileContent )
99+ private string ExtractTextFromTextFile ( byte [ ] fileContent )
93100 {
94101 try
95102 {
96- // Try UTF-8 first
97103 var text = Encoding . UTF8 . GetString ( fileContent ) ;
98104
99- // Check if the decoded text contains replacement characters (indicates encoding issue)
100105 if ( text . Contains ( '\uFFFD ' ) )
101106 {
102- // Try other encodings
103107 text = Encoding . ASCII . GetString ( fileContent ) ;
104108 }
105109
106- // Limit the extracted text to a reasonable size.
107110 if ( text . Length > MaxExtractedTextLength )
108111 {
109112 text = text . Substring ( 0 , MaxExtractedTextLength ) ;
110113 _logger . LogDebug ( "Truncated text content to {MaxLength} characters" , MaxExtractedTextLength ) ;
111114 }
112115
113- return await Task . FromResult ( text ) ;
116+ return text ;
114117 }
115118 catch ( Exception ex )
116119 {
@@ -155,6 +158,186 @@ private string ExtractTextFromPdfFile(string fileName, byte[] fileContent)
155158 }
156159 }
157160
/// <summary>
/// Reads the text of a .docx payload: body paragraphs first, then table cells, each piece
/// separated by a new line, stopping once <c>MaxExtractedTextLength</c> characters are collected.
/// </summary>
/// <param name="fileName">File name, used only in the warning logged on failure.</param>
/// <param name="fileContent">Raw bytes of the .docx file.</param>
/// <returns>The collected text, or an empty string if the document cannot be read.</returns>
private string ExtractTextFromWordDocx(string fileName, byte[] fileContent)
{
    try
    {
        using var input = new MemoryStream(fileContent, writable: false);
        using var docx = new XWPFDocument(input);
        var text = new StringBuilder();

        foreach (var paragraph in docx.Paragraphs.Take(MaxDocxParagraphs))
        {
            if (AppendWithLimit(text, paragraph.ParagraphText, MaxExtractedTextLength, Environment.NewLine))
            {
                break;
            }
        }

        // Paragraphs alone filled the budget: no need to look at tables.
        if (text.Length >= MaxExtractedTextLength)
        {
            return text.ToString();
        }

        // Lazily walk table -> row -> cell with the per-table row cap and per-row cell cap applied.
        // AppendWithLimit reports true exactly when the builder is full, so a single break
        // ends the whole walk.
        var cellTexts = docx.Tables
            .SelectMany(table => table.Rows.Take(MaxDocxTableRows))
            .SelectMany(row => row.GetTableCells().Take(MaxDocxTableCellsPerRow))
            .Select(cell => cell.GetText());

        foreach (var cellText in cellTexts)
        {
            if (AppendWithLimit(text, cellText, MaxExtractedTextLength, Environment.NewLine))
            {
                break;
            }
        }

        return text.ToString();
    }
    catch (Exception ex)
    {
        _logger.LogWarning(ex, "Word (.docx) text extraction failed for {FileName}", fileName);
        return string.Empty;
    }
}
214+
/// <summary>
/// Reads the cell text of a workbook. Non-blank cells of a row are joined with " | " and
/// rows are separated by a new line. Reading is capped by sheet count, rows per sheet,
/// cells per row and <c>MaxExtractedTextLength</c>.
/// </summary>
/// <param name="fileName">File name, used only in the warning logged on failure.</param>
/// <param name="fileContent">Raw bytes of the workbook.</param>
/// <returns>The collected text, or an empty string if the workbook cannot be read.</returns>
private string ExtractTextFromExcelFile(string fileName, byte[] fileContent)
{
    try
    {
        using var input = new MemoryStream(fileContent, writable: false);
        using var workbook = WorkbookFactory.Create(input);
        var text = new StringBuilder();
        var sheetsToRead = Math.Min(workbook.NumberOfSheets, MaxExcelSheets);
        var isFull = false;

        for (var index = 0;
             index < sheetsToRead && !isFull && text.Length < MaxExtractedTextLength;
             index++)
        {
            var sheet = workbook.GetSheetAt(index);
            if (sheet == null)
            {
                continue;
            }

            var rowsSeen = 0;
            foreach (IRow row in sheet)
            {
                if (rowsSeen >= MaxExcelRowsPerSheet || text.Length >= MaxExtractedTextLength)
                {
                    break;
                }

                // Blank cells are skipped entirely; they neither emit a separator nor start a row.
                var cellValues = row.Cells
                    .Take(MaxExcelCellsPerRow)
                    .Select(cell => GetCellText(cell))
                    .Where(cellValue => !string.IsNullOrWhiteSpace(cellValue));

                var isFirstInRow = true;
                foreach (var cellValue in cellValues)
                {
                    string? separator;
                    if (!isFirstInRow)
                    {
                        separator = " | ";
                    }
                    else if (text.Length > 0)
                    {
                        separator = Environment.NewLine;
                    }
                    else
                    {
                        separator = null;
                    }

                    isFull = AppendWithLimit(text, cellValue, MaxExtractedTextLength, separator);
                    isFirstInRow = false;
                    if (isFull)
                    {
                        break;
                    }
                }

                rowsSeen++;
                if (isFull)
                {
                    break;
                }
            }
        }

        return text.ToString();
    }
    catch (Exception ex)
    {
        _logger.LogWarning(ex, "Excel text extraction failed for {FileName}", fileName);
        return string.Empty;
    }
}
280+
/// <summary>
/// Appends <paramref name="value"/> to <paramref name="builder"/>, preceded by
/// <paramref name="separator"/> when the builder already has content, without letting the
/// builder grow beyond <paramref name="maxLength"/> characters. Text that does not fit is cut off.
/// </summary>
/// <param name="builder">Destination buffer.</param>
/// <param name="value">Text to append; null or whitespace-only values are ignored.</param>
/// <param name="maxLength">Maximum number of characters the builder may hold.</param>
/// <param name="separator">Optional text placed before the value when the builder is not empty.</param>
/// <returns><c>true</c> when the builder has reached <paramref name="maxLength"/>; otherwise <c>false</c>.</returns>
private static bool AppendWithLimit(StringBuilder builder, string? value, int maxLength, string? separator = null)
{
    var alreadyFull = builder.Length >= maxLength;
    if (alreadyFull || string.IsNullOrWhiteSpace(value))
    {
        // Nothing is appended; just report whether the limit has been hit.
        return alreadyFull;
    }

    // Strictly positive here because the builder is not full.
    var budget = maxLength - builder.Length;

    if (builder.Length > 0 && !string.IsNullOrEmpty(separator))
    {
        var separatorChars = Math.Min(separator.Length, budget);
        builder.Append(separator.AsSpan(0, separatorChars));
        budget -= separatorChars;
        if (budget == 0)
        {
            // The separator (possibly cut short) used up the remaining space.
            return true;
        }
    }

    var valueChars = Math.Min(value.Length, budget);
    builder.Append(value.AsSpan(0, valueChars));
    return valueChars == budget;
}
320+
/// <summary>
/// Converts a spreadsheet cell to text for extraction.
/// </summary>
/// <remarks>
/// Numbers and dates are formatted with the invariant culture so the extracted text does not
/// depend on the locale of the machine running the service. Formula cells are rendered from
/// their cached result (the value last computed by the spreadsheet application) instead of the
/// formula source text; when the cached result is not a string, number or boolean
/// the cell's own <c>ToString()</c> is used.
/// </remarks>
/// <param name="cell">The cell to read; may be null.</param>
/// <returns>The cell's text, or an empty string for a null or blank cell.</returns>
private static string GetCellText(NPOI.SS.UserModel.ICell cell)
{
    if (cell == null)
    {
        return string.Empty;
    }

    // Fully qualified so the block does not depend on a using directive outside it.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // For formulas, dispatch on the type of the cached result; the typed value getters
    // below return that cached result for formula cells.
    var effectiveType = cell.CellType == CellType.Formula
        ? cell.CachedFormulaResultType
        : cell.CellType;

    return (effectiveType switch
    {
        CellType.String => cell.StringCellValue,
        CellType.Numeric => DateUtil.IsCellDateFormatted(cell)
            // Convert.ToString copes with both DateTime and DateTime? property types.
            ? Convert.ToString(cell.DateCellValue, invariant)
            : cell.NumericCellValue.ToString(invariant),
        CellType.Boolean => cell.BooleanCellValue ? "true" : "false",
        CellType.Blank => string.Empty,
        _ => cell.ToString()
    }) ?? string.Empty;
}
340+
158341 private string NormalizeAndLimitText ( string text , string fileName )
159342 {
160343 var normalized = NormalizeExtractedText ( text ) ;
0 commit comments