forked from microsoft/markitdown
-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathZipConverter.cs
More file actions
256 lines (214 loc) · 8.27 KB
/
ZipConverter.cs
File metadata and controls
256 lines (214 loc) · 8.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
using System.Collections.Generic;
using System.IO.Compression;
using System.Linq;
using System.Text;
using ManagedCode.MimeTypes;
using MarkItDown;
namespace MarkItDown.Converters;
/// <summary>
/// Converter for ZIP files that extracts and converts all contained files.
/// </summary>
/// <remarks>
/// Each file entry is copied into memory and handed to the first inner converter
/// (ordered by ascending <c>Priority</c>) whose <c>Accepts</c> call returns true.
/// The per-entry output is concatenated into a single Markdown document.
/// </remarks>
public sealed class ZipConverter : IDocumentConverter
{
    /// <summary>
    /// Largest declared uncompressed entry size, in bytes, that is loaded into memory (50 MB).
    /// </summary>
    private const long MaxEntrySizeBytes = 50L * 1024 * 1024;

    private static readonly HashSet<string> AcceptedExtensions = new(StringComparer.OrdinalIgnoreCase)
    {
        ".zip"
    };

    private static readonly IReadOnlyCollection<string> AcceptedMimeTypePrefixes = new List<string>
    {
        MimeHelper.ZIP,
        MimeTypeUtilities.WithSubtype(MimeHelper.ZIP, "x-zip-compressed"),
    };

    // Converters that we'll use for processing files within the ZIP
    private readonly List<IDocumentConverter> _innerConverters;

    /// <summary>Converter priority; this converter reports 400.</summary>
    public int Priority => 400; // Process before generic converters

    /// <summary>
    /// Creates a ZIP converter.
    /// </summary>
    /// <param name="innerConverters">
    /// Converters tried against each archive entry. When null, no entry content can be
    /// converted and every non-empty entry is reported as having no available converter.
    /// </param>
    public ZipConverter(IEnumerable<IDocumentConverter>? innerConverters = null)
    {
        _innerConverters = innerConverters?.ToList() ?? new List<IDocumentConverter>();
    }

    /// <summary>
    /// Decides from metadata alone (extension or MIME type) whether the input looks like a ZIP file.
    /// </summary>
    /// <param name="streamInfo">Metadata describing the input.</param>
    /// <returns>True when the extension is <c>.zip</c> or the MIME type matches an accepted ZIP type.</returns>
    public bool AcceptsInput(StreamInfo streamInfo)
    {
        var normalizedMime = MimeTypeUtilities.NormalizeMime(streamInfo);

        // AcceptedExtensions compares case-insensitively, so no case normalization is needed.
        var extension = streamInfo.Extension;
        if (extension is not null && AcceptedExtensions.Contains(extension))
        {
            return true;
        }

        return MimeTypeUtilities.MatchesAny(normalizedMime, AcceptedMimeTypePrefixes)
            || (normalizedMime is null && MimeTypeUtilities.MatchesAny(streamInfo.MimeType, AcceptedMimeTypePrefixes));
    }

    /// <summary>
    /// Decides whether the stream should be handled by this converter, verifying the
    /// ZIP signature ("PK") when the stream is seekable.
    /// </summary>
    /// <param name="stream">Input stream; its position is restored before returning.</param>
    /// <param name="streamInfo">Metadata describing the input.</param>
    /// <param name="cancellationToken">Not observed by this method.</param>
    /// <returns>
    /// False when the metadata does not match or the signature is wrong; true otherwise,
    /// including when the signature cannot be inspected.
    /// </returns>
    public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default)
    {
        if (!AcceptsInput(streamInfo))
        {
            return false;
        }

        // A non-seekable stream cannot be peeked without consuming it, so trust the metadata.
        if (!stream.CanSeek)
        {
            return true;
        }

        long? originalPosition = null;
        try
        {
            originalPosition = stream.Position;
            stream.Position = 0;

            // ZIP files start with "PK" (0x50, 0x4B). Read may legally return fewer bytes
            // than requested, so keep reading until both bytes arrive or the stream ends.
            var signature = new byte[2];
            var filled = 0;
            while (filled < signature.Length)
            {
                var read = stream.Read(signature, filled, signature.Length - filled);
                if (read == 0)
                {
                    break;
                }

                filled += read;
            }

            return filled == signature.Length && signature[0] == 0x50 && signature[1] == 0x4B;
        }
        catch (Exception)
        {
            // Best effort: if the header cannot be inspected, fall back to the metadata match.
            return true;
        }
        finally
        {
            // Always hand the stream back where the caller left it, on success and on failure.
            if (originalPosition is long position)
            {
                TryRestorePosition(stream, position);
            }
        }
    }

    /// <summary>
    /// Converts every file entry of the archive to Markdown and concatenates the results.
    /// </summary>
    /// <param name="stream">ZIP data; rewound to the start when seekable and left open afterwards.</param>
    /// <param name="streamInfo">Metadata; <c>FileName</c> is used for the document title.</param>
    /// <param name="cancellationToken">Checked before each entry and passed to inner converters.</param>
    /// <returns>The combined Markdown and the title <c>Content from {fileName}</c>.</returns>
    /// <exception cref="FileConversionException">The archive is invalid or conversion failed.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled.</exception>
    public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default)
    {
        try
        {
            // Reset stream position
            if (stream.CanSeek)
            {
                stream.Position = 0;
            }

            var fileName = streamInfo.FileName ?? "archive.zip";
            var title = $"Content from {fileName}";

            var markdown = new StringBuilder();
            markdown.Append($"# {title}");
            // Remember where the heading text ends so the summary can be appended to that
            // line later, independent of how long Environment.NewLine is on this platform.
            var headingEnd = markdown.Length;
            markdown.AppendLine();
            markdown.AppendLine();

            using var archive = new ZipArchive(stream, ZipArchiveMode.Read, leaveOpen: true);

            // Directory entries are not files; exclude them from both the loop and the total.
            var fileEntries = archive.Entries
                .Where(e => !IsDirectoryEntry(e))
                .OrderBy(e => e.FullName)
                .ToList();

            var processedFiles = 0;
            foreach (var entry in fileEntries)
            {
                cancellationToken.ThrowIfCancellationRequested();

                try
                {
                    await ProcessZipEntry(entry, markdown, cancellationToken);
                    processedFiles++;
                }
                catch (Exception ex) when (!IsCancellation(ex, cancellationToken))
                {
                    // Report the error in the output but continue processing other files
                    markdown.AppendLine($"## File: {entry.FullName}");
                    markdown.AppendLine();
                    markdown.AppendLine($"*Error processing file: {ex.Message}*");
                    markdown.AppendLine();
                }
            }

            if (processedFiles == 0)
            {
                markdown.AppendLine("*No files could be processed from this archive.*");
            }
            else
            {
                markdown.Insert(headingEnd, $" ({processedFiles} of {fileEntries.Count} files processed)");
            }

            return new DocumentConverterResult(
                markdown: markdown.ToString().TrimEnd(),
                title: title
            );
        }
        catch (InvalidDataException ex)
        {
            throw new FileConversionException($"Invalid ZIP file format: {ex.Message}", ex);
        }
        catch (Exception ex) when (ex is not MarkItDownException && !IsCancellation(ex, cancellationToken))
        {
            throw new FileConversionException($"Failed to convert ZIP file: {ex.Message}", ex);
        }
    }

    /// <summary>
    /// Appends the Markdown section for a single archive entry: a heading, basic metadata,
    /// and either the converted content or a note explaining why there is none.
    /// </summary>
    private async Task ProcessZipEntry(ZipArchiveEntry entry, StringBuilder markdown, CancellationToken cancellationToken)
    {
        markdown.AppendLine($"## File: {entry.FullName}");
        markdown.AppendLine();

        // Add basic file information
        if (entry.Length > 0)
        {
            markdown.AppendLine($"**Size:** {FileUtilities.FormatFileSize(entry.Length)}");
        }

        if (entry.LastWriteTime != DateTimeOffset.MinValue)
        {
            // Invariant culture keeps the timestamp identical regardless of the host's locale.
            var lastModified = entry.LastWriteTime.ToString(
                "yyyy-MM-dd HH:mm:ss",
                System.Globalization.CultureInfo.InvariantCulture);
            markdown.AppendLine($"**Last Modified:** {lastModified}");
        }

        markdown.AppendLine();

        // Skip empty files
        if (entry.Length == 0)
        {
            markdown.AppendLine("*Empty file*");
            markdown.AppendLine();
            return;
        }

        // Skip very large files to avoid memory issues
        if (entry.Length > MaxEntrySizeBytes)
        {
            markdown.AppendLine($"*File too large to process ({FileUtilities.FormatFileSize(entry.Length)})*");
            markdown.AppendLine();
            return;
        }

        try
        {
            using var entryStream = entry.Open();
            using var memoryStream = new MemoryStream();

            // Copy to memory stream so we can seek
            await entryStream.CopyToAsync(memoryStream, cancellationToken);
            memoryStream.Position = 0;

            // Create StreamInfo for the file
            var fileExtension = Path.GetExtension(entry.Name);
            var fileStreamInfo = new StreamInfo(
                mimeType: MimeMapping.GetMimeType(fileExtension),
                extension: fileExtension,
                charset: null,
                fileName: entry.Name,
                url: null
            );

            // Try to find a suitable converter
            var converter = FindConverter(memoryStream, fileStreamInfo, cancellationToken);
            if (converter is null)
            {
                markdown.AppendLine($"*No converter available for file type: {fileExtension}*");
            }
            else
            {
                memoryStream.Position = 0;
                var result = await converter.ConvertAsync(memoryStream, fileStreamInfo, cancellationToken);

                if (string.IsNullOrWhiteSpace(result.Markdown))
                {
                    markdown.AppendLine("*File processed but no content extracted*");
                }
                else
                {
                    markdown.AppendLine(result.Markdown);
                }
            }
        }
        catch (Exception ex) when (!IsCancellation(ex, cancellationToken))
        {
            markdown.AppendLine($"*Error processing file: {ex.Message}*");
        }

        markdown.AppendLine();
    }

    /// <summary>
    /// Returns the first inner converter, in ascending <c>Priority</c> order, that accepts the
    /// stream, or null when none does. A converter whose <c>Accepts</c> throws is skipped.
    /// </summary>
    private IDocumentConverter? FindConverter(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken)
    {
        foreach (var converter in _innerConverters.OrderBy(c => c.Priority))
        {
            try
            {
                if (stream.CanSeek)
                {
                    stream.Position = 0;
                }

                if (converter.Accepts(stream, streamInfo, cancellationToken))
                {
                    return converter;
                }
            }
            catch (Exception ex) when (!IsCancellation(ex, cancellationToken))
            {
                // Continue to next converter if this one fails
            }
        }

        return null;
    }

    /// <summary>
    /// True when the entry name ends with a path separator, which marks a directory entry.
    /// </summary>
    private static bool IsDirectoryEntry(ZipArchiveEntry entry) =>
        entry.FullName.EndsWith("/", StringComparison.Ordinal)
        || entry.FullName.EndsWith("\\", StringComparison.Ordinal);

    /// <summary>
    /// True when <paramref name="exception"/> reports a cancellation that the caller actually
    /// requested through <paramref name="cancellationToken"/>. Such exceptions must propagate
    /// instead of being reported as a conversion error.
    /// </summary>
    private static bool IsCancellation(Exception exception, CancellationToken cancellationToken) =>
        exception is OperationCanceledException && cancellationToken.IsCancellationRequested;

    /// <summary>
    /// Moves the stream back to <paramref name="position"/>, ignoring failures.
    /// </summary>
    private static void TryRestorePosition(Stream stream, long position)
    {
        try
        {
            stream.Position = position;
        }
        catch (Exception)
        {
            // Best effort: called from a finally block, which must not throw.
        }
    }
}