|
5 | 5 | import os |
6 | 6 | import pickle |
7 | 7 | import shutil |
| 8 | +import tarfile |
| 9 | +import zipfile |
| 10 | +import io |
8 | 11 | from dataclasses import dataclass, field |
9 | 12 | from typing import List, Dict, Optional, Any |
10 | 13 | from datetime import datetime |
@@ -69,6 +72,31 @@ class Book: |
69 | 72 |
|
70 | 73 | # --- Utilities --- |
71 | 74 |
|
def _convert_tar_to_zip_in_memory(tar_path: str) -> io.BytesIO:
    """
    Converts a .tar (or compressed tar) EPUB into a .zip archive held in memory.

    Ebooklib requires a ZIP-format EPUB, so the tar members are repacked
    into a ZIP whose entry names are relative to the EPUB root.

    The EPUB root is the directory holding the shallowest 'mimetype' file;
    if the archive has no 'mimetype', the directory holding
    'META-INF/container.xml' is used instead. Without either marker the
    member names are kept as they are (after normalisation).

    Args:
        tar_path: Path to a tar archive; any compression that tarfile can
            detect transparently is accepted.

    Returns:
        A BytesIO positioned at offset 0 that contains the ZIP archive.

    Raises:
        tarfile.TarError: If tar_path cannot be read as a tar archive.
        OSError: If tar_path cannot be opened.
    """
    zip_buffer = io.BytesIO()

    with tarfile.open(tar_path, "r:*") as tar:
        # Tar member names always use '/'. Drop empty and '.' components so
        # './META-INF/container.xml' and 'META-INF/container.xml' are the
        # same entry for the ZIP reader.
        named = []
        for member in tar.getmembers():
            if not member.isfile():
                continue
            clean = "/".join(
                part for part in member.name.split("/") if part not in ("", ".")
            )
            if clean:
                named.append((clean, member))

        # Locate the EPUB root from a well-known marker file, preferring the
        # shallowest match so a nested asset with the same name is ignored.
        root = ""
        for marker in ("mimetype", "META-INF/container.xml"):
            hits = [
                name for name, _ in named
                if name == marker or name.endswith("/" + marker)
            ]
            if hits:
                shallowest = min(hits, key=lambda name: name.count("/"))
                root = shallowest[: len(shallowest) - len(marker)]
                break

        # A tar may hold several versions of one path; the last one wins.
        # Keying by archive name keeps that version and avoids duplicate
        # ZIP entries.
        entries = {}
        for name, member in named:
            arcname = name[len(root):] if name.startswith(root) else name
            if arcname:
                entries[arcname] = member

        # The EPUB container format expects 'mimetype' to be the first entry
        # and stored uncompressed; the stable sort keeps every other entry
        # in tar order.
        ordered = sorted(entries.items(), key=lambda item: item[0] != "mimetype")

        with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
            for arcname, member in ordered:
                extracted = tar.extractfile(member)
                if extracted is None:
                    continue
                with extracted:
                    data = extracted.read()
                compression = (
                    zipfile.ZIP_STORED if arcname == "mimetype"
                    else zipfile.ZIP_DEFLATED
                )
                zip_file.writestr(arcname, data, compress_type=compression)

    zip_buffer.seek(0)
    return zip_buffer
| 98 | + |
| 99 | + |
72 | 100 | def clean_html_content(soup: BeautifulSoup) -> BeautifulSoup: |
73 | 101 |
|
74 | 102 | # Remove dangerous/useless tags |
@@ -176,7 +204,15 @@ def process_epub(epub_path: str, output_dir: str) -> Book: |
176 | 204 |
|
177 | 205 | # 1. Load Book |
178 | 206 | print(f"Loading {epub_path}...") |
179 | | - book = epub.read_epub(epub_path) |
| 207 | + |
| 208 | + # Handle TAR files by converting them to ZIP in memory |
| 209 | + if tarfile.is_tarfile(epub_path): |
| 210 | + print(f"Detected TAR format for {epub_path}, converting to ZIP in memory...") |
| 211 | + epub_resource = _convert_tar_to_zip_in_memory(epub_path) |
| 212 | + else: |
| 213 | + epub_resource = epub_path |
| 214 | + |
| 215 | + book = epub.read_epub(epub_resource) |
180 | 216 |
|
181 | 217 | # 2. Extract Metadata |
182 | 218 | metadata = extract_metadata_robust(book) |
|
0 commit comments