Skip to content

Commit 9f3e06e

Browse files
committed
Add support for TAR file handling in EPUB processing
1 parent 64960f9 commit 9f3e06e

1 file changed

Lines changed: 37 additions & 1 deletion

File tree

reader3.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
import os
66
import pickle
77
import shutil
8+
import tarfile
9+
import zipfile
10+
import io
811
from dataclasses import dataclass, field
912
from typing import List, Dict, Optional, Any
1013
from datetime import datetime
@@ -69,6 +72,31 @@ class Book:
6972

7073
# --- Utilities ---
7174

75+
def _convert_tar_to_zip_in_memory(tar_path: str) -> io.BytesIO:
    """
    Convert a tar archive (plain or compressed) into a ZIP archive in memory.

    Ebooklib requires a ZIP-format EPUB, so a tarred EPUB tree is repacked
    into an in-memory ZIP that can be handed to the reader instead of a path.

    If the EPUB tree is wrapped in a directory (e.g. ``book/mimetype``), that
    wrapper prefix is stripped so entries sit at the ZIP root. The wrapper is
    located via the shallowest regular file named ``mimetype``; deeper files
    that merely share the name are ignored. Only regular files are copied;
    directories, links and other special members are skipped.

    Args:
        tar_path: Path to the tar archive. Compression is auto-detected.

    Returns:
        A ``BytesIO`` holding the complete ZIP archive, positioned at offset 0.

    Raises:
        tarfile.TarError: If the archive cannot be read as a tar file.
        OSError: If ``tar_path`` cannot be opened.
    """
    zip_buffer = io.BytesIO()

    with tarfile.open(tar_path, "r:*") as tar:
        file_members = [m for m in tar.getmembers() if m.isfile()]

        # Tar member names always use '/' as the separator, so split on it
        # directly rather than going through the platform's os.path rules.
        mimetype_paths = [
            m.name for m in file_members
            if m.name.rpartition("/")[2] == "mimetype"
        ]
        root = ""
        if mimetype_paths:
            # Prefer the shallowest match: a nested file that happens to be
            # called 'mimetype' must not be mistaken for the EPUB root marker.
            shallowest = min(mimetype_paths, key=lambda name: name.count("/"))
            parent = shallowest.rpartition("/")[0]
            if parent:
                root = parent + "/"

        entries = []
        for member in file_members:
            name = member.name
            if root and name.startswith(root):
                name = name[len(root):]
            entries.append((name, member))

        # EPUB OCF expects 'mimetype' to be the first entry of the container.
        # sorted() is stable, so every other entry keeps its tar order.
        entries.sort(key=lambda entry: entry[0] != "mimetype")

        with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
            for arcname, member in entries:
                extracted = tar.extractfile(member)
                if extracted is None:
                    continue
                # Close each extracted handle promptly instead of leaking it.
                with extracted:
                    payload = extracted.read()
                # 'mimetype' must be stored uncompressed; deflate the rest.
                if arcname == "mimetype":
                    compression = zipfile.ZIP_STORED
                else:
                    compression = zipfile.ZIP_DEFLATED
                zip_file.writestr(arcname, payload, compress_type=compression)

    zip_buffer.seek(0)
    return zip_buffer
98+
99+
72100
def clean_html_content(soup: BeautifulSoup) -> BeautifulSoup:
73101

74102
# Remove dangerous/useless tags
@@ -176,7 +204,15 @@ def process_epub(epub_path: str, output_dir: str) -> Book:
176204

177205
# 1. Load Book
178206
print(f"Loading {epub_path}...")
179-
book = epub.read_epub(epub_path)
207+
208+
# Handle TAR files by converting them to ZIP in memory
209+
if tarfile.is_tarfile(epub_path):
210+
print(f"Detected TAR format for {epub_path}, converting to ZIP in memory...")
211+
epub_resource = _convert_tar_to_zip_in_memory(epub_path)
212+
else:
213+
epub_resource = epub_path
214+
215+
book = epub.read_epub(epub_resource)
180216

181217
# 2. Extract Metadata
182218
metadata = extract_metadata_robust(book)

0 commit comments

Comments
 (0)