Skip to content

Commit 52568b7

Browse files
authored
Split a page range to a single document (#66)
* Split a page range to a single document * Fix styling * Add tests --------- Co-authored-by: avvertix <5672748+avvertix@users.noreply.github.com>
1 parent 6d88942 commit 52568b7

4 files changed

Lines changed: 524 additions & 27 deletions

File tree

src/parxy_cli/commands/pdf.py

Lines changed: 112 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ def split(
156156
typer.Option(
157157
'--output',
158158
'-o',
159-
help='Output directory for split files. If not specified, creates a folder next to the input file.',
159+
help='Output path. Without --combine: output directory for split files (default: folder next to input). With --combine: output file path (default: {stem}_pages_{from}-{to}.pdf next to input).',
160160
),
161161
] = None,
162162
prefix: Annotated[
@@ -167,6 +167,20 @@ def split(
167167
help='Prefix for output filenames. If not specified, uses the input filename.',
168168
),
169169
] = None,
170+
pages: Annotated[
171+
Optional[str],
172+
typer.Option(
173+
'--pages',
174+
help='Page range to extract (1-based). Examples: "1" (single page), "1:3" (pages 1-3), ":3" (up to page 3), "3:" (from page 3). If not specified, all pages are extracted.',
175+
),
176+
] = None,
177+
combine: Annotated[
178+
bool,
179+
typer.Option(
180+
'--combine',
181+
help='Combine extracted pages into a single PDF instead of one file per page.',
182+
),
183+
] = False,
170184
):
171185
"""
172186
Split a PDF file into individual pages.
@@ -175,6 +189,12 @@ def split(
175189
176190
Output files are named: {prefix}_page_{number}.pdf
177191
192+
Page ranges use 1-based indexing:
193+
- "1" - only page 1
194+
- "1:3" - pages 1 to 3 (inclusive)
195+
- ":3" - from first page to page 3
196+
- "3:" - from page 3 to the end
197+
178198
Examples:
179199
180200
# Split into individual pages (default behavior)
@@ -188,6 +208,18 @@ def split(
188208
189209
# Split with custom output and prefix
190210
parxy pdf:split report.pdf -o ./pages -p page
211+
212+
# Extract only pages 2 to 5
213+
parxy pdf:split document.pdf --pages 2:5
214+
215+
# Extract a single page
216+
parxy pdf:split document.pdf --pages 3
217+
218+
# Combine pages 2-5 into a single PDF
219+
parxy pdf:split document.pdf --pages 2:5 --combine
220+
221+
# Combine with custom output path
222+
parxy pdf:split document.pdf --pages 2:5 --combine -o extracted.pdf
191223
"""
192224
console.action('Split PDF file', space_after=False)
193225

@@ -201,15 +233,35 @@ def split(
201233
console.error(f'Input file must be a PDF: {input_file}', panel=True)
202234
raise typer.Exit(1)
203235

204-
# Determine output directory
205-
if output_dir is None:
206-
# Create a folder next to the input file
207-
output_path = input_path.parent / f'{input_path.stem}_split'
208-
else:
209-
output_path = Path(output_dir)
236+
# Parse --pages option into 0-based from_page / to_page
237+
from_page = None
238+
to_page = None
239+
if pages is not None:
240+
try:
241+
if ':' in pages:
242+
start_str, end_str = pages.split(':', 1)
243+
from_page = (int(start_str) - 1) if start_str.strip() else None
244+
to_page = (int(end_str) - 1) if end_str.strip() else None
245+
else:
246+
page_num = int(pages) - 1
247+
from_page = page_num
248+
to_page = page_num
249+
except ValueError:
250+
console.error(
251+
f'Invalid --pages value: "{pages}". Use formats like "1", "1:3", ":3", or "3:".',
252+
panel=True,
253+
)
254+
raise typer.Exit(1)
210255

211-
# Create output directory
212-
output_path.mkdir(parents=True, exist_ok=True)
256+
# Determine output directory (only relevant when not combining)
257+
if not combine:
258+
if output_dir is None:
259+
output_path = input_path.parent / f'{input_path.stem}_split'
260+
else:
261+
output_path = Path(output_dir)
262+
output_path.mkdir(parents=True, exist_ok=True)
263+
else:
264+
output_path = None # unused in combine mode
213265

214266
# Determine filename prefix
215267
if prefix is None:
@@ -228,27 +280,64 @@ def split(
228280
console.error('PDF file is empty (no pages)', panel=True)
229281
raise typer.Exit(1)
230282

283+
# Determine effective range for display
284+
effective_from = (from_page if from_page is not None else 0) + 1
285+
effective_to = (to_page if to_page is not None else total_pages - 1) + 1
286+
extract_count = effective_to - effective_from + 1
287+
231288
console.info(
232289
f'Processing PDF with {total_pages} page{"s" if total_pages > 1 else ""}'
233290
)
234-
console.info(
235-
f'Splitting into {total_pages} file{"s" if total_pages > 1 else ""}'
236-
)
291+
if pages is not None:
292+
console.info(
293+
f'Extracting pages {effective_from}-{effective_to} ({extract_count} page{"s" if extract_count > 1 else ""})'
294+
)
295+
296+
if combine:
297+
# Determine output file path
298+
if output_dir is not None:
299+
combined_output = Path(output_dir)
300+
if combined_output.suffix.lower() != '.pdf':
301+
combined_output = combined_output.with_suffix('.pdf')
302+
else:
303+
range_label = (
304+
f'{effective_from}-{effective_to}'
305+
if effective_from != effective_to
306+
else str(effective_from)
307+
)
308+
combined_output = (
309+
input_path.parent / f'{input_path.stem}_pages_{range_label}.pdf'
310+
)
237311

238-
with console.shimmer(f'Splitting PDF...'):
239-
# Use service to split PDF
240-
output_files = PdfService.split_pdf(input_path, output_path, prefix)
312+
with console.shimmer('Extracting pages into single PDF...'):
313+
PdfService.extract_pages(
314+
input_path, combined_output, from_page, to_page
315+
)
241316

242-
# Display created files
243-
for idx, output_file in enumerate(output_files):
244-
console.print(
245-
f'[faint]⎿ [/faint] Created {output_file.name} (page {idx + 1})'
317+
console.newline()
318+
console.success(
319+
f'Successfully extracted {extract_count} page{"s" if extract_count > 1 else ""} into {combined_output}'
320+
)
321+
else:
322+
console.info(
323+
f'Splitting into {extract_count} file{"s" if extract_count > 1 else ""}'
324+
)
325+
326+
with console.shimmer('Splitting PDF...'):
327+
output_files = PdfService.split_pdf(
328+
input_path, output_path, prefix, from_page, to_page
246329
)
247330

248-
console.newline()
249-
console.success(
250-
f'Successfully split PDF into {len(output_files)} file{"s" if len(output_files) > 1 else ""} in {output_path}'
251-
)
331+
for output_file in output_files:
332+
page_num = int(output_file.stem.rsplit('_', 1)[-1])
333+
console.print(
334+
f'[faint]⎿ [/faint] Created {output_file.name} (page {page_num})'
335+
)
336+
337+
console.newline()
338+
console.success(
339+
f'Successfully split PDF into {len(output_files)} file{"s" if len(output_files) > 1 else ""} in {output_path}'
340+
)
252341

253342
except (ValueError, FileNotFoundError) as e:
254343
console.error(f'Error during split: {str(e)}')

src/parxy_core/services/pdf_service.py

Lines changed: 93 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -266,21 +266,29 @@ def merge_pdfs(
266266
merged_pdf.close()
267267

268268
@staticmethod
269-
def split_pdf(input_path: Path, output_dir: Path, prefix: str) -> List[Path]:
269+
def split_pdf(
270+
input_path: Path,
271+
output_dir: Path,
272+
prefix: str,
273+
from_page: Optional[int] = None,
274+
to_page: Optional[int] = None,
275+
) -> List[Path]:
270276
"""
271277
Split a PDF file into individual pages.
272278
273279
Args:
274280
input_path: Path to the PDF file to split
275281
output_dir: Directory where split PDFs should be saved
276282
prefix: Prefix for output filenames
283+
from_page: First page to extract (0-based, inclusive). None means first page.
284+
to_page: Last page to extract (0-based, inclusive). None means last page.
277285
278286
Returns:
279287
List of paths to the created PDF files
280288
281289
Raises:
282290
FileNotFoundError: If input PDF doesn't exist
283-
ValueError: If PDF is empty or invalid
291+
ValueError: If PDF is empty or page range is invalid
284292
"""
285293
if not input_path.is_file():
286294
raise FileNotFoundError(f'PDF file not found: {input_path}')
@@ -292,14 +300,34 @@ def split_pdf(input_path: Path, output_dir: Path, prefix: str) -> List[Path]:
292300
pdf.close()
293301
raise ValueError('PDF file is empty (no pages)')
294302

303+
start = from_page if from_page is not None else 0
304+
end = to_page if to_page is not None else total_pages - 1
305+
306+
if start < 0 or start >= total_pages:
307+
pdf.close()
308+
raise ValueError(
309+
f'Invalid page range: page {start + 1} does not exist (PDF has {total_pages} pages)'
310+
)
311+
312+
if end < 0 or end >= total_pages:
313+
pdf.close()
314+
raise ValueError(
315+
f'Invalid page range: page {end + 1} does not exist (PDF has {total_pages} pages)'
316+
)
317+
318+
if start > end:
319+
pdf.close()
320+
raise ValueError(
321+
f'Invalid page range: start page {start + 1} > end page {end + 1}'
322+
)
323+
295324
# Ensure output directory exists
296325
output_dir.mkdir(parents=True, exist_ok=True)
297326

298327
output_files = []
299328

300329
try:
301-
# Split into individual pages
302-
for page_num in range(total_pages):
330+
for page_num in range(start, end + 1):
303331
output_file = output_dir / f'{prefix}_page_{page_num + 1}.pdf'
304332
output_pdf = pymupdf.open()
305333
output_pdf.insert_pdf(pdf, from_page=page_num, to_page=page_num)
@@ -311,6 +339,67 @@ def split_pdf(input_path: Path, output_dir: Path, prefix: str) -> List[Path]:
311339

312340
return output_files
313341

342+
@staticmethod
def extract_pages(
    input_path: Path,
    output_path: Path,
    from_page: Optional[int] = None,
    to_page: Optional[int] = None,
) -> None:
    """
    Extract a page range from a PDF into a single output PDF.

    The source document is always closed before returning, including when
    validation fails or the output cannot be written.

    Args:
        input_path: Path to the source PDF file
        output_path: Path where the extracted PDF should be saved. Missing
            parent directories are created.
        from_page: First page to extract (0-based, inclusive). None means first page.
        to_page: Last page to extract (0-based, inclusive). None means last page.

    Raises:
        FileNotFoundError: If input PDF doesn't exist
        ValueError: If PDF is empty or page range is invalid
    """
    if not input_path.is_file():
        raise FileNotFoundError(f'PDF file not found: {input_path}')

    pdf = pymupdf.open(input_path)
    try:
        total_pages = pdf.page_count

        if total_pages == 0:
            raise ValueError('PDF file is empty (no pages)')

        start = from_page if from_page is not None else 0
        end = to_page if to_page is not None else total_pages - 1

        # Validate the start bound first, then the end bound, so the error
        # message names the first offending page (reported 1-based).
        for bound in (start, end):
            if bound < 0 or bound >= total_pages:
                raise ValueError(
                    f'Invalid page range: page {bound + 1} does not exist (PDF has {total_pages} pages)'
                )

        if start > end:
            raise ValueError(
                f'Invalid page range: start page {start + 1} > end page {end + 1}'
            )

        output_path.parent.mkdir(parents=True, exist_ok=True)

        output_pdf = pymupdf.open()
        try:
            output_pdf.insert_pdf(pdf, from_page=start, to_page=end)
            output_pdf.save(str(output_path))
        finally:
            # Release the in-memory output document even if insert/save fails.
            output_pdf.close()
    finally:
        # Single cleanup point for the source document on every exit path.
        pdf.close()
402+
314403
@staticmethod
315404
def optimize_pdf(
316405
input_path: Path,

0 commit comments

Comments
 (0)