diff --git a/README.md b/README.md
index d1b360b..c926326 100644
--- a/README.md
+++ b/README.md
@@ -17,6 +17,8 @@ a. From command line:
 docx2txt file.docx
 # extract text and images
 docx2txt -i /tmp/img_dir file.docx
+# extract text and split on page breaks
+docx2txt -s file.docx
 ```
 b. From python:
 ```python
@@ -26,5 +28,7 @@ import docx2txt
 text = docx2txt.process("file.docx")
 
 # extract text and write images in /tmp/img_dir
-text = docx2txt.process("file.docx", "/tmp/img_dir")
+text = docx2txt.process("file.docx", img_dir="/tmp/img_dir")
+# extract text, split on page breaks, and write images in /tmp/img_dir
+text = docx2txt.process("file.docx", split_pages=True, img_dir="/tmp/img_dir")
 ```
diff --git a/docx2txt/docx2txt.py b/docx2txt/docx2txt.py
index 0dac072..8977b13 100755
--- a/docx2txt/docx2txt.py
+++ b/docx2txt/docx2txt.py
@@ -2,6 +2,7 @@
 
 import argparse
 import re
+from typing import List, Literal, Union, cast, overload
 import xml.etree.ElementTree as ET
 import zipfile
 import os
@@ -16,6 +17,8 @@ def process_args():
                                                  'to extract text and images '
                                                  'from docx files.')
     parser.add_argument("docx", help="path of the docx file")
+    parser.add_argument('-s', '--split_pages', action='store_true',
+                        help='Split text on page breaks')
     parser.add_argument('-i', '--img_dir', help='path of directory '
                                                 'to extract images')
 
@@ -47,7 +50,19 @@ def qn(tag):
     return '{{{}}}{}'.format(uri, tagroot)
 
 
-def xml2text(xml):
+@overload
+def xml2text(xml, split_pages: Literal[True]) -> List[str]: ...
+
+
+@overload
+def xml2text(xml, split_pages: Literal[False] = ...) -> str: ...
+
+
+@overload
+def xml2text(xml, split_pages: bool) -> Union[str, List[str]]: ...
+
+
+def xml2text(xml, split_pages: bool = False):
     """
     A string representing the textual content of this run, with content
     child elements like ``<w:tab/>`` translated to their Python
@@ -55,6 +70,12 @@ def xml2text(xml):
     Adapted from: https://github.com/python-openxml/python-docx/
+
+    When ``split_pages`` is true the text is split at every page break
+    (``<w:br w:type="page"/>``) and a list with one string per page is
+    returned; otherwise a single string is returned.
     """
     text = u''
+    texts = []
+
     root = ET.fromstring(xml)
     for child in root.iter():
         if child.tag == qn('w:t'):
@@ -62,15 +83,61 @@ def xml2text(xml):
             text += t_text if t_text is not None else ''
         elif child.tag == qn('w:tab'):
             text += '\t'
+        elif (split_pages and child.tag == qn('w:br')
+              and child.get(qn('w:type')) == 'page'):
+            # a <w:br/> without w:type="page" is a line break (handled below)
+            texts.append(text)
+            text = u''
         elif child.tag in (qn('w:br'), qn('w:cr')):
             text += '\n'
         elif child.tag == qn("w:p"):
             text += '\n\n'
+    if split_pages:
+        # whatever follows the last page break is the final page
+        texts.append(text)
+        return texts
     return text
 
 
-def process(docx, img_dir=None):
-    text = u''
+def strip_list(lst: list):
+    """
+    Remove leading and trailing falsy items (e.g. empty strings) from
+    ``lst`` in place and return the same list.
+    """
+    while lst and not lst[-1]:
+        lst.pop()
+
+    while lst and not lst[0]:
+        lst.pop(0)
+
+    return lst
+
+
+@overload
+def process(docx, split_pages: Literal[True], img_dir=None) -> List[str]: ...
+
+
+@overload
+def process(docx, split_pages: Literal[False] = ..., img_dir=None) -> str: ...
+
+
+@overload
+def process(docx, split_pages: bool, img_dir=None) -> Union[str, List[str]]: ...
+
+
+def process(docx, split_pages=False, img_dir=None):
+    """
+    Extract the text of the docx file ``docx``: headers first, then the
+    main document, then footers.
+
+    Returns one stripped string, or, when ``split_pages`` is true, a list
+    of stripped strings split on page breaks; every header, document and
+    footer part starts a new entry, and empty leading and trailing
+    entries are dropped.  When ``img_dir`` is given, image files from
+    the archive are written into that directory.
+    """
+    # one xml2text() result per header/document/footer part, in reading order
+    parts: List[Union[str, List[str]]] = []
 
     # unzip the docx in memory
     zipf = zipfile.ZipFile(docx)
@@ -81,18 +148,18 @@ def process(docx, img_dir=None):
     header_xmls = 'word/header[0-9]*.xml'
     for fname in filelist:
         if re.match(header_xmls, fname):
-            text += xml2text(zipf.read(fname))
+            parts.append(xml2text(zipf.read(fname), split_pages))
 
     # get main text
     doc_xml = 'word/document.xml'
-    text += xml2text(zipf.read(doc_xml))
+    parts.append(xml2text(zipf.read(doc_xml), split_pages))
 
     # get footer text
     # there can be 3 footer files in the zip
     footer_xmls = 'word/footer[0-9]*.xml'
     for fname in filelist:
         if re.match(footer_xmls, fname):
-            text += xml2text(zipf.read(fname))
+            parts.append(xml2text(zipf.read(fname), split_pages))
 
     if img_dir is not None:
         # extract images
@@ -104,10 +171,16 @@ def process(docx, img_dir=None):
                     dst_f.write(zipf.read(fname))
 
     zipf.close()
-    return text.strip()
+    if split_pages:
+        pages = [page for part in cast(List[List[str]], parts) for page in part]
+        return [page.strip() for page in strip_list(pages)]
+    return ''.join(cast(List[str], parts)).strip()
 
 
 if __name__ == '__main__':
     args = process_args()
-    text = process(args.docx, args.img_dir)
-    sys.stdout.write(text.encode('utf-8'))
+    text = process(args.docx, args.split_pages, args.img_dir)
+    if args.split_pages:
+        # pages come back as a list; separate them with form feeds on output
+        text = '\f'.join(text)
+    # sys.stdout.write() only accepts str; bytes go through the binary layer
+    sys.stdout.buffer.write(text.encode('utf-8'))