Merge pull request #5 from n7space/feature-mbep#314-m2docx

Lurkerpas · web-flow · commit 368633a3e209 · 2025-12-18T11:15:56.000+01:00
Feature MBEP#314 m2docx
diff --git a/LICENSE.MD2DOCX b/LICENSE.MD2DOCX
@@ -0,0 +1,23 @@
+This license applies to the templateprocessor/md2docx.py file.
+
+MIT License
+
+Copyright (c) 2024 Shlok T
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/examples/generate_ecss_demo.sh b/examples/generate_ecss_demo.sh
@@ -1,26 +1,21 @@
 #!/bin/bash
 mkdir -p output
-
-template-processor --verbosity info --value TARGET=ASW --iv demo-project/interfaceview.xml --dv demo-project/deploymentview.dv.xml -o output -t ../data/ecss-template/ecss-e-st-40c_4_1_software_static_architecture.tmplt
-pandoc --pdf-engine=pdfroff --output=output/ecss-e-st-40c_4_1_software_static_architecture.pdf output/ecss-e-st-40c_4_1_software_static_architecture.md
-
-template-processor --verbosity info --value TARGET=ASW --iv demo-project/interfaceview.xml --dv demo-project/deploymentview.dv.xml -o output -t ../data/ecss-template/ecss-e-st-40c_4_2_software_dynamic_architecture.tmplt
-pandoc --pdf-engine=pdfroff --output=output/ecss-e-st-40c_4_2_software_dynamic_architecture.pdf output/ecss-e-st-40c_4_2_software_dynamic_architecture.md
-
-template-processor --verbosity info --value TARGET=ASW --iv demo-project/interfaceview.xml --dv demo-project/deploymentview.dv.xml -o output -t ../data/ecss-template/ecss-e-st-40c_4_4_interfaces_context.tmplt
-pandoc --pdf-engine=pdfroff --output=output/ecss-e-st-40c_4_4_interfaces_context.pdf output/ecss-e-st-40c_4_4_interfaces_context.md
-
-template-processor --verbosity info --value TARGET=ASW --iv demo-project/interfaceview.xml --dv demo-project/deploymentview.dv.xml -o output -t ../data/ecss-template/ecss-e-st-40c_5_2_overall_architecture.tmplt
-pandoc --pdf-engine=pdfroff --output=output/ecss-e-st-40c_5_2_overall_architecture.pdf output/ecss-e-st-40c_5_2_overall_architecture.md
-
-template-processor --verbosity info --value TARGET=ASW --iv demo-project/interfaceview.xml --dv demo-project/deploymentview.dv.xml -o output -t ../data/ecss-template/ecss-e-st-40c_5_3_software_components_design.tmplt
-pandoc --pdf-engine=pdfroff --output=output/ecss-e-st-40c_5_3_software_components_design.pdf output/ecss-e-st-40c_5_3_software_components_design.md
-
-template-processor --verbosity info --value TARGET=ASW --iv demo-project/interfaceview.xml --dv demo-project/deploymentview.dv.xml -o output -t ../data/ecss-template/ecss-e-st-40c_5_4_aspects_of_each_component.tmplt
-pandoc --pdf-engine=pdfroff --output=output/ecss-e-st-40c_5_4_aspects_of_each_component.pdf output/ecss-e-st-40c_5_4_aspects_of_each_component.md
-
-template-processor --verbosity info --value TARGET=ASW --iv demo-project/interfaceview.xml --dv demo-project/deploymentview.dv.xml -o output -t ../data/ecss-template/ecss-e-st-40c_5_5_internal_interface_design.tmplt
-pandoc --pdf-engine=pdfroff --output=output/ecss-e-st-40c_5_5_internal_interface_design.pdf output/ecss-e-st-40c_5_5_internal_interface_design.md
-
-template-processor --verbosity info --value TARGET=ASW --iv demo-project/interfaceview.xml --dv demo-project/deploymentview.dv.xml -o output -t ../data/ecss-template/ecss-e-st-40c_6_requirement_traceability.tmplt
-pandoc --pdf-engine=pdfroff --output=output/ecss-e-st-40c_6_requirement_traceability.pdf output/ecss-e-st-40c_6_requirement_traceability.md
+# Base names of the ECSS templates to process (no directory, no extension)
+templates=(
+    "ecss-e-st-40c_4_1_software_static_architecture"
+    "ecss-e-st-40c_4_2_software_dynamic_architecture"
+    "ecss-e-st-40c_4_4_interfaces_context"
+    "ecss-e-st-40c_5_2_overall_architecture"
+    "ecss-e-st-40c_5_3_software_components_design"
+    "ecss-e-st-40c_5_4_aspects_of_each_component"
+    "ecss-e-st-40c_5_5_internal_interface_design"
+    "ecss-e-st-40c_6_requirement_traceability"
+)
+
+# Arguments shared by every template-processor invocation below
+common_args=(
+    --verbosity info
+    --value TARGET=ASW
+    --iv demo-project/interfaceview.xml
+    --dv demo-project/deploymentview.dv.xml
+    -o output
+)
+
+# For each template: run template-processor with the default postprocessing,
+# then with md2docx and md2html, and finally let pandoc turn the Markdown
+# output into a PDF. All expansions are quoted to avoid word splitting.
+for template in "${templates[@]}"; do
+    template_path="../data/ecss-template/${template}.tmplt"
+    template-processor "${common_args[@]}" -t "${template_path}"
+    template-processor "${common_args[@]}" -t "${template_path}" -p md2docx
+    template-processor "${common_args[@]}" -t "${template_path}" -p md2html
+    pandoc --pdf-engine=pdfroff "--output=output/${template}.pdf" "output/${template}.md"
+done
diff --git a/examples/generate_so_list.sh b/examples/generate_so_list.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
 mkdir -p output
 template-processor --verbosity info --system-objects ../data/events.csv -o output -t so_list.tmplt
-pandoc --pdf-engine=pdfroff --output=output/so_list.pdf output/so_list.md
+pandoc --pdf-engine=pdfroff --output=output/so_list.pdf output/so_list.md
+template-processor --verbosity info --system-objects ../data/events.csv -o output -t so_list.tmplt -p md2docx
diff --git a/requirements.txt b/requirements.txt
@@ -3,4 +3,6 @@
 pytest==7.4.2
 black==24.3.0
 mako==1.3.10
-
+python-docx==1.2.0
+beautifulsoup4==4.12.3
+markdown2==2.5.4
diff --git a/setup.py b/setup.py
@@ -30,7 +30,10 @@
     include_package_data=True,
     python_requires='>=3.8',
     install_requires=[
-        "mako==1.3.10"
+        "mako==1.3.10",
+        "python-docx==1.2.0",
+        "beautifulsoup4==4.12.3",
+        "markdown2==2.5.4"
     ],
     extras_require={
         'dev': [
diff --git a/templateprocessor/cli.py b/templateprocessor/cli.py
@@ -14,6 +14,13 @@
 from templateprocessor.soreader import SOReader
 from templateprocessor.dvreader import DVReader
 from templateprocessor.so import SystemObjectType
+from templateprocessor.postprocessor import (
+    PostprocessorType,
+    Md2docxPostprocessor,
+    Md2HtmlPostprocessor,
+    PassthroughPostprocessor,
+    Postprocessor,
+)
 
 
 def parse_arguments() -> argparse.Namespace:
@@ -88,7 +95,7 @@ def parse_arguments() -> argparse.Namespace:
     parser.add_argument(
         "-p",
         "--postprocess",
-        choices=["none", "md2docx"],
+        choices=["none", "md2docx", "md2html"],
         help="Output postprocessing",
         default="none",
     )
@@ -107,6 +114,16 @@ def get_log_level(level_str: str) -> int:
     return log_levels.get(level_str.lower(), logging.WARNING)
 
 
+def get_postprocessor_type(type_str: str) -> PostprocessorType:
+    """
+    Translate a postprocessor name into a PostprocessorType member.
+
+    The comparison is case-insensitive. A name that matches none of the
+    supported members yields PostprocessorType.NONE.
+    """
+    requested = type_str.lower()
+    supported = (
+        PostprocessorType.NONE,
+        PostprocessorType.MD2DOCX,
+        PostprocessorType.MD2HTML,
+    )
+    for candidate in supported:
+        if candidate.value == requested:
+            return candidate
+
+    return PostprocessorType.NONE
+
+
 def get_values_dictionary(values: list[str]) -> dict[str, str]:
     if not values or not isinstance(values, list):
         return {}
@@ -143,8 +160,10 @@ def read_sots(file_names: list[str]) -> dict[str, SystemObjectType]:
 
 def instantiate(
     instantiator: TemplateInstantiator,
+    postprocessor: Postprocessor,
     template_file: str,
     module_directory: str,
+    postprocessor_type: PostprocessorType,
     output_directory: str,
 ):
     try:
@@ -157,10 +176,9 @@ def instantiate(
         logging.debug(f"Instantiating template:\n {template}")
         instantiated_template = instantiator.instantiate(template, module_directory)
         logging.debug(f"Instantiation:\n {instantiated_template}")
-        output = Path(output_directory) / f"{name}.md"
-        logging.debug(f"Saving to {output}")
-        with open(output, "w") as f:
-            f.write(instantiated_template)
+        output = str(Path(output_directory) / f"{name}")
+        logging.debug(f"Postprocessing with {postprocessor_type}")
+        postprocessor.process(postprocessor_type, instantiated_template, output)
     except FileNotFoundError as e:
         logging.error(f"File not found: {e.filename}")
     except Exception as e:
@@ -173,6 +191,7 @@ def main():
     args = parse_arguments()
     logging_level = get_log_level(args.verbosity)
     logging.basicConfig(level=logging_level)
+    postprocessor_type = get_postprocessor_type(args.postprocess)
 
     logging.info("Template Processor")
     logging.debug(f"Interface View: {args.iv}")
@@ -182,6 +201,7 @@ def main():
     logging.debug(f"Templates: {args.template}")
     logging.debug(f"Output Directory: {args.output}")
     logging.debug(f"Module directory: {args.module_directory}")
+    logging.debug(f"Postprocessing: {postprocessor_type.value}")
 
     logging.info(f"Reading Interface View from {args.iv}")
     iv = IVReader().read(args.iv) if args.iv else InterfaceView()
@@ -198,10 +218,26 @@ def main():
     logging.info(f"Instantiating the TemplateInstantiator")
     instantiator = TemplateInstantiator(iv, dv, sots, values)
 
+    logging.info(f"Instantiating the Postprocessor")
+    postprocessor = Postprocessor(
+        {
+            PostprocessorType.NONE: PassthroughPostprocessor(),
+            PostprocessorType.MD2DOCX: Md2docxPostprocessor(),
+            PostprocessorType.MD2HTML: Md2HtmlPostprocessor(),
+        }
+    )
+
     if args.template:
         logging.info(f"Instantiating templates")
         for template_file in args.template:
-            instantiate(instantiator, template_file, args.module_directory, args.output)
+            instantiate(
+                instantiator,
+                postprocessor,
+                template_file,
+                args.module_directory,
+                postprocessor_type,
+                args.output,
+            )
 
     return 0
 
diff --git a/templateprocessor/md2docx.py b/templateprocessor/md2docx.py
@@ -0,0 +1,116 @@
+"""
+
+Markdown to DOCX conversion module extracted from md2docx-python project.
+
+Project address: https://github.com/shloktech/md2docx-python/
+Project LICENSE: LICENSE.MD2DOCX
+
+The code was extracted so that its API and features can be aligned with the needs of this project.
+Changes:
+- input is text, not file
+- markdown2 is used instead of markdown
+- table support is added via markdown2 extras and additional HTML processing
+
+"""
+
+import markdown2
+from docx import Document
+from bs4 import BeautifulSoup, Tag
+
+
+def get_element_text(element: Tag) -> str:
+    """
+    Return the whitespace-stripped text of an element.
+
+    Objects exposing get_text() are asked for their text with strip=True;
+    anything else is converted with str() and stripped.
+    """
+    if not hasattr(element, "get_text"):
+        return str(element).strip()
+    return element.get_text(strip=True)
+
+
+def process_list_items(list_element: Tag, doc: Document, style_base: str, level=0):
+    """
+    Append the items of an HTML list to the document as styled paragraphs.
+
+    Args:
+        list_element: The <ul> or <ol> element whose direct <li> children are added.
+        doc: Document receiving one paragraph per non-empty list item.
+        style_base: Base paragraph style name ("List Bullet" or "List Number").
+        level: Zero-based nesting depth. Depth 0 uses style_base unchanged,
+            deeper levels use "<style_base> <level + 1>".
+    """
+    # The default python-docx template defines list styles only up to
+    # "<style_base> 3"; deeper nesting reuses the deepest available style
+    # instead of requesting a style that does not exist.
+    max_styled_depth = 3
+
+    # Get direct children li elements only (not nested)
+    for li in list_element.find_all("li", recursive=False):
+        # Get text content, excluding nested lists
+        text_parts = [
+            get_element_text(child)
+            for child in li.children
+            if child.name not in ("ul", "ol")
+        ]
+        # Whitespace-only children yield empty parts; drop them so the
+        # joined text does not contain doubled spaces.
+        text = " ".join(part for part in text_parts if part)
+
+        # Add paragraph with appropriate indentation level
+        if text:
+            depth = min(level + 1, max_styled_depth)
+            style = style_base if depth == 1 else f"{style_base} {depth}"
+            doc.add_paragraph(text, style=style)
+
+        # Process every nested list, in document order
+        for nested in li.find_all(["ul", "ol"], recursive=False):
+            nested_style = "List Bullet" if nested.name == "ul" else "List Number"
+            process_list_items(nested, doc, nested_style, level + 1)
+
+
+def markdown_to_word_file(markdown_source: str, word_file_path: str):
+    """
+    Convert Markdown text to a Word document and save it.
+
+    Args:
+        markdown_source: Markdown text to convert.
+        word_file_path: Path the resulting document is saved to.
+    """
+    markdown_to_word_object(markdown_source).save(word_file_path)
+
+
+def _add_paragraph(doc: Document, element: Tag):
+    # Adds a <p> element as one paragraph; direct <strong>/<em> children
+    # become bold/italic runs, everything else becomes a plain run.
+    paragraph = doc.add_paragraph()
+    for child in element.children:
+        # Tags (e.g. <code>, <a>) are reduced to their text so that a run
+        # always receives a string rather than a Tag object.
+        text = child.get_text() if isinstance(child, Tag) else str(child)
+        run = paragraph.add_run(text)
+        if child.name == "strong":
+            run.bold = True
+        elif child.name == "em":
+            run.italic = True
+
+
+def _add_table(doc: Document, element: Tag):
+    # Adds a <table> element as a "Table Grid" table with a bold header row.
+    rows_data = []
+    for row in element.find_all("tr"):
+        row_data = [cell.get_text(strip=True) for cell in row.find_all(["th", "td"])]
+        if row_data:
+            rows_data.append(row_data)
+
+    if not rows_data:
+        return
+
+    # Size the table after the widest row so that no cell is discarded
+    columns_count = max(len(row_data) for row_data in rows_data)
+    table = doc.add_table(rows=len(rows_data), cols=columns_count)
+    table.style = "Table Grid"
+
+    for row_index, row_data in enumerate(rows_data):
+        cells = table.rows[row_index].cells
+        for column_index, cell_text in enumerate(row_data):
+            cells[column_index].text = cell_text
+
+    # Make the first row bold if it is a header
+    first_row = element.find("tr")
+    if first_row and first_row.find("th"):
+        for cell in table.rows[0].cells:
+            for paragraph in cell.paragraphs:
+                for run in paragraph.runs:
+                    run.bold = True
+
+
+def markdown_to_word_object(markdown_source: str) -> Document:
+    """
+    Convert Markdown text to a Word document object.
+
+    The Markdown is rendered to HTML (with the "tables" and "wiki-tables"
+    extras) and the top-level HTML elements are mapped onto the document:
+    headings h1-h6, paragraphs, bullet and numbered lists, and tables.
+    Other top-level elements are not converted.
+
+    Args:
+        markdown_source: Markdown text to convert.
+
+    Returns:
+        The populated Document.
+    """
+    heading_names = ("h1", "h2", "h3", "h4", "h5", "h6")
+
+    # Converting Markdown to HTML
+    html_content = markdown2.markdown(markdown_source, extras=["tables", "wiki-tables"])
+
+    # Creating a new Word Document
+    doc = Document()
+
+    # Parsing the HTML so that its top-level elements can be walked
+    soup = BeautifulSoup(html_content, "html.parser")
+
+    # Adding content to the Word Document
+    for element in soup:
+        if element.name in heading_names:
+            # The digit in the tag name is the heading level
+            doc.add_heading(element.text, level=int(element.name[1]))
+        elif element.name == "p":
+            _add_paragraph(doc, element)
+        elif element.name == "ul":
+            process_list_items(element, doc, "List Bullet")
+        elif element.name == "ol":
+            process_list_items(element, doc, "List Number")
+        elif element.name == "table":
+            _add_table(doc, element)
+
+    return doc
diff --git a/templateprocessor/postprocessor.py b/templateprocessor/postprocessor.py
diff --git a/tests/Makefile b/tests/Makefile
diff --git a/tests/test_md2docx.py b/tests/test_md2docx.py