"""Document merger for combining multiple DOCX files into one""" import os import glob from docx import Document from docx.enum.text import WD_BREAK from typing import List class DocumentMerger: """Merge multiple DOCX files into a single document with page breaks""" def __init__(self): pass def _add_page_break(self, doc: Document): """ Add a page break at the end of document Args: doc: DOCX document object """ paragraph = doc.add_paragraph() run = paragraph.add_run() run.add_break(WD_BREAK.PAGE) def _copy_paragraph(self, source_para, target_doc: Document): """ Copy paragraph with all formatting to target document Args: source_para: Source paragraph to copy target_doc: Target document to copy to """ # Create new paragraph with same style new_para = target_doc.add_paragraph(style=source_para.style) new_para.alignment = source_para.alignment # Copy all runs with formatting for run in source_para.runs: new_run = new_para.add_run(run.text) new_run.bold = run.bold new_run.italic = run.italic new_run.underline = run.underline # Copy font properties if run.font.size: new_run.font.size = run.font.size if run.font.name: new_run.font.name = run.font.name def _copy_table(self, source_table, target_doc: Document): """ Copy table with all formatting to target document Args: source_table: Source table to copy target_doc: Target document to copy to """ rows = len(source_table.rows) cols = len(source_table.columns) # Create new table new_table = target_doc.add_table(rows=rows, cols=cols) # Copy table style if source_table.style: new_table.style = source_table.style # Copy cell contents for i, row in enumerate(source_table.rows): for j, cell in enumerate(row.cells): new_cell = new_table.rows[i].cells[j] # Remove default paragraph new_cell.text = '' # Copy each paragraph in the cell for para in cell.paragraphs: new_para = new_cell.add_paragraph(style=para.style) new_para.alignment = para.alignment # Copy runs with formatting for run in para.runs: new_run = new_para.add_run(run.text) new_run.bold = run.bold new_run.italic = run.italic new_run.underline = run.underline if run.font.size: new_run.font.size = run.font.size if run.font.name: new_run.font.name = run.font.name # Remove the first empty paragraph that was auto-created if len(new_cell.paragraphs) > len(cell.paragraphs): p = new_cell.paragraphs[0]._element p.getparent().remove(p) def merge_files(self, docx_files: List[str], output_file: str): """ Merge multiple DOCX files into a single document Args: docx_files: List of paths to DOCX files to merge output_file: Path to output merged document """ if not docx_files: raise ValueError("No DOCX files provided to merge") # Sort files to ensure consistent order docx_files = sorted(docx_files) # Start with the first document as base merged_doc = Document(docx_files[0]) # Process remaining documents for docx_file in docx_files[1:]: # Add page break before next document self._add_page_break(merged_doc) # Load the document to merge sub_doc = Document(docx_file) # Copy all paragraphs for paragraph in sub_doc.paragraphs: self._copy_paragraph(paragraph, merged_doc) # Copy all tables for table in sub_doc.tables: self._copy_table(table, merged_doc) # Save merged document merged_doc.save(output_file) def merge_from_folder(self, input_folder: str, output_file: str, pattern: str = '*.docx') -> int: """ Merge all DOCX files from a folder into a single document Args: input_folder: Folder containing DOCX files output_file: Path to output merged document pattern: File pattern to match (default: *.docx) Returns: Number of documents merged """ # Find all matching DOCX files search_pattern = os.path.join(input_folder, pattern) docx_files = glob.glob(search_pattern) if not docx_files: raise ValueError(f"No DOCX files found in {input_folder} matching pattern {pattern}") # Merge files self.merge_files(docx_files, output_file) return len(docx_files)