"""Document merger for combining multiple DOCX files into one."""
|
|
import glob
import os
from typing import Iterator, List, Tuple

from docx import Document
from docx.enum.text import WD_BREAK
|
|
|
|
class DocumentMerger:
    """Merge multiple DOCX files into a single document with page breaks.

    The first file (after sorting the paths) is opened as the base document
    and every further file is appended to it after a page break. Appended
    content is re-created from the source's top-level paragraphs and tables.
    Only the paragraph style and alignment, the run text, and the run's
    bold / italic / underline / font size / font name are copied.
    """

    def __init__(self):
        """Create a merger; the instance holds no state."""

    def _add_page_break(self, doc: "Document"):
        """
        Add a page break at the end of document

        Args:
            doc: DOCX document object (as returned by ``docx.Document()``)
        """
        paragraph = doc.add_paragraph()
        paragraph.add_run().add_break(WD_BREAK.PAGE)

    @staticmethod
    def _copy_runs(source_para, target_para):
        """
        Re-create every run of ``source_para`` at the end of ``target_para``

        Args:
            source_para: Paragraph whose runs are read
            target_para: Paragraph that receives the new runs
        """
        for run in source_para.runs:
            new_run = target_para.add_run(run.text)
            new_run.bold = run.bold
            new_run.italic = run.italic
            new_run.underline = run.underline

            # Only set size/name when the source run defines them, so the
            # new run otherwise keeps whatever its style provides.
            if run.font.size:
                new_run.font.size = run.font.size
            if run.font.name:
                new_run.font.name = run.font.name

    def _copy_paragraph(self, source_para, target_doc: "Document"):
        """
        Copy paragraph (style, alignment and runs) to target document

        Args:
            source_para: Source paragraph to copy
            target_doc: Target document to copy to
        """
        new_para = target_doc.add_paragraph(style=source_para.style)
        new_para.alignment = source_para.alignment
        self._copy_runs(source_para, new_para)

    def _copy_table(self, source_table, target_doc: "Document"):
        """
        Copy table (style and cell paragraphs) to target document

        A new table with the same number of rows and columns is appended and
        each cell's paragraphs are re-created in the matching new cell.

        Args:
            source_table: Source table to copy
            target_doc: Target document to copy to
        """
        new_table = target_doc.add_table(
            rows=len(source_table.rows),
            cols=len(source_table.columns),
        )

        if source_table.style:
            new_table.style = source_table.style

        for i, row in enumerate(source_table.rows):
            for j, cell in enumerate(row.cells):
                new_cell = new_table.rows[i].cells[j]

                # Start from a clean cell; this leaves one empty placeholder
                # paragraph that is dropped again once real content exists.
                new_cell.text = ''
                placeholder = new_cell.paragraphs[0]

                source_paragraphs = cell.paragraphs
                for para in source_paragraphs:
                    new_para = new_cell.add_paragraph(style=para.style)
                    new_para.alignment = para.alignment
                    self._copy_runs(para, new_para)

                # Remove the placeholder only when something replaced it, so
                # the cell is never left without any paragraph.
                if source_paragraphs:
                    element = placeholder._element
                    element.getparent().remove(element)

    @staticmethod
    def _iter_block_items(doc: "Document") -> Iterator[Tuple[str, object]]:
        """
        Yield the top-level paragraphs and tables of a document in body order

        ``doc.paragraphs`` and ``doc.tables`` are two separate lists and lose
        the relative position of text and tables, so the body's children are
        walked instead and matched back to those objects by element identity.
        Body children that are neither a listed paragraph nor a listed table
        are skipped.

        Args:
            doc: DOCX document object to read

        Yields:
            ``("paragraph", paragraph)`` or ``("table", table)`` tuples
        """
        # The paragraph/table objects stay referenced by this dict, which
        # keeps their underlying elements alive and the id() keys valid.
        by_element = {id(p._element): ("paragraph", p) for p in doc.paragraphs}
        by_element.update(
            (id(t._element), ("table", t)) for t in doc.tables
        )

        for child in doc.element.body:
            entry = by_element.get(id(child))
            if entry is not None:
                yield entry

    def merge_files(self, docx_files: List[str], output_file: str):
        """
        Merge multiple DOCX files into a single document

        The paths are sorted, the first one is opened as the base document,
        and each remaining file is appended after a page break with its
        paragraphs and tables kept in their original order.

        Args:
            docx_files: List of paths to DOCX files to merge
            output_file: Path to output merged document

        Raises:
            ValueError: If ``docx_files`` is empty
        """
        if not docx_files:
            raise ValueError("No DOCX files provided to merge")

        first_file, *remaining_files = sorted(docx_files)
        merged_doc = Document(first_file)

        for docx_file in remaining_files:
            self._add_page_break(merged_doc)
            sub_doc = Document(docx_file)

            for kind, item in self._iter_block_items(sub_doc):
                if kind == "table":
                    self._copy_table(item, merged_doc)
                else:
                    self._copy_paragraph(item, merged_doc)

        merged_doc.save(output_file)

    def merge_from_folder(self, input_folder: str, output_file: str, pattern: str = '*.docx') -> int:
        """
        Merge all DOCX files from a folder into a single document

        Word lock files (names starting with ``~$``) and the output file
        itself are left out, so an open document or a previous run's result
        in the same folder is not merged in.

        Args:
            input_folder: Folder containing DOCX files
            output_file: Path to output merged document
            pattern: File pattern to match (default: *.docx)

        Returns:
            Number of documents merged

        Raises:
            ValueError: If no mergeable file matches ``pattern``
        """
        search_pattern = os.path.join(input_folder, pattern)
        output_path = os.path.normcase(os.path.abspath(output_file))

        docx_files = [
            path
            for path in glob.glob(search_pattern)
            if not os.path.basename(path).startswith("~$")
            and os.path.normcase(os.path.abspath(path)) != output_path
        ]

        if not docx_files:
            raise ValueError(f"No DOCX files found in {input_folder} matching pattern {pattern}")

        self.merge_files(docx_files, output_file)
        return len(docx_files)
|
|