File size: 5,229 Bytes
"""Document merger for combining multiple DOCX files into one"""
import os
import glob
from docx import Document
from docx.enum.text import WD_BREAK
from typing import List
class DocumentMerger:
    """Merge multiple DOCX files into a single document with page breaks.

    The first file (after sorting) is opened as the base document and the
    body content of every other file is re-created at its end. Only the
    properties copied explicitly by the helpers below are carried over:
    paragraph style and alignment, run text, bold/italic/underline, font
    size and font name, plus table style and cell contents.
    """

    def __init__(self):
        """Create a merger; it keeps no state between calls."""

    def _add_page_break(self, doc: "Document"):
        """
        Add a page break at the end of document

        Args:
            doc: DOCX document object
        """
        doc.add_paragraph().add_run().add_break(WD_BREAK.PAGE)

    def _copy_runs(self, source_para, target_para):
        """
        Re-create every run of a paragraph inside another paragraph

        Args:
            source_para: Paragraph whose runs are read
            target_para: Paragraph that receives the copied runs
        """
        for run in source_para.runs:
            new_run = target_para.add_run(run.text)
            new_run.bold = run.bold
            new_run.italic = run.italic
            new_run.underline = run.underline
            # Size and name are only written when the source run sets them,
            # so runs without an explicit value keep inheriting from style.
            if run.font.size:
                new_run.font.size = run.font.size
            if run.font.name:
                new_run.font.name = run.font.name

    def _copy_paragraph(self, source_para, target_doc: "Document"):
        """
        Copy paragraph with its style, alignment and runs to target document

        Args:
            source_para: Source paragraph to copy
            target_doc: Target document to copy to
        """
        new_para = target_doc.add_paragraph(style=source_para.style)
        new_para.alignment = source_para.alignment
        self._copy_runs(source_para, new_para)

    def _copy_table(self, source_table, target_doc: "Document"):
        """
        Copy table with its style and cell contents to target document

        Args:
            source_table: Source table to copy
            target_doc: Target document to copy to
        """
        new_table = target_doc.add_table(
            rows=len(source_table.rows),
            cols=len(source_table.columns),
        )
        if source_table.style:
            new_table.style = source_table.style

        # Walk both tables in lockstep instead of indexing rows[i].cells[j],
        # which rebuilds the row/cell lists on every single lookup.
        for source_row, new_row in zip(source_table.rows, new_table.rows):
            for cell, new_cell in zip(source_row.cells, new_row.cells):
                # Reset the cell to one empty placeholder paragraph.
                new_cell.text = ''
                placeholder = new_cell.paragraphs[0]

                source_paragraphs = cell.paragraphs
                for para in source_paragraphs:
                    new_para = new_cell.add_paragraph(style=para.style)
                    new_para.alignment = para.alignment
                    self._copy_runs(para, new_para)

                # Drop the placeholder only once real content follows it, so
                # the cell is never left without any paragraph.
                if source_paragraphs:
                    element = placeholder._element
                    element.getparent().remove(element)

    def _copy_body(self, source_doc, target_doc: "Document"):
        """
        Copy top-level paragraphs and tables in their original order

        Args:
            source_doc: Document whose body is read
            target_doc: Target document to copy to
        """
        # Imported here so the module's top-level imports stay untouched.
        from docx.oxml.table import CT_Tbl
        from docx.oxml.text.paragraph import CT_P
        from docx.table import Table
        from docx.text.paragraph import Paragraph

        for child in source_doc.element.body.iterchildren():
            if isinstance(child, CT_P):
                self._copy_paragraph(Paragraph(child, source_doc), target_doc)
            elif isinstance(child, CT_Tbl):
                self._copy_table(Table(child, source_doc), target_doc)
            # Any other body child (e.g. section properties) is not copied.

    def merge_files(self, docx_files: List[str], output_file: str):
        """
        Merge multiple DOCX files into a single document

        Files are processed in sorted path order. The first one becomes the
        base document; each following file starts after a page break and its
        paragraphs and tables are appended in the order they appear in it.

        Args:
            docx_files: List of paths to DOCX files to merge
            output_file: Path to output merged document

        Raises:
            ValueError: If docx_files is empty
        """
        if not docx_files:
            raise ValueError("No DOCX files provided to merge")

        # Sort files to ensure consistent order
        first_file, *other_files = sorted(docx_files)
        merged_doc = Document(first_file)

        for docx_file in other_files:
            self._add_page_break(merged_doc)
            self._copy_body(Document(docx_file), merged_doc)

        merged_doc.save(output_file)

    def merge_from_folder(self, input_folder: str, output_file: str, pattern: str = '*.docx') -> int:
        """
        Merge all DOCX files from a folder into a single document

        Word owner/lock files (names starting with "~$") and the output file
        itself are left out of the merge even when they match the pattern.

        Args:
            input_folder: Folder containing DOCX files
            output_file: Path to output merged document
            pattern: File pattern to match (default: *.docx)

        Returns:
            Number of documents merged

        Raises:
            ValueError: If no file to merge is found
        """
        search_pattern = os.path.join(input_folder, pattern)
        output_key = os.path.normcase(os.path.abspath(output_file))

        docx_files = [
            path
            for path in glob.glob(search_pattern)
            # "~$name.docx" is a lock file, not a readable DOCX package.
            if not os.path.basename(path).startswith('~$')
            # A previous run's output must not be merged into itself.
            and os.path.normcase(os.path.abspath(path)) != output_key
        ]
        if not docx_files:
            raise ValueError(f"No DOCX files found in {input_folder} matching pattern {pattern}")

        self.merge_files(docx_files, output_file)
        return len(docx_files)
|