acceptance-certificate-maker / utils /document_merger.py
Simonlob's picture
zip
e6708ef
Raw
History Blame
5.23 kB
"""Document merger for combining multiple DOCX files into one"""
import os
import glob
from docx import Document
from docx.enum.text import WD_BREAK
from typing import List
class DocumentMerger:
    """Merge multiple DOCX files into a single document with page breaks.

    The first file (after sorting) is used as the base document. Every
    further file is appended after a page break by re-creating its
    top-level paragraphs and tables in the base document.

    Only the following is carried over from appended documents: paragraph
    style and alignment, run text, bold/italic/underline, font size and
    font name, and the table style. Other content of the appended
    documents is not copied.
    """

    def __init__(self):
        pass

    def _add_page_break(self, doc: "Document"):
        """
        Add a page break at the end of document

        Args:
            doc: DOCX document object
        """
        doc.add_paragraph().add_run().add_break(WD_BREAK.PAGE)

    def _copy_runs(self, source_para, target_para):
        """
        Append a copy of every run of ``source_para`` to ``target_para``

        Args:
            source_para: Paragraph whose runs are read
            target_para: Paragraph that receives the new runs
        """
        for run in source_para.runs:
            new_run = target_para.add_run(run.text)
            # bold/italic/underline are copied as-is, including None.
            new_run.bold = run.bold
            new_run.italic = run.italic
            new_run.underline = run.underline
            # Size and name are only set when the source run defines them,
            # so unset values are not written explicitly on the new run.
            if run.font.size:
                new_run.font.size = run.font.size
            if run.font.name:
                new_run.font.name = run.font.name

    def _append_paragraph(self, source_para, container):
        """
        Re-create ``source_para`` at the end of ``container``

        Args:
            source_para: Source paragraph to copy
            container: Object offering ``add_paragraph(style=...)``; used
                here with both documents and table cells
        """
        new_para = container.add_paragraph(style=source_para.style)
        new_para.alignment = source_para.alignment
        self._copy_runs(source_para, new_para)

    def _copy_paragraph(self, source_para, target_doc: "Document"):
        """
        Copy paragraph with its basic formatting to target document

        Args:
            source_para: Source paragraph to copy
            target_doc: Target document to copy to
        """
        self._append_paragraph(source_para, target_doc)

    def _copy_table(self, source_table, target_doc: "Document"):
        """
        Copy table text and basic formatting to target document

        The new table is a plain ``rows x cols`` grid appended to the end
        of ``target_doc``.
        NOTE(review): cell merges of the source table are not reproduced.

        Args:
            source_table: Source table to copy
            target_doc: Target document to copy to
        """
        new_table = target_doc.add_table(
            rows=len(source_table.rows),
            cols=len(source_table.columns),
        )

        if source_table.style:
            new_table.style = source_table.style

        for source_row, new_row in zip(source_table.rows, new_table.rows):
            # Fetch each row's cell list once instead of once per cell.
            for cell, new_cell in zip(source_row.cells, new_row.cells):
                # A new cell starts with one empty paragraph; remember it
                # so it can be dropped once the real content is in place.
                placeholder = new_cell.paragraphs[0]

                for para in cell.paragraphs:
                    self._append_paragraph(para, new_cell)

                # Only drop the placeholder when something was added, so
                # the cell never ends up without any paragraph.
                if len(new_cell.paragraphs) > 1:
                    stale = placeholder._element
                    stale.getparent().remove(stale)

    def _append_body(self, source_doc, target_doc):
        """
        Append the top-level paragraphs and tables of ``source_doc`` to
        ``target_doc`` in the order they appear in the source body

        Args:
            source_doc: Document whose body content is read
            target_doc: Document that receives the copies
        """
        # Map each underlying XML element to its wrapper so the body can
        # be walked in document order; copying all paragraphs first and
        # all tables afterwards would move every table to the end.
        paragraphs = {para._element: para for para in source_doc.paragraphs}
        tables = {table._element: table for table in source_doc.tables}

        for child in source_doc.element.body.iterchildren():
            if child in paragraphs:
                self._copy_paragraph(paragraphs[child], target_doc)
            elif child in tables:
                self._copy_table(tables[child], target_doc)
            # Any other kind of body child is skipped.

    def merge_files(self, docx_files: List[str], output_file: str):
        """
        Merge multiple DOCX files into a single document

        Files are merged in sorted path order. The first one becomes the
        base document; each following one is appended after a page break.

        Args:
            docx_files: List of paths to DOCX files to merge
            output_file: Path to output merged document

        Raises:
            ValueError: If ``docx_files`` is empty
        """
        if not docx_files:
            raise ValueError("No DOCX files provided to merge")

        # Sort files to ensure consistent order
        ordered_files = sorted(docx_files)

        # Start with the first document as base
        merged_doc = Document(ordered_files[0])

        for docx_file in ordered_files[1:]:
            self._add_page_break(merged_doc)
            self._append_body(Document(docx_file), merged_doc)

        merged_doc.save(output_file)

    def merge_from_folder(self, input_folder: str, output_file: str, pattern: str = '*.docx') -> int:
        """
        Merge all DOCX files from a folder into a single document

        Two kinds of matches are ignored: files whose name starts with
        ``~$`` (the lock files Word leaves next to an open document) and
        the output file itself, so that re-running the merge with the
        output stored in the input folder does not merge the previous
        result into the new one.

        Args:
            input_folder: Folder containing DOCX files
            output_file: Path to output merged document
            pattern: File pattern to match (default: *.docx)

        Returns:
            Number of documents merged

        Raises:
            ValueError: If no usable file matches ``pattern``
        """
        search_pattern = os.path.join(input_folder, pattern)
        output_path = os.path.normcase(os.path.abspath(output_file))

        docx_files = [
            path
            for path in glob.glob(search_pattern)
            if not os.path.basename(path).startswith('~$')
            and os.path.normcase(os.path.abspath(path)) != output_path
        ]

        if not docx_files:
            raise ValueError(f"No DOCX files found in {input_folder} matching pattern {pattern}")

        self.merge_files(docx_files, output_file)
        return len(docx_files)