Spaces:

aiacademy-kg
/

acceptance-certificate-maker

Sleeping

App Files Files Community

acceptance-certificate-maker / utils /document_merger.py

Simonlob

zip

e6708ef 3 months ago

Raw

History Blame

5.23 kB

	"""Document merger for combining multiple DOCX files into one"""

	import os
	import glob
	from docx import Document
	from docx.enum.text import WD_BREAK
	from typing import List


	class DocumentMerger:
	    """Merge multiple DOCX files into a single document with page breaks.

	    The first file (in sorted path order) is opened as the base document and
	    keeps its content untouched. Every following file is appended after a
	    page break by re-creating its top-level paragraphs and tables in the base
	    document.

	    Only a subset of formatting is carried over for appended content:
	    paragraph style and alignment, table style, and per-run text, bold,
	    italic, underline, font size and font name. Anything else is not copied
	    by this class.
	    """

	    # NOTE: annotations naming ``Document`` are quoted because
	    # ``docx.Document`` is a factory function rather than a class; the hint
	    # is informational and does not need to be evaluated at definition time.

	    def __init__(self):
	        pass

	    def _add_page_break(self, doc: "Document"):
	        """
	        Add a page break at the end of document

	        A new empty paragraph is appended and a page break is placed in a
	        fresh run inside it.

	        Args:
	            doc: DOCX document object
	        """
	        paragraph = doc.add_paragraph()
	        run = paragraph.add_run()
	        run.add_break(WD_BREAK.PAGE)

	    def _clone_paragraph(self, source_para, container):
	        """
	        Re-create a paragraph inside any container offering ``add_paragraph``

	        Shared by document-level and table-cell copying so both paths carry
	        over exactly the same set of properties.

	        Args:
	            source_para: Source paragraph to copy
	            container: Object with an ``add_paragraph(style=...)`` method
	                (used here with a document and with a table cell)

	        Returns:
	            The newly created paragraph
	        """
	        new_para = container.add_paragraph(style=source_para.style)
	        new_para.alignment = source_para.alignment

	        for run in source_para.runs:
	            new_run = new_para.add_run(run.text)
	            # bold/italic/underline are copied as-is, including None
	            new_run.bold = run.bold
	            new_run.italic = run.italic
	            new_run.underline = run.underline

	            # Only set font properties that are explicitly present on the
	            # source run, so unset values stay unset on the copy
	            if run.font.size:
	                new_run.font.size = run.font.size
	            if run.font.name:
	                new_run.font.name = run.font.name

	        return new_para

	    def _copy_paragraph(self, source_para, target_doc: "Document"):
	        """
	        Append a copy of a paragraph to the target document

	        Copies the paragraph style, alignment and, for each run, its text,
	        bold, italic, underline, font size and font name.

	        Args:
	            source_para: Source paragraph to copy
	            target_doc: Target document to copy to
	        """
	        self._clone_paragraph(source_para, target_doc)

	    def _copy_table(self, source_table, target_doc: "Document"):
	        """
	        Append a copy of a table to the target document

	        A new table with the same number of rows and columns is created, the
	        table style is applied when the source has one, and the paragraphs
	        of every cell are re-created with the same properties that
	        ``_copy_paragraph`` carries over.

	        Args:
	            source_table: Source table to copy
	            target_doc: Target document to copy to
	        """
	        rows = len(source_table.rows)
	        cols = len(source_table.columns)

	        new_table = target_doc.add_table(rows=rows, cols=cols)

	        if source_table.style:
	            new_table.style = source_table.style

	        for i, row in enumerate(source_table.rows):
	            # Look the target row's cells up once per row, not once per cell
	            target_cells = new_table.rows[i].cells

	            for j, cell in enumerate(row.cells):
	                new_cell = target_cells[j]

	                # Blank the paragraph that a new cell starts with
	                new_cell.text = ''

	                for para in cell.paragraphs:
	                    self._clone_paragraph(para, new_cell)

	                # Drop the leading auto-created paragraph so the cell ends
	                # up with the same number of paragraphs as the source cell
	                if len(new_cell.paragraphs) > len(cell.paragraphs):
	                    p = new_cell.paragraphs[0]._element
	                    p.getparent().remove(p)

	    def _append_body(self, source_doc, target_doc: "Document"):
	        """
	        Append the top-level paragraphs and tables of one document to another

	        The body's children are visited in document order so that a table
	        located between two paragraphs stays between them in the result.
	        Children that are neither paragraphs nor tables are skipped.

	        Args:
	            source_doc: Document whose body content is read
	            target_doc: Document that receives the copied content
	        """
	        # Imported here so the module's top-level import block is unchanged
	        from docx.oxml.table import CT_Tbl
	        from docx.oxml.text.paragraph import CT_P
	        from docx.table import Table
	        from docx.text.paragraph import Paragraph

	        for child in source_doc.element.body.iterchildren():
	            if isinstance(child, CT_P):
	                self._copy_paragraph(Paragraph(child, source_doc), target_doc)
	            elif isinstance(child, CT_Tbl):
	                self._copy_table(Table(child, source_doc), target_doc)

	    def merge_files(self, docx_files: List[str], output_file: str):
	        """
	        Merge multiple DOCX files into a single document

	        Files are processed in sorted path order. The first one becomes the
	        base document; each remaining one is appended after a page break,
	        with its paragraphs and tables kept in their original order.

	        Args:
	            docx_files: List of paths to DOCX files to merge
	            output_file: Path to output merged document

	        Raises:
	            ValueError: If ``docx_files`` is empty
	        """
	        if not docx_files:
	            raise ValueError("No DOCX files provided to merge")

	        # Sort files to ensure consistent order
	        docx_files = sorted(docx_files)

	        # Start with the first document as base
	        merged_doc = Document(docx_files[0])

	        for docx_file in docx_files[1:]:
	            # Each appended document starts on a new page
	            self._add_page_break(merged_doc)
	            self._append_body(Document(docx_file), merged_doc)

	        merged_doc.save(output_file)

	    def merge_from_folder(self, input_folder: str, output_file: str, pattern: str = '*.docx') -> int:
	        """
	        Merge all DOCX files from a folder into a single document

	        The output file itself is never treated as an input, so writing the
	        result into the input folder and running again does not merge the
	        previous result into the new one.

	        Args:
	            input_folder: Folder containing DOCX files
	            output_file: Path to output merged document
	            pattern: File pattern to match (default: *.docx)

	        Returns:
	            Number of documents merged

	        Raises:
	            ValueError: If no input file matches the pattern
	        """
	        search_pattern = os.path.join(input_folder, pattern)
	        output_path = os.path.normcase(os.path.realpath(output_file))

	        docx_files = [
	            path
	            for path in glob.glob(search_pattern)
	            if os.path.normcase(os.path.realpath(path)) != output_path
	        ]

	        if not docx_files:
	            raise ValueError(f"No DOCX files found in {input_folder} matching pattern {pattern}")

	        self.merge_files(docx_files, output_file)

	        return len(docx_files)