File size: 5,229 Bytes
e6708ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
"""Document merger for combining multiple DOCX files into one"""

import os
import glob
from docx import Document
from docx.enum.text import WD_BREAK
from typing import List


class DocumentMerger:
    """Merge multiple DOCX files into a single document with page breaks.

    The first file (in sorted path order) is opened as the base document and
    is appended to in place. Every following file is added after a page break
    by re-creating its body-level paragraphs and tables in the base document,
    in the order in which they appear in the source body.

    Only a subset of formatting is carried over: paragraph style and
    alignment, table style, and per-run bold / italic / underline / font
    size / font name. Run content other than text is not copied.

    Document parameters are python-docx document objects as returned by
    ``Document(...)``. They are annotated with the quoted name because the
    imported ``Document`` is a factory function rather than a class.
    """

    # Prefix of the lock files Microsoft Word keeps next to an open document.
    # They match ``*.docx`` but are not DOCX packages, so they are skipped.
    _LOCK_FILE_PREFIX = '~$'

    def __init__(self):
        """Create a merger; no configuration is required."""
        pass

    def _add_page_break(self, doc: "Document"):
        """
        Add a page break at the end of document

        Args:
            doc: DOCX document object
        """
        paragraph = doc.add_paragraph()
        run = paragraph.add_run()
        run.add_break(WD_BREAK.PAGE)

    @staticmethod
    def _copy_runs(source_para, target_para):
        """
        Append a copy of every run of one paragraph to another paragraph

        Copies the run text plus bold, italic, underline, font size and font
        name. Other run properties are not transferred.

        Args:
            source_para: Paragraph whose runs are read
            target_para: Paragraph that receives the new runs
        """
        for run in source_para.runs:
            new_run = target_para.add_run(run.text)
            new_run.bold = run.bold
            new_run.italic = run.italic
            new_run.underline = run.underline

            # Size and name are only written when set on the source run, so
            # an unset value keeps inheriting from the style in the target.
            font = run.font
            if font.size:
                new_run.font.size = font.size
            if font.name:
                new_run.font.name = font.name

    def _copy_paragraph(self, source_para, target_doc: "Document"):
        """
        Copy a paragraph with its basic formatting to the target document

        The new paragraph is appended at the end of ``target_doc`` with the
        source paragraph's style and alignment; runs are copied by
        ``_copy_runs``.

        Args:
            source_para: Source paragraph to copy
            target_doc: Target document to copy to
        """
        # NOTE(review): the style object belongs to the source document;
        # presumably it is resolved by style id in the target - confirm that
        # the merged files share their style definitions.
        new_para = target_doc.add_paragraph(style=source_para.style)
        new_para.alignment = source_para.alignment
        self._copy_runs(source_para, new_para)

    def _copy_table(self, source_table, target_doc: "Document"):
        """
        Copy a table with its basic formatting to the target document

        A table with the same number of rows and columns is appended to
        ``target_doc``; the table style and, per cell, every paragraph
        (style, alignment, runs) are copied.

        Args:
            source_table: Source table to copy
            target_doc: Target document to copy to
        """
        new_table = target_doc.add_table(
            rows=len(source_table.rows),
            cols=len(source_table.columns),
        )

        if source_table.style:
            new_table.style = source_table.style

        # Rows and cells are paired positionally; the cell list of each row
        # is built once per row instead of once per cell.
        for source_row, new_row in zip(source_table.rows, new_table.rows):
            for cell, new_cell in zip(source_row.cells, new_row.cells):
                existing = new_cell.paragraphs

                for index, para in enumerate(cell.paragraphs):
                    if index == 0 and existing:
                        # Reuse the paragraph the new cell already contains
                        # instead of appending one and deleting the original.
                        new_para = existing[0]
                        new_para.style = para.style
                    else:
                        new_para = new_cell.add_paragraph(style=para.style)

                    new_para.alignment = para.alignment
                    self._copy_runs(para, new_para)

    def _append_body(self, source_doc: "Document", target_doc: "Document"):
        """
        Append the paragraphs and tables of one document to another

        The body-level paragraphs and tables of ``source_doc`` are copied in
        the order in which they occur in the source body, so a table stays
        between the paragraphs that surround it. Other body children are
        skipped.

        Args:
            source_doc: Document whose content is read
            target_doc: Document that receives the copies
        """
        # Index the proxy objects by their XML element so that walking the
        # body children can tell paragraphs and tables apart.
        paragraphs = {para._element: para for para in source_doc.paragraphs}
        tables = {table._element: table for table in source_doc.tables}

        for child in source_doc.element.body.iterchildren():
            if child in paragraphs:
                self._copy_paragraph(paragraphs[child], target_doc)
            elif child in tables:
                self._copy_table(tables[child], target_doc)

    def merge_files(self, docx_files: List[str], output_file: str):
        """
        Merge multiple DOCX files into a single document

        The files are merged in sorted path order, regardless of the order in
        which they are passed. The first one becomes the base document; each
        further one is appended after a page break.

        Args:
            docx_files: List of paths to DOCX files to merge
            output_file: Path to output merged document

        Raises:
            ValueError: If ``docx_files`` is empty
        """
        if not docx_files:
            raise ValueError("No DOCX files provided to merge")

        # Sort files to ensure consistent order
        ordered_files = sorted(docx_files)

        # The first document is the base that everything else is appended to
        merged_doc = Document(ordered_files[0])

        for docx_file in ordered_files[1:]:
            self._add_page_break(merged_doc)
            self._append_body(Document(docx_file), merged_doc)

        merged_doc.save(output_file)

    def merge_from_folder(self, input_folder: str, output_file: str, pattern: str = '*.docx') -> int:
        """
        Merge all DOCX files from a folder into a single document

        Files whose name starts with ``~$`` and the output file itself are
        left out, so that writing the result into the input folder does not
        feed an earlier result back into the merge.

        Args:
            input_folder: Folder containing DOCX files
            output_file: Path to output merged document
            pattern: File pattern to match (default: *.docx)

        Returns:
            Number of documents merged

        Raises:
            ValueError: If no file is left to merge
        """
        search_pattern = os.path.join(input_folder, pattern)

        # Only a real path can collide with one of the input files
        output_path = None
        if isinstance(output_file, (str, os.PathLike)):
            output_path = os.path.normcase(os.path.abspath(output_file))

        docx_files = [
            path for path in glob.glob(search_pattern)
            if not os.path.basename(path).startswith(self._LOCK_FILE_PREFIX)
            and os.path.normcase(os.path.abspath(path)) != output_path
        ]

        if not docx_files:
            raise ValueError(f"No DOCX files found in {input_folder} matching pattern {pattern}")

        self.merge_files(docx_files, output_file)

        return len(docx_files)