zip
Browse files- .DS_Store +0 -0
- .gitignore +7 -0
- README.md +2 -2
- app.py +164 -0
- requirements.txt +2 -0
- utils/.DS_Store +0 -0
- utils/README.md +177 -0
- utils/__init__.py +8 -0
- utils/document_merger.py +158 -0
- utils/document_processor.py +147 -0
- utils/number_converter.py +121 -0
- utils/receipt_parser.py +94 -0
.DS_Store
ADDED
|
Binary file (10.2 kB). View file
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
venv/
|
| 2 |
+
env/
|
| 3 |
+
.env
|
| 4 |
+
*.zip
|
| 5 |
+
draft.zip
|
| 6 |
+
draft.docx
|
| 7 |
+
*.DS_Store
|
README.md
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
---
|
| 2 |
title: Acceptance Certificate Maker
|
| 3 |
emoji: ⚡
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 6.10.0
|
| 8 |
app_file: app.py
|
|
|
|
| 1 |
---
|
| 2 |
title: Acceptance Certificate Maker
|
| 3 |
emoji: ⚡
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: blue
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 6.10.0
|
| 8 |
app_file: app.py
|
app.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
ACT Maker - Gradio Web Interface for Hugging Face Spaces
|
| 4 |
+
|
| 5 |
+
Web interface for generating ACT documents from ESFS XML receipts.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import shutil
|
| 10 |
+
import tempfile
|
| 11 |
+
import gradio as gr
|
| 12 |
+
from utils import NumberToWords, DocumentProcessor, ReceiptParser, DocumentMerger
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def process_receipt(xml_file, user_name):
    """
    Process an uploaded XML receipt and generate an ACT document.

    Args:
        xml_file: Uploaded XML file from Gradio. Either a filesystem path
            string (the UI component is declared with ``type="filepath"``)
            or an object whose ``name`` attribute holds that path.
        user_name: User name string (e.g., "ИП Пупкин Василий Алибабаевич")

    Returns:
        Tuple ``(docx_path, status_message)``. ``docx_path`` is ``None``
        when validation or processing fails; the message then says why.
    """
    if xml_file is None:
        return None, "⚠️ Please upload an XML receipt file"

    if not user_name or not user_name.strip():
        return None, "⚠️ Please enter your name"

    user_name = user_name.strip()

    # A plain path string has no ``name`` attribute, so fall back to the
    # value itself; file-like wrappers keep working through ``.name``.
    source_path = getattr(xml_file, 'name', xml_file)

    try:
        # Everything intermediate lives in a directory that is removed on exit
        with tempfile.TemporaryDirectory() as temp_dir:
            # Create esfs folder for XML files
            esfs_folder = os.path.join(temp_dir, 'esfs')
            os.makedirs(esfs_folder)

            # Create temp docs folder
            temp_docs_folder = os.path.join(temp_dir, '.temp_docs')
            os.makedirs(temp_docs_folder)

            # Copy uploaded XML file to esfs folder
            xml_path = os.path.join(esfs_folder, os.path.basename(source_path))
            shutil.copy(source_path, xml_path)

            # Initialize components
            receipt_parser = ReceiptParser(esfs_folder)
            number_converter = NumberToWords()
            doc_processor = DocumentProcessor('draft.docx', user_name)
            doc_merger = DocumentMerger()

            # Parse receipts
            receipts_data = receipt_parser.get_all_receipt_data()

            if not receipts_data:
                return None, "⚠️ No receipt data found in XML file"

            # Fill one document per receipt
            temp_files = []
            for idx, receipt_data in enumerate(receipts_data, 1):
                full_data = {
                    'contract_date': receipt_data['contract_date'],
                    'price': receipt_data['price'],
                    'price_words': number_converter.convert(receipt_data['price']),
                    'today_date': receipt_data['today_date'],
                }

                doc = doc_processor.process_document(full_data)

                # Zero-padded index keeps lexical order equal to receipt order
                temp_filename = os.path.join(temp_docs_folder, f"act_{idx:03d}.docx")
                doc.save(temp_filename)
                temp_files.append(temp_filename)

            # Merge all documents
            output_path = os.path.join(temp_dir, 'result.docx')
            doc_merger.merge_files(temp_files, output_path)

            # The result must outlive temp_dir, so reserve a persistent file.
            # The handle is closed right away (no descriptor leak, and the
            # copy below does not write onto a file that is still open).
            with tempfile.NamedTemporaryFile(
                delete=False,
                suffix='.docx',
                prefix='act_'
            ) as final_output:
                final_path = final_output.name
            shutil.copy(output_path, final_path)

            success_msg = f"✅ Successfully processed {len(receipts_data)} receipt(s)"
            return final_path, success_msg

    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising
        error_msg = f"❌ Error processing: {str(e)}"
        return None, error_msg
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
# Create Gradio interface
|
| 104 |
+
# NOTE(review): block nesting below is reconstructed from a flattened diff
# view in which indentation was lost — confirm against the real app.py.
with gr.Blocks(title="ACT Maker - Acceptance Certificate Generator") as demo:
    # Page header
    gr.Markdown(
        """
        # 📄 ACT Maker
        ### Automated Acceptance Certificate Generation from ESFS XML Receipts

        Upload an XML receipt file and enter your name to generate an acceptance certificate.
        """
    )

    with gr.Row():
        # Input side: receipt upload, user name, and the trigger button
        with gr.Column():
            xml_input = gr.File(label="📁 ESFS XML Receipt", file_types=[".xml"], type="filepath")
            user_name_input = gr.Textbox(label="👤 User Name", placeholder="e.g., ИП Иванов Иван Иванович", lines=1)
            submit_btn = gr.Button("🚀 Generate Certificate", variant="primary", size="lg")

        # Output side: status text and the downloadable result
        with gr.Column():
            status_output = gr.Textbox(label="📊 Status", lines=2, interactive=False)
            file_output = gr.File(label="📥 Download Document", interactive=False)

    # Usage instructions
    gr.Markdown(
        """
        ---
        ### 📖 How to Use:
        1. **Upload XML file** - ESFS receipt in XML format
        2. **Enter your name** - Full name (e.g., "ИП Иванов Иван Иванович")
        3. **Click "Generate Certificate"** - Document will be created automatically
        4. **Download result** - Ready acceptance certificate in DOCX format

        ℹ️ If the XML file contains multiple receipts, they will be processed and merged into one document.
        """
    )

    # process_receipt returns (file path, status message), in that order
    submit_btn.click(fn=process_receipt, inputs=[xml_input, user_name_input], outputs=[file_output, status_output])

# Start the server only when run as a script
if __name__ == "__main__":
    demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
python-docx==1.1.2
|
| 2 |
+
gradio==4.44.0
|
utils/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
utils/README.md
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ACT Maker
|
| 2 |
+
|
| 3 |
+
Automated document generation tool for creating ACT (acceptance certificate) documents from ESFS XML receipts.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- 📄 **Automated Document Generation**: Process multiple receipts at once
|
| 8 |
+
- 🔄 **Template-based**: Uses customizable DOCX templates
|
| 9 |
+
- 🌍 **Russian Language Support**: Converts numbers to Russian words with proper grammar
|
| 10 |
+
- 📊 **Batch Processing**: Handles multiple XML files and receipts
|
| 11 |
+
- ⚙️ **Configurable**: Command-line arguments for flexibility
|
| 12 |
+
|
| 13 |
+
## Project Structure
|
| 14 |
+
|
| 15 |
+
```
|
| 16 |
+
act_maker/
|
| 17 |
+
├── main.py # Main entry point
|
| 18 |
+
├── merge_documents.py # Standalone document merger utility
|
| 19 |
+
├── requirements.txt # Python dependencies
|
| 20 |
+
├── README.md # This file
|
| 21 |
+
├── .user_name # User configuration (your name)
|
| 22 |
+
├── draft.docx # DOCX template file
|
| 23 |
+
├── result.docx # Final merged output (auto-generated)
|
| 24 |
+
├── esfs/ # Input folder for XML receipts
|
| 25 |
+
│ └── *.xml # ESFS receipt XML files
|
| 26 |
+
├── .temp_docs/ # Temporary buffer (auto-created, auto-deleted)
|
| 27 |
+
│ └── act_*.docx # Individual documents (only with --keep-temp)
|
| 28 |
+
└── utils/ # Utility modules
|
| 29 |
+
├── __init__.py
|
| 30 |
+
├── number_converter.py # Number to Russian words converter
|
| 31 |
+
├── receipt_parser.py # XML receipt parser
|
| 32 |
+
├── document_processor.py # DOCX template processor
|
| 33 |
+
└── document_merger.py # Document merging logic
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
## Installation
|
| 37 |
+
|
| 38 |
+
1. **Clone or download this project**
|
| 39 |
+
|
| 40 |
+
2. **Create a virtual environment** (recommended):
|
| 41 |
+
```bash
|
| 42 |
+
python3 -m venv venv
|
| 43 |
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
3. **Install dependencies**:
|
| 47 |
+
```bash
|
| 48 |
+
pip install -r requirements.txt
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
## Configuration
|
| 52 |
+
|
| 53 |
+
1. **Create `.user_name` file** with your name:
|
| 54 |
+
```
|
| 55 |
+
ИП Пупкин Василий Алибабаевич
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
2. **Prepare your template** (`draft.docx`) with the following tags:
|
| 59 |
+
- `<my-name-caps>` - Your name in UPPERCASE (bold)
|
| 60 |
+
- `<my-name>` - Your name in original case with last word on new line (bold)
|
| 61 |
+
- `<contract-date>` - Contract date (bold)
|
| 62 |
+
- `<price-som>` - Price in numbers (bold)
|
| 63 |
+
- `<price-leters>` - Price in words (normal)
|
| 64 |
+
- `<today-date>` - Today's date (bold)
|
| 65 |
+
|
| 66 |
+
3. **Place XML receipts** in the `esfs/` folder
|
| 67 |
+
|
| 68 |
+
## Usage
|
| 69 |
+
|
| 70 |
+
### Basic Usage
|
| 71 |
+
|
| 72 |
+
Process all receipts and create a merged document:
|
| 73 |
+
|
| 74 |
+
```bash
|
| 75 |
+
python main.py
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
This will:
|
| 79 |
+
1. Parse all XML files from `esfs/` folder
|
| 80 |
+
2. Generate individual documents in `.temp_docs/` folder
|
| 81 |
+
3. Merge all documents into `result.docx`
|
| 82 |
+
4. Clean up temporary files
|
| 83 |
+
|
| 84 |
+
### Advanced Usage
|
| 85 |
+
|
| 86 |
+
Keep temporary files for inspection:
|
| 87 |
+
|
| 88 |
+
```bash
|
| 89 |
+
python main.py --keep-temp
|
| 90 |
+
```
|
| 91 |
+
|
| 92 |
+
Customize paths and output file:
|
| 93 |
+
|
| 94 |
+
```bash
|
| 95 |
+
python main.py --template my_template.docx \
|
| 96 |
+
--esfs-folder input_xmls \
|
| 97 |
+
--output my_result.docx \
|
| 98 |
+
--user-config .my_name
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
### Command-Line Arguments
|
| 102 |
+
|
| 103 |
+
| Argument | Default | Description |
|
| 104 |
+
|----------|---------|-------------|
|
| 105 |
+
| `--template` | `draft.docx` | Path to DOCX template file |
|
| 106 |
+
| `--esfs-folder` | `esfs` | Folder containing XML receipt files |
|
| 107 |
+
| `--output` | `result.docx` | Output merged document file |
|
| 108 |
+
| `--user-config` | `.user_name` | Config file containing user name |
|
| 109 |
+
| `--keep-temp` | `false` | Keep temporary files after merging |
|
| 110 |
+
|
| 111 |
+
### Standalone Document Merger
|
| 112 |
+
|
| 113 |
+
You can also merge existing DOCX files using the standalone merger:
|
| 114 |
+
|
| 115 |
+
```bash
|
| 116 |
+
python merge_documents.py --input-folder my_docs --output merged.docx
|
| 117 |
+
```
|
| 118 |
+
|
| 119 |
+
## How It Works
|
| 120 |
+
|
| 121 |
+
1. **Parse XML Receipts**: Scans the ESFS folder for XML files and extracts receipt data
|
| 122 |
+
2. **Extract Data**: Gets contract dates, prices, and creation dates from each receipt
|
| 123 |
+
3. **Convert Numbers**: Converts prices to Russian words (e.g., 87000 → "восемьдесят семь тысяч")
|
| 124 |
+
4. **Fill Template**: Replaces tags in the DOCX template with actual data
|
| 125 |
+
5. **Generate Individual Documents**: Saves each filled document to the `.temp_docs/` buffer
|
| 126 |
+
6. **Merge Documents**: Combines all documents into a single `result.docx` file with page breaks
|
| 127 |
+
7. **Cleanup**: Removes temporary files (unless `--keep-temp` is specified)
|
| 128 |
+
|
| 129 |
+
## Template Tags
|
| 130 |
+
|
| 131 |
+
All tags are replaced with proper formatting (bold where specified):
|
| 132 |
+
|
| 133 |
+
- **`<my-name-caps>`**: Full name in UPPERCASE and bold
|
| 134 |
+
- **`<my-name>`**: Full name with last word on new line, bold
|
| 135 |
+
- **`<contract-date>`**: Date in format "2 ноября 2024", bold
|
| 136 |
+
- **`<price-som>`**: Integer price (e.g., "87000"), bold
|
| 137 |
+
- **`<price-leters>`**: Price in words (e.g., "восемьдесят семь тысяч"), regular font
|
| 138 |
+
- **`<today-date>`**: Date in format "1 апреля 2026", bold
|
| 139 |
+
|
| 140 |
+
## Example Output
|
| 141 |
+
|
| 142 |
+
Input receipt with price `87000` and date `2024-11-02` will generate:
|
| 143 |
+
|
| 144 |
+
- Contract date: **2 ноября 2024**
|
| 145 |
+
- Price: **87000** сом (восемьдесят семь тысяч)
|
| 146 |
+
- Name: **ИП ИВАНОВ ИВАН ИВАНОВИЧ** / **ИП Иванов Иван**
|
| 147 |
+
**Иванович**
|
| 148 |
+
|
| 149 |
+
## Development
|
| 150 |
+
|
| 151 |
+
### Running Tests
|
| 152 |
+
|
| 153 |
+
Test the number converter:
|
| 154 |
+
|
| 155 |
+
```bash
|
| 156 |
+
python -c "from utils import NumberToWords; c = NumberToWords(); print(c.convert(87000))"
|
| 157 |
+
```
|
| 158 |
+
|
| 159 |
+
### Project Components
|
| 160 |
+
|
| 161 |
+
- **NumberToWords**: Converts integers to Russian words with proper grammar
|
| 162 |
+
- **ReceiptParser**: Extracts data from ESFS XML receipts
|
| 163 |
+
- **DocumentProcessor**: Fills DOCX templates while preserving formatting
|
| 164 |
+
- **DocumentMerger**: Merges multiple DOCX files into a single document with page breaks
|
| 165 |
+
|
| 166 |
+
## Requirements
|
| 167 |
+
|
| 168 |
+
- Python 3.12+
|
| 169 |
+
- python-docx 1.1.2
|
| 170 |
+
|
| 171 |
+
## License
|
| 172 |
+
|
| 173 |
+
This is a utility tool for internal use.
|
| 174 |
+
|
| 175 |
+
## Support
|
| 176 |
+
|
| 177 |
+
For issues or questions, please consult the docstrings in the code, or modify the code to suit your needs.
|
utils/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Utility modules for ACT document generation"""
|
| 2 |
+
|
| 3 |
+
from .number_converter import NumberToWords
|
| 4 |
+
from .document_processor import DocumentProcessor
|
| 5 |
+
from .receipt_parser import ReceiptParser
|
| 6 |
+
from .document_merger import DocumentMerger
|
| 7 |
+
|
| 8 |
+
__all__ = ['NumberToWords', 'DocumentProcessor', 'ReceiptParser', 'DocumentMerger']
|
utils/document_merger.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Document merger for combining multiple DOCX files into one"""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import glob
|
| 5 |
+
from docx import Document
|
| 6 |
+
from docx.enum.text import WD_BREAK
|
| 7 |
+
from typing import List
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class DocumentMerger:
    """Combine several DOCX files into one document, separated by page breaks"""

    def __init__(self):
        """Nothing to configure; the merger keeps no state."""

    def _add_page_break(self, doc: Document):
        """
        Append a new paragraph containing a single page break

        Args:
            doc: DOCX document object to extend
        """
        doc.add_paragraph().add_run().add_break(WD_BREAK.PAGE)

    def _clone_runs(self, source_para, new_para):
        """
        Re-create every run of source_para inside new_para

        Only text, bold, italic, underline, font size and font name are
        carried over; size and name are copied only when they are set.

        Args:
            source_para: Paragraph whose runs are read
            new_para: Paragraph that receives the copies
        """
        for src_run in source_para.runs:
            clone = new_para.add_run(src_run.text)
            clone.bold = src_run.bold
            clone.italic = src_run.italic
            clone.underline = src_run.underline

            size = src_run.font.size
            if size:
                clone.font.size = size
            face = src_run.font.name
            if face:
                clone.font.name = face

    def _copy_paragraph(self, source_para, target_doc: Document):
        """
        Append a copy of a paragraph to the target document

        Args:
            source_para: Source paragraph to copy
            target_doc: Target document to copy to
        """
        new_para = target_doc.add_paragraph(style=source_para.style)
        new_para.alignment = source_para.alignment
        self._clone_runs(source_para, new_para)

    def _copy_table(self, source_table, target_doc: Document):
        """
        Append a copy of a table to the target document

        Args:
            source_table: Source table to copy
            target_doc: Target document to copy to
        """
        row_count = len(source_table.rows)
        col_count = len(source_table.columns)
        new_table = target_doc.add_table(rows=row_count, cols=col_count)

        if source_table.style:
            new_table.style = source_table.style

        for src_row, dst_row in zip(source_table.rows, new_table.rows):
            for col, cell in enumerate(src_row.cells):
                new_cell = dst_row.cells[col]

                # Reset the cell to a single empty paragraph before filling it
                new_cell.text = ''

                for para in cell.paragraphs:
                    new_para = new_cell.add_paragraph(style=para.style)
                    new_para.alignment = para.alignment
                    self._clone_runs(para, new_para)

                # The placeholder paragraph left by the reset is surplus once
                # the copies are in; drop it so paragraph counts line up
                if len(new_cell.paragraphs) > len(cell.paragraphs):
                    placeholder = new_cell.paragraphs[0]._element
                    placeholder.getparent().remove(placeholder)

    def merge_files(self, docx_files: List[str], output_file: str):
        """
        Merge the given DOCX files into one document

        Files are processed in sorted path order. The first file becomes the
        base document. For every later file a page break is added, then all
        of its paragraphs, then all of its tables (tables therefore follow
        that file's paragraphs regardless of where they sat originally).

        Args:
            docx_files: List of paths to DOCX files to merge
            output_file: Path to output merged document

        Raises:
            ValueError: If docx_files is empty
        """
        if not docx_files:
            raise ValueError("No DOCX files provided to merge")

        base_path, *remaining_paths = sorted(docx_files)
        merged_doc = Document(base_path)

        for path in remaining_paths:
            self._add_page_break(merged_doc)
            sub_doc = Document(path)

            for paragraph in sub_doc.paragraphs:
                self._copy_paragraph(paragraph, merged_doc)

            for table in sub_doc.tables:
                self._copy_table(table, merged_doc)

        merged_doc.save(output_file)

    def merge_from_folder(self, input_folder: str, output_file: str, pattern: str = '*.docx') -> int:
        """
        Merge every file in a folder that matches a glob pattern

        Args:
            input_folder: Folder containing DOCX files
            output_file: Path to output merged document
            pattern: File pattern to match (default: *.docx)

        Returns:
            Number of documents merged

        Raises:
            ValueError: If no file in input_folder matches pattern
        """
        matches = glob.glob(os.path.join(input_folder, pattern))

        if not matches:
            raise ValueError(f"No DOCX files found in {input_folder} matching pattern {pattern}")

        self.merge_files(matches, output_file)
        return len(matches)
|
utils/document_processor.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Document processor for filling DOCX templates with data"""
|
| 2 |
+
|
| 3 |
+
from docx import Document
|
| 4 |
+
from typing import Dict
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class DocumentProcessor:
    """
    Process DOCX templates and fill them with data

    Placeholder tags are looked up inside individual runs, so a tag whose
    characters are spread over several runs is not replaced.
    """

    # Placeholder tags recognised in the template
    _TAGS = (
        '<my-name-caps>',
        '<my-name>',
        '<contract-date>',
        '<price-som>',
        '<price-leters>',
        '<today-date>',
    )

    def __init__(self, template_path: str, user_name: str):
        """
        Initialize document processor

        Args:
            template_path: Path to DOCX template file
            user_name: User name to fill in the document
        """
        self.template_path = template_path
        self.user_name_original = user_name
        self.user_name_caps = user_name.upper()

    def _split_run_with_tag(self, paragraph, run, tag: str, replacement: str, make_bold: bool = True):
        """
        Split a run at the first occurrence of a tag into three runs

        The original run keeps the text before the tag, a new run carries the
        replacement, and another new run carries the text after the tag. Font
        size and name (when set) and the original bold state are re-applied so
        the surrounding text keeps its look.

        Args:
            paragraph: DOCX paragraph object
            run: DOCX run object
            tag: Tag to find and replace
            replacement: Text to replace the tag with
            make_bold: Whether to make the replacement text bold

        Returns:
            The run holding the text after the tag, or None when the tag is
            not present in the run (nothing is changed in that case)
        """
        if tag not in run.text:
            return None

        before_text, _, after_text = run.text.partition(tag)

        # Save original formatting
        font_size = run.font.size
        font_name = run.font.name
        original_bold = run.bold

        def apply_format(target, bold):
            target.bold = bold
            if font_size:
                target.font.size = font_size
            if font_name:
                target.font.name = font_name

        run_index = paragraph._element.index(run._element)

        # Update current run to "before" text
        run.text = before_text
        apply_format(run, original_bold)

        # add_run appends at the end of the paragraph; inserting the same
        # element at an index moves it to sit right after the original run
        new_run = paragraph.add_run(replacement)
        apply_format(new_run, make_bold)
        paragraph._element.insert(run_index + 1, new_run._element)

        # "After" text keeps the original formatting
        after_run = paragraph.add_run(after_text)
        apply_format(after_run, original_bold)
        paragraph._element.insert(run_index + 2, after_run._element)

        return after_run

    def _replacement_for(self, tag: str, data: Dict[str, object]):
        """
        Look up the replacement text and bold flag for a tag

        Data keys are read only for the tag that is asked for.

        Args:
            tag: One of the recognised placeholder tags
            data: Dictionary containing data to fill (contract_date, price, price_words, today_date)

        Returns:
            Tuple (replacement_text, make_bold)
        """
        if tag == '<my-name-caps>':
            return self.user_name_caps, True
        if tag == '<my-name>':
            # Last word of the name goes on a new line
            name_parts = self.user_name_original.rsplit(' ', 1)
            if len(name_parts) == 2:
                return name_parts[0] + '\n' + name_parts[1], True
            return self.user_name_original, True
        if tag == '<contract-date>':
            return data['contract_date'], True
        if tag == '<price-som>':
            return str(data['price']), True
        if tag == '<price-leters>':
            # Price in words is the only replacement that is NOT bold
            return data['price_words'], False
        return data['today_date'], True

    def _process_paragraph(self, paragraph, data: Dict[str, object]):
        """
        Process a single paragraph and replace all tags

        Every tag occurrence inside a run is replaced, including several
        different tags, or the same tag repeated, within one run.

        Args:
            paragraph: DOCX paragraph object
            data: Dictionary containing data to fill (contract_date, price, price_words, today_date)
        """
        if not any(tag in paragraph.text for tag in self._TAGS):
            return

        # Snapshot the runs because splitting adds runs to the paragraph
        for run in list(paragraph.runs):
            current = run
            while current is not None:
                text = current.text
                hits = [(text.find(tag), tag) for tag in self._TAGS if tag in text]
                if not hits:
                    break

                # Replace the leftmost tag, then keep scanning the remainder.
                # Replacement text lands in its own run and is never rescanned.
                _, tag = min(hits)
                replacement, make_bold = self._replacement_for(tag, data)
                current = self._split_run_with_tag(paragraph, current, tag, replacement, make_bold=make_bold)

    def process_document(self, data: Dict[str, object]) -> Document:
        """
        Process template document with given data

        The template is loaded fresh on every call. Body paragraphs and the
        paragraphs of every cell in the document's tables are processed.

        Args:
            data: Dictionary containing:
                - contract_date: Contract date string
                - price: Price as integer
                - price_words: Price in words
                - today_date: Today's date string

        Returns:
            Processed DOCX document
        """
        doc = Document(self.template_path)

        for para in doc.paragraphs:
            self._process_paragraph(para, data)

        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    for para in cell.paragraphs:
                        self._process_paragraph(para, data)

        return doc
|
utils/number_converter.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Number to Russian words converter"""
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class NumberToWords:
    """Convert integers to Russian words in the nominative case.

    Negative numbers are prefixed with 'минус'. Magnitudes up to the
    trillions group are supported (absolute values below 10**15); larger
    values raise ``ValueError``.
    """

    # Grammatical forms of every power-of-1000 group name, ordered as
    # (after 1, after 2-4, after 0 / 5-9 / 11-14). Index 0 is thousands.
    _SCALE_FORMS = (
        ('тысяча', 'тысячи', 'тысяч'),
        ('миллион', 'миллиона', 'миллионов'),
        ('миллиард', 'миллиарда', 'миллиардов'),
        ('триллион', 'триллиона', 'триллионов'),
    )

    def __init__(self):
        self.ones = ['', 'один', 'два', 'три', 'четыре', 'пять', 'шесть', 'семь', 'восемь', 'девять']
        self.tens = ['', '', 'двадцать', 'тридцать', 'сорок', 'пятьдесят', 'шестьдесят', 'семьдесят', 'восемьдесят', 'девяносто']
        self.teens = ['десять', 'одиннадцать', 'двенадцать', 'тринадцать', 'четырнадцать', 'пятнадцать',
                      'шестнадцать', 'семнадцать', 'восемнадцать', 'девятнадцать']
        self.hundreds = ['', 'сто', 'двести', 'триста', 'четыреста', 'пятьсот', 'шестьсот', 'семьсот', 'восемьсот', 'девятьсот']
        # Unit words agreeing with the feminine noun "тысяча" (одна, две).
        self.thousands = ['', 'одна', 'две', 'три', 'четыре', 'пять', 'шесть', 'семь', 'восемь', 'девять']

    @staticmethod
    def _plural_form(num: int, forms) -> str:
        """Pick the form of a counted noun that agrees with ``num``.

        Args:
            num: The count preceding the noun.
            forms: Three strings ordered as (one, few, many).

        Returns:
            The matching element of ``forms``.
        """
        # 11-14 always take the "many" form, regardless of the last digit.
        if num % 100 in (11, 12, 13, 14):
            return forms[2]
        last_digit = num % 10
        if last_digit == 1:
            return forms[0]
        if last_digit in (2, 3, 4):
            return forms[1]
        return forms[2]

    def _convert_group(self, num: int, unit_words) -> str:
        """Spell out a three-digit group (0-999) using ``unit_words`` for 1-9.

        Returns an empty string for 0.
        """
        if num == 0:
            return ''

        words = []

        hundred, remainder = divmod(num, 100)
        if hundred > 0:
            words.append(self.hundreds[hundred])

        if 10 <= remainder <= 19:
            # 10-19 are single words in Russian.
            words.append(self.teens[remainder - 10])
        else:
            ten, one = divmod(remainder, 10)
            if ten > 0:
                words.append(self.tens[ten])
            if one > 0:
                words.append(unit_words[one])

        return ' '.join(words)

    def _convert_hundreds(self, num: int) -> str:
        """Convert number 0-999 to words (masculine unit forms)."""
        return self._convert_group(num, self.ones)

    def _convert_thousands(self, num: int) -> str:
        """Convert thousands part (0-999) to words with feminine unit forms."""
        return self._convert_group(num, self.thousands)

    def _thousand_word(self, num: int) -> str:
        """Get the grammatical form of the word 'thousand' for ``num`` thousands."""
        return self._plural_form(num, self._SCALE_FORMS[0])

    def convert(self, num: int) -> str:
        """
        Convert integer to Russian words in nominative case

        Args:
            num: Integer number to convert

        Returns:
            String representation of the number in Russian

        Raises:
            ValueError: If the absolute value needs a group name beyond
                trillions (i.e. it is 10**15 or larger).
        """
        if num == 0:
            return 'ноль'

        if num < 0:
            return 'минус ' + self.convert(-num)

        # Split into three-digit groups, least significant first.
        groups = []
        rest = num
        while rest > 0:
            rest, group = divmod(rest, 1000)
            groups.append(group)

        if len(groups) - 1 > len(self._SCALE_FORMS):
            raise ValueError(f"Number is too large to convert: {num}")

        words = []
        for power in range(len(groups) - 1, -1, -1):
            group = groups[power]
            if group == 0:
                # Empty groups contribute neither digits nor a group name.
                continue
            if power == 0:
                words.append(self._convert_hundreds(group))
            elif power == 1:
                # "тысяча" is feminine, so 1 and 2 need feminine forms.
                words.append(self._convert_thousands(group))
                words.append(self._thousand_word(group))
            else:
                words.append(self._convert_hundreds(group))
                words.append(self._plural_form(group, self._SCALE_FORMS[power - 1]))

        return ' '.join(words)
|
utils/receipt_parser.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""XML Receipt parser for ESFS documents"""
|
| 2 |
+
|
| 3 |
+
import xml.etree.ElementTree as ET
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
from typing import List, Dict
|
| 6 |
+
import glob
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class ReceiptParser:
    """Parser for extracting receipt data from XML files"""

    # Month names in Russian (genitive case for dates)
    MONTHS_GENITIVE = {
        1: 'января', 2: 'февраля', 3: 'марта', 4: 'апреля',
        5: 'мая', 6: 'июня', 7: 'июля', 8: 'августа',
        9: 'сентября', 10: 'октября', 11: 'ноября', 12: 'декабря'
    }

    def __init__(self, esfs_folder: str = 'esfs'):
        """
        Initialize receipt parser

        Args:
            esfs_folder: Path to folder containing XML files
        """
        self.esfs_folder = esfs_folder

    def find_xml_files(self) -> List[str]:
        """
        Find all XML files in the ESFS folder

        Returns:
            List of XML file paths, sorted so the order is the same on
            every platform (glob itself returns files in arbitrary order)
        """
        pattern = f"{self.esfs_folder}/*.xml"
        return sorted(glob.glob(pattern))

    def parse_receipts(self) -> List[ET.Element]:
        """
        Parse all receipts from all XML files in the folder

        Returns:
            List of receipt XML elements, in file order
        """
        all_receipts = []
        for xml_file in self.find_xml_files():
            root = ET.parse(xml_file).getroot()
            all_receipts.extend(root.findall('.//receipt'))
        return all_receipts

    @staticmethod
    def _required_text(receipt: ET.Element, path: str) -> str:
        """Return the stripped text of the element at ``path``.

        Raises:
            ValueError: If the element is absent or has no text.
        """
        node = receipt.find(path)
        if node is None or node.text is None or not node.text.strip():
            raise ValueError(f"Receipt is missing required element: {path}")
        return node.text.strip()

    @staticmethod
    def _parse_iso_date(text: str) -> datetime:
        """Parse an ISO 8601 date or date-time string.

        Only the calendar date is used by callers, so when the full string
        is rejected (for example a 'Z' suffix or a date-only value carrying
        a UTC offset on older Python versions) the leading ``YYYY-MM-DD``
        part is parsed instead.

        Raises:
            ValueError: If neither the full string nor its date part parses.
        """
        try:
            return datetime.fromisoformat(text)
        except ValueError:
            return datetime.fromisoformat(text[:10])

    def _format_date(self, text: str) -> str:
        """Render an ISO date string as '<day> <month in genitive> <year>'."""
        parsed = self._parse_iso_date(text)
        return f"{parsed.day} {self.MONTHS_GENITIVE[parsed.month]} {parsed.year}"

    def extract_receipt_data(self, receipt: ET.Element) -> Dict[str, object]:
        """
        Extract data from a single receipt element

        Args:
            receipt: XML element containing receipt data

        Returns:
            Dictionary with keys 'contract_date' (formatted string),
            'price' (int) and 'today_date' (formatted string)

        Raises:
            ValueError: If a required element is missing or empty, or a
                date or price value cannot be parsed
        """
        # Contract date
        contract_date_formatted = self._format_date(
            self._required_text(receipt, 'deliveryContractDate')
        )

        # Price: the fractional part is dropped, not rounded.
        price_int = int(float(self._required_text(receipt, './/goods/good/price')))

        # Today's date (creation date)
        today_date_formatted = self._format_date(
            self._required_text(receipt, 'createdDate')
        )

        return {
            'contract_date': contract_date_formatted,
            'price': price_int,
            'today_date': today_date_formatted
        }

    def get_all_receipt_data(self) -> List[Dict[str, object]]:
        """
        Get data for all receipts in the folder

        Returns:
            List of dictionaries containing receipt data
        """
        return [self.extract_receipt_data(receipt) for receipt in self.parse_receipts()]
|