acceptance-certificate-maker / utils /receipt_parser.py
Simonlob's picture
zip
e6708ef
Raw
History Blame Contribute Delete
3 kB
"""XML Receipt parser for ESFS documents"""
import glob
import xml.etree.ElementTree as ET
from datetime import datetime
from typing import Any, Dict, List
class ReceiptParser:
    """Parser for extracting receipt data from XML files.

    Reads every ``*.xml`` file directly inside a folder, collects the
    ``<receipt>`` elements found anywhere in each document, and converts
    selected fields into display-ready values (long-form Russian dates and
    an integer price).
    """

    # Month names in Russian (genitive case, as used in "5 марта 2024").
    MONTHS_GENITIVE = {
        1: 'января', 2: 'февраля', 3: 'марта', 4: 'апреля',
        5: 'мая', 6: 'июня', 7: 'июля', 8: 'августа',
        9: 'сентября', 10: 'октября', 11: 'ноября', 12: 'декабря'
    }

    def __init__(self, esfs_folder: str = 'esfs'):
        """
        Initialize receipt parser

        Args:
            esfs_folder: Path to folder containing XML files
        """
        self.esfs_folder = esfs_folder

    def find_xml_files(self) -> List[str]:
        """
        Find all XML files in the ESFS folder

        The folder is treated as a literal path: glob metacharacters in its
        name (``[``, ``*``, ``?``) are escaped so they are not interpreted
        as wildcards. Results are sorted because ``glob.glob`` makes no
        ordering guarantee, which keeps the receipt order reproducible.

        Returns:
            Sorted list of XML file paths (empty if none are found)
        """
        pattern = f"{glob.escape(self.esfs_folder)}/*.xml"
        return sorted(glob.glob(pattern))

    def parse_receipts(self) -> List[ET.Element]:
        """
        Parse all receipts from all XML files in the folder

        Returns:
            List of receipt XML elements, in file order and then document
            order within each file

        Raises:
            xml.etree.ElementTree.ParseError: If a file is not well-formed XML
        """
        all_receipts: List[ET.Element] = []
        for xml_file in self.find_xml_files():
            root = ET.parse(xml_file).getroot()
            # './/receipt' matches <receipt> elements at any depth below the root.
            all_receipts.extend(root.findall('.//receipt'))
        return all_receipts

    def _format_date(self, moment: datetime) -> str:
        """Render a date as '<day> <month name in genitive> <year>'."""
        return f"{moment.day} {self.MONTHS_GENITIVE[moment.month]} {moment.year}"

    def extract_receipt_data(self, receipt: ET.Element) -> Dict[str, Any]:
        """
        Extract data from a single receipt element

        Args:
            receipt: XML element containing receipt data

        Returns:
            Dictionary with keys 'contract_date' (str), 'price' (int) and
            'today_date' (str)

        Raises:
            AttributeError: If a required child element is missing
                (``find`` returns None)
            ValueError: If a date or the price text cannot be parsed
        """
        # Contract date. The literal '+06:00' offset is removed before parsing.
        # NOTE(review): presumably the feed always stamps this field with a
        # +06:00 offset that fromisoformat may reject on date-only values;
        # confirm against real documents.
        contract_date_str = receipt.find('deliveryContractDate').text
        contract_date = datetime.fromisoformat(contract_date_str.replace('+06:00', ''))

        # Price of the first <good> under <goods>; int() drops any fractional part.
        price_str = receipt.find('.//goods/good/price').text
        price_int = int(float(price_str))

        # "Today's" date is the receipt's creation date, not the current time.
        created_date = datetime.fromisoformat(receipt.find('createdDate').text)

        return {
            'contract_date': self._format_date(contract_date),
            'price': price_int,
            'today_date': self._format_date(created_date)
        }

    def get_all_receipt_data(self) -> List[Dict[str, Any]]:
        """
        Get data for all receipts in the folder

        Returns:
            List of dictionaries containing receipt data, one per receipt
        """
        return [self.extract_receipt_data(receipt) for receipt in self.parse_receipts()]