| """XML Receipt parser for ESFS documents""" |
|
|
import glob
import re
import xml.etree.ElementTree as ET
from datetime import datetime
from typing import Any, Dict, List
|
|
|
|
class ReceiptParser:
    """Parser for extracting receipt data from XML files.

    The parser looks for ``*.xml`` files directly inside one folder, collects
    every ``receipt`` element found in them and turns each one into a small
    dictionary holding two human-readable dates and an integer price.
    """

    # Russian month names in the genitive case, keyed by month number (1-12).
    # Used to render dates such as "5 марта 2024".
    MONTHS_GENITIVE = {
        1: 'января', 2: 'февраля', 3: 'марта', 4: 'апреля',
        5: 'мая', 6: 'июня', 7: 'июля', 8: 'августа',
        9: 'сентября', 10: 'октября', 11: 'ноября', 12: 'декабря'
    }

    # Trailing UTC designator: either "Z" or a "+HH:MM" / "-HH:MM" offset.
    _UTC_OFFSET_RE = re.compile(r'(?:Z|[+-]\d{2}:\d{2})$')

    def __init__(self, esfs_folder: str = 'esfs'):
        """
        Initialize receipt parser

        Args:
            esfs_folder: Path to folder containing XML files
        """
        self.esfs_folder = esfs_folder

    def find_xml_files(self) -> List[str]:
        """
        Find all XML files in the ESFS folder

        The folder name is escaped so that glob metacharacters in it
        (``[``, ``*``, ``?``) are matched literally, and the result is sorted
        so the processing order does not depend on the filesystem.

        Returns:
            Sorted list of XML file paths (non-recursive, ``*.xml`` only)
        """
        pattern = f"{glob.escape(str(self.esfs_folder))}/*.xml"
        return sorted(glob.glob(pattern))

    def parse_receipts(self) -> List[ET.Element]:
        """
        Parse all receipts from all XML files in the folder

        Every descendant element named ``receipt`` is collected from each
        file, in the order returned by :meth:`find_xml_files`.

        Returns:
            List of receipt XML elements

        Raises:
            xml.etree.ElementTree.ParseError: If a file is not well-formed XML
        """
        all_receipts: List[ET.Element] = []
        for xml_file in self.find_xml_files():
            root = ET.parse(xml_file).getroot()
            all_receipts.extend(root.findall('.//receipt'))
        return all_receipts

    @staticmethod
    def _required_text(receipt: ET.Element, path: str) -> str:
        """Return the stripped text of the element at *path* or raise ValueError."""
        node = receipt.find(path)
        text = node.text if node is not None else None
        if text is None or not text.strip():
            raise ValueError(f"Receipt is missing a value for '{path}'")
        return text.strip()

    def _format_date(self, raw_value: str) -> str:
        """Render an ISO date/datetime string as '<day> <month in genitive> <year>'."""
        # Only the calendar fields as written are needed, so the UTC offset is
        # dropped rather than converted. Removing it also lets date-only
        # values such as "2024-03-05+06:00" be parsed by fromisoformat().
        without_offset = self._UTC_OFFSET_RE.sub('', raw_value.strip())
        parsed = datetime.fromisoformat(without_offset)
        return f"{parsed.day} {self.MONTHS_GENITIVE[parsed.month]} {parsed.year}"

    def extract_receipt_data(self, receipt: ET.Element) -> Dict[str, Any]:
        """
        Extract data from a single receipt element

        Reads ``deliveryContractDate``, the first ``goods/good/price`` and
        ``createdDate`` from the receipt.

        Args:
            receipt: XML element containing receipt data

        Returns:
            Dictionary with extracted data:
                ``contract_date``: formatted ``deliveryContractDate``
                ``price``: price as an int, fractional part truncated
                ``today_date``: formatted ``createdDate``

        Raises:
            ValueError: If a required element is missing or empty, or if a
                date or price value cannot be parsed
        """
        contract_date_formatted = self._format_date(
            self._required_text(receipt, 'deliveryContractDate')
        )

        # NOTE(review): only the first <good> is consulted; receipts with
        # several goods ignore the remaining prices - confirm this is intended.
        price_int = int(float(self._required_text(receipt, './/goods/good/price')))

        today_date_formatted = self._format_date(
            self._required_text(receipt, 'createdDate')
        )

        return {
            'contract_date': contract_date_formatted,
            'price': price_int,
            'today_date': today_date_formatted
        }

    def get_all_receipt_data(self) -> List[Dict[str, Any]]:
        """
        Get data for all receipts in the folder

        Returns:
            List of dictionaries containing receipt data, one per receipt
        """
        return [self.extract_receipt_data(receipt) for receipt in self.parse_receipts()]
|
|