"""
text_extraction.py

What it does:
Extracts clean text from PDF documents.

How it works:
It uses `pdfplumber` to open the PDF file, iterates through its pages, and
extracts the text of each page. Regular expressions are applied line by line
to drop bare page numbers and common header/footer lines; every other line is
kept untouched, so the newline structure of the page is preserved.

Key algorithms/formulas:
- Regex for page numbers (optionally wrapped in dashes, e.g. "- 12 -"):
  r'^\\s*-?\\s*\\d+\\s*-?\\s*$'
- Regex for generic headers/footers (case-insensitive, whole-word match):
  r'^(?:Page \\d+|Confidential|Draft)\\b.*$'

Example input:
- A PDF file path e.g., 'data/sample_documents.pdf'

Example output:
{
    'filename': 'sample_documents.pdf',
    'text': 'Extracted content goes here...',
    'metadata': {'pages_processed': 10, 'file_size_bytes': 102400}
}
"""

import logging
import os
import re
import sys
from typing import Any, Dict, Tuple

import pdfplumber

logger = logging.getLogger(__name__)

# Compiled once at import time: clean_text() runs once per page, so compiling
# inside the function would repeat the work for every page of every document.
_PAGE_NUMBER_RE = re.compile(r'^\s*-?\s*\d+\s*-?\s*$')

# The trailing \b makes the keywords match as whole words only, so ordinary
# sentences such as "Drafting the proposal..." or "Confidentiality clauses..."
# are not mistaken for a header/footer and dropped.
_HEADER_RE = re.compile(r'^(?:Page \d+|Confidential|Draft)\b.*$', re.IGNORECASE)


def clean_text(text: str) -> str:
    """
    Cleans the extracted text by removing page numbers, standard headers, and footers.

    A line is dropped when, after stripping surrounding whitespace, it is
    either a bare page number (optionally wrapped in dashes) or starts with
    one of the header/footer markers "Page <n>", "Confidential" or "Draft"
    (case-insensitive, whole word). All other lines are kept exactly as they
    were, including their original leading/trailing whitespace.

    Args:
        text (str): The raw text from a PDF page.

    Returns:
        str: The cleaned text, or an empty string if `text` is empty or None.
    """
    if not text:
        return ""

    kept_lines = []
    for line in text.split('\n'):
        # Match against the stripped form, but keep the original line so the
        # indentation of real content is not altered.
        candidate = line.strip()
        if _PAGE_NUMBER_RE.match(candidate) or _HEADER_RE.match(candidate):
            continue
        kept_lines.append(line)

    return '\n'.join(kept_lines)


def extract_text_from_pdf(pdf_path: str) -> Dict[str, Any]:
    """
    Extracts text from a given PDF file path.

    Extraction is best-effort: if opening the PDF or reading a page fails,
    the error is logged (with traceback) and whatever text was collected up
    to that point is returned instead of raising.

    Args:
        pdf_path (str): The absolute or relative path to the PDF file.

    Returns:
        Dict[str, Any]: A dictionary with the keys
            - "filename": base name of `pdf_path`,
            - "text": cleaned text of all pages, joined by blank lines,
            - "metadata": {"pages_processed": number of pages whose text
              extraction completed, "file_size_bytes": size of the file}.

    Raises:
        FileNotFoundError: If `pdf_path` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found at: {pdf_path}")

    filename = os.path.basename(pdf_path)
    file_size = os.path.getsize(pdf_path)
    page_texts = []
    pages_processed = 0

    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                raw_text = page.extract_text()
                # Count the page only after extraction succeeded, so a page
                # that raises is not reported as processed.
                pages_processed += 1
                # Pages that yield no text are counted but contribute nothing.
                if raw_text:
                    page_texts.append(clean_text(raw_text))
    except Exception:
        # Deliberate best-effort boundary: a damaged or unreadable PDF must
        # not crash the caller; report it and return the partial result.
        logger.exception("Error extracting text from %s", filename)

    return {
        "filename": filename,
        "text": "\n\n".join(page_texts),
        "metadata": {
            "pages_processed": pages_processed,
            "file_size_bytes": file_size,
        },
    }


if __name__ == "__main__":
    # Example usage for testing
    if len(sys.argv) < 2:
        print(
            f"Usage: python {os.path.basename(sys.argv[0])} <path-to-pdf>",
            file=sys.stderr,
        )
        sys.exit(2)

    test_path = sys.argv[1]
    result = extract_text_from_pdf(test_path)
    print(f"Extracted {len(result['text'])} characters from {result['metadata']['pages_processed']} pages.")