| import os |
| import re |
| from typing import Dict, Any, Tuple |
| import pdfplumber |
|
|
| """ |
| text_extraction.py |
| |
| What it does: |
| Extracts clean text from PDF documents. |
| |
| How it works: |
| It uses `pdfplumber` to open the PDF file, iterates through its pages, and extracts the text. |
| It employs regular expressions to strip out common headers, footers, and page numbers, |
| while attempting to preserve the general structure of the text (like newlines). |
| |
| Key algorithms/formulas: |
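| - Regex for page numbers (a line holding only a number, optionally wrapped in dashes): r'^\\s*-?\\s*\\d+\\s*-?\\s*$' |
| - Regex for generic headers/footers (case-insensitive, anchored at line start): r'^(Page \\d+|Confidential|Draft)' |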
| |
| Example input: |
| - A PDF file path e.g., 'data/sample_documents.pdf' |
| |
| Example output: |
| { |
| 'filename': 'sample_documents.pdf', |
| 'text': 'Extracted content goes here...', |
| 'metadata': {'pages_processed': 10, 'file_size_bytes': 102400} |
| } |
| """ |
|
|
# A line that holds nothing but a page number, optionally wrapped in dashes
# (e.g. "12", "-12-", "- 12 -").
_PAGE_NUMBER_PATTERN = re.compile(r'^\s*-?\s*\d+\s*-?\s*$')

# Boilerplate header/footer lines. The trailing \b makes the keywords match
# as whole words only, so body lines such as "Drafting the plan ..." or
# "Confidentiality is ..." are no longer discarded.
_HEADER_PATTERN = re.compile(
    r'^(?:Page \d+|Confidential|Draft)\b', re.IGNORECASE
)


def clean_text(text: str) -> str:
    """
    Remove page-number lines and boilerplate header/footer lines from page text.

    A line is dropped when, after stripping surrounding whitespace, it is
    either a bare number (optionally wrapped in dashes) or starts with
    ``Page <n>``, ``Confidential`` or ``Draft`` as a whole word
    (case-insensitive). Every other line is kept exactly as it was, including
    its original leading/trailing whitespace, and the newline structure of the
    remaining lines is preserved.

    Note that the page-number rule is a heuristic: any line consisting solely
    of a number is removed, even if it was real content.

    Args:
        text (str): The raw text from a PDF page.

    Returns:
        str: The cleaned text, or an empty string when ``text`` is empty/None.
    """
    if not text:
        return ""

    def _is_boilerplate(candidate: str) -> bool:
        # Patterns are tested against the stripped line so indentation in the
        # extracted text does not hide a header or page number.
        return bool(
            _PAGE_NUMBER_PATTERN.match(candidate)
            or _HEADER_PATTERN.match(candidate)
        )

    kept_lines = [
        line for line in text.split('\n')
        if not _is_boilerplate(line.strip())
    ]
    return '\n'.join(kept_lines)
|
|
def extract_text_from_pdf(pdf_path: str) -> Dict[str, Any]:
    """
    Extract cleaned text from every page of a PDF file.

    Each page's text is passed through ``clean_text`` and the non-empty
    results are joined with a blank line between pages. Extraction is
    best-effort: if opening the PDF or reading a page raises, the error is
    reported on stderr and whatever text was collected up to that point is
    returned rather than propagating the exception.

    Args:
        pdf_path (str): The absolute or relative path to the PDF file.

    Returns:
        Dict[str, Any]: A dictionary with keys ``filename`` (base name of the
        path), ``text`` (the combined cleaned text) and ``metadata``
        (``pages_processed`` and ``file_size_bytes``).

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    # Imported locally so the function does not rely on a module-level import.
    import sys

    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found at: {pdf_path}")

    filename = os.path.basename(pdf_path)
    file_size = os.path.getsize(pdf_path)
    page_texts = []
    page_count = 0

    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Counted as soon as the page is visited, so a page whose
                # extraction fails is still included in the total.
                page_count += 1
                raw_text = page.extract_text()
                if not raw_text:
                    continue
                cleaned = clean_text(raw_text)
                # A page that held only headers/page numbers cleans down to
                # nothing; skip it like a page with no text at all so the
                # joined output does not accumulate empty separators.
                if cleaned.strip():
                    page_texts.append(cleaned)
    except Exception as e:
        # Deliberate best-effort: keep the partial result, but send the
        # diagnostic to stderr so it cannot be mistaken for program output.
        print(f"Error extracting text from {filename}: {e}", file=sys.stderr)

    combined_text = "\n\n".join(page_texts)

    return {
        "filename": filename,
        "text": combined_text,
        "metadata": {
            "pages_processed": page_count,
            "file_size_bytes": file_size,
        },
    }
|
|
if __name__ == "__main__":
    # Ad-hoc smoke test: run the extractor on the PDF path given as the first
    # command-line argument; with no argument, do nothing.
    import sys

    cli_args = sys.argv[1:]
    if cli_args:
        result = extract_text_from_pdf(cli_args[0])
        print(f"Extracted {len(result['text'])} characters from {result['metadata']['pages_processed']} pages.")
|
|