File size: 3,237 Bytes
08fd094
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import logging
import os
import re
from typing import Dict, Any, Tuple

import pdfplumber

"""
text_extraction.py

What it does:
Extracts clean text from PDF documents.

How it works:
It uses `pdfplumber` to open the PDF file, iterates through its pages, and extracts the text.
It employs regular expressions to strip out common headers, footers, and page numbers,
while attempting to preserve the general structure of the text (like newlines).

Key algorithms/formulas:
- Regex for page numbers (optionally wrapped in dashes, e.g. "- 12 -"): r'^\\s*-?\\s*\\d+\\s*-?\\s*$'
- Generic headers/footers: lines starting with "Page <n>", "Confidential" or "Draft" (case-insensitive)

Example input:
- A PDF file path e.g., 'data/sample_documents.pdf'

Example output:
{
    'filename': 'sample_documents.pdf',
    'text': 'Extracted content goes here...',
    'metadata': {'pages_processed': 10, 'file_size_bytes': 102400}
}
"""

# Compiled once at import time because clean_text() runs for every page.
# Matches a line that is only a page number, optionally wrapped in dashes
# (e.g. "12" or "- 12 -").
_PAGE_NUMBER_RE = re.compile(r'^\s*-?\s*\d+\s*-?\s*$')
# Matches a line that starts with a boilerplate marker. The \b after the
# alternation stops ordinary sentences that merely begin with a longer word
# ("Drafting ...", "Confidentiality ...") from being thrown away.
_HEADER_RE = re.compile(r'^(?:Page \d+|Confidential|Draft)\b.*$', re.IGNORECASE)


def clean_text(text: str) -> str:
    """
    Cleans the extracted text by removing page numbers, standard headers, and footers.

    A line is dropped when, after stripping surrounding whitespace, it is either
    a bare page number (optionally wrapped in dashes) or starts with one of the
    markers "Page <n>", "Confidential" or "Draft" as a whole word
    (case-insensitive). All other lines are kept unchanged, including their
    original indentation and any blank lines.

    Args:
        text (str): The raw text from a PDF page.

    Returns:
        str: The cleaned text, or an empty string when ``text`` is empty or None.
    """
    if not text:
        return ""

    kept_lines = []
    for line in text.split('\n'):
        # Match against the stripped form, but keep the original line so the
        # page's indentation survives.
        stripped = line.strip()
        if _PAGE_NUMBER_RE.match(stripped) or _HEADER_RE.match(stripped):
            continue
        kept_lines.append(line)

    return '\n'.join(kept_lines)

def extract_text_from_pdf(pdf_path: str) -> Dict[str, Any]:
    """
    Extracts and cleans the text of every page in a PDF file.

    Extraction is best-effort: if opening or reading the PDF fails, the error
    is logged (with traceback) and whatever text was collected up to that point
    is still returned.

    Args:
        pdf_path (str): The absolute or relative path to the PDF file.

    Returns:
        Dict[str, Any]: A dictionary with the keys
            - "filename": base name of ``pdf_path``
            - "text": cleaned page texts joined by a blank line
            - "metadata": {"pages_processed": number of pages visited,
              "file_size_bytes": size of the file on disk}

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found at: {pdf_path}")

    filename = os.path.basename(pdf_path)
    file_size = os.path.getsize(pdf_path)
    page_texts = []
    page_count = 0

    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_count, page in enumerate(pdf.pages, start=1):
                raw_text = page.extract_text()
                if not raw_text:
                    continue
                cleaned = clean_text(raw_text)
                # A page made up only of headers/page numbers cleans down to
                # nothing; skip it so the joined text has no empty sections.
                if cleaned.strip():
                    page_texts.append(cleaned)
    except Exception:
        # Deliberately broad: this is a best-effort boundary around a
        # third-party parser. Log instead of printing so library callers'
        # stdout stays clean and the traceback is preserved.
        logging.getLogger(__name__).exception(
            "Error extracting text from %s", filename
        )

    return {
        "filename": filename,
        "text": "\n\n".join(page_texts),
        "metadata": {
            "pages_processed": page_count,
            "file_size_bytes": file_size,
        },
    }

if __name__ == "__main__":
    # Manual smoke test: pass a PDF path as the first command-line argument.
    # When no argument is given, nothing is run.
    import sys

    cli_args = sys.argv[1:]
    if cli_args:
        result = extract_text_from_pdf(cli_args[0])
        print(f"Extracted {len(result['text'])} characters from {result['metadata']['pages_processed']} pages.")