Spaces:

ashish1265659565
/

pharmaspine-backend

Sleeping

App Files Files Community

pharmaspine-backend / src /text_extraction.py

ashish1265659565

Upload folder using huggingface_hub

08fd094 verified 3 days ago

Raw

History Blame Contribute Delete

3.24 kB

	import os
	import re
	from typing import Dict, Any, Tuple
	import pdfplumber

	"""
	text_extraction.py

	What it does:
	Extracts clean text from PDF documents.

	How it works:
	It uses `pdfplumber` to open the PDF file, iterates through its pages, and extracts the text.
	It employs regular expressions to strip out common headers, footers, and page numbers,
	while attempting to preserve the general structure of the text (like newlines).

	Key algorithms/formulas:
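	- Regex for page numbers: r'^\\s*-?\\s*\\d+\\s*-?\\s*$' (a line holding only a number, optionally wrapped in dashes, e.g. "12" or "- 12 -")
	- Regex for generic headers/footers: r'^(Page \\d+|Confidential|Draft).*$' (case-insensitive)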

	Example input:
	- A PDF file path e.g., 'data/sample_documents.pdf'

	Example output:
	{
	'filename': 'sample_documents.pdf',
	'text': 'Extracted content goes here...',
	'metadata': {'pages_processed': 10, 'file_size_bytes': 102400}
	}
	"""

	# A line that holds nothing but a page number, optionally wrapped in dashes
	# (e.g. "12", "-12-", "- 12 -"). Every whitespace run is optional.
	_PAGE_NUMBER_RE = re.compile(r'^\s*-?\s*\d+\s*-?\s*$')

	# A line that starts with typical header/footer boilerplate. `|` must stay
	# unescaped: an escaped `\|` would match a literal pipe character instead of
	# acting as alternation.
	_HEADER_FOOTER_RE = re.compile(r'^(Page \d+|Confidential|Draft).*$', re.IGNORECASE)


	def clean_text(text: str) -> str:
	    """
	    Remove page-number lines and boilerplate header/footer lines from page text.

	    The text is split on newlines and each line is tested after stripping
	    surrounding whitespace. A line is dropped when it is only a page number
	    (optionally wrapped in dashes) or when it starts with "Page <n>",
	    "Confidential" or "Draft" (case-insensitive). Kept lines are returned
	    unmodified, including their original leading/trailing whitespace.

	    Args:
	        text (str): The raw text from a PDF page.

	    Returns:
	        str: The remaining lines joined with newlines, or "" for empty input.
	    """
	    if not text:
	        return ""

	    kept_lines = []
	    for line in text.split('\n'):
	        stripped = line.strip()
	        # Skip bare page numbers such as "12" or "- 12 -".
	        if _PAGE_NUMBER_RE.match(stripped):
	            continue
	        # Skip generic headers/footers such as "Page 3 of 10" or "CONFIDENTIAL".
	        if _HEADER_FOOTER_RE.match(stripped):
	            continue
	        # Append the original line (not the stripped one) to preserve layout.
	        kept_lines.append(line)

	    return '\n'.join(kept_lines)

	def extract_text_from_pdf(pdf_path: str) -> Dict[str, Any]:
	    """
	    Extract cleaned text from every page of a PDF file.

	    Each page is read with pdfplumber and, when it yields any text, passed
	    through `clean_text`. Page results are joined with a blank line between
	    them. Extraction is best-effort: if an exception is raised while opening
	    or reading the PDF, the error is printed and whatever was collected up
	    to that point is still returned.

	    Args:
	        pdf_path (str): The absolute or relative path to the PDF file.

	    Returns:
	        Dict[str, Any]: A dictionary with the keys
	            - "filename": base name of `pdf_path`
	            - "text": cleaned page texts joined by "\\n\\n"
	            - "metadata": {"pages_processed": int, "file_size_bytes": int},
	              where "pages_processed" counts pages whose extraction call
	              returned (including pages that produced no text).

	    Raises:
	        FileNotFoundError: If nothing exists at `pdf_path`.
	    """
	    if not os.path.exists(pdf_path):
	        raise FileNotFoundError(f"PDF file not found at: {pdf_path}")

	    filename = os.path.basename(pdf_path)
	    file_size = os.path.getsize(pdf_path)
	    page_texts = []
	    pages_processed = 0

	    # Broad catch is intentional at this boundary: a damaged PDF should
	    # yield a partial result rather than abort the caller.
	    try:
	        with pdfplumber.open(pdf_path) as pdf:
	            for page in pdf.pages:
	                raw_text = page.extract_text()
	                # Count the page only after extraction returned, so a page
	                # that raises is not reported as processed.
	                pages_processed += 1
	                if raw_text:
	                    page_texts.append(clean_text(raw_text))
	    except Exception as e:
	        print(f"Error extracting text from {filename}: {e}")

	    return {
	        "filename": filename,
	        "text": "\n\n".join(page_texts),
	        "metadata": {
	            "pages_processed": pages_processed,
	            "file_size_bytes": file_size,
	        },
	    }

	if __name__ == "__main__":
	    # Manual smoke test: pass a PDF path as the first command-line argument
	    # to print how many characters and pages were extracted. With no
	    # argument the script does nothing.
	    import sys

	    cli_args = sys.argv[1:]
	    if cli_args:
	        result = extract_text_from_pdf(cli_args[0])
	        print(f"Extracted {len(result['text'])} characters from {result['metadata']['pages_processed']} pages.")