import re
from typing import List, Dict, Any
import tiktoken
from src.config import MAX_TOKENS, MIN_TOKENS

"""
chunking.py

What it does:
Splits raw text into semantic chunks of an appropriate token size, while trying to respect
sentence boundaries and preserving section context (e.g., "Dosage", "Side Effects").

How it works:
1. It detects major section headers by matching short lines against a list of known header keywords.
2. It splits the text into sentences using punctuation boundaries.
3. It iterates through the sentences, adding them to the current chunk until adding the next
   sentence would exceed MAX_TOKENS (400).
4. If a chunk completes but is under MIN_TOKENS (20), it merges it backward into the previous chunk.
5. It uses the `tiktoken` library (cl100k_base encoding) to accurately count tokens.

Example input:
- Raw text extracted from a PDF.

Example output:
[
  {"text": "Pemetrexed is indicated for...", "section": "Indications", "index": 0},
  {"text": "The recommended dosage is...", "section": "Dosage", "index": 1}
]
"""

def _load_encoder():
    """Return the tiktoken encoding object used for token counting.

    The ``cl100k_base`` encoding is requested by name first. If that call
    raises, the encoding tiktoken associates with ``gpt-3.5-turbo`` is
    requested instead; an error from this second call is not caught.
    """
    try:
        return tiktoken.get_encoding("cl100k_base")
    except Exception:
        # Fallback if cl100k_base is somehow unavailable
        return tiktoken.encoding_for_model("gpt-3.5-turbo")


# Created once at import time; count_tokens() reads this module-level object.
encoder = _load_encoder()

def count_tokens(text: str) -> int:
    """Count the tokens the module-level ``encoder`` produces for ``text``.

    Args:
        text (str): The string to tokenize.

    Returns:
        int: Length of the token sequence returned by ``encoder.encode``.
    """
    token_ids = encoder.encode(text)
    return len(token_ids)

# Common medical document headers, checked in this order; the first keyword
# found in the line wins. "contraindications" must come before "indications"
# because the shorter word is a substring of the longer one and would
# otherwise claim every "Contraindications" header.
_SECTION_HEADERS = (
    "dosage",
    "side effects",
    "contraindications",
    "indications",
    "warnings",
    "precautions",
    "adverse reactions",
    "description",
    "clinical pharmacology",
)

# Lines at or above this length (after stripping) are treated as body text.
_MAX_HEADER_LENGTH = 50


def detect_section(line: str, current_section: str) -> str:
    """
    Detects if a line is a medical section header and returns it.
    Otherwise, returns the current section.

    A line counts as a header when, after stripping and lower-casing, it is
    shorter than 50 characters and contains one of the known header keywords.

    Args:
        line (str): One line (paragraph) of the document.
        current_section (str): The section name in effect before this line.

    Returns:
        str: The title-cased keyword that matched (e.g. "Side Effects"), or
        ``current_section`` unchanged when the line is not a header.
    """
    clean_line = line.strip().lower()

    # Simple heuristic: only short lines are considered header candidates.
    if len(clean_line) >= _MAX_HEADER_LENGTH:
        return current_section

    for header in _SECTION_HEADERS:
        if header in clean_line:
            return header.title()

    return current_section

# Sentence boundary: a run of whitespace directly after '.', '!' or '?'.
# Compiled once instead of being looked up for every paragraph.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')


def _flush_chunk(chunks: List[Dict[str, Any]], sentences: List[str], section: str) -> None:
    """Append the buffered sentences to ``chunks`` as one chunk, then empty the buffer."""
    if not sentences:
        return
    chunks.append({
        "text": " ".join(sentences),
        "section": section,
        "index": len(chunks),
    })
    sentences.clear()


def _split_oversized_sentence(sentence: str) -> List[str]:
    """Break a sentence that exceeds MAX_TOKENS into word-aligned pieces.

    Words are packed greedily. Each word is counted on its own, so a piece's
    running total is an estimate of the joined text's token count rather than
    an exact figure. A single word that is itself over the limit cannot be
    split here and comes back as its own piece. Runs of whitespace inside the
    sentence collapse to single spaces.
    """
    pieces: List[str] = []
    words: List[str] = []
    tokens = 0
    for word in sentence.split():
        word_tokens = count_tokens(word)
        if words and tokens + word_tokens > MAX_TOKENS:
            pieces.append(" ".join(words))
            words = []
            tokens = 0
        words.append(word)
        tokens += word_tokens
    if words:
        pieces.append(" ".join(words))
    return pieces


def chunk_text(text: str) -> List[Dict[str, Any]]:
    """
    Chunks raw text into a list of semantic chunks with metadata.

    Sentences are packed into a chunk until the next one would push the
    summed token count past MAX_TOKENS. A chunk is also closed whenever
    ``detect_section`` reports a different section, so every sentence in a
    chunk was read under the section the chunk is labelled with. A sentence
    that is longer than MAX_TOKENS on its own is split on word boundaries.

    Afterwards, any chunk below MIN_TOKENS (other than the first) is appended
    to the chunk before it. That merge ignores section boundaries: the merged
    text keeps the earlier chunk's ``section`` label, and the combined chunk
    may end up above MAX_TOKENS.

    Args:
        text (str): The raw text string.

    Returns:
        List[Dict]: A list of chunk dictionaries containing 'text', 'section', and 'index'.
        'index' is the chunk's position in the returned list. Empty or
        whitespace-only input yields an empty list.
    """
    chunks: List[Dict[str, Any]] = []
    pending: List[str] = []          # sentences of the chunk being built
    pending_tokens = 0               # summed per-sentence token counts of `pending`
    pending_section = "General"      # section the sentences in `pending` belong to
    current_section = "General"

    # Work line by line so that header lines can be recognised on their own.
    for raw_line in text.split('\n'):
        paragraph = raw_line.strip()
        if not paragraph:
            continue

        current_section = detect_section(paragraph, current_section)
        if current_section != pending_section:
            # Close the chunk before the header line is added, so text from
            # the previous section is never labelled with the new one.
            _flush_chunk(chunks, pending, pending_section)
            pending_tokens = 0
            pending_section = current_section

        for sentence in _SENTENCE_BOUNDARY.split(paragraph):
            if not sentence.strip():
                continue

            sentence_tokens = count_tokens(sentence)
            if sentence_tokens > MAX_TOKENS:
                pieces = [
                    (piece, count_tokens(piece))
                    for piece in _split_oversized_sentence(sentence)
                ]
            else:
                pieces = [(sentence, sentence_tokens)]

            for piece, piece_tokens in pieces:
                # Start a new chunk when this piece would overflow the current one.
                if pending and pending_tokens + piece_tokens > MAX_TOKENS:
                    _flush_chunk(chunks, pending, pending_section)
                    pending_tokens = 0
                pending.append(piece)
                pending_tokens += piece_tokens

    # Add the last chunk if it has content
    _flush_chunk(chunks, pending, pending_section)

    # Post-processing: merge backward if a chunk is less than MIN_TOKENS.
    # The first chunk has nothing before it and is kept as-is.
    merged_chunks: List[Dict[str, Any]] = []
    for chunk in chunks:
        if merged_chunks and count_tokens(chunk["text"]) < MIN_TOKENS:
            merged_chunks[-1]["text"] += " " + chunk["text"]
        else:
            merged_chunks.append(chunk)

    # Re-index so 'index' matches the position after merging.
    for position, chunk in enumerate(merged_chunks):
        chunk["index"] = position

    return merged_chunks

if __name__ == "__main__":
    # Manual smoke test: chunk a short label excerpt and print each chunk.
    demo_text = (
        "Indications and Usage. "
        "Pemetrexed is indicated for the treatment of patients with locally "
        "advanced or metastatic nonsquamous non-small cell lung cancer (NSCLC). "
        "Dosage and Administration. "
        "The recommended dose of Pemetrexed is 500 mg/m2 administered as an "
        "intravenous infusion over 10 minutes on Day 1 of each 21-day cycle."
    )
    print("Chunking sample text...")
    for piece in chunk_text(demo_text):
        print(f"Chunk {piece['index']} [{piece['section']}]: {piece['text']}")