"""
chunking.py

What it does:
Splits raw text into semantic chunks of an appropriate token size, while trying
to respect sentence boundaries and preserving section context
(e.g., "Dosage", "Side Effects").

How it works:
1. It uses regular expressions to detect major section headers.
2. It splits the text into sentences using punctuation boundaries.
3. It iterates through the sentences, adding them to the current chunk until
   adding the next sentence would exceed MAX_TOKENS, or until a new section
   header is detected. A chunk therefore never mixes text from two sections.
4. A sentence that is larger than MAX_TOKENS on its own is broken into
   word-aligned pieces before being added.
5. If a chunk ends up under MIN_TOKENS, it is merged backward into the previous
   chunk, provided both belong to the same section.
6. It uses the `tiktoken` library (cl100k_base encoding) to count tokens.

MAX_TOKENS and MIN_TOKENS are imported from src.config (earlier notes in this
file quote 400 and 20; the config module is the source of truth).

Example input:
- Raw text extracted from a PDF.

Example output:
[
    {"text": "Pemetrexed is indicated for...", "section": "Indications", "index": 0},
    {"text": "The recommended dosage is...", "section": "Dosage", "index": 1}
]
"""

import re
from typing import Any, Dict, List

import tiktoken

from src.config import MAX_TOKENS, MIN_TOKENS

# Initialize tiktoken encoder
try:
    encoder = tiktoken.get_encoding("cl100k_base")
except Exception:
    # Fallback if cl100k_base is somehow unavailable
    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Common medical document headers, in priority order: when a line mentions
# several of them, the first entry listed here wins.
_SECTION_HEADERS = (
    "dosage",
    "side effects",
    "indications",
    "warnings",
    "contraindications",
    "precautions",
    "adverse reactions",
    "description",
    "clinical pharmacology",
)

# (display name, pattern) pairs. The leading \b stops "indications" from
# matching inside "contraindications"; the end is deliberately left open so
# that simple variants such as "dosages" are still recognised.
_SECTION_PATTERNS = tuple(
    (header.title(), re.compile(r"\b" + re.escape(header)))
    for header in _SECTION_HEADERS
)

# Lines at or above this length are treated as body text, never as headers.
_MAX_HEADER_LENGTH = 50

# Sentence boundary: period, question mark, or exclamation point followed by
# whitespace.
_SENTENCE_BOUNDARY_RE = re.compile(r'(?<=[.!?])\s+')


def count_tokens(text: str) -> int:
    """Returns the number of tokens in a text string."""
    return len(encoder.encode(text))


def detect_section(line: str, current_section: str) -> str:
    """
    Detects if a line is a medical section header and returns it.
    Otherwise, returns the current section.

    Args:
        line (str): A single line (paragraph) of the document.
        current_section (str): The section in effect before this line.

    Returns:
        str: The title-cased header name (e.g. "Side Effects") when the line is
        shorter than 50 characters and contains a known header starting at a
        word boundary; otherwise `current_section` unchanged.
    """
    clean_line = line.strip().lower()

    # Simple heuristic: only short lines are considered header candidates.
    if len(clean_line) >= _MAX_HEADER_LENGTH:
        return current_section

    for title, pattern in _SECTION_PATTERNS:
        if pattern.search(clean_line):
            return title

    return current_section


def _split_oversized_sentence(sentence: str) -> List[str]:
    """
    Breaks a sentence that exceeds MAX_TOKENS into word-aligned pieces.

    Words are packed greedily. Each word is counted together with a leading
    space as an approximation of its token cost inside running text. Runs of
    whitespace inside the sentence are collapsed to single spaces. A single
    word that is itself larger than MAX_TOKENS is kept whole, because it
    cannot be divided without cutting through the word.
    """
    pieces: List[str] = []
    words: List[str] = []
    running_tokens = 0

    for word in sentence.split():
        word_tokens = count_tokens(" " + word)
        if words and running_tokens + word_tokens > MAX_TOKENS:
            pieces.append(" ".join(words))
            words = []
            running_tokens = 0
        words.append(word)
        running_tokens += word_tokens

    if words:
        pieces.append(" ".join(words))

    return pieces


def _merge_small_chunks(chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Merges chunks under MIN_TOKENS backward into the previous chunk and re-indexes.

    A merge only happens when both chunks carry the same section label, so a
    short section is never absorbed into (and relabelled as) its neighbour.
    The chunk dictionaries are updated in place.
    """
    merged: List[Dict[str, Any]] = []

    for chunk in chunks:
        previous = merged[-1] if merged else None
        if (
            previous is not None
            and previous["section"] == chunk["section"]
            and count_tokens(chunk["text"]) < MIN_TOKENS
        ):
            previous["text"] += " " + chunk["text"]
        else:
            merged.append(chunk)

    for position, chunk in enumerate(merged):
        chunk["index"] = position

    return merged


def chunk_text(text: str) -> List[Dict[str, Any]]:
    """
    Chunks raw text into a list of semantic chunks with metadata.

    Args:
        text (str): The raw text string. Lines are separated by '\\n'; blank
            lines are ignored.

    Returns:
        List[Dict]: A list of chunk dictionaries containing 'text', 'section',
        and 'index'. 'section' is "General" until the first header is detected.
        An empty list is returned for empty input.
    """
    if not text:
        return []

    chunks: List[Dict[str, Any]] = []
    pending: List[str] = []
    pending_tokens = 0
    current_section = "General"

    def flush() -> None:
        """Emit the pending sentences as one chunk tagged with the active section."""
        nonlocal pending, pending_tokens
        if not pending:
            return
        chunks.append({
            "text": " ".join(pending),
            "section": current_section,
            "index": len(chunks)
        })
        pending = []
        pending_tokens = 0

    for raw_paragraph in text.split('\n'):
        paragraph = raw_paragraph.strip()
        if not paragraph:
            continue

        # Close the pending chunk *before* switching sections, so that text
        # collected under the previous header keeps the previous label.
        detected = detect_section(paragraph, current_section)
        if detected != current_section:
            flush()
            current_section = detected

        for sentence in _SENTENCE_BOUNDARY_RE.split(paragraph):
            if not sentence.strip():
                continue

            sentence_tokens = count_tokens(sentence)
            if sentence_tokens > MAX_TOKENS:
                # Rare, but a single sentence can be larger than a whole chunk.
                pieces = _split_oversized_sentence(sentence)
            else:
                pieces = [sentence]

            for piece in pieces:
                piece_tokens = (
                    sentence_tokens if piece is sentence else count_tokens(piece)
                )
                # Start a new chunk when this piece would push us over the limit.
                if pending and pending_tokens + piece_tokens > MAX_TOKENS:
                    flush()
                pending.append(piece)
                pending_tokens += piece_tokens

    # Add the last chunk if it has content
    flush()

    return _merge_small_chunks(chunks)


if __name__ == "__main__":
    # Test block
    sample = "Indications and Usage. Pemetrexed is indicated for the treatment of patients with locally advanced or metastatic nonsquamous non-small cell lung cancer (NSCLC). Dosage and Administration. The recommended dose of Pemetrexed is 500 mg/m2 administered as an intravenous infusion over 10 minutes on Day 1 of each 21-day cycle."

    print("Chunking sample text...")
    res = chunk_text(sample)
    for c in res:
        print(f"Chunk {c['index']} [{c['section']}]: {c['text']}")