Spaces:

salvinjose
/

HNTAI

Paused

App Files Files Community

sachinchandrankallar committed on Nov 25, 2025

Commit

baf854e

1 Parent(s): 733c0c5

feat: Implement unified model management, centralized constants, and error handling for AI medical extraction services.

Browse files

Files changed (13) hide show

__pycache__/app.cpython-311.pyc +0 -0
services/ai-service/src/ai_med_extract/__pycache__/inference_service.cpython-311.pyc +0 -0
services/ai-service/src/ai_med_extract/__pycache__/phi_scrubber_service.cpython-311.pyc +0 -0
services/ai-service/src/ai_med_extract/agents/__pycache__/patient_summary_agent.cpython-311.pyc +0 -0
services/ai-service/src/ai_med_extract/agents/__pycache__/summarizer.cpython-311.pyc +0 -0
services/ai-service/src/ai_med_extract/services/error_handler.py +16 -0
services/ai-service/src/ai_med_extract/utils/__pycache__/model_config.cpython-311.pyc +0 -0
services/ai-service/src/ai_med_extract/utils/__pycache__/openvino_summarizer_utils.cpython-311.pyc +0 -0
services/ai-service/src/ai_med_extract/utils/__pycache__/performance_monitor.cpython-311.pyc +0 -0
services/ai-service/src/ai_med_extract/utils/constants.py +1 -0
services/ai-service/src/ai_med_extract/utils/model_config.py +59 -0
services/ai-service/src/ai_med_extract/utils/unified_model_manager.py +175 -1
services/ai-service/test_token_limits.py +120 -0

__pycache__/app.cpython-311.pyc CHANGED Viewed

Binary files a/__pycache__/app.cpython-311.pyc and b/__pycache__/app.cpython-311.pyc differ

services/ai-service/src/ai_med_extract/__pycache__/inference_service.cpython-311.pyc CHANGED Viewed

Binary files a/services/ai-service/src/ai_med_extract/__pycache__/inference_service.cpython-311.pyc and b/services/ai-service/src/ai_med_extract/__pycache__/inference_service.cpython-311.pyc differ

services/ai-service/src/ai_med_extract/__pycache__/phi_scrubber_service.cpython-311.pyc CHANGED Viewed

Binary files a/services/ai-service/src/ai_med_extract/__pycache__/phi_scrubber_service.cpython-311.pyc and b/services/ai-service/src/ai_med_extract/__pycache__/phi_scrubber_service.cpython-311.pyc differ

services/ai-service/src/ai_med_extract/agents/__pycache__/patient_summary_agent.cpython-311.pyc CHANGED Viewed

Binary files a/services/ai-service/src/ai_med_extract/agents/__pycache__/patient_summary_agent.cpython-311.pyc and b/services/ai-service/src/ai_med_extract/agents/__pycache__/patient_summary_agent.cpython-311.pyc differ

services/ai-service/src/ai_med_extract/agents/__pycache__/summarizer.cpython-311.pyc CHANGED Viewed

Binary files a/services/ai-service/src/ai_med_extract/agents/__pycache__/summarizer.cpython-311.pyc and b/services/ai-service/src/ai_med_extract/agents/__pycache__/summarizer.cpython-311.pyc differ

services/ai-service/src/ai_med_extract/services/error_handler.py CHANGED Viewed

@@ -23,6 +23,7 @@ class ErrorCategory(Enum):
     MEMORY = "memory"
     VALIDATION = "validation"
     GENERATION = "generation"
     CACHE = "cache"
     UNKNOWN = "unknown"
@@ -79,10 +80,16 @@ def categorize_error(error: Exception) -> ErrorCategory:
         return ErrorCategory.MEMORY
     elif "validation" in error_str or "value" in error_str or isinstance(error, ValueError):
         return ErrorCategory.VALIDATION
     # Detect model/generation failures
     try:
         from ..utils.unified_model_manager import ModelError  # type: ignore
         if isinstance(error, ModelError):
             return ErrorCategory.GENERATION
     except Exception:
         pass
@@ -233,6 +240,15 @@ def _get_default_recommendations(category: ErrorCategory, error_str: str) -> lis
             "Check data format and types",
             "Review API documentation"
         ]
     elif category == ErrorCategory.GENERATION:
         recommendations = [
             "Verify model availability and internet access",

     MEMORY = "memory"
     VALIDATION = "validation"
     GENERATION = "generation"
+    TOKEN_LIMIT = "token_limit"
     CACHE = "cache"
     UNKNOWN = "unknown"
         return ErrorCategory.MEMORY
     elif "validation" in error_str or "value" in error_str or isinstance(error, ValueError):
         return ErrorCategory.VALIDATION
+    # Detect token limit errors
+    elif "token_limit_exceeded" in error_str or "token limit" in error_str or "input is too long" in error_str or "maximum context length" in error_str:
+        return ErrorCategory.TOKEN_LIMIT
     # Detect model/generation failures
     try:
         from ..utils.unified_model_manager import ModelError  # type: ignore
         if isinstance(error, ModelError):
+            # Check if it's specifically a token limit error
+            if hasattr(error, 'error_type') and error.error_type == "token_limit_exceeded":
+                return ErrorCategory.TOKEN_LIMIT
             return ErrorCategory.GENERATION
     except Exception:
         pass
             "Check data format and types",
             "Review API documentation"
         ]
+    elif category == ErrorCategory.TOKEN_LIMIT:
+        recommendations = [
+            "Reduce the number of patient visits in the request",
+            "Use a model with larger context window (e.g., Phi-3-mini-128k-instruct instead of 4k)",
+            "Split patient data into multiple requests",
+            "Use chunking endpoints for large datasets",
+            "Filter visits by date range to reduce data size",
+            "Check logs for exact token count and model limits"
+        ]
     elif category == ErrorCategory.GENERATION:
         recommendations = [
             "Verify model availability and internet access",

services/ai-service/src/ai_med_extract/utils/__pycache__/model_config.cpython-311.pyc CHANGED Viewed

Binary files a/services/ai-service/src/ai_med_extract/utils/__pycache__/model_config.cpython-311.pyc and b/services/ai-service/src/ai_med_extract/utils/__pycache__/model_config.cpython-311.pyc differ

services/ai-service/src/ai_med_extract/utils/__pycache__/openvino_summarizer_utils.cpython-311.pyc CHANGED Viewed

Binary files a/services/ai-service/src/ai_med_extract/utils/__pycache__/openvino_summarizer_utils.cpython-311.pyc and b/services/ai-service/src/ai_med_extract/utils/__pycache__/openvino_summarizer_utils.cpython-311.pyc differ

services/ai-service/src/ai_med_extract/utils/__pycache__/performance_monitor.cpython-311.pyc CHANGED Viewed

Binary files a/services/ai-service/src/ai_med_extract/utils/__pycache__/performance_monitor.cpython-311.pyc and b/services/ai-service/src/ai_med_extract/utils/__pycache__/performance_monitor.cpython-311.pyc differ

services/ai-service/src/ai_med_extract/utils/constants.py CHANGED Viewed

@@ -80,6 +80,7 @@ ERROR_MESSAGES = {
     "model_load_failed": "Failed to load AI model. Please try again or contact support.",
     "generation_timeout": "Summary generation timed out. Please try again with a simpler request.",
     "generation_failed": "Summary generation failed. Please try again or contact support.",
     "cache_error": "Cache operation failed. Continuing with fresh generation."
 }

     "model_load_failed": "Failed to load AI model. Please try again or contact support.",
     "generation_timeout": "Summary generation timed out. Please try again with a simpler request.",
     "generation_failed": "Summary generation failed. Please try again or contact support.",
+    "token_limit_exceeded": "Patient data exceeds model's token limit. Please reduce the number of visits or use a model with larger context window.",
     "cache_error": "Cache operation failed. Continuing with fresh generation."
 }

services/ai-service/src/ai_med_extract/utils/model_config.py CHANGED Viewed

@@ -172,6 +172,65 @@ QUANTIZATION_CONFIG = {
     "skip_layers": ["embeddings", "lm_head", "shared", "embed_positions"]  # Layers to skip quantization
 }
 def get_default_model(model_type: str, use_spaces_optimized: bool = False) -> str:
     """Get the default model for a given type, optimized for T4 Spaces"""
     # Always use T4-optimized models when on T4 Medium

     "skip_layers": ["embeddings", "lm_head", "shared", "embed_positions"]  # Layers to skip quantization
 }
# ========== MODEL-SPECIFIC TOKEN LIMITS ==========
# Maximum context window sizes for different models.
# NOTE: insertion order matters -- get_model_token_limit() returns the first
# substring match it meets while iterating this mapping.
MODEL_TOKEN_LIMITS = {
    # Phi-3 models
    "microsoft/Phi-3-mini-4k-instruct": 4096,
    "microsoft/Phi-3-mini-4k-instruct-gguf": 4096,
    "microsoft/Phi-3-mini-4k-instruct-GGUF": 4096,
    "microsoft/Phi-3-mini-128k-instruct": 131072,
    "microsoft/Phi-3-mini-128k-instruct-gguf": 131072,
    "microsoft/Phi-3-small-8k-instruct": 8192,
    "microsoft/Phi-3-small-128k-instruct": 131072,
    # OpenVINO models
    "OpenVINO/Phi-3-mini-4k-instruct-fp16-ov": 4096,
    "OpenVINO/Phi-3-mini-128k-instruct-int4-ov": 131072,
    # DialoGPT and BART models
    "microsoft/DialoGPT-small": 1024,
    "microsoft/DialoGPT-medium": 1024,
    "facebook/bart-base": 1024,
    "facebook/bart-large-cnn": 1024,
    "sshleifer/distilbart-cnn-6-6": 1024,
    # Default fallback (a sentinel entry, not a model name)
    "default": 4096
}


def get_model_token_limit(model_name: str) -> int:
    """
    Get the maximum token limit for a specific model.

    Resolution order:
      1. exact key in MODEL_TOKEN_LIMITS;
      2. case-insensitive substring match against the known model names, in
         either direction (e.g. GGUF files given with their full path);
      3. a context-size marker embedded in the name ("4k" ... "128k");
      4. the "default" entry.

    Args:
        model_name: Name (or path) of the model. None, empty and non-string
            values resolve to the default limit instead of raising.
    Returns:
        Maximum token limit for the model
    """
    # Function-scope import: only the marker fallback below needs it.
    import re

    default_limit = MODEL_TOKEN_LIMITS["default"]
    if not isinstance(model_name, str) or not model_name:
        return default_limit

    # Check exact match first
    if model_name in MODEL_TOKEN_LIMITS:
        return MODEL_TOKEN_LIMITS[model_name]

    # Check for partial matches (e.g., for GGUF files with full paths)
    model_name_lower = model_name.lower()
    for key, limit in MODEL_TOKEN_LIMITS.items():
        if key == "default":
            # The sentinel must not match names that merely contain "default";
            # that would short-circuit the size-marker fallback below.
            continue
        key_lower = key.lower()
        if key_lower in model_name_lower or model_name_lower in key_lower:
            return limit

    # Check for common size markers. The number is parsed as a whole so that
    # "64k" is not mistaken for "4k" (nor "48k" for "8k"); only well-known
    # window sizes are accepted so unrelated suffixes such as a dataset size
    # ("52k") fall through to the default.
    known_sizes_k = (4, 8, 16, 32, 64, 128)
    markers = [
        int(digits)
        for digits in re.findall(r"(?<!\d)(\d+)k", model_name_lower)
        if int(digits) in known_sizes_k
    ]
    if markers:
        # Largest marker wins, matching the previous 128k > 8k > 4k precedence.
        return max(markers) * 1024

    # Return default
    return default_limit
 def get_default_model(model_type: str, use_spaces_optimized: bool = False) -> str:
     """Get the default model for a given type, optimized for T4 Spaces"""
     # Always use T4-optimized models when on T4 Medium

services/ai-service/src/ai_med_extract/utils/unified_model_manager.py CHANGED Viewed

@@ -25,7 +25,7 @@ import torch
 from .model_config import (
     get_default_model, get_fallback_model, get_t4_model_kwargs,
     get_t4_generation_config, is_model_supported_on_t4, detect_model_type,
-    IS_T4_MEDIUM
 )
 # Configure logging
@@ -76,6 +76,96 @@ class ModelError(Exception):
         self.timestamp = time.time()
         super().__init__(f"Model {model_name} failed ({error_type}): {details}")
 # Global unified model manager instance
 unified_model_manager = None
@@ -229,6 +319,24 @@ class TransformersModel(BaseModel):
         if self._model is None:
             raise ModelError(self.name, "not_loaded", "Model not loaded")
         try:
             # Get T4-optimized generation config
             gen_config = get_t4_generation_config(self.model_type)
@@ -275,6 +383,16 @@ class TransformersModel(BaseModel):
                 return generated_text
         except Exception as e:
             raise ModelError(self.name, "generation_failed", f"Generation failed: {str(e)}", e)
 class GGUFModel(BaseModel):
@@ -330,6 +448,24 @@ class GGUFModel(BaseModel):
         if self._model is None:
             raise ModelError(self.name, "not_loaded", "Model not loaded")
         try:
             # Get T4-optimized generation config
             gen_config = get_t4_generation_config("gguf")
@@ -347,6 +483,16 @@ class GGUFModel(BaseModel):
             return result['choices'][0]['text'] if result and 'choices' in result else ""
         except Exception as e:
             raise ModelError(self.name, "generation_failed", f"Generation failed: {str(e)}", e)
 class OpenVINOModel(BaseModel):
@@ -390,6 +536,24 @@ class OpenVINOModel(BaseModel):
         if self._model is None or self._tokenizer is None:
             raise ModelError(self.name, "not_loaded", "Model not loaded")
         try:
             inputs = self._tokenizer(prompt, return_tensors="pt")
             if torch.cuda.is_available():
@@ -413,6 +577,16 @@ class OpenVINOModel(BaseModel):
             return generated_text
         except Exception as e:
             raise ModelError(self.name, "generation_failed", f"Generation failed: {str(e)}", e)
 class FallbackModel(BaseModel):

 from .model_config import (
     get_default_model, get_fallback_model, get_t4_model_kwargs,
     get_t4_generation_config, is_model_supported_on_t4, detect_model_type,
+    get_model_token_limit, IS_T4_MEDIUM
 )
 # Configure logging
         self.timestamp = time.time()
         super().__init__(f"Model {model_name} failed ({error_type}): {details}")
def count_tokens(text: str, model_name: str = None) -> int:
    """
    Estimate token count for a given text.

    Uses a simple heuristic: ~4 characters per token, plus 10% overhead for
    special tokens and formatting. The result is rounded UP so that short,
    non-empty text is never estimated as 0 tokens and the estimate errs on
    the side of over-counting, which is the safe direction for a limit check.

    Args:
        text: Text to count tokens for; None or empty text yields 0
        model_name: Optional model name. Accepted for API compatibility; the
            current heuristic does not use it.
    Returns:
        Estimated token count
    """
    if not text:
        return 0
    # ceil(len(text) / 4 * 1.1) in exact integer arithmetic:
    # len / 4 * 1.1 == len * 11 / 40, and -(-a // b) is ceil(a / b).
    return -(-len(text) * 11 // 40)
def check_token_limits(text: str, model_name: str, reserve_for_output: int = 8192) -> dict:
    """
    Check if text exceeds model's token limit.

    The input budget is the model's maximum context size minus the tokens
    reserved for generated output.

    Args:
        text: Input text to check
        model_name: Name of the model
        reserve_for_output: Tokens to reserve for model output. Note that the
            default (8192) is larger than the 4096-token default window, so
            callers targeting small models should pass an explicit value.
    Returns:
        Dictionary with check results:
        - within_limit: bool
        - estimated_tokens: int
        - max_tokens: int
        - available_for_input: int (zero or negative when the output reserve
          consumes the whole context window)
        - reserve_for_output: int (echo of the argument)
        - usage_percentage: float; 100.0 when there is no input budget at all
          but the text is non-empty, so an impossible request is never
          reported as 0% usage
    """
    max_tokens = get_model_token_limit(model_name)
    estimated_tokens = count_tokens(text, model_name)
    available_for_input = max_tokens - reserve_for_output

    if available_for_input > 0:
        usage_percentage = estimated_tokens / available_for_input * 100
    elif estimated_tokens > 0:
        # No room for any input: report the budget as fully used rather than 0.
        usage_percentage = 100.0
    else:
        usage_percentage = 0

    return {
        "within_limit": estimated_tokens <= available_for_input,
        "estimated_tokens": estimated_tokens,
        "max_tokens": max_tokens,
        "available_for_input": available_for_input,
        "reserve_for_output": reserve_for_output,
        "usage_percentage": usage_percentage
    }
def is_token_limit_error(error: Exception) -> bool:
    """
    Detect if an error is related to token limits being exceeded.

    Detection is heuristic: it first looks for known phrases in the lowercased
    error message, then treats an IndexError that mentions a position or index
    as a likely overflow. IndexErrors raised by ordinary Python sequence
    indexing (e.g. "list index out of range" from an empty result list) are
    excluded so they are not misreported as token-limit failures.

    Args:
        error: Exception to check
    Returns:
        True if error is token-limit related
    """
    error_str = str(error).lower()
    error_patterns = (
        "input is too long",
        "maximum context length",
        "exceeds the maximum",
        "context_length",
        "too many tokens",
        "input too long",
        "sequence length",
        "max_position_embeddings",
        "position_ids",
        "token limit",  # Added for direct token limit messages
    )
    # Check error message
    if any(pattern in error_str for pattern in error_patterns):
        return True

    # Check for IndexError which can indicate token overflow
    if isinstance(error, IndexError):
        if "position" in error_str:
            return True
        # Messages produced by built-in containers say nothing about context size.
        builtin_sequence_messages = (
            "list index out of range",
            "list assignment index out of range",
            "tuple index out of range",
            "string index out of range",
        )
        if "index" in error_str and error_str not in builtin_sequence_messages:
            return True
    return False
 # Global unified model manager instance
 unified_model_manager = None
         if self._model is None:
             raise ModelError(self.name, "not_loaded", "Model not loaded")
+        # Check token limits before generation
+        token_check = check_token_limits(prompt, self.name, config.max_tokens)
+        logger.info(f"Token check for {self.name}: {token_check['estimated_tokens']}/{token_check['available_for_input']} tokens ({token_check['usage_percentage']:.1f}%)")
+        if not token_check["within_limit"]:
+            error_msg = (
+                f"Input exceeds token limit for model {self.name}. "
+                f"Estimated tokens: {token_check['estimated_tokens']}, "
+                f"Available for input: {token_check['available_for_input']} "
+                f"(max: {token_check['max_tokens']}, reserved for output: {token_check['reserve_for_output']}). "
+                f"Please reduce the input size or use a model with larger context window."
+            )
+            logger.error(error_msg)
+            raise ModelError(self.name, "token_limit_exceeded", error_msg)
+        if token_check["usage_percentage"] > 80:
+            logger.warning(f"Approaching token limit for {self.name}: {token_check['usage_percentage']:.1f}% of available tokens")
         try:
             # Get T4-optimized generation config
             gen_config = get_t4_generation_config(self.model_type)
                 return generated_text
         except Exception as e:
+            # Check if this is a token limit error
+            if is_token_limit_error(e):
+                error_msg = (
+                    f"Token limit exceeded for model {self.name}. "
+                    f"Input length: ~{token_check['estimated_tokens']} tokens, "
+                    f"Model limit: {token_check['max_tokens']} tokens. "
+                    f"Original error: {str(e)}"
+                )
+                logger.error(error_msg)
+                raise ModelError(self.name, "token_limit_exceeded", error_msg, e)
             raise ModelError(self.name, "generation_failed", f"Generation failed: {str(e)}", e)
 class GGUFModel(BaseModel):
         if self._model is None:
             raise ModelError(self.name, "not_loaded", "Model not loaded")
+        # Check token limits before generation
+        token_check = check_token_limits(prompt, self.name, config.max_tokens)
+        logger.info(f"Token check for {self.name}: {token_check['estimated_tokens']}/{token_check['available_for_input']} tokens ({token_check['usage_percentage']:.1f}%)")
+        if not token_check["within_limit"]:
+            error_msg = (
+                f"Input exceeds token limit for model {self.name}. "
+                f"Estimated tokens: {token_check['estimated_tokens']}, "
+                f"Available for input: {token_check['available_for_input']} "
+                f"(max: {token_check['max_tokens']}, reserved for output: {token_check['reserve_for_output']}). "
+                f"Please reduce the input size or use a model with larger context window."
+            )
+            logger.error(error_msg)
+            raise ModelError(self.name, "token_limit_exceeded", error_msg)
+        if token_check["usage_percentage"] > 80:
+            logger.warning(f"Approaching token limit for {self.name}: {token_check['usage_percentage']:.1f}% of available tokens")
         try:
             # Get T4-optimized generation config
             gen_config = get_t4_generation_config("gguf")
             return result['choices'][0]['text'] if result and 'choices' in result else ""
         except Exception as e:
+            # Check if this is a token limit error
+            if is_token_limit_error(e):
+                error_msg = (
+                    f"Token limit exceeded for model {self.name}. "
+                    f"Input length: ~{token_check['estimated_tokens']} tokens, "
+                    f"Model limit: {token_check['max_tokens']} tokens. "
+                    f"Original error: {str(e)}"
+                )
+                logger.error(error_msg)
+                raise ModelError(self.name, "token_limit_exceeded", error_msg, e)
             raise ModelError(self.name, "generation_failed", f"Generation failed: {str(e)}", e)
 class OpenVINOModel(BaseModel):
         if self._model is None or self._tokenizer is None:
             raise ModelError(self.name, "not_loaded", "Model not loaded")
+        # Check token limits before generation
+        token_check = check_token_limits(prompt, self.name, config.max_tokens)
+        logger.info(f"Token check for {self.name}: {token_check['estimated_tokens']}/{token_check['available_for_input']} tokens ({token_check['usage_percentage']:.1f}%)")
+        if not token_check["within_limit"]:
+            error_msg = (
+                f"Input exceeds token limit for model {self.name}. "
+                f"Estimated tokens: {token_check['estimated_tokens']}, "
+                f"Available for input: {token_check['available_for_input']} "
+                f"(max: {token_check['max_tokens']}, reserved for output: {token_check['reserve_for_output']}). "
+                f"Please reduce the input size or use a model with larger context window."
+            )
+            logger.error(error_msg)
+            raise ModelError(self.name, "token_limit_exceeded", error_msg)
+        if token_check["usage_percentage"] > 80:
+            logger.warning(f"Approaching token limit for {self.name}: {token_check['usage_percentage']:.1f}% of available tokens")
         try:
             inputs = self._tokenizer(prompt, return_tensors="pt")
             if torch.cuda.is_available():
             return generated_text
         except Exception as e:
+            # Check if this is a token limit error
+            if is_token_limit_error(e):
+                error_msg = (
+                    f"Token limit exceeded for model {self.name}. "
+                    f"Input length: ~{token_check['estimated_tokens']} tokens, "
+                    f"Model limit: {token_check['max_tokens']} tokens. "
+                    f"Original error: {str(e)}"
+                )
+                logger.error(error_msg)
+                raise ModelError(self.name, "token_limit_exceeded", error_msg, e)
             raise ModelError(self.name, "generation_failed", f"Generation failed: {str(e)}", e)
 class FallbackModel(BaseModel):

services/ai-service/test_token_limits.py ADDED Viewed

	@@ -0,0 +1,120 @@

+"""
+Simple test to verify token limit detection works correctly.
+"""
+import sys
+import os
# Set UTF-8 encoding for Windows console
if sys.platform == 'win32':
    os.system('chcp 65001 > nul')
# Resolve src/ next to this file rather than relative to the current working
# directory, so the script can be launched from any directory.
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), 'src'))
+from ai_med_extract.utils.model_config import get_model_token_limit
+from ai_med_extract.utils.unified_model_manager import count_tokens, check_token_limits, is_token_limit_error
def test_model_token_limits():
    """Check exact, path-embedded, size-marker and fallback limit lookups."""
    print("Testing model token limits...")
    expected_limits = (
        ("microsoft/Phi-3-mini-4k-instruct", 4096),
        ("microsoft/Phi-3-mini-128k-instruct", 131072),
        ("microsoft/Phi-3-small-8k-instruct", 8192),
        # Known model name embedded in a GGUF file path
        ("microsoft/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-q4.gguf", 4096),
        # Unknown model carrying a size marker
        ("some-model-128k", 131072),
        # Nothing recognisable: falls back to the default
        ("unknown-model", 4096),
    )
    for name, expected in expected_limits:
        assert get_model_token_limit(name) == expected
    print("[PASS] Model token limits working correctly\n")
def test_token_counting():
    """Check the token estimate for empty, short and long inputs."""
    print("Testing token counting...")
    assert count_tokens("") == 0

    short_sample = "This is a test of the token counting system. It should estimate tokens based on character count."
    short_estimate = count_tokens(short_sample)
    assert 20 < short_estimate < 35, f"Expected ~27 tokens, got {short_estimate}"

    long_sample = "Patient visit data. " * 1000
    long_estimate = count_tokens(long_sample)
    assert 5000 < long_estimate < 6000, f"Expected ~5,500 tokens, got {long_estimate}"

    # The estimates are reused for the report instead of being recomputed.
    print("[PASS] Token counting working correctly")
    print(f"  Small text ({len(short_sample)} chars) = {short_estimate} tokens")
    print(f"  Large text ({len(long_sample)} chars) = {long_estimate} tokens\n")
def test_token_limit_checking():
    """Check limit validation for small, oversized and near-limit inputs.

    All cases use a 4096-token model with 2048 tokens reserved for output,
    leaving an input budget of 2048 tokens.
    """
    print("Testing token limit checking...")
    model_name = "microsoft/Phi-3-mini-4k-instruct"
    # Small input
    small_text = "Short patient summary. " * 10
    result = check_token_limits(small_text, model_name, reserve_for_output=2048)
    assert result["within_limit"]
    print(f"[PASS] Small input: {result['estimated_tokens']}/{result['available_for_input']} tokens ({result['usage_percentage']:.1f}%)")
    # Large input
    large_text = "Patient visit data. " * 2000
    result = check_token_limits(large_text, model_name, reserve_for_output=2048)
    assert not result["within_limit"]
    print(f"[PASS] Large input: {result['estimated_tokens']}/{result['available_for_input']} tokens ({result['usage_percentage']:.1f}%) - EXCEEDS LIMIT")
    # Medium input - sized to fit while exceeding the 80% warning threshold:
    # 350 * 20 = 7000 chars -> ~1925 estimated tokens (~94% of the 2048 budget)
    medium_text = "Patient visit data. " * 350
    result = check_token_limits(medium_text, model_name, reserve_for_output=2048)
    print(f"[INFO] Medium input: {result['estimated_tokens']}/{result['available_for_input']} tokens ({result['usage_percentage']:.1f}%)")
    assert result["within_limit"]
    assert result["usage_percentage"] > 80, f"Expected >80%, got {result['usage_percentage']:.1f}%"
    print("[PASS] Medium input - APPROACHING LIMIT\n")
def test_error_detection():
    """Check that token-limit errors are recognised and unrelated ones are not."""
    print("Testing error pattern detection...")
    test_cases = [
        (Exception("input is too long"), True),
        (Exception("maximum context length exceeded"), True),
        (Exception("Token limit exceeded"), True),
        (IndexError("position index out of range"), True),
        (Exception("some other error"), False),
    ]
    for error, expected in test_cases:
        result = is_token_limit_error(error)
        assert result == expected, f"Failed for: {error}"
        # Reaching this line means the case was verified, including the
        # negative ones, so every line is reported as a pass.
        message = str(error)
        preview = message if len(message) <= 40 else message[:40] + "..."
        print(f"  [PASS] '{preview}' -> token_limit={result}")
    print("[PASS] Error pattern detection working correctly\n")
if __name__ == "__main__":
    # Script entry point: run every check in order, print a summary, and exit
    # with status 1 on the first assertion failure or unexpected error.
    rule = "=" * 60
    print(rule)
    print("Token Limit Detection - Verification Tests")
    print(rule + "\n")
    try:
        checks = (
            test_model_token_limits,
            test_token_counting,
            test_token_limit_checking,
            test_error_detection,
        )
        for run_check in checks:
            run_check()
        summary_lines = (
            rule,
            "[SUCCESS] ALL TESTS PASSED",
            rule,
            "\nToken limit detection is working correctly!",
            "\nNext steps:",
            "1. Test with real patient data containing many visits",
            "2. Verify error messages appear in API responses",
            "3. Check logs for token diagnostics",
        )
        for line in summary_lines:
            print(line)
    except AssertionError as failure:
        print(f"\n[FAILED] TEST FAILED: {failure}")
        sys.exit(1)
    except Exception as unexpected:
        print(f"\n[ERROR] {unexpected}")
        import traceback
        traceback.print_exc()
        sys.exit(1)