Spaces:

salvinjose
/

HNTAI

Paused

App Files Files Community

sachinchandrankallar committed on Nov 6, 2025

Commit

299444a

1 Parent(s): 84a9aa6

Refactor PyTorch compatibility handling by centralizing the RMSNorm patch into a dedicated utility function. This ensures consistent application across modules and improves maintainability. Update logging to reflect the new approach.

Browse files

Files changed (6) hide show

services/ai-service/src/ai_med_extract/api/routes_fastapi.py +3 -29
services/ai-service/src/ai_med_extract/app.py +3 -0
services/ai-service/src/ai_med_extract/utils/fallback_pipeline.py +228 -0
services/ai-service/src/ai_med_extract/utils/model_loader_spaces.py +11 -277
services/ai-service/src/ai_med_extract/utils/torch_compat.py +120 -0
services/ai-service/src/ai_med_extract/utils/unified_model_manager.py +3 -33

services/ai-service/src/ai_med_extract/api/routes_fastapi.py CHANGED Viewed

@@ -15,35 +15,9 @@ from ..core_logger import log_with_memory, log_exception_with_memory
 logger = logging.getLogger(__name__)
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import torch
-# Patch torch.rms_norm for compatibility with models like Phi-3 that expect this function
-if not hasattr(torch, 'rms_norm'):
-    def rms_norm(input_tensor, normalized_shape=None, weight=None, eps=1e-6):
-        """Simple RMS normalization implementation compatible with various call signatures"""
-        # Handle different input formats
-        if normalized_shape is None:
-            # If no shape specified, normalize over last dimension
-            dim = -1
-            keepdim = True
-        else:
-            # If shape is specified, normalize over those dimensions
-            if isinstance(normalized_shape, int):
-                dim = normalized_shape
-                keepdim = True
-            else:
-                # Multiple dimensions - normalize over all of them
-                dim = tuple(range(-len(normalized_shape), 0))
-                keepdim = True
-        # Calculate RMS (root mean square)
-        variance = input_tensor.pow(2).mean(dim=dim, keepdim=keepdim)
-        # Normalize
-        output = input_tensor * torch.rsqrt(variance + eps)
-        # Apply weight if provided
-        if weight is not None:
-            output = output * weight
-        return output
-    torch.rms_norm = rms_norm
-    logger.info("Patched torch.rms_norm for compatibility with Phi-3 and similar models")
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline as transformers_pipeline
 import requests

 logger = logging.getLogger(__name__)
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import torch
+# Ensure PyTorch compatibility patches are applied early
+from ..utils.torch_compat import ensure_torch_compatibility
+ensure_torch_compatibility()
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline as transformers_pipeline
 import requests

services/ai-service/src/ai_med_extract/app.py CHANGED Viewed

@@ -7,6 +7,9 @@ from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 from starlette.middleware.base import BaseHTTPMiddleware
 import torch
 from contextlib import asynccontextmanager
 from datetime import datetime
 import redis.asyncio as redis

 from fastapi.middleware.cors import CORSMiddleware
 from starlette.middleware.base import BaseHTTPMiddleware
 import torch
+# Ensure PyTorch compatibility patches are applied early
+from .utils.torch_compat import ensure_torch_compatibility
+ensure_torch_compatibility()
 from contextlib import asynccontextmanager
 from datetime import datetime
 import redis.asyncio as redis

services/ai-service/src/ai_med_extract/utils/fallback_pipeline.py ADDED Viewed

	@@ -0,0 +1,228 @@

+"""
+Fallback Pipeline Implementation
+Provides a transformers-compatible pipeline wrapper for fallback scenarios
+"""
+import logging
+import torch
+from typing import Dict, Any, Optional, Union
+logger = logging.getLogger(__name__)
+class FallbackPipeline:
+    """
+    Text-generation wrapper around a transformers model and tokenizer.
+    Intended for the case where OpenVINO loading fails and a standard
+    transformers model is used instead. ``generate`` accepts either a prompt
+    string or a mapping of tokenized inputs and returns decoded text.
+    """
+    # Lower-case substrings of architecture names that mark a model family as
+    # using a dynamic cache (e.g. 'MistralForCausalLM' contains 'mistral').
+    _DYNAMIC_CACHE_ARCH_MARKERS = ('mistral', 'llama', 'phi')
+    # Legacy cache attributes stripped from the model config before generating.
+    _LEGACY_CACHE_ATTRS = ('get_max_length', 'max_cache_length')
+    # Only these keyword arguments are forwarded to ``model.generate``.
+    _ALLOWED_GENERATION_KWARGS = frozenset({
+        'max_new_tokens', 'do_sample', 'temperature', 'top_k', 'top_p',
+        'num_return_sequences', 'pad_token_id', 'eos_token_id', 'num_beams',
+        'early_stopping', 'repetition_penalty', 'use_cache',
+        'output_attentions', 'output_hidden_states', 'return_dict_in_generate',
+    })
+    def __init__(self, model, tokenizer):
+        """
+        Initialize the fallback pipeline.
+        Args:
+            model: The transformers model instance
+            tokenizer: The tokenizer instance (may be None)
+        """
+        self.model = model
+        self.tokenizer = tokenizer
+        # Use the device of the first parameter; a model that exposes no
+        # parameters leaves ``device`` as None instead of raising StopIteration.
+        first_param = None
+        if hasattr(model, 'parameters'):
+            first_param = next(iter(model.parameters()), None)
+        self.device = getattr(first_param, 'device', None)
+        self.cache_settings = {}
+        if hasattr(model, 'config'):
+            self.cache_settings['max_length'] = getattr(
+                model.config, 'max_position_embeddings', 2048
+            )
+    @staticmethod
+    def _is_mapping(obj: Any) -> bool:
+        """Return True for dicts and other mapping-like objects."""
+        # Function-scope import keeps this block self-contained.
+        from collections.abc import Mapping
+        return isinstance(obj, Mapping)
+    def _has_dynamic_cache(self) -> bool:
+        """
+        Check if the model looks like it uses a dynamic cache.
+        Returns:
+            True when the config has a sliding-window attribute or any of its
+            architecture names contains one of the known family markers.
+        """
+        model_config = getattr(self.model, 'config', None)
+        if model_config is None:
+            return False
+        if hasattr(model_config, 'sliding_window') or hasattr(model_config, 'sliding_window_size'):
+            return True
+        architectures = getattr(model_config, 'architectures', None) or ()
+        # Substring match: architecture names are class names such as
+        # 'LlamaForCausalLM', so an exact comparison would never succeed.
+        return any(
+            marker in str(arch).lower()
+            for arch in architectures
+            for marker in self._DYNAMIC_CACHE_ARCH_MARKERS
+        )
+    def _cleanup_legacy_cache_attrs(self):
+        """Remove legacy cache attributes from the model config (best effort)."""
+        model_config = getattr(self.model, 'config', None)
+        if model_config is None:
+            return
+        for legacy_attr in self._LEGACY_CACHE_ATTRS:
+            if hasattr(model_config, legacy_attr):
+                try:
+                    delattr(model_config, legacy_attr)
+                except (AttributeError, TypeError):
+                    # Attribute lives on the class or is read-only; nothing
+                    # more can be done on the instance, so leave it in place.
+                    logger.debug("Could not remove legacy cache attribute %s", legacy_attr)
+    def _get_safe_generation_kwargs(
+        self, kwargs: Dict[str, Any], inputs: Any = None
+    ) -> Dict[str, Any]:
+        """
+        Extract and sanitize generation kwargs.
+        Args:
+            kwargs: Original generation kwargs (not modified)
+            inputs: The inputs being generated from; used to measure the prompt
+                length when converting ``max_length``. Falls back to
+                ``kwargs['inputs']`` when omitted.
+        Returns:
+            A new dict holding only allowed generation parameters, with
+            ``use_cache`` set from the model's cache type and ``max_length``
+            translated into ``max_new_tokens`` when the latter is absent.
+        """
+        # The allow-list also drops unsupported keys such as 'loss_type'.
+        safe_kwargs = {
+            k: v for k, v in kwargs.items() if k in self._ALLOWED_GENERATION_KWARGS
+        }
+        # use_cache is always decided here, overriding any caller value.
+        safe_kwargs['use_cache'] = not self._has_dynamic_cache()
+        self._cleanup_legacy_cache_attrs()
+        if 'max_length' in kwargs and 'max_new_tokens' not in safe_kwargs:
+            if inputs is None:
+                inputs = kwargs.get('inputs')
+            try:
+                remaining = int(kwargs['max_length']) - int(self._get_input_length(inputs))
+                safe_kwargs['max_new_tokens'] = min(
+                    max(1, remaining),
+                    self.cache_settings.get('max_length', 2048)
+                )
+            except (TypeError, ValueError):
+                # Non-numeric max_length or model limit: use a conservative default.
+                safe_kwargs['max_new_tokens'] = 256
+        return safe_kwargs
+    def _get_input_length(self, inputs: Any) -> int:
+        """
+        Extract the token count from tokenized inputs.
+        Returns:
+            Length of the last axis of ``inputs['input_ids']`` for tensor-like
+            values, the length of the first row for nested sequences, the
+            length of the sequence itself for flat sequences, and 0 otherwise.
+        """
+        if not self._is_mapping(inputs) or 'input_ids' not in inputs:
+            return 0
+        input_ids = inputs['input_ids']
+        try:
+            return int(input_ids.shape[-1])
+        except (AttributeError, IndexError, TypeError):
+            pass
+        try:
+            return len(input_ids[0])
+        except TypeError:
+            # First element has no length: treat input_ids as a flat sequence.
+            try:
+                return len(input_ids)
+            except TypeError:
+                return 0
+        except (IndexError, KeyError):
+            return 0
+    def _extract_prompt_text(self, inputs: Union[str, Dict[str, Any]]) -> str:
+        """
+        Extract prompt text from various input formats.
+        Args:
+            inputs: A string prompt, or a mapping with tokenized inputs
+                and/or 'text' / 'prompt' keys
+        Returns:
+            Extracted prompt text ("" when nothing usable is found)
+        """
+        if inputs is None:
+            return ""
+        # Mapping check rather than ``dict`` so that mapping-like tokenizer
+        # outputs are decoded instead of being stringified into a prompt.
+        if not self._is_mapping(inputs):
+            return str(inputs)
+        if 'input_ids' in inputs and self.tokenizer is not None:
+            input_ids = inputs['input_ids']
+            try:
+                shape = getattr(input_ids, 'shape', None)
+                if shape is not None:
+                    batched = len(shape) > 1
+                else:
+                    batched = len(input_ids) > 0 and isinstance(input_ids[0], (list, tuple))
+                # Decode the first row of a batch, or the flat sequence itself.
+                first_row = input_ids[0] if batched else input_ids
+                return self.tokenizer.decode(first_row, skip_special_tokens=True)
+            except Exception as exc:
+                # Tokenizer failures are implementation-specific; fall back to
+                # the plain-text keys below rather than failing generation.
+                logger.debug("Could not decode input_ids to prompt text: %s", exc)
+        return inputs.get('text') or inputs.get('prompt') or ""
+    def _move_to_device(self, tokenized: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Move tensor values of ``tokenized`` to ``self.device`` (best effort).
+        Returns:
+            A new dict with tensors moved, or ``tokenized`` unchanged when no
+            device is known or the move fails.
+        """
+        if self.device is None:
+            return tokenized
+        try:
+            return {
+                k: v.to(self.device) if torch.is_tensor(v) else v
+                for k, v in tokenized.items()
+            }
+        except (RuntimeError, TypeError, AttributeError) as exc:
+            logger.debug("Could not move inputs to %s: %s", self.device, exc)
+            return tokenized
+    def generate(self, inputs: Union[str, Dict[str, Any]], **kwargs) -> str:
+        """
+        Generate text from inputs.
+        Args:
+            inputs: Input prompt (string or mapping of tokenized inputs)
+            **kwargs: Generation parameters; unsupported ones are dropped
+        Returns:
+            Generated text
+        Raises:
+            Exception: errors other than TypeError raised during generation
+                propagate unchanged; after a TypeError, whatever the unified
+                model manager fallback raises is re-raised.
+        """
+        # Pass inputs along so max_length can be converted using the real
+        # prompt length.
+        safe_kwargs = self._get_safe_generation_kwargs(kwargs, inputs)
+        prompt_text = self._extract_prompt_text(inputs)
+        try:
+            # Preferred path: re-tokenize the prompt text ourselves.
+            if self.tokenizer is not None and isinstance(prompt_text, str) and prompt_text:
+                tokenized = self._move_to_device(
+                    self.tokenizer([prompt_text], return_tensors='pt')
+                )
+                with torch.no_grad():
+                    outputs = self.model.generate(**tokenized, **safe_kwargs)
+                if hasattr(self.tokenizer, 'decode'):
+                    return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+                return str(outputs)
+            # No usable prompt text: hand the caller's inputs to the model as-is.
+            if self._is_mapping(inputs):
+                inputs = self._move_to_device(inputs)
+            with torch.no_grad():
+                outputs = self.model.generate(**inputs, **safe_kwargs)
+            if hasattr(self.tokenizer, 'decode'):
+                try:
+                    return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+                except Exception:
+                    return str(outputs)
+            return str(outputs)
+        except TypeError as te:
+            logger.warning(f"Generation signature mismatch: {te}")
+            # Last resort: try unified model manager
+            try:
+                from .unified_model_manager import unified_model_manager as _umm
+                # hasattr keeps str(self.model) from being built needlessly.
+                model_name = self.model.name if hasattr(self.model, 'name') else str(self.model)
+                return _umm.generate_text(
+                    model_name,
+                    prompt_text,
+                    model_type="text-generation"
+                )
+            except Exception as e:
+                logger.warning(f"Fallback generation failed: {e}")
+                raise

services/ai-service/src/ai_med_extract/utils/model_loader_spaces.py CHANGED Viewed

@@ -107,38 +107,9 @@ def get_openvino_pipeline(model_name: str, device: str = None):
 	logging.info(f"Loading OpenVINO model {model_name} on device: {device}")
-	# Check for torch.rms_norm compatibility issue and patch if needed
-	# Some models (like Phi-3) may reference torch.rms_norm which doesn't exist in older PyTorch versions
-	if not hasattr(torch, 'rms_norm'):
-		# Add a simple RMS normalization function to torch if missing
-		# This is a workaround for models that expect torch.rms_norm to exist
-		def rms_norm(input_tensor, normalized_shape=None, weight=None, eps=1e-6):
-			"""Simple RMS normalization implementation compatible with various call signatures"""
-			# Handle different input formats
-			if normalized_shape is None:
-				# If no shape specified, normalize over last dimension
-				dim = -1
-				keepdim = True
-			else:
-				# If shape is specified, normalize over those dimensions
-				if isinstance(normalized_shape, int):
-					dim = normalized_shape
-					keepdim = True
-				else:
-					# Multiple dimensions - normalize over all of them
-					dim = tuple(range(-len(normalized_shape), 0))
-					keepdim = True
-			# Calculate RMS (root mean square)
-			variance = input_tensor.pow(2).mean(dim=dim, keepdim=keepdim)
-			# Normalize
-			output = input_tensor * torch.rsqrt(variance + eps)
-			# Apply weight if provided
-			if weight is not None:
-				output = output * weight
-			return output
-		torch.rms_norm = rms_norm
-		logging.info("Patched torch.rms_norm for compatibility")
 	try:
 		# If model_name is a directory, try to load IR from there; else, download and export
@@ -225,112 +196,8 @@ def get_openvino_pipeline(model_name: str, device: str = None):
 					cache_dir=os.environ.get('HF_HOME', '/tmp/huggingface')
 				)
-				# Use the FallbackPipeline class defined below
-				class FallbackPipeline:
-					def __init__(self, model, tokenizer):
-						self.model = model
-						self.tokenizer = tokenizer
-						self.device = next(model.parameters()).device if hasattr(model, 'parameters') else None
-						self.cache_settings = {}
-						if hasattr(model, 'config'):
-							self.cache_settings['max_length'] = getattr(model.config, 'max_position_embeddings', 2048)
-					def generate(self, inputs, **kwargs):
-						import logging as _logging
-						if 'loss_type' in kwargs:
-							kwargs.pop('loss_type', None)
-						use_cache_value = False
-						if hasattr(self.model, 'config'):
-							model_config = self.model.config
-							has_dynamic_cache = (
-								hasattr(model_config, 'sliding_window') or
-								hasattr(model_config, 'sliding_window_size') or
-								(hasattr(model_config, 'architectures') and
-								 model_config.architectures and
-								 any('mistral' in arch.lower() or 'llama' in arch.lower() or 'phi' in arch.lower()
-									 for arch in model_config.architectures))
-							)
-							if has_dynamic_cache:
-								use_cache_value = False
-							else:
-								use_cache_value = True
-							for legacy_cache_attr in ['get_max_length', 'max_cache_length']:
-								if hasattr(model_config, legacy_cache_attr):
-									delattr(model_config, legacy_cache_attr)
-						kwargs['use_cache'] = use_cache_value
-						allowed = {
-							'max_new_tokens', 'do_sample', 'temperature', 'top_k', 'top_p', 'num_return_sequences',
-							'pad_token_id', 'eos_token_id', 'num_beams', 'early_stopping', 'repetition_penalty',
-							'use_cache', 'output_attentions', 'output_hidden_states', 'return_dict_in_generate'
-						}
-						safe_kwargs = {k: v for k, v in kwargs.items() if k in allowed}
-						if 'max_length' in kwargs and 'max_new_tokens' not in safe_kwargs:
-							try:
-								input_len = 0
-								if isinstance(inputs, dict) and 'input_ids' in inputs:
-									input_ids = inputs['input_ids']
-									try:
-										input_len = input_ids.shape[-1]
-									except Exception:
-										try:
-											input_len = len(input_ids[0])
-										except Exception:
-											input_len = 0
-								max_len_val = kwargs.get('max_length')
-								computed_new = max(1, int(max_len_val) - int(input_len))
-								safe_kwargs['max_new_tokens'] = min(computed_new, self.cache_settings.get('max_length', 2048))
-							except Exception:
-								safe_kwargs['max_new_tokens'] = 256
-						prompt_text = None
-						if isinstance(inputs, dict):
-							if 'input_ids' in inputs and self.tokenizer is not None:
-								try:
-									input_ids = inputs['input_ids']
-									if hasattr(input_ids, 'tolist'):
-										decoded = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
-									else:
-										decoded = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
-									prompt_text = decoded
-								except Exception:
-									prompt_text = None
-							else:
-								prompt_text = inputs.get('text') or inputs.get('prompt')
-						else:
-							prompt_text = inputs
-						if prompt_text is None:
-							prompt_text = ""
-						try:
-							if self.tokenizer is not None and isinstance(prompt_text, str):
-								tokenized = self.tokenizer([prompt_text], return_tensors='pt')
-								try:
-									if self.device is not None and hasattr(tokenized['input_ids'], 'to'):
-										tokenized = {k: v.to(self.device) for k, v in tokenized.items()}
-								except Exception:
-									_pass = None
-								outputs = self.model.generate(**tokenized, **safe_kwargs)
-								if hasattr(self.tokenizer, 'decode'):
-									return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-								return outputs
-							else:
-								outputs = self.model.generate(**inputs, **safe_kwargs)
-								if hasattr(self.tokenizer, 'decode'):
-									try:
-										return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-									except Exception:
-										return str(outputs)
-								return outputs
-						except TypeError as te:
-							_logging.warning(f"Fallback generate signature mismatch: {te}")
-							try:
-								from .unified_model_manager import unified_model_manager as _umm
-								return _umm.generate_text(self.model.name if hasattr(self.model, 'name') else str(self.model), prompt_text)
-							except Exception as e:
-								_logging.warning(f"Fallback final generation failed: {e}")
-								raise
 				logging.info(f"Successfully loaded transformers model as fallback: {original_model_name}")
 				return FallbackPipeline(model, tokenizer)
@@ -385,150 +252,17 @@ def get_openvino_pipeline(model_name: str, device: str = None):
 				original_model_name,
 				torch_dtype=torch.float32,
 				device_map="auto" if torch.cuda.is_available() and not is_hf_spaces else None,
-				trust_remote_code=True
 			)
 			tokenizer = AutoTokenizer.from_pretrained(
 				original_model_name,
-				trust_remote_code=True
 			)
-			# Create a compatible pipeline
-			class FallbackPipeline:
-				def __init__(self, model, tokenizer):
-					self.model = model
-					self.tokenizer = tokenizer
-					# determine device for tensors
-					self.device = next(model.parameters()).device if hasattr(model, 'parameters') else None
-					# Modern cache configuration for transformer models
-					self.cache_settings = {}
-					if hasattr(model, 'config'):
-						# Don't set use_cache at model config level - handle it per generation
-						# This prevents issues with dynamic cache systems
-						# Store max length for reference but don't enforce it
-						self.cache_settings['max_length'] = getattr(model.config, 'max_position_embeddings', 2048)
-				def generate(self, inputs, **kwargs):
-					"""Robust generate wrapper that accepts either a prompt string or a tokenized inputs dict.
-					It sanitizes unsupported kwargs (e.g., loss_type) before delegating to the underlying model.
-					"""
-					import logging as _logging
-					# Sanitize unsupported kwargs forwarded from callers
-					if 'loss_type' in kwargs:
-						kwargs.pop('loss_type', None)
-					# Modern cache handling for transformers models with dynamic cache support
-					# For single independent generations, explicitly disable cache to prevent stale cache issues
-					use_cache_value = False  # Default to False for single generations
-					# Check if model has dynamic cache support
-					if hasattr(self.model, 'config'):
-						model_config = self.model.config
-						# Check for dynamic cache indicators
-						has_dynamic_cache = (
-							hasattr(model_config, 'sliding_window') or
-							hasattr(model_config, 'sliding_window_size') or
-							(hasattr(model_config, 'architectures') and
-							 model_config.architectures and
-							 any('mistral' in arch.lower() or 'llama' in arch.lower() or 'phi' in arch.lower()
-								 for arch in model_config.architectures))
-						)
-						if has_dynamic_cache:
-							use_cache_value = False  # Disable cache for dynamic cache models in single generations
-						else:
-							# For standard models without dynamic cache, we can use cache
-							use_cache_value = True
-						# Ensure we're not passing legacy cache attributes
-						for legacy_cache_attr in ['get_max_length', 'max_cache_length']:
-							if hasattr(model_config, legacy_cache_attr):
-								delattr(model_config, legacy_cache_attr)
-					# Set use_cache in kwargs for generation
-					kwargs['use_cache'] = use_cache_value
-					# Known-safe generation args (prefer max_new_tokens for causal models)
-					allowed = {
-						'max_new_tokens', 'do_sample', 'temperature', 'top_k', 'top_p', 'num_return_sequences',
-						'pad_token_id', 'eos_token_id', 'num_beams', 'early_stopping', 'repetition_penalty',
-						'use_cache', 'output_attentions', 'output_hidden_states', 'return_dict_in_generate'
-					}
-					safe_kwargs = {k: v for k, v in kwargs.items() if k in allowed}
-					# If callers provided max_length (common in some code paths), convert it to max_new_tokens
-					# for causal models to avoid internal calls that rely on deprecated cache APIs.
-					if 'max_length' in kwargs and 'max_new_tokens' not in safe_kwargs:
-						try:
-							input_len = 0
-							if isinstance(inputs, dict) and 'input_ids' in inputs:
-								input_ids = inputs['input_ids']
-								# support tensor-like or list-like input_ids
-								try:
-									input_len = input_ids.shape[-1]
-								except Exception:
-									try:
-										input_len = len(input_ids[0])
-									except Exception:
-										input_len = 0
-							max_len_val = kwargs.get('max_length')
-							computed_new = max(1, int(max_len_val) - int(input_len))
-							safe_kwargs['max_new_tokens'] = min(computed_new, self.cache_settings.get('max_length', 2048))
-						except Exception:
-							# If anything goes wrong, default to a conservative value
-							safe_kwargs['max_new_tokens'] = 256
-					# Accept prompt string or tokenized dict
-					prompt_text = None
-					if isinstance(inputs, dict):
-						# If tokenized tensors provided, try to decode to text when tokenizer exists
-						if 'input_ids' in inputs and self.tokenizer is not None:
-							try:
-								input_ids = inputs['input_ids']
-								# handle tensors or lists
-								if hasattr(input_ids, 'tolist'):
-									decoded = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
-								else:
-									decoded = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
-								prompt_text = decoded
-							except Exception:
-								prompt_text = None
-						else:
-							prompt_text = inputs.get('text') or inputs.get('prompt')
-					else:
-						prompt_text = inputs
-					if prompt_text is None:
-						prompt_text = ""
-					try:
-						# If tokenizer available, tokenize prompt and generate
-						if self.tokenizer is not None and isinstance(prompt_text, str):
-							tokenized = self.tokenizer([prompt_text], return_tensors='pt')
-							# move tensors to device if needed
-							try:
-								if self.device is not None and hasattr(tokenized['input_ids'], 'to'):
-									tokenized = {k: v.to(self.device) for k, v in tokenized.items()}
-							except Exception:
-								_pass = None
-							outputs = self.model.generate(**tokenized, **safe_kwargs)
-							# decode if tokenizer has decode
-							if hasattr(self.tokenizer, 'decode'):
-								return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-							return outputs
-						else:
-							# Try calling model.generate with provided inputs
-							outputs = self.model.generate(**inputs, **safe_kwargs)
-							# If tokenizer exists and outputs is tensor-like, decode
-							if hasattr(self.tokenizer, 'decode'):
-								try:
-									return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-								except Exception:
-									return str(outputs)
-							return outputs
-					except TypeError as te:
-						_logging.warning(f"Fallback generate signature mismatch: {te}")
-						# As a last resort, try to call unified_model_manager if available
-						try:
-							from .unified_model_manager import unified_model_manager as _umm
-							return _umm.generate_text(self.model.name if hasattr(self.model, 'name') else str(self.model), prompt_text)
-						except Exception as e:
-							_logging.warning(f"Fallback final generation failed: {e}")
-							raise
 			logging.info(f"Successfully loaded fallback transformers model: {original_model_name}")
 			return FallbackPipeline(model, tokenizer)

 	logging.info(f"Loading OpenVINO model {model_name} on device: {device}")
+	# Ensure torch compatibility patches are applied
+	from .torch_compat import ensure_torch_compatibility
+	ensure_torch_compatibility()
 	try:
 		# If model_name is a directory, try to load IR from there; else, download and export
 					cache_dir=os.environ.get('HF_HOME', '/tmp/huggingface')
 				)
+				# Use optimized FallbackPipeline from dedicated module
+				from .fallback_pipeline import FallbackPipeline
 				logging.info(f"Successfully loaded transformers model as fallback: {original_model_name}")
 				return FallbackPipeline(model, tokenizer)
 				original_model_name,
 				torch_dtype=torch.float32,
 				device_map="auto" if torch.cuda.is_available() and not is_hf_spaces else None,
+				trust_remote_code=True,
+				cache_dir=os.environ.get('HF_HOME', '/tmp/huggingface')
 			)
 			tokenizer = AutoTokenizer.from_pretrained(
 				original_model_name,
+				trust_remote_code=True,
+				cache_dir=os.environ.get('HF_HOME', '/tmp/huggingface')
 			)
+			# Use optimized FallbackPipeline from dedicated module
+			from .fallback_pipeline import FallbackPipeline
 			logging.info(f"Successfully loaded fallback transformers model: {original_model_name}")
 			return FallbackPipeline(model, tokenizer)

services/ai-service/src/ai_med_extract/utils/torch_compat.py ADDED Viewed

	@@ -0,0 +1,120 @@

+"""
+PyTorch Compatibility Utilities
+Provides compatibility patches and optimizations for PyTorch operations
+"""
+import logging
+import torch
+from typing import Optional, Union, Tuple
+logger = logging.getLogger(__name__)
+# Guard flag for patch_torch_rms_norm(): set to True once that function has
+# either installed the replacement or found an existing torch.rms_norm, so
+# later calls return immediately.
+_RMS_NORM_PATCHED = False
+def _create_rms_norm_implementation():
+    """
+    Build a pure-PyTorch RMS normalization function.
+    Formula: output = input * rsqrt(mean(input^2) + eps) * weight, with the
+    mean taken over the trailing dimensions described by ``normalized_shape``.
+    The returned function is out-of-place: it never modifies its input.
+    Returns:
+        A callable ``rms_norm(input_tensor, normalized_shape, weight, eps)``.
+    """
+    def rms_norm(
+        input_tensor: torch.Tensor,
+        normalized_shape: Optional[Union[int, Tuple[int, ...]]] = None,
+        weight: Optional[torch.Tensor] = None,
+        eps: Optional[float] = 1e-6
+    ) -> torch.Tensor:
+        """
+        Apply RMS normalization over the trailing dimensions of a tensor.
+        Args:
+            input_tensor: Input tensor to normalize
+            normalized_shape: Shape of the trailing dimensions to normalize
+                over. An int means a single trailing dimension of that size;
+                None also normalizes over the last dimension only.
+            weight: Optional weight tensor multiplied in after normalization
+            eps: Value added to the mean square for numerical stability.
+                None selects the machine epsilon of ``input_tensor.dtype``.
+        Returns:
+            Normalized tensor with the same shape as ``input_tensor``
+        """
+        if normalized_shape is None or isinstance(normalized_shape, int):
+            # An int is the SIZE of the last dimension, not a dimension
+            # index, so it always selects the final axis.
+            dims = (-1,)
+        else:
+            # One trailing axis per entry of the shape (tuple, list, torch.Size).
+            dims = tuple(range(-len(tuple(normalized_shape)), 0))
+        if eps is None:
+            # NOTE(review): meant to mirror the native op's handling of
+            # eps=None - confirm against the installed PyTorch documentation.
+            eps = torch.finfo(input_tensor.dtype).eps
+        # Mean of squares over the normalized axes, kept for broadcasting.
+        mean_square = input_tensor.pow(2).mean(dim=dims, keepdim=True)
+        output = input_tensor * torch.rsqrt(mean_square + eps)
+        if weight is not None:
+            output = output * weight
+        return output
+    return rms_norm
+def patch_torch_rms_norm() -> bool:
+    """
+    Install a replacement ``torch.rms_norm`` when PyTorch does not provide one.
+    Some models (Phi-3 and similar) call ``torch.rms_norm``, which may be
+    missing from older PyTorch builds.
+    Returns:
+        True only when this call installed the replacement. False when the
+        patch was handled earlier, when ``torch.rms_norm`` already exists, or
+        when installing it failed.
+    """
+    global _RMS_NORM_PATCHED
+    # Nothing to do if a previous call handled it or torch ships the function;
+    # either way remember the outcome so later calls return immediately.
+    if _RMS_NORM_PATCHED or hasattr(torch, 'rms_norm'):
+        _RMS_NORM_PATCHED = True
+        return False
+    try:
+        torch.rms_norm = _create_rms_norm_implementation()
+        _RMS_NORM_PATCHED = True
+        logger.info("Patched torch.rms_norm for compatibility with Phi-3 and similar models")
+    except Exception as exc:
+        logger.warning(f"Failed to patch torch.rms_norm: {exc}")
+        return False
+    return True
+def ensure_torch_compatibility() -> None:
+    """
+    Apply every PyTorch compatibility patch defined in this module.
+    Currently that is the ``torch.rms_norm`` patch only. Intended to be
+    called while modules are being imported, before any model is loaded.
+    """
+    patch_torch_rms_norm()
+# Auto-apply patch on import: importing this module is enough to install the
+# patch. Explicit ensure_torch_compatibility() calls elsewhere are harmless
+# because patch_torch_rms_norm() returns early once _RMS_NORM_PATCHED is set.
+ensure_torch_compatibility()

services/ai-service/src/ai_med_extract/utils/unified_model_manager.py CHANGED Viewed

@@ -16,44 +16,14 @@ from enum import Enum
 from collections import OrderedDict
 import psutil
 import torch
-# Patch torch.rms_norm for compatibility with models like Phi-3 that expect this function
-if not hasattr(torch, 'rms_norm'):
-    def rms_norm(input_tensor, normalized_shape=None, weight=None, eps=1e-6):
-        """Simple RMS normalization implementation compatible with various call signatures"""
-        # Handle different input formats
-        if normalized_shape is None:
-            # If no shape specified, normalize over last dimension
-            dim = -1
-            keepdim = True
-        else:
-            # If shape is specified, normalize over those dimensions
-            if isinstance(normalized_shape, int):
-                dim = normalized_shape
-                keepdim = True
-            else:
-                # Multiple dimensions - normalize over all of them
-                dim = tuple(range(-len(normalized_shape), 0))
-                keepdim = True
-        # Calculate RMS (root mean square)
-        variance = input_tensor.pow(2).mean(dim=dim, keepdim=keepdim)
-        # Normalize
-        output = input_tensor * torch.rsqrt(variance + eps)
-        # Apply weight if provided
-        if weight is not None:
-            output = output * weight
-        return output
-    torch.rms_norm = rms_norm
-    _rms_norm_patched = True
-else:
-    _rms_norm_patched = False
 from concurrent.futures import ThreadPoolExecutor, as_completed
 # Configure logging
 logger = logging.getLogger(__name__)
-if _rms_norm_patched:
-    logger.info("Patched torch.rms_norm for compatibility with Phi-3 and similar models")
 class ModelType(Enum):
     """Supported model types"""

 from collections import OrderedDict
 import psutil
 import torch
+# Ensure PyTorch compatibility patches are applied early
+from .torch_compat import ensure_torch_compatibility
+ensure_torch_compatibility()
 from concurrent.futures import ThreadPoolExecutor, as_completed
 # Configure logging
 logger = logging.getLogger(__name__)
 class ModelType(Enum):
     """Supported model types"""