import os
import sys
import traceback

import gradio as gr
from PIL import Image
import torch
from huggingface_hub import login
from transformers import AutoConfig, AutoProcessor, AutoModelForCausalLM
import uvicorn

# Import spaces module for ZeroGPU support
try:
    import spaces
    has_spaces = True
    print("ZeroGPU support enabled via spaces module")
except ImportError:
    has_spaces = False
    print("spaces module not found, ZeroGPU features will be disabled")

# Create examples directory if it doesn't exist
os.makedirs("examples", exist_ok=True)

# Authenticate with Hugging Face Hub using environment variable
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    print("Warning: HF_TOKEN environment variable not set. Some features may not work.")

# Model and device setup
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Model identifier - hardcode the correct model path instead of using environment variables
model_id = "thorscribe/thorscribe-model-3"
print(f"Using model: {model_id}")

# Determine dtype based on available hardware
if device == "cuda":
    if torch.cuda.is_bf16_supported():
        torch_dtype = torch.bfloat16
        print("Using bfloat16 precision")
    else:
        torch_dtype = torch.float16
        print("Using float16 precision")
else:
    torch_dtype = torch.float32
    print("Using float32 precision (CPU mode)")

# Every image is padded and resized to this fixed square resolution
target_size = 1024
print(f"Using fixed image resolution of {target_size}x{target_size}")


def pad_to_square(image, background_color=(0, 0, 0)):
    """Return *image* as an RGB square, padding the shorter side evenly.

    Args:
        image: A PIL image, or ``None``.
        background_color: RGB fill used for the padding (black by default).

    Returns:
        ``None`` when *image* is ``None``; otherwise an RGB image whose
        width and height both equal the longer side of the input, with the
        original content centered.
    """
    if image is None:
        return None

    # Normalise the mode up front so square inputs (returned as-is below)
    # and padded inputs (pasted onto an RGB canvas) come back in the same mode.
    if image.mode != "RGB":
        image = image.convert("RGB")

    width, height = image.size
    if width == height:
        return image

    side = max(width, height)
    canvas = Image.new("RGB", (side, side), background_color)
    # Center the original image on the square canvas
    offset = ((side - width) // 2, (side - height) // 2)
    canvas.paste(image, offset)
    return canvas


def process_image(image, size=1024):
    """Pad *image* to a square and resize it to ``size`` x ``size``.

    Args:
        image: A PIL image, or ``None``.
        size: Edge length in pixels of the returned square image.

    Returns:
        The resized square image, or ``None`` when *image* is ``None``.
    """
    if image is None:
        return None

    # Pad first so the resize does not distort the aspect ratio
    squared = pad_to_square(image)
    resized = squared.resize((size, size), Image.LANCZOS)
    print(f"Processed image to {resized.size[0]}x{resized.size[1]}")
    return resized


# Load processor first (lower memory requirements)
print(f"Loading processor from {model_id}...")
try:
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    print("Processor loaded successfully!")
except Exception as e:
    print(f"Error loading processor: {str(e)}")
    sys.exit(1)

# Load and inspect model config via AutoConfig
try:
    cfg = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
    print("Vision config - patch_size:", cfg.vision_config.patch_size)
    print("Vision config - patch_stride:", cfg.vision_config.patch_stride)
    print("Vision config - patch_padding:", cfg.vision_config.patch_padding)
except Exception as e:
    print(f"Error loading model config: {str(e)}")
    sys.exit(1)

# Load model with explicit config
try:
    print(f"Loading model from {model_id}...")
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        config=cfg,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )
    # Without the spaces module the model is placed on the GPU once, here.
    # With it, the move happens inside the GPU-decorated caption function.
    if not has_spaces and device == "cuda":
        model.to(device)
        print("Model moved to CUDA device")
    print("Model loaded successfully with explicit config!")
except Exception as e:
    print(f"Error loading model: {str(e)}")
    print(traceback.format_exc())
    sys.exit(1)

# Default prompt to use (hidden from UI).
# NOTE(review): the leading space is kept exactly as found; presumably it
# matches the prompt format the model expects - confirm before changing.
DEFAULT_PROMPT = " What does this figure show?"
# Limits applied to every captioning request
PROMPT_MAX_LENGTH = 77  # prompt is padded/truncated to this many tokens
MAX_NEW_TOKENS = 50     # upper bound on generated caption length


def _run_captioning(image, move_model_to_device=False):
    """Generate a caption for *image* with the module-level model.

    Shared implementation behind both variants of ``generate_caption``.

    Args:
        image: PIL image supplied by the Gradio input, or ``None``.
        move_model_to_device: When true, call ``model.to(device)`` before
            inference (used by the ZeroGPU variant).

    Returns:
        The decoded caption, a prompt to upload an image when *image* is
        ``None``, or an error message if anything raised during inference.
    """
    if image is None:
        return "Please upload an image."

    try:
        if move_model_to_device:
            model.to(device)

        # Pad/resize the image to the fixed resolution
        processed_image = process_image(image, size=target_size)

        # Image and text are encoded separately rather than through the
        # combined processor call
        pixel_values = processor.image_processor(
            images=processed_image, return_tensors="pt"
        ).pixel_values
        input_ids = processor.tokenizer(
            DEFAULT_PROMPT,
            return_tensors="pt",
            padding="max_length",
            max_length=PROMPT_MAX_LENGTH,
            truncation=True,
        ).input_ids

        inputs = {
            "pixel_values": pixel_values.to(device, dtype=torch_dtype),
            "input_ids": input_ids.to(device),
        }

        # Greedy decoding, no gradients needed
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                num_beams=1,
                do_sample=False,
            )

        # Decode the first (only) sequence, dropping special tokens
        return processor.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing,
        # and leave the full traceback on the console.
        import traceback
        print(f"Error: {str(e)}")
        print(traceback.format_exc())
        return f"Error processing image: {str(e)[:200]}. Check console for full traceback."


# Define the generation function with ZeroGPU decorator if available
if has_spaces:
    @spaces.GPU(duration=60)  # Set appropriate duration based on your model's generation time
    def generate_caption(image):
        """Caption *image*, moving the model to the device first (ZeroGPU)."""
        return _run_captioning(image, move_model_to_device=True)
else:
    def generate_caption(image):
        """Caption *image* using the model as placed at load time."""
        return _run_captioning(image)


# Only offer example images that are actually present; a missing path in the
# examples list would otherwise break interface construction.
_EXAMPLE_PATHS = [
    "examples/example1.jpg",
    "examples/example2.jpg",
    "examples/example3.jpg",
    "examples/example4.jpg",
]
_available_examples = [path for path in _EXAMPLE_PATHS if os.path.exists(path)]

# Footer text rendered below the interface
ARTICLE = """

About THORSCRIBE

THORSCRIBE is specialized in analyzing thoracic medical imagery, providing accurate descriptions of findings in MRI and X-ray images. This tool is designed to assist medical professionals in their diagnostic workflows.

Powered by model: thorscribe/thorscribe-model-3

"""

# Create a simple Gradio interface without FastAPI integration
demo = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil", label="Upload Thoracic MRI/X-ray Image"),
    outputs=gr.Textbox(label="Generated Caption", lines=5, max_lines=5, show_copy_button=True),
    title="THORSCRIBE: AI-Powered Thoracic Image Captioning",
    description="THORSCRIBE is an advanced AI model that generates detailed captions for MRI and X-ray images of the thorax area. Upload your medical image to receive an informative caption." + (" (with ZeroGPU)" if has_spaces else ""),
    allow_flagging="never",
    theme=gr.themes.Monochrome(),
    examples=_available_examples or None,
    article=ARTICLE,
)

# Launch the app - Use 7860 which is the standard port for Hugging Face Spaces
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=False,
        show_error=True,
    )