#!/usr/bin/env python3
"""
Gradio app for Qwen3 Resume Parser using Transformers (GPU deployment).

Optimized for NVIDIA T4 GPU with 8-bit quantization.
Loads base model + LoRA adapter at runtime (same approach as inference.py).
"""

import gradio as gr
import hashlib
import json
import os
import re
from collections import OrderedDict
from pathlib import Path

import threading
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from peft import PeftModel

# Model configuration: Hugging Face Hub identifiers passed to `from_pretrained`.
BASE_MODEL = "Qwen/Qwen3-0.6B"  # Base causal LM (also the tokenizer source)
LORA_ADAPTER = "sandeeppanem/qwen3-0.6b-resume-json"  # LoRA adapter applied on top of BASE_MODEL

# Process-wide singletons populated by load_model() so the model is loaded once.
_model = None      # PeftModel wrapping the base model, or None until loaded
_tokenizer = None  # Tokenizer loaded from BASE_MODEL, or None until loaded
_device = None     # "cuda" or "cpu", chosen inside load_model()

# Shared result cache (key: hash of resume text, value: (formatted_json, raw_output)).
# An OrderedDict is used so the oldest entry can be evicted with popitem(last=False).
_result_cache = OrderedDict()
MAX_CACHE_SIZE = 100  # Maximum number of cached results before eviction starts


def load_model():
    """Load and cache the tokenizer and the LoRA-adapted base model.

    On the first successful call the tokenizer and model are stored in the
    module-level ``_tokenizer`` / ``_model`` / ``_device`` globals; later
    calls return the cached objects immediately.

    When CUDA is available the base model is loaded with 8-bit
    (bitsandbytes) quantization and ``device_map="auto"``; otherwise it is
    loaded in float32 and moved to the CPU. The quantization config is only
    imported and built on the CUDA path, so a CPU-only host does not need it.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ImportError: If CUDA is available but ``BitsAndBytesConfig`` cannot
            be imported from ``transformers``.
    """
    global _model, _tokenizer, _device

    if _model is not None and _tokenizer is not None:
        return _model, _tokenizer

    print("Loading base model and LoRA adapter...")

    # Pick the device; quantization is only relevant (and only set up) on GPU.
    quantization_config = None
    if torch.cuda.is_available():
        device = "cuda"
        print(f"✓ CUDA available: {torch.cuda.get_device_name(0)}")
        print(f"✓ CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

        try:
            from transformers import BitsAndBytesConfig
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "bitsandbytes not installed. "
                "Install with: pip install bitsandbytes"
            ) from err

        # 8-bit quantization config for T4 GPU (16GB)
        quantization_config = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
        )
    else:
        device = "cpu"
        print("⚠️ CUDA not available, using CPU")

    # Load tokenizer from base model.
    # NOTE(review): trust_remote_code=True lets repository-supplied Python run
    # at load time — confirm it is actually required for this model.
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        BASE_MODEL,
        trust_remote_code=True,
    )
    if tokenizer.pad_token is None:
        # Fall back to EOS as the padding token when none is defined.
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
    print("✓ Tokenizer loaded")

    print("Loading base model...")
    if device == "cuda":
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            quantization_config=quantization_config,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
    else:
        # Fallback to CPU (not recommended for production)
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            torch_dtype=torch.float32,
            trust_remote_code=True,
        )
        base_model = base_model.to(device)
    print("✓ Base model loaded")

    print("Loading LoRA adapter...")
    model = PeftModel.from_pretrained(base_model, LORA_ADAPTER)
    model.eval()
    print("✓ LoRA adapter loaded")

    # Publish the globals only after every step succeeded, so a failed load
    # never leaves a half-initialised tokenizer/device behind.
    _device = device
    _tokenizer = tokenizer
    _model = model
    print(f"✓ Model ready on {_device}")

    return _model, _tokenizer


def build_messages(resume_text: str) -> list[dict[str, str]]:
    """Return the two-turn chat (system + user) used to prompt the parser.

    The system turn instructs the model to answer with JSON only; the user
    turn carries the whitespace-stripped resume prefixed with ``Resume:``.
    """
    system_prompt = " ".join(
        (
            "You are an expert resume parser.",
            "Extract structured information from resumes and return ONLY valid JSON.",
            "Do not include explanations or extra text.",
        )
    )
    user_prompt = "Resume:\n" + resume_text.strip()

    system_turn = {"role": "system", "content": system_prompt}
    user_turn = {"role": "user", "content": user_prompt}
    return [system_turn, user_turn]


# One JSON string literal. Group 1 is the text between the quotes; group 2 is
# the closing quote, or empty when the string is cut off at the end of the text
# (which happens while output is still streaming).
_JSON_STRING_RE = re.compile(r'"((?:[^"\\]|\\.)*\\?)("|\Z)', re.DOTALL)

# Placeholder that stands in for a stashed string body while formatting.
_STASH_TOKEN_RE = re.compile(r'\x00(\d+)\x00')

# Structural spacing/newline rules, applied in order.
_INCOMPLETE_JSON_RULES = (
    # Space after a colon that is directly followed by a value.
    (re.compile(r':"'), ': "'),
    (re.compile(r':(\d+)'), r': \1'),
    (re.compile(r':(true|false|null)'), r': \1'),
    (re.compile(r':\{'), ': {'),
    (re.compile(r':\['), ': ['),
    # Newline + two-space indent after a comma that precedes a value.
    (re.compile(r',\s*"'), ',\n  "'),
    (re.compile(r',\s*(\d+)'), r',\n  \1'),
    (re.compile(r',\s*(true|false|null)'), r',\n  \1'),
    (re.compile(r',\s*(\{|\[)'), r',\n  \1'),
    # Newline after an opening brace that precedes a key.
    (re.compile(r'\{\s*"'), '{\n  "'),
    # Newline before a closing brace that follows a bare (non-string) value.
    (re.compile(r'([^}\s"])\s*\}'), r'\1\n}'),
    # Collapse blank-line runs and runs of spaces.
    (re.compile(r'\n\n+'), '\n'),
    (re.compile(r'  +'), '  '),
)


def _format_incomplete_json(text: str) -> str:
    """Lightly pretty-print JSON that may be incomplete or invalid.

    Used for display while output is still streaming (or when the final
    output does not parse). It is a heuristic, not a parser: it adds a space
    after colons, puts each comma-separated item on its own line with a
    two-space indent, and breaks after ``{`` / before ``}``.

    The contents of string literals are left untouched: string bodies are
    swapped for placeholders before the rules run and restored afterwards,
    so values such as ``"10:30"`` or ``"Austin, 78701"`` are not rewritten.

    Args:
        text: Raw (possibly truncated) JSON text.

    Returns:
        The reformatted text, or ``text`` unchanged when it is empty or
        whitespace-only.
    """
    if not text or not text.strip():
        return text

    stash: list[str] = []

    def _hide(match):
        # Keep the quotes in place so the quote-anchored rules still fire.
        stash.append(match.group(1))
        return f'"\x00{len(stash) - 1}\x00{match.group(2)}'

    # A raw NUL in the input could collide with the placeholders; in that
    # case skip the protection and format the text as-is.
    protect = "\x00" not in text
    formatted = _JSON_STRING_RE.sub(_hide, text) if protect else text

    for pattern, replacement in _INCOMPLETE_JSON_RULES:
        formatted = pattern.sub(replacement, formatted)

    if protect:
        # A function replacement avoids backslash processing of string bodies.
        formatted = _STASH_TOKEN_RE.sub(
            lambda m: stash[int(m.group(1))], formatted
        )
    return formatted


def _strip_reasoning_markup(text: str) -> str:
    """Remove reasoning tags and blank-line runs from decoded model output."""
    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
    text = re.sub(r'</?redacted_reasoning>', '', text)
    text = re.sub(r'</?think>', '', text)
    text = re.sub(r'\n\s*\n+', '\n', text)
    return text.strip()


def parse_resume_stream(resume_text: str):
    """Parse a resume and stream the extracted JSON as it is generated.

    This is a generator. Every item it yields is a ``(display_text,
    raw_text)`` pair: ``display_text`` is pretty-printed JSON when the text
    produced so far parses, otherwise a best-effort formatting of the partial
    output; ``raw_text`` is the cleaned model output.

    Behaviour:
      * Empty or whitespace-only input yields a single warning pair.
      * Results are cached by an MD5 hash of the stripped resume text; a
        cache hit yields the stored pair immediately. Only output that
        parses as valid JSON is cached.
      * The resume is sent to the model with its original capitalisation
        (names, acronyms and the like are preserved) and is truncated to
        about 4000 characters, preferably at a space.
      * Generation runs in a background thread. If it fails, the error is
        re-raised here rather than leaving the consumer waiting for the
        streamer timeout.
      * Any exception is reported as a single ``("❌ Error: ...", "")`` pair.

    Args:
        resume_text: Plain-text resume supplied by the user.

    Yields:
        Tuples of ``(display_text, raw_text)`` strings.
    """
    if not resume_text or not resume_text.strip():
        yield "⚠️ Please provide resume text", ""
        return

    # Keep the original casing: this text is what the model actually sees.
    resume = resume_text.strip()
    cache_key = hashlib.md5(resume.encode()).hexdigest()

    cached = _result_cache.get(cache_key)
    if cached is not None:
        yield cached
        return

    try:
        model, tokenizer = load_model()

        max_resume_chars = 4000
        if len(resume) > max_resume_chars:
            truncated = resume[:max_resume_chars]
            # Prefer cutting at a space within the last 200 characters.
            last_space = truncated.rfind(' ', max_resume_chars - 200)
            if last_space != -1:
                truncated = truncated[:last_space]
            resume = truncated + "..."

        messages = build_messages(resume)
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )

        inputs = tokenizer(prompt, return_tensors="pt")
        inputs = {k: v.to(_device) for k, v in inputs.items()}

        # Setup streaming
        streamer = TextIteratorStreamer(
            tokenizer,
            skip_prompt=True,
            skip_special_tokens=True,
            timeout=300.0,  # 5 minute timeout
        )

        # Generation parameters (greedy decoding)
        generation_kwargs = {
            **inputs,
            "max_new_tokens": 350,
            "do_sample": False,
            "pad_token_id": tokenizer.eos_token_id,
            "streamer": streamer,
        }

        generation_errors = []

        def generate_with_inference_mode():
            try:
                with torch.inference_mode():
                    model.generate(**generation_kwargs)
            except Exception as exc:
                # Hand the failure to the consumer and close the stream so the
                # loop below stops instead of waiting for the queue timeout.
                generation_errors.append(exc)
                streamer.end()

        generation_thread = threading.Thread(
            target=generate_with_inference_mode,
            daemon=True,
        )
        generation_thread.start()

        # Process streamed tokens
        accumulated_text = ""
        chunk_count = 0

        for new_text in streamer:
            if not new_text:
                continue
            accumulated_text += new_text
            chunk_count += 1

            # While the output is short, only refresh every 5th chunk; once
            # it is longer than 50 characters every chunk is shown.
            if chunk_count % 5 != 0 and len(accumulated_text) <= 50:
                continue

            cleaned_text = _strip_reasoning_markup(accumulated_text)
            try:
                parsed_json = json.loads(cleaned_text)
            except json.JSONDecodeError:
                yield _format_incomplete_json(cleaned_text), cleaned_text
            else:
                yield json.dumps(parsed_json, indent=2, ensure_ascii=False), cleaned_text

        # Wait for generation thread to complete
        generation_thread.join()
        if generation_errors:
            raise generation_errors[0]

        # Final processing after stream completes
        assistant_response = _strip_reasoning_markup(accumulated_text)

        try:
            parsed_json = json.loads(assistant_response)
        except json.JSONDecodeError:
            yield _format_incomplete_json(assistant_response), assistant_response
        else:
            formatted_json = json.dumps(parsed_json, indent=2, ensure_ascii=False)

            # Cache result, evicting the oldest entry when full.
            if len(_result_cache) >= MAX_CACHE_SIZE:
                _result_cache.popitem(last=False)
            _result_cache[cache_key] = (formatted_json, assistant_response)

            yield formatted_json, assistant_response

    except Exception as e:
        error_msg = f"❌ Error: {str(e)}"
        print(error_msg)
        yield error_msg, ""


def create_interface() -> gr.Blocks:
    """Create Gradio interface.

    Builds a ``gr.Blocks`` app with a two-column row: the left column holds
    the resume textbox, the "Parse Resume" button and a short example
    snippet; the right column holds the JSON code view and a hidden
    raw-output textbox. Two example pickers fill the resume textbox, and the
    button runs ``parse_resume_stream`` with its yielded pairs routed to the
    two output components.

    Returns:
        The constructed (not yet launched) ``gr.Blocks`` instance.
    """
    with gr.Blocks(title="Qwen3 Resume Structured Information Extraction ", theme=gr.themes.Soft()) as demo:
        # Page header: description, usage steps and project links.
        gr.Markdown(
            """
            # 🚀 Qwen3 Resume Structured Information Extraction
            
            Extract structured information from resumes using fine-tuned Qwen3-0.6B model.
            **Optimized with 8-bit quantization.**
            
            **How to use:**
            1. Paste your resume text in the text box below
            2. Click "Parse Resume" 
            3. View the extracted structured JSON output
            
            **Model:** [sandeeppanem/qwen3-0.6b-resume-json](https://huggingface.co/sandeeppanem/qwen3-0.6b-resume-json)  
            **Dataset:** [sandeeppanem/resume-json-extraction-5k](https://huggingface.co/datasets/sandeeppanem/resume-json-extraction-5k)  
            **Repository:** [qwen3-resume-extraction](https://github.com/sandeeppanem/qwen3-resume-extraction)  
            **Format:** Base model + LoRA adapter with 8-bit quantization 
            """
        )
        
        with gr.Row():
            # Left column: input textbox, action button and a static example.
            with gr.Column(scale=1):
                resume_input = gr.Textbox(
                    label="Resume Text",
                    placeholder="Paste your resume text here...",
                    lines=15,
                    max_lines=20,
                )
                parse_btn = gr.Button("Parse Resume", variant="primary", size="lg")
                
                gr.Markdown(
                    """
                    **Example:**
                    ```
                    Senior IT Project Manager with 10+ years experience leading enterprise migrations. 
                    Skills: Python, SQL, AWS, Agile. Location: Chicago, IL. 
                    Experience: Project Manager at Acme Corp (2019-2024). 
                    Education: MS Computer Science.
                    ```
                    """
                )
            
            # Right column: formatted JSON plus a hidden raw-output textbox.
            with gr.Column(scale=1):
                json_output = gr.Code(
                    label="Structured JSON Output",
                    language="json",
                    lines=20,
                )
                # Created with visible=False, so it still receives the raw
                # output but is not shown.
                raw_output = gr.Textbox(
                    label="Raw Output",
                    lines=10,
                    visible=False,
                )
        
        # Examples - diverse resume samples (same as CPU version)
        example_resumes = [
            """Senior IT Project Manager with 10+ years experience leading enterprise migrations. 
Skills: Python, SQL, AWS, Agile. Location: Chicago, IL. 
Experience: Project Manager at Acme Corp (2019-2024). 
Education: MS Computer Science.""",
            
            """Software Engineer
John Smith
Email: john.smith@email.com | Phone: (555) 123-4567 | Location: San Francisco, CA

PROFESSIONAL SUMMARY
Full Stack Developer with 5 years of experience building scalable web applications. 
Expertise in React, Node.js, Python, and cloud technologies.

TECHNICAL SKILLS
Languages: JavaScript, Python, TypeScript, Java
Frameworks: React, Node.js, Express, Django, Spring Boot
Cloud: AWS (EC2, S3, Lambda), Docker, Kubernetes
Databases: PostgreSQL, MongoDB, Redis

PROFESSIONAL EXPERIENCE
Senior Software Engineer | TechCorp Inc. | San Francisco, CA | 2021 - Present
- Developed microservices architecture serving 1M+ users
- Led team of 3 junior developers
- Reduced API response time by 40% through optimization

Software Engineer | StartupXYZ | San Francisco, CA | 2019 - 2021
- Built customer-facing React applications
- Implemented CI/CD pipelines using Jenkins

EDUCATION
Bachelor of Science in Computer Science
University of California, Berkeley | 2019""",
            
            """Data Scientist
Sarah Johnson
sarah.johnson@email.com | (555) 987-6543 | New York, NY

SUMMARY
Data Scientist with 7 years of experience in machine learning, statistical analysis, and big data. 
Specialized in NLP and computer vision applications.

SKILLS
Programming: Python, R, SQL, Scala
ML/AI: TensorFlow, PyTorch, scikit-learn, XGBoost
Tools: Spark, Hadoop, Tableau, Jupyter
Cloud: AWS SageMaker, Azure ML

EXPERIENCE
Lead Data Scientist | DataTech Solutions | New York, NY | 2020 - Present
- Built recommendation system increasing user engagement by 35%
- Developed NLP models for sentiment analysis
- Managed team of 4 data scientists

Data Scientist | Analytics Pro | New York, NY | 2018 - 2020
- Created predictive models for customer churn
- Analyzed large datasets using Spark

EDUCATION
Master of Science in Data Science | Columbia University | 2018
Bachelor of Science in Statistics | NYU | 2016""",
            
            """Marketing Manager
Michael Chen
michael.chen@email.com | (555) 456-7890 | Los Angeles, CA

PROFESSIONAL PROFILE
Strategic Marketing Manager with 8+ years driving brand growth and digital marketing campaigns. 
Expert in SEO, content marketing, and social media strategy.

CORE COMPETENCIES
Digital Marketing, SEO/SEM, Content Strategy, Social Media Management, 
Google Analytics, HubSpot, Marketo, Brand Management

PROFESSIONAL EXPERIENCE
Marketing Manager | BrandCo | Los Angeles, CA | 2019 - Present
- Increased website traffic by 150% through SEO optimization
- Launched successful social media campaigns reaching 2M+ impressions
- Managed $500K annual marketing budget

Marketing Specialist | Growth Agency | Los Angeles, CA | 2016 - 2019
- Developed content marketing strategies
- Executed email marketing campaigns with 25% open rate

EDUCATION
Master of Business Administration (MBA) | UCLA | 2016
Bachelor of Arts in Communications | USC | 2014""",
            
            """Product Manager
Emily Rodriguez
emily.rodriguez@email.com | (555) 234-5678 | Seattle, WA

OVERVIEW
Product Manager with 6 years of experience in B2B SaaS products. 
Led product launches from concept to market, working with engineering and design teams.

KEY SKILLS
Product Strategy, Agile/Scrum, User Research, A/B Testing, 
Roadmap Planning, Stakeholder Management, JIRA, Figma

WORK EXPERIENCE
Senior Product Manager | CloudSoft | Seattle, WA | 2020 - Present
- Launched 3 major product features, increasing revenue by $2M annually
- Conducted user research and usability testing
- Managed product roadmap and prioritized features

Product Manager | StartupHub | Seattle, WA | 2018 - 2020
- Owned product lifecycle for mobile application
- Collaborated with cross-functional teams

EDUCATION
Master of Science in Product Management | University of Washington | 2018
Bachelor of Science in Business Administration | Washington State University | 2016""",
            
            """DevOps Engineer
David Kim
david.kim@email.com | (555) 345-6789 | Austin, TX

SUMMARY
DevOps Engineer with 4 years of experience in CI/CD, infrastructure automation, and cloud architecture. 
Proven track record of improving deployment efficiency and system reliability.

TECHNICAL SKILLS
Cloud Platforms: AWS, Azure, GCP
CI/CD: Jenkins, GitLab CI, GitHub Actions, CircleCI
Infrastructure: Terraform, Ansible, CloudFormation
Containers: Docker, Kubernetes, ECS
Monitoring: Prometheus, Grafana, ELK Stack
Scripting: Bash, Python, PowerShell

EXPERIENCE
DevOps Engineer | CloudInfra Inc. | Austin, TX | 2021 - Present
- Reduced deployment time from 2 hours to 15 minutes
- Implemented infrastructure as code using Terraform
- Set up monitoring and alerting systems

Junior DevOps Engineer | TechStart | Austin, TX | 2020 - 2021
- Maintained CI/CD pipelines
- Managed cloud infrastructure on AWS

EDUCATION
Bachelor of Science in Computer Engineering
University of Texas at Austin | 2020"""
        ]
        
        # First example picker: each sample is wrapped in a one-element list
        # because there is a single input component.
        gr.Examples(
            examples=[[resume] for resume in example_resumes],
            inputs=resume_input,
            label="Select a sample resume:",
        )
        
        # Connect button with streaming: parse_resume_stream is a generator,
        # and each pair it yields maps onto [json_output, raw_output].
        parse_btn.click(
            fn=parse_resume_stream,
            inputs=resume_input,
            outputs=[json_output, raw_output],
        )
        
        # Second example picker with short one-line resumes.
        # NOTE(review): `outputs` is passed without `fn`; presumably it has no
        # effect unless a function is supplied — confirm against Gradio docs.
        gr.Examples(
            examples=[
                ["Senior Software Engineer with 10+ years of experience in Python, Java, and cloud technologies. Currently at Google, previously at Microsoft. MS in Computer Science from Stanford."],
                ["Data Scientist with expertise in machine learning, deep learning, and NLP. 5 years experience. PhD in Statistics. Proficient in Python, TensorFlow, PyTorch."],
                ["Product Manager with 8 years experience in tech startups. MBA from Wharton. Led product launches at 3 companies. Strong in Agile, Scrum, and product strategy."],
            ],
            inputs=resume_input,
            outputs=[json_output, raw_output],
        )
    
    return demo


if __name__ == "__main__":
    # Allow TF32 matmul kernels when a GPU is present (the setting only has
    # an effect on NVIDIA Ampere or newer hardware).
    gpu_present = torch.cuda.is_available()
    if gpu_present:
        torch.backends.cuda.matmul.allow_tf32 = True
        print("✓ TF32 enabled for faster GPU inference")

    # Startup banner.
    banner = "=" * 60
    print(banner, "Application Startup", banner, sep="\n")

    # Warm the model cache now so the first request does not pay the load
    # cost; a failure here is only a warning because load_model() is called
    # again on the first parse request.
    try:
        load_model()
        print("✓ Model loaded successfully at startup")
    except Exception as startup_error:
        print(f"⚠️ Warning: Could not load model at startup: {startup_error}")
        print("Model will be loaded on first use.")

    # Build the UI and serve it on all interfaces, port 7860.
    demo = create_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860)