#!/usr/bin/env python3 """ Gradio app for Qwen3 Resume Parser using Transformers (GPU deployment). Optimized for NVIDIA T4 GPU with 8-bit quantization. Loads base model + LoRA adapter at runtime (same approach as inference.py). """ import gradio as gr import hashlib import json import os import re from collections import OrderedDict from pathlib import Path import threading import torch from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer from peft import PeftModel # Model configuration BASE_MODEL = "Qwen/Qwen3-0.6B" # Base model LORA_ADAPTER = "sandeeppanem/qwen3-0.6b-resume-json" # LoRA adapter # Global variables for model caching _model = None _tokenizer = None _device = None # Shared result cache (key: hash of resume text, value: (formatted_json, raw_output)) _result_cache = OrderedDict() MAX_CACHE_SIZE = 100 def load_model(): """Load base model + LoRA adapter with 8-bit quantization for GPU.""" global _model, _tokenizer, _device if _model is not None and _tokenizer is not None: return _model, _tokenizer try: from transformers import BitsAndBytesConfig except ImportError: raise ImportError( "bitsandbytes not installed. " "Install with: pip install bitsandbytes" ) print("Loading base model and LoRA adapter...") # Check for GPU if torch.cuda.is_available(): _device = "cuda" print(f"✓ CUDA available: {torch.cuda.get_device_name(0)}") print(f"✓ CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB") else: _device = "cpu" print("⚠️ CUDA not available, using CPU") # 8-bit quantization config for T4 GPU (16GB) quantization_config = BitsAndBytesConfig( load_in_8bit=True, llm_int8_threshold=6.0, llm_int8_has_fp16_weight=False, ) # Load tokenizer from base model print("Loading tokenizer...") _tokenizer = AutoTokenizer.from_pretrained( BASE_MODEL, trust_remote_code=True, ) if _tokenizer.pad_token is None: _tokenizer.pad_token = _tokenizer.eos_token _tokenizer.pad_token_id = _tokenizer.eos_token_id print("✓ Tokenizer loaded") # Load base model with 8-bit quantization for GPU print("Loading base model...") if _device == "cuda": base_model = AutoModelForCausalLM.from_pretrained( BASE_MODEL, quantization_config=quantization_config, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True, ) else: # Fallback to CPU (not recommended for production) base_model = AutoModelForCausalLM.from_pretrained( BASE_MODEL, torch_dtype=torch.float32, trust_remote_code=True, ) base_model = base_model.to(_device) print("✓ Base model loaded") # Load LoRA adapter print("Loading LoRA adapter...") _model = PeftModel.from_pretrained(base_model, LORA_ADAPTER) _model.eval() print("✓ LoRA adapter loaded") print(f"✓ Model ready on {_device}") return _model, _tokenizer def build_messages(resume_text: str) -> list[dict[str, str]]: """Build messages for Qwen3 chat template.""" return [ { "role": "system", "content": ( "You are an expert resume parser. " "Extract structured information from resumes and return ONLY valid JSON. " "Do not include explanations or extra text." ), }, {"role": "user", "content": f"Resume:\n{resume_text.strip()}"}, ] def _format_incomplete_json(text: str) -> str: """Format incomplete JSON for visibility during streaming.""" if not text or not text.strip(): return text formatted = text # First, ensure proper spacing around colons (makes it more readable) formatted = re.sub(r':"', ': "', formatted) formatted = re.sub(r':(\d+)', r': \1', formatted) formatted = re.sub(r':(true|false|null)', r': \1', formatted) formatted = re.sub(r':\{', ': {', formatted) formatted = re.sub(r':\[', ': [', formatted) formatted = re.sub(r',\s*"', ',\n "', formatted) # Pattern: comma followed by number formatted = re.sub(r',\s*(\d+)', r',\n \1', formatted) formatted = re.sub(r',\s*(true|false|null)', r',\n \1', formatted) # Pattern: comma followed by opening brace/array formatted = re.sub(r',\s*(\{|\[)', r',\n \1', formatted) formatted = re.sub(r'\{\s*"', '{\n "', formatted) # Add newline before closing brace (if it's on same line with content) # But be careful not to break strings formatted = re.sub(r'([^}\s"])\s*\}', r'\1\n}', formatted) formatted = re.sub(r'\n\n+', '\n', formatted) formatted = re.sub(r' +', ' ', formatted) return formatted def parse_resume_stream(resume_text: str): """Parse resume with streaming output.""" global _model, _tokenizer, _device if not resume_text or not resume_text.strip(): yield "⚠️ Please provide resume text", "" return # Check cache normalized_text = resume_text.strip().lower() cache_key = hashlib.md5(normalized_text.encode()).hexdigest() if cache_key in _result_cache: cached_json, cached_raw = _result_cache[cache_key] yield cached_json, cached_raw return try: model, tokenizer = load_model() MAX_RESUME_CHARS = 4000 if len(normalized_text) > MAX_RESUME_CHARS: truncated = normalized_text[:MAX_RESUME_CHARS] last_space = truncated.rfind(' ', MAX_RESUME_CHARS - 200, MAX_RESUME_CHARS) if last_space > MAX_RESUME_CHARS - 500: truncated = truncated[:last_space] normalized_text = truncated + "..." messages = build_messages(normalized_text) prompt = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True, enable_thinking=False, ) inputs = tokenizer(prompt, return_tensors="pt") inputs = {k: v.to(_device) for k, v in inputs.items()} # Setup streaming streamer = TextIteratorStreamer( tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=300.0, # 5 minute timeout ) # Generation parameters generation_kwargs = { **inputs, "max_new_tokens": 350, "do_sample": False, "pad_token_id": tokenizer.eos_token_id, "streamer": streamer, } # Start generation in a separate thread with inference_mode def generate_with_inference_mode(): with torch.inference_mode(): model.generate(**generation_kwargs) generation_thread = threading.Thread( target=generate_with_inference_mode, daemon=True, ) generation_thread.start() # Process streamed tokens accumulated_text = "" final_json = None final_raw = None chunk_count = 0 for new_text in streamer: if new_text: accumulated_text += new_text chunk_count += 1 # Only do expensive operations every 5 chunks or if we have enough text # This reduces overhead during streaming if chunk_count % 5 == 0 or len(accumulated_text) > 50: cleaned_text = accumulated_text cleaned_text = re.sub(r'.*?', '', cleaned_text, flags=re.DOTALL) cleaned_text = re.sub(r'', '', cleaned_text) cleaned_text = re.sub(r'', '', cleaned_text) cleaned_text = re.sub(r'\n\s*\n+', '\n', cleaned_text) cleaned_text = cleaned_text.strip() try: parsed_json = json.loads(cleaned_text) formatted_json = json.dumps(parsed_json, indent=2, ensure_ascii=False) final_json = formatted_json final_raw = cleaned_text yield formatted_json, cleaned_text except json.JSONDecodeError: formatted_incomplete = _format_incomplete_json(cleaned_text) yield formatted_incomplete, cleaned_text # Wait for generation thread to complete generation_thread.join() # Final processing after stream completes assistant_response = accumulated_text.strip() assistant_response = re.sub(r'.*?', '', assistant_response, flags=re.DOTALL) assistant_response = re.sub(r'', '', assistant_response) assistant_response = re.sub(r'', '', assistant_response) assistant_response = re.sub(r'\n\s*\n+', '\n', assistant_response) assistant_response = assistant_response.strip() # Final JSON parsing and caching try: parsed_json = json.loads(assistant_response) formatted_json = json.dumps(parsed_json, indent=2, ensure_ascii=False) final_json = formatted_json final_raw = assistant_response # Cache result if len(_result_cache) >= MAX_CACHE_SIZE: _result_cache.popitem(last=False) # Remove oldest _result_cache[cache_key] = (formatted_json, assistant_response) yield formatted_json, assistant_response except json.JSONDecodeError: formatted_incomplete = _format_incomplete_json(assistant_response) yield formatted_incomplete, assistant_response except Exception as e: error_msg = f"❌ Error: {str(e)}" print(error_msg) yield error_msg, "" def create_interface(): """Create Gradio interface.""" with gr.Blocks(title="Qwen3 Resume Structured Information Extraction ", theme=gr.themes.Soft()) as demo: gr.Markdown( """ # 🚀 Qwen3 Resume Structured Information Extraction Extract structured information from resumes using fine-tuned Qwen3-0.6B model. **Optimized with 8-bit quantization.** **How to use:** 1. Paste your resume text in the text box below 2. Click "Parse Resume" 3. View the extracted structured JSON output **Model:** [sandeeppanem/qwen3-0.6b-resume-json](https://huggingface.co/sandeeppanem/qwen3-0.6b-resume-json) **Dataset:** [sandeeppanem/resume-json-extraction-5k](https://huggingface.co/datasets/sandeeppanem/resume-json-extraction-5k) **Repository:** [qwen3-resume-extraction](https://github.com/sandeeppanem/qwen3-resume-extraction) **Format:** Base model + LoRA adapter with 8-bit quantization """ ) with gr.Row(): with gr.Column(scale=1): resume_input = gr.Textbox( label="Resume Text", placeholder="Paste your resume text here...", lines=15, max_lines=20, ) parse_btn = gr.Button("Parse Resume", variant="primary", size="lg") gr.Markdown( """ **Example:** ``` Senior IT Project Manager with 10+ years experience leading enterprise migrations. Skills: Python, SQL, AWS, Agile. Location: Chicago, IL. Experience: Project Manager at Acme Corp (2019-2024). Education: MS Computer Science. ``` """ ) with gr.Column(scale=1): json_output = gr.Code( label="Structured JSON Output", language="json", lines=20, ) raw_output = gr.Textbox( label="Raw Output", lines=10, visible=False, ) # Examples - diverse resume samples (same as CPU version) example_resumes = [ """Senior IT Project Manager with 10+ years experience leading enterprise migrations. Skills: Python, SQL, AWS, Agile. Location: Chicago, IL. Experience: Project Manager at Acme Corp (2019-2024). Education: MS Computer Science.""", """Software Engineer John Smith Email: john.smith@email.com | Phone: (555) 123-4567 | Location: San Francisco, CA PROFESSIONAL SUMMARY Full Stack Developer with 5 years of experience building scalable web applications. Expertise in React, Node.js, Python, and cloud technologies. TECHNICAL SKILLS Languages: JavaScript, Python, TypeScript, Java Frameworks: React, Node.js, Express, Django, Spring Boot Cloud: AWS (EC2, S3, Lambda), Docker, Kubernetes Databases: PostgreSQL, MongoDB, Redis PROFESSIONAL EXPERIENCE Senior Software Engineer | TechCorp Inc. | San Francisco, CA | 2021 - Present - Developed microservices architecture serving 1M+ users - Led team of 3 junior developers - Reduced API response time by 40% through optimization Software Engineer | StartupXYZ | San Francisco, CA | 2019 - 2021 - Built customer-facing React applications - Implemented CI/CD pipelines using Jenkins EDUCATION Bachelor of Science in Computer Science University of California, Berkeley | 2019""", """Data Scientist Sarah Johnson sarah.johnson@email.com | (555) 987-6543 | New York, NY SUMMARY Data Scientist with 7 years of experience in machine learning, statistical analysis, and big data. Specialized in NLP and computer vision applications. SKILLS Programming: Python, R, SQL, Scala ML/AI: TensorFlow, PyTorch, scikit-learn, XGBoost Tools: Spark, Hadoop, Tableau, Jupyter Cloud: AWS SageMaker, Azure ML EXPERIENCE Lead Data Scientist | DataTech Solutions | New York, NY | 2020 - Present - Built recommendation system increasing user engagement by 35% - Developed NLP models for sentiment analysis - Managed team of 4 data scientists Data Scientist | Analytics Pro | New York, NY | 2018 - 2020 - Created predictive models for customer churn - Analyzed large datasets using Spark EDUCATION Master of Science in Data Science | Columbia University | 2018 Bachelor of Science in Statistics | NYU | 2016""", """Marketing Manager Michael Chen michael.chen@email.com | (555) 456-7890 | Los Angeles, CA PROFESSIONAL PROFILE Strategic Marketing Manager with 8+ years driving brand growth and digital marketing campaigns. Expert in SEO, content marketing, and social media strategy. CORE COMPETENCIES Digital Marketing, SEO/SEM, Content Strategy, Social Media Management, Google Analytics, HubSpot, Marketo, Brand Management PROFESSIONAL EXPERIENCE Marketing Manager | BrandCo | Los Angeles, CA | 2019 - Present - Increased website traffic by 150% through SEO optimization - Launched successful social media campaigns reaching 2M+ impressions - Managed $500K annual marketing budget Marketing Specialist | Growth Agency | Los Angeles, CA | 2016 - 2019 - Developed content marketing strategies - Executed email marketing campaigns with 25% open rate EDUCATION Master of Business Administration (MBA) | UCLA | 2016 Bachelor of Arts in Communications | USC | 2014""", """Product Manager Emily Rodriguez emily.rodriguez@email.com | (555) 234-5678 | Seattle, WA OVERVIEW Product Manager with 6 years of experience in B2B SaaS products. Led product launches from concept to market, working with engineering and design teams. KEY SKILLS Product Strategy, Agile/Scrum, User Research, A/B Testing, Roadmap Planning, Stakeholder Management, JIRA, Figma WORK EXPERIENCE Senior Product Manager | CloudSoft | Seattle, WA | 2020 - Present - Launched 3 major product features, increasing revenue by $2M annually - Conducted user research and usability testing - Managed product roadmap and prioritized features Product Manager | StartupHub | Seattle, WA | 2018 - 2020 - Owned product lifecycle for mobile application - Collaborated with cross-functional teams EDUCATION Master of Science in Product Management | University of Washington | 2018 Bachelor of Science in Business Administration | Washington State University | 2016""", """DevOps Engineer David Kim david.kim@email.com | (555) 345-6789 | Austin, TX SUMMARY DevOps Engineer with 4 years of experience in CI/CD, infrastructure automation, and cloud architecture. Proven track record of improving deployment efficiency and system reliability. TECHNICAL SKILLS Cloud Platforms: AWS, Azure, GCP CI/CD: Jenkins, GitLab CI, GitHub Actions, CircleCI Infrastructure: Terraform, Ansible, CloudFormation Containers: Docker, Kubernetes, ECS Monitoring: Prometheus, Grafana, ELK Stack Scripting: Bash, Python, PowerShell EXPERIENCE DevOps Engineer | CloudInfra Inc. | Austin, TX | 2021 - Present - Reduced deployment time from 2 hours to 15 minutes - Implemented infrastructure as code using Terraform - Set up monitoring and alerting systems Junior DevOps Engineer | TechStart | Austin, TX | 2020 - 2021 - Maintained CI/CD pipelines - Managed cloud infrastructure on AWS EDUCATION Bachelor of Science in Computer Engineering University of Texas at Austin | 2020""" ] gr.Examples( examples=[[resume] for resume in example_resumes], inputs=resume_input, label="Select a sample resume:", ) # Connect button with streaming parse_btn.click( fn=parse_resume_stream, inputs=resume_input, outputs=[json_output, raw_output], ) gr.Examples( examples=[ ["Senior Software Engineer with 10+ years of experience in Python, Java, and cloud technologies. Currently at Google, previously at Microsoft. MS in Computer Science from Stanford."], ["Data Scientist with expertise in machine learning, deep learning, and NLP. 5 years experience. PhD in Statistics. Proficient in Python, TensorFlow, PyTorch."], ["Product Manager with 8 years experience in tech startups. MBA from Wharton. Led product launches at 3 companies. Strong in Agile, Scrum, and product strategy."], ], inputs=resume_input, outputs=[json_output, raw_output], ) return demo if __name__ == "__main__": # Enable TF32 for faster GPU matrix operations (NVIDIA Ampere+ GPUs) if torch.cuda.is_available(): torch.backends.cuda.matmul.allow_tf32 = True print("✓ TF32 enabled for faster GPU inference") # Load model at startup print("=" * 60) print("Application Startup") print("=" * 60) try: load_model() print("✓ Model loaded successfully at startup") except Exception as e: print(f"⚠️ Warning: Could not load model at startup: {e}") print("Model will be loaded on first use.") demo = create_interface() demo.launch(server_name="0.0.0.0", server_port=7860)