""" BharatGen AyurParam — Swastik.fit AI Vaidya Hosted on HuggingFace Spaces (ZeroGPU) Model: bharatgenai/AyurParam (2.9B params, trained on 1,000+ Ayurvedic texts) License: CC-BY-4.0 (commercial OK) Prompt format: {question} This Space is called by the Swastik Cloud Function (ayurParamProxy). The /gradio_api/call/predict endpoint receives: { data: [" ... "] } Returns: { data: ["response text"] } ZeroGPU: GPU is allocated on-demand per request (no cold-start, shared GPU pool). Model loads into GPU memory on first call, cached for duration of GPU slot. """ import gradio as gr import torch import spaces from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "bharatgenai/AyurParam" # Module-level cache — persists across ZeroGPU calls within the same session _tokenizer = None _model = None def _ensure_model(): """Load model if not already loaded. Called inside @spaces.GPU context.""" global _tokenizer, _model if _model is not None: return print("[AyurParam] Loading tokenizer...") _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=False) print("[AyurParam] Loading model to GPU...") _model = AutoModelForCausalLM.from_pretrained( MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16, device_map="auto", ) _model.eval() print("[AyurParam] Model ready on GPU.") @spaces.GPU(duration=120) def generate(prompt: str) -> str: """ Main inference function — runs on ZeroGPU (T4/A100). Accepts either: - Raw prompt already formatted: " ... " - Plain text question (will be wrapped automatically) Returns: assistant response only (no prompt echo) """ _ensure_model() if not prompt or not prompt.strip(): return "Please provide a question." # Ensure correct prompt format if "" not in prompt: formatted = f" {prompt.strip()} " else: formatted = prompt.strip() if not formatted.endswith(""): formatted = formatted + " " inputs = _tokenizer(formatted, return_tensors="pt") # Move inputs to same device as model device = next(_model.parameters()).device inputs = {k: v.to(device) for k, v in inputs.items()} input_len = inputs["input_ids"].shape[1] with torch.no_grad(): output = _model.generate( **inputs, max_new_tokens=256, do_sample=True, top_k=50, top_p=0.95, temperature=0.6, eos_token_id=_tokenizer.eos_token_id, pad_token_id=_tokenizer.eos_token_id, use_cache=True, ) # Decode only the new tokens (not the prompt) new_tokens = output[0][input_len:] response = _tokenizer.decode(new_tokens, skip_special_tokens=True).strip() # Clean up any trailing special tokens for stop in ["", "", ""]: if stop in response: response = response[: response.index(stop)].strip() return response # Gradio interface — Swastik Cloud Function calls /gradio_api/call/predict demo = gr.Interface( fn=generate, inputs=gr.Textbox( label="Prompt", placeholder=" What foods should I eat for better digestion? ", lines=3, ), outputs=gr.Textbox(label="AyurParam Response", lines=8), title="BharatGen AyurParam — Ayurveda AI", description=( "**AyurParam** is India's first AI trained on 1,000+ Ayurvedic texts (54.5M words). " "2.9B parameter model fine-tuned on classical Ayurveda knowledge.\n\n" "Prompt format: ` your question `\n\n" "This Space powers the AI Vaidya at [swastik.fit](https://swastik.fit)." ), examples=[ [" What foods should I eat to improve digestion according to Ayurveda? "], [" I have vata imbalance — what daily routine do you recommend? "], [" What are the benefits of turmeric in Ayurvedic medicine? "], [" namaste "], ], cache_examples=False, api_name="predict", ) if __name__ == "__main__": demo.launch()