# app.py -- T2MOVIES starter pipeline (text->video try, gTTS TTS, sample fallback)
import time
import uuid
from pathlib import Path
from typing import Optional

import gradio as gr
from gtts import gTTS
from huggingface_hub import InferenceClient

# --- CONFIG: change these if you want ---
TEXT2VIDEO_CANDIDATES = [
    "damo-vilab/text-to-video-ms-1.7b",
]
SAMPLE_VIDEO = "sample.mp4"
TMP_DIR = Path("/tmp") if Path("/tmp").exists() else Path(".")

# HF client (Spaces provides token automatically)
try:
    hf_client = InferenceClient()
except Exception:
    # Without a client the pipeline still works through the sample fallback.
    hf_client = None


def try_text_to_video(prompt: str, style: str, duration: int, aspect: str) -> Optional[str]:
    """Try each candidate model in turn and return the first generated video.

    Args:
        prompt: Scene description typed by the user.
        style: Visual style; appended to the prompt text because the client
            call used here has no dedicated style argument.
        duration: Requested clip length in seconds. Accepted so callers keep
            working, but not forwarded: the client call used here takes no
            duration argument.
        aspect: Requested aspect ratio. Accepted but not forwarded, for the
            same reason as ``duration``.

    Returns:
        A path to an ``.mp4`` written under ``TMP_DIR``, a URL string if the
        backend answered with ``{"url": ...}``, or ``None`` when no client is
        available, the prompt is blank, or every candidate model failed.
    """
    if hf_client is None or not prompt or not prompt.strip():
        return None

    generate = getattr(hf_client, "text_to_video", None)
    if generate is None:
        print("[text2video] installed huggingface_hub has no text_to_video; skipping")
        return None

    cleaned = prompt.strip()
    styled_prompt = f"{cleaned}, {style} style" if style else cleaned

    for model_id in TEXT2VIDEO_CANDIDATES:
        try:
            # The prompt is the positional argument; the model is keyword-only.
            res = generate(styled_prompt, model=model_id)
            if isinstance(res, (bytes, bytearray)) and res:
                out = TMP_DIR / f"{uuid.uuid4().hex}_out.mp4"
                out.write_bytes(res)
                return str(out)
            if isinstance(res, dict) and isinstance(res.get("url"), str):
                return res["url"]
        except Exception as e:
            # Best effort: log and move on to the next candidate model.
            print(f"[text2video] model {model_id} failed: {e}")
            continue
    return None


def tts_gtts(text: str, outname=None) -> Optional[str]:
    """Synthesize ``text`` to an MP3 with gTTS.

    Args:
        text: Text to speak; blank or whitespace-only text is skipped.
        outname: Optional output path. Defaults to a unique file in
            ``TMP_DIR``.

    Returns:
        The path of the saved MP3 as a string, or ``None`` if the text was
        blank or synthesis failed.
    """
    if not text or not text.strip():
        return None
    outname = outname or TMP_DIR / f"{uuid.uuid4().hex}_tts.mp3"
    try:
        tts = gTTS(text=text, lang="en", slow=False)
        tts.save(str(outname))
    except Exception as e:
        # gTTS needs network access; report the failure to the caller as None
        # so the pipeline can log it instead of crashing.
        print(f"[tts] gTTS failed: {e}")
        return None
    return str(outname)


def pipeline(prompt, style, duration, aspect):
    """Run all pipeline stages and collect a human-readable log.

    Returns:
        A ``(video, log_text)`` tuple. ``video`` is the generated video path
        or URL, the bundled sample video path, or ``None`` when neither is
        available. ``log_text`` is the newline-joined stage messages.
    """
    status_msgs = []

    status_msgs.append("Stage 1 — text→video: trying HF models...")
    video = try_text_to_video(prompt, style, duration, aspect)
    if video:
        status_msgs.append(f"Text->Video produced: {video}")
    else:
        status_msgs.append("Text->Video not available (using sample fallback).")
        sample = Path(SAMPLE_VIDEO)
        if sample.exists():
            video = str(sample)
        else:
            status_msgs.append("No sample.mp4 found in repo. Please upload sample.mp4 to Space files.")
            return None, "\n".join(status_msgs)

    status_msgs.append("Stage 2 — generating voice (gTTS)...")
    tts_path = tts_gtts(prompt)
    if tts_path:
        status_msgs.append(f"TTS saved: {tts_path}")
    else:
        status_msgs.append("TTS generation failed or skipped.")

    status_msgs.append("Stage 3 — lip-sync (placeholder) — not yet implemented on this Space.")
    status_msgs.append("Stage 4 — finalizing & returning video (no changes applied).")
    return video, "\n".join(status_msgs)


with gr.Blocks() as demo:
    gr.Markdown("# 🎬 T2MOVIES — Pipeline (starter)\nThis demo tries text→video (via HF), TTS (gTTS), and shows stage messages. Lip-sync is a placeholder.")
    with gr.Row():
        prompt = gr.Textbox(lines=3, placeholder="Write a short scene (use ; to separate scenes)", label="Prompt")
    with gr.Row():
        style = gr.Dropdown(choices=["anime", "cinematic", "cartoon"], value="anime", label="Style")
        duration = gr.Slider(5, 30, value=10, step=5, label="Duration (s)")
        aspect = gr.Dropdown(choices=["9:16", "16:9", "1:1"], value="9:16", label="Aspect")
    generate_btn = gr.Button("Generate")
    out_video = gr.Video(label="Result")
    log = gr.Textbox(label="Pipeline log", interactive=False)

    def on_generate(prompt_text, style_val, duration_val, aspect_val):
        """Click handler: run the pipeline and return (video, log text).

        The outputs are delivered through the return value; assigning to a
        component's attributes inside a handler does not update the UI.
        """
        out, messages = pipeline(prompt_text, style_val, duration_val, aspect_val)
        return out, messages

    generate_btn.click(on_generate, inputs=[prompt, style, duration, aspect], outputs=[out_video, log])

if __name__ == "__main__":
    demo.launch()