# -*- coding: utf-8 -*-
"""
Wan2.2-S2V Avatar - image + audio -> talking video (lip-sync) on ZeroGPU (H200)
diffusers fork WanSpeechToVideoPipeline (PR #12258, tolgacangoz)
"""
import glob
import math
import os
import subprocess
import tempfile

import torch
import gradio as gr
import spaces
from PIL import Image

from diffusers import AutoencoderKLWan, WanSpeechToVideoPipeline
from diffusers.utils import export_to_video, load_audio

# The audio-merge helper is optional: remember whether this diffusers build
# ships it so generate() can skip the merge step when it does not.
try:
    from diffusers.utils import export_to_merged_video_audio
    HAS_MERGE = True
except Exception:
    HAS_MERGE = False

from transformers import Wav2Vec2ForCTC

MODEL_ID = "tolgacangoz/Wan2.2-S2V-14B-Diffusers"
NEG = ("色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,"
       "低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,"
       "毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走")

# ZeroGPU: load the model lazily INSIDE @spaces.GPU (real GPU) to avoid meta-tensor at module scope
print("App started; Wan2.2-S2V will load on first generate (inside ZeroGPU worker).")
# One-slot cache for the pipeline; filled by the first generate() call.
PIPE = {"p": None}


def _load_pipe():
    """Build the speech-to-video pipeline and move it to CUDA.

    The audio encoder and the VAE are loaded in float32; the rest of the
    pipeline is loaded in bfloat16.

    Returns:
        The ``WanSpeechToVideoPipeline`` instance, already on ``"cuda"``.
    """
    print("Loading Wan2.2-S2V (first run)...")
    audio_encoder = Wav2Vec2ForCTC.from_pretrained(
        MODEL_ID, subfolder="audio_encoder", dtype=torch.float32
    )
    vae = AutoencoderKLWan.from_pretrained(
        MODEL_ID, subfolder="vae", torch_dtype=torch.float32
    )
    p = WanSpeechToVideoPipeline.from_pretrained(
        MODEL_ID, vae=vae, audio_encoder=audio_encoder, torch_dtype=torch.bfloat16
    )
    p.to("cuda")
    print("Model ready on GPU.")
    return p


def _snap_size(img, target_area=480 * 832, divisor=64, max_side=1280):
    """Pick an output size close to ``target_area`` with the image's aspect ratio.

    Each side is scaled by the same factor, rounded to the nearest multiple
    of ``divisor`` and clamped to ``[divisor, max_side]``.

    Args:
        img: Object exposing ``size`` as ``(width, height)``.
        target_area: Desired pixel count (width * height).
        divisor: Both returned sides are multiples of this value.
        max_side: Upper bound for either side.

    Returns:
        ``(height, width)`` - note the order.
    """
    w, h = img.size
    scale = math.sqrt(target_area / (w * h))
    nw = max(divisor, min(max_side, int(round(w * scale / divisor) * divisor)))
    nh = max(divisor, min(max_side, int(round(h * scale / divisor) * divisor)))
    return nh, nw


def _remove_files(paths):
    """Delete every path in ``paths``, ignoring files that cannot be removed."""
    for path in paths:
        try:
            os.remove(path)
        except OSError:
            # Best-effort cleanup: a missing or locked file is not fatal.
            pass


def _download_and_convert(url, raw_prefix, final, ydl_format, ffmpeg_args,
                          timeout, min_bytes, download_failed_msg,
                          convert_failed_msg):
    """Download ``url`` with yt-dlp and transcode the result with ffmpeg.

    Args:
        url: Already stripped, non-empty media URL.
        raw_prefix: Path prefix (without extension) for the raw download.
        final: Destination path of the transcoded file.
        ydl_format: yt-dlp ``format`` selector.
        ffmpeg_args: ffmpeg options placed between the input and ``final``.
        timeout: ffmpeg timeout in seconds.
        min_bytes: ``final`` must be strictly larger than this to count.
        download_failed_msg: Message used when no file was downloaded.
        convert_failed_msg: Message used when transcoding fails.

    Returns:
        ``(final, None)`` on success, ``(None, message)`` on failure.
    """
    import yt_dlp

    pattern = f"{raw_prefix}.*"
    # Drop leftovers from an earlier download that mapped to the same key.
    _remove_files(glob.glob(pattern))
    opts = {
        "outtmpl": f"{raw_prefix}.%(ext)s",
        "quiet": True,
        "no_warnings": True,
        "noplaylist": True,
        "overwrites": True,
        "ignoreerrors": True,
        "geo_bypass": True,
        "format": ydl_format,
    }
    try:
        try:
            with yt_dlp.YoutubeDL(opts) as ydl:
                ydl.download([url])
        except Exception as e:
            return None, f"โหลดไม่ได้: {str(e)[:120]}"

        # Errors are suppressed by "ignoreerrors", so a failed download can
        # leave only partial/state files behind; never feed those to ffmpeg.
        candidates = [
            path for path in glob.glob(pattern)
            if not path.endswith((".part", ".ytdl"))
        ]
        if not candidates:
            return None, download_failed_msg
        # Deterministic choice when several files match the prefix.
        source = max(candidates, key=os.path.getsize)

        try:
            subprocess.run(
                ["ffmpeg", "-y", "-loglevel", "error", "-i", source,
                 *ffmpeg_args, final],
                timeout=timeout,
                check=True,
            )
        except Exception as e:
            return None, f"{convert_failed_msg}: {str(e)[:100]}"

        if os.path.exists(final) and os.path.getsize(final) > min_bytes:
            return final, None
        return None, convert_failed_msg
    finally:
        # The raw download is not needed once transcoding has been attempted;
        # removing it keeps /tmp from filling up across requests.
        _remove_files(glob.glob(pattern))


def download_audio_url(url):
    """Fetch the audio track behind ``url`` as a 16 kHz mono MP3.

    Args:
        url: Media link typed by the user; may be empty or ``None``.

    Returns:
        ``(path, status)`` where ``path`` is the MP3 file or ``None`` on
        failure, and ``status`` is the message shown in the UI.
    """
    if not url or not url.strip():
        return None, "วางลิงก์ก่อน"
    url = url.strip()
    key = abs(hash(url)) % 100000
    path, error = _download_and_convert(
        url,
        raw_prefix=f"/tmp/araw_{key}",
        final=f"/tmp/audio_{key}.mp3",
        ydl_format="bestaudio/best",
        ffmpeg_args=["-vn", "-ar", "16000", "-ac", "1", "-c:a", "libmp3lame"],
        timeout=300,
        min_bytes=500,
        download_failed_msg="โหลดเสียงไม่สำเร็จ (ลิงก์อาจต้องล็อกอิน / โดนบล็อก)",
        convert_failed_msg="แปลงเสียงไม่สำเร็จ",
    )
    if error is not None:
        return None, error
    return path, f"OK โหลดเสียงแล้ว ({os.path.getsize(path)//1024} KB) - กด Generate ได้เลย"


def download_video_url(url):
    """Fetch the video behind ``url`` as a silent H.264 MP4 pose reference.

    Args:
        url: Media link typed by the user; may be empty or ``None``.

    Returns:
        ``(path, status)`` where ``path`` is the MP4 file or ``None`` on
        failure, and ``status`` is the message shown in the UI.
    """
    if not url or not url.strip():
        return None, "วางลิงก์ก่อน"
    url = url.strip()
    key = abs(hash(url)) % 100000
    path, error = _download_and_convert(
        url,
        raw_prefix=f"/tmp/vraw_{key}",
        final=f"/tmp/video_{key}.mp4",
        ydl_format="bv*[ext=mp4]+ba[ext=m4a]/b[ext=mp4]/b",
        ffmpeg_args=["-c:v", "libx264", "-preset", "veryfast", "-an",
                     "-movflags", "+faststart"],
        timeout=600,
        min_bytes=1000,
        download_failed_msg="โหลดวิดีโอไม่สำเร็จ (ลิงก์อาจต้องล็อกอิน / โดนบล็อก)",
        convert_failed_msg="แปลงวิดีโอไม่สำเร็จ",
    )
    if error is not None:
        return None, error
    return path, f"OK โหลดวิดีโออ้างอิงแล้ว ({os.path.getsize(path)//1024//1024} MB) - avatar จะขยับตามนี้"


@spaces.GPU(duration=300)
def generate(image_path, audio_path, pose_video, prompt, negative_prompt, steps,
             guidance, res_area, seed, progress=gr.Progress(track_tqdm=True)):
    """Render a talking-avatar video from a still image and a speech file.

    Args:
        image_path: Path of the avatar image (required).
        audio_path: Path of the speech audio (required).
        pose_video: Optional pose reference; forwarded to the pipeline as
            ``pose_video_path_or_url`` when truthy.
        prompt: Text prompt; a generic default is used when empty.
        negative_prompt: Negative prompt; ``NEG`` is used when empty.
        steps: Number of inference steps (cast to ``int``).
        guidance: Guidance scale (cast to ``float``).
        res_area: ``"480p"`` selects a 480*832 pixel budget, anything else
            720*1280.
        seed: RNG seed; ``None`` (cleared input box) falls back to 42.
        progress: Gradio progress tracker (tracks tqdm bars).

    Returns:
        Path of the generated ``.mp4`` file.

    Raises:
        gr.Error: If the image or the audio is missing.
    """
    if not image_path:
        raise gr.Error("กรุณาใส่ภาพ avatar (หน้าตรงชัด) ก่อน")
    if not audio_path:
        raise gr.Error("กรุณาใส่ไฟล์เสียงพูดก่อน")

    # Lazy, once-per-worker model load.
    if PIPE["p"] is None:
        PIPE["p"] = _load_pipe()
    pipe = PIPE["p"]

    # convert() returns a new image, so the file handle can be released here.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")
    audio, sampling_rate = load_audio(audio_path)

    target = 480 * 832 if res_area == "480p" else 720 * 1280
    h, w = _snap_size(image, target_area=target)

    # A cleared Seed box arrives as None; use the UI default instead of failing.
    seed_value = 42 if seed is None else int(seed)
    gen = torch.Generator(device="cuda").manual_seed(seed_value)

    kwargs = {}
    if pose_video:
        kwargs["pose_video_path_or_url"] = pose_video

    frames = pipe(
        image=image,
        audio=audio,
        sampling_rate=sampling_rate,
        prompt=prompt or "a person talking to the camera, natural expression",
        negative_prompt=(negative_prompt or NEG),
        height=h,
        width=w,
        num_frames_per_chunk=80,
        num_inference_steps=int(steps),
        guidance_scale=float(guidance),
        generator=gen,
        **kwargs,
    ).frames[0]

    # Only the reserved name is needed; close the handle before writing to it.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        out = tmp.name
    export_to_video(frames, out, fps=16)

    if HAS_MERGE:
        try:
            export_to_merged_video_audio(out, audio_path)
        except Exception as e:
            # A failed merge is logged and the video is still returned.
            print("merge audio failed:", e)
    return out


# NOTE(review): component nesting below was reconstructed from a source whose
# indentation had been lost - confirm against the deployed layout.
with gr.Blocks(title="Wan2.2-S2V Avatar", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Wan2.2-S2V Avatar - image + audio -> talking video (lip-sync) - ZeroGPU H200")
    gr.Markdown("ใส่ภาพหน้าตรง + ไฟล์เสียงพูด -> วิดีโอคนพูดขยับปากตามเสียง (รองรับหลายภาษา)")
    with gr.Row():
        with gr.Column():
            image = gr.Image(type="filepath", label="ภาพ avatar (หน้าตรงชัด คน/การ์ตูน)")
            audio = gr.Audio(type="filepath", label="ไฟล์เสียงพูด (อัปโหลด/อัดเสียง)")
            with gr.Row(equal_height=True):
                audio_url = gr.Textbox(placeholder="หรือวางลิงก์ YouTube/TikTok -> ดึงเสียง",
                                       scale=4, container=False, lines=1, max_lines=1)
                audio_url_btn = gr.Button("ดึงเสียง", scale=1, min_width=110)
            url_status = gr.Markdown("")
            pose_video = gr.Video(label="วิดีโออ้างอิงท่าทาง (ไม่บังคับ) - avatar จะขยับตามวิดีโอนี้", height=200)
            with gr.Row(equal_height=True):
                video_url = gr.Textbox(placeholder="หรือวางลิงก์วิดีโอ (YouTube/TikTok) -> ดึงมาเป็นท่าอ้างอิง",
                                       scale=4, container=False, lines=1, max_lines=1)
                video_url_btn = gr.Button("ดึงวิดีโอ", scale=1, min_width=110)
            vid_status = gr.Markdown("")
            prompt = gr.Textbox(label="Prompt (บรรยายฉาก/ท่าทาง - ใช้คู่/แทนวิดีโออ้างอิงได้)",
                                value="a person talking to the camera, natural expression, slight head movement")
            with gr.Accordion("ตั้งค่าขั้นสูง", open=False):
                negative_prompt = gr.Textbox(label="Negative prompt", value=NEG, lines=2)
                res_area = gr.Radio(["480p", "720p"], value="480p", label="ความละเอียด (720p ช้ากว่า)")
                steps = gr.Slider(8, 50, value=20, step=1, label="Steps")
                guidance = gr.Slider(1.0, 8.0, value=4.5, step=0.5, label="Guidance scale")
                seed = gr.Number(value=42, label="Seed", precision=0)
            btn = gr.Button("สร้างวิดีโอ Avatar", variant="primary")
        with gr.Column():
            out_video = gr.Video(label="ผลลัพธ์ (วิดีโอพูด)")

    btn.click(generate,
              [image, audio, pose_video, prompt, negative_prompt, steps, guidance, res_area, seed],
              out_video)
    audio_url_btn.click(download_audio_url, inputs=audio_url, outputs=[audio, url_status])
    video_url_btn.click(download_video_url, inputs=video_url, outputs=[pose_video, vid_status])

if __name__ == "__main__":
    demo.queue().launch()