# -*- coding: utf-8 -*-
"""
Wan2.2-S2V Avatar — image + audio → talking-person video (lip-sync) on ZeroGPU (H200).

Uses a diffusers fork (WanSpeechToVideoPipeline from tolgacangoz's PR #12258).
"""
import glob
import math
import os
import shutil
import subprocess
import tempfile

import torch
import gradio as gr
import spaces
from PIL import Image
from diffusers import AutoencoderKLWan, WanSpeechToVideoPipeline
from diffusers.utils import export_to_video, load_audio

# The audio-muxing helper only exists in some builds of the fork; degrade to a
# silent video when it is missing instead of failing at import time.
try:
    from diffusers.utils import export_to_merged_video_audio
    HAS_MERGE = True
except ImportError:
    HAS_MERGE = False
from transformers import Wav2Vec2ForCTC

MODEL_ID = "tolgacangoz/Wan2.2-S2V-14B-Diffusers"
NEG = ("色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,"
       "低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,"
       "毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走")

# Seed used when the UI field is left empty; also the initial value of the field.
DEFAULT_SEED = 42

# ===== Load models (the first run downloads ~30GB) =====
print("Loading Wan2.2-S2V…")
audio_encoder = Wav2Vec2ForCTC.from_pretrained(MODEL_ID, subfolder="audio_encoder", dtype=torch.float32)
vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
pipe = WanSpeechToVideoPipeline.from_pretrained(
    MODEL_ID, vae=vae, audio_encoder=audio_encoder, torch_dtype=torch.bfloat16
)
# ZeroGPU: do NOT .to("cuda") at load time (no GPU yet, weights on meta) -> move inside generate()
print("Model ready (on CPU; moves to GPU inside @spaces.GPU).")


def _snap_size(img, target_area=480 * 832, divisor=64, max_side=1280):
    """Pick an output size close to ``target_area`` that keeps the image's aspect ratio.

    Each side is rounded to the nearest multiple of ``divisor`` and clamped to
    the range ``[divisor, max_side]``.

    Args:
        img: Object exposing ``size`` as ``(width, height)`` (e.g. a PIL image).
        target_area: Desired pixel count of the output frame.
        divisor: Both returned sides are multiples of this value.
        max_side: Upper bound for either side.

    Returns:
        ``(height, width)`` — note the order, height first.
    """
    w, h = img.size
    scale = math.sqrt(target_area / (w * h))
    nw = max(divisor, min(max_side, int(round(w * scale / divisor) * divisor)))
    nh = max(divisor, min(max_side, int(round(h * scale / divisor) * divisor)))
    return nh, nw


def download_audio_url(url):
    """Pull audio from a link (YouTube/TikTok/public web/audio file) -> mp3 16kHz mono.

    Every call works inside its own freshly created temporary directory, so
    concurrent requests (even for the same URL) never overwrite each other's
    files.

    Args:
        url: Link typed by the user; surrounding whitespace is ignored.

    Returns:
        ``(path, status)`` where ``path`` is the converted mp3 file, or
        ``None`` on failure, and ``status`` is the message shown in the UI.
    """
    if not url or not url.strip():
        return None, "วางลิงก์ก่อน"
    url = url.strip()

    workdir = tempfile.mkdtemp(prefix="audio_dl_")
    raw = os.path.join(workdir, "raw.%(ext)s")
    final = os.path.join(workdir, "audio.mp3")

    def _fail(message):
        # Nothing usable was produced: do not leave the working directory behind.
        shutil.rmtree(workdir, ignore_errors=True)
        return None, message

    opts = {"outtmpl": raw, "quiet": True, "no_warnings": True, "noplaylist": True,
            "overwrites": True, "ignoreerrors": True, "geo_bypass": True,
            "format": "bestaudio/best"}
    try:
        # Imported lazily; a missing package is reported like any other download failure.
        import yt_dlp
        with yt_dlp.YoutubeDL(opts) as ydl:
            ydl.download([url])
    except Exception as e:
        return _fail(f"❌ โหลดไม่ได้: {str(e)[:120]}")

    # With "ignoreerrors" a failed download does not raise, so the presence of
    # a finished file is the real success check. Skip partial-download leftovers.
    dl = sorted(
        p for p in glob.glob(os.path.join(glob.escape(workdir), "raw.*"))
        if not p.endswith((".part", ".ytdl"))
    )
    if not dl:
        return _fail("❌ โหลดเสียงไม่สำเร็จ (ลิงก์อาจต้องล็อกอิน / โดนบล็อก)")

    try:
        subprocess.run(["ffmpeg", "-y", "-loglevel", "error", "-i", dl[0], "-vn",
                        "-ar", "16000", "-ac", "1", "-c:a", "libmp3lame", final],
                       timeout=300, check=True)
    except Exception as e:
        return _fail(f"❌ แปลงเสียงไม่สำเร็จ: {str(e)[:100]}")

    if os.path.exists(final) and os.path.getsize(final) > 500:
        # The raw download is no longer needed; removing it is best-effort only.
        for leftover in dl:
            try:
                os.remove(leftover)
            except OSError:
                pass
        return final, f"✓ โหลดเสียงแล้ว ({os.path.getsize(final)//1024} KB) — กด Generate ได้เลย"
    return _fail("❌ แปลงเสียงไม่สำเร็จ")


@spaces.GPU(duration=300)
def generate(image_path, audio_path, prompt, negative_prompt, steps, guidance, res_area, seed,
             progress=gr.Progress(track_tqdm=True)):
    """Render a lip-synced talking video from one image and one audio file.

    Args:
        image_path: Path of the avatar image.
        audio_path: Path of the speech audio.
        prompt: Scene description; a generic default is used when empty.
        negative_prompt: Negative prompt; ``NEG`` is used when empty.
        steps: Number of inference steps.
        guidance: Guidance scale.
        res_area: ``"480p"`` selects a 480*832 pixel budget, anything else 720*1280.
        seed: Random seed; ``DEFAULT_SEED`` is used when the field is empty.
        progress: Gradio progress tracker (mirrors tqdm progress bars).

    Returns:
        Path of the generated ``.mp4`` file.

    Raises:
        gr.Error: If the image or the audio input is missing.
    """
    if not image_path:
        raise gr.Error("กรุณาใส่ภาพ avatar (หน้าตรงชัด) ก่อน")
    if not audio_path:
        raise gr.Error("กรุณาใส่ไฟล์เสียงพูดก่อน")

    pipe.to("cuda")  # ZeroGPU: move model to GPU here (real GPU inside @spaces.GPU)

    # convert() returns an independent image, so the source file can be closed right away.
    with Image.open(image_path) as src:
        image = src.convert("RGB")
    audio, sampling_rate = load_audio(audio_path)

    target = 480 * 832 if res_area == "480p" else 720 * 1280
    h, w = _snap_size(image, target_area=target)

    # A cleared Number field arrives as None; fall back instead of crashing in int().
    seed_value = DEFAULT_SEED if seed is None else int(seed)
    gen = torch.Generator(device="cuda").manual_seed(seed_value)

    frames = pipe(
        image=image, audio=audio, sampling_rate=sampling_rate,
        prompt=prompt or "a person talking to the camera, natural expression",
        negative_prompt=(negative_prompt or NEG),
        height=h, width=w, num_frames_per_chunk=80,
        num_inference_steps=int(steps), guidance_scale=float(guidance),
        generator=gen,
    ).frames[0]

    # Only the unique file name is needed; close the handle before the exporter writes to it.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        out = tmp.name
    export_to_video(frames, out, fps=16)

    if HAS_MERGE:
        # Adding the audio track is best-effort: a silent video beats no video.
        try:
            export_to_merged_video_audio(out, audio_path)
        except Exception as e:
            print("merge audio failed:", e)
    return out


with gr.Blocks(title="Wan2.2-S2V Avatar", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎤 Wan2.2-S2V Avatar — รูป + เสียง → วิดีโอพูด (lip-sync) · ZeroGPU H200")
    gr.Markdown("ใส่ **ภาพหน้าตรง** + **ไฟล์เสียงพูด** → ได้วิดีโอคนพูดขยับปากตามเสียง (รองรับหลายภาษา) "
                "· เสียงยาวจะต่อหลาย chunk อัตโนมัติ")
    with gr.Row():
        with gr.Column():
            image = gr.Image(type="filepath", label="ภาพ avatar (หน้าตรงชัด คน/การ์ตูน)")
            audio = gr.Audio(type="filepath", label="ไฟล์เสียงพูด (อัปโหลด/อัดเสียง)")
            with gr.Row(equal_height=True):
                audio_url = gr.Textbox(placeholder="🔗 หรือวางลิงก์ YouTube/TikTok/เว็บ → ดึงเสียงอัตโนมัติ",
                                       scale=4, container=False, lines=1, max_lines=1)
                audio_url_btn = gr.Button("⬇️ ดึงเสียง", scale=1, min_width=110)
            url_status = gr.Markdown("")
            prompt = gr.Textbox(label="Prompt (บรรยายฉาก/ท่าทาง)",
                                value="a person talking to the camera, natural expression, slight head movement")
            with gr.Accordion("⚙️ ตั้งค่าขั้นสูง", open=False):
                negative_prompt = gr.Textbox(label="Negative prompt", value=NEG, lines=2)
                res_area = gr.Radio(["480p", "720p"], value="480p", label="ความละเอียด (720p ช้ากว่า)")
                steps = gr.Slider(8, 50, value=20, step=1, label="Steps (มาก=คมแต่ช้า)")
                guidance = gr.Slider(1.0, 8.0, value=4.5, step=0.5, label="Guidance scale")
                seed = gr.Number(value=DEFAULT_SEED, label="Seed", precision=0)
            btn = gr.Button("🎬 สร้างวิดีโอ Avatar", variant="primary")
        with gr.Column():
            out_video = gr.Video(label="ผลลัพธ์ (วิดีโอพูด)")

    btn.click(generate,
              [image, audio, prompt, negative_prompt, steps, guidance, res_area, seed],
              out_video)
    audio_url_btn.click(download_audio_url, inputs=audio_url, outputs=[audio, url_status])

if __name__ == "__main__":
    demo.queue().launch()