# -*- coding: utf-8 -*-
"""
Wan2.2-S2V Avatar - image + audio -> talking video (lip-sync) on ZeroGPU (H200)
diffusers fork WanSpeechToVideoPipeline (PR #12258, tolgacangoz)
"""
import os, tempfile, math, subprocess, glob
import torch
import gradio as gr
import spaces
from PIL import Image
from diffusers import AutoencoderKLWan, WanSpeechToVideoPipeline
from diffusers.utils import export_to_video, load_audio
try:
    from diffusers.utils import export_to_merged_video_audio
    HAS_MERGE = True
except Exception:
    HAS_MERGE = False
from transformers import Wav2Vec2ForCTC

# Identifier passed to every ``from_pretrained`` call in this file.
MODEL_ID = "tolgacangoz/Wan2.2-S2V-14B-Diffusers"

# Default negative prompt (Chinese). Written as one entry per term so single
# terms are easy to add or drop; the entries are joined with the full-width
# comma, which yields exactly one comma-separated string.
NEG = "，".join((
    "色调艳丽", "过曝", "静态", "细节模糊不清", "字幕", "风格", "作品",
    "画作", "画面", "静止", "整体发灰", "最差质量",
    "低质量", "JPEG压缩残留", "丑陋的", "残缺的", "多余的手指",
    "画得不好的手部", "画得不好的脸部", "畸形的",
    "毁容的", "形态畸形的肢体", "手指融合", "静止不动的画面",
    "杂乱的背景", "三条腿", "背景人很多", "倒着走",
))

# ZeroGPU: nothing heavy is loaded at import time. The pipeline is built lazily
# inside the @spaces.GPU function (where a real GPU is attached) to avoid
# meta tensors at module scope.
print("App started; Wan2.2-S2V will load on first generate (inside ZeroGPU worker).")

# One-slot cache for the loaded pipeline; "p" stays None until the first generate call.
PIPE = {"p": None}


def _load_pipe():
    """Build the Wan2.2-S2V speech-to-video pipeline and move it to CUDA.

    The audio encoder and the VAE are loaded separately in float32 and handed
    to the pipeline, which itself is loaded in bfloat16. All three come from
    ``MODEL_ID``.

    Returns:
        The ``WanSpeechToVideoPipeline`` instance after ``.to("cuda")``.
    """
    print("Loading Wan2.2-S2V (first run)...")
    # Dict literals evaluate in order: audio encoder first, then the VAE.
    float32_parts = {
        "audio_encoder": Wav2Vec2ForCTC.from_pretrained(
            MODEL_ID, subfolder="audio_encoder", dtype=torch.float32
        ),
        "vae": AutoencoderKLWan.from_pretrained(
            MODEL_ID, subfolder="vae", torch_dtype=torch.float32
        ),
    }
    pipeline = WanSpeechToVideoPipeline.from_pretrained(
        MODEL_ID, torch_dtype=torch.bfloat16, **float32_parts
    )
    pipeline.to("cuda")
    print("Model ready on GPU.")
    return pipeline


def _snap_size(img, target_area=480 * 832, divisor=64, max_side=1280):
    """Choose an output ``(height, width)`` of roughly ``target_area`` pixels.

    Both sides of ``img`` are scaled by the same factor so that
    ``width * height`` is about ``target_area``; each side is then rounded to
    the nearest multiple of ``divisor`` and clamped to ``[divisor, max_side]``.

    Args:
        img: Object whose ``size`` attribute is a ``(width, height)`` pair
            (for example a PIL image).
        target_area: Desired pixel count of the output frame.
        divisor: Every returned side is a multiple of this positive integer.
        max_side: Upper bound for either side. It is rounded down to a
            multiple of ``divisor`` before clamping, so the clamp can never
            produce a side that is not divisible by ``divisor``.

    Returns:
        ``(height, width)`` -- height first, the reverse of ``img.size``.

    Raises:
        ValueError: If ``divisor`` is not positive or the image has a
            non-positive width or height.
    """
    if divisor <= 0:
        raise ValueError(f"divisor must be positive, got {divisor!r}")
    w, h = img.size
    if w <= 0 or h <= 0:
        # Without this guard the area ratio below divides by zero.
        raise ValueError(f"image has an empty dimension: {w}x{h}")

    scale = math.sqrt(target_area / (w * h))
    # Largest multiple of ``divisor`` not above ``max_side``; never below one
    # ``divisor`` so the lower bound still wins when max_side < divisor.
    limit = max(divisor, (max_side // divisor) * divisor)

    def snap(side):
        snapped = round(side * scale / divisor) * divisor
        return int(max(divisor, min(limit, snapped)))

    return snap(h), snap(w)


# Suffixes yt-dlp gives to unfinished downloads and resume metadata; such files
# must never be handed to ffmpeg as if they were complete media.
_PARTIAL_DOWNLOAD_SUFFIXES = (".part", ".ytdl")


def _remove_quietly(path):
    """Delete ``path`` on a best-effort basis."""
    try:
        os.remove(path)
    except OSError:
        # Deliberately ignored: a failed cleanup must not hide the real result.
        pass


def _fetch_and_convert(url, *, ydl_format, ffmpeg_args, out_prefix, out_suffix,
                       timeout, min_bytes, download_failed_msg, convert_failed_msg):
    """Download ``url`` with yt-dlp and transcode the result with ffmpeg.

    Every call works in its own temporary directory, so unrelated or
    concurrent requests can never overwrite each other's files, and the raw
    download is removed as soon as the call returns. Only the converted file
    is kept.

    Args:
        url: Already stripped, non-empty link.
        ydl_format: yt-dlp ``format`` selector.
        ffmpeg_args: ffmpeg arguments placed between the input and the output path.
        out_prefix: File-name prefix of the converted file.
        out_suffix: File-name suffix (extension) of the converted file.
        timeout: ffmpeg timeout in seconds.
        min_bytes: The converted file must be larger than this to count as valid.
        download_failed_msg: Message returned when no finished download exists.
        convert_failed_msg: Message (and message prefix) for conversion failures.

    Returns:
        ``(path, None)`` on success, ``(None, message)`` on failure.
    """
    import yt_dlp  # imported lazily: only needed once a link is actually used

    with tempfile.TemporaryDirectory(prefix=f"{out_prefix}raw_") as workdir:
        # "%" is the template character of outtmpl, so escape it in the directory part.
        outtmpl = os.path.join(workdir.replace("%", "%%"), "raw.%(ext)s")
        opts = {"outtmpl": outtmpl, "quiet": True, "no_warnings": True, "noplaylist": True,
                "overwrites": True, "ignoreerrors": True, "geo_bypass": True,
                "format": ydl_format}
        try:
            with yt_dlp.YoutubeDL(opts) as ydl:
                ydl.download([url])
        except Exception as e:
            return None, f"โหลดไม่ได้: {str(e)[:120]}"

        # "ignoreerrors" makes a failed download return normally, so the files
        # on disk are the only reliable success signal. Skip partial leftovers.
        downloads = [
            path for path in glob.glob(os.path.join(workdir, "raw.*"))
            if not path.endswith(_PARTIAL_DOWNLOAD_SUFFIXES)
        ]
        if not downloads:
            return None, download_failed_msg
        # glob order is arbitrary; if several files remain, take the largest.
        source = max(downloads, key=os.path.getsize)

        # The converted file lives outside ``workdir`` so it survives the cleanup.
        fd, final = tempfile.mkstemp(prefix=out_prefix, suffix=out_suffix)
        os.close(fd)
        try:
            subprocess.run(["ffmpeg", "-y", "-loglevel", "error", "-i", source,
                            *ffmpeg_args, final],
                           timeout=timeout, check=True)
        except Exception as e:
            _remove_quietly(final)
            return None, f"{convert_failed_msg}: {str(e)[:100]}"

    if os.path.exists(final) and os.path.getsize(final) > min_bytes:
        return final, None
    _remove_quietly(final)
    return None, convert_failed_msg


def download_audio_url(url):
    """Fetch the audio track behind ``url`` as a 16 kHz mono MP3.

    Args:
        url: Link typed by the user; may be ``None``, empty or blank.

    Returns:
        ``(path, status)``: ``path`` is the MP3 file or ``None`` on failure,
        ``status`` is the message shown to the user.
    """
    if not url or not url.strip():
        return None, "วางลิงก์ก่อน"
    final, error = _fetch_and_convert(
        url.strip(),
        ydl_format="bestaudio/best",
        ffmpeg_args=["-vn", "-ar", "16000", "-ac", "1", "-c:a", "libmp3lame"],
        out_prefix="audio_",
        out_suffix=".mp3",
        timeout=300,
        min_bytes=500,
        download_failed_msg="โหลดเสียงไม่สำเร็จ (ลิงก์อาจต้องล็อกอิน / โดนบล็อก)",
        convert_failed_msg="แปลงเสียงไม่สำเร็จ",
    )
    if error is not None:
        return None, error
    return final, f"OK โหลดเสียงแล้ว ({os.path.getsize(final)//1024} KB) - กด Generate ได้เลย"


def download_video_url(url):
    """Fetch the video behind ``url`` as a silent H.264 MP4 (pose reference).

    Args:
        url: Link typed by the user; may be ``None``, empty or blank.

    Returns:
        ``(path, status)``: ``path`` is the MP4 file or ``None`` on failure,
        ``status`` is the message shown to the user.
    """
    if not url or not url.strip():
        return None, "วางลิงก์ก่อน"
    final, error = _fetch_and_convert(
        url.strip(),
        ydl_format="bv*[ext=mp4]+ba[ext=m4a]/b[ext=mp4]/b",
        # "-an" drops the audio stream: only the picture is kept.
        ffmpeg_args=["-c:v", "libx264", "-preset", "veryfast", "-an", "-movflags", "+faststart"],
        out_prefix="video_",
        out_suffix=".mp4",
        timeout=600,
        min_bytes=1000,
        download_failed_msg="โหลดวิดีโอไม่สำเร็จ (ลิงก์อาจต้องล็อกอิน / โดนบล็อก)",
        convert_failed_msg="แปลงวิดีโอไม่สำเร็จ",
    )
    if error is not None:
        return None, error
    return final, f"OK โหลดวิดีโออ้างอิงแล้ว ({os.path.getsize(final)//1024//1024} MB) - avatar จะขยับตามนี้"


@spaces.GPU(duration=300)
def generate(image_path, audio_path, pose_video, prompt, negative_prompt, steps, guidance, res_area, seed,
             progress=gr.Progress(track_tqdm=True)):
    """Render a talking-avatar video from one image and one speech recording.

    Args:
        image_path: Path of the avatar image; required.
        audio_path: Path of the speech audio; required.
        pose_video: Optional reference video. When truthy it is forwarded to
            the pipeline as ``pose_video_path_or_url``.
        prompt: Scene description; a built-in default is used when empty.
        negative_prompt: Negative prompt; ``NEG`` is used when empty.
        steps: Number of inference steps (converted with ``int``).
        guidance: Guidance scale (converted with ``float``).
        res_area: ``"480p"`` selects a 480*832 pixel budget; any other value
            selects 720*1280.
        seed: Seed for the CUDA random generator. ``None`` falls back to 42,
            the default of the Seed field in the UI.
        progress: Gradio progress object created with ``track_tqdm=True``;
            it is not referenced in the body.

    Returns:
        Path of the generated ``.mp4`` file.

    Raises:
        gr.Error: If the image or the audio input is missing.
    """
    if not image_path:
        raise gr.Error("กรุณาใส่ภาพ avatar (หน้าตรงชัด) ก่อน")
    if not audio_path:
        raise gr.Error("กรุณาใส่ไฟล์เสียงพูดก่อน")

    # The pipeline is built on the first call and reused afterwards.
    if PIPE["p"] is None:
        PIPE["p"] = _load_pipe()
    pipe = PIPE["p"]

    # convert() returns an independent image, so the source file can be closed at once.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")
    audio, sampling_rate = load_audio(audio_path)

    target = 480 * 832 if res_area == "480p" else 720 * 1280
    h, w = _snap_size(image, target_area=target)

    # A missing seed value would make int() raise TypeError; use the UI default instead.
    seed_value = 42 if seed is None else int(seed)
    gen = torch.Generator(device="cuda").manual_seed(seed_value)

    kwargs = {}
    if pose_video:
        kwargs["pose_video_path_or_url"] = pose_video
    frames = pipe(
        image=image, audio=audio, sampling_rate=sampling_rate,
        prompt=prompt or "a person talking to the camera, natural expression",
        negative_prompt=(negative_prompt or NEG),
        height=h, width=w,
        num_frames_per_chunk=80,
        num_inference_steps=int(steps),
        guidance_scale=float(guidance),
        generator=gen,
        **kwargs,
    ).frames[0]

    # Only the unique name is needed; close the handle right away instead of leaking it.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        out = tmp.name
    export_to_video(frames, out, fps=16)

    if HAS_MERGE:
        try:
            # Presumably rewrites ``out`` in place: the return value is unused
            # and ``out`` is what gets returned -- TODO confirm against the fork.
            export_to_merged_video_audio(out, audio_path)
        except Exception as e:
            # Best effort: a silent video is still returned if merging fails.
            print("merge audio failed:", e)
    else:
        print("export_to_merged_video_audio is unavailable; returning video without audio")
    return out


# UI definition. Components are created in layout order: a first column with all
# inputs and the generate button, and a second column with the result video.
with gr.Blocks(title="Wan2.2-S2V Avatar", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Wan2.2-S2V Avatar - image + audio -> talking video (lip-sync) - ZeroGPU H200")
    gr.Markdown("ใส่ภาพหน้าตรง + ไฟล์เสียงพูด -> วิดีโอคนพูดขยับปากตามเสียง (รองรับหลายภาษา)")
    with gr.Row():
        with gr.Column():
            # Both components hand file paths (type="filepath") to the callbacks.
            image = gr.Image(type="filepath", label="ภาพ avatar (หน้าตรงชัด คน/การ์ตูน)")
            audio = gr.Audio(type="filepath", label="ไฟล์เสียงพูด (อัปโหลด/อัดเสียง)")
            # Alternative audio source: a link whose audio is fetched by download_audio_url.
            with gr.Row(equal_height=True):
                audio_url = gr.Textbox(placeholder="หรือวางลิงก์ YouTube/TikTok -> ดึงเสียง",
                                       scale=4, container=False, lines=1, max_lines=1)
                audio_url_btn = gr.Button("ดึงเสียง", scale=1, min_width=110)
            # Receives the status message returned by download_audio_url.
            url_status = gr.Markdown("")
            # Optional pose reference; passed to generate() as ``pose_video``.
            pose_video = gr.Video(label="วิดีโออ้างอิงท่าทาง (ไม่บังคับ) - avatar จะขยับตามวิดีโอนี้", height=200)
            # Alternative pose source: a link whose video is fetched by download_video_url.
            with gr.Row(equal_height=True):
                video_url = gr.Textbox(placeholder="หรือวางลิงก์วิดีโอ (YouTube/TikTok) -> ดึงมาเป็นท่าอ้างอิง",
                                       scale=4, container=False, lines=1, max_lines=1)
                video_url_btn = gr.Button("ดึงวิดีโอ", scale=1, min_width=110)
            # Receives the status message returned by download_video_url.
            vid_status = gr.Markdown("")
            prompt = gr.Textbox(label="Prompt (บรรยายฉาก/ท่าทาง - ใช้คู่/แทนวิดีโออ้างอิงได้)",
                                value="a person talking to the camera, natural expression, slight head movement")
            # Advanced settings, collapsed by default (open=False).
            with gr.Accordion("ตั้งค่าขั้นสูง", open=False):
                negative_prompt = gr.Textbox(label="Negative prompt", value=NEG, lines=2)
                res_area = gr.Radio(["480p", "720p"], value="480p", label="ความละเอียด (720p ช้ากว่า)")
                steps = gr.Slider(8, 50, value=20, step=1, label="Steps")
                guidance = gr.Slider(1.0, 8.0, value=4.5, step=0.5, label="Guidance scale")
                seed = gr.Number(value=42, label="Seed", precision=0)
            btn = gr.Button("สร้างวิดีโอ Avatar", variant="primary")
        with gr.Column():
            out_video = gr.Video(label="ผลลัพธ์ (วิดีโอพูด)")
    # Event wiring. The input list follows the positional parameter order of generate().
    btn.click(generate, [image, audio, pose_video, prompt, negative_prompt, steps, guidance, res_area, seed], out_video)
    # Each download callback returns (file path or None, status text), matching its two outputs.
    audio_url_btn.click(download_audio_url, inputs=audio_url, outputs=[audio, url_status])
    video_url_btn.click(download_video_url, inputs=video_url, outputs=[pose_video, vid_status])

# Script entry point: call queue() on the Blocks app, then launch it.
if __name__ == "__main__":
    demo.queue().launch()