Spaces:
Paused
Paused
| """MODE_REGISTRY — one Mode entry per generation mode. | |
| Each Mode declares: | |
| - name: short id ("t2v", "i2v", ...) | |
| - label: display name | |
| - icon: single-character or emoji icon for the sidebar | |
| - stage_map: list of (label, expected_share_pct) for the status banner | |
| - parameterize_fn: (Gradio inputs dict) -> list[(node_id, widget_index, value)] | |
| The parameterize_fn is the only mode-specific logic. Everything else (workflow | |
| loading, validation, dispatch) is mode-agnostic and lives in workflow.py / | |
| backend.py. | |
| Tasks 11 (T2V + I2V) and 12 (A2V + Lipsync + Keyframe + Style) populate | |
| MODE_REGISTRY. This task only sets up the dataclass and the empty container. | |
| """ | |
| from __future__ import annotations | |
| from collections.abc import Callable | |
| from dataclasses import dataclass, field | |
| from typing import Any | |
| Patch = tuple[int, int, Any] | |
| ParameterizeFn = Callable[[dict[str, Any]], list[Patch]] | |
| class Stage: | |
| label: str | |
| share_pct: int # rough share of total time, sums to ~100 across stages | |
| class Mode: | |
| name: str | |
| label: str | |
| icon: str | |
| parameterize_fn: ParameterizeFn | |
| stage_map: list[Stage] = field(default_factory=list) | |
| # Filled in by tasks 11–12. | |
| MODE_REGISTRY: dict[str, Mode] = {} | |
| # --------------------------------------------------------------------------- | |
| # Node-id constants — captured from workflows/{t2v,i2v}.json on 2026-04-30. | |
| # | |
| # The master workflow uses rgthree's GetNode/SetNode for indirection. SetNodes | |
| # named "pos"/"neg" expose the *outputs* of CLIPTextEncode, not the prompt | |
| # strings. So the canonical place to set the prompt text is the CLIPTextEncode | |
| # node itself. | |
| # | |
| # Width/Height/FPS are INTConstant nodes whose values feed downstream Set_* | |
| # variables. Clip length comes from a mxSlider (in seconds, then multiplied by | |
| # FPS via a MathExpression to compute frames). No SetNode for "noise"/seed | |
| # survived the extraction, so seed is intentionally NOT patched here — the | |
| # template's hard-coded value is used until we wire RandomNoise injection in | |
| # Task 12+. | |
| # | |
| # LoRA rows live inside a single Power Lora Loader (rgthree) node whose | |
| # widgets_values is a list of dicts. Patching a specific row requires knowing | |
| # the index, and the canonical mapping (camera_lora value -> row index) belongs | |
| # in models.py once camera-LoRA selection lands. Deferred for now. | |
| # --------------------------------------------------------------------------- | |
| T2V_NODE_PROMPT = 5536 # CLIPTextEncode positive — wv[0] = prompt | |
| T2V_NODE_NEG_PROMPT = 5537 # CLIPTextEncode negative — wv[0] = negative prompt | |
| T2V_NODE_WIDTH = 5383 # INTConstant "Width" — wv[0] | |
| T2V_NODE_HEIGHT = 5382 # INTConstant "Height" — wv[0] | |
| T2V_NODE_FPS = 5445 # INTConstant "FPS" — wv[0] | |
| T2V_NODE_CLIP_LENGTH = 196 # mxSlider "Clip Length ( in seconds )" — wv[0] | |
| I2V_NODE_PROMPT = 5536 | |
| I2V_NODE_NEG_PROMPT = 5537 | |
| I2V_NODE_WIDTH = 5383 | |
| I2V_NODE_HEIGHT = 5382 | |
| I2V_NODE_FPS = 5445 | |
| I2V_NODE_CLIP_LENGTH = 196 | |
| I2V_NODE_IMAGE = 149 # LoadImage "Load Image1" — wv[0] = filename | |
| # Mode-specific media nodes — captured from workflows/{a2v,lipsync,keyframe,style}.json | |
| # on 2026-04-30. All four templates contain the same node ids for these inputs (the | |
| # Loaders group is shared across modes); only a subset is wired into each mode's | |
| # pipeline. | |
| # | |
| # VHS_LoadAudioUpload and VHS_LoadVideo carry dict-style widgets_values keyed by | |
| # "audio"/"video". The current set_input helper is list-indexed; passing | |
| # widget_index=0 against a dict adds a numeric "0" key without replacing the | |
| # canonical "audio"/"video" entry. The runtime file-path swap is therefore not | |
| # yet wired — Task 12 only validates the patch tuple set. Real path injection | |
| # lands when backend.py grows file-staging in Task 17. | |
| A2V_NODE_PROMPT = 5536 | |
| A2V_NODE_NEG_PROMPT = 5537 | |
| A2V_NODE_WIDTH = 5383 | |
| A2V_NODE_HEIGHT = 5382 | |
| A2V_NODE_FPS = 5445 | |
| A2V_NODE_CLIP_LENGTH = 196 | |
| A2V_NODE_AUDIO = 5400 # VHS_LoadAudioUpload — dict wv keyed by "audio" | |
| LIPSYNC_NODE_PROMPT = 5536 | |
| LIPSYNC_NODE_NEG_PROMPT = 5537 | |
| LIPSYNC_NODE_FPS = 5445 | |
| LIPSYNC_NODE_CLIP_LENGTH = 196 | |
| LIPSYNC_NODE_IMAGE = 149 # LoadImage "Load Image1" — wv[0] = filename | |
| LIPSYNC_NODE_AUDIO = 5400 # VHS_LoadAudioUpload — dict wv keyed by "audio" | |
| KEYFRAME_NODE_PROMPT = 5536 | |
| KEYFRAME_NODE_NEG_PROMPT = 5537 | |
| KEYFRAME_NODE_FPS = 5445 | |
| KEYFRAME_NODE_CLIP_LENGTH = 196 | |
| KEYFRAME_NODE_FIRST_FRAME = 149 # LoadImage "Load Image1" — wv[0] = filename | |
| KEYFRAME_NODE_LAST_FRAME = 5437 # LoadImage "Load Image2" — wv[0] = filename | |
| STYLE_NODE_PROMPT = 5536 | |
| STYLE_NODE_NEG_PROMPT = 5537 | |
| STYLE_NODE_FPS = 5445 | |
| STYLE_NODE_CLIP_LENGTH = 196 | |
| STYLE_NODE_INPUT_VIDEO = 5444 # VHS_LoadVideo — dict wv keyed by "video" | |
| def _frames_to_seconds(frames: int, fps: int) -> int: | |
| """Convert (frames, fps) to integer seconds for the mxSlider clip-length widget. | |
| The downstream MathExpression is `a*b+1` (a=seconds, b=fps -> total frames), | |
| so for a target frame count F at fps R we need seconds = ceil((F - 1) / R). | |
| Round up so the slider is never short of the requested frames. | |
| """ | |
| if fps <= 0: | |
| return 1 | |
| return max(1, -(-(frames - 1) // fps)) | |
| def _t2v_parameterize(inp: dict[str, Any]) -> list[Patch]: | |
| return [ | |
| (T2V_NODE_PROMPT, 0, inp["prompt"]), | |
| (T2V_NODE_NEG_PROMPT, 0, inp.get("negative_prompt", "")), | |
| (T2V_NODE_WIDTH, 0, int(inp["width"])), | |
| (T2V_NODE_HEIGHT, 0, int(inp["height"])), | |
| (T2V_NODE_FPS, 0, int(inp["fps"])), | |
| (T2V_NODE_CLIP_LENGTH, 0, _frames_to_seconds(int(inp["frames"]), int(inp["fps"]))), | |
| ] | |
| def _i2v_parameterize(inp: dict[str, Any]) -> list[Patch]: | |
| return [ | |
| (I2V_NODE_PROMPT, 0, inp["prompt"]), | |
| (I2V_NODE_NEG_PROMPT, 0, inp.get("negative_prompt", "")), | |
| (I2V_NODE_IMAGE, 0, inp["image"]), | |
| (I2V_NODE_WIDTH, 0, int(inp["width"])), | |
| (I2V_NODE_HEIGHT, 0, int(inp["height"])), | |
| (I2V_NODE_FPS, 0, int(inp["fps"])), | |
| (I2V_NODE_CLIP_LENGTH, 0, _frames_to_seconds(int(inp["frames"]), int(inp["fps"]))), | |
| ] | |
| def _a2v_parameterize(inp: dict[str, Any]) -> list[Patch]: | |
| return [ | |
| (A2V_NODE_PROMPT, 0, inp["prompt"]), | |
| (A2V_NODE_NEG_PROMPT, 0, inp.get("negative_prompt", "")), | |
| (A2V_NODE_AUDIO, 0, inp["audio"]), | |
| (A2V_NODE_WIDTH, 0, int(inp["width"])), | |
| (A2V_NODE_HEIGHT, 0, int(inp["height"])), | |
| (A2V_NODE_FPS, 0, int(inp["fps"])), | |
| (A2V_NODE_CLIP_LENGTH, 0, _frames_to_seconds(int(inp["frames"]), int(inp["fps"]))), | |
| ] | |
| def _lipsync_parameterize(inp: dict[str, Any]) -> list[Patch]: | |
| return [ | |
| (LIPSYNC_NODE_PROMPT, 0, inp["prompt"]), | |
| (LIPSYNC_NODE_NEG_PROMPT, 0, inp.get("negative_prompt", "")), | |
| (LIPSYNC_NODE_IMAGE, 0, inp["image"]), | |
| (LIPSYNC_NODE_AUDIO, 0, inp["audio"]), | |
| (LIPSYNC_NODE_FPS, 0, int(inp["fps"])), | |
| (LIPSYNC_NODE_CLIP_LENGTH, 0, _frames_to_seconds(int(inp["frames"]), int(inp["fps"]))), | |
| ] | |
| def _keyframe_parameterize(inp: dict[str, Any]) -> list[Patch]: | |
| return [ | |
| (KEYFRAME_NODE_PROMPT, 0, inp["prompt"]), | |
| (KEYFRAME_NODE_NEG_PROMPT, 0, inp.get("negative_prompt", "")), | |
| (KEYFRAME_NODE_FIRST_FRAME, 0, inp["first_frame"]), | |
| (KEYFRAME_NODE_LAST_FRAME, 0, inp["last_frame"]), | |
| (KEYFRAME_NODE_FPS, 0, int(inp["fps"])), | |
| (KEYFRAME_NODE_CLIP_LENGTH, 0, _frames_to_seconds(int(inp["frames"]), int(inp["fps"]))), | |
| ] | |
| def _style_parameterize(inp: dict[str, Any]) -> list[Patch]: | |
| return [ | |
| (STYLE_NODE_PROMPT, 0, inp["prompt"]), | |
| (STYLE_NODE_NEG_PROMPT, 0, inp.get("negative_prompt", "")), | |
| (STYLE_NODE_INPUT_VIDEO, 0, inp["input_video"]), | |
| (STYLE_NODE_FPS, 0, int(inp["fps"])), | |
| (STYLE_NODE_CLIP_LENGTH, 0, _frames_to_seconds(int(inp["frames"]), int(inp["fps"]))), | |
| ] | |
| _T2V_STAGES = [ | |
| Stage("Encode prompt", 5), | |
| Stage("Diffusion (Stage 1)", 60), | |
| Stage("Spatial upscale", 7), | |
| Stage("Diffusion (Stage 2)", 18), | |
| Stage("Decode video", 10), | |
| ] | |
| _I2V_STAGES = [ | |
| Stage("Encode prompt", 5), | |
| Stage("Encode image", 3), | |
| Stage("Diffusion (Stage 1)", 55), | |
| Stage("Spatial upscale", 7), | |
| Stage("Diffusion (Stage 2)", 20), | |
| Stage("Decode video", 10), | |
| ] | |
| _A2V_STAGES = [ | |
| Stage("Encode prompt", 5), | |
| Stage("Encode audio", 5), | |
| Stage("Diffusion (Stage 1)", 55), | |
| Stage("Spatial upscale", 7), | |
| Stage("Diffusion (Stage 2)", 18), | |
| Stage("Decode video", 10), | |
| ] | |
| _LIPSYNC_STAGES = [ | |
| Stage("Encode prompt", 5), | |
| Stage("Encode image", 3), | |
| Stage("Encode audio", 5), | |
| Stage("Diffusion (Stage 1)", 52), | |
| Stage("Spatial upscale", 7), | |
| Stage("Diffusion (Stage 2)", 18), | |
| Stage("Decode video", 10), | |
| ] | |
| _KEYFRAME_STAGES = [ | |
| Stage("Encode prompt", 5), | |
| Stage("Encode keyframes", 5), | |
| Stage("Diffusion (Stage 1)", 55), | |
| Stage("Spatial upscale", 7), | |
| Stage("Diffusion (Stage 2)", 18), | |
| Stage("Decode video", 10), | |
| ] | |
| _STYLE_STAGES = [ | |
| Stage("Encode prompt", 5), | |
| Stage("Decode source video", 5), | |
| Stage("Diffusion (Stage 1)", 55), | |
| Stage("Spatial upscale", 7), | |
| Stage("Diffusion (Stage 2)", 18), | |
| Stage("Decode video", 10), | |
| ] | |
| MODE_REGISTRY["t2v"] = Mode( | |
| name="t2v", | |
| label="Text → Video", | |
| icon="📝", | |
| parameterize_fn=_t2v_parameterize, | |
| stage_map=_T2V_STAGES, | |
| ) | |
| MODE_REGISTRY["i2v"] = Mode( | |
| name="i2v", | |
| label="Image → Video", | |
| icon="🖼", | |
| parameterize_fn=_i2v_parameterize, | |
| stage_map=_I2V_STAGES, | |
| ) | |
| MODE_REGISTRY["a2v"] = Mode( | |
| name="a2v", | |
| label="Audio → Video", | |
| icon="🎵", | |
| parameterize_fn=_a2v_parameterize, | |
| stage_map=_A2V_STAGES, | |
| ) | |
| MODE_REGISTRY["lipsync"] = Mode( | |
| name="lipsync", | |
| label="Lipsync", | |
| icon="👄", | |
| parameterize_fn=_lipsync_parameterize, | |
| stage_map=_LIPSYNC_STAGES, | |
| ) | |
| MODE_REGISTRY["keyframe"] = Mode( | |
| name="keyframe", | |
| label="Keyframe → Video", | |
| icon="🎞", | |
| parameterize_fn=_keyframe_parameterize, | |
| stage_map=_KEYFRAME_STAGES, | |
| ) | |
| MODE_REGISTRY["style"] = Mode( | |
| name="style", | |
| label="Style Transfer", | |
| icon="🎨", | |
| parameterize_fn=_style_parameterize, | |
| stage_map=_STYLE_STAGES, | |
| ) | |