File size: 10,262 Bytes
0314079
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0256245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
03937ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0256245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
03937ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0256245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
03937ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0256245
 
 
 
 
 
 
 
 
 
 
 
 
 
03937ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
"""MODE_REGISTRY β€” one Mode entry per generation mode.

Each Mode declares:
- name: short id ("t2v", "i2v", ...)
- label: display name
- icon: single-character or emoji icon for the sidebar
- stage_map: list of Stage(label, share_pct) entries for the status banner
- parameterize_fn: (Gradio inputs dict) -> list[(node_id, widget_index, value)]

The parameterize_fn is the only mode-specific logic. Everything else (workflow
loading, validation, dispatch) is mode-agnostic and lives in workflow.py /
backend.py.

Tasks 11 (T2V + I2V) and 12 (A2V + Lipsync + Keyframe + Style) populate
MODE_REGISTRY; the dataclasses, the per-mode parameterize functions, and the
registrations all live in this module.
"""
from __future__ import annotations

from collections.abc import Callable
from dataclasses import dataclass, field
from typing import Any

# One workflow patch: (node_id, widget_index, value), as described in the
# module docstring.
Patch = tuple[int, int, Any]
# Signature shared by every mode's parameterize function: it receives the
# inputs dict and returns the list of patches to apply.
ParameterizeFn = Callable[[dict[str, Any]], list[Patch]]


@dataclass(frozen=True)
class Stage:
    """One named step of a generation run and its rough share of total time.

    Instances are immutable (``frozen=True``). Per the module docstring, a
    list of stages forms a mode's ``stage_map`` for the status banner.
    """

    label: str  # name of the step
    share_pct: int  # rough share of total time, sums to ~100 across stages


@dataclass(frozen=True)
class Mode:
    """Description of one generation mode, as stored in MODE_REGISTRY.

    The dataclass is frozen, so fields cannot be rebound after construction.
    NOTE: ``frozen=True`` does not make the ``stage_map`` list itself
    immutable; its contents can still be mutated in place.
    """

    name: str  # short id, e.g. "t2v"
    label: str  # display name
    icon: str  # single-character or emoji icon for the sidebar
    parameterize_fn: ParameterizeFn  # maps the inputs dict to a list of patches
    # Stages for the status banner; defaults to a fresh empty list per instance.
    stage_map: list[Stage] = field(default_factory=list)


# Maps a mode's short id to its Mode entry. Filled in by tasks 11–12 via the
# MODE_REGISTRY[...] assignments at the bottom of this module.
MODE_REGISTRY: dict[str, Mode] = {}


# ---------------------------------------------------------------------------
# Node-id constants — captured from workflows/{t2v,i2v}.json on 2026-04-30.
#
# The master workflow uses rgthree's GetNode/SetNode for indirection. SetNodes
# named "pos"/"neg" expose the *outputs* of CLIPTextEncode, not the prompt
# strings. So the canonical place to set the prompt text is the CLIPTextEncode
# node itself.
#
# Width/Height/FPS are INTConstant nodes whose values feed downstream Set_*
# variables.  Clip length comes from an mxSlider (in seconds, then multiplied by
# FPS via a MathExpression to compute frames).  No SetNode for "noise"/seed
# survived the extraction, so seed is intentionally NOT patched here — the
# template's hard-coded value is used until we wire RandomNoise injection in
# Task 12+.
#
# LoRA rows live inside a single Power Lora Loader (rgthree) node whose
# widgets_values is a list of dicts. Patching a specific row requires knowing
# the index, and the canonical mapping (camera_lora value -> row index) belongs
# in models.py once camera-LoRA selection lands. Deferred for now.
# ---------------------------------------------------------------------------

# In the trailing comments, "wv[n]" presumably means index n of the node's
# widgets_values list (the term used in the comment block above) — confirm.
T2V_NODE_PROMPT = 5536            # CLIPTextEncode positive — wv[0] = prompt
T2V_NODE_NEG_PROMPT = 5537        # CLIPTextEncode negative — wv[0] = negative prompt
T2V_NODE_WIDTH = 5383             # INTConstant "Width" — wv[0]
T2V_NODE_HEIGHT = 5382            # INTConstant "Height" — wv[0]
T2V_NODE_FPS = 5445               # INTConstant "FPS" — wv[0]
T2V_NODE_CLIP_LENGTH = 196        # mxSlider "Clip Length ( in seconds )" — wv[0]

# I2V uses the same node ids as T2V for the shared inputs, plus an image loader.
I2V_NODE_PROMPT = 5536
I2V_NODE_NEG_PROMPT = 5537
I2V_NODE_WIDTH = 5383
I2V_NODE_HEIGHT = 5382
I2V_NODE_FPS = 5445
I2V_NODE_CLIP_LENGTH = 196
I2V_NODE_IMAGE = 149              # LoadImage "Load Image1" — wv[0] = filename

# Mode-specific media nodes — captured from workflows/{a2v,lipsync,keyframe,style}.json
# on 2026-04-30. All four templates contain the same node ids for these inputs (the
# Loaders group is shared across modes); only a subset is wired into each mode's
# pipeline.
#
# VHS_LoadAudioUpload and VHS_LoadVideo carry dict-style widgets_values keyed by
# "audio"/"video". The current set_input helper is list-indexed; passing
# widget_index=0 against a dict adds a numeric "0" key without replacing the
# canonical "audio"/"video" entry. The runtime file-path swap is therefore not
# yet wired — Task 12 only validates the patch tuple set. Real path injection
# lands when backend.py grows file-staging in Task 17.

# Audio-to-video: prompt/size/timing nodes plus the audio loader.
A2V_NODE_PROMPT = 5536
A2V_NODE_NEG_PROMPT = 5537
A2V_NODE_WIDTH = 5383
A2V_NODE_HEIGHT = 5382
A2V_NODE_FPS = 5445
A2V_NODE_CLIP_LENGTH = 196
A2V_NODE_AUDIO = 5400             # VHS_LoadAudioUpload — dict wv keyed by "audio"

# Lipsync: prompt/timing nodes plus one image and one audio loader.
LIPSYNC_NODE_PROMPT = 5536
LIPSYNC_NODE_NEG_PROMPT = 5537
LIPSYNC_NODE_FPS = 5445
LIPSYNC_NODE_CLIP_LENGTH = 196
LIPSYNC_NODE_IMAGE = 149          # LoadImage "Load Image1" — wv[0] = filename
LIPSYNC_NODE_AUDIO = 5400         # VHS_LoadAudioUpload — dict wv keyed by "audio"

# Keyframe: prompt/timing nodes plus first- and last-frame image loaders.
KEYFRAME_NODE_PROMPT = 5536
KEYFRAME_NODE_NEG_PROMPT = 5537
KEYFRAME_NODE_FPS = 5445
KEYFRAME_NODE_CLIP_LENGTH = 196
KEYFRAME_NODE_FIRST_FRAME = 149   # LoadImage "Load Image1" — wv[0] = filename
KEYFRAME_NODE_LAST_FRAME = 5437   # LoadImage "Load Image2" — wv[0] = filename

# Style transfer: prompt/timing nodes plus the source-video loader.
STYLE_NODE_PROMPT = 5536
STYLE_NODE_NEG_PROMPT = 5537
STYLE_NODE_FPS = 5445
STYLE_NODE_CLIP_LENGTH = 196
STYLE_NODE_INPUT_VIDEO = 5444     # VHS_LoadVideo — dict wv keyed by "video"


def _frames_to_seconds(frames: int, fps: int) -> int:
    """Convert (frames, fps) to integer seconds for the mxSlider clip-length widget.

    The downstream MathExpression is `a*b+1` (a=seconds, b=fps -> total frames),
    so for a target frame count F at fps R we need seconds = ceil((F - 1) / R).
    Round up so the slider is never short of the requested frames.
    """
    if fps <= 0:
        return 1
    return max(1, -(-(frames - 1) // fps))


def _t2v_parameterize(inp: dict[str, Any]) -> list[Patch]:
    """Build the workflow patches for text-to-video.

    Reads ``prompt``, ``width``, ``height``, ``fps`` and ``frames`` from *inp*
    (a missing key raises ``KeyError``) plus the optional ``negative_prompt``
    (defaults to ""). Every patch targets widget index 0 of its node.
    """
    prompt = inp["prompt"]
    negative = inp.get("negative_prompt", "")
    width = int(inp["width"])
    height = int(inp["height"])
    fps = int(inp["fps"])
    # The clip-length slider is expressed in seconds, not frames.
    seconds = _frames_to_seconds(int(inp["frames"]), fps)
    return [
        (T2V_NODE_PROMPT, 0, prompt),
        (T2V_NODE_NEG_PROMPT, 0, negative),
        (T2V_NODE_WIDTH, 0, width),
        (T2V_NODE_HEIGHT, 0, height),
        (T2V_NODE_FPS, 0, fps),
        (T2V_NODE_CLIP_LENGTH, 0, seconds),
    ]


def _i2v_parameterize(inp: dict[str, Any]) -> list[Patch]:
    """Build the workflow patches for image-to-video.

    Reads ``prompt``, ``image``, ``width``, ``height``, ``fps`` and ``frames``
    from *inp* (a missing key raises ``KeyError``) plus the optional
    ``negative_prompt`` (defaults to ""). Every patch targets widget index 0
    of its node.
    """
    prompt = inp["prompt"]
    negative = inp.get("negative_prompt", "")
    image = inp["image"]
    width = int(inp["width"])
    height = int(inp["height"])
    fps = int(inp["fps"])
    # The clip-length slider is expressed in seconds, not frames.
    seconds = _frames_to_seconds(int(inp["frames"]), fps)
    return [
        (I2V_NODE_PROMPT, 0, prompt),
        (I2V_NODE_NEG_PROMPT, 0, negative),
        (I2V_NODE_IMAGE, 0, image),
        (I2V_NODE_WIDTH, 0, width),
        (I2V_NODE_HEIGHT, 0, height),
        (I2V_NODE_FPS, 0, fps),
        (I2V_NODE_CLIP_LENGTH, 0, seconds),
    ]


def _a2v_parameterize(inp: dict[str, Any]) -> list[Patch]:
    """Build the workflow patches for audio-to-video.

    Reads ``prompt``, ``audio``, ``width``, ``height``, ``fps`` and ``frames``
    from *inp* (a missing key raises ``KeyError``) plus the optional
    ``negative_prompt`` (defaults to ""). Every patch targets widget index 0
    of its node.
    """
    prompt = inp["prompt"]
    negative = inp.get("negative_prompt", "")
    audio = inp["audio"]
    width = int(inp["width"])
    height = int(inp["height"])
    fps = int(inp["fps"])
    # The clip-length slider is expressed in seconds, not frames.
    seconds = _frames_to_seconds(int(inp["frames"]), fps)
    return [
        (A2V_NODE_PROMPT, 0, prompt),
        (A2V_NODE_NEG_PROMPT, 0, negative),
        (A2V_NODE_AUDIO, 0, audio),
        (A2V_NODE_WIDTH, 0, width),
        (A2V_NODE_HEIGHT, 0, height),
        (A2V_NODE_FPS, 0, fps),
        (A2V_NODE_CLIP_LENGTH, 0, seconds),
    ]


def _lipsync_parameterize(inp: dict[str, Any]) -> list[Patch]:
    """Build the workflow patches for lipsync.

    Reads ``prompt``, ``image``, ``audio``, ``fps`` and ``frames`` from *inp*
    (a missing key raises ``KeyError``) plus the optional ``negative_prompt``
    (defaults to ""). Width and height are not patched in this mode. Every
    patch targets widget index 0 of its node.
    """
    prompt = inp["prompt"]
    negative = inp.get("negative_prompt", "")
    image = inp["image"]
    audio = inp["audio"]
    fps = int(inp["fps"])
    # The clip-length slider is expressed in seconds, not frames.
    seconds = _frames_to_seconds(int(inp["frames"]), fps)
    return [
        (LIPSYNC_NODE_PROMPT, 0, prompt),
        (LIPSYNC_NODE_NEG_PROMPT, 0, negative),
        (LIPSYNC_NODE_IMAGE, 0, image),
        (LIPSYNC_NODE_AUDIO, 0, audio),
        (LIPSYNC_NODE_FPS, 0, fps),
        (LIPSYNC_NODE_CLIP_LENGTH, 0, seconds),
    ]


def _keyframe_parameterize(inp: dict[str, Any]) -> list[Patch]:
    """Build the workflow patches for keyframe-to-video.

    Reads ``prompt``, ``first_frame``, ``last_frame``, ``fps`` and ``frames``
    from *inp* (a missing key raises ``KeyError``) plus the optional
    ``negative_prompt`` (defaults to ""). Width and height are not patched in
    this mode. Every patch targets widget index 0 of its node.
    """
    prompt = inp["prompt"]
    negative = inp.get("negative_prompt", "")
    first_frame = inp["first_frame"]
    last_frame = inp["last_frame"]
    fps = int(inp["fps"])
    # The clip-length slider is expressed in seconds, not frames.
    seconds = _frames_to_seconds(int(inp["frames"]), fps)
    return [
        (KEYFRAME_NODE_PROMPT, 0, prompt),
        (KEYFRAME_NODE_NEG_PROMPT, 0, negative),
        (KEYFRAME_NODE_FIRST_FRAME, 0, first_frame),
        (KEYFRAME_NODE_LAST_FRAME, 0, last_frame),
        (KEYFRAME_NODE_FPS, 0, fps),
        (KEYFRAME_NODE_CLIP_LENGTH, 0, seconds),
    ]


def _style_parameterize(inp: dict[str, Any]) -> list[Patch]:
    """Build the workflow patches for style transfer.

    Reads ``prompt``, ``input_video``, ``fps`` and ``frames`` from *inp*
    (a missing key raises ``KeyError``) plus the optional ``negative_prompt``
    (defaults to ""). Width and height are not patched in this mode. Every
    patch targets widget index 0 of its node.
    """
    prompt = inp["prompt"]
    negative = inp.get("negative_prompt", "")
    source_video = inp["input_video"]
    fps = int(inp["fps"])
    # The clip-length slider is expressed in seconds, not frames.
    seconds = _frames_to_seconds(int(inp["frames"]), fps)
    return [
        (STYLE_NODE_PROMPT, 0, prompt),
        (STYLE_NODE_NEG_PROMPT, 0, negative),
        (STYLE_NODE_INPUT_VIDEO, 0, source_video),
        (STYLE_NODE_FPS, 0, fps),
        (STYLE_NODE_CLIP_LENGTH, 0, seconds),
    ]


# Status-banner stages for text-to-video; the shares total 100.
_T2V_STAGES = [
    Stage(label="Encode prompt", share_pct=5),
    Stage(label="Diffusion (Stage 1)", share_pct=60),
    Stage(label="Spatial upscale", share_pct=7),
    Stage(label="Diffusion (Stage 2)", share_pct=18),
    Stage(label="Decode video", share_pct=10),
]

# Status-banner stages for image-to-video; the shares total 100.
_I2V_STAGES = [
    Stage(label="Encode prompt", share_pct=5),
    Stage(label="Encode image", share_pct=3),
    Stage(label="Diffusion (Stage 1)", share_pct=55),
    Stage(label="Spatial upscale", share_pct=7),
    Stage(label="Diffusion (Stage 2)", share_pct=20),
    Stage(label="Decode video", share_pct=10),
]

# Status-banner stages for audio-to-video; the shares total 100.
_A2V_STAGES = [
    Stage(label="Encode prompt", share_pct=5),
    Stage(label="Encode audio", share_pct=5),
    Stage(label="Diffusion (Stage 1)", share_pct=55),
    Stage(label="Spatial upscale", share_pct=7),
    Stage(label="Diffusion (Stage 2)", share_pct=18),
    Stage(label="Decode video", share_pct=10),
]

# Status-banner stages for lipsync; the shares total 100.
_LIPSYNC_STAGES = [
    Stage(label="Encode prompt", share_pct=5),
    Stage(label="Encode image", share_pct=3),
    Stage(label="Encode audio", share_pct=5),
    Stage(label="Diffusion (Stage 1)", share_pct=52),
    Stage(label="Spatial upscale", share_pct=7),
    Stage(label="Diffusion (Stage 2)", share_pct=18),
    Stage(label="Decode video", share_pct=10),
]

# Status-banner stages for keyframe-to-video; the shares total 100.
_KEYFRAME_STAGES = [
    Stage(label="Encode prompt", share_pct=5),
    Stage(label="Encode keyframes", share_pct=5),
    Stage(label="Diffusion (Stage 1)", share_pct=55),
    Stage(label="Spatial upscale", share_pct=7),
    Stage(label="Diffusion (Stage 2)", share_pct=18),
    Stage(label="Decode video", share_pct=10),
]

# Status-banner stages for style transfer; the shares total 100.
_STYLE_STAGES = [
    Stage(label="Encode prompt", share_pct=5),
    Stage(label="Decode source video", share_pct=5),
    Stage(label="Diffusion (Stage 1)", share_pct=55),
    Stage(label="Spatial upscale", share_pct=7),
    Stage(label="Diffusion (Stage 2)", share_pct=18),
    Stage(label="Decode video", share_pct=10),
]

# Register every generation mode under its short id (the key always equals
# Mode.name).
#
# The arrow in the labels and the emoji icons are written as escape sequences
# so this source stays pure ASCII: the previous literals had been corrupted
# into mojibake by a wrong-encoding read of the file, which would surface as
# garbage glyphs in the UI.
MODE_REGISTRY["t2v"] = Mode(
    name="t2v",
    label="Text \u2192 Video",  # U+2192 RIGHTWARDS ARROW
    icon="\U0001F4DD",  # U+1F4DD MEMO
    parameterize_fn=_t2v_parameterize,
    stage_map=_T2V_STAGES,
)
MODE_REGISTRY["i2v"] = Mode(
    name="i2v",
    label="Image \u2192 Video",  # U+2192 RIGHTWARDS ARROW
    icon="\U0001F5BC",  # U+1F5BC FRAME WITH PICTURE
    parameterize_fn=_i2v_parameterize,
    stage_map=_I2V_STAGES,
)
MODE_REGISTRY["a2v"] = Mode(
    name="a2v",
    label="Audio \u2192 Video",  # U+2192 RIGHTWARDS ARROW
    icon="\U0001F3B5",  # U+1F3B5 MUSICAL NOTE
    parameterize_fn=_a2v_parameterize,
    stage_map=_A2V_STAGES,
)
MODE_REGISTRY["lipsync"] = Mode(
    name="lipsync",
    label="Lipsync",
    icon="\U0001F444",  # U+1F444 MOUTH
    parameterize_fn=_lipsync_parameterize,
    stage_map=_LIPSYNC_STAGES,
)
MODE_REGISTRY["keyframe"] = Mode(
    name="keyframe",
    label="Keyframe \u2192 Video",  # U+2192 RIGHTWARDS ARROW
    icon="\U0001F39E",  # U+1F39E FILM FRAMES
    parameterize_fn=_keyframe_parameterize,
    stage_map=_KEYFRAME_STAGES,
)
MODE_REGISTRY["style"] = Mode(
    name="style",
    label="Style Transfer",
    icon="\U0001F3A8",  # U+1F3A8 ARTIST PALETTE
    parameterize_fn=_style_parameterize,
    stage_map=_STYLE_STAGES,
)