mamungtai-sat pormungtai committed on
Commit
a1a4f4b
·
1 Parent(s): 2d2d472

Full-body fix: shot field first, shorter style_prefix (free CLIP tokens), auto-tall canvas 512x832 on full-body intent (sd15) (#32)

Browse files

- Full-body fix: shot field first, shorter style_prefix (free CLIP tokens), auto-tall canvas 512x832 on full-body intent (sd15) (eedbb2557e07ec27d04f6d6ed8f6877ed297862c)


Co-authored-by: pormungtailaw <pormungtai@users.noreply.huggingface.co>

Files changed (3) hide show
  1. app.py +5 -1
  2. models.json +1 -1
  3. pipeline_manager.py +18 -0
app.py CHANGED
@@ -156,6 +156,10 @@ def build_prompt(subject, age, ethnicity, skin, face, body, hair, eyes, outfit,
156
  """Assemble form fields into a prompt. Thai values are fine — the auto-translator
157
  turns the whole thing into English at generate time."""
158
  parts = []
 
 
 
 
159
  who = (subject or "ผู้หญิง").strip()
160
  if ethnicity and ethnicity.strip():
161
  who = f"{ethnicity.strip()} {who}"
@@ -163,7 +167,7 @@ def build_prompt(subject, age, ethnicity, skin, face, body, hair, eyes, outfit,
163
  who = f"{who} อายุ {str(age).strip()} ปี"
164
  parts.append(who)
165
  # skin tone + face shape sit right after "who" (core identity), then the rest.
166
- for v in (skin, face, body, hair, eyes, outfit, pose, expression, scene, lighting, shot):
167
  if v and str(v).strip():
168
  parts.append(str(v).strip())
169
  thai = ", ".join(parts)
 
156
  """Assemble form fields into a prompt. Thai values are fine — the auto-translator
157
  turns the whole thing into English at generate time."""
158
  parts = []
159
+ # Framing/shot goes FIRST so "full body / head to toe" survives CLIP's 77-token cut
160
+ # (it's the key compositional cue; if truncated, SD1.5 defaults to a portrait crop).
161
+ if shot and str(shot).strip():
162
+ parts.append(str(shot).strip())
163
  who = (subject or "ผู้หญิง").strip()
164
  if ethnicity and ethnicity.strip():
165
  who = f"{ethnicity.strip()} {who}"
 
167
  who = f"{who} อายุ {str(age).strip()} ปี"
168
  parts.append(who)
169
  # skin tone + face shape sit right after "who" (core identity), then the rest.
170
+ for v in (skin, face, body, hair, eyes, outfit, pose, expression, scene, lighting):
171
  if v and str(v).strip():
172
  parts.append(str(v).strip())
173
  thai = ", ".join(parts)
models.json CHANGED
@@ -15,7 +15,7 @@
15
  "default_height": 768,
16
  "sampler": "dpmpp_2m_karras",
17
  "vae": "stabilityai/sd-vae-ft-mse",
18
- "style_prefix": "RAW photo, analog film photograph, film grain, detailed skin texture, skin pores, fine hair strands, detailed eyes, catchlight, natural nails",
19
  "hires": { "scale": 1.3, "denoise": 0.3, "steps": 12 },
20
  "neg_embeddings": ["easynegative"],
21
  "recommended_prompt": "RAW photo, (photorealistic:1.2), portrait of a beautiful woman, detailed skin texture, visible skin pores, natural skin, subsurface scattering, film grain, soft cinematic light, 85mm, depth of field, ultra detailed, analog photo",
 
15
  "default_height": 768,
16
  "sampler": "dpmpp_2m_karras",
17
  "vae": "stabilityai/sd-vae-ft-mse",
18
+ "style_prefix": "RAW photo, analog film grain, detailed skin texture, skin pores, real hair strands",
19
  "hires": { "scale": 1.3, "denoise": 0.3, "steps": 12 },
20
  "neg_embeddings": ["easynegative"],
21
  "recommended_prompt": "RAW photo, (photorealistic:1.2), portrait of a beautiful woman, detailed skin texture, visible skin pores, natural skin, subsurface scattering, film grain, soft cinematic light, 85mm, depth of field, ultra detailed, analog photo",
pipeline_manager.py CHANGED
@@ -114,6 +114,19 @@ def has_thai(text):
114
  return any("฀" <= ch <= "๿" for ch in (text or ""))
115
 
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  def _load_translator(engine):
118
  if engine in _TRANSLATOR_CACHE:
119
  return _TRANSLATOR_CACHE[engine]
@@ -473,6 +486,11 @@ def run_generation(cfg, mode, prompt, negative_prompt, ref_image,
473
  _parts = [cfg.get("style_prefix"), cfg.get("trigger"), prompt]
474
  full_prompt = ", ".join(p.strip() for p in _parts if p and str(p).strip()).strip(", ")
475
 
 
 
 
 
 
476
  call = dict(
477
  prompt=full_prompt,
478
  num_inference_steps=int(steps),
 
114
  return any("฀" <= ch <= "๿" for ch in (text or ""))
115
 
116
 
117
+ # Full-body framing cues — if present, SD1.5's 512x768 canvas crops to a portrait,
118
+ # so we give the canvas more vertical room (see run_generation). Checked on the
119
+ # already-translated English prompt; Thai เต็มตัว/ทั้งตัว included as a safety net.
120
+ _FULL_BODY_CUES = ("full body", "full-body", "head to toe", "head-to-toe",
121
+ "full length", "full-length", "full shot", "entire body",
122
+ "whole body", "standing", "เต็มตัว", "ทั้งตัว", "เห็นเท้า")
123
+
124
+
125
+ def wants_full_body(text):
126
+ t = (text or "").lower()
127
+ return any(c in t for c in _FULL_BODY_CUES)
128
+
129
+
130
  def _load_translator(engine):
131
  if engine in _TRANSLATOR_CACHE:
132
  return _TRANSLATOR_CACHE[engine]
 
486
  _parts = [cfg.get("style_prefix"), cfg.get("trigger"), prompt]
487
  full_prompt = ", ".join(p.strip() for p in _parts if p and str(p).strip()).strip(", ")
488
 
489
+ # Full-body framing fix: on SD1.5 a 512x768 canvas crops standing/seated subjects
490
+ # to a portrait even when "full body" is requested. Give it more vertical room.
491
+ if base == "sd15" and mode == "txt2img" and wants_full_body(prompt):
492
+ height = max(int(height), 832)
493
+
494
  call = dict(
495
  prompt=full_prompt,
496
  num_inference_steps=int(steps),