Spaces:

mamungtai-sat
/

character-studio

Running on Zero

App Files Community

mamungtai-sat

pormungtai committed 26 days ago

Commit

a1a4f4b

1 Parent(s): 2d2d472

Full-body fix: shot field first, shorter style_prefix (free CLIP tokens), auto-tall canvas 512x832 on full-body intent (sd15) (#32)

Browse files

- Full-body fix: shot field first, shorter style_prefix (free CLIP tokens), auto-tall canvas 512x832 on full-body intent (sd15) (eedbb2557e07ec27d04f6d6ed8f6877ed297862c)

Co-authored-by: pormungtailaw <pormungtai@users.noreply.huggingface.co>

Files changed (3) hide show

app.py +5 -1
models.json +1 -1
pipeline_manager.py +18 -0

app.py CHANGED Viewed

@@ -156,6 +156,10 @@ def build_prompt(subject, age, ethnicity, skin, face, body, hair, eyes, outfit,
     """Assemble form fields into a prompt. Thai values are fine — the auto-translator
     turns the whole thing into English at generate time."""
     parts = []
     who = (subject or "ผู้หญิง").strip()
     if ethnicity and ethnicity.strip():
         who = f"{ethnicity.strip()} {who}"
@@ -163,7 +167,7 @@ def build_prompt(subject, age, ethnicity, skin, face, body, hair, eyes, outfit,
         who = f"{who} อายุ {str(age).strip()} ปี"
     parts.append(who)
     # skin tone + face shape sit right after "who" (core identity), then the rest.
-    for v in (skin, face, body, hair, eyes, outfit, pose, expression, scene, lighting, shot):
         if v and str(v).strip():
             parts.append(str(v).strip())
     thai = ", ".join(parts)

     """Assemble form fields into a prompt. Thai values are fine — the auto-translator
     turns the whole thing into English at generate time."""
     parts = []
+    # Framing/shot goes FIRST so "full body / head to toe" survives CLIP's 77-token cut
+    # (it's the key compositional cue; if truncated, SD1.5 defaults to a portrait crop).
+    if shot and str(shot).strip():
+        parts.append(str(shot).strip())
     who = (subject or "ผู้หญิง").strip()
     if ethnicity and ethnicity.strip():
         who = f"{ethnicity.strip()} {who}"
         who = f"{who} อายุ {str(age).strip()} ปี"
     parts.append(who)
     # skin tone + face shape sit right after "who" (core identity), then the rest.
+    for v in (skin, face, body, hair, eyes, outfit, pose, expression, scene, lighting):
         if v and str(v).strip():
             parts.append(str(v).strip())
     thai = ", ".join(parts)

models.json CHANGED Viewed

@@ -15,7 +15,7 @@
       "default_height": 768,
       "sampler": "dpmpp_2m_karras",
       "vae": "stabilityai/sd-vae-ft-mse",
-      "style_prefix": "RAW photo, analog film photograph, film grain, detailed skin texture, skin pores, fine hair strands, detailed eyes, catchlight, natural nails",
       "hires": { "scale": 1.3, "denoise": 0.3, "steps": 12 },
       "neg_embeddings": ["easynegative"],
       "recommended_prompt": "RAW photo, (photorealistic:1.2), portrait of a beautiful woman, detailed skin texture, visible skin pores, natural skin, subsurface scattering, film grain, soft cinematic light, 85mm, depth of field, ultra detailed, analog photo",

       "default_height": 768,
       "sampler": "dpmpp_2m_karras",
       "vae": "stabilityai/sd-vae-ft-mse",
+      "style_prefix": "RAW photo, analog film grain, detailed skin texture, skin pores, real hair strands",
       "hires": { "scale": 1.3, "denoise": 0.3, "steps": 12 },
       "neg_embeddings": ["easynegative"],
       "recommended_prompt": "RAW photo, (photorealistic:1.2), portrait of a beautiful woman, detailed skin texture, visible skin pores, natural skin, subsurface scattering, film grain, soft cinematic light, 85mm, depth of field, ultra detailed, analog photo",

pipeline_manager.py CHANGED Viewed

@@ -114,6 +114,19 @@ def has_thai(text):
     return any("฀" <= ch <= "๿" for ch in (text or ""))
 def _load_translator(engine):
     if engine in _TRANSLATOR_CACHE:
         return _TRANSLATOR_CACHE[engine]
@@ -473,6 +486,11 @@ def run_generation(cfg, mode, prompt, negative_prompt, ref_image,
     _parts = [cfg.get("style_prefix"), cfg.get("trigger"), prompt]
     full_prompt = ", ".join(p.strip() for p in _parts if p and str(p).strip()).strip(", ")
     call = dict(
         prompt=full_prompt,
         num_inference_steps=int(steps),

     return any("฀" <= ch <= "๿" for ch in (text or ""))
import re  # needed for the word-boundary matcher compiled below

# Full-body framing cues. When a prompt contains one of these, run_generation
# gives the SD1.5 txt2img canvas more vertical room (height of at least 832),
# because the default 512x768 canvas tends to crop to a portrait.
# The check runs on the already-translated English prompt; the Thai cues
# (เต็มตัว / ทั้งตัว / เห็นเท้า) are kept as a safety net for untranslated text.
_FULL_BODY_CUES = ("full body", "full-body", "head to toe", "head-to-toe",
                   "full length", "full-length", "full shot", "entire body",
                   "whole body", "standing", "เต็มตัว", "ทั้งตัว", "เห็นเท้า")

# ASCII cues must match as whole words/phrases, otherwise "outstanding" or
# "understanding" would count as "standing" and trigger the taller canvas.
# Thai is written without spaces between words, so a word-boundary anchor would
# never match inside a Thai sentence; Thai cues therefore stay substring matches.
_FULL_BODY_RE = re.compile(
    "|".join(
        rf"\b{re.escape(cue)}\b" if cue.isascii() else re.escape(cue)
        for cue in _FULL_BODY_CUES
    )
)


def wants_full_body(text):
    """Return True if *text* contains a full-body framing cue.

    Matching is case-insensitive (the text is lowercased first). English cues
    from ``_FULL_BODY_CUES`` must appear as whole words or phrases; Thai cues
    match anywhere in the text. ``None`` and the empty string return False.
    """
    return _FULL_BODY_RE.search((text or "").lower()) is not None
 def _load_translator(engine):
     if engine in _TRANSLATOR_CACHE:
         return _TRANSLATOR_CACHE[engine]
     _parts = [cfg.get("style_prefix"), cfg.get("trigger"), prompt]
     full_prompt = ", ".join(p.strip() for p in _parts if p and str(p).strip()).strip(", ")
+    # Full-body framing fix: on SD1.5 a 512x768 canvas crops standing/seated subjects
+    # to a portrait even when "full body" is requested. Give it more vertical room.
+    if base == "sd15" and mode == "txt2img" and wants_full_body(prompt):
+        height = max(int(height), 832)
     call = dict(
         prompt=full_prompt,
         num_inference_steps=int(steps),