mamungtai-sat pormungtai committed on
Commit
0446524
·
1 Parent(s): a1a4f4b

Fix scene truncation: front-load scene/lighting/composition in build_prompt + tighter compact-tag Typhoon output (fewer tokens, keep location) (#33)

Browse files

- Fix scene truncation: front-load scene/lighting/composition in build_prompt + tighter compact-tag Typhoon output (fewer tokens, keep location) (84c945dd6150562415f83c5db317982bf6c1a838)


Co-authored-by: pormungtailaw <pormungtai@users.noreply.huggingface.co>

Files changed (2) hide show
  1. app.py +4 -2
  2. pipeline_manager.py +11 -9
app.py CHANGED
@@ -166,8 +166,10 @@ def build_prompt(subject, age, ethnicity, skin, face, body, hair, eyes, outfit,
166
  if age and str(age).strip():
167
  who = f"{who} อายุ {str(age).strip()} ปี"
168
  parts.append(who)
169
- # skin tone + face shape sit right after "who" (core identity), then the rest.
170
- for v in (skin, face, body, hair, eyes, outfit, pose, expression, scene, lighting):
 
 
171
  if v and str(v).strip():
172
  parts.append(str(v).strip())
173
  thai = ", ".join(parts)
 
166
  if age and str(age).strip():
167
  who = f"{who} อายุ {str(age).strip()} ปี"
168
  parts.append(who)
169
+ # Priority order for CLIP's 77-token budget: compositional anchors first
170
+ # (location/lighting/outfit/pose), fine appearance details last (least harmful
171
+ # if truncated). Skin texture realism is carried by the model's style_prefix anyway.
172
+ for v in (scene, lighting, outfit, pose, expression, body, hair, skin, face, eyes):
173
  if v and str(v).strip():
174
  parts.append(str(v).strip())
175
  thai = ", ".join(parts)
pipeline_manager.py CHANGED
@@ -162,15 +162,17 @@ def translate_prompt(text, engine):
162
  return tok.batch_decode(out, skip_special_tokens=True)[0].strip()
163
  # typhoon: ask the LLM to rewrite as a clean English image prompt
164
  msgs = [
165
- {"role": "system", "content": "You convert Thai text-to-image prompts "
166
- "into a single concise, vivid English prompt for a PHOTOREALISTIC Stable "
167
- "Diffusion model. Describe it as a real candid photograph: keep the subject, "
168
- "clothing, pose, and scene, and add realistic photographic detail (natural "
169
- "skin texture and pores, real hair strands, lifelike eyes, soft natural "
170
- "light). NEVER use illustration/painting/anime/CG words such as 'masterpiece', "
171
- "'best quality', 'artstation', 'render', '3d', 'anime' or 'painting'. "
172
- "Output ONLY the English prompt as a comma-separated phrase — no quotes, "
173
- "no explanation."},
 
 
174
  {"role": "user", "content": text},
175
  ]
176
  chat = tok.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False)
 
162
  return tok.batch_decode(out, skip_special_tokens=True)[0].strip()
163
  # typhoon: ask the LLM to rewrite as a clean English image prompt
164
  msgs = [
165
+ {"role": "system", "content": "You convert Thai text-to-image prompts into "
166
+ "an English prompt for a PHOTOREALISTIC Stable Diffusion model. Output a "
167
+ "COMPACT comma-separated list of English tags / short phrases (booru-tag "
168
+ "style) NOT full sentences. Omit articles and filler words (a, an, the, "
169
+ "with, that is). Keep it short to fit a 77-token limit, but INCLUDE EVERY "
170
+ "detail from the input especially the location/scene, camera framing "
171
+ "(e.g. full body), clothing and pose; never drop the setting. Treat it as a "
172
+ "real candid photograph (natural skin texture, real hair, lifelike eyes, "
173
+ "natural light). NEVER use illustration/painting/anime/CG words such as "
174
+ "'masterpiece', 'best quality', 'render', '3d', 'anime' or 'painting'. "
175
+ "Output ONLY the comma-separated tags — no quotes, no explanation."},
176
  {"role": "user", "content": text},
177
  ]
178
  chat = tok.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False)