Malaji71 committed on
Commit
f987206
·
verified ·
1 Parent(s): 5dbe6d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +109 -455
app.py CHANGED
@@ -1,17 +1,7 @@
1
  import gradio as gr
2
  import torch
3
- from transformers import BlipProcessor, BlipForConditionalGeneration, pipeline
4
- from PIL import Image
5
- import random
6
- import os
7
-
8
- # Instalar dependencias necesarias si no están presentes
9
- try:
10
- import peft
11
- except ImportError:
12
- print("Instalando peft...")
13
- os.system("pip install -q peft")
14
- import peft
15
 
16
  # Check GPU availability
17
  use_gpu = torch.cuda.is_available()
@@ -22,7 +12,7 @@ processor, model, zephyr_generator = None, None, None
22
  def load_models():
23
  """Load models only when needed"""
24
  global processor, model, zephyr_generator
25
- try:
26
  print("Loading BLIP model...")
27
  processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
28
  model = BlipForConditionalGeneration.from_pretrained(
@@ -30,92 +20,40 @@ def load_models():
30
  torch_dtype=torch.float32 # Use float32 for CPU
31
  )
32
  print("βœ… BLIP model loaded successfully!")
33
-
34
- print("Loading SARA-Zephyr adapter model...")
 
 
 
 
 
 
 
 
 
 
35
  try:
36
- # Cargar el modelo base Zephyr primero
37
- from transformers import AutoModelForCausalLM, AutoTokenizer
38
- from peft import PeftModel, PeftConfig
39
-
40
- # Cargar tokenizer del modelo base
41
- tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
42
-
43
- # Cargar modelo base
44
- base_model = AutoModelForCausalLM.from_pretrained(
45
- "HuggingFaceH4/zephyr-7b-beta",
46
- torch_dtype=torch.float32,
47
- device_map="auto" if use_gpu else None
48
- )
49
-
50
- # Cargar configuración del adaptador
51
- try:
52
- # Si está usando un repositorio en HuggingFace
53
- adapter_config = PeftConfig.from_pretrained("Malaji71/SARA-Zephyr")
54
-
55
- # Cargar el adaptador sobre el modelo base
56
- peft_model = PeftModel.from_pretrained(
57
- base_model,
58
- "Malaji71/SARA-Zephyr"
59
- )
60
-
61
- print("βœ… PEFT adapter loaded from HuggingFace!")
62
-
63
- except Exception as e:
64
- print(f"Error loading from HuggingFace: {str(e)}")
65
- print("Trying to load adapter locally...")
66
-
67
- # Intentar cargar localmente si está disponible
68
- local_adapter_path = "./SARA-Zephyr" # Ajustar segΓΊn sea necesario
69
-
70
- try:
71
- adapter_config = PeftConfig.from_pretrained(local_adapter_path)
72
- peft_model = PeftModel.from_pretrained(
73
- base_model,
74
- local_adapter_path
75
- )
76
- print("βœ… PEFT adapter loaded locally!")
77
- except Exception as e2:
78
- print(f"Error loading adapter locally: {str(e2)}")
79
- print("Falling back to base model...")
80
- peft_model = base_model
81
-
82
- # Crear pipeline con el modelo adaptado
83
- zephyr_generator = pipeline(
84
- "text-generation",
85
- model=peft_model,
86
- tokenizer=tokenizer,
87
- torch_dtype=torch.float32
88
- )
89
-
90
- # Verificar que el pipeline se haya creado correctamente
91
- if zephyr_generator is None or not hasattr(zephyr_generator, 'tokenizer'):
92
- raise ValueError("Pipeline creation failed or doesn't have tokenizer attribute")
93
-
94
- print("βœ… SARA-Zephyr adapter model loaded successfully!")
95
- return True
96
-
97
  except Exception as e:
98
- print(f"Error loading SARA-Zephyr adapter: {str(e)}")
99
- print("Falling back to standard Zephyr model...")
100
-
101
- # Modelo de respaldo en caso de error
102
- zephyr_generator = pipeline(
103
- "text-generation",
104
- model="HuggingFaceH4/zephyr-7b-beta",
105
- torch_dtype=torch.float32,
106
- device_map="auto" if use_gpu else None
107
- )
108
-
109
- # Verificar que el pipeline de respaldo se haya creado correctamente
110
- if zephyr_generator is None or not hasattr(zephyr_generator, 'tokenizer'):
111
- raise ValueError("Fallback pipeline creation failed or doesn't have tokenizer attribute")
112
-
113
- print("βœ… Fallback Zephyr model loaded successfully!")
114
- return True
115
-
116
- except Exception as e:
117
- print(f"❌ Critical error loading models: {str(e)}")
118
- return False
119
 
120
  # Universal Video Prompting Guide combining Gen-4 + SARA
121
  unified_instructions = """
@@ -154,17 +92,10 @@ def analyze_image_with_zephyr(image):
154
  return "Please upload an image first.", {}
155
  try:
156
  # Lazy load models
157
- load_success = load_models()
158
- if not load_success:
159
- return "Error: Model loading failed. Please try again later.", {}
160
-
161
- if processor is None or model is None:
162
- return "Error: Image analysis model failed to load. Please try again.", {}
163
-
164
  # Convert to PIL if needed
165
  if not isinstance(image, Image.Image):
166
  image = Image.fromarray(image)
167
-
168
  # Get image dimensions
169
  width, height = image.size
170
  aspect_ratio = width / height
@@ -174,15 +105,12 @@ def analyze_image_with_zephyr(image):
174
  composition = "Vertical portrait shot"
175
  else:
176
  composition = "Balanced composition"
177
-
178
  # Generate caption with BLIP
179
  inputs = processor(image, return_tensors="pt")
180
  out = model.generate(**inputs, max_length=50, num_beams=3)
181
  basic_caption = processor.decode(out[0], skip_special_tokens=True)
182
-
183
  # Use Zephyr for advanced analysis
184
  enhanced_analysis = analyze_scene_with_zephyr(basic_caption, aspect_ratio, composition)
185
-
186
  # Create comprehensive analysis
187
  analysis = f"""πŸ“Š **Image Analysis:**
188
  β€’ **Dimensions**: {width} x {height}
@@ -196,7 +124,6 @@ def analyze_image_with_zephyr(image):
196
  {chr(10).join(f"β€’ {insight}" for insight in enhanced_analysis['motion_insights'])}
197
  🎯 **Recommended Approach**:
198
  {enhanced_analysis['recommended_approach']}"""
199
-
200
  # Scene info for prompt generation
201
  scene_info = {
202
  'basic_description': basic_caption,
@@ -206,32 +133,11 @@ def analyze_image_with_zephyr(image):
206
  }
207
  return analysis, scene_info
208
  except Exception as e:
209
- print(f"Error in analyze_image_with_zephyr: {str(e)}")
210
  return f"Error analyzing image: {str(e)}", {}
211
 
212
  def analyze_scene_with_zephyr(basic_caption, aspect_ratio, composition):
213
- """Use Zephyr with SARA framework for advanced scene analysis"""
214
- # Verificar que el modelo está cargado
215
- if zephyr_generator is None:
216
- # Intenta cargar los modelos si no están cargados
217
- success = load_models()
218
- if not success:
219
- return {
220
- 'scene_interpretation': "Error: Unable to load text generation model.",
221
- 'motion_insights': ["Model loading failed. Please try again."],
222
- 'recommended_approach': "Unable to determine approach due to model loading error."
223
- }
224
-
225
- # Verificar que zephyr_generator tiene el atributo tokenizer
226
- if not hasattr(zephyr_generator, 'tokenizer'):
227
- return {
228
- 'scene_interpretation': "Error: Text generation model is not properly initialized.",
229
- 'motion_insights': ["Model initialization failed. Please restart the application."],
230
- 'recommended_approach': "Unable to determine approach due to model initialization error."
231
- }
232
-
233
- try:
234
- analysis_prompt = f"""<|system|>
235
  You are a video prompt engineering expert specializing in the SARA framework. Analyze this image description for video creation potential.
236
  <|user|>
237
  Image description: "{basic_caption}"
@@ -244,82 +150,34 @@ Please provide:
244
  4. Best prompting approach (SARA vs Gen-4)
245
  Be concise and practical.
246
  <|assistant|>"""
247
-
248
- response = zephyr_generator(
249
- analysis_prompt,
250
- max_new_tokens=200,
251
- do_sample=True,
252
- temperature=0.7,
253
- top_k=50,
254
- top_p=0.95
255
- )
256
-
257
- # Extract generated text
258
- if isinstance(response, list) and len(response) > 0 and "generated_text" in response[0]:
259
- generated_text = response[0]["generated_text"]
260
- # Extraer solo la respuesta del asistente
261
- if "<|assistant|>" in generated_text:
262
- ai_analysis = generated_text.split("<|assistant|>")[-1].strip()
263
- else:
264
- # Intentar extraer la ΓΊltima parte del texto si no encontramos la etiqueta
265
- ai_analysis = generated_text.split(analysis_prompt)[-1].strip()
266
-
267
- lines = ai_analysis.split('\n')
268
- motion_insights = []
269
- recommended_approach = "SARA framework recommended for precise control"
270
-
271
- for line in lines:
272
- if line.strip():
273
- if any(keyword in line.lower() for keyword in ['motion', 'movement', 'camera', 'lighting']):
274
- motion_insights.append(line.strip('- ').strip())
275
- elif 'sara' in line.lower() or 'gen-4' in line.lower():
276
- recommended_approach = line.strip('- ').strip()
277
-
278
- return {
279
- 'scene_interpretation': ai_analysis.split('\n')[0] if ai_analysis else "Scene analysis completed",
280
- 'motion_insights': motion_insights[:6] if motion_insights else ["Smooth cinematic movement", "Steady camera tracking", "Natural lighting transitions"],
281
- 'recommended_approach': recommended_approach
282
- }
283
- else:
284
- return {
285
- 'scene_interpretation': "Unable to generate analysis with current model.",
286
- 'motion_insights': ["Default: Smooth motion", "Default: Stable camera work", "Default: Natural lighting"],
287
- 'recommended_approach': "SARA framework recommended as default"
288
- }
289
-
290
- except Exception as e:
291
- print(f"Error in analyze_scene_with_zephyr: {str(e)}")
292
- return {
293
- 'scene_interpretation': f"Error analyzing scene: {str(e)}",
294
- 'motion_insights': ["Error occurred during analysis", "Using default recommendations", "Try simplifying the image"],
295
- 'recommended_approach': "SARA framework recommended (default)"
296
- }
297
 
298
  def generate_sample_prompts_with_zephyr(scene_info=None):
299
- """Generate sample prompts using Zephyr with SARA framework"""
300
- # Verificar que el modelo estΓ‘ cargado
301
- if zephyr_generator is None:
302
- # Intenta cargar los modelos si no estΓ‘n cargados
303
- success = load_models()
304
- if not success:
305
- return [
306
- "Error: Unable to load text generation model. Please try again.",
307
- "Default prompt: The subject walks forward smoothly while the background remains steady, cinematic atmosphere.",
308
- "Default prompt: A dramatic close-up captures the subject's expression as they speak directly to the camera."
309
- ]
310
-
311
- # Verificar que zephyr_generator tiene el atributo tokenizer
312
- if not hasattr(zephyr_generator, 'tokenizer'):
313
- return [
314
- "Error: Text generation model is not properly initialized. Please restart the application.",
315
- "Default prompt: The subject walks forward smoothly while the background remains steady, cinematic atmosphere.",
316
- "Default prompt: A dramatic close-up captures the subject's expression as they speak directly to the camera."
317
- ]
318
-
319
  if scene_info and scene_info.get('basic_description'):
320
- try:
321
- # Use Zephyr to generate contextual prompts
322
- context_prompt = f"""<|system|>
323
  Generate 3 professional video prompts using the SARA framework based on this image analysis.
324
  <|user|>
325
  Image description: {scene_info['basic_description']}
@@ -327,35 +185,19 @@ Composition: {scene_info.get('composition', 'Balanced')}
327
  Aspect Ratio: {scene_info.get('aspect_ratio', 'N/A'):.2f}
328
  Remember the SARA framework: Subject + Action + Reference + Atmosphere
329
  <|assistant|>"""
330
-
331
- response = zephyr_generator(
332
- context_prompt,
333
- max_new_tokens=200,
334
- do_sample=True,
335
- temperature=0.8,
336
- top_k=50,
337
- top_p=0.95
338
- )
339
-
340
- # Extract generated text
341
- if isinstance(response, list) and len(response) > 0 and "generated_text" in response[0]:
342
- generated_text = response[0]["generated_text"]
343
- # Extraer solo la respuesta del asistente
344
- if "<|assistant|>" in generated_text:
345
- prompts_text = generated_text.split("<|assistant|>")[-1].strip()
346
- else:
347
- # Intentar extraer la ΓΊltima parte del texto si no encontramos la etiqueta
348
- prompts_text = generated_text.split(context_prompt)[-1].strip()
349
-
350
- # Extract and clean prompts
351
- prompts = [p.strip('123.-β€’ ') for p in prompts_text.split('\n') if p.strip()]
352
- # Return first 3 clean prompts
353
- if len(prompts) >= 3:
354
- return prompts[:3]
355
- except Exception as e:
356
- print(f"Error in generate_sample_prompts_with_zephyr: {str(e)}")
357
- # Continue to fallback prompts if there's an error
358
-
359
  # Fallback prompts if Zephyr fails or no scene info
360
  base_prompts = [
361
  "The subject walks forward smoothly while the background remains steady, cinematic atmosphere.",
@@ -365,30 +207,15 @@ Remember the SARA framework: Subject + Action + Reference + Atmosphere
365
  return base_prompts
366
 
367
  def optimize_user_prompt_with_zephyr(user_idea, scene_info=None):
368
- """Optimize user's prompt idea using SARA framework with Zephyr model"""
369
  if not user_idea.strip():
370
- return "Please enter your idea first.", "No input provided"
371
-
372
- # Verificar que el modelo estΓ‘ cargado
373
- if zephyr_generator is None:
374
- # Intenta cargar los modelos si no estΓ‘n cargados
375
- success = load_models()
376
- if not success:
377
- return "Error: Unable to load text generation model. Please try again or use Retry button.", "Model loading failed"
378
-
379
- # Verificar que zephyr_generator tiene el atributo tokenizer
380
- if not hasattr(zephyr_generator, 'tokenizer'):
381
- return ("Error: Text generation model is not properly initialized. Please restart the application or use Retry button.",
382
- "Model initialization failed")
383
-
384
  # Create context from scene if available
385
  context = ""
386
  if scene_info and scene_info.get('basic_description'):
387
  context = f"Image context: {scene_info['basic_description']}"
388
-
389
- try:
390
- # Enforce structure based on approach
391
- optimization_prompt = f"""<|system|>
392
  You are an expert in video prompting, specializing in the SARA framework. Transform user ideas into professional prompts compatible with AI video models like Sora, Gen-4, Pika, Runway, and Luma.
393
  Key principles:
394
  - Focus on MOTION, not static description
@@ -401,140 +228,27 @@ User's idea: "{user_idea}"
401
  {context}
402
  Please create an optimized video prompt using the SARA framework. Respond with just the prompt.
403
  <|assistant|>"""
404
-
405
- response = zephyr_generator(
406
- optimization_prompt,
407
- max_new_tokens=100,
408
- do_sample=True,
409
- temperature=0.7,
410
- top_k=50,
411
- top_p=0.95
412
- )
413
-
414
- # Extract optimized prompt
415
- if isinstance(response, list) and len(response) > 0 and "generated_text" in response[0]:
416
- generated_text = response[0]["generated_text"]
417
- # Extraer solo la respuesta del asistente
418
- if "<|assistant|>" in generated_text:
419
- optimized = generated_text.split("<|assistant|>")[-1].strip()
420
- else:
421
- # Intentar extraer la ΓΊltima parte del texto si no encontramos la etiqueta
422
- optimized = generated_text.split(optimization_prompt)[-1].strip()
423
- return optimized, "SARA-Zephyr model used successfully"
424
- else:
425
- return ("Error processing your idea. Please try again with a different description or use Retry button.",
426
- "Invalid model response format")
427
- except Exception as e:
428
- print(f"Error in optimize_user_prompt_with_zephyr: {str(e)}")
429
- return (f"Error generating prompt: {str(e)}. Please try again with a simpler description or use Retry button.",
430
- f"Error: {str(e)}")
431
-
432
- def fallback_generate_prompt(user_idea, scene_info=None):
433
- """FunciΓ³n de respaldo para generar prompts cuando el modelo principal falla"""
434
- if not user_idea.strip():
435
- return "Please enter your idea first."
436
-
437
- try:
438
- # Crear un generador de respaldo especΓ­fico para esta funciΓ³n
439
- from transformers import pipeline
440
- import torch
441
-
442
- fallback_generator = pipeline(
443
- "text-generation",
444
- model="HuggingFaceH4/zephyr-7b-beta",
445
- torch_dtype=torch.float32,
446
- device_map="auto" if torch.cuda.is_available() else None
447
- )
448
-
449
- # Create context from scene if available
450
- context = ""
451
- if scene_info and scene_info.get('basic_description'):
452
- context = f"Image context: {scene_info['basic_description']}"
453
-
454
- # Enforce structure based on approach
455
- optimization_prompt = f"""<|system|>
456
- You are an expert in video prompting, specializing in the SARA framework. Transform user ideas into professional prompts compatible with AI video models like Sora, Gen-4, Pika, Runway, and Luma.
457
- Key principles:
458
- - Focus on MOTION, not static description
459
- - Use positive phrasing
460
- - Be specific about camera work
461
- - Include lighting/atmosphere details
462
- - Follow the SARA structure: Subject + Action + Reference + Atmosphere
463
- <|user|>
464
- User's idea: "{user_idea}"
465
- {context}
466
- Please create an optimized video prompt using the SARA framework. Respond with just the prompt.
467
- <|assistant|>"""
468
-
469
- response = fallback_generator(
470
- optimization_prompt,
471
- max_new_tokens=100,
472
- do_sample=True,
473
- temperature=0.7,
474
- top_k=50,
475
- top_p=0.95
476
- )
477
-
478
- # Extract optimized prompt
479
- if isinstance(response, list) and len(response) > 0 and "generated_text" in response[0]:
480
- generated_text = response[0]["generated_text"]
481
- # Extraer solo la respuesta del asistente
482
- if "<|assistant|>" in generated_text:
483
- optimized = generated_text.split("<|assistant|>")[-1].strip()
484
- else:
485
- # Intentar extraer la ΓΊltima parte del texto si no encontramos la etiqueta
486
- optimized = generated_text.split(optimization_prompt)[-1].strip()
487
- return optimized
488
- else:
489
- return "Error processing your idea with the fallback model. Here's a template: Subject walks smoothly while camera remains steady, cinematic atmosphere."
490
-
491
- except Exception as e:
492
- print(f"Error in fallback_generate_prompt: {str(e)}")
493
- # GeneraciΓ³n manual de respaldo en caso de error total
494
- words = user_idea.strip().split()
495
- if len(words) > 2:
496
- subject = "The subject"
497
- if "man" in words or "boy" in words:
498
- subject = "The man"
499
- elif "woman" in words or "girl" in words:
500
- subject = "The woman"
501
- elif "child" in words or "kid" in words:
502
- subject = "The child"
503
-
504
- action = "moves naturally"
505
- for verb in ["walk", "run", "jump", "sit", "stand", "dance", "move", "turn"]:
506
- if any(verb in word.lower() for word in words):
507
- action = verb + "s smoothly"
508
- break
509
-
510
- return f"{subject} {action} while camera remains steady, cinematic atmosphere."
511
- else:
512
- return "The subject moves naturally while camera remains steady, cinematic atmosphere."
513
 
514
  def refine_prompt_with_zephyr(current_prompt, feedback, chat_history, scene_info=None):
515
- """Refine a prompt based on user feedback using Zephyr with SARA framework"""
516
  if not feedback.strip():
517
  return current_prompt, chat_history
518
-
519
- # Verificar que el modelo estΓ‘ cargado
520
- if zephyr_generator is None:
521
- # Intenta cargar los modelos si no estΓ‘n cargados
522
- success = load_models()
523
- if not success:
524
- return "Error: Unable to load text generation model. Please try again.", chat_history
525
-
526
- # Verificar que zephyr_generator tiene el atributo tokenizer
527
- if not hasattr(zephyr_generator, 'tokenizer'):
528
- return "Error: Text generation model is not properly initialized. Please restart the application.", chat_history
529
-
530
  # Create refinement context
531
  context = ""
532
  if scene_info and scene_info.get('basic_description'):
533
  context = f"Image context: {scene_info['basic_description']}"
534
-
535
- try:
536
- # Construct Zephyr refinement prompt
537
- refinement_prompt = f"""<|system|>
538
  You are an expert in refining video prompts using the SARA framework. Based on the user's feedback, improve the current prompt while maintaining its core structure.
539
  Key principles:
540
  - Focus on MOTION, not static description
@@ -548,35 +262,18 @@ Feedback: "{feedback}"
548
  {context}
549
  Please refine the prompt while keeping it under 100 words. Respond with just the refined prompt.
550
  <|assistant|>"""
551
-
552
- response = zephyr_generator(
553
- refinement_prompt,
554
- max_new_tokens=100,
555
- do_sample=True,
556
- temperature=0.7,
557
- top_k=50,
558
- top_p=0.95
559
- )
560
-
561
- # Extract refined prompt
562
- if isinstance(response, list) and len(response) > 0 and "generated_text" in response[0]:
563
- generated_text = response[0]["generated_text"]
564
- # Extraer solo la respuesta del asistente
565
- if "<|assistant|>" in generated_text:
566
- refined = generated_text.split("<|assistant|>")[-1].strip()
567
- else:
568
- # Intentar extraer la ΓΊltima parte del texto si no encontramos la etiqueta
569
- refined = generated_text.split(refinement_prompt)[-1].strip()
570
-
571
- # Update chat history
572
- new_chat_history = chat_history + [[feedback, refined]]
573
- return refined, new_chat_history
574
- else:
575
- return current_prompt, chat_history
576
-
577
- except Exception as e:
578
- print(f"Error in refine_prompt_with_zephyr: {str(e)}")
579
- return f"Error refining prompt: {str(e)}. Please try again with a simpler request.", chat_history
580
 
581
  def generate_gen4_prompts(scene_info, foundation=""):
582
  """Generate Gen-4 style prompts iteratively"""
@@ -592,7 +289,6 @@ def generate_gen4_prompts(scene_info, foundation=""):
592
  subject = "The person"
593
  else:
594
  subject = "The subject"
595
-
596
  # Generate actions based on scene
597
  if any(word in description.lower() for word in ['sitting', 'seated']):
598
  actions = ['speaks to camera', 'gestures while seated', 'leans forward', 'adjusts posture']
@@ -600,14 +296,11 @@ def generate_gen4_prompts(scene_info, foundation=""):
600
  actions = ['speaks directly', 'gestures naturally', 'shifts weight', 'looks around']
601
  else:
602
  actions = ['moves forward', 'turns slightly', 'gestures', 'demonstrates']
603
-
604
  action = random.choice(actions)
605
-
606
  # Build Gen-4 iteratively
607
  basic = f"{subject} {action}"
608
  with_motion = f"{basic} smoothly"
609
  with_camera = f"{with_motion}. Camera captures steadily"
610
-
611
  # Add style based on composition
612
  composition = scene_info.get('composition', '')
613
  if 'Wide' in composition:
@@ -616,9 +309,7 @@ def generate_gen4_prompts(scene_info, foundation=""):
616
  style_addition = "Intimate portrait lighting"
617
  else:
618
  style_addition = "Professional documentary style"
619
-
620
  with_style = f"{with_camera}. {style_addition}."
621
-
622
  return f"""πŸš€ **Gen-4 Iterative Building:**
623
  **Basic**: {basic}
624
  **+ Motion**: {with_motion}
@@ -640,7 +331,6 @@ def build_custom_prompt(foundation, subject_motion, scene_motion, camera_motion,
640
  parts = []
641
  if foundation:
642
  parts.append(foundation)
643
-
644
  # Add motion elements
645
  motion_parts = []
646
  if subject_motion:
@@ -649,17 +339,14 @@ def build_custom_prompt(foundation, subject_motion, scene_motion, camera_motion,
649
  motion_parts.extend(scene_motion)
650
  if motion_parts:
651
  parts.append(", ".join(motion_parts))
652
-
653
  # Reference (camera stability)
654
  if camera_motion:
655
  parts.append(f"while {camera_motion}")
656
  else:
657
  parts.append("while background remains steady")
658
-
659
  # Atmosphere
660
  if style:
661
  parts.append(style)
662
-
663
  return " ".join(parts)
664
  else: # Gen-4 style
665
  # Gen-4 Structure: Simple iterative building
@@ -674,28 +361,18 @@ def build_custom_prompt(foundation, subject_motion, scene_motion, camera_motion,
674
  parts.extend(scene_motion)
675
  if style:
676
  parts.append(style)
677
-
678
  return ". ".join(parts) if parts else "The subject moves naturally"
679
 
680
  # Create the Gradio interface
681
  def create_interface():
682
  """Create the Gradio interface"""
683
- # AsegΓΊrate de cargar los modelos antes de crear la interfaz
684
- try:
685
- load_models()
686
- except Exception as e:
687
- print(f"⚠️ Warning: Initial model loading failed: {str(e)}")
688
- print("Models will be loaded on demand.")
689
-
690
  with gr.Blocks(theme=gr.themes.Soft(), title="AI Video Prompt Generator") as demo:
691
  # Header
692
- gr.Markdown("# 🎬 AI Video Prompt Generator - πŸ€– SARA Framework Powered")
693
  gr.Markdown("*Professional prompts for Sora, Gen-4, Pika, Luma, Runway and more*")
694
-
695
  # State variables
696
  scene_state = gr.State({})
697
  chat_history_state = gr.State([])
698
-
699
  with gr.Tabs():
700
  # Tab 1: Learning Guide
701
  with gr.Tab("πŸ“š Prompting Guide"):
@@ -713,7 +390,6 @@ def create_interface():
713
  - **Camera Motion**: Pan, tilt, dolly, zoom, orbit, tracking
714
  - **Environmental**: Wind, water flow, particle effects, lighting changes
715
  """)
716
-
717
  # Tab 2: Image Analysis
718
  with gr.Tab("πŸ“· Image Analysis"):
719
  with gr.Row():
@@ -725,7 +401,6 @@ def create_interface():
725
  analyze_btn = gr.Button("πŸ” Analyze Image", variant="primary")
726
  with gr.Column(scale=2):
727
  analysis_output = gr.Markdown(label="AI Analysis Results")
728
-
729
  # Sample prompts section
730
  with gr.Group():
731
  gr.Markdown("### πŸ’‘ Sample Prompts")
@@ -739,7 +414,6 @@ def create_interface():
739
  )
740
  for i in range(3)
741
  ]
742
-
743
  # Tab 3: AI Prompt Generator
744
  with gr.Tab("πŸ€– AI Prompt Generator"):
745
  with gr.Row():
@@ -750,13 +424,6 @@ def create_interface():
750
  lines=3
751
  )
752
  optimize_btn = gr.Button("πŸš€ Generate Optimized Prompt", variant="primary")
753
- with gr.Row():
754
- retry_btn = gr.Button("πŸ”„ Retry with Default Model", variant="secondary")
755
- model_status = gr.Textbox(
756
- label="Model Status",
757
- value="",
758
- interactive=False
759
- )
760
  optimized_prompt = gr.Textbox(
761
  label="AI-Optimized Video Prompt",
762
  lines=4,
@@ -774,7 +441,6 @@ def create_interface():
774
  # Chat history
775
  with gr.Accordion("πŸ’¬ Refinement History", open=False):
776
  chat_display = gr.Chatbot(height=300, type='messages')
777
-
778
  # Tab 4: Gen-4 Method
779
  with gr.Tab("πŸ“ Gen-4 Official"):
780
  gr.Markdown("*Official Gen-4 method: Simple β†’ Complex building*")
@@ -791,7 +457,6 @@ def create_interface():
791
  interactive=False,
792
  show_copy_button=True
793
  )
794
-
795
  # Tab 5: Custom Builder
796
  with gr.Tab("πŸ› οΈ Custom Builder"):
797
  gr.Markdown("## Build Your Custom Prompt")
@@ -834,7 +499,6 @@ def create_interface():
834
  interactive=True,
835
  show_copy_button=True
836
  )
837
-
838
  # Event handlers
839
  analyze_btn.click(
840
  fn=analyze_image_with_zephyr,
@@ -849,12 +513,7 @@ def create_interface():
849
  optimize_btn.click(
850
  fn=optimize_user_prompt_with_zephyr,
851
  inputs=[user_idea, scene_state],
852
- outputs=[optimized_prompt, model_status]
853
- )
854
- retry_btn.click(
855
- fn=lambda idea, scene_info: (fallback_generate_prompt(idea, scene_info), "Using default model"),
856
- inputs=[user_idea, scene_state],
857
- outputs=[optimized_prompt, model_status]
858
  )
859
  refine_btn.click(
860
  fn=refine_prompt_with_zephyr,
@@ -881,7 +540,7 @@ def create_interface():
881
 
882
  # Launch the app
883
  if __name__ == "__main__":
884
- print("🎬 Starting AI Video Prompt Generator with SARA LORA Adapter...")
885
  print(f"πŸ“Š Status: {'GPU' if use_gpu else 'CPU'} Mode Enabled")
886
  print("πŸ”§ Loading models (this may take a few minutes)...")
887
  try:
@@ -899,15 +558,10 @@ if __name__ == "__main__":
899
  print(f"❌ Error launching app: {e}")
900
  print("πŸ”§ Make sure you have sufficient CPU resources and all dependencies installed.")
901
  print("πŸ“¦ Required packages:")
902
- print(" pip install torch transformers gradio pillow accelerate bitsandbytes peft")
903
  # Alternative launch attempt
904
  print("\nπŸ”„ Attempting alternative launch...")
905
  try:
906
- # Intenta instalar las dependencias necesarias
907
- import subprocess
908
- print("πŸ”„ Installing/updating necessary dependencies...")
909
- subprocess.call(["pip", "install", "-U", "transformers", "accelerate", "peft", "huggingface_hub"])
910
-
911
  demo = create_interface()
912
  demo.launch(
913
  share=False,
 
1
  import gradio as gr
2
  import torch
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
4
+ from peft import PeftModel
 
 
 
 
 
 
 
 
 
 
5
 
6
  # Check GPU availability
7
  use_gpu = torch.cuda.is_available()
 
12
  def load_models():
13
  """Load models only when needed"""
14
  global processor, model, zephyr_generator
15
+ if processor is None or model is None or zephyr_generator is None:
16
  print("Loading BLIP model...")
17
  processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
18
  model = BlipForConditionalGeneration.from_pretrained(
 
20
  torch_dtype=torch.float32 # Use float32 for CPU
21
  )
22
  print("βœ… BLIP model loaded successfully!")
23
+ print("Loading SARA-Zephyr fine-tuned model...")
24
+
25
+ # Load base model
26
+ base_model_id = "HuggingFaceH4/zephyr-7b-beta"
27
+ base_model = AutoModelForCausalLM.from_pretrained(
28
+ base_model_id,
29
+ torch_dtype=torch.float16 if use_gpu else torch.float32, # Use float16 for GPU
30
+ device_map="auto" if use_gpu else None
31
+ )
32
+
33
+ # Apply LoRA adapters
34
+ lora_model_id = "Malaji71/SARA-Zephyr"
35
  try:
36
+ model_with_lora = PeftModel.from_pretrained(base_model, lora_model_id)
37
+ print("βœ… LoRA adapters applied successfully.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  except Exception as e:
39
+ print(f"❌ Error applying LoRA adapters: {str(e)}")
40
+ raise ValueError("Failed to apply LoRA adapters.")
41
+
42
+ # Load tokenizer
43
+ tokenizer = AutoTokenizer.from_pretrained(base_model_id)
44
+
45
+ # Create pipeline for text generation
46
+ zephyr_generator = pipeline(
47
+ "text-generation",
48
+ model=model_with_lora,
49
+ tokenizer=tokenizer,
50
+ max_new_tokens=128,
51
+ temperature=0.7,
52
+ top_p=0.95,
53
+ repetition_penalty=1.15,
54
+ device_map="auto" if use_gpu else None
55
+ )
56
+ print("βœ… SARA-Zephyr fine-tuned model loaded successfully!")
 
 
 
57
 
58
  # Universal Video Prompting Guide combining Gen-4 + SARA
59
  unified_instructions = """
 
92
  return "Please upload an image first.", {}
93
  try:
94
  # Lazy load models
95
+ load_models()
 
 
 
 
 
 
96
  # Convert to PIL if needed
97
  if not isinstance(image, Image.Image):
98
  image = Image.fromarray(image)
 
99
  # Get image dimensions
100
  width, height = image.size
101
  aspect_ratio = width / height
 
105
  composition = "Vertical portrait shot"
106
  else:
107
  composition = "Balanced composition"
 
108
  # Generate caption with BLIP
109
  inputs = processor(image, return_tensors="pt")
110
  out = model.generate(**inputs, max_length=50, num_beams=3)
111
  basic_caption = processor.decode(out[0], skip_special_tokens=True)
 
112
  # Use Zephyr for advanced analysis
113
  enhanced_analysis = analyze_scene_with_zephyr(basic_caption, aspect_ratio, composition)
 
114
  # Create comprehensive analysis
115
  analysis = f"""πŸ“Š **Image Analysis:**
116
  β€’ **Dimensions**: {width} x {height}
 
124
  {chr(10).join(f"β€’ {insight}" for insight in enhanced_analysis['motion_insights'])}
125
  🎯 **Recommended Approach**:
126
  {enhanced_analysis['recommended_approach']}"""
 
127
  # Scene info for prompt generation
128
  scene_info = {
129
  'basic_description': basic_caption,
 
133
  }
134
  return analysis, scene_info
135
  except Exception as e:
 
136
  return f"Error analyzing image: {str(e)}", {}
137
 
138
  def analyze_scene_with_zephyr(basic_caption, aspect_ratio, composition):
139
+ """Use SARA-Zephyr for advanced scene analysis"""
140
+ analysis_prompt = f"""<|system|>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  You are a video prompt engineering expert specializing in the SARA framework. Analyze this image description for video creation potential.
142
  <|user|>
143
  Image description: "{basic_caption}"
 
150
  4. Best prompting approach (SARA vs Gen-4)
151
  Be concise and practical.
152
  <|assistant|>"""
153
+ response = zephyr_generator(
154
+ analysis_prompt,
155
+ max_new_tokens=200,
156
+ do_sample=True,
157
+ temperature=0.7,
158
+ pad_token_id=zephyr_generator.tokenizer.eos_token_id
159
+ )
160
+ ai_analysis = response[0]['generated_text'].split("<|assistant|>")[-1].strip()
161
+ lines = ai_analysis.split('\n')
162
+ motion_insights = []
163
+ recommended_approach = "SARA framework recommended for precise control"
164
+ for line in lines:
165
+ if line.strip():
166
+ if any(keyword in line.lower() for keyword in ['motion', 'movement', 'camera', 'lighting']):
167
+ motion_insights.append(line.strip('- ').strip())
168
+ elif 'sara' in line.lower() or 'gen-4' in line.lower():
169
+ recommended_approach = line.strip('- ').strip()
170
+ return {
171
+ 'scene_interpretation': ai_analysis.split('\n')[0] if ai_analysis else "Scene analysis completed",
172
+ 'motion_insights': motion_insights[:6],
173
+ 'recommended_approach': recommended_approach
174
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
  def generate_sample_prompts_with_zephyr(scene_info=None):
177
+ """Generate sample prompts using SARA-Zephyr"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  if scene_info and scene_info.get('basic_description'):
179
+ # Use Zephyr to generate contextual prompts
180
+ context_prompt = f"""<|system|>
 
181
  Generate 3 professional video prompts using the SARA framework based on this image analysis.
182
  <|user|>
183
  Image description: {scene_info['basic_description']}
 
185
  Aspect Ratio: {scene_info.get('aspect_ratio', 'N/A'):.2f}
186
  Remember the SARA framework: Subject + Action + Reference + Atmosphere
187
  <|assistant|>"""
188
+ response = zephyr_generator(
189
+ context_prompt,
190
+ max_new_tokens=200,
191
+ do_sample=True,
192
+ temperature=0.8,
193
+ pad_token_id=zephyr_generator.tokenizer.eos_token_id
194
+ )
195
+ # Extract and clean prompts
196
+ prompts_text = response[0]['generated_text'].split("<|assistant|>")[-1].strip()
197
+ prompts = [p.strip('123.-β€’ ') for p in prompts_text.split('\n') if p.strip()]
198
+ # Return first 3 clean prompts
199
+ if len(prompts) >= 3:
200
+ return prompts[:3]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  # Fallback prompts if Zephyr fails or no scene info
202
  base_prompts = [
203
  "The subject walks forward smoothly while the background remains steady, cinematic atmosphere.",
 
207
  return base_prompts
208
 
209
  def optimize_user_prompt_with_zephyr(user_idea, scene_info=None):
210
+ """Optimize user's prompt idea using SARA-Zephyr while respecting SARA/Gen-4 structure"""
211
  if not user_idea.strip():
212
+ return "Please enter your idea first."
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  # Create context from scene if available
214
  context = ""
215
  if scene_info and scene_info.get('basic_description'):
216
  context = f"Image context: {scene_info['basic_description']}"
217
+ # Enforce structure based on approach
218
+ optimization_prompt = f"""<|system|>
 
 
219
  You are an expert in video prompting, specializing in the SARA framework. Transform user ideas into professional prompts compatible with AI video models like Sora, Gen-4, Pika, Runway, and Luma.
220
  Key principles:
221
  - Focus on MOTION, not static description
 
228
  {context}
229
  Please create an optimized video prompt using the SARA framework. Respond with just the prompt.
230
  <|assistant|>"""
231
+ response = zephyr_generator(
232
+ optimization_prompt,
233
+ max_new_tokens=100,
234
+ do_sample=True,
235
+ temperature=0.7,
236
+ pad_token_id=zephyr_generator.tokenizer.eos_token_id
237
+ )
238
+ # Extract optimized prompt
239
+ optimized = response[0]['generated_text'].split("<|assistant|>")[-1].strip()
240
+ return optimized
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
 
242
  def refine_prompt_with_zephyr(current_prompt, feedback, chat_history, scene_info=None):
243
+ """Refine a prompt based on user feedback using SARA-Zephyr"""
244
  if not feedback.strip():
245
  return current_prompt, chat_history
 
 
 
 
 
 
 
 
 
 
 
 
246
  # Create refinement context
247
  context = ""
248
  if scene_info and scene_info.get('basic_description'):
249
  context = f"Image context: {scene_info['basic_description']}"
250
+ # Construct Zephyr refinement prompt
251
+ refinement_prompt = f"""<|system|>
 
 
252
  You are an expert in refining video prompts using the SARA framework. Based on the user's feedback, improve the current prompt while maintaining its core structure.
253
  Key principles:
254
  - Focus on MOTION, not static description
 
262
  {context}
263
  Please refine the prompt while keeping it under 100 words. Respond with just the refined prompt.
264
  <|assistant|>"""
265
+ response = zephyr_generator(
266
+ refinement_prompt,
267
+ max_new_tokens=100,
268
+ do_sample=True,
269
+ temperature=0.7,
270
+ pad_token_id=zephyr_generator.tokenizer.eos_token_id
271
+ )
272
+ # Extract refined prompt
273
+ refined = response[0]['generated_text'].split("<|assistant|>")[-1].strip()
274
+ # Update chat history
275
+ new_chat_history = chat_history + [[feedback, refined]]
276
+ return refined, new_chat_history
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
 
278
  def generate_gen4_prompts(scene_info, foundation=""):
279
  """Generate Gen-4 style prompts iteratively"""
 
289
  subject = "The person"
290
  else:
291
  subject = "The subject"
 
292
  # Generate actions based on scene
293
  if any(word in description.lower() for word in ['sitting', 'seated']):
294
  actions = ['speaks to camera', 'gestures while seated', 'leans forward', 'adjusts posture']
 
296
  actions = ['speaks directly', 'gestures naturally', 'shifts weight', 'looks around']
297
  else:
298
  actions = ['moves forward', 'turns slightly', 'gestures', 'demonstrates']
 
299
  action = random.choice(actions)
 
300
  # Build Gen-4 iteratively
301
  basic = f"{subject} {action}"
302
  with_motion = f"{basic} smoothly"
303
  with_camera = f"{with_motion}. Camera captures steadily"
 
304
  # Add style based on composition
305
  composition = scene_info.get('composition', '')
306
  if 'Wide' in composition:
 
309
  style_addition = "Intimate portrait lighting"
310
  else:
311
  style_addition = "Professional documentary style"
 
312
  with_style = f"{with_camera}. {style_addition}."
 
313
  return f"""πŸš€ **Gen-4 Iterative Building:**
314
  **Basic**: {basic}
315
  **+ Motion**: {with_motion}
 
331
  parts = []
332
  if foundation:
333
  parts.append(foundation)
 
334
  # Add motion elements
335
  motion_parts = []
336
  if subject_motion:
 
339
  motion_parts.extend(scene_motion)
340
  if motion_parts:
341
  parts.append(", ".join(motion_parts))
 
342
  # Reference (camera stability)
343
  if camera_motion:
344
  parts.append(f"while {camera_motion}")
345
  else:
346
  parts.append("while background remains steady")
 
347
  # Atmosphere
348
  if style:
349
  parts.append(style)
 
350
  return " ".join(parts)
351
  else: # Gen-4 style
352
  # Gen-4 Structure: Simple iterative building
 
361
  parts.extend(scene_motion)
362
  if style:
363
  parts.append(style)
 
364
  return ". ".join(parts) if parts else "The subject moves naturally"
365
 
366
  # Create the Gradio interface
367
  def create_interface():
368
  """Create the Gradio interface"""
 
 
 
 
 
 
 
369
  with gr.Blocks(theme=gr.themes.Soft(), title="AI Video Prompt Generator") as demo:
370
  # Header
371
+ gr.Markdown("# 🎬 AI Video Prompt Generator - πŸ€– SARA-Zephyr AI Powered")
372
  gr.Markdown("*Professional prompts for Sora, Gen-4, Pika, Luma, Runway and more*")
 
373
  # State variables
374
  scene_state = gr.State({})
375
  chat_history_state = gr.State([])
 
376
  with gr.Tabs():
377
  # Tab 1: Learning Guide
378
  with gr.Tab("πŸ“š Prompting Guide"):
 
390
  - **Camera Motion**: Pan, tilt, dolly, zoom, orbit, tracking
391
  - **Environmental**: Wind, water flow, particle effects, lighting changes
392
  """)
 
393
  # Tab 2: Image Analysis
394
  with gr.Tab("πŸ“· Image Analysis"):
395
  with gr.Row():
 
401
  analyze_btn = gr.Button("πŸ” Analyze Image", variant="primary")
402
  with gr.Column(scale=2):
403
  analysis_output = gr.Markdown(label="AI Analysis Results")
 
404
  # Sample prompts section
405
  with gr.Group():
406
  gr.Markdown("### πŸ’‘ Sample Prompts")
 
414
  )
415
  for i in range(3)
416
  ]
 
417
  # Tab 3: AI Prompt Generator
418
  with gr.Tab("πŸ€– AI Prompt Generator"):
419
  with gr.Row():
 
424
  lines=3
425
  )
426
  optimize_btn = gr.Button("πŸš€ Generate Optimized Prompt", variant="primary")
 
 
 
 
 
 
 
427
  optimized_prompt = gr.Textbox(
428
  label="AI-Optimized Video Prompt",
429
  lines=4,
 
441
  # Chat history
442
  with gr.Accordion("πŸ’¬ Refinement History", open=False):
443
  chat_display = gr.Chatbot(height=300, type='messages')
 
444
  # Tab 4: Gen-4 Method
445
  with gr.Tab("πŸ“ Gen-4 Official"):
446
  gr.Markdown("*Official Gen-4 method: Simple β†’ Complex building*")
 
457
  interactive=False,
458
  show_copy_button=True
459
  )
 
460
  # Tab 5: Custom Builder
461
  with gr.Tab("πŸ› οΈ Custom Builder"):
462
  gr.Markdown("## Build Your Custom Prompt")
 
499
  interactive=True,
500
  show_copy_button=True
501
  )
 
502
  # Event handlers
503
  analyze_btn.click(
504
  fn=analyze_image_with_zephyr,
 
513
  optimize_btn.click(
514
  fn=optimize_user_prompt_with_zephyr,
515
  inputs=[user_idea, scene_state],
516
+ outputs=[optimized_prompt]
 
 
 
 
 
517
  )
518
  refine_btn.click(
519
  fn=refine_prompt_with_zephyr,
 
540
 
541
  # Launch the app
542
  if __name__ == "__main__":
543
+ print("🎬 Starting AI Video Prompt Generator with SARA-Zephyr...")
544
  print(f"πŸ“Š Status: {'GPU' if use_gpu else 'CPU'} Mode Enabled")
545
  print("πŸ”§ Loading models (this may take a few minutes)...")
546
  try:
 
558
  print(f"❌ Error launching app: {e}")
559
  print("πŸ”§ Make sure you have sufficient CPU resources and all dependencies installed.")
560
  print("πŸ“¦ Required packages:")
561
+ print(" pip install torch transformers gradio pillow accelerate bitsandbytes")
562
  # Alternative launch attempt
563
  print("\nπŸ”„ Attempting alternative launch...")
564
  try:
 
 
 
 
 
565
  demo = create_interface()
566
  demo.launch(
567
  share=False,