SreyanG-NVIDIA committed on
Commit
f07bdcd
·
1 Parent(s): 4b399db

Update prompts

Browse files
Files changed (1) hide show
  1. app.py +17 -2
app.py CHANGED
@@ -145,6 +145,20 @@ If you need explicit step-by-step timestamp-grounded reasoning traces, use
145
  PROMPT_NOTE = """
146
  > **Prompting note:** AF-Next-Instruct is strongest when the task is explicit. Ask directly for QA, ASR, AST, timestamps, or speaker labels instead of relying on a generic prompt.
147
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  APP_CSS = """
149
  :root {
150
  --font-sans: ui-sans-serif, system-ui, sans-serif,
@@ -305,11 +319,11 @@ EXAMPLE_YOUTUBE_PROMPTS = [
305
  ],
306
  [
307
  "https://youtu.be/iywaBOMvYLI",
308
- "Compose a detailed caption integrating all audio elements, describing sound effects, speech, and music.",
309
  ],
310
  [
311
  "https://youtu.be/_mTRvJ9fugM",
312
- "Describe this track in full detail - tell me the genre, tempo, and key, then dive into the instruments, production style, and overall mood it creates.",
313
  ],
314
  ]
315
  _log_cuda_runtime("startup-before-load")
@@ -553,6 +567,7 @@ with gr.Blocks(css=APP_CSS, theme=gr.themes.Soft(primary_hue="teal", secondary_h
553
  )
554
  gr.Markdown(MODEL_GUIDE)
555
  gr.Markdown(PROMPT_NOTE)
 
556
 
557
  with gr.Tabs(elem_classes="tab-nav"):
558
  with gr.Row(elem_classes="panel-row"):
 
145
  PROMPT_NOTE = """
146
  > **Prompting note:** AF-Next-Instruct is strongest when the task is explicit. Ask directly for QA, ASR, AST, timestamps, or speaker labels instead of relying on a generic prompt.
147
  """
148
+ PROMPT_GUIDE_TABLE = """
149
+ ### Prompt Guide
150
+
151
+ | Task | Prompt | Recommended Checkpoint(s) |
152
+ | --- | --- | --- |
153
+ | ASR | `Transcribe the input speech.` | `Instruct`, `Think` |
154
+ | AST | `Translate any speech you hear from <src_lang> into <tgt_lang>.` | `Instruct`, `Think` |
155
+ | Short Audio Captioning | `Generate a caption for the input audio.` | `Captioner`, `Think` |
156
+ | Long Audio Captioning | `Generate a detailed caption for the input audio. In the caption, transcribe all spoken content by all speakers in the audio precisely.` | `Captioner`, `Think` |
157
+ | Music Captioning | `Summarize the track with precision: mention its musical style, BPM, key, arrangement, production choices, and the emotions or story it conveys.` | `Captioner`, `Instruct`, `Think` |
158
+ | Lyrics | `Generate a lyrics transcription from the input song.` | `Instruct`, `Captioner`, `Think` |
159
+ | QA | `What precise description did the commentator use for the punch that ended the fight?` | `Instruct`, `Think` |
160
+ | Timestamped Multi-Talker ASR | `Transcribe the input audio. If multiple speakers are present, provide diarized transcripts with speaker labels.`<br>`[Speaker 1] ...`<br>`[Speaker 2] ...` | `Instruct`, `Think` |
161
+ """
162
  APP_CSS = """
163
  :root {
164
  --font-sans: ui-sans-serif, system-ui, sans-serif,
 
319
  ],
320
  [
321
  "https://youtu.be/iywaBOMvYLI",
322
+ "Summarize the track with precision: mention its musical style, BPM, key, arrangement, production choices, and the emotions or story it conveys.",
323
  ],
324
  [
325
  "https://youtu.be/_mTRvJ9fugM",
326
+ "Generate a lyrics transcription from the input song.",
327
  ],
328
  ]
329
  _log_cuda_runtime("startup-before-load")
 
567
  )
568
  gr.Markdown(MODEL_GUIDE)
569
  gr.Markdown(PROMPT_NOTE)
570
+ gr.Markdown(PROMPT_GUIDE_TABLE)
571
 
572
  with gr.Tabs(elem_classes="tab-nav"):
573
  with gr.Row(elem_classes="panel-row"):