audio-flamingo-next-etheroi

Paused

App Files Community

SreyanG-NVIDIA committed on Apr 14

Commit

f07bdcd

1 Parent(s): 4b399db

Update prompts

Browse files

Files changed (1) hide show

app.py +17 -2

app.py CHANGED Viewed

@@ -145,6 +145,20 @@ If you need explicit step-by-step timestamp-grounded reasoning traces, use
 PROMPT_NOTE = """
 > **Prompting note:** AF-Next-Instruct is strongest when the task is explicit. Ask directly for QA, ASR, AST, timestamps, or speaker labels instead of relying on a generic prompt.
 """
 APP_CSS = """
 :root {
   --font-sans: ui-sans-serif, system-ui, sans-serif,
@@ -305,11 +319,11 @@ EXAMPLE_YOUTUBE_PROMPTS = [
     ],
     [
         "https://youtu.be/iywaBOMvYLI",
-        "Compose a detailed caption integrating all audio elements, describing sound effects, speech, and music.",
     ],
     [
         "https://youtu.be/_mTRvJ9fugM",
-        "Describe this track in full detail - tell me the genre, tempo, and key, then dive into the instruments, production style, and overall mood it creates.",
     ],
 ]
 _log_cuda_runtime("startup-before-load")
@@ -553,6 +567,7 @@ with gr.Blocks(css=APP_CSS, theme=gr.themes.Soft(primary_hue="teal", secondary_h
     )
     gr.Markdown(MODEL_GUIDE)
     gr.Markdown(PROMPT_NOTE)
     with gr.Tabs(elem_classes="tab-nav"):
         with gr.Row(elem_classes="panel-row"):

 PROMPT_NOTE = """
 > **Prompting note:** AF-Next-Instruct is strongest when the task is explicit. Ask directly for QA, ASR, AST, timestamps, or speaker labels instead of relying on a generic prompt.
 """
+PROMPT_GUIDE_TABLE = """
+### Prompt Guide
+| Task | Prompt | Recommended Checkpoint(s) |
+| --- | --- | --- |
+| ASR | `Transcribe the input speech.` | `Instruct`, `Think` |
+| AST | `Translate any speech you hear from <src_lang> into <tgt_lang>.` | `Instruct`, `Think` |
+| Short Audio Captioning | `Generate a caption for the input audio.` | `Captioner`, `Think` |
+| Long Audio Captioning | `Generate a detailed caption for the input audio. In the caption, transcribe all spoken content by all speakers in the audio precisely.` | `Captioner`, `Think` |
+| Music Captioning | `Summarize the track with precision: mention its musical style, BPM, key, arrangement, production choices, and the emotions or story it conveys.` | `Captioner`, `Instruct`, `Think` |
+| Lyrics | `Generate a lyrics transcription from the input song.` | `Instruct`, `Captioner`, `Think` |
+| QA | `What precise description did the commentator use for the punch that ended the fight?` | `Instruct`, `Think` |
+| Timestamped Multi-Talker ASR | `Transcribe the input audio. If multiple speakers are present, provide diarized transcripts with speaker labels.`<br>`[Speaker 1] ...`<br>`[Speaker 2] ...` | `Instruct`, `Think` |
+"""
 APP_CSS = """
 :root {
   --font-sans: ui-sans-serif, system-ui, sans-serif,
     ],
     [
         "https://youtu.be/iywaBOMvYLI",
+        "Summarize the track with precision: mention its musical style, BPM, key, arrangement, production choices, and the emotions or story it conveys.",
     ],
     [
         "https://youtu.be/_mTRvJ9fugM",
+        "Generate a lyrics transcription from the input song.",
     ],
 ]
 _log_cuda_runtime("startup-before-load")
     )
     gr.Markdown(MODEL_GUIDE)
     gr.Markdown(PROMPT_NOTE)
+    gr.Markdown(PROMPT_GUIDE_TABLE)
     with gr.Tabs(elem_classes="tab-nav"):
         with gr.Row(elem_classes="panel-row"):