Spaces:

chenxie95
/

MeanAudio

Running on Zero

App Files Files Community

AndreasXi committed on 21 days ago

Commit

085b825

1 Parent(s): bbd22e4

update

Browse files

Files changed (1) hide show

app.py +16 -17

app.py CHANGED Viewed

@@ -111,6 +111,7 @@ def generate_audio_gradio(
     cfg_strength,
     num_steps,
     variant,
 ):
     if duration <= 0 or num_steps <= 0:
@@ -146,8 +147,7 @@ def generate_audio_gradio(
         sampler_arg_name = "fm"
     rng = torch.Generator(device=device)
-    # force to 42
-    rng.manual_seed(42)
     audios = generation_func(
         [prompt]*NUM_SAMPLE,
@@ -167,7 +167,7 @@ def generate_audio_gradio(
     for i, audio in enumerate(audios):
         audio = audio.float().cpu()
-        audio = fade_out(audio, seq_cfg.sampling_rate)
         current_time_string = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
         filename = f"{safe_prompt}_{current_time_string}_{i}.flac"
@@ -188,7 +188,7 @@ output_audio = gr.Audio(label="Generated Audio", type="filepath")
 denoising_steps = gr.Slider(minimum=1, maximum=25, value=1, step=1, label="Sampling Steps", interactive=True)
 cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.5, step=0.5, label="Guidance Scale", interactive=True)
 duration = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
-# seed = gr.Slider(minimum=1, maximum=1000000, value=42, step=1, label="Seed", interactive=True)
 variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()), value='meanaudio_s_full', interactive=True)
@@ -214,27 +214,26 @@ description_text = """
 gr_interface = gr.Interface(
     fn=generate_audio_gradio,
-    inputs=[input_text, duration, cfg_strength, denoising_steps, variant],
     outputs=[
         gr.Audio(label="🎵 Audio Sample", type="filepath"),
         gr.Textbox(label="Prompt Used", interactive=False)
     ],
     title="MeanAudio: Fast and Faithful Text-to-Audio Generation with Mean Flows",
-    description="",
     flagging_mode="never",
     examples=[
-        ["Generate the festive sounds of a fireworks show: explosions lighting up the sky, crowd cheering, and the faint music playing in the background!! Celebration of the new year!", 10, 3, 1, "meanaudio_s_full"],
         ["Melodic human whistling harmonizing with natural birdsong", 10, 3, 1, "meanaudio_s_full"],
-        ["A parade marches through a town square, with drumbeats pounding, children clapping, and a horse neighing amidst the commotion", 10, 3, 1, "meanaudio_s_full"],
-        ["Quiet speech and then and airplane flying away", 10, 3, 1, "meanaudio_s_full"],
-        ["A soccer ball hits a goalpost with a metallic clang, followed by cheers, clapping, and the distant hum of a commentator’s voice", 10, 3, 1, "meanaudio_s_full"],
-        ["A basketball bounces rhythmically on a court, shoes squeak against the floor, and a referee’s whistle cuts through the air", 10, 3, 1, "meanaudio_s_full"],
-        ["Dripping water echoes sharply, a distant growl reverberates through the cavern, and soft scraping metal suggests something lurking unseen", 10, 3, 1, "meanaudio_s_full"],
-        ["A cow is mooing whilst a lion is roaring in the background as a hunter shoots. A flock of birds subsequently fly away from the trees.", 10, 3, 1, "meanaudio_s_full"],
-        ["The deep growl of an alligator ripples through the swamp as reeds sway with a soft rustle and a turtle splashes into the murky water", 10, 3, 1, "meanaudio_s_full"],
-        ["Gentle female voice cooing and baby responding with happy gurgles and giggles", 10, 3, 1, "meanaudio_s_full"],
-        ['doorbell ding once followed by footsteps gradually getting louder and a door is opened ', 10, 3, 1, "meanaudio_s_full"],
-        ["A fork scrapes a plate, water drips slowly into a sink, and the faint hum of a refrigerator lingers in the background", 10, 3, 1, "meanaudio_s_full"]
     ],
     cache_examples="lazy",
 )

     cfg_strength,
     num_steps,
     variant,
+    seed
 ):
     if duration <= 0 or num_steps <= 0:
         sampler_arg_name = "fm"
     rng = torch.Generator(device=device)
+    rng.manual_seed(seed)
     audios = generation_func(
         [prompt]*NUM_SAMPLE,
     for i, audio in enumerate(audios):
         audio = audio.float().cpu()
+        audio = fade_out(audio, seq_cfg.sampling_rate, fade_ms=100)
         current_time_string = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
         filename = f"{safe_prompt}_{current_time_string}_{i}.flac"
 # Input widgets for the demo form. Each slider's range, default and step are
 # given by its keyword arguments.
 # NOTE(review): "Duration" is presumably in seconds — confirm against the model code.
 denoising_steps = gr.Slider(minimum=1, maximum=25, value=1, step=1, label="Sampling Steps", interactive=True)
 cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.5, step=0.5, label="Guidance Scale", interactive=True)
 duration = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
 # User-selectable seed (1-100, default 42); it is the last entry of the
 # Interface `inputs` list and is passed to `rng.manual_seed(seed)`.
+seed = gr.Slider(minimum=1, maximum=100, value=42, step=1, label="Seed", interactive=True)
 # Model choices come from the keys of `all_model_cfg`.
 variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()), value='meanaudio_s_full', interactive=True)
 gr_interface = gr.Interface(
     fn=generate_audio_gradio,
+    inputs=[input_text, duration, cfg_strength, denoising_steps, variant, seed],
     outputs=[
         gr.Audio(label="🎵 Audio Sample", type="filepath"),
         gr.Textbox(label="Prompt Used", interactive=False)
     ],
     title="MeanAudio: Fast and Faithful Text-to-Audio Generation with Mean Flows",
+    description=description_text,
     flagging_mode="never",
     examples=[
+        ["Guitar and piano playing a warm music, with a soft and gentle melody, perfect for a romantic evening.", 10, 3, 1, "meanaudio_s_full", 42],
         ["Melodic human whistling harmonizing with natural birdsong", 10, 3, 1, "meanaudio_s_full"],
+        ["A parade marches through a town square, with drumbeats pounding, children clapping, and a horse neighing amidst the commotion", 10, 3, 1, "meanaudio_s_full", 42],
+        ["Quiet speech and then and airplane flying away", 10, 3, 1, "meanaudio_s_full", 42],
+        ["The sound of a steam engine.", 10, 3, 1, "meanaudio_s_full", 42],
+        ["A basketball bounces rhythmically on a court, shoes squeak against the floor, and a referee’s whistle cuts through the air", 10, 3, 1, "meanaudio_s_full", 42],
+        ["Chopping meat on a wooden table.", 10, 3, 1, "meanaudio_s_full", 42],
+        ["A vehicle engine revving then accelerating at a high rate as a metal surface is whipped followed by tires skidding.", 10, 3, 1, "meanaudio_s_full", 42],
+        ["Battlefield scene, continuous roar of artillery and gunfire, high fidelity, the sharp crack of bullets, the thundering explosions of bombs, and the screams of wounded soldiers.", 10, 3, 1, "meanaudio_s_full", 42],
+        ["Pop music that upbeat, catchy, and easy to listen, high fidelity, with simple melodies, electronic instruments and polished production.", 10, 3, 1, "meanaudio_s_full", 42],
+        ["A fast-paced instrumental piece with a classical vibe featuring stringed instruments, evoking an energetic and uplifting mood.", 10, 3, 1, "meanaudio_s_full", 42]
     ],
     cache_examples="lazy",
 )