Spaces:

chenxie95
/

MeanAudio

Running on Zero

App Files Files Community

AndreasXi committed on 10 days ago

Commit

2b7760c

1 Parent(s): 0685a2c

update web

Browse files

Files changed (1) hide show

app.py +43 -43

app.py CHANGED Viewed

@@ -36,12 +36,22 @@ setup_eval_logging()
 OUTPUT_DIR = Path("./output/gradio")
 OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
-NUM_SAMPLE = 1
 # Global model cache to avoid reloading
 MODEL_CACHE = {}
 FEATURE_UTILS_CACHE = {}
 def ensure_models_downloaded():
     for variant, model_cfg in all_model_cfg.items():
         if not model_cfg.model_path.exists():
@@ -94,7 +104,6 @@ def generate_audio_gradio(
     net.update_seq_lengths(seq_cfg.latent_seq_len)
     if variant == 'meanaudio_s_ac' or variant == 'meanaudio_s_full':
         use_meanflow=True
     elif variant == 'fluxaudio_s_full':
@@ -116,7 +125,7 @@ def generate_audio_gradio(
     rng = torch.Generator(device=device)
     # force to 42
-    rng.manual_seed(42)
     audios = generation_func(
         [prompt]*NUM_SAMPLE,
@@ -127,43 +136,34 @@ def generate_audio_gradio(
         cfg_strength=cfg_strength,
         **{sampler_arg_name: sampler},
     )
-    audio = audios[0].float().cpu()
-    def fade_out(x, sr, fade_ms=50):
-        n = len(x)
-        k = int(sr * fade_ms / 1000)
-        if k <= 0 or k >= n:
-            return x
-        w = np.linspace(1.0, 0.0, k)
-        x[-k:] = x[-k:] * w
-        return x
-    audio = fade_out(audio, seq_cfg.sampling_rate)
-    safe_prompt = (
-        "".join(c for c in prompt if c.isalnum() or c in (" ", "_"))
-        .rstrip()
-        .replace(" ", "_")[:50]
-    )
-    current_time_string = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
-    filename = f"{safe_prompt}_{current_time_string}.flac"
-    save_path = OUTPUT_DIR / filename
-    torchaudio.save(str(save_path), audio, seq_cfg.sampling_rate)
-    log.info(f"Audio saved to {save_path}")
     if device == "cuda":
         torch.cuda.empty_cache()
-    return (
-        f"Generated audio for prompt: '{prompt}' using {'MeanFlow' if use_meanflow else 'FlowMatching'}",
-        str(save_path),
-    )
 # Gradio input and output components
 input_text = gr.Textbox(lines=2, label="Prompt")
 output_audio = gr.Audio(label="Generated Audio", type="filepath")
-denoising_steps = gr.Slider(minimum=1, maximum=25, value=1, step=1, label="SamplingSteps", interactive=True)
-cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.5, step=0.5, label="Guidance Scale (For MeanAudio, it is forced to 3 as integrated in training)", interactive=True)
 duration = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
 # seed = gr.Slider(minimum=1, maximum=1000000, value=42, step=1, label="Seed", interactive=True)
 variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()), value='meanaudio_s_full', interactive=True)
@@ -171,23 +171,23 @@ variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()),
 gr_interface = gr.Interface(
     fn=generate_audio_gradio,
     inputs=[input_text, duration, cfg_strength, denoising_steps, variant],
-    outputs=["text", "audio"],
     title="MeanAudio: Fast and Faithful Text-to-Audio Generation with Mean Flows",
     description="",
     flagging_mode="never",
     examples=[
-        ["Generate the festive sounds of a fireworks show: explosions lighting up the sky, crowd cheering, and the faint music playing in the background!! Celebration of the new year!", 10, 4.5, 1, "meanaudio_s_full"],
-        ["Melodic human whistling harmonizing with natural birdsong", 10, 4.5, 1, "meanaudio_s_full"],
-        ["A parade marches through a town square, with drumbeats pounding, children clapping, and a horse neighing amidst the commotion", 10, 4.5, 1, "meanaudio_s_full"],
-        ["Quiet speech and then and airplane flying away", 10, 4.5, 1, "meanaudio_s_full"],
-        ["A soccer ball hits a goalpost with a metallic clang, followed by cheers, clapping, and the distant hum of a commentator’s voice", 10, 4.5, 1, "meanaudio_s_full"],
-        ["A basketball bounces rhythmically on a court, shoes squeak against the floor, and a referee’s whistle cuts through the air", 10, 4.5, 1, "meanaudio_s_full"],
-        ["Dripping water echoes sharply, a distant growl reverberates through the cavern, and soft scraping metal suggests something lurking unseen", 10, 4.5, 1, "meanaudio_s_full"],
-        ["A cow is mooing whilst a lion is roaring in the background as a hunter shoots. A flock of birds subsequently fly away from the trees.", 10, 4.5, 1, "meanaudio_s_full"],
-        ["The deep growl of an alligator ripples through the swamp as reeds sway with a soft rustle and a turtle splashes into the murky water", 10, 4.5, 1, "meanaudio_s_full"],
-        ["Gentle female voice cooing and baby responding with happy gurgles and giggles", 10, 4.5, 1, "meanaudio_s_full"],
-        ['doorbell ding once followed by footsteps gradually getting louder and a door is opened ', 10, 4.5, 1, "meanaudio_s_full"],
-        ["A fork scrapes a plate, water drips slowly into a sink, and the faint hum of a refrigerator lingers in the background", 10, 4.5, 1, "meanaudio_s_full"]
     ],
     cache_examples="lazy", # Turn on to cache.
 )

 OUTPUT_DIR = Path("./output/gradio")
 OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+NUM_SAMPLE = 2
 # Global model cache to avoid reloading
 MODEL_CACHE = {}
 FEATURE_UTILS_CACHE = {}
+def fade_out(x, sr, fade_ms=50):
+    """Apply a linear fade-out to the last ``fade_ms`` milliseconds of ``x``, in place.
+
+    The fade runs along the LAST axis of ``x``, so a 1-D signal and a
+    multi-channel ``(channels, samples)`` buffer are both handled; every
+    channel receives the same ramp.
+
+    Args:
+        x: Audio samples. Objects exposing ``.shape`` (NumPy arrays, torch
+            tensors) are faded along their last axis; anything else is
+            treated as a flat sequence and sliced along its only axis.
+        sr: Sampling rate in Hz.
+        fade_ms: Length of the fade in milliseconds.
+
+    Returns:
+        ``x`` itself. It is returned untouched when the fade length rounds to
+        zero samples or is not shorter than the signal.
+    """
+    shape = getattr(x, "shape", None)
+    # The caller passes the same tensor it later hands to torchaudio.save,
+    # which takes 2-D (channels, samples) input. len(x) would then be the
+    # channel count and the fade would always be skipped, so measure the
+    # sample axis (the last one) instead.
+    n = shape[-1] if shape else len(x)
+    k = int(sr * fade_ms / 1000)
+    if k <= 0 or k >= n:
+        return x
+    w = np.linspace(1.0, 0.0, k)
+    if hasattr(x, "new_tensor"):
+        # torch tensor: build the ramp with x's dtype and device so the
+        # product below stays a tensor of the same kind.
+        w = x.new_tensor(w)
+    # Index the trailing k samples on the last axis; the ramp broadcasts over
+    # any leading (channel) axes.
+    tail = (Ellipsis, slice(-k, None)) if shape else slice(-k, None)
+    x[tail] = x[tail] * w
+    return x
 def ensure_models_downloaded():
     for variant, model_cfg in all_model_cfg.items():
         if not model_cfg.model_path.exists():
     net.update_seq_lengths(seq_cfg.latent_seq_len)
     if variant == 'meanaudio_s_ac' or variant == 'meanaudio_s_full':
         use_meanflow=True
     elif variant == 'fluxaudio_s_full':
     rng = torch.Generator(device=device)
     # force to 42
+    # rng.manual_seed(42)
     audios = generation_func(
         [prompt]*NUM_SAMPLE,
         cfg_strength=cfg_strength,
         **{sampler_arg_name: sampler},
     )
+    save_paths = []
+    for i, audio in enumerate(audios):
+        audio = audio.float().cpu()
+        audio = fade_out(audio, seq_cfg.sampling_rate)
+        safe_prompt = (
+            "".join(c for c in prompt if c.isalnum() or c in (" ", "_"))
+            .rstrip()
+            .replace(" ", "_")[:50]
+        )
+        current_time_string = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+        filename = f"{safe_prompt}_{current_time_string}_{i}.flac"
+        save_path = OUTPUT_DIR / filename
+        torchaudio.save(str(save_path), audio, seq_cfg.sampling_rate)
+        log.info(f"Audio saved to {save_path}")
+        save_paths.append(str(save_path))
     if device == "cuda":
         torch.cuda.empty_cache()
+    return save_paths
 # Gradio input and output components
 input_text = gr.Textbox(lines=2, label="Prompt")
 output_audio = gr.Audio(label="Generated Audio", type="filepath")
+denoising_steps = gr.Slider(minimum=1, maximum=25, value=1, step=1, label="Sampling Steps", interactive=True)
+cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.5, step=0.5, label="Guidance Scale", interactive=True)
 duration = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
 # seed = gr.Slider(minimum=1, maximum=1000000, value=42, step=1, label="Seed", interactive=True)
 variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()), value='meanaudio_s_full', interactive=True)
 gr_interface = gr.Interface(
     fn=generate_audio_gradio,
     inputs=[input_text, duration, cfg_strength, denoising_steps, variant],
+    outputs=["audio", "audio"],
     title="MeanAudio: Fast and Faithful Text-to-Audio Generation with Mean Flows",
     description="",
     flagging_mode="never",
     examples=[
+        ["Generate the festive sounds of a fireworks show: explosions lighting up the sky, crowd cheering, and the faint music playing in the background!! Celebration of the new year!", 10, 3, 1, "meanaudio_s_full"],
+        ["Melodic human whistling harmonizing with natural birdsong", 10, 3, 1, "meanaudio_s_full"],
+        ["A parade marches through a town square, with drumbeats pounding, children clapping, and a horse neighing amidst the commotion", 10, 3, 1, "meanaudio_s_full"],
+        ["Quiet speech and then and airplane flying away", 10, 3, 1, "meanaudio_s_full"],
+        ["A soccer ball hits a goalpost with a metallic clang, followed by cheers, clapping, and the distant hum of a commentator’s voice", 10, 3, 1, "meanaudio_s_full"],
+        ["A basketball bounces rhythmically on a court, shoes squeak against the floor, and a referee’s whistle cuts through the air", 10, 3, 1, "meanaudio_s_full"],
+        ["Dripping water echoes sharply, a distant growl reverberates through the cavern, and soft scraping metal suggests something lurking unseen", 10, 3, 1, "meanaudio_s_full"],
+        ["A cow is mooing whilst a lion is roaring in the background as a hunter shoots. A flock of birds subsequently fly away from the trees.", 10, 3, 1, "meanaudio_s_full"],
+        ["The deep growl of an alligator ripples through the swamp as reeds sway with a soft rustle and a turtle splashes into the murky water", 10, 3, 1, "meanaudio_s_full"],
+        ["Gentle female voice cooing and baby responding with happy gurgles and giggles", 10, 3, 1, "meanaudio_s_full"],
+        ['doorbell ding once followed by footsteps gradually getting louder and a door is opened ', 10, 3, 1, "meanaudio_s_full"],
+        ["A fork scrapes a plate, water drips slowly into a sink, and the faint hum of a refrigerator lingers in the background", 10, 3, 1, "meanaudio_s_full"]
     ],
     cache_examples="lazy", # Turn on to cache.
 )