Spaces:

chenxie95
/

MeanAudio

Running on Zero

App Files Files Community

AndreasXi committed 11 days ago

Commit

d712cde

1 Parent(s): 67fd0db

update web

Browse files

Files changed (1) hide show

app.py +33 -90

app.py CHANGED Viewed

@@ -49,32 +49,26 @@ def ensure_models_downloaded():
             snapshot_download(repo_id="AndreasXi/MeanAudio", local_dir="./weights")
             break
-def load_model_if_needed(variant: str):
-    if variant in MODEL_CACHE:
-        return MODEL_CACHE[variant], FEATURE_UTILS_CACHE[variant]
-    log.info(f"Loading model {variant} for the first time...")
-    model_cfg = all_model_cfg[variant]
-    net = get_mean_audio(model_cfg.model_name, use_rope=True, text_c_dim=512)
-    net = net.to(device, torch.bfloat16).eval()
-    net.load_weights(torch.load(model_cfg.model_path, map_location=device, weights_only=True))
-    feature_utils = FeaturesUtils(
-        tod_vae_ckpt=model_cfg.vae_path,
-        enable_conditions=True,
-        encoder_name="t5_clap",
-        mode=model_cfg.mode,
-        bigvgan_vocoder_ckpt=model_cfg.bigvgan_16k_path,
-        need_vae_encoder=False
-    )
-    feature_utils = feature_utils.to(device, torch.bfloat16).eval()
-    MODEL_CACHE[variant] = net
-    FEATURE_UTILS_CACHE[variant] = feature_utils
-    log.info(f"Model {variant} loaded and cached successfully")
-    return net, feature_utils
 @spaces.GPU(duration=60)
@@ -84,7 +78,6 @@ def generate_audio_gradio(
     duration,
     cfg_strength,
     num_steps,
-    seed,
     variant,
 ):
@@ -93,7 +86,7 @@ def generate_audio_gradio(
     if variant not in all_model_cfg:
         raise ValueError(f"Unknown model variant: {variant}. Available: {list(all_model_cfg.keys())}")
-    net, feature_utils = load_model_if_needed(variant)
     model = all_model_cfg[variant]
     seq_cfg = model.seq_cfg
@@ -122,7 +115,8 @@ def generate_audio_gradio(
         sampler_arg_name = "fm"
     rng = torch.Generator(device=device)
-    rng.manual_seed(seed)
     audios = generation_func(
         [prompt]*NUM_SAMPLE,
@@ -135,7 +129,7 @@ def generate_audio_gradio(
     )
     audio = audios[0].float().cpu()
-    def fade_out(x, sr, fade_ms=300):
         n = len(x)
         k = int(sr * fade_ms / 1000)
         if k <= 0 or k >= n:
@@ -168,15 +162,15 @@ def generate_audio_gradio(
 # Gradio input and output components
 input_text = gr.Textbox(lines=2, label="Prompt")
 output_audio = gr.Audio(label="Generated Audio", type="filepath")
-denoising_steps = gr.Slider(minimum=1, maximum=50, value=1, step=5, label="Steps", interactive=True)
 cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.5, step=0.5, label="Guidance Scale (For MeanAudio, it is forced to 3 as integrated in training)", interactive=True)
 duration = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
-seed = gr.Slider(minimum=-1, maximum=1000000, value=42, step=1, label="Seed", interactive=True)
 variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()), value='meanaudio_s_full', interactive=True)
 gr_interface = gr.Interface(
     fn=generate_audio_gradio,
-    inputs=[input_text, duration, cfg_strength, denoising_steps, seed, variant],
     outputs=["text", "audio"],
     title="MeanAudio: Fast and Faithful Text-to-Audio Generation with Mean Flows",
     description="",
@@ -193,67 +187,16 @@ gr_interface = gr.Interface(
         ["The deep growl of an alligator ripples through the swamp as reeds sway with a soft rustle and a turtle splashes into the murky water"],
         ["Gentle female voice cooing and baby responding with happy gurgles and giggles"],
         ['doorbell ding once followed by footsteps gradually getting louder and a door is opened '],
-        ["A fork scrapes a plate, water drips slowly into a sink, and the faint hum of a refrigerator lingers in the background"],
-        ["Powerful ocean waves crashing and receding on sandy beach with distant seagulls"],
-        ["Emulate the lively sounds of a retro arcade: 8-bit game music, coins clinking. People cheering occasionally when players winning"],
-        ["Simulate a forest ambiance with birds chirping and wind rustling through the leaves"],
-        ["A train conductor blows a sharp whistle, metal wheels screech on the rails, and passengers murmur while settling into their seats"],
-        ["Generate an energetic and bustling city street scene with distant traffic and close conversations"],
-        ["Alarms blare with rising urgency as fragments clatter against a metallic hull, interrupted by a faint hiss of escaping air"],
-        ["Create a serene soundscape of a quiet beach at sunset"],
-        ["Tiny pops and hisses of chemical reactions intermingle with the rhythmic pumping of a centrifuge and the soft whirr of air filtration"],
-        ["A train conductor blows a sharp whistle, metal wheels screech on the rails, and passengers murmur while settling into their seats"],
-        ["Emulate the lively sounds of a retro arcade: 8-bit game music, coins clinking. People cheering occasionally when players winning"],
-        ["Quiet whispered conversation gradually fading into distant jet engine roar diminishing into silence"],
-        ["Clear sound of bicycle tires crunching on loose gravel and dirt, followed by deep male laughter echoing"],
-        ["Multiple ducks quacking loudly with splashing water and piercing wild animal shriek in background"],
-        ["Create the underwater soundscape: gentle waves, faint whale calls, and the occasional clink of scuba gear"],
-        ["Recreate the sounds of an active volcano: rumbling earth, lava bubbling, and the occasional loud explosive roar of an eruption"],
-        ["A pile of coins spills onto a wooden table with a metallic clatter, followed by the hushed murmur of a tavern crowd and the creak of a swinging door"],
-        ["Clear male voice speaking, sharp popping sound, followed by genuine group laughter"],
-        ["Stream of water hitting empty ceramic cup, pitch rising as cup fills up"],
-        ["Massive crowd erupting in thunderous applause and excited cheering"],
-        ["Deep rolling thunder with bright lightning strikes crackling through sky"],
-        ["Aggressive dog barking and distressed cat meowing as racing car roars past at high speed"],
-        ["Peaceful stream bubbling and birds singing, interrupted by sudden explosive gunshot"],
-        ["Man speaking outdoors, goat bleating loudly, metal gate scraping closed, ducks quacking frantically, wind howling into microphone"],
-        ["Series of loud aggressive dog barks echoing"],
-        ["Multiple distinct cat meows at different pitches"],
-        ["Rhythmic wooden table tapping overlaid with steady water pouring sound"],
-        ["Sustained crowd applause with camera clicks and amplified male announcer voice"],
-        ["Two sharp gunshots followed by panicked birds taking flight with rapid wing flaps"],
-        ["Deep rhythmic snoring with clear breathing patterns"],
-        ["Multiple racing engines revving and accelerating with sharp whistle piercing through"],
-        ["Massive stadium crowd cheering as thunder crashes and lightning strikes"],
-        ["Heavy helicopter blades chopping through air with engine and wind noise"],
-        ["Dog barking excitedly and man shouting as race car engine roars past"],
-        ["A bicycle peddling on dirt and gravel followed by a man speaking then laughing"],
-        ["Ducks quack and water splashes with some animal screeching in the background"],
-        ["Describe the sound of the ocean"],
-        ["A woman and a baby are having a conversation"],
-        ["A man speaks followed by a popping noise and laughter"],
-        ["A cup is filled from a faucet"],
-        ["An audience cheering and clapping"],
-        ["Rolling thunder with lightning strikes"],
-        ["A dog barking and a cat mewing and a racing car passes by"],
-        ["Gentle water stream, birds chirping and sudden gun shot"],
-        ["A dog barking"],
-        ["A cat meowing"],
-        ["Wooden table tapping sound while water pouring"],
-        ["Applause from a crowd with distant clicking and a man speaking over a loudspeaker"],
-        ["two gunshots followed by birds flying away while chirping"],
-        ["Whistling with birds chirping"],
-        ["A person snoring"],
-        ["Motor vehicles are driving with loud engines and a person whistles"],
-        ["People cheering in a stadium while thunder and lightning strikes"],
-        ["A helicopter is in flight"],
-        ["A dog barking and a man talking and a racing car passes by"],
     ],
     cache_examples="lazy", # Turn on to cache.
 )
-ensure_models_downloaded()
-gr_interface.queue(15).launch()
 # theme = gr.themes.Soft(
 #     primary_hue="blue",

             snapshot_download(repo_id="AndreasXi/MeanAudio", local_dir="./weights")
             break
+def load_model_cache():
+    for variant in all_model_cfg.keys():
+        if variant in MODEL_CACHE:
+            return MODEL_CACHE[variant], FEATURE_UTILS_CACHE['default']
+        else:
+            log.info(f"Loading model {variant} for the first time...")
+            model_cfg = all_model_cfg[variant]
+            net = get_mean_audio(model_cfg.model_name, use_rope=True, text_c_dim=512)
+            net = net.to(device, torch.bfloat16).eval()
+            net.load_weights(torch.load(model_cfg.model_path, map_location=device, weights_only=True))
+            MODEL_CACHE[variant] = net
+            feature_utils = FeaturesUtils(
+                tod_vae_ckpt=model_cfg.vae_path,
+                enable_conditions=True,
+                encoder_name="t5_clap",
+                mode=model_cfg.mode,
+                bigvgan_vocoder_ckpt=model_cfg.bigvgan_16k_path,
+                need_vae_encoder=False
+            )
+            FEATURE_UTILS_CACHE['default'] = feature_utils
 @spaces.GPU(duration=60)
     duration,
     cfg_strength,
     num_steps,
     variant,
 ):
     if variant not in all_model_cfg:
         raise ValueError(f"Unknown model variant: {variant}. Available: {list(all_model_cfg.keys())}")
+    net, feature_utils = MODEL_CACHE[variant], FEATURE_UTILS_CACHE['default']
     model = all_model_cfg[variant]
     seq_cfg = model.seq_cfg
         sampler_arg_name = "fm"
     rng = torch.Generator(device=device)
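+    # Seed is fixed at 42: the seed slider is no longer a UI input, so every request seeds the generator identically.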
+    rng.manual_seed(42)
     audios = generation_func(
         [prompt]*NUM_SAMPLE,
     )
     audio = audios[0].float().cpu()
+    def fade_out(x, sr, fade_ms=50):
         n = len(x)
         k = int(sr * fade_ms / 1000)
         if k <= 0 or k >= n:
 # Gradio input and output components
 input_text = gr.Textbox(lines=2, label="Prompt")
 output_audio = gr.Audio(label="Generated Audio", type="filepath")
+denoising_steps = gr.Slider(minimum=1, maximum=25, value=1, step=1, label="SamplingSteps", interactive=True)
 cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.5, step=0.5, label="Guidance Scale (For MeanAudio, it is forced to 3 as integrated in training)", interactive=True)
 duration = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
+# seed = gr.Slider(minimum=1, maximum=1000000, value=42, step=1, label="Seed", interactive=True)
 variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()), value='meanaudio_s_full', interactive=True)
 gr_interface = gr.Interface(
     fn=generate_audio_gradio,
+    inputs=[input_text, duration, cfg_strength, denoising_steps, variant],
     outputs=["text", "audio"],
     title="MeanAudio: Fast and Faithful Text-to-Audio Generation with Mean Flows",
     description="",
         ["The deep growl of an alligator ripples through the swamp as reeds sway with a soft rustle and a turtle splashes into the murky water"],
         ["Gentle female voice cooing and baby responding with happy gurgles and giggles"],
         ['doorbell ding once followed by footsteps gradually getting louder and a door is opened '],
+        ["A fork scrapes a plate, water drips slowly into a sink, and the faint hum of a refrigerator lingers in the background"]
     ],
     cache_examples="lazy", # Turn on to cache.
 )
+if __name__ == "__main__":
+    ensure_models_downloaded()
+    load_model_cache()
+    gr_interface.queue(15).launch()
 # theme = gr.themes.Soft(
 #     primary_hue="blue",