update web
Browse files
app.py
CHANGED
@@ -49,32 +49,26 @@ def ensure_models_downloaded():
|
|
49 |
snapshot_download(repo_id="AndreasXi/MeanAudio", local_dir="./weights")
|
50 |
break
|
51 |
|
52 |
-
def
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
MODEL_CACHE[variant] = net
|
74 |
-
FEATURE_UTILS_CACHE[variant] = feature_utils
|
75 |
-
|
76 |
-
log.info(f"Model {variant} loaded and cached successfully")
|
77 |
-
return net, feature_utils
|
78 |
|
79 |
|
80 |
@spaces.GPU(duration=60)
|
@@ -84,7 +78,6 @@ def generate_audio_gradio(
|
|
84 |
duration,
|
85 |
cfg_strength,
|
86 |
num_steps,
|
87 |
-
seed,
|
88 |
variant,
|
89 |
):
|
90 |
|
@@ -93,7 +86,7 @@ def generate_audio_gradio(
|
|
93 |
if variant not in all_model_cfg:
|
94 |
raise ValueError(f"Unknown model variant: {variant}. Available: {list(all_model_cfg.keys())}")
|
95 |
|
96 |
-
net, feature_utils =
|
97 |
|
98 |
model = all_model_cfg[variant]
|
99 |
seq_cfg = model.seq_cfg
|
@@ -122,7 +115,8 @@ def generate_audio_gradio(
|
|
122 |
sampler_arg_name = "fm"
|
123 |
|
124 |
rng = torch.Generator(device=device)
|
125 |
-
|
|
|
126 |
|
127 |
audios = generation_func(
|
128 |
[prompt]*NUM_SAMPLE,
|
@@ -135,7 +129,7 @@ def generate_audio_gradio(
|
|
135 |
)
|
136 |
audio = audios[0].float().cpu()
|
137 |
|
138 |
-
def fade_out(x, sr, fade_ms=
|
139 |
n = len(x)
|
140 |
k = int(sr * fade_ms / 1000)
|
141 |
if k <= 0 or k >= n:
|
@@ -168,15 +162,15 @@ def generate_audio_gradio(
|
|
168 |
# Gradio input and output components
|
169 |
input_text = gr.Textbox(lines=2, label="Prompt")
|
170 |
output_audio = gr.Audio(label="Generated Audio", type="filepath")
|
171 |
-
denoising_steps = gr.Slider(minimum=1, maximum=
|
172 |
cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.5, step=0.5, label="Guidance Scale (For MeanAudio, it is forced to 3 as integrated in training)", interactive=True)
|
173 |
duration = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
|
174 |
-
seed = gr.Slider(minimum
|
175 |
variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()), value='meanaudio_s_full', interactive=True)
|
176 |
|
177 |
gr_interface = gr.Interface(
|
178 |
fn=generate_audio_gradio,
|
179 |
-
inputs=[input_text, duration, cfg_strength, denoising_steps,
|
180 |
outputs=["text", "audio"],
|
181 |
title="MeanAudio: Fast and Faithful Text-to-Audio Generation with Mean Flows",
|
182 |
description="",
|
@@ -193,67 +187,16 @@ gr_interface = gr.Interface(
|
|
193 |
["The deep growl of an alligator ripples through the swamp as reeds sway with a soft rustle and a turtle splashes into the murky water"],
|
194 |
["Gentle female voice cooing and baby responding with happy gurgles and giggles"],
|
195 |
['doorbell ding once followed by footsteps gradually getting louder and a door is opened '],
|
196 |
-
["A fork scrapes a plate, water drips slowly into a sink, and the faint hum of a refrigerator lingers in the background"]
|
197 |
-
["Powerful ocean waves crashing and receding on sandy beach with distant seagulls"],
|
198 |
-
["Emulate the lively sounds of a retro arcade: 8-bit game music, coins clinking. People cheering occasionally when players winning"],
|
199 |
-
["Simulate a forest ambiance with birds chirping and wind rustling through the leaves"],
|
200 |
-
["A train conductor blows a sharp whistle, metal wheels screech on the rails, and passengers murmur while settling into their seats"],
|
201 |
-
["Generate an energetic and bustling city street scene with distant traffic and close conversations"],
|
202 |
-
["Alarms blare with rising urgency as fragments clatter against a metallic hull, interrupted by a faint hiss of escaping air"],
|
203 |
-
["Create a serene soundscape of a quiet beach at sunset"],
|
204 |
-
["Tiny pops and hisses of chemical reactions intermingle with the rhythmic pumping of a centrifuge and the soft whirr of air filtration"],
|
205 |
-
["A train conductor blows a sharp whistle, metal wheels screech on the rails, and passengers murmur while settling into their seats"],
|
206 |
-
["Emulate the lively sounds of a retro arcade: 8-bit game music, coins clinking. People cheering occasionally when players winning"],
|
207 |
-
["Quiet whispered conversation gradually fading into distant jet engine roar diminishing into silence"],
|
208 |
-
["Clear sound of bicycle tires crunching on loose gravel and dirt, followed by deep male laughter echoing"],
|
209 |
-
["Multiple ducks quacking loudly with splashing water and piercing wild animal shriek in background"],
|
210 |
-
["Create the underwater soundscape: gentle waves, faint whale calls, and the occasional clink of scuba gear"],
|
211 |
-
["Recreate the sounds of an active volcano: rumbling earth, lava bubbling, and the occasional loud explosive roar of an eruption"],
|
212 |
-
["A pile of coins spills onto a wooden table with a metallic clatter, followed by the hushed murmur of a tavern crowd and the creak of a swinging door"],
|
213 |
-
["Clear male voice speaking, sharp popping sound, followed by genuine group laughter"],
|
214 |
-
["Stream of water hitting empty ceramic cup, pitch rising as cup fills up"],
|
215 |
-
["Massive crowd erupting in thunderous applause and excited cheering"],
|
216 |
-
["Deep rolling thunder with bright lightning strikes crackling through sky"],
|
217 |
-
["Aggressive dog barking and distressed cat meowing as racing car roars past at high speed"],
|
218 |
-
["Peaceful stream bubbling and birds singing, interrupted by sudden explosive gunshot"],
|
219 |
-
["Man speaking outdoors, goat bleating loudly, metal gate scraping closed, ducks quacking frantically, wind howling into microphone"],
|
220 |
-
["Series of loud aggressive dog barks echoing"],
|
221 |
-
["Multiple distinct cat meows at different pitches"],
|
222 |
-
["Rhythmic wooden table tapping overlaid with steady water pouring sound"],
|
223 |
-
["Sustained crowd applause with camera clicks and amplified male announcer voice"],
|
224 |
-
["Two sharp gunshots followed by panicked birds taking flight with rapid wing flaps"],
|
225 |
-
["Deep rhythmic snoring with clear breathing patterns"],
|
226 |
-
["Multiple racing engines revving and accelerating with sharp whistle piercing through"],
|
227 |
-
["Massive stadium crowd cheering as thunder crashes and lightning strikes"],
|
228 |
-
["Heavy helicopter blades chopping through air with engine and wind noise"],
|
229 |
-
["Dog barking excitedly and man shouting as race car engine roars past"],
|
230 |
-
["A bicycle peddling on dirt and gravel followed by a man speaking then laughing"],
|
231 |
-
["Ducks quack and water splashes with some animal screeching in the background"],
|
232 |
-
["Describe the sound of the ocean"],
|
233 |
-
["A woman and a baby are having a conversation"],
|
234 |
-
["A man speaks followed by a popping noise and laughter"],
|
235 |
-
["A cup is filled from a faucet"],
|
236 |
-
["An audience cheering and clapping"],
|
237 |
-
["Rolling thunder with lightning strikes"],
|
238 |
-
["A dog barking and a cat mewing and a racing car passes by"],
|
239 |
-
["Gentle water stream, birds chirping and sudden gun shot"],
|
240 |
-
["A dog barking"],
|
241 |
-
["A cat meowing"],
|
242 |
-
["Wooden table tapping sound while water pouring"],
|
243 |
-
["Applause from a crowd with distant clicking and a man speaking over a loudspeaker"],
|
244 |
-
["two gunshots followed by birds flying away while chirping"],
|
245 |
-
["Whistling with birds chirping"],
|
246 |
-
["A person snoring"],
|
247 |
-
["Motor vehicles are driving with loud engines and a person whistles"],
|
248 |
-
["People cheering in a stadium while thunder and lightning strikes"],
|
249 |
-
["A helicopter is in flight"],
|
250 |
-
["A dog barking and a man talking and a racing car passes by"],
|
251 |
],
|
252 |
cache_examples="lazy", # Turn on to cache.
|
253 |
)
|
254 |
|
255 |
-
|
256 |
-
|
|
|
|
|
|
|
257 |
|
258 |
# theme = gr.themes.Soft(
|
259 |
# primary_hue="blue",
|
|
|
49 |
snapshot_download(repo_id="AndreasXi/MeanAudio", local_dir="./weights")
|
50 |
break
|
51 |
|
52 |
+
def load_model_cache():
    """Preload every configured model variant into the module-level caches.

    For each variant in ``all_model_cfg`` that is not yet present in
    ``MODEL_CACHE``, the network is built with ``get_mean_audio``, moved to
    ``device`` as bfloat16, put in eval mode, given its checkpoint weights
    and stored in ``MODEL_CACHE[variant]``. A ``FeaturesUtils`` instance is
    built from the same variant config and stored under
    ``FEATURE_UTILS_CACHE['default']``.

    Variants that are already cached are skipped. Previously the function
    returned as soon as it met a cached variant, which left every later
    variant unloaded and caused a ``KeyError`` on ``MODEL_CACHE[variant]``
    at request time.

    Returns:
        None. Results are published through ``MODEL_CACHE`` and
        ``FEATURE_UTILS_CACHE``.
    """
    for variant, model_cfg in all_model_cfg.items():
        if variant in MODEL_CACHE:
            # Already loaded: keep going so the remaining variants still load.
            continue

        log.info(f"Loading model {variant} for the first time...")
        net = get_mean_audio(model_cfg.model_name, use_rope=True, text_c_dim=512)
        net = net.to(device, torch.bfloat16).eval()
        # weights_only=True restricts unpickling to tensor data.
        state = torch.load(model_cfg.model_path, map_location=device, weights_only=True)
        net.load_weights(state)
        MODEL_CACHE[variant] = net

        # NOTE(review): the feature utils are rebuilt for every variant and
        # always written to the same 'default' key, so only the last loaded
        # variant's instance survives. Presumably all variants share the same
        # VAE / vocoder checkpoints and mode - confirm, then build this once.
        FEATURE_UTILS_CACHE['default'] = FeaturesUtils(
            tod_vae_ckpt=model_cfg.vae_path,
            enable_conditions=True,
            encoder_name="t5_clap",
            mode=model_cfg.mode,
            bigvgan_vocoder_ckpt=model_cfg.bigvgan_16k_path,
            need_vae_encoder=False,
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
|
73 |
|
74 |
@spaces.GPU(duration=60)
|
|
|
78 |
duration,
|
79 |
cfg_strength,
|
80 |
num_steps,
|
|
|
81 |
variant,
|
82 |
):
|
83 |
|
|
|
86 |
if variant not in all_model_cfg:
|
87 |
raise ValueError(f"Unknown model variant: {variant}. Available: {list(all_model_cfg.keys())}")
|
88 |
|
89 |
+
net, feature_utils = MODEL_CACHE[variant], FEATURE_UTILS_CACHE['default']
|
90 |
|
91 |
model = all_model_cfg[variant]
|
92 |
seq_cfg = model.seq_cfg
|
|
|
115 |
sampler_arg_name = "fm"
|
116 |
|
117 |
rng = torch.Generator(device=device)
|
118 |
+
# force to 42
|
119 |
+
rng.manual_seed(42)
|
120 |
|
121 |
audios = generation_func(
|
122 |
[prompt]*NUM_SAMPLE,
|
|
|
129 |
)
|
130 |
audio = audios[0].float().cpu()
|
131 |
|
132 |
+
def fade_out(x, sr, fade_ms=50):
|
133 |
n = len(x)
|
134 |
k = int(sr * fade_ms / 1000)
|
135 |
if k <= 0 or k >= n:
|
|
|
162 |
# Gradio widgets wired into the interface defined below.
input_text = gr.Textbox(
    lines=2,
    label="Prompt",
)
output_audio = gr.Audio(
    label="Generated Audio",
    type="filepath",
)
denoising_steps = gr.Slider(
    minimum=1,
    maximum=25,
    value=1,
    step=1,
    label="SamplingSteps",
    interactive=True,
)
cfg_strength = gr.Slider(
    minimum=1,
    maximum=10,
    value=4.5,
    step=0.5,
    label="Guidance Scale (For MeanAudio, it is forced to 3 as integrated in training)",
    interactive=True,
)
duration = gr.Slider(
    minimum=1,
    maximum=30,
    value=10,
    step=1,
    label="Duration",
    interactive=True,
)
# Seed slider is disabled; kept here for reference.
# seed = gr.Slider(minimum=1, maximum=1000000, value=42, step=1, label="Seed", interactive=True)
variant = gr.Dropdown(
    label="Model Variant",
    choices=list(all_model_cfg.keys()),
    value='meanaudio_s_full',
    interactive=True,
)
|
170 |
|
171 |
gr_interface = gr.Interface(
|
172 |
fn=generate_audio_gradio,
|
173 |
+
inputs=[input_text, duration, cfg_strength, denoising_steps, variant],
|
174 |
outputs=["text", "audio"],
|
175 |
title="MeanAudio: Fast and Faithful Text-to-Audio Generation with Mean Flows",
|
176 |
description="",
|
|
|
187 |
["The deep growl of an alligator ripples through the swamp as reeds sway with a soft rustle and a turtle splashes into the murky water"],
|
188 |
["Gentle female voice cooing and baby responding with happy gurgles and giggles"],
|
189 |
['doorbell ding once followed by footsteps gradually getting louder and a door is opened '],
|
190 |
+
["A fork scrapes a plate, water drips slowly into a sink, and the faint hum of a refrigerator lingers in the background"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
],
|
192 |
cache_examples="lazy", # Turn on to cache.
|
193 |
)
|
194 |
|
195 |
+
if __name__ == "__main__":
    # Startup order: fetch the weights, preload the models, then serve the
    # queued Gradio interface.
    ensure_models_downloaded()
    load_model_cache()

    queued_interface = gr_interface.queue(15)
    queued_interface.launch()
|
200 |
|
201 |
# theme = gr.themes.Soft(
|
202 |
# primary_hue="blue",
|