AndreasXi committed on
Commit
d712cde
·
1 Parent(s): 67fd0db

update web

Browse files
Files changed (1) hide show
  1. app.py +33 -90
app.py CHANGED
@@ -49,32 +49,26 @@ def ensure_models_downloaded():
49
  snapshot_download(repo_id="AndreasXi/MeanAudio", local_dir="./weights")
50
  break
51
 
52
- def load_model_if_needed(variant: str):
53
- if variant in MODEL_CACHE:
54
- return MODEL_CACHE[variant], FEATURE_UTILS_CACHE[variant]
55
-
56
- log.info(f"Loading model {variant} for the first time...")
57
- model_cfg = all_model_cfg[variant]
58
-
59
- net = get_mean_audio(model_cfg.model_name, use_rope=True, text_c_dim=512)
60
- net = net.to(device, torch.bfloat16).eval()
61
- net.load_weights(torch.load(model_cfg.model_path, map_location=device, weights_only=True))
62
-
63
- feature_utils = FeaturesUtils(
64
- tod_vae_ckpt=model_cfg.vae_path,
65
- enable_conditions=True,
66
- encoder_name="t5_clap",
67
- mode=model_cfg.mode,
68
- bigvgan_vocoder_ckpt=model_cfg.bigvgan_16k_path,
69
- need_vae_encoder=False
70
- )
71
- feature_utils = feature_utils.to(device, torch.bfloat16).eval()
72
-
73
- MODEL_CACHE[variant] = net
74
- FEATURE_UTILS_CACHE[variant] = feature_utils
75
-
76
- log.info(f"Model {variant} loaded and cached successfully")
77
- return net, feature_utils
78
 
79
 
80
  @spaces.GPU(duration=60)
@@ -84,7 +78,6 @@ def generate_audio_gradio(
84
  duration,
85
  cfg_strength,
86
  num_steps,
87
- seed,
88
  variant,
89
  ):
90
 
@@ -93,7 +86,7 @@ def generate_audio_gradio(
93
  if variant not in all_model_cfg:
94
  raise ValueError(f"Unknown model variant: {variant}. Available: {list(all_model_cfg.keys())}")
95
 
96
- net, feature_utils = load_model_if_needed(variant)
97
 
98
  model = all_model_cfg[variant]
99
  seq_cfg = model.seq_cfg
@@ -122,7 +115,8 @@ def generate_audio_gradio(
122
  sampler_arg_name = "fm"
123
 
124
  rng = torch.Generator(device=device)
125
- rng.manual_seed(seed)
 
126
 
127
  audios = generation_func(
128
  [prompt]*NUM_SAMPLE,
@@ -135,7 +129,7 @@ def generate_audio_gradio(
135
  )
136
  audio = audios[0].float().cpu()
137
 
138
- def fade_out(x, sr, fade_ms=300):
139
  n = len(x)
140
  k = int(sr * fade_ms / 1000)
141
  if k <= 0 or k >= n:
@@ -168,15 +162,15 @@ def generate_audio_gradio(
168
  # Gradio input and output components
169
  input_text = gr.Textbox(lines=2, label="Prompt")
170
  output_audio = gr.Audio(label="Generated Audio", type="filepath")
171
- denoising_steps = gr.Slider(minimum=1, maximum=50, value=1, step=5, label="Steps", interactive=True)
172
  cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.5, step=0.5, label="Guidance Scale (For MeanAudio, it is forced to 3 as integrated in training)", interactive=True)
173
  duration = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
174
- seed = gr.Slider(minimum=-1, maximum=1000000, value=42, step=1, label="Seed", interactive=True)
175
  variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()), value='meanaudio_s_full', interactive=True)
176
 
177
  gr_interface = gr.Interface(
178
  fn=generate_audio_gradio,
179
- inputs=[input_text, duration, cfg_strength, denoising_steps, seed, variant],
180
  outputs=["text", "audio"],
181
  title="MeanAudio: Fast and Faithful Text-to-Audio Generation with Mean Flows",
182
  description="",
@@ -193,67 +187,16 @@ gr_interface = gr.Interface(
193
  ["The deep growl of an alligator ripples through the swamp as reeds sway with a soft rustle and a turtle splashes into the murky water"],
194
  ["Gentle female voice cooing and baby responding with happy gurgles and giggles"],
195
  ['doorbell ding once followed by footsteps gradually getting louder and a door is opened '],
196
- ["A fork scrapes a plate, water drips slowly into a sink, and the faint hum of a refrigerator lingers in the background"],
197
- ["Powerful ocean waves crashing and receding on sandy beach with distant seagulls"],
198
- ["Emulate the lively sounds of a retro arcade: 8-bit game music, coins clinking. People cheering occasionally when players winning"],
199
- ["Simulate a forest ambiance with birds chirping and wind rustling through the leaves"],
200
- ["A train conductor blows a sharp whistle, metal wheels screech on the rails, and passengers murmur while settling into their seats"],
201
- ["Generate an energetic and bustling city street scene with distant traffic and close conversations"],
202
- ["Alarms blare with rising urgency as fragments clatter against a metallic hull, interrupted by a faint hiss of escaping air"],
203
- ["Create a serene soundscape of a quiet beach at sunset"],
204
- ["Tiny pops and hisses of chemical reactions intermingle with the rhythmic pumping of a centrifuge and the soft whirr of air filtration"],
205
- ["A train conductor blows a sharp whistle, metal wheels screech on the rails, and passengers murmur while settling into their seats"],
206
- ["Emulate the lively sounds of a retro arcade: 8-bit game music, coins clinking. People cheering occasionally when players winning"],
207
- ["Quiet whispered conversation gradually fading into distant jet engine roar diminishing into silence"],
208
- ["Clear sound of bicycle tires crunching on loose gravel and dirt, followed by deep male laughter echoing"],
209
- ["Multiple ducks quacking loudly with splashing water and piercing wild animal shriek in background"],
210
- ["Create the underwater soundscape: gentle waves, faint whale calls, and the occasional clink of scuba gear"],
211
- ["Recreate the sounds of an active volcano: rumbling earth, lava bubbling, and the occasional loud explosive roar of an eruption"],
212
- ["A pile of coins spills onto a wooden table with a metallic clatter, followed by the hushed murmur of a tavern crowd and the creak of a swinging door"],
213
- ["Clear male voice speaking, sharp popping sound, followed by genuine group laughter"],
214
- ["Stream of water hitting empty ceramic cup, pitch rising as cup fills up"],
215
- ["Massive crowd erupting in thunderous applause and excited cheering"],
216
- ["Deep rolling thunder with bright lightning strikes crackling through sky"],
217
- ["Aggressive dog barking and distressed cat meowing as racing car roars past at high speed"],
218
- ["Peaceful stream bubbling and birds singing, interrupted by sudden explosive gunshot"],
219
- ["Man speaking outdoors, goat bleating loudly, metal gate scraping closed, ducks quacking frantically, wind howling into microphone"],
220
- ["Series of loud aggressive dog barks echoing"],
221
- ["Multiple distinct cat meows at different pitches"],
222
- ["Rhythmic wooden table tapping overlaid with steady water pouring sound"],
223
- ["Sustained crowd applause with camera clicks and amplified male announcer voice"],
224
- ["Two sharp gunshots followed by panicked birds taking flight with rapid wing flaps"],
225
- ["Deep rhythmic snoring with clear breathing patterns"],
226
- ["Multiple racing engines revving and accelerating with sharp whistle piercing through"],
227
- ["Massive stadium crowd cheering as thunder crashes and lightning strikes"],
228
- ["Heavy helicopter blades chopping through air with engine and wind noise"],
229
- ["Dog barking excitedly and man shouting as race car engine roars past"],
230
- ["A bicycle peddling on dirt and gravel followed by a man speaking then laughing"],
231
- ["Ducks quack and water splashes with some animal screeching in the background"],
232
- ["Describe the sound of the ocean"],
233
- ["A woman and a baby are having a conversation"],
234
- ["A man speaks followed by a popping noise and laughter"],
235
- ["A cup is filled from a faucet"],
236
- ["An audience cheering and clapping"],
237
- ["Rolling thunder with lightning strikes"],
238
- ["A dog barking and a cat mewing and a racing car passes by"],
239
- ["Gentle water stream, birds chirping and sudden gun shot"],
240
- ["A dog barking"],
241
- ["A cat meowing"],
242
- ["Wooden table tapping sound while water pouring"],
243
- ["Applause from a crowd with distant clicking and a man speaking over a loudspeaker"],
244
- ["two gunshots followed by birds flying away while chirping"],
245
- ["Whistling with birds chirping"],
246
- ["A person snoring"],
247
- ["Motor vehicles are driving with loud engines and a person whistles"],
248
- ["People cheering in a stadium while thunder and lightning strikes"],
249
- ["A helicopter is in flight"],
250
- ["A dog barking and a man talking and a racing car passes by"],
251
  ],
252
  cache_examples="lazy", # Turn on to cache.
253
  )
254
 
255
- ensure_models_downloaded()
256
- gr_interface.queue(15).launch()
 
 
 
257
 
258
  # theme = gr.themes.Soft(
259
  # primary_hue="blue",
 
49
  snapshot_download(repo_id="AndreasXi/MeanAudio", local_dir="./weights")
50
  break
51
 
52
def load_model_cache():
    """Eagerly load every configured model variant into the module-level caches.

    For each variant in ``all_model_cfg`` that is not yet present in
    ``MODEL_CACHE``, the network is built, moved to ``device`` in bfloat16,
    switched to eval mode, and its weights are loaded from
    ``model_cfg.model_path``. A single ``FeaturesUtils`` instance is stored
    under ``FEATURE_UTILS_CACHE['default']``.

    The function is safe to call more than once: variants that are already
    cached are skipped instead of aborting the whole loop.

    Returns:
        None. Results are published through ``MODEL_CACHE`` and
        ``FEATURE_UTILS_CACHE``, which is how ``generate_audio_gradio``
        reads them.
    """
    for variant, model_cfg in all_model_cfg.items():
        if variant in MODEL_CACHE:
            # Skip, do not return: an early return here would leave every
            # later variant unloaded and make the request handler's
            # MODEL_CACHE[variant] lookup fail with KeyError.
            continue

        log.info(f"Loading model {variant} for the first time...")
        net = get_mean_audio(model_cfg.model_name, use_rope=True, text_c_dim=512)
        net = net.to(device, torch.bfloat16).eval()
        net.load_weights(torch.load(model_cfg.model_path, map_location=device, weights_only=True))
        MODEL_CACHE[variant] = net

        if 'default' not in FEATURE_UTILS_CACHE:
            # Built once and shared by all variants.
            # NOTE(review): this assumes vae_path / mode / bigvgan_16k_path
            # are identical across variants, which the single 'default' key
            # already implies -- confirm against all_model_cfg.
            feature_utils = FeaturesUtils(
                tod_vae_ckpt=model_cfg.vae_path,
                enable_conditions=True,
                encoder_name="t5_clap",
                mode=model_cfg.mode,
                bigvgan_vocoder_ckpt=model_cfg.bigvgan_16k_path,
                need_vae_encoder=False
            )
            # Keep the feature extractor on the same device/dtype as the
            # network and in eval mode, as the previous per-variant loader did.
            FEATURE_UTILS_CACHE['default'] = feature_utils.to(device, torch.bfloat16).eval()
 
 
 
 
 
 
72
 
73
 
74
  @spaces.GPU(duration=60)
 
78
  duration,
79
  cfg_strength,
80
  num_steps,
 
81
  variant,
82
  ):
83
 
 
86
  if variant not in all_model_cfg:
87
  raise ValueError(f"Unknown model variant: {variant}. Available: {list(all_model_cfg.keys())}")
88
 
89
+ net, feature_utils = MODEL_CACHE[variant], FEATURE_UTILS_CACHE['default']
90
 
91
  model = all_model_cfg[variant]
92
  seq_cfg = model.seq_cfg
 
115
  sampler_arg_name = "fm"
116
 
117
  rng = torch.Generator(device=device)
118
+ # force to 42
119
+ rng.manual_seed(42)
120
 
121
  audios = generation_func(
122
  [prompt]*NUM_SAMPLE,
 
129
  )
130
  audio = audios[0].float().cpu()
131
 
132
+ def fade_out(x, sr, fade_ms=50):
133
  n = len(x)
134
  k = int(sr * fade_ms / 1000)
135
  if k <= 0 or k >= n:
 
162
  # Gradio input and output components
163
  input_text = gr.Textbox(lines=2, label="Prompt")
164
  output_audio = gr.Audio(label="Generated Audio", type="filepath")
165
+ denoising_steps = gr.Slider(minimum=1, maximum=25, value=1, step=1, label="SamplingSteps", interactive=True)
166
  cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.5, step=0.5, label="Guidance Scale (For MeanAudio, it is forced to 3 as integrated in training)", interactive=True)
167
  duration = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
168
+ # seed = gr.Slider(minimum=1, maximum=1000000, value=42, step=1, label="Seed", interactive=True)
169
  variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()), value='meanaudio_s_full', interactive=True)
170
 
171
  gr_interface = gr.Interface(
172
  fn=generate_audio_gradio,
173
+ inputs=[input_text, duration, cfg_strength, denoising_steps, variant],
174
  outputs=["text", "audio"],
175
  title="MeanAudio: Fast and Faithful Text-to-Audio Generation with Mean Flows",
176
  description="",
 
187
  ["The deep growl of an alligator ripples through the swamp as reeds sway with a soft rustle and a turtle splashes into the murky water"],
188
  ["Gentle female voice cooing and baby responding with happy gurgles and giggles"],
189
  ['doorbell ding once followed by footsteps gradually getting louder and a door is opened '],
190
+ ["A fork scrapes a plate, water drips slowly into a sink, and the faint hum of a refrigerator lingers in the background"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  ],
192
  cache_examples="lazy", # Turn on to cache.
193
  )
194
 
195
+ if __name__ == "__main__":
196
+
197
+ ensure_models_downloaded()
198
+ load_model_cache()
199
+ gr_interface.queue(15).launch()
200
 
201
  # theme = gr.themes.Soft(
202
  # primary_hue="blue",