AndreasXi committed on
Commit
085b825
·
1 Parent(s): bbd22e4
Files changed (1) hide show
  1. app.py +16 -17
app.py CHANGED
@@ -111,6 +111,7 @@ def generate_audio_gradio(
111
  cfg_strength,
112
  num_steps,
113
  variant,
 
114
  ):
115
 
116
  if duration <= 0 or num_steps <= 0:
@@ -146,8 +147,7 @@ def generate_audio_gradio(
146
  sampler_arg_name = "fm"
147
 
148
  rng = torch.Generator(device=device)
149
- # force to 42
150
- rng.manual_seed(42)
151
 
152
  audios = generation_func(
153
  [prompt]*NUM_SAMPLE,
@@ -167,7 +167,7 @@ def generate_audio_gradio(
167
 
168
  for i, audio in enumerate(audios):
169
  audio = audio.float().cpu()
170
- audio = fade_out(audio, seq_cfg.sampling_rate)
171
 
172
  current_time_string = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
173
  filename = f"{safe_prompt}_{current_time_string}_{i}.flac"
@@ -188,7 +188,7 @@ output_audio = gr.Audio(label="Generated Audio", type="filepath")
188
  denoising_steps = gr.Slider(minimum=1, maximum=25, value=1, step=1, label="Sampling Steps", interactive=True)
189
  cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.5, step=0.5, label="Guidance Scale", interactive=True)
190
  duration = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
191
- # seed = gr.Slider(minimum=1, maximum=1000000, value=42, step=1, label="Seed", interactive=True)
192
  variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()), value='meanaudio_s_full', interactive=True)
193
 
194
 
@@ -214,27 +214,26 @@ description_text = """
214
 
215
  gr_interface = gr.Interface(
216
  fn=generate_audio_gradio,
217
- inputs=[input_text, duration, cfg_strength, denoising_steps, variant],
218
  outputs=[
219
  gr.Audio(label="🎵 Audio Sample", type="filepath"),
220
  gr.Textbox(label="Prompt Used", interactive=False)
221
  ],
222
  title="MeanAudio: Fast and Faithful Text-to-Audio Generation with Mean Flows",
223
- description="",
224
  flagging_mode="never",
225
  examples=[
226
- ["Generate the festive sounds of a fireworks show: explosions lighting up the sky, crowd cheering, and the faint music playing in the background!! Celebration of the new year!", 10, 3, 1, "meanaudio_s_full"],
227
  ["Melodic human whistling harmonizing with natural birdsong", 10, 3, 1, "meanaudio_s_full"],
228
- ["A parade marches through a town square, with drumbeats pounding, children clapping, and a horse neighing amidst the commotion", 10, 3, 1, "meanaudio_s_full"],
229
- ["Quiet speech and then and airplane flying away", 10, 3, 1, "meanaudio_s_full"],
230
- ["A soccer ball hits a goalpost with a metallic clang, followed by cheers, clapping, and the distant hum of a commentator’s voice", 10, 3, 1, "meanaudio_s_full"],
231
- ["A basketball bounces rhythmically on a court, shoes squeak against the floor, and a referee’s whistle cuts through the air", 10, 3, 1, "meanaudio_s_full"],
232
- ["Dripping water echoes sharply, a distant growl reverberates through the cavern, and soft scraping metal suggests something lurking unseen", 10, 3, 1, "meanaudio_s_full"],
233
- ["A cow is mooing whilst a lion is roaring in the background as a hunter shoots. A flock of birds subsequently fly away from the trees.", 10, 3, 1, "meanaudio_s_full"],
234
- ["The deep growl of an alligator ripples through the swamp as reeds sway with a soft rustle and a turtle splashes into the murky water", 10, 3, 1, "meanaudio_s_full"],
235
- ["Gentle female voice cooing and baby responding with happy gurgles and giggles", 10, 3, 1, "meanaudio_s_full"],
236
- ['doorbell ding once followed by footsteps gradually getting louder and a door is opened ', 10, 3, 1, "meanaudio_s_full"],
237
- ["A fork scrapes a plate, water drips slowly into a sink, and the faint hum of a refrigerator lingers in the background", 10, 3, 1, "meanaudio_s_full"]
238
  ],
239
  cache_examples="lazy",
240
  )
 
111
  cfg_strength,
112
  num_steps,
113
  variant,
114
+ seed
115
  ):
116
 
117
  if duration <= 0 or num_steps <= 0:
 
147
  sampler_arg_name = "fm"
148
 
149
  rng = torch.Generator(device=device)
150
+ rng.manual_seed(seed)
 
151
 
152
  audios = generation_func(
153
  [prompt]*NUM_SAMPLE,
 
167
 
168
  for i, audio in enumerate(audios):
169
  audio = audio.float().cpu()
170
+ audio = fade_out(audio, seq_cfg.sampling_rate, fade_ms=100)
171
 
172
  current_time_string = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
173
  filename = f"{safe_prompt}_{current_time_string}_{i}.flac"
 
188
  denoising_steps = gr.Slider(minimum=1, maximum=25, value=1, step=1, label="Sampling Steps", interactive=True)
189
  cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.5, step=0.5, label="Guidance Scale", interactive=True)
190
  duration = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
191
+ seed = gr.Slider(minimum=1, maximum=100, value=42, step=1, label="Seed", interactive=True)
192
  variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()), value='meanaudio_s_full', interactive=True)
193
 
194
 
 
214
 
215
  gr_interface = gr.Interface(
216
  fn=generate_audio_gradio,
217
+ inputs=[input_text, duration, cfg_strength, denoising_steps, variant, seed],
218
  outputs=[
219
  gr.Audio(label="🎵 Audio Sample", type="filepath"),
220
  gr.Textbox(label="Prompt Used", interactive=False)
221
  ],
222
  title="MeanAudio: Fast and Faithful Text-to-Audio Generation with Mean Flows",
223
+ description=description_text,
224
  flagging_mode="never",
225
  examples=[
226
+ ["Guitar and piano playing a warm music, with a soft and gentle melody, perfect for a romantic evening.", 10, 3, 1, "meanaudio_s_full", 42],
227
  ["Melodic human whistling harmonizing with natural birdsong", 10, 3, 1, "meanaudio_s_full"],
228
+ ["A parade marches through a town square, with drumbeats pounding, children clapping, and a horse neighing amidst the commotion", 10, 3, 1, "meanaudio_s_full", 42],
229
+ ["Quiet speech and then and airplane flying away", 10, 3, 1, "meanaudio_s_full", 42],
230
+ ["The sound of a steam engine.", 10, 3, 1, "meanaudio_s_full", 42],
231
+ ["A basketball bounces rhythmically on a court, shoes squeak against the floor, and a referee’s whistle cuts through the air", 10, 3, 1, "meanaudio_s_full", 42],
232
+ ["Chopping meat on a wooden table.", 10, 3, 1, "meanaudio_s_full", 42],
233
+ ["A vehicle engine revving then accelerating at a high rate as a metal surface is whipped followed by tires skidding.", 10, 3, 1, "meanaudio_s_full", 42],
234
+ ["Battlefield scene, continuous roar of artillery and gunfire, high fidelity, the sharp crack of bullets, the thundering explosions of bombs, and the screams of wounded soldiers.", 10, 3, 1, "meanaudio_s_full", 42],
235
+ ["Pop music that upbeat, catchy, and easy to listen, high fidelity, with simple melodies, electronic instruments and polished production.", 10, 3, 1, "meanaudio_s_full", 42],
236
+ ["A fast-paced instrumental piece with a classical vibe featuring stringed instruments, evoking an energetic and uplifting mood.", 10, 3, 1, "meanaudio_s_full", 42]
 
237
  ],
238
  cache_examples="lazy",
239
  )