AndreasXi committed on
Commit
2b7760c
·
1 Parent(s): 0685a2c

update web

Browse files
Files changed (1) hide show
  1. app.py +43 -43
app.py CHANGED
@@ -36,12 +36,22 @@ setup_eval_logging()
36
 
37
  OUTPUT_DIR = Path("./output/gradio")
38
  OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
39
- NUM_SAMPLE = 1
40
 
41
  # Global model cache to avoid reloading
42
  MODEL_CACHE = {}
43
  FEATURE_UTILS_CACHE = {}
44
 
 
 
 
 
 
 
 
 
 
 
45
  def ensure_models_downloaded():
46
  for variant, model_cfg in all_model_cfg.items():
47
  if not model_cfg.model_path.exists():
@@ -94,7 +104,6 @@ def generate_audio_gradio(
94
 
95
  net.update_seq_lengths(seq_cfg.latent_seq_len)
96
 
97
-
98
  if variant == 'meanaudio_s_ac' or variant == 'meanaudio_s_full':
99
  use_meanflow=True
100
  elif variant == 'fluxaudio_s_full':
@@ -116,7 +125,7 @@ def generate_audio_gradio(
116
 
117
  rng = torch.Generator(device=device)
118
  # force to 42
119
- rng.manual_seed(42)
120
 
121
  audios = generation_func(
122
  [prompt]*NUM_SAMPLE,
@@ -127,43 +136,34 @@ def generate_audio_gradio(
127
  cfg_strength=cfg_strength,
128
  **{sampler_arg_name: sampler},
129
  )
130
- audio = audios[0].float().cpu()
 
 
131
 
132
- def fade_out(x, sr, fade_ms=50):
133
- n = len(x)
134
- k = int(sr * fade_ms / 1000)
135
- if k <= 0 or k >= n:
136
- return x
137
- w = np.linspace(1.0, 0.0, k)
138
- x[-k:] = x[-k:] * w
139
- return x
140
- audio = fade_out(audio, seq_cfg.sampling_rate)
141
-
142
- safe_prompt = (
143
- "".join(c for c in prompt if c.isalnum() or c in (" ", "_"))
144
- .rstrip()
145
- .replace(" ", "_")[:50]
146
- )
147
- current_time_string = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
148
- filename = f"{safe_prompt}_{current_time_string}.flac"
149
- save_path = OUTPUT_DIR / filename
150
- torchaudio.save(str(save_path), audio, seq_cfg.sampling_rate)
151
- log.info(f"Audio saved to {save_path}")
152
 
 
 
 
 
 
 
 
 
 
 
 
153
  if device == "cuda":
154
  torch.cuda.empty_cache()
155
 
156
- return (
157
- f"Generated audio for prompt: '{prompt}' using {'MeanFlow' if use_meanflow else 'FlowMatching'}",
158
- str(save_path),
159
- )
160
 
161
 
162
  # Gradio input and output components
163
  input_text = gr.Textbox(lines=2, label="Prompt")
164
  output_audio = gr.Audio(label="Generated Audio", type="filepath")
165
- denoising_steps = gr.Slider(minimum=1, maximum=25, value=1, step=1, label="SamplingSteps", interactive=True)
166
- cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.5, step=0.5, label="Guidance Scale (For MeanAudio, it is forced to 3 as integrated in training)", interactive=True)
167
  duration = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
168
  # seed = gr.Slider(minimum=1, maximum=1000000, value=42, step=1, label="Seed", interactive=True)
169
  variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()), value='meanaudio_s_full', interactive=True)
@@ -171,23 +171,23 @@ variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()),
171
  gr_interface = gr.Interface(
172
  fn=generate_audio_gradio,
173
  inputs=[input_text, duration, cfg_strength, denoising_steps, variant],
174
- outputs=["text", "audio"],
175
  title="MeanAudio: Fast and Faithful Text-to-Audio Generation with Mean Flows",
176
  description="",
177
  flagging_mode="never",
178
  examples=[
179
- ["Generate the festive sounds of a fireworks show: explosions lighting up the sky, crowd cheering, and the faint music playing in the background!! Celebration of the new year!", 10, 4.5, 1, "meanaudio_s_full"],
180
- ["Melodic human whistling harmonizing with natural birdsong", 10, 4.5, 1, "meanaudio_s_full"],
181
- ["A parade marches through a town square, with drumbeats pounding, children clapping, and a horse neighing amidst the commotion", 10, 4.5, 1, "meanaudio_s_full"],
182
- ["Quiet speech and then and airplane flying away", 10, 4.5, 1, "meanaudio_s_full"],
183
- ["A soccer ball hits a goalpost with a metallic clang, followed by cheers, clapping, and the distant hum of a commentator’s voice", 10, 4.5, 1, "meanaudio_s_full"],
184
- ["A basketball bounces rhythmically on a court, shoes squeak against the floor, and a referee’s whistle cuts through the air", 10, 4.5, 1, "meanaudio_s_full"],
185
- ["Dripping water echoes sharply, a distant growl reverberates through the cavern, and soft scraping metal suggests something lurking unseen", 10, 4.5, 1, "meanaudio_s_full"],
186
- ["A cow is mooing whilst a lion is roaring in the background as a hunter shoots. A flock of birds subsequently fly away from the trees.", 10, 4.5, 1, "meanaudio_s_full"],
187
- ["The deep growl of an alligator ripples through the swamp as reeds sway with a soft rustle and a turtle splashes into the murky water", 10, 4.5, 1, "meanaudio_s_full"],
188
- ["Gentle female voice cooing and baby responding with happy gurgles and giggles", 10, 4.5, 1, "meanaudio_s_full"],
189
- ['doorbell ding once followed by footsteps gradually getting louder and a door is opened ', 10, 4.5, 1, "meanaudio_s_full"],
190
- ["A fork scrapes a plate, water drips slowly into a sink, and the faint hum of a refrigerator lingers in the background", 10, 4.5, 1, "meanaudio_s_full"]
191
  ],
192
  cache_examples="lazy", # Turn on to cache.
193
  )
 
36
 
37
  OUTPUT_DIR = Path("./output/gradio")
38
  OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
39
+ NUM_SAMPLE = 2
40
 
41
  # Global model cache to avoid reloading
42
  MODEL_CACHE = {}
43
  FEATURE_UTILS_CACHE = {}
44
 
45
+
46
def fade_out(x, sr, fade_ms=50):
    """Apply a linear fade-out to the tail of an audio signal, in place.

    The last ``fade_ms`` milliseconds along the final (time) axis are
    multiplied by a ramp going from 1.0 down to 0.0. Any leading axes
    (e.g. channels) share the same ramp.

    The previous implementation measured and sliced the *first* axis, so a
    ``(channels, samples)`` input was returned unfaded because the channel
    count is always smaller than the fade length.

    Args:
        x: NumPy array or torch tensor with time on the last axis,
            e.g. shape ``(samples,)`` or ``(channels, samples)``.
        sr: Sampling rate in Hz.
        fade_ms: Fade duration in milliseconds.

    Returns:
        The same object ``x`` (modified in place). It is returned untouched
        when the fade would be empty or would cover the whole signal.
    """
    n = x.shape[-1]  # number of samples, not number of channels
    k = int(sr * fade_ms / 1000)
    if k <= 0 or k >= n:
        return x
    if torch.is_tensor(x):
        # Build the ramp with x's dtype/device so no NumPy<->torch mixing occurs.
        w = torch.linspace(1.0, 0.0, k, dtype=x.dtype, device=x.device)
    else:
        w = np.linspace(1.0, 0.0, k)
    # The ramp broadcasts over any leading (channel/batch) axes.
    x[..., -k:] = x[..., -k:] * w
    return x
54
+
55
  def ensure_models_downloaded():
56
  for variant, model_cfg in all_model_cfg.items():
57
  if not model_cfg.model_path.exists():
 
104
 
105
  net.update_seq_lengths(seq_cfg.latent_seq_len)
106
 
 
107
  if variant == 'meanaudio_s_ac' or variant == 'meanaudio_s_full':
108
  use_meanflow=True
109
  elif variant == 'fluxaudio_s_full':
 
125
 
126
  rng = torch.Generator(device=device)
127
  # NOTE: the seed is no longer forced to 42 (the call below is disabled), so results differ between runs
128
+ # rng.manual_seed(42)
129
 
130
  audios = generation_func(
131
  [prompt]*NUM_SAMPLE,
 
136
  cfg_strength=cfg_strength,
137
  **{sampler_arg_name: sampler},
138
  )
139
+ save_paths = []
140
+ for i, audio in enumerate(audios):
141
+ audio = audio.float().cpu()
142
 
143
+ audio = fade_out(audio, seq_cfg.sampling_rate)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
+ safe_prompt = (
146
+ "".join(c for c in prompt if c.isalnum() or c in (" ", "_"))
147
+ .rstrip()
148
+ .replace(" ", "_")[:50]
149
+ )
150
+ current_time_string = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
151
+ filename = f"{safe_prompt}_{current_time_string}_{i}.flac"
152
+ save_path = OUTPUT_DIR / filename
153
+ torchaudio.save(str(save_path), audio, seq_cfg.sampling_rate)
154
+ log.info(f"Audio saved to {save_path}")
155
+ save_paths.append(str(save_path))
156
  if device == "cuda":
157
  torch.cuda.empty_cache()
158
 
159
+ return save_paths
 
 
 
160
 
161
 
162
  # Gradio input and output components
163
  input_text = gr.Textbox(lines=2, label="Prompt")
164
  output_audio = gr.Audio(label="Generated Audio", type="filepath")
165
+ denoising_steps = gr.Slider(minimum=1, maximum=25, value=1, step=1, label="Sampling Steps", interactive=True)
166
+ cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.5, step=0.5, label="Guidance Scale", interactive=True)
167
  duration = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
168
  # seed = gr.Slider(minimum=1, maximum=1000000, value=42, step=1, label="Seed", interactive=True)
169
  variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()), value='meanaudio_s_full', interactive=True)
 
171
  gr_interface = gr.Interface(
172
  fn=generate_audio_gradio,
173
  inputs=[input_text, duration, cfg_strength, denoising_steps, variant],
174
+ outputs=["audio", "audio"],
175
  title="MeanAudio: Fast and Faithful Text-to-Audio Generation with Mean Flows",
176
  description="",
177
  flagging_mode="never",
178
  examples=[
179
+ ["Generate the festive sounds of a fireworks show: explosions lighting up the sky, crowd cheering, and the faint music playing in the background!! Celebration of the new year!", 10, 3, 1, "meanaudio_s_full"],
180
+ ["Melodic human whistling harmonizing with natural birdsong", 10, 3, 1, "meanaudio_s_full"],
181
+ ["A parade marches through a town square, with drumbeats pounding, children clapping, and a horse neighing amidst the commotion", 10, 3, 1, "meanaudio_s_full"],
182
+ ["Quiet speech and then and airplane flying away", 10, 3, 1, "meanaudio_s_full"],
183
+ ["A soccer ball hits a goalpost with a metallic clang, followed by cheers, clapping, and the distant hum of a commentator’s voice", 10, 3, 1, "meanaudio_s_full"],
184
+ ["A basketball bounces rhythmically on a court, shoes squeak against the floor, and a referee’s whistle cuts through the air", 10, 3, 1, "meanaudio_s_full"],
185
+ ["Dripping water echoes sharply, a distant growl reverberates through the cavern, and soft scraping metal suggests something lurking unseen", 10, 3, 1, "meanaudio_s_full"],
186
+ ["A cow is mooing whilst a lion is roaring in the background as a hunter shoots. A flock of birds subsequently fly away from the trees.", 10, 3, 1, "meanaudio_s_full"],
187
+ ["The deep growl of an alligator ripples through the swamp as reeds sway with a soft rustle and a turtle splashes into the murky water", 10, 3, 1, "meanaudio_s_full"],
188
+ ["Gentle female voice cooing and baby responding with happy gurgles and giggles", 10, 3, 1, "meanaudio_s_full"],
189
+ ['doorbell ding once followed by footsteps gradually getting louder and a door is opened ', 10, 3, 1, "meanaudio_s_full"],
190
+ ["A fork scrapes a plate, water drips slowly into a sink, and the faint hum of a refrigerator lingers in the background", 10, 3, 1, "meanaudio_s_full"]
191
  ],
192
  cache_examples="lazy", # Turn on to cache.
193
  )