AndreasXi committed on
Commit
98c6962
·
1 Parent(s): 0950fa7

update meanaudio_l_full

Browse files
Files changed (3) hide show
  1. app.py +2 -21
  2. meanaudio/eval_utils.py +7 -1
  3. meanaudio/model/networks.py +12 -1
app.py CHANGED
@@ -127,7 +127,7 @@ def generate_audio_gradio(
127
 
128
  net.update_seq_lengths(seq_cfg.latent_seq_len)
129
 
130
- if variant == 'meanaudio_s_ac' or variant == 'meanaudio_s_full':
131
  use_meanflow=True
132
  elif variant == 'fluxaudio_s_full':
133
  use_meanflow=False
@@ -184,32 +184,13 @@ def generate_audio_gradio(
184
 
185
  # Gradio input and output components
186
  input_text = gr.Textbox(lines=2, label="Prompt")
 
187
  output_audio = gr.Audio(label="Generated Audio", type="filepath")
188
  denoising_steps = gr.Slider(minimum=1, maximum=25, value=1, step=1, label="Sampling Steps", interactive=True)
189
  cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.5, step=0.5, label="Guidance Scale", interactive=True)
190
  duration = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
191
  seed = gr.Slider(minimum=1, maximum=100, value=42, step=1, label="Seed", interactive=True)
192
- variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()), value='meanaudio_s_full', interactive=True)
193
-
194
 
195
- # description_text = """
196
- # **MeanAudio** is a novel text-to-audio generator that uses **MeanFlow** to synthesize realistic and faithful audio in few sampling steps. It achieves state-of-the-art performance in single-step audio generation and delivers strong performance in multi-step audio generation.
197
-
198
- # <p align="center">
199
- # <a href="https://huggingface.co/AndreasXi/MeanAudio">
200
- # <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Model-HuggingFace-violet" alt="HuggingFace Model">
201
- # </a>
202
- # <a href="https://huggingface.co/spaces/chenxie95/MeanAudio">
203
- # <img src="https://img.shields.io/badge/%F0%9F%9A%80%20Space-HuggingFace-8A2BE2" alt="HuggingFace Space">
204
- # </a>
205
- # <a href="https://meanaudio.github.io/">
206
- # <img src="https://img.shields.io/badge/%F0%9F%93%84%20Project-Page-brightred" alt="Project Page">
207
- # </a>
208
- # <a href="https://github.com/xiquan-li/MeanAudio">
209
- # <img src="https://img.shields.io/badge/%F0%9F%92%BB%20Code-GitHub-black" alt="GitHub">
210
- # </a>
211
- # </p>
212
- # """
213
 
214
  description_text = """
215
  ### **MeanAudio** is a novel text-to-audio generator that uses **MeanFlow** to synthesize realistic and faithful audio in few sampling steps. It achieves state-of-the-art performance in single-step audio generation and delivers strong performance in multi-step audio generation.
 
127
 
128
  net.update_seq_lengths(seq_cfg.latent_seq_len)
129
 
130
+ if variant == 'meanaudio_s_ac' or variant == 'meanaudio_s_full' or variant == 'meanaudio_l_full':
131
  use_meanflow=True
132
  elif variant == 'fluxaudio_s_full':
133
  use_meanflow=False
 
184
 
185
  # Gradio input and output components
186
  input_text = gr.Textbox(lines=2, label="Prompt")
187
+ variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()), value='meanaudio_s_full', interactive=True)
188
  output_audio = gr.Audio(label="Generated Audio", type="filepath")
189
  denoising_steps = gr.Slider(minimum=1, maximum=25, value=1, step=1, label="Sampling Steps", interactive=True)
190
  cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.5, step=0.5, label="Guidance Scale", interactive=True)
191
  duration = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
192
  seed = gr.Slider(minimum=1, maximum=100, value=42, step=1, label="Seed", interactive=True)
 
 
193
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
  description_text = """
196
  ### **MeanAudio** is a novel text-to-audio generator that uses **MeanFlow** to synthesize realistic and faithful audio in few sampling steps. It achieves state-of-the-art performance in single-step audio generation and delivers strong performance in multi-step audio generation.
meanaudio/eval_utils.py CHANGED
@@ -58,11 +58,17 @@ meanaudio_s_ac = ModelConfig(model_name='meanaudio_s_ac',
58
  vae_path=Path('./weights/v1-16.pth'),
59
  bigvgan_16k_path=Path('./weights/best_netG.pt'),
60
  mode='16k')
 
 
 
 
 
61
 
62
  all_model_cfg: dict[str, ModelConfig] = {
63
- 'fluxaudio_s_full': fluxaudio_s_full,
64
  'meanaudio_s_full': meanaudio_s_full,
65
  'meanaudio_s_ac': meanaudio_s_ac,
 
66
  }
67
 
68
 
 
58
  vae_path=Path('./weights/v1-16.pth'),
59
  bigvgan_16k_path=Path('./weights/best_netG.pt'),
60
  mode='16k')
61
+ meanaudio_l_full = ModelConfig(model_name='meanaudio_l_full',
62
+ model_path=Path('./weights/meanaudio_l_full.pth'), # will be specified later
63
+ vae_path=Path('./weights/v1-16.pth'),
64
+ bigvgan_16k_path=Path('./weights/best_netG.pt'),
65
+ mode='16k')
66
 
67
  all_model_cfg: dict[str, ModelConfig] = {
68
+ 'meanaudio_l_full': meanaudio_l_full,
69
  'meanaudio_s_full': meanaudio_s_full,
70
  'meanaudio_s_ac': meanaudio_s_ac,
71
+ 'fluxaudio_s_full': fluxaudio_s_full,
72
  }
73
 
74
 
meanaudio/model/networks.py CHANGED
@@ -597,11 +597,22 @@ def meanaudio_s(**kwargs) -> MeanAudio:
597
  num_heads=num_heads,
598
  latent_seq_len=312, # for 10s audio
599
  **kwargs)
600
-
 
 
 
 
 
 
 
 
 
601
 
602
  def get_mean_audio(name: str, **kwargs) -> MeanAudio:
603
  if name == 'meanaudio_s_ac' or name == 'meanaudio_s_full':
604
  return meanaudio_s(**kwargs)
 
 
605
  elif name == 'fluxaudio_s_full':
606
  return fluxaudio_s(**kwargs)
607
  else:
 
597
  num_heads=num_heads,
598
  latent_seq_len=312, # for 10s audio
599
  **kwargs)
600
+ def meanaudio_l(**kwargs) -> MeanAudio:
601
+ num_heads = 14
602
+ return MeanAudio(latent_dim=20,
603
+ text_dim=1024,
604
+ hidden_dim=64 * num_heads,
605
+ depth=24,
606
+ fused_depth=16,
607
+ num_heads=num_heads,
608
+ latent_seq_len=312, # for 10s audio
609
+ **kwargs)
610
 
611
  def get_mean_audio(name: str, **kwargs) -> MeanAudio:
612
  if name == 'meanaudio_s_ac' or name == 'meanaudio_s_full':
613
  return meanaudio_s(**kwargs)
614
+ elif name == 'meanaudio_l_full':
615
+ return meanaudio_l(**kwargs)
616
  elif name == 'fluxaudio_s_full':
617
  return fluxaudio_s(**kwargs)
618
  else: