update meanaudio_l_full
Browse files
- app.py +2 -21
- meanaudio/eval_utils.py +7 -1
- meanaudio/model/networks.py +12 -1
app.py
CHANGED
@@ -127,7 +127,7 @@ def generate_audio_gradio(
|
|
127 |
|
128 |
net.update_seq_lengths(seq_cfg.latent_seq_len)
|
129 |
|
130 |
-
if variant == 'meanaudio_s_ac' or variant == 'meanaudio_s_full':
|
131 |
use_meanflow=True
|
132 |
elif variant == 'fluxaudio_s_full':
|
133 |
use_meanflow=False
|
@@ -184,32 +184,13 @@ def generate_audio_gradio(
|
|
184 |
|
185 |
# Gradio input and output components
|
186 |
input_text = gr.Textbox(lines=2, label="Prompt")
|
|
|
187 |
output_audio = gr.Audio(label="Generated Audio", type="filepath")
|
188 |
denoising_steps = gr.Slider(minimum=1, maximum=25, value=1, step=1, label="Sampling Steps", interactive=True)
|
189 |
cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.5, step=0.5, label="Guidance Scale", interactive=True)
|
190 |
duration = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
|
191 |
seed = gr.Slider(minimum=1, maximum=100, value=42, step=1, label="Seed", interactive=True)
|
192 |
-
variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()), value='meanaudio_s_full', interactive=True)
|
193 |
-
|
194 |
|
195 |
-
# description_text = """
|
196 |
-
# **MeanAudio** is a novel text-to-audio generator that uses **MeanFlow** to synthesize realistic and faithful audio in few sampling steps. It achieves state-of-the-art performance in single-step audio generation and delivers strong performance in multi-step audio generation.
|
197 |
-
|
198 |
-
# <p align="center">
|
199 |
-
# <a href="https://huggingface.co/AndreasXi/MeanAudio">
|
200 |
-
# <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Model-HuggingFace-violet" alt="HuggingFace Model">
|
201 |
-
# </a>
|
202 |
-
# <a href="https://huggingface.co/spaces/chenxie95/MeanAudio">
|
203 |
-
# <img src="https://img.shields.io/badge/%F0%9F%9A%80%20Space-HuggingFace-8A2BE2" alt="HuggingFace Space">
|
204 |
-
# </a>
|
205 |
-
# <a href="https://meanaudio.github.io/">
|
206 |
-
# <img src="https://img.shields.io/badge/%F0%9F%93%84%20Project-Page-brightred" alt="Project Page">
|
207 |
-
# </a>
|
208 |
-
# <a href="https://github.com/xiquan-li/MeanAudio">
|
209 |
-
# <img src="https://img.shields.io/badge/%F0%9F%92%BB%20Code-GitHub-black" alt="GitHub">
|
210 |
-
# </a>
|
211 |
-
# </p>
|
212 |
-
# """
|
213 |
|
214 |
description_text = """
|
215 |
### **MeanAudio** is a novel text-to-audio generator that uses **MeanFlow** to synthesize realistic and faithful audio in few sampling steps. It achieves state-of-the-art performance in single-step audio generation and delivers strong performance in multi-step audio generation.
|
|
|
127 |
|
128 |
net.update_seq_lengths(seq_cfg.latent_seq_len)
|
129 |
|
130 |
+
if variant == 'meanaudio_s_ac' or variant == 'meanaudio_s_full' or variant == 'meanaudio_l_full':
|
131 |
use_meanflow=True
|
132 |
elif variant == 'fluxaudio_s_full':
|
133 |
use_meanflow=False
|
|
|
184 |
|
185 |
# Gradio input and output components
|
186 |
input_text = gr.Textbox(lines=2, label="Prompt")
|
187 |
+
variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()), value='meanaudio_s_full', interactive=True)
|
188 |
output_audio = gr.Audio(label="Generated Audio", type="filepath")
|
189 |
denoising_steps = gr.Slider(minimum=1, maximum=25, value=1, step=1, label="Sampling Steps", interactive=True)
|
190 |
cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.5, step=0.5, label="Guidance Scale", interactive=True)
|
191 |
duration = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
|
192 |
seed = gr.Slider(minimum=1, maximum=100, value=42, step=1, label="Seed", interactive=True)
|
|
|
|
|
193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
194 |
|
195 |
description_text = """
|
196 |
### **MeanAudio** is a novel text-to-audio generator that uses **MeanFlow** to synthesize realistic and faithful audio in few sampling steps. It achieves state-of-the-art performance in single-step audio generation and delivers strong performance in multi-step audio generation.
|
meanaudio/eval_utils.py
CHANGED
@@ -58,11 +58,17 @@ meanaudio_s_ac = ModelConfig(model_name='meanaudio_s_ac',
|
|
58 |
vae_path=Path('./weights/v1-16.pth'),
|
59 |
bigvgan_16k_path=Path('./weights/best_netG.pt'),
|
60 |
mode='16k')
|
|
|
|
|
|
|
|
|
|
|
61 |
|
62 |
all_model_cfg: dict[str, ModelConfig] = {
|
63 |
-
'
|
64 |
'meanaudio_s_full': meanaudio_s_full,
|
65 |
'meanaudio_s_ac': meanaudio_s_ac,
|
|
|
66 |
}
|
67 |
|
68 |
|
|
|
58 |
vae_path=Path('./weights/v1-16.pth'),
|
59 |
bigvgan_16k_path=Path('./weights/best_netG.pt'),
|
60 |
mode='16k')
|
61 |
+
meanaudio_l_full = ModelConfig(model_name='meanaudio_l_full',
|
62 |
+
model_path=Path('./weights/meanaudio_l_full.pth'), # will be specified later
|
63 |
+
vae_path=Path('./weights/v1-16.pth'),
|
64 |
+
bigvgan_16k_path=Path('./weights/best_netG.pt'),
|
65 |
+
mode='16k')
|
66 |
|
67 |
all_model_cfg: dict[str, ModelConfig] = {
|
68 |
+
'meanaudio_l_full': meanaudio_l_full,
|
69 |
'meanaudio_s_full': meanaudio_s_full,
|
70 |
'meanaudio_s_ac': meanaudio_s_ac,
|
71 |
+
'fluxaudio_s_full': fluxaudio_s_full,
|
72 |
}
|
73 |
|
74 |
|
meanaudio/model/networks.py
CHANGED
@@ -597,11 +597,22 @@ def meanaudio_s(**kwargs) -> MeanAudio:
|
|
597 |
num_heads=num_heads,
|
598 |
latent_seq_len=312, # for 10s audio
|
599 |
**kwargs)
|
600 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
601 |
|
602 |
def get_mean_audio(name: str, **kwargs) -> MeanAudio:
|
603 |
if name == 'meanaudio_s_ac' or name == 'meanaudio_s_full':
|
604 |
return meanaudio_s(**kwargs)
|
|
|
|
|
605 |
elif name == 'fluxaudio_s_full':
|
606 |
return fluxaudio_s(**kwargs)
|
607 |
else:
|
|
|
597 |
num_heads=num_heads,
|
598 |
latent_seq_len=312, # for 10s audio
|
599 |
**kwargs)
|
600 |
+
def meanaudio_l(**kwargs) -> MeanAudio:
|
601 |
+
num_heads = 14
|
602 |
+
return MeanAudio(latent_dim=20,
|
603 |
+
text_dim=1024,
|
604 |
+
hidden_dim=64 * num_heads,
|
605 |
+
depth=24,
|
606 |
+
fused_depth=16,
|
607 |
+
num_heads=num_heads,
|
608 |
+
latent_seq_len=312, # for 10s audio
|
609 |
+
**kwargs)
|
610 |
|
611 |
def get_mean_audio(name: str, **kwargs) -> MeanAudio:
|
612 |
if name == 'meanaudio_s_ac' or name == 'meanaudio_s_full':
|
613 |
return meanaudio_s(**kwargs)
|
614 |
+
elif name == 'meanaudio_l_full':
|
615 |
+
return meanaudio_l(**kwargs)
|
616 |
elif name == 'fluxaudio_s_full':
|
617 |
return fluxaudio_s(**kwargs)
|
618 |
else:
|