update
Browse files
app.py
CHANGED
@@ -0,0 +1,472 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import warnings
|
2 |
+
import spaces
|
3 |
+
warnings.filterwarnings("ignore", category=FutureWarning)
|
4 |
+
import logging
|
5 |
+
from argparse import ArgumentParser
|
6 |
+
from pathlib import Path
|
7 |
+
import torch
|
8 |
+
import torchaudio
|
9 |
+
import gradio as gr
|
10 |
+
from transformers import AutoModel
|
11 |
+
from meanaudio.eval_utils import (
|
12 |
+
ModelConfig,
|
13 |
+
all_model_cfg,
|
14 |
+
generate_mf,
|
15 |
+
generate_fm,
|
16 |
+
setup_eval_logging,
|
17 |
+
)
|
18 |
+
from meanaudio.model.flow_matching import FlowMatching
|
19 |
+
from meanaudio.model.mean_flow import MeanFlow
|
20 |
+
from meanaudio.model.networks import MeanAudio, get_mean_audio
|
21 |
+
from meanaudio.model.utils.features_utils import FeaturesUtils
|
22 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
23 |
+
torch.backends.cudnn.allow_tf32 = True
|
24 |
+
import gc
|
25 |
+
from datetime import datetime
|
26 |
+
from huggingface_hub import snapshot_download
|
27 |
+
# Root logger shared by every function in this module.
log = logging.getLogger()

# Prefer the GPU when torch can see one; otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
setup_eval_logging()

# Directory that receives the generated audio files.
OUTPUT_DIR = Path("./output/gradio")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Fetch remote assets once at import time.
snapshot_download(repo_id="google/flan-t5-large")

# These two models are loaded but never used anywhere in this file, so the
# results are deliberately not bound to module globals: keeping references
# would hold both models in memory for the lifetime of the process.
# NOTE(review): presumably these calls only warm the local Hugging Face cache
# for a downstream text encoder - confirm before removing them.
for _warm_repo in ("bert-base-uncased", "roberta-base"):
    AutoModel.from_pretrained(_warm_repo)

# Model checkpoints are placed under ./weights (only *.pt / *.pth files).
snapshot_download(
    repo_id="junxiliu/Meanaudio",
    local_dir="./weights",
    allow_patterns=["*.pt", "*.pth"],
)

# Cache of loaded model bundles, keyed by variant name.
current_model_states = {}
|
43 |
+
|
44 |
+
def load_model_if_needed(
    variant, model_path, encoder_name, use_rope, text_c_dim
):
    """Return the model bundle for ``variant``, (re)building it when needed.

    Bundles are cached in the module-level ``current_model_states`` dict,
    keyed by ``variant``. A bundle is rebuilt when nothing is cached for the
    variant or when any requested setting differs from the one it was built
    with.

    Args:
        variant: Key into ``all_model_cfg`` selecting the model configuration.
        model_path: Checkpoint path passed to ``torch.load``.
        encoder_name: Forwarded to ``FeaturesUtils`` as ``encoder_name``.
        use_rope: Forwarded to ``get_mean_audio``.
        text_c_dim: Forwarded to ``get_mean_audio``.

    Returns:
        Tuple ``(net, feature_utils, seq_cfg, args)``. ``args`` is a namespace
        with the attributes ``variant``, ``model_path``, ``encoder_name``,
        ``use_rope`` and ``text_c_dim`` the bundle was built with.

    Raises:
        ValueError: If ``variant`` is not a key of ``all_model_cfg``.
    """
    from types import SimpleNamespace

    dtype = torch.float32
    requested = (model_path, encoder_name, use_rope, text_c_dim)

    # Fast path: reuse the cached bundle when its settings match. The cache is
    # keyed by variant, so the variant itself never needs to be compared.
    cached = current_model_states.get(variant)
    if cached is not None:
        built_with = cached["args"]
        cached_settings = (
            built_with.model_path,
            built_with.encoder_name,
            built_with.use_rope,
            built_with.text_c_dim,
        )
        if cached_settings == requested:
            log.info(f"Model '{variant}' already loaded with current settings. Skipping reload.")
            return (
                cached["net"],
                cached["feature_utils"],
                cached["seq_cfg"],
                cached["args"],
            )

    log.info(f"Loading/reloading model '{variant}'.")
    if variant not in all_model_cfg:
        raise ValueError(f"Unknown model variant: {variant}")
    model: ModelConfig = all_model_cfg[variant]
    seq_cfg = model.seq_cfg

    # Record the settings alongside the bundle so later calls can detect
    # a change and trigger a rebuild.
    mock_args = SimpleNamespace(
        variant=variant,
        model_path=model_path,
        encoder_name=encoder_name,
        use_rope=use_rope,
        text_c_dim=text_c_dim,
    )

    # Build the network in eval mode on the target device, then load weights.
    net: MeanAudio = (
        get_mean_audio(
            model.model_name,
            use_rope=use_rope,
            text_c_dim=text_c_dim,
        )
        .to(device, dtype)
        .eval()
    )
    net.load_weights(
        torch.load(model_path, map_location=device, weights_only=True)
    )
    log.info(f"Loaded weights from {model_path}")

    feature_utils = FeaturesUtils(
        tod_vae_ckpt=model.vae_path,
        enable_conditions=True,
        encoder_name=encoder_name,
        mode=model.mode,
        bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
        need_vae_encoder=False,
    )
    feature_utils = feature_utils.to(device, dtype).eval()

    current_model_states[variant] = {
        "net": net,
        "feature_utils": feature_utils,
        "seq_cfg": seq_cfg,
        "args": mock_args,
    }
    log.info(f"Model '{variant}' loaded successfully.")

    return net, feature_utils, seq_cfg, mock_args
|
113 |
+
|
114 |
+
def initialize_all_default_models():
    """Load the default model variants into the cache at start-up.

    Calls ``load_model_if_needed`` for ``meanaudio_mf`` and ``fluxaudio_fm``
    with the checkpoint ``./weights/<variant>.pth`` and a fixed set of
    settings. Loading is best-effort: a failure for one variant is logged
    (with its traceback) and the remaining variants are still attempted.
    """
    log.info("Initializing default models...")
    default_models = ['meanaudio_mf', 'fluxaudio_fm']
    common_params = {
        "encoder_name": "t5_clap",
        "use_rope": True,
        "text_c_dim": 512,
    }
    for variant in default_models:
        model_path = f"./weights/{variant}.pth"
        try:
            load_model_if_needed(variant, model_path, **common_params)
        except Exception as e:
            # Start-up boundary: keep going, but preserve the traceback so the
            # cause of a missing model can be diagnosed from the logs.
            log.exception(f"Failed to initialize default model '{variant}': {e}")
        else:
            log.info(f"Default model '{variant}' initialized successfully.")


# Populate the model cache once, when the module is imported.
initialize_all_default_models()
|
135 |
+
|
136 |
+
@spaces.GPU(duration=10)
@torch.inference_mode()
def generate_audio_gradio(
    prompt,
    negative_prompt,
    duration,
    cfg_strength,
    num_steps,
    seed,
    variant,
):
    """Generate one audio clip for ``prompt`` and save it as a FLAC file.

    Args:
        prompt: Text prompt; also used to derive the output file name.
        negative_prompt: Passed to the generator as ``negative_text``.
        duration: Assigned to ``duration`` on a copy of the cached sequence
            config before the network's sequence lengths are updated.
        cfg_strength: Passed to the generator; replaced by ``3`` when
            ``variant`` is ``"meanaudio_mf"``.
        num_steps: Step count given to the sampler.
        seed: A non-negative value seeds the generator deterministically;
            a negative value or ``None`` (empty input field) picks a random
            seed.
        variant: Key into ``current_model_states``. ``"meanaudio_mf"`` uses
            the MeanFlow sampler, any other loaded variant uses FlowMatching.

    Returns:
        ``(status_message, file_path)`` on success, or ``(error_message, None)``
        when no model is cached for ``variant``.
    """
    model_state = current_model_states.get(variant)
    if model_state is None:
        error_msg = f"Error: Model '{variant}' is not available. It may not have been loaded correctly during startup."
        log.error(error_msg)
        return error_msg, None

    net = model_state["net"]
    feature_utils = model_state["feature_utils"]
    seq_cfg = model_state["seq_cfg"]

    # Work on a copy so the cached sequence config is never mutated.
    temp_seq_cfg = type(seq_cfg)(**seq_cfg.__dict__)
    temp_seq_cfg.duration = duration

    # NOTE(review): presumably latent_seq_len is derived from duration - confirm.
    net.update_seq_lengths(temp_seq_cfg.latent_seq_len)

    rng = torch.Generator(device=device)
    if seed is not None and seed >= 0:
        # manual_seed requires an int; the UI may deliver a float-typed number.
        rng.manual_seed(int(seed))
    else:
        rng.seed()

    use_meanflow = variant == "meanaudio_mf"
    if use_meanflow:
        sampler = MeanFlow(steps=num_steps)
        log.info("Using MeanFlow for generation.")
        generation_func = generate_mf
        sampler_arg_name = "mf"
        # The UI labels this field "CFG (Meanflow forced to 3)".
        cfg_strength = 3
    else:
        sampler = FlowMatching(
            min_sigma=0, inference_mode="euler", num_steps=num_steps
        )
        log.info("Using FlowMatching for generation.")
        generation_func = generate_fm
        sampler_arg_name = "fm"

    prompts = [prompt]
    audios = generation_func(
        prompts,
        negative_text=[negative_prompt],
        feature_utils=feature_utils,
        net=net,
        rng=rng,
        cfg_strength=cfg_strength,
        # The sampler keyword differs per generator ("mf" vs "fm").
        **{sampler_arg_name: sampler},
    )
    audio = audios.float().cpu()[0]

    # Build a file-system-friendly stem: keep alphanumerics, spaces and
    # underscores, turn spaces into underscores, cap at 50 characters.
    safe_prompt = (
        "".join(c for c in prompt if c.isalnum() or c in (" ", "_"))
        .rstrip()
        .replace(" ", "_")[:50]
    )
    # Microsecond timestamp keeps file names unique across requests.
    current_time_string = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    filename = f"{safe_prompt}_{current_time_string}.flac"
    save_path = OUTPUT_DIR / filename
    torchaudio.save(str(save_path), audio, temp_seq_cfg.sampling_rate)
    log.info(f"Audio saved to {save_path}")

    gc.collect()

    return (
        f"Generated audio for prompt: '{prompt}' using {'MeanFlow' if use_meanflow else 'FlowMatching'}",
        str(save_path),
    )
|
221 |
+
|
222 |
+
# Base theme: Gradio's "Soft" preset with a blue/slate palette and compact sizing.
theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
    neutral_hue="slate",
    text_size="sm",
    spacing_size="sm",
)

# Overrides applied on top of the preset: shared light backgrounds,
# zero-width block/panel borders, and custom input and button fills.
theme = theme.set(
    background_fill_primary="*neutral_50",
    background_fill_secondary="*background_fill_primary",
    block_background_fill="*background_fill_primary",
    block_border_width="0px",
    panel_background_fill="*neutral_50",
    panel_border_width="0px",
    input_background_fill="*neutral_100",
    input_border_color="*neutral_200",
    button_primary_background_fill="*primary_300",
    button_primary_background_fill_hover="*primary_400",
    button_secondary_background_fill="*neutral_200",
    button_secondary_background_fill_hover="*neutral_300",
)
|
242 |
+
custom_css = """
|
243 |
+
#main-headertitle {
|
244 |
+
text-align: center;
|
245 |
+
margin-top: 15px;
|
246 |
+
margin-bottom: 10px;
|
247 |
+
color: var(--neutral-600);
|
248 |
+
font-weight: 600;
|
249 |
+
}
|
250 |
+
#main-header {
|
251 |
+
text-align: center;
|
252 |
+
margin-top: 5px;
|
253 |
+
margin-bottom: 10px;
|
254 |
+
color: var(--neutral-600);
|
255 |
+
font-weight: 600;
|
256 |
+
}
|
257 |
+
#model-settings-header, #generation-settings-header {
|
258 |
+
color: var(--neutral-600);
|
259 |
+
margin-top: 8px;
|
260 |
+
margin-bottom: 8px;
|
261 |
+
font-weight: 500;
|
262 |
+
font-size: 1.1em;
|
263 |
+
}
|
264 |
+
.setting-section {
|
265 |
+
padding: 10px 12px;
|
266 |
+
border-radius: 6px;
|
267 |
+
background-color: var(--neutral-50);
|
268 |
+
margin-bottom: 10px;
|
269 |
+
border: 1px solid var(--neutral-100);
|
270 |
+
}
|
271 |
+
hr {
|
272 |
+
border: none;
|
273 |
+
height: 1px;
|
274 |
+
background-color: var(--neutral-200);
|
275 |
+
margin: 8px 0;
|
276 |
+
}
|
277 |
+
#generate-btn {
|
278 |
+
width: 100%;
|
279 |
+
max-width: 250px;
|
280 |
+
margin: 10px auto;
|
281 |
+
display: block;
|
282 |
+
padding: 10px 15px;
|
283 |
+
font-size: 16px;
|
284 |
+
border-radius: 5px;
|
285 |
+
}
|
286 |
+
#status-box {
|
287 |
+
min-height: 50px;
|
288 |
+
display: flex;
|
289 |
+
align-items: center;
|
290 |
+
justify-content: center;
|
291 |
+
padding: 8px;
|
292 |
+
border-radius: 5px;
|
293 |
+
border: 1px solid var(--neutral-200);
|
294 |
+
color: var(--neutral-700);
|
295 |
+
}
|
296 |
+
#project-badges {
|
297 |
+
text-align: center;
|
298 |
+
margin-top: 30px;
|
299 |
+
margin-bottom: 20px;
|
300 |
+
}
|
301 |
+
#project-badges #badge-container {
|
302 |
+
display: flex;
|
303 |
+
gap: 10px;
|
304 |
+
align-items: center;
|
305 |
+
justify-content: center;
|
306 |
+
flex-wrap: wrap;
|
307 |
+
}
|
308 |
+
#project-badges img {
|
309 |
+
border-radius: 5px;
|
310 |
+
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
|
311 |
+
height: 20px;
|
312 |
+
transition: transform 0.1s ease, box-shadow 0.1s ease;
|
313 |
+
}
|
314 |
+
#project-badges a:hover img {
|
315 |
+
transform: translateY(-2px);
|
316 |
+
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.15);
|
317 |
+
}
|
318 |
+
#audio-output {
|
319 |
+
height: 200px;
|
320 |
+
border-radius: 5px;
|
321 |
+
border: 1px solid var(--neutral-200);
|
322 |
+
}
|
323 |
+
.gradio-dropdown label, .gradio-checkbox label, .gradio-number label, .gradio-textbox label {
|
324 |
+
font-weight: 500;
|
325 |
+
color: var(--neutral-700);
|
326 |
+
font-size: 0.9em;
|
327 |
+
}
|
328 |
+
.gradio-row {
|
329 |
+
gap: 8px;
|
330 |
+
}
|
331 |
+
.gradio-block {
|
332 |
+
margin-bottom: 8px;
|
333 |
+
}
|
334 |
+
.setting-section .gradio-block {
|
335 |
+
margin-bottom: 6px;
|
336 |
+
}
|
337 |
+
::-webkit-scrollbar {
|
338 |
+
width: 8px;
|
339 |
+
height: 8px;
|
340 |
+
}
|
341 |
+
::-webkit-scrollbar-track {
|
342 |
+
background: var(--neutral-100);
|
343 |
+
border-radius: 4px;
|
344 |
+
}
|
345 |
+
::-webkit-scrollbar-thumb {
|
346 |
+
background: var(--neutral-300);
|
347 |
+
border-radius: 4px;
|
348 |
+
}
|
349 |
+
::-webkit-scrollbar-thumb:hover {
|
350 |
+
background: var(--neutral-400);
|
351 |
+
}
|
352 |
+
* {
|
353 |
+
scrollbar-width: thin;
|
354 |
+
scrollbar-color: var(--neutral-300) var(--neutral-100);
|
355 |
+
}
|
356 |
+
"""
|
357 |
+
with gr.Blocks(title="MeanAudio Generator", theme=theme, css=custom_css) as demo:
    # --- Header: page title followed by the project badge links ---
    gr.Markdown("# MeanAudio:Fast and Faithful Text-to-Audio Generation with Mean Flows", elem_id="main-header")

    project_badges_markdown = '''
<div style="display: flex; gap: 10px; align-items: center; justify-content: center; flex-wrap: wrap; margin-bottom: 20px;">
<a href="https://huggingface.co/junxiliu/MeanAudio">
<img src="https://img.shields.io/badge/Model-HuggingFace-violet?logo=huggingface" alt="Hugging Face Model">
</a>
<a href="https://huggingface.co/spaces/chenxie95/MeanAudio">
<img src="https://img.shields.io/badge/Space-HuggingFace-8A2BE2?logo=huggingface" alt="Hugging Face Space">
</a>
<a href="https://meanaudio.github.io/">
<img src="https://img.shields.io/badge/Project-Page-brightred?style=flat" alt="Project Page">
</a>
<a href="https://github.com/xiquan-li/MeanAudio">
<img src="https://img.shields.io/badge/Code-GitHub-black?logo=github" alt="GitHub">
</a>
</div>
'''

    gr.Markdown(project_badges_markdown, elem_id="project-badges")

    # --- Model selection ---
    with gr.Column(elem_classes="setting-section"):
        with gr.Row():
            # Every configured variant is offered; an empty/missing config
            # yields an empty choice list.
            available_variants = list(all_model_cfg or ())
            default_variant = 'meanaudio_mf'
            variant = gr.Dropdown(
                label="Model Variant",
                choices=available_variants,
                value=default_variant,
                interactive=True,
                scale=3,
            )

    # --- Generation settings ---
    with gr.Column(elem_classes="setting-section"):
        with gr.Row():
            prompt = gr.Textbox(
                label="Prompt",
                placeholder="Describe the sound you want to generate...",
                scale=1,
            )
            negative_prompt = gr.Textbox(
                label="Negative Prompt",
                placeholder="Describe sounds you want to avoid...",
                value="",
                scale=1,
            )
        with gr.Row():
            duration = gr.Number(
                label="Duration (sec)", value=10.0, minimum=0.1, scale=1
            )
            cfg_strength = gr.Number(
                label="CFG (Meanflow forced to 3)", value=3, minimum=0.0, scale=1
            )
        with gr.Row():
            seed = gr.Number(
                label="Seed (-1 for random)", value=42, precision=0, scale=1
            )
            num_steps = gr.Number(
                label="Number of Steps",
                value=1,
                precision=0,
                minimum=1,
                scale=1,
            )

    # --- Action button and result widgets ---
    generate_button = gr.Button("Generate", variant="primary", elem_id="generate-btn")
    generate_output_text = gr.Textbox(
        label="Result Status", interactive=False, elem_id="status-box"
    )
    audio_output = gr.Audio(
        label="Generated Audio", type="filepath", elem_id="audio-output"
    )

    # Component order mirrors the positional parameters of
    # generate_audio_gradio and the columns of each example row below.
    generation_inputs = [
        prompt,
        negative_prompt,
        duration,
        cfg_strength,
        num_steps,
        seed,
        variant,
    ]
    generation_outputs = [generate_output_text, audio_output]

    generate_button.click(
        fn=generate_audio_gradio,
        inputs=list(generation_inputs),
        outputs=list(generation_outputs),
    )

    # Columns: prompt, negative prompt, duration, CFG, steps, seed, variant.
    audio_examples = [
        ["A speech and gunfire followed by a gun being loaded", "", 10.0, 3, 1, 42, "meanaudio_mf"],
        ["Typing on a keyboard", "", 10.0, 3, 1, 42, "meanaudio_mf"],
        ["A man speaks followed by a popping noise and laughter", "", 10.0, 3, 2, 42, "meanaudio_mf"],
        ["Some humming followed by a toilet flushing", "", 10.0, 3, 2, 42, "meanaudio_mf"],
        ["Rain falling on a hard surface as thunder roars in the distance", "", 10.0, 3, 5, 42, "meanaudio_mf"],
        ["Food sizzling and oil popping", "", 10.0, 3, 25, 42, "meanaudio_mf"],
        ["Pots and dishes clanking as a man talks followed by liquid pouring into a container", "", 8.0, 3, 2, 42, "meanaudio_mf"],
        ["A few seconds of silence then a rasping sound against wood", "", 12.0, 3, 2, 42, "meanaudio_mf"],
        ["A man speaks as he gives a speech and then the crowd cheers", "", 10.0, 3, 25, 42, "fluxaudio_fm"],
        ["A goat bleating repeatedly", "", 10.0, 3, 50, 123, "fluxaudio_fm"],
        ["Tires squealing followed by an engine revving", "", 12.0, 4, 25, 456, "fluxaudio_fm"],
        ["Hammer slowly hitting the wooden table", "", 10.0, 3.5, 25, 42, "fluxaudio_fm"],
        ["Dog barking excitedly and man shouting as race car engine roars past", "", 10.0, 3, 1, 42, "meanaudio_mf"],
        ["A dog barking and a cat mewing and a racing car passes by", "", 12.0, 3, 5, -1, "meanaudio_mf"],
        ["Whistling with birds chirping", "", 10.0, 4, 50, 42, "fluxaudio_fm"],
    ]
    gr.Examples(
        examples=audio_examples,
        inputs=list(generation_inputs),
        outputs=list(generation_outputs),
        fn=generate_audio_gradio,
        examples_per_page=5,
        label="Example Prompts",
    )

# Start the web server only when executed as a script.
if __name__ == "__main__":
    demo.launch()
|