Spaces:

chenxie95
/

MeanAudio

Running on Zero

App Files Community

junxiliu committed 23 days ago

Commit

fec53ba

1 Parent(s): 93f6a95

update app.py

Browse files

Files changed (3) hide show

MeanAudio.py +0 -147
app.py +19 -5
easyinfer.py +0 -3

MeanAudio.py DELETED Viewed

@@ -1,147 +0,0 @@
-import warnings
-warnings.filterwarnings("ignore", category=FutureWarning)
-import logging
-from pathlib import Path
-import torch
-import torchaudio
-from meanaudio.eval_utils import (ModelConfig, all_model_cfg, generate_mf, generate_fm, setup_eval_logging)
-from meanaudio.model.flow_matching import FlowMatching
-from meanaudio.model.mean_flow import MeanFlow
-from meanaudio.model.networks import MeanAudio, get_mean_audio
-from meanaudio.model.utils.features_utils import FeaturesUtils
-from huggingface_hub import snapshot_download
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cudnn.allow_tf32 = True
-log = logging.getLogger()
-@torch.inference_mode()
-def MeanAudioInference(
-    prompt='',
-    negative_prompt='',
-    model_path='',
-    encoder_name='t5_clap',
-    variant='meanaudio_mf',
-    duration=10,
-    cfg_strength=4.5,
-    num_steps=1,
-    output='./output',
-    seed=42,
-    full_precision=False,
-    use_rope=True,
-    text_c_dim=512,
-    use_meanflow=False
-):
-    '''
-    prompt (str):
-        The text description guiding the audio generation (e.g., "a dog is barking").
-    negative_prompt (str):
-        A text description for sounds that should be avoided in the generated audio.
-    model_path (str):
-        Path to the model weights file. If empty, it defaults to ./weights/{variant}.pth.
-    encoder_name (str):
-        Specifies the text encoder to use (default: 't5_clap').
-    variant (str):
-        Specifies the model variant to load (default: 'meanaudio_mf'). Must be a key in all_model_cfg.
-    duration (int):
-        The desired duration of the generated audio in seconds (default: 10).
-    cfg_strength (float):
-        Classifier-Free Guidance strength. Ignored if use_meanflow is True or variant is 'meanaudio_mf' (default: 4.5).
-    num_steps (int):
-        Number of steps for the generation process (default: 1).
-    output (str):
-        Directory path where the generated audio file will be saved (default: './output').
-    seed (int):
-        Random seed for generation reproducibility (default: 42).
-    full_precision (bool):
-        If True, uses torch.float32 precision; otherwise, uses torch.bfloat16 (default: False).
-    use_rope (bool):
-        Whether to use Rotary Position Embedding in the model (default: True).
-    text_c_dim (int):
-        Dimension of the text context vector (default: 512).
-    use_meanflow (bool):
-        If True, uses the MeanFlow generation method; otherwise, uses FlowMatching. If variant is 'meanaudio_mf', this is automatically set to True (default: False).
-    '''
-    setup_eval_logging()
-    output_dir = Path(output).expanduser()
-    output_dir.mkdir(parents=True, exist_ok=True)
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    dtype = torch.float32 if full_precision else torch.bfloat16
-    if duration <= 0 or num_steps <= 0:
-        raise ValueError("Duration and number of steps must be positive.")
-    if variant not in all_model_cfg:
-         raise ValueError(f"Unknown model variant: {variant}. Available: {list(all_model_cfg.keys())}")
-    if not model_path or model_path == '':
-        model_path = Path(f'./weights/{variant}.pth')
-    else:
-        model_path = Path(model_path)
-    if not model_path.exists():
-        if str(model_path) == f'./weights/{variant}.pth':
-            log.info(f'Model not found at {model_path}')
-            log.info('Downloading models to "./weights/"...')
-            try:
-                weights_dir = Path('./weights')
-                weights_dir.mkdir(exist_ok=True)
-                snapshot_download(repo_id="junxiliu/Meanaudio", local_dir="./weights",allow_patterns=["*.pt", "*.pth"] )
-                raise NotImplementedError("Model download functionality needs to be implemented")
-            except Exception as e:
-                log.error(f"Failed to download model: {e}")
-                raise FileNotFoundError(f"Model file not found and download failed: {model_path}")
-        else:
-            raise FileNotFoundError(f"Model file not found: {model_path}")
-    model = all_model_cfg[variant]
-    seq_cfg = model.seq_cfg
-    seq_cfg.duration = duration
-    net = get_mean_audio(model.model_name, use_rope=use_rope, text_c_dim=text_c_dim)
-    net = net.to(device, dtype).eval()
-    net.load_weights(torch.load(model_path, map_location=device, weights_only=True))
-    net.update_seq_lengths(seq_cfg.latent_seq_len)
-    if variant=='meanaudio_mf':
-        use_meanflow=True
-    if use_meanflow:
-        generation_func = MeanFlow(steps=num_steps)
-        cfg_strength=0
-    else:
-        generation_func = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
-    feature_utils = FeaturesUtils(
-        tod_vae_ckpt=model.vae_path,
-        enable_conditions=True,
-        encoder_name=encoder_name,
-        mode=model.mode,
-        bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
-        need_vae_encoder=False
-    )
-    feature_utils = feature_utils.to(device, dtype).eval()
-    rng = torch.Generator(device=device)
-    rng.manual_seed(seed)
-    generate_fn = generate_mf if use_meanflow else generate_fm
-    kwargs = {
-        'negative_text': [negative_prompt],
-        'feature_utils': feature_utils,
-        'net': net,
-        'rng': rng,
-        'cfg_strength': cfg_strength
-    }
-    if use_meanflow:
-        kwargs['mf'] = generation_func
-    else:
-        kwargs['fm'] = generation_func
-    audios = generate_fn([prompt], **kwargs)
-    audio = audios.float().cpu()[0]
-    safe_filename = prompt.replace(' ', '_').replace('/', '_').replace('.', '')
-    save_path = output_dir / f'{safe_filename}--numsteps{num_steps}--seed{seed}.wav'
-    torchaudio.save(save_path, audio, seq_cfg.sampling_rate)
-    log.info(f'Audio saved to {save_path}')
-    log.info('Memory usage: %.2f GB', torch.cuda.max_memory_allocated() / (2**30))
-    return save_path
-if __name__ == '__main__':
-    MeanAudioInference('a dog is barking')

app.py CHANGED Viewed

@@ -8,6 +8,7 @@ import torch
 import torchaudio
 import gradio as gr
 from transformers import AutoModel
 from meanaudio.eval_utils import (
     ModelConfig,
     all_model_cfg,
@@ -31,12 +32,15 @@ if torch.cuda.is_available():
 setup_eval_logging()
 OUTPUT_DIR = Path("./output/gradio")
 OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
 snapshot_download(repo_id="google/flan-t5-large")
 a=AutoModel.from_pretrained('bert-base-uncased')
 b=AutoModel.from_pretrained('roberta-base')
 snapshot_download(repo_id="junxiliu/Meanaudio", local_dir="./weights",allow_patterns=["*.pt", "*.pth"] )
 current_model_states = {
 }
@@ -190,16 +194,26 @@ def generate_audio_gradio(
         generation_func = generate_fm
         sampler_arg_name = "fm"
-    prompts = [prompt]
     audios = generation_func(
-        prompts,
-        negative_text=[negative_prompt],
         feature_utils=feature_utils,
         net=net,
         rng=rng,
         cfg_strength=cfg_strength,
         **{sampler_arg_name: sampler},
     )
     audio = audios.float().cpu()[0]
     safe_prompt = (
         "".join(c for c in prompt if c.isalnum() or c in (" ", "_"))

 import torchaudio
 import gradio as gr
 from transformers import AutoModel
+import laion_clap
 from meanaudio.eval_utils import (
     ModelConfig,
     all_model_cfg,
 setup_eval_logging()
 OUTPUT_DIR = Path("./output/gradio")
 OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+NUM_SAMPLE=8
 snapshot_download(repo_id="google/flan-t5-large")
 a=AutoModel.from_pretrained('bert-base-uncased')
 b=AutoModel.from_pretrained('roberta-base')
 snapshot_download(repo_id="junxiliu/Meanaudio", local_dir="./weights",allow_patterns=["*.pt", "*.pth"] )
+_clap_ckpt_path='./weights/music_speech_audioset_epoch_15_esc_89.98.pt'
+laion_clap_model = laion_clap.CLAP_Module(enable_fusion=False,
+                                              amodel='HTSAT-base').cuda().eval()
+laion_clap_model.load_ckpt(_clap_ckpt_path, verbose=False)
 current_model_states = {
 }
         generation_func = generate_fm
         sampler_arg_name = "fm"
     audios = generation_func(
+        [prompt]*NUM_SAMPLE,
+        negative_text=[negative_prompt]*NUM_SAMPLE,
         feature_utils=feature_utils,
         net=net,
         rng=rng,
         cfg_strength=cfg_strength,
         **{sampler_arg_name: sampler},
     )
+    for i in range(NUM_SAMPLE):
+        audio = audios.float().cpu()[i]
+        text_embed = laion_clap_model.get_text_embedding(prompt, use_tensor=True).squeeze()
+        audio_embed = laion_clap_model.get_audio_embedding_from_data(audio, use_tensor=True).squeeze()
+        score = torch.cosine_similarity(text_embed,
+                                    audio_embed,
+                                    dim=-1).mean()
+        all_audios.append(audio)
+        all_scores.append(score)
+    winner_idx = torch.argmax(torch.tensor(all_scores)).item()
+    audio=all_audios[winner_idx]
     audio = audios.float().cpu()[0]
     safe_prompt = (
         "".join(c for c in prompt if c.isalnum() or c in (" ", "_"))

easyinfer.py DELETED Viewed

@@ -1,3 +0,0 @@
-from MeanAudio import MeanAudioInference
-audio_path=MeanAudioInference('a dog is barking')
-print(audio_path)