MusicGen

Paused

App Files Files Community

ford442 committed on Feb 19

Commit

dca097b

verified ·

1 Parent(s): 881ee4d

Update demos/musicgen_app.py

Browse files

Files changed (1) hide show

demos/musicgen_app.py +13 -47

demos/musicgen_app.py CHANGED Viewed

@@ -8,11 +8,9 @@ import sys
 import time
 import typing as tp
 from tempfile import NamedTemporaryFile, gettempdir
 from einops import rearrange
 import torch
 import gradio as gr
 from audiocraft.data.audio_utils import convert_audio
 from audiocraft.data.audio import audio_write
 from audiocraft.models.encodec import InterleaveStereoCompressionModel
@@ -20,17 +18,26 @@ from audiocraft.models import MusicGen, MultiBandDiffusion
 import multiprocessing as mp
 import warnings
-# --- Utility Functions and Classes ---
 class FileCleaner:
     def __init__(self, file_lifetime: float = 3600):
         self.file_lifetime = file_lifetime
         self.files = []
     def add(self, path: tp.Union[str, Path]):
         self._cleanup()
         self.files.append((time.time(), Path(path)))
     def _cleanup(self):
         now = time.time()
         for time_added, path in list(self.files):
@@ -40,9 +47,9 @@ class FileCleaner:
                 self.files.pop(0)
             else:
                 break
 file_cleaner = FileCleaner()
 def convert_wav_to_mp4(wav_path, output_path=None):
     """Converts a WAV file to a waveform MP4 video using ffmpeg."""
     if output_path is None:
@@ -62,19 +69,14 @@ def convert_wav_to_mp4(wav_path, output_path=None):
             "-preset", "fast", # Important, don't do veryslow.
             str(output_path),
         ]
         process = sp.run(command, capture_output=True, text=True, check=True)
         return str(output_path)
     except sp.CalledProcessError as e:
         print(f"Error in ffmpeg conversion: {e}")
         print(f"ffmpeg stdout: {e.stdout}")
         print(f"ffmpeg stderr: {e.stderr}")
         raise  # Re-raise the exception to be caught by Gradio
-# --- Worker Process ---
 def model_worker(model_name: str, task_queue: mp.Queue, result_queue: mp.Queue):
     """
     Persistent worker process (used when NOT running as a daemon).
@@ -83,14 +85,11 @@ def model_worker(model_name: str, task_queue: mp.Queue, result_queue: mp.Queue):
         device = 'cuda' if torch.cuda.is_available() else 'cpu'
         model = MusicGen.get_pretrained(model_name, device=device)
         mbd = MultiBandDiffusion.get_mbd_musicgen(device=device)
         while True:
             task = task_queue.get()
             if task is None:
                 break
             task_id, text, melody, duration, use_diffusion, gen_params = task
             try:
                 model.set_generation_params(duration=duration, **gen_params)
                 target_sr = model.sample_rate
@@ -103,7 +102,6 @@ def model_worker(model_name: str, task_queue: mp.Queue, result_queue: mp.Queue):
                         melody_tensor = melody_tensor.unsqueeze(0)
                     melody_tensor = melody_tensor[..., :int(sr * duration)]
                     processed_melody = convert_audio(melody_tensor, sr, target_sr, target_ac)
                 if processed_melody is not None:
                     output, tokens = model.generate_with_chroma(
                         descriptions=[text],
@@ -114,9 +112,7 @@ def model_worker(model_name: str, task_queue: mp.Queue, result_queue: mp.Queue):
                     )
                 else:
                     output, tokens = model.generate([text], progress=True, return_tokens=True)
                 output = output.detach().cpu()
                 if use_diffusion:
                     if isinstance(model.compression_model, InterleaveStereoCompressionModel):
                         left, right = model.compression_model.get_left_right_codes(tokens)
@@ -129,16 +125,11 @@ def model_worker(model_name: str, task_queue: mp.Queue, result_queue: mp.Queue):
                     result_queue.put((task_id, (output, outputs_diffusion)))
                 else:
                     result_queue.put((task_id, (output, None)))
             except Exception as e:
                 result_queue.put((task_id, e))
     except Exception as e:
         result_queue.put((-1, e))
-# --- Predictor Class (Modified for conditional process creation) ---
 class Predictor:
     def __init__(self, model_name: str):
         self.model_name = model_name
@@ -190,7 +181,6 @@ class Predictor:
                         melody_tensor = melody_tensor.unsqueeze(0)
                     melody_tensor = melody_tensor[..., :int(sr * duration)]
                     processed_melody = convert_audio(melody_tensor, sr, target_sr, target_ac)
                 if processed_melody is not None:
                     output, tokens = self.model.generate_with_chroma(
                         descriptions=[text],
@@ -201,9 +191,7 @@ class Predictor:
                     )
                 else:
                     output, tokens = self.model.generate([text], progress=True, return_tokens=True)
                 output = output.detach().cpu()
                 if use_diffusion:
                     if isinstance(self.model.compression_model, InterleaveStereoCompressionModel):
                         left, right = self.model.compression_model.get_left_right_codes(tokens)
@@ -216,11 +204,8 @@ class Predictor:
                     return task_id, (output, outputs_diffusion) #Return the task id.
                 else:
                   return task_id, (output, None)
             except Exception as e:
                 return task_id, e
         else:
             # Use the multiprocessing queue (multi-process mode)
             self.current_task_id += 1
@@ -239,7 +224,6 @@ class Predictor:
                 result_task_id, result = self.result_queue.get()
                 if result_task_id == task_id:
                     break  # Found the correct result
         if isinstance(result, Exception):
             raise result
         return result
@@ -250,14 +234,12 @@ class Predictor:
             self.task_queue.put(None)
             self.process.join()
 _default_model_name = "facebook/musicgen-melody"
 @spaces.GPU(duration=90)  # Use the decorator for Spaces
 def predict_full(model, model_path, use_mbd, text, melody, duration, topk, topp, temperature, cfg_coef):
     # Initialize Predictor *INSIDE* the function
     predictor = Predictor(model)
     task_id, (wav, diffusion_wav) = predictor.predict( # Unpack directly!
         text=text,
         melody=melody,
@@ -268,11 +250,9 @@ def predict_full(model, model_path, use_mbd, text, melody, duration, topk, topp,
         temperature=temperature,
         cfg_coef=cfg_coef,
     )
     # Save and return audio files
     wav_paths = []
     video_paths = []
     # Save standard output
     with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
         audio_write(
@@ -285,8 +265,6 @@ def predict_full(model, model_path, use_mbd, text, melody, duration, topk, topp,
         video_paths.append(video_path)
         file_cleaner.add(file.name)
         file_cleaner.add(video_path)
     # Save MBD output if used
     if diffusion_wav is not None:
         with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
@@ -300,31 +278,24 @@ def predict_full(model, model_path, use_mbd, text, melody, duration, topk, topp,
             video_paths.append(video_path)
             file_cleaner.add(file.name)
             file_cleaner.add(video_path)
     # Shutdown predictor to prevent hanging processes!
     if not predictor.is_daemon: # Important!
         predictor.shutdown()
     if use_mbd:
          return video_paths[0], wav_paths[0], video_paths[1], wav_paths[1]
     return video_paths[0], wav_paths[0], None, None
 def toggle_audio_src(choice):
     """Build a Gradio update that switches the audio input source.

     ``choice == "mic"`` selects the microphone source with the label
     "Microphone"; any other value selects file upload with the label
     "File". The value is reset to ``None`` in both cases.
     """
     use_mic = choice == "mic"
     source = "microphone" if use_mic else "upload"
     label = "Microphone" if use_mic else "File"
     return gr.update(sources=source, value=None, label=label)
 def toggle_diffusion(choice):
     """Return two identical visibility updates for the diffusion outputs.

     The updates are visible only when ``choice`` is exactly
     "MultiBand_Diffusion"; every other value hides them.
     """
     show = choice == "MultiBand_Diffusion"
     return [gr.update(visible=show)] * 2
-# --- Gradio UI ---
 def ui_full(launch_kwargs):
     with gr.Blocks() as interface:
@@ -475,7 +446,6 @@ def ui_full(launch_kwargs):
         interface.queue().launch(**launch_kwargs)
-# --- Main Entry Point ---
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -502,12 +472,9 @@ if __name__ == '__main__':
     parser.add_argument(
         '--share', action='store_true', help='Share the gradio UI'
     )
     args = parser.parse_args()
     launch_kwargs = {}
     launch_kwargs['server_name'] = args.listen
     if args.username and args.password:
         launch_kwargs['auth'] = (args.username, args.password)
     if args.server_port:
@@ -516,7 +483,6 @@ if __name__ == '__main__':
         launch_kwargs['inbrowser'] = args.inbrowser
     if args.share:
         launch_kwargs['share'] = args.share
     logging.basicConfig(level=logging.INFO, stream=sys.stderr)
     # Added predictor shutdown
     try:

 import time
 import typing as tp
 from tempfile import NamedTemporaryFile, gettempdir
 from einops import rearrange
 import torch
 import gradio as gr
 from audiocraft.data.audio_utils import convert_audio
 from audiocraft.data.audio import audio_write
 from audiocraft.models.encodec import InterleaveStereoCompressionModel
 import multiprocessing as mp
 import warnings
+os.putenv("HF_HUB_ENABLE_HF_TRANSFER","1")
+os.environ["SAFETENSORS_FAST_GPU"] = "1"
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
+torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
+torch.backends.cudnn.allow_tf32 = False
+torch.backends.cudnn.deterministic = False
+torch.backends.cudnn.benchmark = False
+# torch.backends.cuda.preferred_blas_library="cublas"
+# torch.backends.cuda.preferred_linalg_library="cusolver"
+torch.set_float32_matmul_precision("highest")
 class FileCleaner:
     def __init__(self, file_lifetime: float = 3600):
         self.file_lifetime = file_lifetime
         self.files = []
     def add(self, path: tp.Union[str, Path]):
         self._cleanup()
         self.files.append((time.time(), Path(path)))
     def _cleanup(self):
         now = time.time()
         for time_added, path in list(self.files):
                 self.files.pop(0)
             else:
                 break
 file_cleaner = FileCleaner()
 def convert_wav_to_mp4(wav_path, output_path=None):
     """Converts a WAV file to a waveform MP4 video using ffmpeg."""
     if output_path is None:
             "-preset", "fast", # Important, don't do veryslow.
             str(output_path),
         ]
         process = sp.run(command, capture_output=True, text=True, check=True)
         return str(output_path)
     except sp.CalledProcessError as e:
         print(f"Error in ffmpeg conversion: {e}")
         print(f"ffmpeg stdout: {e.stdout}")
         print(f"ffmpeg stderr: {e.stderr}")
         raise  # Re-raise the exception to be caught by Gradio
 def model_worker(model_name: str, task_queue: mp.Queue, result_queue: mp.Queue):
     """
     Persistent worker process (used when NOT running as a daemon).
         device = 'cuda' if torch.cuda.is_available() else 'cpu'
         model = MusicGen.get_pretrained(model_name, device=device)
         mbd = MultiBandDiffusion.get_mbd_musicgen(device=device)
         while True:
             task = task_queue.get()
             if task is None:
                 break
             task_id, text, melody, duration, use_diffusion, gen_params = task
             try:
                 model.set_generation_params(duration=duration, **gen_params)
                 target_sr = model.sample_rate
                         melody_tensor = melody_tensor.unsqueeze(0)
                     melody_tensor = melody_tensor[..., :int(sr * duration)]
                     processed_melody = convert_audio(melody_tensor, sr, target_sr, target_ac)
                 if processed_melody is not None:
                     output, tokens = model.generate_with_chroma(
                         descriptions=[text],
                     )
                 else:
                     output, tokens = model.generate([text], progress=True, return_tokens=True)
                 output = output.detach().cpu()
                 if use_diffusion:
                     if isinstance(model.compression_model, InterleaveStereoCompressionModel):
                         left, right = model.compression_model.get_left_right_codes(tokens)
                     result_queue.put((task_id, (output, outputs_diffusion)))
                 else:
                     result_queue.put((task_id, (output, None)))
             except Exception as e:
                 result_queue.put((task_id, e))
     except Exception as e:
         result_queue.put((-1, e))
 class Predictor:
     def __init__(self, model_name: str):
         self.model_name = model_name
                         melody_tensor = melody_tensor.unsqueeze(0)
                     melody_tensor = melody_tensor[..., :int(sr * duration)]
                     processed_melody = convert_audio(melody_tensor, sr, target_sr, target_ac)
                 if processed_melody is not None:
                     output, tokens = self.model.generate_with_chroma(
                         descriptions=[text],
                     )
                 else:
                     output, tokens = self.model.generate([text], progress=True, return_tokens=True)
                 output = output.detach().cpu()
                 if use_diffusion:
                     if isinstance(self.model.compression_model, InterleaveStereoCompressionModel):
                         left, right = self.model.compression_model.get_left_right_codes(tokens)
                     return task_id, (output, outputs_diffusion) #Return the task id.
                 else:
                   return task_id, (output, None)
             except Exception as e:
                 return task_id, e
         else:
             # Use the multiprocessing queue (multi-process mode)
             self.current_task_id += 1
                 result_task_id, result = self.result_queue.get()
                 if result_task_id == task_id:
                     break  # Found the correct result
         if isinstance(result, Exception):
             raise result
         return result
             self.task_queue.put(None)
             self.process.join()
 _default_model_name = "facebook/musicgen-melody"
 @spaces.GPU(duration=90)  # Use the decorator for Spaces
 def predict_full(model, model_path, use_mbd, text, melody, duration, topk, topp, temperature, cfg_coef):
     # Initialize Predictor *INSIDE* the function
     predictor = Predictor(model)
     task_id, (wav, diffusion_wav) = predictor.predict( # Unpack directly!
         text=text,
         melody=melody,
         temperature=temperature,
         cfg_coef=cfg_coef,
     )
     # Save and return audio files
     wav_paths = []
     video_paths = []
     # Save standard output
     with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
         audio_write(
         video_paths.append(video_path)
         file_cleaner.add(file.name)
         file_cleaner.add(video_path)
     # Save MBD output if used
     if diffusion_wav is not None:
         with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
             video_paths.append(video_path)
             file_cleaner.add(file.name)
             file_cleaner.add(video_path)
     # Shutdown predictor to prevent hanging processes!
     if not predictor.is_daemon: # Important!
         predictor.shutdown()
     if use_mbd:
          return video_paths[0], wav_paths[0], video_paths[1], wav_paths[1]
     return video_paths[0], wav_paths[0], None, None
 def toggle_audio_src(choice):
     """Build a Gradio update that switches the audio input source.

     ``choice == "mic"`` selects the microphone source with the label
     "Microphone"; any other value selects file upload with the label
     "File". The value is reset to ``None`` in both cases.
     """
     use_mic = choice == "mic"
     source = "microphone" if use_mic else "upload"
     label = "Microphone" if use_mic else "File"
     return gr.update(sources=source, value=None, label=label)
 def toggle_diffusion(choice):
     """Return two identical visibility updates for the diffusion outputs.

     The updates are visible only when ``choice`` is exactly
     "MultiBand_Diffusion"; every other value hides them.
     """
     show = choice == "MultiBand_Diffusion"
     return [gr.update(visible=show)] * 2
 def ui_full(launch_kwargs):
     with gr.Blocks() as interface:
         interface.queue().launch(**launch_kwargs)
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument(
     parser.add_argument(
         '--share', action='store_true', help='Share the gradio UI'
     )
     args = parser.parse_args()
     launch_kwargs = {}
     launch_kwargs['server_name'] = args.listen
     if args.username and args.password:
         launch_kwargs['auth'] = (args.username, args.password)
     if args.server_port:
         launch_kwargs['inbrowser'] = args.inbrowser
     if args.share:
         launch_kwargs['share'] = args.share
     logging.basicConfig(level=logging.INFO, stream=sys.stderr)
     # Added predictor shutdown
     try: