# Hugging Face Spaces app: MusicGen text-to-audio demo.
# (The "Spaces: / Running" lines were UI status text captured in a copy-paste,
# not part of the program; replaced with this comment so the file parses.)
import gradio as gr
import soundfile as sf
import torch
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    MusicgenForConditionalGeneration,
)
model_name = "facebook/musicgen-small"

# MusicGen is an encoder-decoder text-to-audio model: AutoModelForCausalLM cannot
# load it, and patching unrelated attributes onto its config (dropout, layerdrop,
# num_hidden_layers, ...) does nothing useful. Load the dedicated class and its
# processor (which wraps the text tokenizer) instead.
model = MusicgenForConditionalGeneration.from_pretrained(model_name)
model.eval()  # inference only — disable dropout
processor = AutoProcessor.from_pretrained(model_name)

# True output sampling rate of the model's audio codec (32 kHz for
# musicgen-small). Hard-coding 22050 would write the WAV at the wrong rate,
# shifting pitch and duration on playback.
sampling_rate = model.config.audio_encoder.sampling_rate


def text_to_audio(prompt):
    """Generate a short music clip from *prompt*.

    Parameters
    ----------
    prompt : str
        Free-text description of the desired audio.

    Returns
    -------
    str
        Path of the WAV file written to the working directory.

    Raises
    ------
    gr.Error
        If the prompt is empty or whitespace-only.
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a text prompt.")
    inputs = processor(text=[prompt], padding=True, return_tensors="pt")
    with torch.no_grad():
        # generate() on MusicGen returns decoded audio waveforms, not token IDs.
        # max_new_tokens=256 bounds the clip to roughly five seconds so the demo
        # responds in reasonable time.
        audio_values = model.generate(**inputs, max_new_tokens=256)
    # audio_values has shape (batch, channels, samples); take the mono waveform
    # of the first (only) batch item as a 1-D array for soundfile.
    audio_data = audio_values[0, 0].cpu().numpy()
    audio_file = "generated_audio.wav"
    sf.write(audio_file, audio_data, sampling_rate)
    return audio_file


gr.Interface(fn=text_to_audio, inputs="text", outputs="audio").launch()