"""Gradio text-to-music demo backed by facebook/musicgen-small."""

import gradio as gr
import soundfile as sf
import torch
from transformers import AutoProcessor, MusicgenForConditionalGeneration

MODEL_NAME = "facebook/musicgen-small"

# BUG FIX: the original loaded MusicGen through AutoModelForCausalLM +
# AutoTokenizer and hand-patched missing config attributes (including a
# duplicated `num_hidden_layers` check) to force the load to succeed.
# MusicGen is a text-to-audio model: the supported path is AutoProcessor +
# MusicgenForConditionalGeneration, whose generate() returns a waveform
# tensor rather than token ids.
processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = MusicgenForConditionalGeneration.from_pretrained(MODEL_NAME)
model.eval()  # inference only — disable dropout

# BUG FIX: the original wrote raw generate() token ids to disk at a
# hard-coded 22050 Hz.  The true output rate lives in the audio encoder's
# config (32 kHz for musicgen-small).
SAMPLING_RATE = model.config.audio_encoder.sampling_rate


def text_to_audio(prompt: str) -> str:
    """Generate a short music clip from *prompt*.

    Args:
        prompt: Free-text description of the music to generate.

    Returns:
        Path to the generated mono .wav file.
    """
    inputs = processor(text=[prompt], padding=True, return_tensors="pt")
    with torch.no_grad():
        # ~256 new audio tokens corresponds to roughly 5 seconds of audio.
        audio_values = model.generate(**inputs, max_new_tokens=256)
    # generate() returns shape (batch, channels, samples); soundfile
    # expects a 1-D sample array for mono output.
    audio_data = audio_values[0, 0].cpu().numpy()
    audio_file = "generated_audio.wav"
    sf.write(audio_file, audio_data, SAMPLING_RATE)
    return audio_file


if __name__ == "__main__":
    # Guarded entry point so importing this module does not start a server.
    gr.Interface(fn=text_to_audio, inputs="text", outputs="audio").launch()