Spaces:

Bils
/

AIPromoStudio

Sleeping

App Files Files Community

AIPromoStudio / app.py

Bils

Update app.py

b950350 verified 7 months ago

raw

history blame

6.9 kB

	import gradio as gr
	import os
	import torch
	from transformers import (
	AutoTokenizer,
	AutoModelForCausalLM,
	pipeline,
	AutoProcessor,
	MusicgenForConditionalGeneration,
	)
	from scipy.io.wavfile import write
	from pydub import AudioSegment
	from pydub.playback import play
	import tempfile
	from dotenv import load_dotenv
	import spaces

	# Merge any variables from a local .env file into the process environment,
	# then look up the Hugging Face token (None when HF_TOKEN is not set).
	load_dotenv()
	hf_token = os.environ.get("HF_TOKEN")

	# ---------------------------------------------------------------------
	# Script Generation Function
	# ---------------------------------------------------------------------
	# Marker the model is asked to emit between the script and the music idea.
	_MUSIC_MARKER = "Music Suggestion:"
	# Used when the model output contains no (or an empty) music suggestion.
	_DEFAULT_MUSIC_SUGGESTION = "Upbeat instrumental background music for a radio promo"


	def _split_script_and_music(generated_text: str):
	    """Split raw model output into ``(script, music_suggestion)``.

	    The text before the first ``Music Suggestion:`` marker is the script and
	    the text after it is the music suggestion. When the marker is missing, or
	    nothing follows it, a generic default suggestion is returned instead of
	    failing, so one badly formatted generation does not abort the pipeline.
	    """
	    script, marker, music_suggestion = generated_text.partition(_MUSIC_MARKER)
	    script = script.strip()
	    music_suggestion = music_suggestion.strip()
	    if not marker or not music_suggestion:
	        music_suggestion = _DEFAULT_MUSIC_SUGGESTION
	    return script, music_suggestion


	@spaces.GPU(duration=300)
	def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
	    """Generate a promo script and a music suggestion with a causal LM.

	    Args:
	        user_prompt: The user's promo concept, inserted into the prompt.
	        model_id: Model identifier passed to ``from_pretrained``.
	        token: Access token forwarded to ``from_pretrained``.
	        duration: Target promo length in seconds, inserted into the prompt.

	    Returns:
	        ``(script, music_suggestion)`` on success. On any failure the tuple
	        ``("Error generating script: <reason>", None)`` is returned instead of
	        raising, because the caller detects failures by that message prefix.
	    """
	    try:
	        # NOTE(review): recent transformers releases prefer ``token=`` over
	        # ``use_auth_token=``; kept as-is to match the installed version - confirm.
	        tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
	        model = AutoModelForCausalLM.from_pretrained(
	            model_id,
	            use_auth_token=token,
	            torch_dtype=torch.float16,
	            device_map="auto",
	            trust_remote_code=True,
	        )
	        llama_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

	        # The last sentence tells the model which marker to emit; without it
	        # the output rarely contains the text the parser looks for.
	        system_prompt = (
	            "You are an expert radio imaging producer specializing in sound design and music. "
	            f"Based on the user's concept and the selected duration of {duration} seconds, craft a concise, engaging promo script. "
	            "Ensure the script fits within the time limit and suggest a matching music style that complements the theme. "
	            f"Write the script first, then end with a line that starts with '{_MUSIC_MARKER}' followed by the music style."
	        )

	        combined_prompt = f"{system_prompt}\nUser concept: {user_prompt}\nRefined script and music suggestion:"
	        result = llama_pipeline(combined_prompt, max_new_tokens=200, do_sample=True, temperature=0.9)

	        # The pipeline output includes the prompt; keep only what follows it.
	        generated_text = result[0]["generated_text"].split("Refined script and music suggestion:")[-1].strip()
	        script, music_suggestion = _split_script_and_music(generated_text)
	        if not script:
	            raise ValueError("the model returned an empty script")
	        return script, music_suggestion
	    except Exception as e:
	        # UI boundary: report the failure as text rather than crashing the app.
	        return f"Error generating script: {e}", None

	# ---------------------------------------------------------------------
	# Voice-Over Generation Function
	# ---------------------------------------------------------------------
	@spaces.GPU(duration=300)
	def generate_voice(script: str, speaker: str):
	    """Synthesize a voice-over for ``script`` and save it as a WAV file.

	    The previous implementation loaded ``coqui/XTTS-v2`` through
	    ``AutoProcessor`` / ``AutoModelForCausalLM``; that checkpoint is not a
	    Transformers causal-LM, so the call could only fail. This version uses the
	    Transformers ``text-to-speech`` pipeline with a model it supports.

	    Args:
	        script: Text to speak.
	        speaker: Requested voice style. Currently unused; kept so the call
	            signature stays compatible with existing callers.

	    Returns:
	        Path to the generated WAV file, or the string
	        ``"Error generating voice-over: <reason>"`` on failure (the caller
	        detects failures by that prefix).
	    """
	    try:
	        # Replace with your chosen TTS model (must be supported by the
	        # Transformers text-to-speech pipeline).
	        tts_model = "suno/bark-small"
	        device = "cuda" if torch.cuda.is_available() else "cpu"
	        synthesiser = pipeline("text-to-speech", model=tts_model, device=device)

	        speech = synthesiser(script)

	        # Drop any singleton batch/channel axis so a mono 1-D signal is written,
	        # and convert to 16-bit PCM like the music track.
	        waveform = speech["audio"].squeeze()
	        pcm_audio = (waveform.clip(-1.0, 1.0) * 32767).astype("int16")

	        # Unique file per call so concurrent requests do not overwrite each other.
	        fd, output_path = tempfile.mkstemp(prefix="generated_voice_", suffix=".wav")
	        os.close(fd)
	        # Use the rate reported by the pipeline instead of a hard-coded value.
	        write(output_path, speech["sampling_rate"], pcm_audio)
	        return output_path
	    except Exception as e:
	        # UI boundary: report the failure as text rather than crashing the app.
	        return f"Error generating voice-over: {e}"

	# ---------------------------------------------------------------------
	# Music Generation Function
	# ---------------------------------------------------------------------
	@spaces.GPU(duration=300)
	def generate_music(prompt: str, audio_length: int):
	    """Generate background music from a text prompt with MusicGen.

	    Args:
	        prompt: Text description of the desired music.
	        audio_length: Number of new tokens to generate; passed to
	            ``generate`` as ``max_new_tokens``.

	    Returns:
	        Path to the generated 16-bit WAV file, or the string
	        ``"Error generating music: <reason>"`` on failure (the caller detects
	        failures by that prefix).
	    """
	    try:
	        musicgen_model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
	        musicgen_processor = AutoProcessor.from_pretrained("facebook/musicgen-small")

	        device = "cuda" if torch.cuda.is_available() else "cpu"
	        musicgen_model.to(device)

	        inputs = musicgen_processor(text=[prompt], padding=True, return_tensors="pt").to(device)
	        # A UI slider may deliver a float; the token budget must be an integer.
	        outputs = musicgen_model.generate(**inputs, max_new_tokens=int(audio_length))

	        # First item of the batch, first channel.
	        audio_data = outputs[0, 0].cpu().numpy()

	        # Peak-normalize to the int16 range. The peak is computed with the
	        # array's own reduction, and an all-zero signal is left as silence
	        # instead of being divided by zero.
	        peak = float(abs(audio_data).max())
	        if peak > 0:
	            normalized_audio = (audio_data / peak * 32767).astype("int16")
	        else:
	            normalized_audio = audio_data.astype("int16")

	        # Write at the model's native rate; a mismatched rate changes the
	        # pitch and speed on playback.
	        sampling_rate = musicgen_model.config.audio_encoder.sampling_rate

	        # Unique file per call so concurrent requests do not overwrite each other.
	        fd, output_path = tempfile.mkstemp(prefix="generated_music_", suffix=".wav")
	        os.close(fd)
	        write(output_path, sampling_rate, normalized_audio)

	        return output_path
	    except Exception as e:
	        # UI boundary: report the failure as text rather than crashing the app.
	        return f"Error generating music: {e}"

	# ---------------------------------------------------------------------
	# Audio Blending Function with Ducking
	# ---------------------------------------------------------------------
	def blend_audio(voice_path: str, music_path: str, ducking: bool):
	    """Mix the voice-over on top of the music bed and export a WAV file.

	    Args:
	        voice_path: Path to the voice-over audio file.
	        music_path: Path to the music audio file.
	        ducking: When true, the music is lowered by 10 dB before mixing.

	    Returns:
	        Path to the mixed WAV file, or the string
	        ``"Error blending audio: <reason>"`` on failure.
	    """
	    try:
	        voice = AudioSegment.from_file(voice_path)
	        music = AudioSegment.from_file(music_path)

	        if ducking:
	            music = music - 10  # Lower music volume for ducking

	        # ``overlay`` keeps the length of the segment it is called on, so a
	        # voice-over longer than the music would be cut off. Pad the music
	        # with trailing silence to cover the whole voice-over.
	        shortfall_ms = len(voice) - len(music)
	        if shortfall_ms > 0:
	            music = music + AudioSegment.silent(duration=shortfall_ms, frame_rate=music.frame_rate)

	        combined = music.overlay(voice)

	        # Unique file per call so concurrent requests do not overwrite each other.
	        fd, output_path = tempfile.mkstemp(prefix="final_promo_", suffix=".wav")
	        os.close(fd)
	        combined.export(output_path, format="wav")

	        return output_path
	    except Exception as e:
	        # UI boundary: report the failure as text rather than crashing the app.
	        return f"Error blending audio: {e}"

	# ---------------------------------------------------------------------
	# Gradio Interface
	# ---------------------------------------------------------------------
	def process_all(user_prompt, llama_model_id, duration, audio_length, speaker, ducking):
	    """Run the full promo pipeline: script, voice-over, music, then mixing.

	    Each stage reports failure by returning a message with a stage-specific
	    ``"Error ..."`` prefix. Failures are detected by that exact prefix rather
	    than by searching for the word "Error" anywhere, so a legitimate script
	    that merely contains the word is not mistaken for a failure.

	    Args:
	        user_prompt: The user's promo concept.
	        llama_model_id: Model identifier used for script generation.
	        duration: Target promo length in seconds.
	        audio_length: Token budget for music generation.
	        speaker: Requested voice style, forwarded to voice generation.
	        ducking: Whether to lower the music under the voice when mixing.

	    Returns:
	        ``(text, audio_path)``. On success ``text`` holds the script and music
	        suggestion and ``audio_path`` the mixed file; on failure ``text`` is
	        the error message and ``audio_path`` is ``None``.
	    """
	    script, music_suggestion = generate_script(user_prompt, llama_model_id, hf_token, duration)
	    if script.startswith("Error generating script:"):
	        return script, None

	    voice_path = generate_voice(script, speaker)
	    if voice_path.startswith("Error generating voice-over:"):
	        return voice_path, None

	    music_path = generate_music(music_suggestion, audio_length)
	    if music_path.startswith("Error generating music:"):
	        return music_path, None

	    final_audio = blend_audio(voice_path, music_path, ducking)
	    # A failed mix yields an error message, which must not be handed to the
	    # audio output as if it were a file path.
	    if final_audio.startswith("Error blending audio:"):
	        return final_audio, None

	    return f"Script:\n{script}\n\nMusic Suggestion:\n{music_suggestion}", final_audio

	# Build the Gradio UI. Components are created top to bottom; keep the
	# creation order unchanged when editing.
	with gr.Blocks() as demo:
	    # Page title and tagline.
	    gr.Markdown("""
	# 🎧 AI Promo Studio with Script, Voice, Music, and Mixing 🚀
	Generate fully mixed promos effortlessly with AI-driven tools for radio and media!
	""")

	    # User inputs.
	    # NOTE(review): nesting was reconstructed from a flattened listing; the six
	    # inputs below are assumed to belong to this Row - confirm.
	    with gr.Row():
	        user_prompt = gr.Textbox(label="Promo Idea", placeholder="E.g., A 30-second promo for a morning show.")
	        llama_model_id = gr.Textbox(label="Llama Model ID", value="meta-llama/Meta-Llama-3-8B-Instruct")
	        duration = gr.Slider(label="Duration (seconds)", minimum=15, maximum=60, step=15, value=30)
	        audio_length = gr.Slider(label="Music Length (tokens)", minimum=128, maximum=1024, step=64, value=512)
	        speaker = gr.Textbox(label="Voice Style (optional)", placeholder="E.g., male, female, or neutral.")
	        ducking = gr.Checkbox(label="Enable Ducking", value=True)

	    # Trigger button and the two outputs filled by process_all.
	    generate_button = gr.Button("Generate Full Promo")
	    script_output = gr.Textbox(label="Generated Script and Music Suggestion")
	    audio_output = gr.Audio(label="Final Promo Audio", type="filepath")

	    # The inputs list order must match process_all's parameter order, and the
	    # outputs list order must match its (text, audio path) return value.
	    generate_button.click(
	        fn=process_all,
	        inputs=[user_prompt, llama_model_id, duration, audio_length, speaker, ducking],
	        outputs=[script_output, audio_output],
	    )

	    # Footer with attribution link.
	    gr.Markdown("""
	<hr>
	<p style="text-align: center; font-size: 0.9em;">
	Created with ❤️ by <a href="https://bilsimaging.com" target="_blank">bilsimaging.com</a>
	</p>
	""")

	# Start the app when the module is executed.
	demo.launch(debug=True)