Spaces:

Bils
/

AIPromoStudio

Running

File size: 5,352 Bytes

17d10a7
a15d204
d448add
db46bfb
 
 
 
 
 
 
cf3593c
 
 
dfa5d3e
c243adb
dfa5d3e
cf3593c
 
 
dfa5d3e
 
 
cf3593c
f0b5707
613bd9e
 
 
 
 
f0b5707
dfa5d3e
7bbdf94
613bd9e
f0b5707
613bd9e
cf3593c
d0384c8
dfa5d3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf3593c
17d10a7
 
dfa5d3e
17d10a7
 
dfa5d3e
cf3593c
17d10a7
 
 
d448add
cf3593c
 
 
 
 
d448add
dfa5d3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17d10a7
f0b5707
cf3593c
 
 
 
d448add
17d10a7
 
cf3593c
d448add
cf3593c
dfa5d3e
cf3593c
 
 
3fe530b
7bbdf94

import gradio as gr
import os
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    pipeline,
    AutoProcessor, 
    MusicgenForConditionalGeneration
)
from scipy.io.wavfile import write
import tempfile
from dotenv import load_dotenv
import spaces  # Assumes Hugging Face Spaces library supports `@spaces.GPU`

# Pull variables from a local .env file (if one exists) into the process
# environment, then read the Hugging Face token; the value is None when the
# variable is not set.
load_dotenv()
hf_token = os.environ.get("HF_TOKEN")

# ---------------------------------------------------------------------
# Load Llama 3 Model with Zero GPU
# ---------------------------------------------------------------------
@spaces.GPU(duration=120)
def load_llama_pipeline_zero_gpu(model_id: str, token: str):
    """Build a text-generation pipeline for ``model_id``.

    Args:
        model_id: Hub repository ID of the causal language model to load.
        token: Hugging Face access token. When empty or ``None``, the
            ``HF_TOKEN`` environment variable is used instead.

    Returns:
        The ``transformers`` text-generation pipeline on success, or the
        exception message as a ``str`` on failure. Callers tell the two
        apart with ``isinstance(result, str)``.
    """
    # An empty UI textbox submits "", which is not a usable credential; fall
    # back to the environment and finally to None (anonymous access).
    auth_token = token or os.getenv("HF_TOKEN") or None
    try:
        # `token` replaces the deprecated `use_auth_token` keyword.
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=auth_token)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            token=auth_token,
            torch_dtype=torch.float16,
            device_map="auto",  # Automatically handles GPU allocation
            # SECURITY NOTE(review): model_id is user-supplied, and
            # trust_remote_code=True lets the chosen repository run its own
            # Python code in this process. Kept for compatibility; consider
            # disabling it or restricting model_id to an allow-list.
            trust_remote_code=True,
        )
        return pipeline("text-generation", model=model, tokenizer=tokenizer)
    except Exception as e:
        # Errors are reported as plain text so the UI can display them.
        return str(e)

# ---------------------------------------------------------------------
# Generate Radio Script
# ---------------------------------------------------------------------
def generate_script(user_input: str, pipeline_llama):
    """Turn a user's promo concept into a short script using the LLM pipeline.

    Args:
        user_input: Free-text description of the promo idea.
        pipeline_llama: Callable invoked as
            ``pipeline_llama(prompt, max_new_tokens=..., do_sample=..., temperature=...)``
            that returns a list whose first item has a ``"generated_text"`` key.

    Returns:
        The generated script with surrounding whitespace removed, or a string
        starting with ``"Error generating script:"`` if anything fails.
    """
    marker = "Refined script:"
    try:
        system_prompt = (
            "You are a top-tier radio imaging producer using Llama 3. "
            "Take the user's concept and craft a short, creative promo script."
        )
        combined_prompt = f"{system_prompt}\nUser concept: {user_input}\n{marker}"
        result = pipeline_llama(combined_prompt, max_new_tokens=200, do_sample=True, temperature=0.9)
        generated = result[0]['generated_text']

        # When the prompt is echoed back, drop exactly that prefix so a model
        # that repeats the marker inside its answer does not lose the text
        # preceding the repetition.
        if generated.startswith(combined_prompt):
            return generated[len(combined_prompt):].strip()

        # Otherwise keep the previous behaviour: take what follows the last marker.
        return generated.split(marker)[-1].strip()
    except Exception as e:
        return f"Error generating script: {e}"

# ---------------------------------------------------------------------
# Load MusicGen Model
# ---------------------------------------------------------------------
@spaces.GPU(duration=120)
def load_musicgen_model():
    """Load the ``facebook/musicgen-small`` model and its processor.

    Returns:
        ``(model, processor)`` on success. On failure, ``(None, message)``
        where ``message`` is the exception text.
    """
    checkpoint = "facebook/musicgen-small"
    try:
        musicgen = MusicgenForConditionalGeneration.from_pretrained(checkpoint)
        text_processor = AutoProcessor.from_pretrained(checkpoint)
    except Exception as err:
        # Report the failure through the second tuple slot as plain text.
        return None, str(err)
    return musicgen, text_processor

# ---------------------------------------------------------------------
# Generate Audio
# ---------------------------------------------------------------------
@spaces.GPU(duration=120)
def generate_audio(prompt: str, audio_length: int, mg_model, mg_processor):
    """Generate audio for ``prompt`` with MusicGen and save it as a WAV file.

    Args:
        prompt: Text description passed to the MusicGen processor.
        audio_length: Number of new tokens to generate (coerced to ``int``).
        mg_model: Loaded MusicGen model.
        mg_processor: Processor matching ``mg_model``.

    Returns:
        Path of the temporary ``.wav`` file on success, or a string starting
        with ``"Error generating audio:"`` on failure.
    """
    try:
        try:
            mg_model.to("cuda")  # Move the model to GPU
            inputs = mg_processor(text=[prompt], padding=True, return_tensors="pt")
            # The input tensors must live on the same device as the model,
            # otherwise generate() fails with a device-mismatch error.
            inputs = {name: tensor.to("cuda") for name, tensor in inputs.items()}
            # UI sliders may deliver floats; max_new_tokens must be an int.
            outputs = mg_model.generate(**inputs, max_new_tokens=int(audio_length))
        finally:
            # Always hand the model back to CPU, even when generation fails.
            mg_model.to("cpu")

        sr = mg_model.config.audio_encoder.sampling_rate
        audio_data = outputs[0, 0].cpu().numpy()

        # Peak-normalise to the int16 range. The array-level max avoids a slow
        # element-by-element Python loop, and the zero check avoids dividing
        # by zero when the output is silent or empty.
        peak = abs(audio_data).max() if audio_data.size else 0
        if peak > 0:
            audio_data = audio_data / peak * 32767
        normalized_audio = audio_data.astype("int16")

        # Reserve a file name, close the handle, then write: writing to a
        # path whose handle is still open fails on some platforms.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            wav_path = temp_wav.name
        write(wav_path, sr, normalized_audio)
        return wav_path
    except Exception as e:
        return f"Error generating audio: {e}"

# ---------------------------------------------------------------------
# Gradio Interface
# ---------------------------------------------------------------------
def radio_imaging_app(user_prompt, llama_model_id, hf_token, audio_length):
    """Run the full pipeline: load the LLM, write a script, generate audio.

    Args:
        user_prompt: The user's promo concept.
        llama_model_id: Hub ID of the language model to load.
        hf_token: Hugging Face access token forwarded to the model loader.
        audio_length: Number of tokens of audio to generate.

    Returns:
        ``(text, audio_path)``. ``text`` is the generated script, or an error
        message (appended to the script when one was already produced).
        ``audio_path`` is the WAV file path, or ``None`` whenever any stage
        failed, so that an error message is never handed to the audio output
        as if it were a file path.
    """
    # Load Llama 3 Pipeline with Zero GPU (a str result is an error message)
    pipeline_llama = load_llama_pipeline_zero_gpu(llama_model_id, hf_token)
    if isinstance(pipeline_llama, str):
        return pipeline_llama, None

    # Generate Script; do not feed an error message to MusicGen as a prompt
    script = generate_script(user_prompt, pipeline_llama)
    if script.startswith("Error generating script:"):
        return script, None

    # Load MusicGen (failure is signalled as (None, message))
    mg_model, mg_processor = load_musicgen_model()
    if mg_model is None or isinstance(mg_processor, str):
        return f"{script}\n\nError loading MusicGen: {mg_processor}", None

    # Generate Audio; both success and failure come back as str, so the
    # error prefix is what distinguishes them.
    audio_path = generate_audio(script, audio_length, mg_model, mg_processor)
    if audio_path.startswith("Error generating audio:"):
        return f"{script}\n\n{audio_path}", None

    return script, audio_path

# ---------------------------------------------------------------------
# Interface
# ---------------------------------------------------------------------
# Build the UI. Component variables carry an `_input` suffix so they do not
# rebind module-level names (the token textbox previously replaced the
# `hf_token` value read from the environment).
with gr.Blocks() as demo:
    gr.Markdown("# 🎧 AI Radio Imaging with Llama 3 + MusicGen (Zero GPU)")
    user_prompt_input = gr.Textbox(
        label="Enter your promo idea",
        placeholder="E.g., A 15-second hype jingle for a morning talk show.",
    )
    llama_model_id_input = gr.Textbox(label="Llama 3 Model ID", value="meta-llama/Meta-Llama-3-70B")
    hf_token_input = gr.Textbox(label="Hugging Face Token", type="password")
    audio_length_input = gr.Slider(label="Audio Length (tokens)", minimum=128, maximum=1024, step=64, value=512)

    generate_button = gr.Button("Generate Promo Script and Audio")
    script_output = gr.Textbox(label="Generated Script")
    audio_output = gr.Audio(label="Generated Audio", type="filepath")

    # Inputs are passed positionally to radio_imaging_app in this order.
    generate_button.click(
        fn=radio_imaging_app,
        inputs=[user_prompt_input, llama_model_id_input, hf_token_input, audio_length_input],
        outputs=[script_output, audio_output],
    )

# Launch only when executed as a script so importing this module (e.g. for
# tests or tooling) does not start a server.
if __name__ == "__main__":
    demo.launch(debug=True)