# Spaces: Running
import logging
import os
import tempfile
from functools import lru_cache

import gradio as gr
import librosa
import numpy as np
from transformers import pipeline
# Cache the model to avoid reloading on every interaction.
@lru_cache(maxsize=1)
def load_model():
    """Build the Ultravox pipeline on first use and reuse it afterwards.

    The function takes no arguments, so ``lru_cache(maxsize=1)`` keeps exactly
    one pipeline object alive for the lifetime of the process; every later
    call returns that same object instead of constructing a new one.

    Returns:
        The object returned by ``transformers.pipeline`` for the
        ``fixie-ai/ultravox-v0_5-llama-3_2-1b`` checkpoint.
    """
    return pipeline(
        model='fixie-ai/ultravox-v0_5-llama-3_2-1b',
        # The checkpoint ships its own pipeline/model code on the Hub.
        trust_remote_code=True,
        # Original author's note: automatically uses GPU if available.
        device_map="auto",
    )
def process_audio(audio_file, user_message):
    """Run the audio (plus an optional text question) through the model.

    Args:
        audio_file: One of
            * a filesystem path (``str`` or ``os.PathLike``),
            * a file-like object exposing the path as ``.name``,
            * a ``(sampling_rate, samples)`` pair,
            * ``None`` when the user submitted without any audio.
        user_message: Text used as the user turn. When empty or ``None`` a
            default "describe the audio" prompt is sent instead.

    Returns:
        The model's reply as a string, or a message starting with
        ``"Error processing audio:"`` when anything goes wrong. Exceptions are
        never propagated to the caller.
    """
    # Guard clause: give a readable message instead of the unpacking
    # TypeError that ``None`` would otherwise trigger further down.
    if audio_file is None:
        return (
            "Error processing audio: no audio was provided. "
            "Upload a file or record from the microphone first."
        )

    try:
        audio, sr = _load_audio(audio_file)

        # Initialize conversation.
        # NOTE(review): the user turn carries no audio placeholder token; if
        # the Ultravox processor requires one when the last turn is a user
        # turn, the prompt text needs adjusting — confirm against the model
        # card before changing these strings.
        turns = [
            {
                "role": "system",
                "content": "You are a friendly and helpful AI assistant. Respond conversationally to the user's audio input."
            },
            {
                "role": "user",
                "content": user_message if user_message else "Describe what you heard in the audio."
            },
        ]

        # Get model prediction.
        pipe = load_model()
        result = pipe(
            {'audio': audio, 'turns': turns, 'sampling_rate': sr},
            max_new_tokens=100,
        )

        # NOTE(review): Ultravox's custom pipeline appears to return the
        # decoded reply as a plain string; indexing a string with "content"
        # would raise TypeError, so accept that shape as well as the
        # list-of-turns shape the original code assumed.
        if isinstance(result, str):
            return result
        return result[-1]["content"]
    except Exception as e:
        # UI boundary: keep the app responsive, but leave a traceback in the
        # logs so the failure can actually be diagnosed.
        logging.getLogger(__name__).exception("Audio processing failed")
        return f"Error processing audio: {str(e)}"


def _load_audio(audio_file, target_sr=16000):
    """Return ``(samples, sampling_rate)`` for a path, file object or array pair."""
    # Plain paths are checked first: ``pathlib.Path`` also has a ``.name``
    # attribute, but it holds only the final path component.
    if isinstance(audio_file, (str, os.PathLike)):
        return librosa.load(os.fspath(audio_file), sr=target_sr)
    if hasattr(audio_file, "name"):
        return librosa.load(audio_file.name, sr=target_sr)

    # Otherwise treat the input as a (sampling_rate, samples) pair.
    sr, samples = audio_file
    samples = np.asarray(samples)

    if np.issubdtype(samples.dtype, np.signedinteger):
        # Scale signed PCM into [-1.0, 1.0); for int16 the divisor is 32768.
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # assumes a (samples, channels) layout, as Gradio's numpy mode
        # documents — TODO confirm for other callers
        samples = samples.mean(axis=1)

    if sr != target_sr:
        # Match the rate used by the file branch so both paths feed the
        # model the same kind of signal.
        samples = librosa.resample(samples, orig_sr=sr, target_sr=target_sr)
        sr = target_sr

    return samples, sr
# Gradio UI: audio plus an optional question go in, the model's reply comes out.
# NOTE(review): the original indentation was not preserved; the nesting below
# (both inputs inside the Row, everything else directly under Blocks) is a
# reconstruction — confirm against the intended layout.
with gr.Blocks(title="UltraVox Audio Assistant") as demo:
    gr.Markdown("## 🎤 UltraVox Audio Assistant")
    gr.Markdown("Upload an audio file or speak via microphone, then ask questions about it.")

    # Inputs sit side by side.
    with gr.Row():
        audio_input = gr.Audio(
            label="Input Audio",
            type="filepath",  # the handler receives a path on disk
            sources=["upload", "microphone"],
        )
        text_input = gr.Textbox(
            placeholder="Ask me about the audio...",
            label="Your Question (Optional)",
        )

    submit_btn = gr.Button("Process")
    output = gr.Textbox(interactive=False, label="AI Response")

    # Wire the button to the handler defined in this file.
    submit_btn.click(
        process_audio,
        inputs=[audio_input, text_input],
        outputs=output,
    )

    # Clickable sample rows that pre-fill both inputs.
    gr.Examples(
        inputs=[audio_input, text_input],
        examples=[
            ["examples/weather_report.wav", "What's the weather forecast?"],
            ["examples/meeting_notes.mp3", "Summarize the key points"],
        ],
    )

if __name__ == "__main__":
    demo.launch()