# Enrique Cardoza
# feat(app): implement audio/video transcription with Groq API
# 6ec981e
# raw / history blame / 4.96 kB
import gradio as gr
import os
from groq import Groq
import tempfile
def validate_file(file):
    """Validate an uploaded file's size and extension.

    Parameters:
        file: The upload to check. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object that exposes the path as ``.name``.
            ``None`` means nothing was uploaded.

    Returns:
        A ``(is_valid, message)`` tuple. ``is_valid`` is ``True`` only when
        the file is readable, is at most 25MB, and has a supported extension;
        otherwise ``message`` explains the first check that failed.
    """
    if file is None:
        return False, "No file uploaded"

    # Plain paths must be checked before falling back to `.name`:
    # `pathlib.Path.name` is only the basename, not the full path.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    # Check file size (25MB limit). A missing or unreadable file is reported
    # as a validation failure instead of escaping as an OSError.
    max_size_mb = 25
    try:
        file_size_mb = os.path.getsize(path) / (1024 * 1024)
    except OSError as exc:
        return False, f"Could not read uploaded file: {exc}"
    if file_size_mb > max_size_mb:
        return False, f"File size ({file_size_mb:.1f}MB) exceeds 25MB limit"

    # Check file extension (case-insensitive). The tuple order is preserved
    # because it is echoed verbatim in the error message.
    valid_extensions = (
        '.mp3', '.mp4', '.mpeg', '.mpga', '.m4a',
        '.wav', '.webm', '.flac', '.ogg', '.aac',
    )
    file_extension = os.path.splitext(path)[1].lower()
    if file_extension not in valid_extensions:
        return False, f"Invalid file type. Supported formats: {', '.join(valid_extensions)}"

    return True, "File is valid"
def transcribe_audio(audio_file, api_key):
    """Transcribe audio/video files into text using Groq's Whisper model.

    This tool converts spoken content from audio and video files into written text.
    It supports multiple audio formats and handles files up to 25MB in size.

    Parameters:
        audio_file: An audio or video file to transcribe.
            Supported formats: MP3, MP4, MPEG, MPGA, M4A, WAV, WebM, FLAC, OGG, AAC.
            Maximum size: 25MB.
        api_key: Your Groq API key, required for authentication.
            You can obtain this from https://console.groq.com/

    Returns:
        A text transcript of the spoken content in the audio file.
        On any failure, a message starting with "Error: " is returned
        instead of raising.

    Example:
        Upload a podcast episode to get a complete text transcript.
    """
    try:
        # Validate inputs. Pasted keys often carry stray whitespace, and a
        # whitespace-only key must count as missing.
        if isinstance(api_key, str):
            api_key = api_key.strip()
        if not api_key:
            return "Error: Please provide your Groq API key"
        if audio_file is None:
            return "Error: Please upload an audio or video file"

        # Validate file
        is_valid, message = validate_file(audio_file)
        if not is_valid:
            return f"Error: {message}"

        # The upload may be a plain path or an object carrying it in `.name`.
        if isinstance(audio_file, (str, os.PathLike)):
            path = os.fspath(audio_file)
        else:
            path = audio_file.name

        # Initialize Groq client
        client = Groq(api_key=api_key)

        # Read the audio file fully, then release the handle before the
        # network call; the API receives a (filename, bytes) pair.
        with open(path, "rb") as file:
            payload = file.read()

        # Create transcription
        transcription = client.audio.transcriptions.create(
            file=(os.path.basename(path), payload),
            model="whisper-large-v3-turbo",
        )
        return transcription.text
    except Exception as e:
        # UI boundary: surface every failure as text in the output box.
        return f"Error: {e}"
# Create the Gradio interface with custom layout: a two-column page with the
# upload controls on the left and the transcript on the right.
# Emoji in the headings below were restored from mis-decoded UTF-8 bytes.
with gr.Blocks(title="Audio/Video Transcription with Groq", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎡 Audio/Video Transcription with Groq Whisper")
    gr.Markdown("Upload an audio or video file and get an AI-generated transcript using Groq's Whisper model.")

    with gr.Row():
        # Left column - Input controls
        with gr.Column(scale=1):
            gr.Markdown("### 📤 Upload & Settings")
            audio_input = gr.File(
                label="Upload Audio/Video File",
                file_types=[".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".flac", ".ogg", ".aac"],
                file_count="single"
            )
            api_key_input = gr.Textbox(
                label="Groq API Key",
                placeholder="Enter your Groq API key here...",
                type="password",
                lines=1
            )
            transcribe_btn = gr.Button(
                "🎯 Transcribe Audio",
                variant="primary",
                size="lg"
            )
            gr.Markdown("### ℹ️ File Requirements")
            gr.Markdown("""
            - **Max file size**: 25MB
            - **Supported formats**: MP3, MP4, MPEG, MPGA, M4A, WAV, WebM, FLAC, OGG, AAC
            - **Get API key**: [Groq Console](https://console.groq.com/)
            """)

        # Right column - Output
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Transcript")
            transcript_output = gr.Textbox(
                label="Generated Transcript",
                placeholder="Your transcript will appear here...",
                lines=20,
                max_lines=30,
                show_copy_button=True,
                interactive=False
            )

    # Connect the button to the transcription function
    transcribe_btn.click(
        fn=transcribe_audio,
        inputs=[audio_input, api_key_input],
        outputs=transcript_output,
        show_progress=True
    )

    # Add useful links section
    gr.Markdown("### 🔗 Useful Links")
    gr.Markdown("""
    - [Get your Groq API key](https://console.groq.com/)
    - [Groq Documentation](https://console.groq.com/docs)
    - [Supported audio formats](https://platform.openai.com/docs/guides/speech-to-text)
    """)

# Launch only when run as a script; mcp_server=True is passed through to
# Gradio's launch() unchanged.
if __name__ == "__main__":
    demo.launch(mcp_server=True)