# File size: 4,963 Bytes
# Revision: 6ec981e
# (Line-number gutter 1-139 from the web file viewer omitted.)
import gradio as gr
import os
from groq import Groq
import tempfile
def validate_file(file):
    """Validate an uploaded file's size and extension.

    Parameters:
        file: The upload to check. Either an object exposing its on-disk
            path as ``.name`` or a plain path (``str`` / ``os.PathLike``).
            ``None`` means nothing was uploaded.

    Returns:
        A ``(is_valid, message)`` tuple. ``message`` is "File is valid" on
        success and a human-readable reason otherwise.

    Raises:
        OSError: If the path cannot be inspected by ``os.path.getsize``.
    """
    if file is None:
        return False, "No file uploaded"

    # Accept a bare path as well as an object carrying the path in ``.name``;
    # the isinstance check comes first because pathlib paths also have a
    # ``.name`` attribute, but it holds only the final component.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    # Check file size (25MB limit)
    file_size_mb = os.path.getsize(path) / (1024 * 1024)
    if file_size_mb > 25:
        return False, f"File size ({file_size_mb:.1f}MB) exceeds 25MB limit"

    # Check file extension (compared case-insensitively)
    valid_extensions = ['.mp3', '.mp4', '.mpeg', '.mpga', '.m4a', '.wav', '.webm', '.flac', '.ogg', '.aac']
    file_extension = os.path.splitext(path)[1].lower()
    if file_extension not in valid_extensions:
        return False, f"Invalid file type. Supported formats: {', '.join(valid_extensions)}"

    return True, "File is valid"
def transcribe_audio(audio_file, api_key):
    """Transcribe audio/video files into text using Groq's Whisper model.

    This tool converts spoken content from audio and video files into written text.
    It supports multiple audio formats and handles files up to 25MB in size.

    Parameters:
        audio_file: An audio or video file to transcribe.
            Supported formats: MP3, MP4, MPEG, MPGA, M4A, WAV, WebM, FLAC, OGG, AAC.
            Maximum size: 25MB.
        api_key: Your Groq API key, required for authentication.
            You can obtain this from https://console.groq.com/

    Returns:
        A text transcript of the spoken content in the audio file.

    Example:
        Upload a podcast episode to get a complete text transcript.
    """
    # NOTE: the app is launched with mcp_server=True; the docstring above is
    # kept in its "Parameters:/Returns:" layout because Gradio is expected to
    # read it for the tool description -- confirm before reformatting it.
    try:
        # Validate inputs. A key consisting only of whitespace is as unusable
        # as a missing one, so strip before testing it.
        api_key = (api_key or "").strip()
        if not api_key:
            return "Error: Please provide your Groq API key"
        if audio_file is None:
            return "Error: Please upload an audio or video file"

        # Validate file
        is_valid, message = validate_file(audio_file)
        if not is_valid:
            return f"Error: {message}"

        # Resolve the on-disk path once: the upload may be a bare path or an
        # object that exposes the path as ``.name``.
        if isinstance(audio_file, (str, os.PathLike)):
            path = os.fspath(audio_file)
        else:
            path = audio_file.name

        # Initialize Groq client
        client = Groq(api_key=api_key)

        # Read the audio file and create the transcription
        with open(path, "rb") as file:
            transcription = client.audio.transcriptions.create(
                file=(os.path.basename(path), file.read()),
                model="whisper-large-v3-turbo"
            )
        return transcription.text
    except Exception as e:
        # UI/tool boundary: report the failure as text instead of raising so
        # the caller always receives a string for the transcript box.
        return f"Error: {str(e)}"
# Build the Gradio interface: a two-column layout with upload/settings on the
# left and the transcript on the right, followed by a list of reference links.
# NOTE(review): the heading/button emoji below were restored from mis-encoded
# text; the "Transcript" and "Useful Links" glyphs are best guesses -- confirm.
with gr.Blocks(title="Audio/Video Transcription with Groq", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎵 Audio/Video Transcription with Groq Whisper")
    gr.Markdown("Upload an audio or video file and get an AI-generated transcript using Groq's Whisper model.")

    with gr.Row():
        # Left column - Input controls
        with gr.Column(scale=1):
            gr.Markdown("### 📤 Upload & Settings")
            audio_input = gr.File(
                label="Upload Audio/Video File",
                file_types=[".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".flac", ".ogg", ".aac"],
                file_count="single"
            )
            # type="password" masks the key while it is typed.
            api_key_input = gr.Textbox(
                label="Groq API Key",
                placeholder="Enter your Groq API key here...",
                type="password",
                lines=1
            )
            transcribe_btn = gr.Button(
                "🎯 Transcribe Audio",
                variant="primary",
                size="lg"
            )
            gr.Markdown("### ℹ️ File Requirements")
            gr.Markdown("""
            - **Max file size**: 25MB
            - **Supported formats**: MP3, MP4, MPEG, MPGA, M4A, WAV, WebM, FLAC, OGG, AAC
            - **Get API key**: [Groq Console](https://console.groq.com/)
            """)

        # Right column - Output
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Transcript")
            transcript_output = gr.Textbox(
                label="Generated Transcript",
                placeholder="Your transcript will appear here...",
                lines=20,
                max_lines=30,
                show_copy_button=True,
                interactive=False
            )

    # Connect the button to the transcription function
    transcribe_btn.click(
        fn=transcribe_audio,
        inputs=[audio_input, api_key_input],
        outputs=transcript_output,
        show_progress=True
    )

    # Reference links
    # NOTE(review): the "Supported audio formats" link targets OpenAI's
    # speech-to-text guide rather than Groq's docs -- confirm that is intended.
    gr.Markdown("### 🔗 Useful Links")
    gr.Markdown("""
    - [Get your Groq API key](https://console.groq.com/)
    - [Groq Documentation](https://console.groq.com/docs)
    - [Supported audio formats](https://platform.openai.com/docs/guides/speech-to-text)
    """)
# Script entry point: start the app only when run directly, not on import.
# mcp_server=True is passed through to Gradio's launch().
if __name__ == "__main__":
    demo.launch(mcp_server=True)