feat(app): implement audio/video transcription with Groq API
Browse files
- Replace letter counter with audio/video transcription functionality
- Implement two-column layout with upload controls and transcript display
- Add file validation for supported audio/video formats with 25MB size limit
- Integrate with Groq's Whisper-large-v3-turbo API for transcription
- Add secure API key input field with password protection
- Include helpful information and links for user guidance
- Implement comprehensive error handling with user-friendly messages
- Add detailed docstrings with MCP integration documentation
- Document input parameters with constraints and requirements
- Include usage examples to improve user understanding
- Format docstrings to comply with MCP protocol requirements
- Ensure proper tool exposure with mcp_server=True parameter
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import os
|
3 |
+
from groq import Groq
|
4 |
+
import tempfile
|
5 |
+
|
6 |
+
def validate_file(file):
    """Validate an uploaded file's size and extension.

    Parameters:
        file: The upload to check. Either a plain filesystem path (``str`` or
            ``os.PathLike``) or an object that exposes the on-disk path as
            ``file.name``. ``None`` means nothing was uploaded.

    Returns:
        A ``(is_valid, message)`` tuple. ``message`` is ``"File is valid"`` on
        success, otherwise a human-readable reason for the rejection.
    """
    if file is None:
        return False, "No file uploaded"

    # Plain paths are checked first on purpose: ``pathlib.Path.name`` is only
    # the basename, so falling back to ``.name`` for them would lose the
    # directory part of the path.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    # Check file size (25MB limit)
    try:
        file_size_mb = os.path.getsize(path) / (1024 * 1024)
    except OSError as exc:
        # Report unreadable/missing files through the normal result tuple
        # instead of letting the exception escape.
        return False, f"Unable to read file: {exc}"
    if file_size_mb > 25:
        return False, f"File size ({file_size_mb:.1f}MB) exceeds 25MB limit"

    # Check file extension (case-insensitive)
    valid_extensions = ['.mp3', '.mp4', '.mpeg', '.mpga', '.m4a', '.wav', '.webm', '.flac', '.ogg', '.aac']
    file_extension = os.path.splitext(path)[1].lower()
    if file_extension not in valid_extensions:
        return False, f"Invalid file type. Supported formats: {', '.join(valid_extensions)}"

    return True, "File is valid"
|
24 |
+
|
25 |
+
def transcribe_audio(audio_file, api_key):
    """Transcribe audio/video files into text using Groq's Whisper model.

    This tool converts spoken content from audio and video files into written text.
    It supports multiple audio formats and handles files up to 25MB in size.

    Parameters:
        audio_file: An audio or video file to transcribe (MP3, MP4, MPEG, MPGA, M4A, WAV, WebM, FLAC, OGG or AAC, at most 25MB).
        api_key: Your Groq API key, required for authentication (available from the Groq console).

    Returns:
        A text transcript of the spoken content in the audio file. If anything
        goes wrong, a message starting with "Error" is returned instead of
        raising.

    Example:
        Upload a podcast episode to get a complete text transcript.
    """
    try:
        # A blank or whitespace-only key cannot authenticate, so reject it
        # here rather than sending it to the API.
        if isinstance(api_key, str):
            api_key = api_key.strip()
        if not api_key:
            return "Error: Please provide your Groq API key"

        if audio_file is None:
            return "Error: Please upload an audio or video file"

        # Size/extension checks are delegated to validate_file
        is_valid, message = validate_file(audio_file)
        if not is_valid:
            return f"Error: {message}"

        # The upload may be a plain path or an object carrying it in ``.name``
        if isinstance(audio_file, (str, os.PathLike)):
            path = os.fspath(audio_file)
        else:
            path = audio_file.name

        client = Groq(api_key=api_key)

        # Read the bytes first so the file handle is closed before the
        # (potentially slow) network request starts.
        with open(path, "rb") as media:
            payload = media.read()

        transcription = client.audio.transcriptions.create(
            file=(os.path.basename(path), payload),
            model="whisper-large-v3-turbo",
        )
        return transcription.text

    except Exception as e:
        # UI/tool boundary: surface every failure as text instead of raising
        return f"Error: {str(e)}"
|
72 |
+
|
73 |
+
# Build the Gradio interface: upload controls on the left, transcript on the right.
# NOTE(review): heading emoji were restored from mis-encoded text; the icons on the
# "Transcript" and "Useful Links" headings are best guesses — confirm.
with gr.Blocks(title="Audio/Video Transcription with Groq", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎵 Audio/Video Transcription with Groq Whisper")
    gr.Markdown("Upload an audio or video file and get an AI-generated transcript using Groq's Whisper model.")

    with gr.Row():
        # Left column - Input controls
        with gr.Column(scale=1):
            gr.Markdown("### 📤 Upload & Settings")

            # Same extension list that validate_file enforces server-side
            audio_input = gr.File(
                label="Upload Audio/Video File",
                file_types=[".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".flac", ".ogg", ".aac"],
                file_count="single"
            )

            # Password-type textbox so the key is masked while typing
            api_key_input = gr.Textbox(
                label="Groq API Key",
                placeholder="Enter your Groq API key here...",
                type="password",
                lines=1
            )

            transcribe_btn = gr.Button(
                "🎯 Transcribe Audio",
                variant="primary",
                size="lg"
            )

            gr.Markdown("### ℹ️ File Requirements")
            gr.Markdown("""
            - **Max file size**: 25MB
            - **Supported formats**: MP3, MP4, MPEG, MPGA, M4A, WAV, WebM, FLAC, OGG, AAC
            - **Get API key**: [Groq Console](https://console.groq.com/)
            """)

        # Right column - Output
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Transcript")

            transcript_output = gr.Textbox(
                label="Generated Transcript",
                placeholder="Your transcript will appear here...",
                lines=20,
                max_lines=30,
                show_copy_button=True,
                interactive=False
            )

    # Connect the button to the transcription function
    transcribe_btn.click(
        fn=transcribe_audio,
        inputs=[audio_input, api_key_input],
        outputs=transcript_output,
        show_progress=True
    )

    # Reference links shown below the two columns
    # NOTE(review): the "Supported audio formats" link points at OpenAI's
    # speech-to-text guide rather than Groq's documentation — confirm intent.
    gr.Markdown("### 🔗 Useful Links")
    gr.Markdown("""
    - [Get your Groq API key](https://console.groq.com/)
    - [Groq Documentation](https://console.groq.com/docs)
    - [Supported audio formats](https://platform.openai.com/docs/guides/speech-to-text)
    """)

if __name__ == "__main__":
    # mcp_server=True is passed so the app is also served as an MCP server
    demo.launch(mcp_server=True)
|