Enrique Cardoza committed on
Commit
d49e517
·
1 Parent(s): 80bdfa1

feat(transcription): add URL-based audio transcription

Browse files

- Add new transcribe_audio_from_url function for processing audio from URLs
- Implement URL validation with extension and size checks
- Create tabbed interface separating upload and URL methods
- Add functionality to download and process remote audio files
- Reorganize UI to share API key input between both methods
- Add URL-specific requirements and instructions
- Improve error handling for network operations
- Implement temp file management for downloaded audio

Files changed (1) hide show
  1. app.py +214 -50
app.py CHANGED
@@ -2,6 +2,8 @@ import gradio as gr
2
  import os
3
  from groq import Groq
4
  import tempfile
 
 
5
 
6
  def validate_file(file):
7
  """Validate uploaded file type and size."""
@@ -22,6 +24,45 @@ def validate_file(file):
22
 
23
  return True, "File is valid"
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  def transcribe_audio(audio_file, api_key):
26
  """Transcribe audio/video files into text using Groq's Whisper model.
27
 
@@ -73,64 +114,187 @@ def transcribe_audio(audio_file, api_key):
73
  except Exception as e:
74
  return f"Error: {str(e)}"
75
 
76
- # Create the Gradio interface with custom layout
77
- with gr.Blocks(title="Audio/Video Transcription with Groq", theme=gr.themes.Soft()) as demo:
78
- gr.Markdown("# 🎡 Audio/Video Transcription with Groq Whisper")
79
- gr.Markdown("Upload an audio or video file and get an AI-generated transcript using Groq's Whisper model.")
80
 
81
- with gr.Row():
82
- # Left column - Input controls
83
- with gr.Column(scale=1):
84
- gr.Markdown("### πŸ“€ Upload & Settings")
85
-
86
- audio_input = gr.File(
87
- label="Upload Audio/Video File",
88
- file_types=[".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".flac", ".ogg", ".aac"],
89
- file_count="single"
90
- )
91
-
92
- # Show a note if env var is present
93
- api_key_note = "API key will be used from environment variable if set" if os.environ.get("GROQ_API_KEY") else ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
- api_key_input = gr.Textbox(
96
- label="Groq API Key",
97
- placeholder="Enter your Groq API key here or set GROQ_API_KEY environment variable",
98
- type="password",
99
- lines=1,
100
- info=api_key_note
101
- )
102
 
103
- transcribe_btn = gr.Button(
104
- "🎯 Transcribe Audio",
105
- variant="primary",
106
- size="lg"
107
- )
 
 
 
 
 
 
 
 
 
 
108
 
109
- gr.Markdown("### ℹ️ File Requirements")
110
- gr.Markdown("""
111
- - **Max file size**: 25MB
112
- - **Supported formats**: MP3, MP4, MPEG, MPGA, M4A, WAV, WebM, FLAC, OGG, AAC
113
- - **Get API key**: [Groq Console](https://console.groq.com/)
114
- """)
115
-
116
- # Right column - Output
117
- with gr.Column(scale=1):
118
- gr.Markdown("### πŸ“ Transcript")
119
 
120
- transcript_output = gr.Textbox(
121
- label="Generated Transcript",
122
- placeholder="Your transcript will appear here...",
123
- lines=20,
124
- max_lines=30,
125
- show_copy_button=True,
126
- interactive=False
127
- )
 
 
 
 
 
 
128
 
129
- # Connect the button to the transcription function
130
- transcribe_btn.click(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  fn=transcribe_audio,
132
  inputs=[audio_input, api_key_input],
133
- outputs=transcript_output,
 
 
 
 
 
 
 
134
  show_progress=True
135
  )
136
 
 
2
  import os
3
  from groq import Groq
4
  import tempfile
5
+ import requests
6
+ import urllib.parse
7
 
8
  def validate_file(file):
9
  """Validate uploaded file type and size."""
 
24
 
25
  return True, "File is valid"
26
 
27
def validate_url_file(url):
    """Validate a remote audio/video file by URL shape, extension and size.

    The offline checks (non-empty, http/https scheme with a host, supported
    file extension) run first. Only when they pass is a HEAD request sent to
    confirm the resource answers with HTTP 200 and, if the server reports a
    Content-Length, that it does not exceed 25MB.

    Parameters:
        url: Address of the file to check. Leading and trailing whitespace
            is ignored, so a URL pasted with a trailing space or newline is
            validated as the URL itself.

    Returns:
        A ``(is_valid, message)`` tuple. ``message`` is "File is valid" on
        success, otherwise a human-readable reason for the rejection.
    """
    # Normalise once and use the cleaned value everywhere below; the
    # emptiness test alone is not enough because a padded URL would end up
    # with an extension like ".mp3 " and be rejected as an unknown type.
    url = (url or "").strip()
    if not url:
        return False, "No URL provided"

    # Tuple (not set) so the order in the error message stays stable.
    valid_extensions = ('.mp3', '.mp4', '.mpeg', '.mpga', '.m4a', '.wav', '.webm', '.flac', '.ogg', '.aac')
    max_size_mb = 25

    try:
        # Check that the URL has both a scheme and a host
        parsed_url = urllib.parse.urlparse(url)
        if not (parsed_url.scheme and parsed_url.netloc):
            return False, "Invalid URL format"

        if parsed_url.scheme not in ('http', 'https'):
            return False, "URL must start with http:// or https://"

        # The extension is taken from the path only, so query strings and
        # fragments do not interfere with the check.
        file_extension = os.path.splitext(parsed_url.path)[1].lower()
        if file_extension not in valid_extensions:
            return False, f"Invalid file type in URL. Supported formats: {', '.join(valid_extensions)}"

        # Ask for headers only; redirects are followed so the final
        # location's status and size are the ones inspected.
        response = requests.head(url, allow_redirects=True, timeout=10)
        if response.status_code != 200:
            return False, f"Could not access URL (HTTP {response.status_code})"

        # NOTE: when the server sends no Content-Length the size cannot be
        # checked here and the URL is accepted as-is.
        content_length = response.headers.get('content-length')
        if content_length:
            file_size_mb = int(content_length) / (1024 * 1024)
            if file_size_mb > max_size_mb:
                return False, f"File size ({file_size_mb:.1f}MB) exceeds 25MB limit"

        return True, "File is valid"

    except requests.exceptions.RequestException as e:
        return False, f"Error accessing URL: {str(e)}"
    except Exception as e:
        # Covers malformed input such as an unparsable URL or a
        # non-numeric Content-Length header.
        return False, f"Error validating URL: {str(e)}"
65
+
66
  def transcribe_audio(audio_file, api_key):
67
  """Transcribe audio/video files into text using Groq's Whisper model.
68
 
 
114
  except Exception as e:
115
  return f"Error: {str(e)}"
116
 
117
def transcribe_audio_from_url(audio_url, api_key):
    """Transcribe audio/video files from a URL into text using Groq's Whisper model.

    This tool converts spoken content from audio and video files into written text.
    It supports multiple audio formats and handles files up to 25MB in size.

    The URL is validated, the file is streamed to a temporary file (aborting
    as soon as more than 25MB has been received), sent to the
    "whisper-large-v3-turbo" model, and the temporary file is removed again
    whether or not any step fails.

    Parameters:
        audio_url: URL to an audio or video file to transcribe (http or https).
                  Supported formats: MP3, MP4, MPEG, MPGA, M4A, WAV, WebM, FLAC, OGG, AAC.
                  Maximum size: 25MB. Surrounding whitespace is ignored.
        api_key: Your Groq API key, required for authentication.
                You can obtain this from https://console.groq.com/
                A non-empty GROQ_API_KEY environment variable takes precedence.

    Returns:
        A text transcript of the spoken content in the audio file, or a
        string starting with "Error" describing what went wrong.

    Example:
        Provide a URL to a podcast episode to get a complete text transcript.
    """
    max_bytes = 25 * 1024 * 1024
    # Tracked from the moment the temp file exists so the ``finally`` below
    # can remove it even when the download itself fails.
    temp_file_path = None

    try:
        # Prefer the environment variable, but fall back to the provided key
        # when the variable is unset OR set to an empty string.
        actual_api_key = os.environ.get("GROQ_API_KEY") or api_key

        # Validate API key
        if not actual_api_key:
            return "Error: Please provide your Groq API key or set the GROQ_API_KEY environment variable"

        audio_url = (audio_url or "").strip()
        if not audio_url:
            return "Error: Please provide a URL to an audio or video file"

        # Validate file from URL
        is_valid, message = validate_url_file(audio_url)
        if not is_valid:
            return f"Error: {message}"

        # Initialize Groq client
        client = Groq(api_key=actual_api_key)

        # Download the file to a temporary location
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_file_path = temp_file.name

            # The response is used as a context manager so the streamed
            # connection is released on every exit path.
            with requests.get(audio_url, stream=True, timeout=30) as response:
                response.raise_for_status()

                # Enforce the size limit on the bytes actually received; the
                # Content-Length header may be missing or inaccurate.
                downloaded = 0
                for chunk in response.iter_content(chunk_size=8192):
                    downloaded += len(chunk)
                    if downloaded > max_bytes:
                        return "Error: File size exceeds 25MB limit"
                    temp_file.write(chunk)

        # Use the last path segment of the URL as the upload filename
        filename = os.path.basename(urllib.parse.urlparse(audio_url).path)
        if not filename:
            filename = "audio_from_url"

        # Read the downloaded file and create the transcription
        with open(temp_file_path, "rb") as file:
            transcription = client.audio.transcriptions.create(
                file=(filename, file.read()),
                model="whisper-large-v3-turbo"
            )

        return transcription.text

    except requests.exceptions.RequestException as e:
        return f"Error downloading file: {str(e)}"
    except Exception as e:
        return f"Error: {str(e)}"
    finally:
        # Clean up the temporary file on success and on every failure path
        if temp_file_path and os.path.exists(temp_file_path):
            os.unlink(temp_file_path)
190
+
191
+ # Create the Gradio interface with custom layout
192
+ with gr.Blocks(title="Audio/Video Transcription with Groq", theme=gr.themes.Soft()) as demo:
193
+ gr.Markdown("# 🎡 Audio/Video Transcription with Groq Whisper")
194
+ gr.Markdown("Upload an audio/video file or provide a URL and get an AI-generated transcript using Groq's Whisper model.")
195
 
196
+ # API Key input - shared between tabs
197
+ api_key_note = "API key will be used from environment variable if set" if os.environ.get("GROQ_API_KEY") else ""
198
+ api_key_input = gr.Textbox(
199
+ label="Groq API Key",
200
+ placeholder="Enter your Groq API key here or set GROQ_API_KEY environment variable",
201
+ type="password",
202
+ lines=1,
203
+ info=api_key_note
204
+ )
205
+
206
+ with gr.Tabs():
207
+ # Tab 1: File Upload
208
+ with gr.TabItem("Upload File"):
209
+ with gr.Row():
210
+ # Left column - Input controls
211
+ with gr.Column(scale=1):
212
+ gr.Markdown("### πŸ“€ Upload Audio/Video")
213
+
214
+ audio_input = gr.File(
215
+ label="Upload Audio/Video File",
216
+ file_types=[".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".flac", ".ogg", ".aac"],
217
+ file_count="single"
218
+ )
219
+
220
+ upload_transcribe_btn = gr.Button(
221
+ "🎯 Transcribe Uploaded File",
222
+ variant="primary",
223
+ size="lg"
224
+ )
225
+
226
+ gr.Markdown("### ℹ️ File Requirements")
227
+ gr.Markdown("""
228
+ - **Max file size**: 25MB
229
+ - **Supported formats**: MP3, MP4, MPEG, MPGA, M4A, WAV, WebM, FLAC, OGG, AAC
230
+ - **Get API key**: [Groq Console](https://console.groq.com/)
231
+ """)
232
+
233
+ # Right column - Output
234
+ with gr.Column(scale=1):
235
+ gr.Markdown("### πŸ“ Transcript")
236
+
237
+ upload_transcript_output = gr.Textbox(
238
+ label="Generated Transcript",
239
+ placeholder="Your transcript will appear here...",
240
+ lines=20,
241
+ max_lines=30,
242
+ show_copy_button=True,
243
+ interactive=False
244
+ )
245
+
246
+ # Tab 2: URL Input
247
+ with gr.TabItem("Audio URL"):
248
+ with gr.Row():
249
+ # Left column - Input controls
250
+ with gr.Column(scale=1):
251
+ gr.Markdown("### πŸ”— Audio/Video URL")
252
+
253
+ url_input = gr.Textbox(
254
+ label="URL to Audio/Video File",
255
+ placeholder="Enter the http/https URL to an audio or video file",
256
+ lines=2
257
+ )
258
+
259
+ url_transcribe_btn = gr.Button(
260
+ "🎯 Transcribe from URL",
261
+ variant="primary",
262
+ size="lg"
263
+ )
264
+
265
+ gr.Markdown("### ℹ️ URL Requirements")
266
+ gr.Markdown("""
267
+ - **URL format**: Must start with http:// or https://
268
+ - **Max file size**: 25MB
269
+ - **Supported formats**: MP3, MP4, MPEG, MPGA, M4A, WAV, WebM, FLAC, OGG, AAC
270
+ - **Direct link**: URL must point directly to the audio/video file
271
+ """)
272
+
273
+ # Right column - Output
274
+ with gr.Column(scale=1):
275
+ gr.Markdown("### πŸ“ Transcript")
276
+
277
+ url_transcript_output = gr.Textbox(
278
+ label="Generated Transcript",
279
+ placeholder="Your transcript will appear here...",
280
+ lines=20,
281
+ max_lines=30,
282
+ show_copy_button=True,
283
+ interactive=False
284
+ )
285
+
286
+ # Connect the buttons to their respective transcription functions
287
+ upload_transcribe_btn.click(
288
  fn=transcribe_audio,
289
  inputs=[audio_input, api_key_input],
290
+ outputs=upload_transcript_output,
291
+ show_progress=True
292
+ )
293
+
294
+ url_transcribe_btn.click(
295
+ fn=transcribe_audio_from_url,
296
+ inputs=[url_input, api_key_input],
297
+ outputs=url_transcript_output,
298
  show_progress=True
299
  )
300