NeuralFalcon committed on
Commit
84c2692
·
verified ·
1 Parent(s): 1531726

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +358 -0
utils.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import string
2
+ import json
3
+ import os
4
+
5
+ import re
6
+ import uuid
7
+ from pydub import AudioSegment
8
+
9
# Make sure the './subtitles' output directory is present at import time.
# Creating it and ignoring "already exists" leaves an existing path untouched.
try:
    os.makedirs("./subtitles")
except FileExistsError:
    pass
12
+
13
def clean_file_name(file_path, unique_id=True):
    """Return a sanitized version of the file name found in ``file_path``.

    Only the last path component is used. In its stem, every run of
    characters that is not an ASCII letter or a digit becomes a single
    underscore, and leading/trailing underscores are removed. The original
    extension is kept unchanged.

    Args:
        file_path: Path or bare file name to sanitize.
        unique_id: When true (default), append ``_`` plus six hex characters
            from a random UUID to the stem, before the extension.

    Returns:
        The sanitized file name (no directory part).
    """
    stem, extension = os.path.splitext(os.path.basename(file_path))

    # Turn each run of disallowed characters into one underscore ...
    underscored = re.sub(r'[^a-zA-Z\d]+', '_', stem)
    # ... then squeeze repeated underscores and trim them from both ends.
    safe_stem = re.sub(r'_+', '_', underscored).strip('_')

    if not unique_id:
        return f"{safe_stem}{extension}"

    # Six hex characters of a random UUID serve as the uniqueness suffix.
    short_id = uuid.uuid4().hex[:6]
    return f"{safe_stem}_{short_id}{extension}"
32
+
33
def convert_to_mono(file_path, output_format="mp3"):
    """Convert an audio file to mono and export it into ``./subtitles``.

    Args:
        file_path: Path of the source audio, loaded with
            ``AudioSegment.from_file``.
        output_format: Format handed to ``export``; also used as the output
            file extension (default ``"mp3"``).

    Returns:
        Path of the exported file:
        ``./subtitles/<cleaned name>_<random id>.<output_format>``.
    """
    # Load the audio and down-mix it to a single channel.
    audio = AudioSegment.from_file(file_path)
    mono_audio = audio.set_channels(1)

    # Sanitize the complete file name (extension included) and drop the
    # extension afterwards. Passing an already-stripped stem would make
    # clean_file_name() treat the tail of a dotted stem ("my.song.wav" ->
    # "my.song") as an extension and leave ".song" inside the output name.
    cleaned_stem = os.path.splitext(clean_file_name(file_path))[0]

    # The directory is created at import time; recreate it here in case it
    # was removed in the meantime.
    os.makedirs("./subtitles", exist_ok=True)
    output_file = f"./subtitles/{cleaned_stem}.{output_format}"

    # Export the mono audio
    mono_audio.export(output_file, format=output_format)
    return output_file
50
+
51
def format_srt_time(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond before being split into
    fields. Truncating the fractional part instead loses a millisecond to
    binary float error (0.57 s would print as 569 ms) and cannot carry into
    the seconds field. Negative inputs are clamped to zero.

    Args:
        seconds: Offset in seconds (int, float, or anything ``float()``
            accepts).

    Returns:
        The formatted timestamp string.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_sec, millisec = divmod(total_ms, 1000)
    total_min, sec = divmod(total_sec, 60)
    hours, minutes = divmod(total_min, 60)
    return f"{hours:02}:{minutes:02}:{sec:02},{millisec:03}"
57
+
58
+ ## Word Level SRT File
59
def write_word_srt(mono_audio_path, word_level_timestamps, skip_punctuation=True):
    """Write an SRT file containing one cue per word.

    The file is written next to the audio as ``<audio stem>_word_level.srt``.

    Args:
        mono_audio_path: Path of the audio file the timestamps belong to.
        word_level_timestamps: Iterable of dicts with ``"word"``, ``"start"``
            and ``"end"`` keys (times in seconds).
        skip_punctuation: When true (default), entries whose word consists
            only of punctuation characters (or is empty) get no cue.

    Returns:
        Path of the generated SRT file.
    """
    # Build the output name from the stem. Replacing the extension text in
    # the whole path would also hit earlier occurrences of it and, for a
    # path without extension, insert the suffix between every character.
    output_file = os.path.splitext(mono_audio_path)[0] + "_word_level.srt"

    with open(output_file, "w", encoding="utf-8") as f:
        index = 1  # SRT cue numbers start at 1 and skip nothing

        for entry in word_level_timestamps:
            word = entry["word"]

            if skip_punctuation and all(c in string.punctuation for c in word):
                continue

            start_srt = format_srt_time(entry["start"])
            end_srt = format_srt_time(entry["end"])

            f.write(f"{index}\n{start_srt} --> {end_srt}\n{word}\n\n")
            index += 1
    return output_file
77
+
78
+
79
+ ## Speech To text File
80
def write_words_to_txt(mono_audio_path, word_level_timestamps):
    """Save the transcript as plain text next to the audio file.

    Words are joined with single spaces; entries whose word consists only of
    punctuation characters (or is empty) are left out. The file is written
    as ``<audio stem>.txt``.

    Args:
        mono_audio_path: Path of the audio file the timestamps belong to.
        word_level_timestamps: Iterable of dicts with a ``"word"`` key.

    Returns:
        A ``(text, output_file)`` tuple: the transcript string and the path
        of the text file it was written to.
    """
    # Build the output name from the stem. Replacing the extension text in
    # the whole path would also hit earlier occurrences of it and, for a
    # path without extension, insert the suffix between every character.
    output_file = os.path.splitext(mono_audio_path)[0] + ".txt"

    words = [
        entry["word"]
        for entry in word_level_timestamps
        if not all(c in string.punctuation for c in entry["word"])
    ]
    text = " ".join(words)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(text)
    return text, output_file
94
+
95
+
96
+ ## Sentence Level Srt File
97
def generate_professional_subtitles(mono_audio_path, word_timestamps, max_words_per_subtitle=8, max_subtitle_duration=5.0, min_pause_for_split=0.5):
    """
    Group word timestamps into sentence-style subtitles and save them as SRT.

    A subtitle is closed:
    - right after a word ending in '.', '?' or '!' (that word stays in it)
    - before a word that follows a silence longer than min_pause_for_split
    - before a word once it already holds max_words_per_subtitle words
    - before a word that would stretch it past max_subtitle_duration seconds

    The file is written next to the audio as "<audio stem>.srt".

    Args:
        mono_audio_path: Path of the audio file the timestamps belong to
        word_timestamps: Iterable of dicts with 'word', 'start', 'end' keys
        max_words_per_subtitle: Word limit per subtitle
        max_subtitle_duration: Duration limit per subtitle, in seconds
        min_pause_for_split: Silence (seconds) that forces a new subtitle

    Returns:
        output_file: Path to the generated SRT file
        subtitles: List of subtitle dictionaries with text/start/end
    """
    # Build the output name from the stem. Replacing the extension text in
    # the whole path would also hit earlier occurrences of it and, for a
    # path without extension, insert the suffix between every character.
    output_file = os.path.splitext(mono_audio_path)[0] + ".srt"

    subtitles = []

    def new_sub():
        # Fresh accumulator for the subtitle currently being built.
        return {"text": "", "start": None, "end": None, "word_count": 0}

    def finalize(sub):
        # Store the finished subtitle without the bookkeeping fields.
        subtitles.append({
            "text": sub["text"].strip(),
            "start": sub["start"],
            "end": sub["end"]
        })

    current_sub = new_sub()

    for word_data in word_timestamps:
        word = word_data['word']
        word_start = word_data['start']
        word_end = word_data['end']

        # Conditions that close the running subtitle BEFORE this word joins.
        if current_sub["text"]:
            has_pause = word_start - current_sub["end"] > min_pause_for_split
            is_full = current_sub["word_count"] >= max_words_per_subtitle
            too_long = (word_end - current_sub["start"]) > max_subtitle_duration
            if has_pause or is_full or too_long:
                finalize(current_sub)
                current_sub = new_sub()

        # Add current word to subtitle
        if current_sub["word_count"] == 0:
            current_sub["start"] = word_start
        current_sub["text"] += " " + word if current_sub["text"] else word
        current_sub["end"] = word_end
        current_sub["word_count"] += 1

        # Sentence-ending punctuation closes the subtitle AFTER the word is
        # added, so the final word stays with its own sentence instead of
        # opening the next subtitle.
        if current_sub["text"] and word.rstrip().endswith(('.', '?', '!')):
            finalize(current_sub)
            current_sub = new_sub()

    # Add last subtitle if exists
    if current_sub["text"]:
        finalize(current_sub)

    # Write to SRT file
    with open(output_file, "w", encoding="utf-8") as f:
        for i, sub in enumerate(subtitles, 1):
            f.write(f"{i}\n")
            f.write(f"{format_srt_time(sub['start'])} --> {format_srt_time(sub['end'])}\n")
            f.write(f"{sub['text']}\n\n")

    return output_file, subtitles
182
+
183
+
184
+ ## For vertical Videos
185
def for_yt_shorts(mono_audio_path, word_timestamps, min_silence_between_words=0.3, max_characters_per_subtitle=17):
    """
    Generates short subtitles suited to vertical videos (Shorts/Reels) by:
    - Merging a word with following fragments that start with '-'; the
      fragment's leading hyphens are dropped (e.g., "co-" + "-worker" →
      "co-worker", "co" + "-worker" → "coworker")
    - Respecting max character limits per subtitle (default: 17)
    - Creating natural breaks at pauses (> min_silence_between_words)
    - Outputting a properly formatted SRT file, "<audio stem>_shorts.srt"

    Args:
        mono_audio_path: Path of the audio file the timestamps belong to
        word_timestamps: Sequence of dicts with 'word', 'start', 'end' keys
        min_silence_between_words: Silence (seconds) that forces a new subtitle
        max_characters_per_subtitle: Character limit per subtitle

    Returns:
        output_file: Path to generated SRT file
        subtitles: List of subtitle dictionaries (text/start/end)
    """
    subtitles = []
    current_sub = {
        "text": "",
        "start": None,
        "end": None,
        "char_count": 0
    }

    # Build the output name from the stem. Replacing the extension text in
    # the whole path would also hit earlier occurrences of it and, for a
    # path without extension, insert the suffix between every character.
    output_file = os.path.splitext(mono_audio_path)[0] + "_shorts.srt"

    i = 0
    while i < len(word_timestamps):
        # Process current word and any hyphenated continuations
        full_word = word_timestamps[i]['word']
        start_time = word_timestamps[i]['start']
        end_time = word_timestamps[i]['end']

        # Absorb following fragments that begin with '-'
        while (i + 1 < len(word_timestamps) and
               word_timestamps[i+1]['word'].startswith('-')):
            next_word = word_timestamps[i+1]['word'].lstrip('-')
            full_word += next_word
            end_time = word_timestamps[i+1]['end']
            i += 1

        # Check if adding this word would exceed character limit
        new_char_count = current_sub["char_count"] + len(full_word) + (1 if current_sub["text"] else 0)

        # Measure the pause from the start of the merged word. After the
        # merge loop, index i points at the last fragment, whose start would
        # overstate the silence.
        needs_break = (
            new_char_count > max_characters_per_subtitle or
            (current_sub["end"] is not None and
             start_time - current_sub["end"] > min_silence_between_words)
        )

        if needs_break and current_sub["text"]:
            # Finalize current subtitle
            subtitles.append({
                "text": current_sub["text"].strip(),
                "start": current_sub["start"],
                "end": current_sub["end"]
            })
            # Start new subtitle
            current_sub = {
                "text": full_word,
                "start": start_time,
                "end": end_time,
                "char_count": len(full_word)
            }
        else:
            # Add to current subtitle
            if current_sub["text"]:
                current_sub["text"] += " " + full_word
                current_sub["char_count"] += 1 + len(full_word)  # Space + word
            else:
                current_sub["text"] = full_word
                current_sub["start"] = start_time
                current_sub["char_count"] = len(full_word)
            current_sub["end"] = end_time

        i += 1

    # Add final subtitle if exists
    if current_sub["text"]:
        subtitles.append({
            "text": current_sub["text"].strip(),
            "start": current_sub["start"],
            "end": current_sub["end"]
        })

    # Write SRT file
    with open(output_file, "w", encoding="utf-8") as f:
        for idx, sub in enumerate(subtitles, 1):
            f.write(f"{idx}\n")
            f.write(f"{format_srt_time(sub['start'])} --> {format_srt_time(sub['end'])}\n")
            f.write(f"{sub['text']}\n\n")

    return output_file, subtitles
277
+
278
+
279
+
280
+ ## Save word level timestamp for later use if you are a developer
281
def word_timestamp_json(mono_audio_path, word_timestamps):
    """
    Save word timestamps as a JSON file with the same base name as the audio file.

    The file is written as "<audio stem>_word_timestamps.json", indented and
    with non-ASCII characters kept as-is.

    Args:
        mono_audio_path: Path to the audio file (e.g., "audio.wav")
        word_timestamps: List of word timestamp dictionaries

    Returns:
        output_file: Path to the generated JSON file
    """
    # Build the output name from the stem. Replacing the extension text in
    # the whole path would also hit earlier occurrences of it and, for a
    # path without extension, insert the suffix between every character.
    output_file = os.path.splitext(mono_audio_path)[0] + "_word_timestamps.json"

    # Save as JSON with pretty formatting
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(word_timestamps, f, indent=2, ensure_ascii=False)

    return output_file
302
+
303
+ ## save all files
304
def save_files(mono_audio_path, word_timestamps):
    """
    Produce every subtitle/text output for one audio file in a single call.

    Files written (each by its dedicated helper, named after the audio path):
        1. Sentence-style SRT subtitles (max 8 words, 5.0 s, 0.5 s pause split)
        2. Word-level SRT
        3. Vertical-video SRT (0.3 s pause split, 17 characters max)
        4. Plain-text transcript
        5. JSON dump of the word timestamps

    Args:
        mono_audio_path: Path of the audio file the timestamps belong to
        word_timestamps: List of dictionaries containing word-level timestamps
                         [{'word': str, 'start': float, 'end': float}, ...]

    Returns:
        Six values, in this order:
            default_srt_path:      sentence-style SRT path
            word_level_srt_path:   word-level SRT path
            shorts_srt_path:       vertical-video SRT path
            speech_text_path:      plain-text transcript path
            timestamps_json_path:  JSON timestamp file path
            text:                  transcript text as a string
    """
    # Sentence-style subtitles; the returned subtitle list is not needed here.
    default_srt_path, _sentence_subs = generate_professional_subtitles(
        mono_audio_path,
        word_timestamps,
        max_words_per_subtitle=8,
        max_subtitle_duration=5.0,
        min_pause_for_split=0.5,
    )

    # One cue per word.
    word_level_srt_path = write_word_srt(mono_audio_path, word_timestamps)

    # Short cues for vertical videos; again only the path is kept.
    shorts_srt_path, _shorts_subs = for_yt_shorts(
        mono_audio_path,
        word_timestamps,
        min_silence_between_words=0.3,
        max_characters_per_subtitle=17,
    )

    # Transcript string plus the file it was saved to.
    text, speech_text_path = write_words_to_txt(mono_audio_path, word_timestamps)

    # Raw timestamps as JSON.
    timestamps_json_path = word_timestamp_json(mono_audio_path, word_timestamps)

    return (
        default_srt_path,
        word_level_srt_path,
        shorts_srt_path,
        speech_text_path,
        timestamps_json_path,
        text,
    )