Create utils.py
Browse files
utils.py
ADDED
@@ -0,0 +1,358 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import string
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
|
5 |
+
import re
|
6 |
+
import uuid
|
7 |
+
from pydub import AudioSegment
|
8 |
+
|
9 |
+
# Ensure the './subtitles' directory exists as soon as this module is imported.
# convert_to_mono() exports its audio into this folder and does not create it
# itself. The path is relative, so it resolves against the current working
# directory at import time.
if not os.path.exists("./subtitles"):
    os.makedirs("./subtitles", exist_ok=True)
|
12 |
+
|
13 |
+
def clean_file_name(file_path, unique_id=True):
    """Return a sanitized file name derived from *file_path*.

    Only the base name is considered; any directory part is dropped. Within
    the stem, every run of characters other than the letters a-z/A-Z and
    digits is turned into a single underscore, and underscores at either end
    are trimmed. The original extension is re-attached untouched.

    Args:
        file_path: Path or bare file name to sanitize.
        unique_id: When true, the first six hex characters of a random UUID
            are appended to the stem (``<stem>_<id><ext>``).

    Returns:
        The sanitized file name, without any directory component.
    """
    stem, extension = os.path.splitext(os.path.basename(file_path))

    # Replace everything that is not a letter or digit, then squeeze repeated
    # underscores and trim them from both ends.
    stem = re.sub(r'[^a-zA-Z\d]+', '_', stem)
    stem = re.sub(r'_+', '_', stem).strip('_')

    if not unique_id:
        return f"{stem}{extension}"

    # Short random tag so repeated uploads of the same name do not collide.
    tag = uuid.uuid4().hex[:6]
    return f"{stem}_{tag}{extension}"
|
32 |
+
|
33 |
+
def convert_to_mono(file_path, output_format="mp3"):
    """Convert an audio file to mono and export it under ``./subtitles``.

    Args:
        file_path: Path to the input audio; it is loaded with
            ``AudioSegment.from_file``.
        output_format: Format handed to ``export`` and also used as the
            extension of the output file.

    Returns:
        Path of the exported file,
        ``./subtitles/<cleaned name>.<output_format>``, where the cleaned
        name comes from ``clean_file_name`` (which appends a random id by
        default).
    """
    # Load the audio and downmix it to a single channel.
    audio = AudioSegment.from_file(file_path)
    mono_audio = audio.set_channels(1)

    # Sanitize the complete base name first and drop the source extension
    # afterwards. Passing an already-stripped stem to clean_file_name made it
    # split the name a second time, so "my.song.wav" lost ".song" to the
    # extension slot and ended up as "my_<id>.song.mp3".
    cleaned_stem = os.path.splitext(clean_file_name(file_path))[0]

    # Do not depend only on the import-time directory creation: the folder may
    # have been deleted, or the working directory changed, since then.
    os.makedirs("./subtitles", exist_ok=True)
    output_file = f"./subtitles/{cleaned_stem}.{output_format}"

    # Export the mono audio.
    mono_audio.export(output_file, format=output_format)
    return output_file
|
50 |
+
|
51 |
+
def format_srt_time(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Offset in seconds (int or float). Negative values are
            clamped to zero, since an SRT timestamp cannot be negative.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"`` for ``3661.5``.
    """
    # Convert to whole milliseconds once, with rounding. Deriving the
    # millisecond field from the float remainder truncated values such as
    # 1.001 (remainder 0.000999...) to ",000" and 0.57 to ",569". Splitting a
    # single integer also keeps all fields consistent when rounding carries
    # over (59.9996 -> 00:01:00,000).
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_sec, millisec = divmod(total_ms, 1000)
    total_min, sec = divmod(total_sec, 60)
    hours, minutes = divmod(total_min, 60)
    return f"{hours:02}:{minutes:02}:{sec:02},{millisec:03}"
|
57 |
+
|
58 |
+
## Word Level SRT File
|
59 |
+
def write_word_srt(mono_audio_path, word_level_timestamps, skip_punctuation=True):
    """Write an SRT file with one cue per word.

    The file is saved next to *mono_audio_path* as ``<stem>_word_level.srt``.

    Args:
        mono_audio_path: Path of the audio file; only used to derive the
            output path.
        word_level_timestamps: Iterable of dicts with ``"word"``, ``"start"``
            and ``"end"`` keys; start/end are passed to ``format_srt_time``.
        skip_punctuation: When true, entries whose word consists only of
            ``string.punctuation`` characters (or is empty) are left out.

    Returns:
        Path of the SRT file that was written.
    """
    # Swap only the trailing extension. str.replace() rewrote every occurrence
    # of the extension text anywhere in the path and, for a path without an
    # extension, inserted the suffix between all characters.
    output_file = os.path.splitext(mono_audio_path)[0] + "_word_level.srt"

    with open(output_file, "w", encoding="utf-8") as f:
        index = 1  # cue numbers count only the entries actually written
        for entry in word_level_timestamps:
            word = entry["word"]

            if skip_punctuation and all(c in string.punctuation for c in word):
                continue

            start_srt = format_srt_time(entry["start"])
            end_srt = format_srt_time(entry["end"])

            f.write(f"{index}\n{start_srt} --> {end_srt}\n{word}\n\n")
            index += 1
    return output_file
|
77 |
+
|
78 |
+
|
79 |
+
## Speech-to-text file
|
80 |
+
def write_words_to_txt(mono_audio_path, word_level_timestamps):
    """Join the transcribed words into plain text and save it as ``<stem>.txt``.

    Entries whose word consists only of ``string.punctuation`` characters (or
    is empty) are skipped; the remaining words are joined with single spaces.

    Args:
        mono_audio_path: Path of the audio file; only used to derive the
            output path.
        word_level_timestamps: Iterable of dicts that each have a ``"word"``
            key.

    Returns:
        A ``(text, output_file)`` tuple: the joined transcript and the path of
        the text file it was written to.
    """
    # Swap only the trailing extension. str.replace() rewrote every occurrence
    # of the extension text anywhere in the path and, for a path without an
    # extension, inserted the suffix between all characters.
    output_file = os.path.splitext(mono_audio_path)[0] + ".txt"

    # Build the text before opening the file so a malformed entry does not
    # leave an empty transcript behind.
    words = [
        entry["word"]
        for entry in word_level_timestamps
        if not all(c in string.punctuation for c in entry["word"])
    ]
    text = " ".join(words)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(text)
    return text, output_file
|
94 |
+
|
95 |
+
|
96 |
+
## Sentence Level Srt File
|
97 |
+
def generate_professional_subtitles(mono_audio_path, word_timestamps, max_words_per_subtitle=8, max_subtitle_duration=5.0, min_pause_for_split=0.5):
    """Group word timestamps into sentence-style cues and write ``<stem>.srt``.

    Words are appended to the cue being built. A new cue is started *before*
    a word when any of the following holds for the current cue:

    - its last word ends with ``.``, ``?`` or ``!``
    - the gap between its end and the word's start exceeds
      ``min_pause_for_split``
    - it already contains ``max_words_per_subtitle`` words
    - adding the word would make it longer than ``max_subtitle_duration``

    Surrounding whitespace is stripped from every word, and words that are
    empty after stripping are ignored.

    Args:
        mono_audio_path: Path of the audio file; only used to derive the
            output path.
        word_timestamps: Iterable of dicts with ``'word'``, ``'start'`` and
            ``'end'`` keys; start/end are passed to ``format_srt_time``.
        max_words_per_subtitle: Maximum number of words in one cue.
        max_subtitle_duration: Maximum span of one cue, in the same unit as
            the timestamps.
        min_pause_for_split: Gap between words that forces a new cue.

    Returns:
        A ``(output_file, subtitles)`` tuple: the path of the SRT file and the
        list of cue dicts with ``"text"``, ``"start"`` and ``"end"`` keys.
    """
    sentence_endings = ('.', '?', '!')

    # Swap only the trailing extension. str.replace() rewrote every occurrence
    # of the extension text anywhere in the path and, for a path without an
    # extension, inserted the suffix between all characters.
    output_file = os.path.splitext(mono_audio_path)[0] + ".srt"

    subtitles = []
    cue_words = []  # words of the cue currently being built
    cue_start = None
    cue_end = None

    for word_data in word_timestamps:
        # Strip so that padded words neither defeat the punctuation check nor
        # introduce double spaces in the cue text.
        word = word_data['word'].strip()
        if not word:
            continue
        word_start = word_data['start']
        word_end = word_data['end']

        if cue_words:
            # The sentence check looks at the word already in the cue. Testing
            # the incoming word instead closed the cue one word too early, so
            # "world." opened the following cue rather than ending its own.
            should_split = (
                cue_words[-1].endswith(sentence_endings)
                or word_start - cue_end > min_pause_for_split
                or len(cue_words) >= max_words_per_subtitle
                or word_end - cue_start > max_subtitle_duration
            )
            if should_split:
                subtitles.append({
                    "text": " ".join(cue_words),
                    "start": cue_start,
                    "end": cue_end,
                })
                cue_words = []

        if not cue_words:
            cue_start = word_start
        cue_words.append(word)
        cue_end = word_end

    # Flush the cue that was still open when the input ended.
    if cue_words:
        subtitles.append({
            "text": " ".join(cue_words),
            "start": cue_start,
            "end": cue_end,
        })

    # Write to SRT file
    with open(output_file, "w", encoding="utf-8") as f:
        for i, sub in enumerate(subtitles, 1):
            f.write(f"{i}\n")
            f.write(f"{format_srt_time(sub['start'])} --> {format_srt_time(sub['end'])}\n")
            f.write(f"{sub['text']}\n\n")

    return output_file, subtitles
|
182 |
+
|
183 |
+
|
184 |
+
## For vertical Videos
|
185 |
+
def for_yt_shorts(mono_audio_path, word_timestamps, min_silence_between_words=0.3, max_characters_per_subtitle=17):
    """Build short cues for vertical videos and write ``<stem>_shorts.srt``.

    Processing steps:

    - A word is merged with every directly following entry that starts with
      ``-``; the leading hyphens of the continuation are dropped
      (``"co"`` + ``"-worker"`` -> ``"coworker"``).
    - Merged words are appended to the current cue, separated by one space.
    - A new cue is started when appending would exceed
      ``max_characters_per_subtitle`` or when the gap between the current
      cue's end and the word's start exceeds ``min_silence_between_words``.
      A single word longer than the limit still gets a cue of its own.

    Surrounding whitespace is stripped from every word, and words that are
    empty after stripping and merging are ignored.

    Args:
        mono_audio_path: Path of the audio file; only used to derive the
            output path.
        word_timestamps: Sequence of dicts with ``'word'``, ``'start'`` and
            ``'end'`` keys; start/end are passed to ``format_srt_time``.
        min_silence_between_words: Gap between words that forces a new cue.
        max_characters_per_subtitle: Maximum cue length in characters,
            counting the separating spaces.

    Returns:
        A ``(output_file, subtitles)`` tuple: the path of the SRT file and the
        list of cue dicts with ``"text"``, ``"start"`` and ``"end"`` keys.
    """
    # Swap only the trailing extension. str.replace() rewrote every occurrence
    # of the extension text anywhere in the path and, for a path without an
    # extension, inserted the suffix between all characters.
    output_file = os.path.splitext(mono_audio_path)[0] + "_shorts.srt"

    subtitles = []
    current_sub = None  # cue being built, or None when no cue is open

    total = len(word_timestamps)
    i = 0
    while i < total:
        full_word = word_timestamps[i]['word'].strip()
        start_time = word_timestamps[i]['start']
        end_time = word_timestamps[i]['end']

        # Absorb hyphen continuations; the merged word spans from the first
        # piece's start to the last piece's end.
        while i + 1 < total:
            continuation = word_timestamps[i + 1]['word'].strip()
            if not continuation.startswith('-'):
                break
            full_word += continuation.lstrip('-')
            end_time = word_timestamps[i + 1]['end']
            i += 1
        i += 1

        if not full_word:
            continue

        if current_sub is None:
            current_sub = {"text": full_word, "start": start_time, "end": end_time}
            continue

        candidate = current_sub["text"] + " " + full_word
        too_long = len(candidate) > max_characters_per_subtitle
        # Measure the pause from the merged word's own start. The index has
        # already moved to the last continuation piece, whose start would
        # overstate the gap.
        has_pause = start_time - current_sub["end"] > min_silence_between_words

        if too_long or has_pause:
            subtitles.append(current_sub)
            current_sub = {"text": full_word, "start": start_time, "end": end_time}
        else:
            current_sub["text"] = candidate
            current_sub["end"] = end_time

    # Flush the cue that was still open when the input ended.
    if current_sub is not None:
        subtitles.append(current_sub)

    # Write SRT file
    with open(output_file, "w", encoding="utf-8") as f:
        for idx, sub in enumerate(subtitles, 1):
            f.write(f"{idx}\n")
            f.write(f"{format_srt_time(sub['start'])} --> {format_srt_time(sub['end'])}\n")
            f.write(f"{sub['text']}\n\n")

    return output_file, subtitles
|
277 |
+
|
278 |
+
|
279 |
+
|
280 |
+
## Save word-level timestamps as JSON for later use by developers
|
281 |
+
def word_timestamp_json(mono_audio_path, word_timestamps):
    """Save word timestamps as ``<stem>_word_timestamps.json`` next to the audio.

    Args:
        mono_audio_path: Path to the audio file (e.g., "audio.wav"); only used
            to derive the output path.
        word_timestamps: JSON-serializable list of word timestamp
            dictionaries.

    Returns:
        Path to the generated JSON file.
    """
    # Swap only the trailing extension. str.replace() rewrote every occurrence
    # of the extension text anywhere in the path and, for a path without an
    # extension, inserted the suffix between all characters.
    output_file = os.path.splitext(mono_audio_path)[0] + "_word_timestamps.json"

    # Pretty-printed, with non-ASCII characters written as-is rather than
    # escaped.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(word_timestamps, f, indent=2, ensure_ascii=False)

    return output_file
|
302 |
+
|
303 |
+
## save all files
|
304 |
+
def save_files(mono_audio_path, word_timestamps):
    """Write every subtitle/text export for one transcription.

    Calls, in order, ``generate_professional_subtitles``, ``write_word_srt``,
    ``for_yt_shorts``, ``write_words_to_txt`` and ``word_timestamp_json`` with
    the same audio path and word list.

    Args:
        mono_audio_path: Path of the audio file; forwarded unchanged to each
            writer.
        word_timestamps: List of dictionaries containing word-level timestamps
            [{'word': str, 'start': float, 'end': float}, ...]

    Returns:
        Six values, in this order:
            default_srt_path: sentence-level SRT (at most 8 words per cue)
            word_level_srt_path: SRT with one cue per word
            shorts_srt_path: SRT tuned for vertical videos (17 characters)
            speech_text_path: plain-text transcript file
            timestamps_json_path: JSON dump of the word timestamps
            text: the transcript as a string
    """
    # Sentence-level cues for traditional videos.
    sentence_result = generate_professional_subtitles(
        mono_audio_path,
        word_timestamps,
        max_words_per_subtitle=8,
        max_subtitle_duration=5.0,
        min_pause_for_split=0.5,
    )
    sentence_srt = sentence_result[0]

    # One cue per word.
    per_word_srt = write_word_srt(mono_audio_path, word_timestamps)

    # Short cues for Shorts/Reels-style vertical videos.
    shorts_result = for_yt_shorts(
        mono_audio_path,
        word_timestamps,
        min_silence_between_words=0.3,
        max_characters_per_subtitle=17,
    )
    shorts_srt = shorts_result[0]

    # Plain transcript: both the string and the file it was saved to.
    transcript, transcript_path = write_words_to_txt(mono_audio_path, word_timestamps)

    # Raw timestamps for programmatic use.
    json_path = word_timestamp_json(mono_audio_path, word_timestamps)

    return sentence_srt, per_word_srt, shorts_srt, transcript_path, json_path, transcript
|