Spaces:

NeuralFalcon
/

parakeet-tdt-0.6b-v2-subtitle

Running

App Files Files Community

NeuralFalcon committed on May 13

Commit

84c2692

verified ·

1 Parent(s): 1531726

Create utils.py

Browse files

Files changed (1) hide show

utils.py +358 -0

utils.py ADDED Viewed

	@@ -0,0 +1,358 @@

+import string
+import json
+import os
+import re
+import uuid
+from pydub import AudioSegment
# Ensure the 'subtitles' output directory exists. `exist_ok=True` already
# makes this a no-op when the directory is present, so no separate
# existence check is needed; if the path exists but is not a directory the
# call raises at import time.
os.makedirs("./subtitles", exist_ok=True)
def clean_file_name(file_path, unique_id=True):
    """Build a sanitized file name from the base name of *file_path*.

    Every run of characters other than ASCII letters and digits in the stem
    is collapsed to a single underscore, and leading/trailing underscores are
    removed. The original extension is kept untouched.

    Args:
        file_path: Path (or bare name) of the file to sanitize.
        unique_id: When true, append ``_`` plus six hex characters taken from
            a random UUID to the stem.

    Returns:
        The sanitized file name (no directory part), including the extension.
    """
    stem, extension = os.path.splitext(os.path.basename(file_path))
    # A single substitution suffices: each run of disallowed characters
    # (underscores included) becomes exactly one "_".
    stem = re.sub(r'[^a-zA-Z\d]+', '_', stem).strip('_')
    if unique_id:
        suffix = uuid.uuid4().hex[:6]
        return f"{stem}_{suffix}{extension}"
    return f"{stem}{extension}"
def convert_to_mono(file_path, output_format="mp3"):
    """Convert an audio file to a single channel and save it under ./subtitles.

    Args:
        file_path: Path to the input audio file, loaded with
            ``AudioSegment.from_file``.
        output_format: Format passed to ``export`` and used as the output
            file extension.

    Returns:
        Path of the exported mono file, ``./subtitles/<cleaned>.<output_format>``.
    """
    audio = AudioSegment.from_file(file_path)
    mono_audio = audio.set_channels(1)
    # Sanitize the complete base name first and drop the extension afterwards.
    # Stripping the extension before calling clean_file_name() made it run
    # splitext() a second time, so for a dotted stem such as
    # "my.song v2.mp3" the tail ".song v2" was treated as an extension and
    # escaped sanitization.
    cleaned_stem = os.path.splitext(clean_file_name(file_path))[0]
    output_file = f"./subtitles/{cleaned_stem}.{output_format}"
    mono_audio.export(output_file, format=output_format)
    return output_file
def format_srt_time(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is converted to whole milliseconds with rounding before being
    split into fields. Truncating the fractional part instead loses a
    millisecond to floating-point error (e.g. 2.3 s became ``,299``).
    Negative inputs are clamped to zero because SRT cannot express them.

    Args:
        seconds: Offset from the start of the media, in seconds.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"``.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_seconds, millisec = divmod(total_ms, 1000)
    total_minutes, sec = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{sec:02},{millisec:03}"
## Word Level SRT File
def write_word_srt(mono_audio_path, word_level_timestamps, skip_punctuation=True):
    """Write an SRT file with one cue per word.

    The output path is the audio path with its extension replaced by
    ``_word_level.srt``. The path is built from ``os.path.splitext`` rather
    than ``str.replace`` so that an extension-less path, or an extension
    string that also occurs in a directory name, cannot corrupt it.

    Args:
        mono_audio_path: Path of the audio file the timestamps belong to.
        word_level_timestamps: Iterable of dicts with ``"word"``, ``"start"``
            and ``"end"`` keys (start/end are passed to ``format_srt_time``).
        skip_punctuation: When true, entries whose word consists only of
            ``string.punctuation`` characters (or is empty) are omitted.

    Returns:
        Path of the written SRT file.
    """
    output_file = os.path.splitext(mono_audio_path)[0] + "_word_level.srt"
    with open(output_file, "w", encoding="utf-8") as f:
        index = 1  # cue numbers only advance for entries actually written
        for entry in word_level_timestamps:
            word = entry["word"]
            if skip_punctuation and all(c in string.punctuation for c in word):
                continue
            start_srt = format_srt_time(entry["start"])
            end_srt = format_srt_time(entry["end"])
            f.write(f"{index}\n{start_srt} --> {end_srt}\n{word}\n\n")
            index += 1
    return output_file
## Speech To text File
def write_words_to_txt(mono_audio_path, word_level_timestamps):
    """Write the plain-text transcript next to the audio file.

    Entries whose word consists only of ``string.punctuation`` characters
    (or is empty) are dropped; the remaining words are joined with single
    spaces. The output path is the audio path with its extension replaced by
    ``.txt``, built via ``os.path.splitext`` so that an empty extension or an
    extension string occurring elsewhere in the path cannot corrupt it.

    Args:
        mono_audio_path: Path of the audio file the timestamps belong to.
        word_level_timestamps: Iterable of dicts with a ``"word"`` key.

    Returns:
        A ``(text, output_file)`` tuple: the transcript string and the path
        of the written text file.
    """
    output_file = os.path.splitext(mono_audio_path)[0] + ".txt"
    text = " ".join(
        entry["word"]
        for entry in word_level_timestamps
        if not all(c in string.punctuation for c in entry["word"])
    )
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(text)
    return text, output_file
## Sentence Level Srt File
def generate_professional_subtitles(mono_audio_path, word_timestamps, max_words_per_subtitle=8, max_subtitle_duration=5.0, min_pause_for_split=0.5):
    """Group word timestamps into sentence-style cues and write them as SRT.

    A new cue is started *before* the incoming word when:
    - the gap since the previous word exceeds ``min_pause_for_split``,
    - the current cue already holds ``max_words_per_subtitle`` words, or
    - adding the word would stretch the cue past ``max_subtitle_duration``.

    A cue is closed *after* a word ending in ``.``, ``?`` or ``!`` so that
    the sentence-ending word stays with its own sentence instead of opening
    the next cue.

    The output path is the audio path with its extension replaced by
    ``.srt`` (built with ``os.path.splitext``).

    Args:
        mono_audio_path: Path of the audio file the timestamps belong to.
        word_timestamps: Iterable of dicts with ``"word"``, ``"start"`` and
            ``"end"`` keys.
        max_words_per_subtitle: Maximum number of words in one cue.
        max_subtitle_duration: Maximum cue length, in the same unit as the
            timestamps.
        min_pause_for_split: Minimum gap between words that forces a split.

    Returns:
        output_file: Path to the generated SRT file
        subtitles: List of subtitle dictionaries with text/start/end
    """
    output_file = os.path.splitext(mono_audio_path)[0] + ".srt"

    subtitles = []
    current_words = []
    current_start = None
    current_end = None

    def flush():
        # Close the cue under construction (if it has any text) and reset.
        nonlocal current_words, current_start, current_end
        text = " ".join(current_words).strip()
        if text:
            subtitles.append({
                "text": text,
                "start": current_start,
                "end": current_end
            })
        current_words, current_start, current_end = [], None, None

    for word_data in word_timestamps:
        word = word_data['word']
        word_start = word_data['start']
        word_end = word_data['end']

        if current_words:
            has_pause = word_start - current_end > min_pause_for_split
            is_full = len(current_words) >= max_words_per_subtitle
            too_long = (word_end - current_start) > max_subtitle_duration
            if has_pause or is_full or too_long:
                flush()

        if not current_words:
            current_start = word_start
        current_words.append(word)
        current_end = word_end

        # Sentence-ending punctuation closes the cue that contains it.
        if word.endswith(('.', '?', '!')):
            flush()

    # Emit whatever is left after the last word.
    flush()

    with open(output_file, "w", encoding="utf-8") as f:
        for i, sub in enumerate(subtitles, 1):
            f.write(f"{i}\n")
            f.write(f"{format_srt_time(sub['start'])} --> {format_srt_time(sub['end'])}\n")
            f.write(f"{sub['text']}\n\n")
    return output_file, subtitles
## For vertical Videos
def for_yt_shorts(mono_audio_path, word_timestamps, min_silence_between_words=0.3, max_characters_per_subtitle=17):
    """Build short, character-limited cues for vertical video and write SRT.

    Processing rules:
    - A token whose successor starts with ``-`` is merged with it: the
      successor's leading hyphens are stripped and the rest is appended
      (``"co-"`` + ``"-worker"`` -> ``"co-worker"``, ``"co"`` + ``"-worker"``
      -> ``"coworker"``). The merged word spans from the first fragment's
      start to the last fragment's end.
    - A new cue starts when adding the word (plus a separating space) would
      exceed ``max_characters_per_subtitle``, or when the gap between the
      previous cue end and the merged word's start exceeds
      ``min_silence_between_words``.
    - A single word longer than the limit still gets a cue of its own.

    The output path is the audio path with its extension replaced by
    ``_shorts.srt`` (built with ``os.path.splitext``).

    Args:
        mono_audio_path: Path of the audio file the timestamps belong to.
        word_timestamps: Sequence of dicts with ``"word"``, ``"start"`` and
            ``"end"`` keys.
        min_silence_between_words: Gap that forces a new cue.
        max_characters_per_subtitle: Maximum cue text length in characters.

    Returns:
        output_file: Path to generated SRT file
        subtitles: List of subtitle dictionaries (text/start/end)
    """
    subtitles = []
    current_sub = {
        "text": "",
        "start": None,
        "end": None,
        "char_count": 0
    }
    output_file = os.path.splitext(mono_audio_path)[0] + "_shorts.srt"
    i = 0
    total = len(word_timestamps)
    while i < total:
        full_word = word_timestamps[i]['word']
        start_time = word_timestamps[i]['start']
        end_time = word_timestamps[i]['end']
        # Absorb any following fragments that begin with a hyphen.
        while i + 1 < total and word_timestamps[i + 1]['word'].startswith('-'):
            full_word += word_timestamps[i + 1]['word'].lstrip('-')
            end_time = word_timestamps[i + 1]['end']
            i += 1
        # Length of the cue if this word were appended (space included).
        new_char_count = current_sub["char_count"] + len(full_word) + (1 if current_sub["text"] else 0)
        # The silence check must use the merged word's own start; after the
        # merge loop `i` points at the last fragment, whose start is later.
        needs_break = (
            new_char_count > max_characters_per_subtitle or
            (current_sub["end"] is not None and
             start_time - current_sub["end"] > min_silence_between_words)
        )
        if needs_break and current_sub["text"]:
            # Finalize the current cue and open a new one with this word.
            subtitles.append({
                "text": current_sub["text"].strip(),
                "start": current_sub["start"],
                "end": current_sub["end"]
            })
            current_sub = {
                "text": full_word,
                "start": start_time,
                "end": end_time,
                "char_count": len(full_word)
            }
        else:
            if current_sub["text"]:
                current_sub["text"] += " " + full_word
                current_sub["char_count"] += 1 + len(full_word)  # Space + word
            else:
                current_sub["text"] = full_word
                current_sub["start"] = start_time
                current_sub["char_count"] = len(full_word)
            current_sub["end"] = end_time
        i += 1
    # Emit the trailing cue, if any.
    if current_sub["text"]:
        subtitles.append({
            "text": current_sub["text"].strip(),
            "start": current_sub["start"],
            "end": current_sub["end"]
        })
    with open(output_file, "w", encoding="utf-8") as f:
        for idx, sub in enumerate(subtitles, 1):
            f.write(f"{idx}\n")
            f.write(f"{format_srt_time(sub['start'])} --> {format_srt_time(sub['end'])}\n")
            f.write(f"{sub['text']}\n\n")
    return output_file, subtitles
## Save word level timestamp for later use if you are a developer
def word_timestamp_json(mono_audio_path, word_timestamps):
    """Save word timestamps as a JSON file next to the audio file.

    The output path is the audio path with its extension replaced by
    ``_word_timestamps.json``. It is built with ``os.path.splitext`` so that
    an extension-less path, or an extension string that also appears in a
    directory name, cannot corrupt it.

    Args:
        mono_audio_path: Path to the audio file (e.g., "audio.wav")
        word_timestamps: JSON-serializable list of word timestamp dictionaries

    Returns:
        output_file: Path to the generated JSON file
    """
    output_file = os.path.splitext(mono_audio_path)[0] + "_word_timestamps.json"
    # Pretty-printed, and non-ASCII words are written as-is rather than escaped.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(word_timestamps, f, indent=2, ensure_ascii=False)
    return output_file
## save all files
def save_files(mono_audio_path, word_timestamps):
    """Write every subtitle/transcript artifact for one audio file.

    Runs, in order: the sentence-level SRT writer, the word-level SRT writer,
    the vertical-video SRT writer, the plain-text transcript writer and the
    JSON timestamp writer, all against the same audio path and timestamps.

    Args:
        mono_audio_path: Path to the source audio file; each writer derives
            its own output path from it.
        word_timestamps: List of dictionaries containing word-level timestamps
                        [{'word': str, 'start': float, 'end': float}, ...]

    Returns:
        A 6-tuple, in this order:
        sentence-level SRT path, word-level SRT path, vertical-video SRT
        path, transcript text file path, timestamp JSON path, and the
        transcript text itself.
    """
    # Sentence-level cues (the subtitle list itself is not needed here).
    sentence_srt, _ = generate_professional_subtitles(
        mono_audio_path,
        word_timestamps,
        max_words_per_subtitle=8,
        max_subtitle_duration=5.0,
        min_pause_for_split=0.5,
    )

    # One cue per word.
    per_word_srt = write_word_srt(mono_audio_path, word_timestamps)

    # Short, character-limited cues for vertical video.
    vertical_srt, _ = for_yt_shorts(
        mono_audio_path,
        word_timestamps,
        min_silence_between_words=0.3,
        max_characters_per_subtitle=17,
    )

    # Plain transcript: both the string and the file it was written to.
    transcript, transcript_path = write_words_to_txt(mono_audio_path, word_timestamps)

    # Raw timestamps as JSON.
    json_path = word_timestamp_json(mono_audio_path, word_timestamps)

    return (
        sentence_srt,
        per_word_srt,
        vertical_srt,
        transcript_path,
        json_path,
        transcript,
    )