"""Utility helpers for transcript cleaning/chunking, WebM-to-WAV audio
conversion, language-code lookup, and localized UI label loading."""

import json
import os
import re
import subprocess
import tempfile
from functools import lru_cache

try:
    import pycountry
except ImportError:  # optional dependency; get_language_name degrades gracefully
    pycountry = None

# Filler-word patterns, compiled once at import time (case-insensitive).
# NOTE(review): removing common words such as "so", "just" and "like" can
# change meaning when they are not fillers — confirm this is acceptable.
_FILLER_PATTERNS = [
    re.compile(p, flags=re.IGNORECASE)
    for p in (
        r'\bum\b', r'\buh\b', r'\blike\b', r'\byou know\b', r'\bkind of\b',
        r'\bsort of\b', r'\bI mean\b', r'\bbasically\b', r'\bactually\b',
        r'\bso\b', r'\banyway\b', r'\bjust\b',
    )
]

_WHITESPACE_RE = re.compile(r'\s+')


def clean_transcript(text):
    """Clean the transcript by removing filler words and consolidating sentences.

    Each filler pattern is removed in turn, then runs of whitespace are
    collapsed to single spaces and the result is stripped.
    """
    for pattern in _FILLER_PATTERNS:
        text = pattern.sub("", text)
    return _WHITESPACE_RE.sub(" ", text).strip()


def consolidate_similar_items(items):
    """Consolidate similar items to reduce repetition.

    Keeps the first item, then drops any later item whose word set has a
    Jaccard similarity above 0.6 with an already-kept item.
    """
    if not items or len(items) <= 1:
        return items

    kept = [items[0]]
    # Cache each kept item's word set so it is not re-split per comparison.
    kept_word_sets = [set(items[0].lower().split())]

    for item in items[1:]:
        item_words = set(item.lower().split())
        too_similar = False
        for existing_words in kept_word_sets:
            union = len(item_words | existing_words)
            # 60% similarity threshold
            if union > 0 and len(item_words & existing_words) / union > 0.6:
                too_similar = True
                break
        if not too_similar:
            kept.append(item)
            kept_word_sets.append(item_words)
    return kept


def chunk_text(text, max_tokens=800):
    """Split *text* into chunks of at most *max_tokens* whitespace-separated words.

    Returns ``[text]`` unchanged (original whitespace preserved) when it
    already fits in a single chunk; otherwise each chunk is the words
    re-joined with single spaces.
    """
    words = text.split()
    if len(words) <= max_tokens:
        return [text]
    return [
        " ".join(words[i:i + max_tokens])
        for i in range(0, len(words), max_tokens)
    ]


def webm_to_wav(webm_bytes: bytes) -> str:
    """Convert WebM audio bytes to a 16 kHz mono WAV file and return its path.

    The caller owns (and must delete) the returned WAV file. The temporary
    ``.webm`` input file is always removed.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits non-zero; any partial
            ``.wav`` output is removed before the exception propagates.
        FileNotFoundError: if the ``ffmpeg`` executable is not installed.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".webm") as webm_file:
        webm_path = webm_file.name
        webm_file.write(webm_bytes)

    wav_path = os.path.splitext(webm_path)[0] + ".wav"
    try:
        subprocess.run(
            ["ffmpeg", "-y", "-i", webm_path, "-ar", "16000", "-ac", "1", wav_path],
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except Exception:
        # Don't leak a partial output file when the conversion fails.
        if os.path.exists(wav_path):
            os.remove(wav_path)
        raise
    finally:
        os.remove(webm_path)
    return wav_path


def preprocess_transcript(transcript: str) -> str:
    """
    Cleans and normalizes the transcript.
    - Removes extra whitespace.
    - Can be expanded to handle speaker diarization, e.g., "Speaker A:" -> "Alice:"
    """
    return _WHITESPACE_RE.sub(" ", transcript).strip()


@lru_cache(maxsize=None)
def get_language_name(language_code: str) -> str:
    """Converts a two-letter language code (e.g., 'es') to its full name (e.g., 'Spanish').

    Falls back to returning the code itself when the lookup fails or when
    pycountry is not installed.
    """
    if pycountry is None:
        return language_code
    try:
        lang = pycountry.languages.get(alpha_2=language_code)
        return lang.name if lang else language_code
    except Exception:
        return language_code


@lru_cache(maxsize=32)
def load_labels(language_code: str = "en") -> dict:
    """Load UI labels from ``locales/<code>.json``, falling back to English.

    Raises:
        FileNotFoundError: if neither the requested locale file nor
            ``locales/en.json`` exists.
    """
    filepath = f"locales/{language_code}.json"
    if not os.path.exists(filepath):
        filepath = "locales/en.json"
    with open(filepath, "r", encoding="utf-8") as f:
        return json.load(f)