"""Helpers for cleaning transcripts, chunking text and converting audio."""

import re
import subprocess
import tempfile
import os

# Filler words/phrases stripped by clean_transcript. Each entry is matched as
# a whole word (or whole phrase with single spaces), case-insensitively.
_FILLER_PATTERN = re.compile(
    r'\b(?:um|uh|like|you know|kind of|sort of|I mean|basically|actually'
    r'|so|anyway|just)\b',
    flags=re.IGNORECASE,
)

# Any run of whitespace (spaces, tabs, newlines).
_WHITESPACE_PATTERN = re.compile(r'\s+')


def clean_transcript(text):
    """Remove filler words from *text* and collapse whitespace.

    Every whole-word occurrence of a filler is removed regardless of its
    grammatical role (e.g. "like" is dropped even when used as a verb), and
    punctuation that surrounded a removed filler is left in place.

    Args:
        text: The raw transcript string.

    Returns:
        The transcript with fillers removed, whitespace runs collapsed to a
        single space, and leading/trailing whitespace stripped.
    """
    without_fillers = _FILLER_PATTERN.sub("", text)
    return _WHITESPACE_PATTERN.sub(" ", without_fillers).strip()


def consolidate_similar_items(items, threshold=0.6):
    """Drop items that are near-duplicates of an earlier item.

    Similarity is the Jaccard index of the items' lower-cased,
    whitespace-separated word sets. Items are scanned in order; an item is
    kept only if its similarity to every previously kept item is at most
    *threshold*. Items with no words at all (empty strings) are never
    treated as similar to each other.

    Args:
        items: A sequence of strings. Falsy inputs and single-element
            sequences are returned unchanged.
        threshold: Similarity strictly above this value marks an item as a
            duplicate. Defaults to 0.6 (60% word overlap).

    Returns:
        A new list with the first occurrence of each group of similar items,
        in original order (or *items* itself when it has at most one entry).
    """
    if not items or len(items) <= 1:
        return items

    kept = [items[0]]
    # Word sets of the kept items, cached so they are built only once.
    kept_word_sets = [set(items[0].lower().split())]

    for item in items[1:]:
        words = set(item.lower().split())
        if any(_jaccard(words, other) > threshold for other in kept_word_sets):
            continue
        kept.append(item)
        kept_word_sets.append(words)

    return kept


def _jaccard(first, second):
    """Return the Jaccard index of two sets (0.0 when both are empty)."""
    union_size = len(first | second)
    if union_size == 0:
        return 0.0
    return len(first & second) / union_size


def chunk_text(text, max_tokens=800):
    """Split *text* into chunks of at most *max_tokens* words.

    "Tokens" here are whitespace-separated words, not model tokens.

    Args:
        text: The text to split.
        max_tokens: Maximum number of words per chunk; must be >= 1.

    Returns:
        A list of chunks. If the text already fits, the list holds the
        original text unchanged; otherwise each chunk is its words re-joined
        with single spaces.

    Raises:
        ValueError: If *max_tokens* is less than 1.
    """
    if max_tokens < 1:
        # A zero step makes range() fail and a negative one would silently
        # yield no chunks, losing the text; reject both up front.
        raise ValueError("max_tokens must be a positive integer")

    words = text.split()
    if len(words) <= max_tokens:
        return [text]

    return [
        " ".join(words[start:start + max_tokens])
        for start in range(0, len(words), max_tokens)
    ]


def webm_to_wav(webm_bytes: bytes) -> str:
    """Convert webm audio bytes to a 16 kHz mono wav file using ffmpeg.

    The bytes are written to a temporary ``.webm`` file, which is always
    deleted afterwards. The caller owns the returned wav file and is
    responsible for deleting it.

    Args:
        webm_bytes: The raw webm audio data.

    Returns:
        The path of the generated wav file.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero
            status; its ``stderr`` attribute holds ffmpeg's error output.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".webm") as webm_file:
        webm_path = webm_file.name
        webm_file.write(webm_bytes)
    # The file is closed (and therefore flushed) before ffmpeg reads it.

    # Swap only the extension; str.replace would also rewrite any ".webm"
    # appearing elsewhere in the path (e.g. in the temp directory name).
    wav_path = os.path.splitext(webm_path)[0] + ".wav"

    converted = False
    try:
        subprocess.run(
            [
                "ffmpeg", "-y", "-i", webm_path,
                "-ar", "16000", "-ac", "1", wav_path,
            ],
            check=True,
            stdout=subprocess.DEVNULL,
            # Captured so a failure carries ffmpeg's diagnostics.
            stderr=subprocess.PIPE,
        )
        converted = True
    finally:
        if not converted:
            # Do not leave a partial output file behind on failure.
            try:
                os.remove(wav_path)
            except FileNotFoundError:
                pass
        os.remove(webm_path)

    return wav_path


def preprocess_transcript(transcript: str) -> str:
    """Normalize whitespace in a transcript.

    Collapses every run of whitespace to a single space and strips the ends.
    Speaker-name normalization (e.g. mapping "Bob's update" to "Bob:") is a
    possible future extension and is not performed here.

    Args:
        transcript: The raw transcript string.

    Returns:
        The whitespace-normalized transcript.
    """
    return _WHITESPACE_PATTERN.sub(" ", transcript).strip()