import re
import subprocess
import tempfile
import os


# Filler words and phrases removed by clean_transcript, compiled once at import
# time as a single case-insensitive alternation. The \b anchors restrict each
# match to whole words, so "so" does not touch "sofa" and "um" does not touch
# "umbrella".
_FILLER_PATTERN = re.compile(
    r'\b(?:um|uh|like|you know|kind of|sort of|I mean|basically|actually'
    r'|so|anyway|just)\b',
    flags=re.IGNORECASE,
)


def clean_transcript(text):
    """Remove filler words from a transcript and normalize its whitespace.

    Every whole-word, case-insensitive occurrence of a filler word or phrase
    ("um", "uh", "like", "you know", "kind of", "sort of", "I mean",
    "basically", "actually", "so", "anyway", "just") is deleted. Runs of
    whitespace are then collapsed to a single space and the result is
    stripped of leading and trailing whitespace.

    Punctuation is not touched, so punctuation that surrounded a removed
    filler stays behind: ``"Um, yes"`` becomes ``", yes"``.

    Args:
        text: The raw transcript string.

    Returns:
        The transcript with filler words removed and whitespace normalized.
    """
    without_fillers = _FILLER_PATTERN.sub("", text)
    return re.sub(r'\s+', ' ', without_fillers).strip()

def consolidate_similar_items(items, threshold=0.6):
    """Drop items that are near-duplicates of an earlier, already-kept item.

    Items are compared as sets of lowercased, whitespace-separated words using
    Jaccard similarity (size of the intersection divided by size of the
    union). Items are processed in order: the first item is always kept, and
    each later item is kept only if its similarity to every previously kept
    item is at most ``threshold``.

    Args:
        items: Sequence of strings. A falsy value (``None``, empty) or a
            single-element sequence is returned unchanged.
        threshold: Similarity strictly above which an item counts as a
            duplicate. Defaults to 0.6.

    Returns:
        ``items`` itself when it has fewer than two elements; otherwise a new
        list holding the kept items in their original order.
    """
    if not items or len(items) <= 1:
        return items

    def word_set(item):
        return set(item.lower().split())

    def too_similar(words_a, words_b):
        union_size = len(words_a | words_b)
        # Two items with no words at all have an empty union; they are treated
        # as not similar, which also avoids dividing by zero.
        return union_size > 0 and len(words_a & words_b) / union_size > threshold

    kept = [items[0]]
    # Word sets of the kept items, cached so each item is tokenized only once.
    kept_word_sets = [word_set(items[0])]

    for item in items[1:]:
        words = word_set(item)
        if any(too_similar(words, seen) for seen in kept_word_sets):
            continue
        kept.append(item)
        kept_word_sets.append(words)

    return kept


def chunk_text(text: str, max_tokens: int = 800) -> list:
    """Split text into chunks of at most ``max_tokens`` words.

    A "token" here is a whitespace-separated word, as produced by
    ``str.split()``.

    Args:
        text: The text to split.
        max_tokens: Maximum number of words per chunk. Must be at least 1.

    Returns:
        A list of strings. If the text has no more than ``max_tokens`` words,
        the list holds the original ``text`` unchanged, with its whitespace
        preserved. Otherwise each chunk is up to ``max_tokens`` words joined
        by single spaces, so the original whitespace is not preserved.

    Raises:
        ValueError: If ``max_tokens`` is less than 1.
    """
    # A non-positive size used to either raise an opaque range() error (0) or
    # silently return an empty list and drop the text (negative).
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens!r}")

    words = text.split()
    if len(words) <= max_tokens:
        return [text]

    return [
        " ".join(words[start:start + max_tokens])
        for start in range(0, len(words), max_tokens)
    ]

    
def webm_to_wav(webm_bytes: bytes) -> str:
    """Convert WebM audio bytes to a WAV file using ffmpeg.

    The bytes are written to a temporary ``.webm`` file, which ffmpeg converts
    into a ``.wav`` file with the same base name in the same directory
    (``-ar 16000 -ac 1``: 16 kHz sample rate, one audio channel). The
    temporary WebM file is always deleted. The WAV file is left on disk, so
    the caller is responsible for removing it when done.

    Args:
        webm_bytes: Raw contents of a WebM audio file.

    Returns:
        Path to the generated WAV file.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".webm") as webm_file:
        webm_path = webm_file.name
        webm_file.write(webm_bytes)

    # Swap only the file extension. str.replace would also rewrite any
    # ".webm" that happens to appear in a directory name of the temp path.
    wav_path = os.path.splitext(webm_path)[0] + ".wav"

    try:
        subprocess.run(
            ["ffmpeg", "-y", "-i", webm_path, "-ar", "16000", "-ac", "1", wav_path],
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except BaseException:
        # Do not leave a partially written WAV behind when conversion fails;
        # cleanup is best-effort so it never hides the original error.
        try:
            os.remove(wav_path)
        except OSError:
            pass
        raise
    finally:
        os.remove(webm_path)

    return wav_path


def preprocess_transcript(transcript: str) -> str:
    """
    Normalize the whitespace of a transcript.

    Each run of whitespace (spaces, tabs, newlines) is collapsed to a single
    space, and leading and trailing whitespace is removed. No other cleaning
    is performed.

    Possible future extension: normalizing speaker labels, e.g. mapping
    "Speaker A:" or "Bob's update" to a canonical "Alice:" / "Bob:" form.
    """
    whitespace_run = re.compile(r'\s+')
    collapsed = whitespace_run.sub(' ', transcript)
    return collapsed.strip()