Real_Time_diarization

Sleeping

File size: 17,046 Bytes

66992f6

import html
import os
import tempfile
import warnings

import gradio as gr
import numpy as np
import torch
import torchaudio
from scipy.spatial.distance import cosine
warnings.filterwarnings("ignore", category=UserWarning)

try:
    from transformers import pipeline
except ImportError:
    print("transformers not found. Install with: pip install transformers")

# Configuration
class Config:
    """Central place for the tunable constants used by the diarization code."""

    # --- Audio -----------------------------------------------------------
    # Sample rate (Hz) that audio is resampled to before processing.
    SAMPLE_RATE: int = 16_000

    # --- Speaker detection -----------------------------------------------
    # Cosine similarity below which a chunk is treated as a possible
    # speaker change.
    CHANGE_THRESHOLD: float = 0.65
    # Number of speaker slots the detector allocates.
    MAX_SPEAKERS: int = 4
    # Presumably a minimum segment length in seconds -- TODO confirm.
    MIN_SEGMENT_DURATION: float = 1.0
    # Presumably a smoothing-window length in embeddings -- TODO confirm.
    EMBEDDING_HISTORY_SIZE: int = 3
    # Maximum number of recent embeddings remembered per speaker.
    SPEAKER_MEMORY_SIZE: int = 20

# Hex colour palette used to tint each speaker's block in the HTML output.
# Entries are looked up by speaker id modulo the palette length, so speaker
# ids beyond the end of the list wrap around and reuse colours.
SPEAKER_COLORS = [
    "#FFD700",  # Gold
    "#FF6B6B",  # Red
    "#4ECDC4",  # Teal
    "#45B7D1",  # Blue
    "#96CEB4",  # Mint
    "#FFEAA7",  # Light Yellow
    "#DDA0DD",  # Plum
    "#98D8C8",  # Mint Green
]

class SpeakerEncoder:
    """Lightweight speaker encoder built on MFCC summary statistics.

    The "embedding" is the concatenation of the per-coefficient mean,
    standard deviation, maximum and minimum of the MFCCs over time.  With
    13 coefficients and mono input that is 52 values, which are zero-padded
    (or truncated) to ``embedding_dim`` entries.
    """

    def __init__(self, device="cpu"):
        """Create the encoder.

        Args:
            device: Torch device on which the MFCC transform and all
                intermediate tensors live.
        """
        self.device = device
        self.embedding_dim = 128
        self.model_loaded = False
        self._setup_model()

    def _setup_model(self):
        """Build the MFCC transform; on failure leave the encoder disabled."""
        try:
            self.mfcc_transform = torchaudio.transforms.MFCC(
                sample_rate=Config.SAMPLE_RATE,
                n_mfcc=13,
                melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": 23},
            ).to(self.device)
        except Exception as e:
            # Best effort: a broken audio backend must not crash the app.
            # extract_embedding() falls back to all-zero vectors instead.
            print(f"Error setting up encoder: {e}")
            self.model_loaded = False
        else:
            self.model_loaded = True
            print("Simple MFCC-based encoder initialized")

    def extract_embedding(self, audio):
        """Compute a fixed-size speaker embedding for one audio chunk.

        Args:
            audio: Waveform samples as a NumPy array, torch tensor or plain
                sequence.  A 1-D input is treated as a single channel.

        Returns:
            A NumPy array with ``embedding_dim`` entries.  An all-zero
            vector is returned when the encoder is not loaded or when
            feature extraction fails for any reason.
        """
        if not self.model_loaded:
            return np.zeros(self.embedding_dim)

        try:
            # Accept arrays, tensors and sequences alike, and keep the data
            # on the same device as the transform so non-CPU devices work.
            audio = torch.as_tensor(audio, dtype=torch.float32).to(self.device)

            # Peak-normalise; skipped for all-zero input to avoid 0/0.
            peak = audio.abs().max()
            if peak > 0:
                audio = audio / peak

            # The transform expects a leading channel/batch dimension.
            if audio.dim() == 1:
                audio = audio.unsqueeze(0)

            with torch.no_grad():
                mfcc = self.mfcc_transform(audio)
                # Summarise the time axis (dim 2) with simple statistics.
                embedding = torch.cat([
                    mfcc.mean(dim=2).flatten(),
                    mfcc.std(dim=2).flatten(),
                    mfcc.max(dim=2)[0].flatten(),
                    mfcc.min(dim=2)[0].flatten(),
                ])

                # Force a fixed length: truncate or zero-pad as needed.
                size = embedding.size(0)
                if size > self.embedding_dim:
                    embedding = embedding[:self.embedding_dim]
                elif size < self.embedding_dim:
                    # new_zeros() inherits dtype and device from `embedding`,
                    # so the concatenation cannot hit a device mismatch.
                    padding = embedding.new_zeros(self.embedding_dim - size)
                    embedding = torch.cat([embedding, padding])

            return embedding.cpu().numpy()

        except Exception as e:
            print(f"Error extracting embedding: {e}")
            return np.zeros(self.embedding_dim)

class SpeakerDetector:
    """Online speaker tracker driven by cosine similarity of embeddings.

    Each speaker slot keeps a bounded list of recent embeddings and their
    mean (the centroid).  An incoming embedding is compared with the
    centroid of the current speaker; if it is too dissimilar, the detector
    switches to another known speaker or opens a new slot.
    """

    def __init__(self, threshold=Config.CHANGE_THRESHOLD, max_speakers=Config.MAX_SPEAKERS):
        """Store the tuning parameters and start from a clean state.

        Args:
            threshold: Similarity below which a speaker change is considered.
            max_speakers: Number of speaker slots to allocate.
        """
        self.threshold = threshold
        self.max_speakers = max_speakers
        self.reset()

    def reset(self):
        """Forget all speakers and make speaker 0 the current one."""
        self.current_speaker = 0
        self.speaker_embeddings = [[] for _ in range(self.max_speakers)]
        self.speaker_centroids = [None] * self.max_speakers
        self.active_speakers = {0}

    def detect_speaker(self, embedding):
        """Assign ``embedding`` to a speaker and update that speaker's model.

        Args:
            embedding: NumPy vector describing the current audio chunk.

        Returns:
            ``(speaker_id, similarity)`` where ``similarity`` is the cosine
            similarity to the chosen speaker's centroid (``1.0`` for the
            very first embedding, ``0.0`` for a newly opened speaker).
        """
        # The very first embedding seeds speaker 0.
        if not self.speaker_embeddings[0]:
            self.speaker_embeddings[0].append(embedding)
            self.speaker_centroids[0] = embedding.copy()
            return 0, 1.0

        previous = self.current_speaker
        reference = self.speaker_centroids[previous]
        # scipy's cosine() is a distance; 1 - distance is the similarity.
        similarity = 0.0 if reference is None else 1.0 - cosine(embedding, reference)

        if similarity < self.threshold:
            # Look for another known speaker that clears the threshold and
            # beats the best score seen so far.
            candidate, candidate_score = previous, similarity
            for other in self.active_speakers:
                if other == previous:
                    continue
                other_centroid = self.speaker_centroids[other]
                if other_centroid is None:
                    continue
                score = 1.0 - cosine(embedding, other_centroid)
                if score > self.threshold and score > candidate_score:
                    candidate, candidate_score = other, score

            # Nobody matched: open the lowest free slot, if one is left.
            if candidate == previous and len(self.active_speakers) < self.max_speakers:
                free_slot = next(
                    (slot for slot in range(self.max_speakers)
                     if slot not in self.active_speakers),
                    None,
                )
                if free_slot is not None:
                    self.active_speakers.add(free_slot)
                    candidate, candidate_score = free_slot, 0.0

            if candidate != previous:
                self.current_speaker = candidate
                similarity = candidate_score

        # Whoever ended up current absorbs this embedding.
        self._update_speaker_model(self.current_speaker, embedding)
        return self.current_speaker, similarity

    def _update_speaker_model(self, speaker_id, embedding):
        """Append ``embedding`` to the speaker's history and refresh its centroid."""
        history = self.speaker_embeddings[speaker_id]
        history.append(embedding)

        # Cap the history at the most recent SPEAKER_MEMORY_SIZE entries.
        if len(history) > Config.SPEAKER_MEMORY_SIZE:
            history = history[-Config.SPEAKER_MEMORY_SIZE:]
            self.speaker_embeddings[speaker_id] = history

        if history:
            self.speaker_centroids[speaker_id] = np.mean(history, axis=0)

class AudioProcessor:
    """Runs speaker detection and Whisper transcription over an audio file."""

    def __init__(self):
        """Create the encoder and detector, and try to load the Whisper pipeline.

        If the pipeline cannot be created, ``self.transcriber`` is set to
        ``None`` and transcription is skipped.
        """
        self.encoder = SpeakerEncoder()
        self.detector = SpeakerDetector()

        # Initialize Whisper model for transcription
        try:
            self.transcriber = pipeline(
                "automatic-speech-recognition",
                model="openai/whisper-base",
                chunk_length_s=30,
                device=0 if torch.cuda.is_available() else -1,
            )
            print("Whisper model loaded successfully")
        except Exception as e:
            print(f"Error loading Whisper model: {e}")
            self.transcriber = None

    def process_audio_file(self, audio_file):
        """Diarize and transcribe one audio file.

        The file is transcribed once as a whole, then cut into 3-second
        windows with a 1.5-second hop.  Each window of at least one second
        gets a speaker label and its own transcription; because the windows
        overlap by half, adjacent results cover overlapping audio.

        Args:
            audio_file: Path of the audio file, or ``None``.

        Returns:
            ``(diarization_html, full_transcription)``.  On failure the
            first element is an error message and the second is ``""``.
        """
        if audio_file is None:
            return "Please upload an audio file.", ""

        try:
            # Speaker state must not carry over from a previous file.
            self.detector.reset()

            audio_data = self._load_mono_audio(audio_file)

            if self.transcriber:
                full_transcription = self.transcriber(audio_file)['text']
            else:
                full_transcription = "Transcription service unavailable"

            chunk_samples = int(3.0 * Config.SAMPLE_RATE)  # 3 second windows
            hop_samples = chunk_samples // 2               # 50% overlap
            min_samples = Config.SAMPLE_RATE               # skip < 1 second
            results = []

            for start in range(0, len(audio_data), hop_samples):
                chunk = audio_data[start:start + chunk_samples]
                if len(chunk) < min_samples:
                    continue

                embedding = self.encoder.extract_embedding(chunk)
                speaker_id, similarity = self.detector.detect_speaker(embedding)

                # Every chunk that was diarized is also transcribed, so the
                # two length checks can no longer disagree at exactly 1 s.
                chunk_text = self._transcribe_chunk(chunk)
                if not chunk_text:
                    continue

                results.append({
                    'speaker_id': speaker_id,
                    'start_time': start / Config.SAMPLE_RATE,
                    'end_time': (start + len(chunk)) / Config.SAMPLE_RATE,
                    'text': chunk_text,
                    'similarity': similarity,
                })

            return self._format_results(results), full_transcription

        except Exception as e:
            # Boundary for the UI: report the failure instead of raising.
            return f"Error processing audio: {str(e)}", ""

    def _load_mono_audio(self, audio_file):
        """Load ``audio_file`` as a 1-D NumPy array at ``Config.SAMPLE_RATE``."""
        waveform, sample_rate = torchaudio.load(audio_file)

        # Down-mix multi-channel audio by averaging the channels.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        if sample_rate != Config.SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(sample_rate, Config.SAMPLE_RATE)
            waveform = resampler(waveform)

        # Drop only the channel axis so a one-sample file stays 1-D.
        return waveform.squeeze(0).numpy()

    def _transcribe_chunk(self, chunk):
        """Transcribe one chunk via a temporary WAV file; ``""`` if unavailable."""
        if not self.transcriber:
            return ""

        # Reserve a path, then close the handle before writing by name:
        # an open NamedTemporaryFile cannot be reopened on Windows.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
            tmp_path = tmp_file.name
        try:
            torchaudio.save(tmp_path, torch.as_tensor(chunk).unsqueeze(0), Config.SAMPLE_RATE)
            return self.transcriber(tmp_path)['text'].strip()
        finally:
            # Remove the file even when saving or transcription raised.
            os.unlink(tmp_path)

    def _format_results(self, results):
        """Render diarization results as an HTML fragment.

        Args:
            results: Dicts with ``speaker_id``, ``start_time``, ``end_time``
                and ``text`` keys, as built by ``process_audio_file``.

        Returns:
            An HTML string with one coloured block per result, or a plain
            message when ``results`` is empty.
        """
        if not results:
            return "No speech detected in the audio file."

        # The fragment is shown in an HTML component, where Markdown
        # emphasis would appear as literal asterisks.
        formatted_lines = ["<h3>🎤 Speaker Diarization Results</h3>"]

        for result in results:
            speaker_id = result['speaker_id']
            color = SPEAKER_COLORS[speaker_id % len(SPEAKER_COLORS)]

            start_min, start_sec = divmod(int(result['start_time']), 60)
            end_min, end_sec = divmod(int(result['end_time']), 60)
            timestamp = f"[{start_min:02d}:{start_sec:02d} - {end_min:02d}:{end_sec:02d}]"

            # The transcript comes from user-supplied audio; escape it so
            # characters such as "<" or "&" cannot break the markup.
            text = html.escape(result['text'])

            formatted_lines.append(
                f'<div style="margin-bottom: 10px; padding: 8px; border-left: 4px solid {color}; background-color: {color}20;">'
                f'<strong style="color: {color};">Speaker {speaker_id + 1}</strong> '
                f'<span style="color: #666; font-size: 0.9em;">{timestamp}</span><br>'
                f'<span style="color: #333;">{text}</span>'
                f'</div>'
            )

        return "".join(formatted_lines)

# Single module-level processor shared by every request handled by this
# module.  Constructing it at import time builds the speaker encoder and
# detector and attempts to load the Whisper pipeline (see
# AudioProcessor.__init__), so importing this module performs that work.
processor = AudioProcessor()

def process_audio(audio_file, sensitivity):
    """Gradio callback: diarize and transcribe the uploaded file.

    Args:
        audio_file: Path of the uploaded audio, or ``None`` if nothing
            has been provided.
        sensitivity: Value written to the shared detector's ``threshold``
            before the file is processed.

    Returns:
        ``(diarized_output, full_transcription)`` as produced by the
        module-level processor, or a prompt message and an empty string
        when no file was given.
    """
    if audio_file is not None:
        # Apply the slider value to the shared detector, then run the file.
        processor.detector.threshold = sensitivity
        diarized_html, transcript = processor.process_audio_file(audio_file)
        return diarized_html, transcript

    return "Please upload an audio file.", ""

# Create Gradio interface
def create_interface():
    """Build the Gradio Blocks UI for the diarization demo.

    The layout has an input column (audio upload, sensitivity slider,
    process button, instructions) and an output column with two tabs: the
    colour-coded diarization HTML and the full transcription text.  Both
    the button click and a change of the audio input run ``process_audio``.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """

    with gr.Blocks(
        theme=gr.themes.Soft(),
        title="Speaker Diarization & Transcription",
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        .speaker-output {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
        }
        """
    ) as demo:

        gr.Markdown(
            """
            # 🎙️ Speaker Diarization & Transcription
            
            Upload an audio file to automatically detect different speakers and transcribe their speech.
            The system will identify speaker changes and display each speaker's text in different colors.
            """
        )

        with gr.Row():
            with gr.Column(scale=1):
                audio_input = gr.Audio(
                    label="Upload Audio File",
                    type="filepath",
                    sources=["upload", "microphone"]
                )

                # The slider value becomes the detector threshold, and a
                # change is flagged when similarity drops BELOW it, so a
                # higher value flags more changes.  The default mirrors the
                # detector's own default.
                sensitivity_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=Config.CHANGE_THRESHOLD,
                    step=0.05,
                    label="Speaker Change Sensitivity",
                    info="Higher values = more sensitive to speaker changes"
                )

                process_btn = gr.Button("🎯 Process Audio", variant="primary", size="lg")

                gr.Markdown(
                    """
                    ### Instructions:
                    1. Upload an audio file (WAV, MP3, etc.)
                    2. Adjust sensitivity if needed
                    3. Click "Process Audio"
                    4. View results with speaker colors
                    
                    ### Tips:
                    - Works best with clear speech
                    - Supports multiple file formats
                    - Different speakers shown in different colors
                    - Processing may take a moment for longer files
                    """
                )

            with gr.Column(scale=2):
                with gr.Tabs():
                    with gr.TabItem("🎨 Speaker Diarization"):
                        diarized_output = gr.HTML(
                            label="Speaker Diarization Results",
                            elem_classes=["speaker-output"]
                        )

                    with gr.TabItem("📝 Full Transcription"):
                        full_transcription = gr.Textbox(
                            label="Complete Transcription",
                            lines=15,
                            max_lines=20,
                            show_copy_button=True
                        )

        # Both triggers run the same callback with the same wiring.
        handler_options = {
            "fn": process_audio,
            "inputs": [audio_input, sensitivity_slider],
            "outputs": [diarized_output, full_transcription],
            "show_progress": True,
        }

        # Explicit run via the button.
        process_btn.click(**handler_options)

        # Auto-process when audio is uploaded
        audio_input.change(**handler_options)

        gr.Markdown(
            """
            ---
            ### About
            This application uses:
            - **MFCC features** for speaker embedding extraction
            - **Cosine similarity** for speaker change detection
            - **OpenAI Whisper** for speech-to-text transcription
            - **Gradio** for the web interface
            
            **Note**: This is a simplified speaker diarization system. For production use, 
            consider more advanced speaker embedding models like speechbrain or pyannote.audio.
            """
        )

    return demo

# Create and launch the interface
if __name__ == "__main__":
    # Options for the local server: listen on all network interfaces on
    # port 7860, no public share link, and surface errors in the UI.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }

    demo = create_interface()
    demo.launch(**launch_options)