Saiyaswanth007 committed
Commit 7208f76 · 1 Parent(s): b9dea2c

Fixing real-time audio

Files changed (1)
  1. app.py +183 -124
app.py CHANGED
@@ -11,6 +11,8 @@ import queue
 from collections import deque
 import asyncio
 from typing import Generator, Tuple, List, Optional
+import whisper
+from transformers import pipeline
 
 # Configuration parameters (keeping original models)
 FINAL_TRANSCRIPTION_MODEL = "distil-large-v3"
@@ -30,6 +32,7 @@ MIN_SEGMENT_DURATION = 1.0
 DEFAULT_MAX_SPEAKERS = 4
 ABSOLUTE_MAX_SPEAKERS = 10
 SAMPLE_RATE = 16000
+CHUNK_DURATION = 2.0  # Process audio in 2-second chunks
 
 # Speaker labels
 SPEAKER_LABELS = [f"Speaker {i+1}" for i in range(ABSOLUTE_MAX_SPEAKERS)]
@@ -220,35 +223,46 @@ class AudioProcessor:
 
 
 class RealTimeSpeakerDiarization:
-    """Main class for real-time speaker diarization"""
+    """Main class for real-time speaker diarization with FastRTC"""
     def __init__(self, change_threshold=DEFAULT_CHANGE_THRESHOLD, max_speakers=DEFAULT_MAX_SPEAKERS):
         self.encoder = None
         self.audio_processor = None
         self.speaker_detector = None
+        self.transcription_pipeline = None
         self.change_threshold = change_threshold
        self.max_speakers = max_speakers
         self.transcript_history = []
         self.is_initialized = False
 
-        # Threading components
-        self.audio_queue = queue.Queue()
-        self.processing_thread = None
-        self.running = False
+        # Audio processing
+        self.audio_buffer = deque(maxlen=int(SAMPLE_RATE * 10))  # 10 second buffer
+        self.processing_queue = queue.Queue()
+        self.last_processed_time = 0
+        self.current_transcript = ""
 
-    async def initialize(self):
+    def initialize(self):
         """Initialize the speaker diarization system"""
         if self.is_initialized:
             return True
 
         try:
             device_str = "cuda" if torch.cuda.is_available() else "cpu"
-            print(f"Initializing ECAPA-TDNN model on {device_str}...")
+            print(f"Initializing models on {device_str}...")
 
+            # Initialize speaker encoder
             self.encoder = SpeechBrainEncoder(device=device_str)
             success = self.encoder.load_model()
 
             if not success:
                 return False
+
+            # Initialize transcription pipeline
+            self.transcription_pipeline = pipeline(
+                "automatic-speech-recognition",
+                model=f"openai/whisper-{REALTIME_TRANSCRIPTION_MODEL}",
+                device=0 if torch.cuda.is_available() else -1,
+                return_timestamps=True
+            )
 
             self.audio_processor = AudioProcessor(self.encoder)
             self.speaker_detector = SpeakerChangeDetector(
@@ -274,31 +288,89 @@
         self.speaker_detector.set_change_threshold(change_threshold)
         self.speaker_detector.set_max_speakers(max_speakers)
 
-    def process_audio_segment(self, audio_data: np.ndarray, text: str) -> Tuple[int, str]:
-        """Process an audio segment and return speaker ID and formatted text"""
+    def process_audio_stream(self, audio_chunk, sample_rate):
+        """Process real-time audio stream from FastRTC"""
         if not self.is_initialized:
-            return 0, text
+            return self.get_current_transcript(), "System not initialized"
 
         try:
-            # Extract speaker embedding
-            embedding = self.audio_processor.extract_embedding(audio_data)
+            # Convert to numpy array if needed
+            if hasattr(audio_chunk, 'numpy'):
+                audio_data = audio_chunk.numpy()
+            else:
+                audio_data = np.array(audio_chunk)
+
+            # Handle different audio formats
+            if len(audio_data.shape) > 1:
+                audio_data = audio_data.mean(axis=1)  # Convert to mono
+
+            # Resample if needed
+            if sample_rate != SAMPLE_RATE:
+                audio_data = torchaudio.functional.resample(
+                    torch.tensor(audio_data), sample_rate, SAMPLE_RATE
+                ).numpy()
+
+            # Add to buffer
+            self.audio_buffer.extend(audio_data)
 
-            # Detect speaker
-            speaker_id, similarity = self.speaker_detector.add_embedding(embedding)
+            # Process if we have enough audio
+            current_time = time.time()
+            if (current_time - self.last_processed_time) >= CHUNK_DURATION:
+                self.process_buffered_audio()
+                self.last_processed_time = current_time
 
-            # Format text with speaker label
-            speaker_label = SPEAKER_LABELS[speaker_id]
-            formatted_text = f"{speaker_label}: {text}"
+            return self.get_current_transcript(), f"Processing... Buffer: {len(self.audio_buffer)} samples"
+
+        except Exception as e:
+            error_msg = f"Error processing audio stream: {str(e)}"
+            print(error_msg)
+            return self.get_current_transcript(), error_msg
+
+    def process_buffered_audio(self):
+        """Process buffered audio for transcription and speaker diarization"""
+        if len(self.audio_buffer) < int(SAMPLE_RATE * MIN_LENGTH_OF_RECORDING):
+            return
+
+        try:
+            # Get audio data from buffer
+            audio_data = np.array(list(self.audio_buffer))
 
-            return speaker_id, formatted_text
+            # Transcribe audio
+            if len(audio_data) > 0:
+                result = self.transcription_pipeline(
+                    audio_data,
+                    return_timestamps=True,
+                    generate_kwargs={"language": TRANSCRIPTION_LANGUAGE}
+                )
+
+                transcription = result["text"].strip()
+
+                if transcription and len(transcription) > 0:
+                    # Extract speaker embedding
+                    embedding = self.audio_processor.extract_embedding(audio_data)
+
+                    # Detect speaker
+                    speaker_id, similarity = self.speaker_detector.add_embedding(embedding)
+
+                    # Format text with speaker label
+                    speaker_label = SPEAKER_LABELS[speaker_id]
+                    formatted_text = f"{speaker_label}: {transcription}"
+
+                    # Add to transcript
+                    self.add_to_transcript(formatted_text)
+
+                    print(f"Transcribed: {formatted_text} (Similarity: {similarity:.3f})")
 
+            # Trim the buffer to prevent memory issues (keep the last 3 seconds)
+            if len(self.audio_buffer) > SAMPLE_RATE * 5:
+                self.audio_buffer = deque(list(self.audio_buffer)[-SAMPLE_RATE * 3:], maxlen=int(SAMPLE_RATE * 10))
+
         except Exception as e:
-            print(f"Error processing audio segment: {e}")
-            return 0, f"Speaker 1: {text}"
+            print(f"Error in process_buffered_audio: {e}")
 
-    def get_transcript_history(self):
-        """Get the formatted transcript history"""
-        return "\n".join(self.transcript_history)
+    def get_current_transcript(self):
+        """Get the current transcript"""
+        return "\n".join(self.transcript_history) if self.transcript_history else "Listening..."
 
     def add_to_transcript(self, formatted_text: str):
         """Add formatted text to transcript history"""
@@ -311,82 +383,74 @@
     def clear_transcript(self):
         """Clear transcript history and reset speaker detector"""
         self.transcript_history = []
+        self.audio_buffer.clear()
         if self.speaker_detector:
             self.speaker_detector = SpeakerChangeDetector(
                 embedding_dim=self.encoder.embedding_dim,
                 change_threshold=self.change_threshold,
                 max_speakers=self.max_speakers
             )
+
+    def get_status(self):
+        """Get current system status"""
+        if not self.is_initialized:
+            return "System not initialized"
+
+        if self.speaker_detector:
+            active_speakers = len(self.speaker_detector.active_speakers)
+            current_speaker = self.speaker_detector.current_speaker + 1
+            similarity = self.speaker_detector.last_similarity
+            return f"Active: {active_speakers} speakers | Current: Speaker {current_speaker} | Similarity: {similarity:.3f}"
+
+        return "Ready"
 
 
 # Global instance
 diarization_system = RealTimeSpeakerDiarization()
 
 
-async def initialize_system():
+def initialize_system():
     """Initialize the diarization system"""
-    success = await diarization_system.initialize()
+    success = diarization_system.initialize()
     if success:
         return "✅ Speaker diarization system initialized successfully!"
     else:
         return "❌ Failed to initialize speaker diarization system. Please check your setup."
 
 
-def process_audio_with_transcript(audio_data, sample_rate, transcription_text, change_threshold, max_speakers):
-    """Process audio with transcription for speaker diarization"""
+def process_realtime_audio(audio_stream, change_threshold, max_speakers):
+    """Process real-time audio stream from FastRTC"""
     if not diarization_system.is_initialized:
-        return "Please initialize the system first.", ""
+        return "Please initialize the system first.", "System not ready"
 
-    if audio_data is None or transcription_text.strip() == "":
-        return diarization_system.get_transcript_history(), ""
+    # Update settings
+    diarization_system.update_settings(change_threshold, max_speakers)
 
-    try:
-        # Update settings
-        diarization_system.update_settings(change_threshold, max_speakers)
-
-        # Convert audio to the right format
-        if len(audio_data.shape) > 1:
-            audio_data = audio_data.mean(axis=1)  # Convert to mono
-
-        # Resample if needed
-        if sample_rate != SAMPLE_RATE:
-            audio_data = torchaudio.functional.resample(
-                torch.tensor(audio_data), sample_rate, SAMPLE_RATE
-            ).numpy()
-
-        # Process the audio segment
-        speaker_id, formatted_text = diarization_system.process_audio_segment(audio_data, transcription_text)
-
-        # Add to transcript
-        diarization_system.add_to_transcript(formatted_text)
-
-        # Return updated transcript and current speaker info
-        transcript = diarization_system.get_transcript_history()
-        current_speaker_info = f"Current Speaker: {SPEAKER_LABELS[speaker_id]}"
-
-        return transcript, current_speaker_info
-
-    except Exception as e:
-        error_msg = f"Error processing audio: {str(e)}"
-        return diarization_system.get_transcript_history(), error_msg
+    if audio_stream is None:
+        return diarization_system.get_current_transcript(), diarization_system.get_status()
+
+    # Process the audio stream
+    transcript, status = diarization_system.process_audio_stream(audio_stream, SAMPLE_RATE)
+
+    return transcript, diarization_system.get_status()
 
 
 def clear_conversation():
     """Clear the conversation transcript"""
     diarization_system.clear_transcript()
-    return "", "Conversation cleared."
+    return "Conversation cleared. Listening...", "Ready"
 
 
 def create_gradio_interface():
-    """Create and return the Gradio interface"""
+    """Create and return the Gradio interface with FastRTC"""
     with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Soft()) as demo:
-        gr.Markdown("# 🎙️ Real-time Speaker Diarization with ASR")
-        gr.Markdown("Upload audio with transcription to perform real-time speaker diarization.")
+        gr.Markdown("# 🎙️ Real-time Speaker Diarization with FastRTC")
+        gr.Markdown("Speak into your microphone for real-time speaker diarization and transcription.")
 
         # Initialization section
         with gr.Row():
-            init_btn = gr.Button("🚀 Initialize System", variant="primary")
-            init_status = gr.Textbox(label="Initialization Status", interactive=False)
+            init_btn = gr.Button("🚀 Initialize System", variant="primary", scale=1)
+            init_status = gr.Textbox(label="System Status", interactive=False, scale=2)
 
         # Settings section
         with gr.Row():
@@ -409,35 +473,35 @@
                 info="Maximum number of speakers to detect"
             )
 
-        # Audio input and transcription
+        # FastRTC Audio Input
        with gr.Row():
             with gr.Column():
-                audio_input = gr.Audio(
-                    label="Audio Input",
-                    type="numpy",
-                    format="wav"
+                # FastRTC component for real-time audio
+                audio_input = gr.FastRTC(
+                    audio=True,
+                    video=False,
+                    label="🎤 Real-time Audio Input",
+                    audio_sample_rate=SAMPLE_RATE,
+                    audio_channels=1
                 )
-                transcription_input = gr.Textbox(
-                    label="Transcription Text",
-                    placeholder="Enter the transcription of the audio...",
-                    lines=3
-                )
-                process_btn = gr.Button("🎯 Process Audio", variant="secondary")
+
+                clear_btn = gr.Button("🗑️ Clear Conversation", variant="stop")
 
             with gr.Column():
-                current_speaker = gr.Textbox(
-                    label="Current Speaker",
-                    interactive=False
+                current_status = gr.Textbox(
+                    label="Current Status",
+                    interactive=False,
+                    value="Click Initialize to start"
                 )
-                clear_btn = gr.Button("🗑️ Clear Conversation", variant="stop")
 
         # Output section
         transcript_output = gr.Textbox(
-            label="Live Transcript with Speaker Labels",
+            label="🔴 Live Transcript with Speaker Labels",
             lines=15,
-            max_lines=20,
+            max_lines=25,
             interactive=False,
-            placeholder="Processed transcript will appear here..."
+            value="Click Initialize, then start speaking...",
+            autoscroll=True
        )
 
         # Event handlers
@@ -446,54 +510,49 @@ def create_gradio_interface():
             outputs=[init_status]
         )
 
-        process_btn.click(
-            fn=process_audio_with_transcript,
-            inputs=[
-                audio_input,
-                gr.Number(value=SAMPLE_RATE, visible=False),  # Hidden sample rate
-                transcription_input,
-                change_threshold,
-                max_speakers
-            ],
-            outputs=[transcript_output, current_speaker]
+        # FastRTC stream processing
+        audio_input.stream(
+            fn=process_realtime_audio,
+            inputs=[audio_input, change_threshold, max_speakers],
+            outputs=[transcript_output, current_status],
+            time_limit=30  # Process in 30-second chunks
        )
 
         clear_btn.click(
             fn=clear_conversation,
-            outputs=[transcript_output, current_speaker]
-        )
-
-        # Auto-process when audio and transcription are provided
-        audio_input.change(
-            fn=process_audio_with_transcript,
-            inputs=[
-                audio_input,
-                gr.Number(value=SAMPLE_RATE, visible=False),
-                transcription_input,
-                change_threshold,
-                max_speakers
-            ],
-            outputs=[transcript_output, current_speaker]
+            outputs=[transcript_output, current_status]
        )
 
         # Instructions
-        gr.Markdown("""
-        ## Instructions:
-        1. **Initialize**: Click "Initialize System" to load the speaker diarization models
-        2. **Upload Audio**: Upload an audio file (WAV format recommended)
-        3. **Add Transcription**: Enter the transcription text for the audio
-        4. **Adjust Settings**:
-           - **Speaker Change Threshold**: Lower values detect speaker changes more easily
-           - **Max Speakers**: Set the maximum number of speakers you expect
-        5. **Process**: Click "Process Audio" or the system will auto-process
-        6. **View Results**: See the transcript with speaker labels (Speaker 1, Speaker 2, etc.)
-
-        ## Tips:
-        - For similar-sounding speakers, increase the threshold (0.6-0.8)
-        - For different-sounding speakers, lower threshold works better (0.3-0.5)
-        - The system maintains speaker consistency across the conversation
-        - Use "Clear Conversation" to reset the speaker memory
-        """)
+        with gr.Accordion("📋 Instructions", open=False):
+            gr.Markdown("""
+            ## How to Use:
+
+            1. **Initialize**: Click "🚀 Initialize System" to load the AI models (this may take a moment)
+            2. **Allow Microphone**: Your browser will ask for microphone permission - please allow it
+            3. **Adjust Settings**:
+               - **Speaker Change Threshold**:
+                 - Lower (0.3-0.5) for speakers with different voices
+                 - Higher (0.6-0.8) for speakers with similar voices
+               - **Max Speakers**: Set expected number of speakers (2-10)
+            4. **Start Speaking**: The system will automatically transcribe and identify speakers
+            5. **View Results**: See real-time transcript with speaker labels (Speaker 1, Speaker 2, etc.)
+            6. **Clear**: Use "Clear Conversation" to reset and start fresh
+
+            ## Features:
+            - ✅ Real-time audio processing via FastRTC
+            - ✅ Automatic speech recognition with Whisper
+            - ✅ Speaker diarization with ECAPA-TDNN
+            - ✅ Live transcript with speaker labels
+            - ✅ Configurable sensitivity settings
+            - ✅ Support for up to 10 speakers
+
+            ## Tips:
+            - Speak clearly and allow brief pauses between speakers
+            - The system learns speaker characteristics over time
+            - Better results with distinct speaker voices
+            - Ensure good microphone quality for best performance
+            """)
 
     return demo
 
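A few notes on this change follow; the code sketches are illustrative, not part of the commit. First, the new imports route transcription through `transformers.pipeline`, so `import whisper` is never used after this change and could be dropped. Second, `f"openai/whisper-{REALTIME_TRANSCRIPTION_MODEL}"` only resolves to a real checkpoint if that constant (defined outside this diff) is a stock size such as `base` or `small`. A minimal sketch of the ASR path under that assumption:

```python
# Sketch only: "base" and "english" stand in for REALTIME_TRANSCRIPTION_MODEL
# and TRANSCRIPTION_LANGUAGE, which are defined elsewhere in app.py.
import numpy as np
import torch
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
    device=0 if torch.cuda.is_available() else -1,
    return_timestamps=True,
)

# A raw numpy array is interpreted as mono float32 at the model's rate (16 kHz).
audio = np.zeros(16000, dtype=np.float32)  # one second of silence
result = asr(audio, generate_kwargs={"language": "english"})
print(result["text"])
```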
 
 
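The `CHUNK_DURATION` gate in `process_audio_stream()` compares wall-clock time, not buffered samples, so the buffered path can be exercised without a microphone by feeding chunks with real pauses between them. A hypothetical smoke test, assuming the models load on your machine:

```python
import time
import numpy as np

SAMPLE_RATE = 16000

assert diarization_system.initialize()  # loads the encoder and the ASR pipeline

chunk = np.zeros(int(SAMPLE_RATE * 0.5), dtype=np.float32)  # 0.5 s of silence
for _ in range(6):
    transcript, status = diarization_system.process_audio_stream(chunk, SAMPLE_RATE)
    time.sleep(0.5)  # let the wall-clock CHUNK_DURATION gate elapse
print(status)  # e.g. "Processing... Buffer: 48000 samples"
```

Note that `process_realtime_audio()` discards the `status` returned by `process_audio_stream()` and reports `get_status()` instead, so stream errors surface only on stdout.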
 
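One flag for a follow-up commit: `gr.FastRTC` is not a component in the core `gradio` package, so `create_gradio_interface()` will raise `AttributeError` as written; WebRTC components for Gradio normally come from the separate `fastrtc` (formerly `gradio_webrtc`) library. A hedged sketch of equivalent wiring using Gradio's built-in streaming `gr.Audio` instead; note that a streaming `gr.Audio` delivers `(sample_rate, numpy_array)` tuples, so `process_realtime_audio` would need to unpack them rather than assume a bare array:

```python
# Sketch under the assumption that plain Gradio streaming is acceptable in
# place of FastRTC; slider defaults are illustrative.
import gradio as gr

with gr.Blocks(title="Real-time Speaker Diarization") as demo:
    change_threshold = gr.Slider(0.1, 0.95, value=0.65, step=0.05,
                                 label="Speaker Change Threshold")
    max_speakers = gr.Slider(2, 10, value=4, step=1, label="Max Speakers")
    audio_input = gr.Audio(sources=["microphone"], streaming=True,
                           label="🎤 Real-time Audio Input")
    transcript_output = gr.Textbox(lines=15, interactive=False,
                                   label="🔴 Live Transcript with Speaker Labels")
    current_status = gr.Textbox(interactive=False, label="Current Status")

    # Each streamed chunk arrives as (sample_rate, np.ndarray).
    audio_input.stream(
        fn=process_realtime_audio,  # from app.py; would need to unpack the tuple
        inputs=[audio_input, change_threshold, max_speakers],
        outputs=[transcript_output, current_status],
    )

demo.launch()
```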