Real_Time_diarization

Sleeping

App Files Files Community

Saiyaswanth007 committed on May 23

Commit

fd289b1

1 Parent(s): 7208f76

Fixing Real-time

Browse files

Files changed (1) hide show

app.py +324 -293

app.py CHANGED Viewed

@@ -5,16 +5,13 @@ import torchaudio
 import time
 import os
 import urllib.request
-from scipy.spatial.distance import cosine
-import threading
 import queue
-from collections import deque
-import asyncio
-from typing import Generator, Tuple, List, Optional
-import whisper
-from transformers import pipeline
-# Configuration parameters (keeping original models)
 FINAL_TRANSCRIPTION_MODEL = "distil-large-v3"
 FINAL_BEAM_SIZE = 5
 REALTIME_TRANSCRIPTION_MODEL = "distil-small.en"
@@ -31,11 +28,24 @@ EMBEDDING_HISTORY_SIZE = 5
 MIN_SEGMENT_DURATION = 1.0
 DEFAULT_MAX_SPEAKERS = 4
 ABSOLUTE_MAX_SPEAKERS = 10
 SAMPLE_RATE = 16000
-CHUNK_DURATION = 2.0  # Process audio in 2-second chunks
-# Speaker labels
-SPEAKER_LABELS = [f"Speaker {i+1}" for i in range(ABSOLUTE_MAX_SPEAKERS)]
 class SpeechBrainEncoder:
     """ECAPA-TDNN encoder from SpeechBrain for speaker embeddings"""
@@ -47,11 +57,24 @@ class SpeechBrainEncoder:
         self.cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "speechbrain")
         os.makedirs(self.cache_dir, exist_ok=True)
     def load_model(self):
         """Load the ECAPA-TDNN model"""
         try:
             from speechbrain.pretrained import EncoderClassifier
             self.model = EncoderClassifier.from_hparams(
                 source="speechbrain/spkrec-ecapa-voxceleb",
                 savedir=self.cache_dir,
@@ -87,8 +110,28 @@ class SpeechBrainEncoder:
             return np.zeros(self.embedding_dim)
 class SpeakerChangeDetector:
-    """Speaker change detector that supports configurable number of speakers"""
     def __init__(self, embedding_dim=192, change_threshold=DEFAULT_CHANGE_THRESHOLD, max_speakers=DEFAULT_MAX_SPEAKERS):
         self.embedding_dim = embedding_dim
         self.change_threshold = change_threshold
@@ -196,373 +239,361 @@ class SpeakerChangeDetector:
             )
         return self.current_speaker, similarity
-class AudioProcessor:
-    """Processes audio data to extract speaker embeddings"""
-    def __init__(self, encoder):
-        self.encoder = encoder
-    def extract_embedding(self, audio_data):
-        try:
-            # Ensure audio is float32 and normalized
-            if audio_data.dtype != np.float32:
-                audio_data = audio_data.astype(np.float32)
-            # Normalize if needed
-            if np.abs(audio_data).max() > 1.0:
-                audio_data = audio_data / np.abs(audio_data).max()
-            # Extract embedding using the loaded encoder
-            embedding = self.encoder.embed_utterance(audio_data)
-            return embedding
-        except Exception as e:
-            print(f"Embedding extraction error: {e}")
-            return np.zeros(self.encoder.embedding_dim)
-class RealTimeSpeakerDiarization:
-    """Main class for real-time speaker diarization with FastRTC"""
-    def __init__(self, change_threshold=DEFAULT_CHANGE_THRESHOLD, max_speakers=DEFAULT_MAX_SPEAKERS):
         self.encoder = None
         self.audio_processor = None
         self.speaker_detector = None
-        self.transcription_pipeline = None
-        self.change_threshold = change_threshold
-        self.max_speakers = max_speakers
-        self.transcript_history = []
-        self.is_initialized = False
-        # Audio processing
-        self.audio_buffer = deque(maxlen=int(SAMPLE_RATE * 10))  # 10 second buffer
-        self.processing_queue = queue.Queue()
-        self.last_processed_time = 0
-        self.current_transcript = ""
-    def initialize(self):
-        """Initialize the speaker diarization system"""
-        if self.is_initialized:
-            return True
         try:
             device_str = "cuda" if torch.cuda.is_available() else "cpu"
-            print(f"Initializing models on {device_str}...")
-            # Initialize speaker encoder
             self.encoder = SpeechBrainEncoder(device=device_str)
             success = self.encoder.load_model()
-            if not success:
-                return False
-            # Initialize transcription pipeline
-            self.transcription_pipeline = pipeline(
-                "automatic-speech-recognition",
-                model=f"openai/whisper-{REALTIME_TRANSCRIPTION_MODEL}",
-                device=0 if torch.cuda.is_available() else -1,
-                return_timestamps=True
-            )
-            self.audio_processor = AudioProcessor(self.encoder)
-            self.speaker_detector = SpeakerChangeDetector(
-                embedding_dim=self.encoder.embedding_dim,
-                change_threshold=self.change_threshold,
-                max_speakers=self.max_speakers
-            )
-            self.is_initialized = True
-            print("Speaker diarization system initialized successfully!")
             return True
         except Exception as e:
-            print(f"Initialization error: {e}")
             return False
-    def update_settings(self, change_threshold, max_speakers):
-        """Update diarization settings"""
-        self.change_threshold = change_threshold
-        self.max_speakers = max_speakers
-        if self.speaker_detector:
-            self.speaker_detector.set_change_threshold(change_threshold)
-            self.speaker_detector.set_max_speakers(max_speakers)
-    def process_audio_stream(self, audio_chunk, sample_rate):
-        """Process real-time audio stream from FastRTC"""
-        if not self.is_initialized:
-            return self.get_current_transcript(), "System not initialized"
         try:
-            # Convert to numpy array if needed
-            if hasattr(audio_chunk, 'numpy'):
-                audio_data = audio_chunk.numpy()
             else:
-                audio_data = np.array(audio_chunk)
-            # Handle different audio formats
-            if len(audio_data.shape) > 1:
-                audio_data = audio_data.mean(axis=1)  # Convert to mono
-            # Resample if needed
-            if sample_rate != SAMPLE_RATE:
-                audio_data = torchaudio.functional.resample(
-                    torch.tensor(audio_data), sample_rate, SAMPLE_RATE
-                ).numpy()
-            # Add to buffer
-            self.audio_buffer.extend(audio_data)
-            # Process if we have enough audio
-            current_time = time.time()
-            if (current_time - self.last_processed_time) >= CHUNK_DURATION:
-                self.process_buffered_audio()
-                self.last_processed_time = current_time
-            return self.get_current_transcript(), f"Processing... Buffer: {len(self.audio_buffer)} samples"
         except Exception as e:
-            error_msg = f"Error processing audio stream: {str(e)}"
-            print(error_msg)
-            return self.get_current_transcript(), error_msg
-    def process_buffered_audio(self):
-        """Process buffered audio for transcription and speaker diarization"""
-        if len(self.audio_buffer) < int(SAMPLE_RATE * MIN_LENGTH_OF_RECORDING):
-            return
         try:
-            # Get audio data from buffer
-            audio_data = np.array(list(self.audio_buffer))
-            # Transcribe audio
-            if len(audio_data) > 0:
-                result = self.transcription_pipeline(
-                    audio_data,
-                    return_timestamps=True,
-                    generate_kwargs={"language": TRANSCRIPTION_LANGUAGE}
-                )
-                transcription = result["text"].strip()
-                if transcription and len(transcription) > 0:
-                    # Extract speaker embedding
-                    embedding = self.audio_processor.extract_embedding(audio_data)
-                    # Detect speaker
-                    speaker_id, similarity = self.speaker_detector.add_embedding(embedding)
-                    # Format text with speaker label
-                    speaker_label = SPEAKER_LABELS[speaker_id]
-                    formatted_text = f"{speaker_label}: {transcription}"
-                    # Add to transcript
-                    self.add_to_transcript(formatted_text)
-                    print(f"Transcribed: {formatted_text} (Similarity: {similarity:.3f})")
-            # Clear part of the buffer to prevent memory issues
-            if len(self.audio_buffer) > SAMPLE_RATE * 5:  # Keep last 5 seconds
-                self.audio_buffer = deque(list(self.audio_buffer)[-SAMPLE_RATE * 3:], maxlen=int(SAMPLE_RATE * 10))
         except Exception as e:
-            print(f"Error in process_buffered_audio: {e}")
-    def get_current_transcript(self):
-        """Get the current transcript"""
-        return "\n".join(self.transcript_history) if self.transcript_history else "Listening..."
-    def add_to_transcript(self, formatted_text: str):
-        """Add formatted text to transcript history"""
-        self.transcript_history.append(formatted_text)
-        # Keep only last 50 entries to prevent memory issues
-        if len(self.transcript_history) > 50:
-            self.transcript_history = self.transcript_history[-50:]
     def clear_transcript(self):
-        """Clear transcript history and reset speaker detector"""
-        self.transcript_history = []
-        self.audio_buffer.clear()
         if self.speaker_detector:
             self.speaker_detector = SpeakerChangeDetector(
                 embedding_dim=self.encoder.embedding_dim,
                 change_threshold=self.change_threshold,
                 max_speakers=self.max_speakers
             )
-    def get_status(self):
-        """Get current system status"""
-        if not self.is_initialized:
-            return "System not initialized"
-        if self.speaker_detector:
-            active_speakers = len(self.speaker_detector.active_speakers)
-            current_speaker = self.speaker_detector.current_speaker + 1
-            similarity = self.speaker_detector.last_similarity
-            return f"Active: {active_speakers} speakers | Current: Speaker {current_speaker} | Similarity: {similarity:.3f}"
-        return "Ready"
 # Global instance
-diarization_system = RealTimeSpeakerDiarization()
-def initialize_system():
-    """Initialize the diarization system"""
-    success = diarization_system.initialize()
-    if success:
-        return "✅ Speaker diarization system initialized successfully!"
-    else:
-        return "❌ Failed to initialize speaker diarization system. Please check your setup."
-def process_realtime_audio(audio_stream, change_threshold, max_speakers):
-    """Process real-time audio stream from FastRTC"""
-    if not diarization_system.is_initialized:
-        return "Please initialize the system first.", "System not ready"
-    # Update settings
-    diarization_system.update_settings(change_threshold, max_speakers)
-    if audio_stream is None:
-        return diarization_system.get_current_transcript(), diarization_system.get_status()
-    # Process the audio stream
-    transcript, status = diarization_system.process_audio_stream(audio_stream, SAMPLE_RATE)
-    return transcript, diarization_system.get_status()
-def clear_conversation():
-    """Clear the conversation transcript"""
-    diarization_system.clear_transcript()
-    return "Conversation cleared. Listening...", "Ready"
-def create_gradio_interface():
-    """Create and return the Gradio interface with FastRTC"""
-    with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Soft()) as demo:
-        gr.Markdown("# 🎙️ Real-time Speaker Diarization with FastRTC")
-        gr.Markdown("Speak into your microphone for real-time speaker diarization and transcription.")
-        # Initialization section
-        with gr.Row():
-            init_btn = gr.Button("🚀 Initialize System", variant="primary", scale=1)
-            init_status = gr.Textbox(label="System Status", interactive=False, scale=2)
-        # Settings section
         with gr.Row():
-            with gr.Column():
                 change_threshold = gr.Slider(
-                    minimum=0.1,
-                    maximum=0.9,
                     value=DEFAULT_CHANGE_THRESHOLD,
                     step=0.05,
                     label="Speaker Change Threshold",
                     info="Lower values = more sensitive to speaker changes"
                 )
-            with gr.Column():
                 max_speakers = gr.Slider(
                     minimum=2,
                     maximum=ABSOLUTE_MAX_SPEAKERS,
                     value=DEFAULT_MAX_SPEAKERS,
                     step=1,
-                    label="Maximum Number of Speakers",
                     info="Maximum number of speakers to detect"
                 )
-        # FastRTC Audio Input
-        with gr.Row():
-            with gr.Column():
-                # FastRTC component for real-time audio
-                audio_input = gr.FastRTC(
-                    audio=True,
-                    video=False,
-                    label="🎤 Real-time Audio Input",
-                    audio_sample_rate=SAMPLE_RATE,
-                    audio_channels=1
-                )
-                clear_btn = gr.Button("🗑️ Clear Conversation", variant="stop")
-            with gr.Column():
-                current_status = gr.Textbox(
-                    label="Current Status",
-                    interactive=False,
-                    value="Click Initialize to start"
-                )
-        # Output section
-        transcript_output = gr.Textbox(
-            label="🔴 Live Transcript with Speaker Labels",
-            lines=15,
-            max_lines=25,
-            interactive=False,
-            value="Click Initialize, then start speaking...",
-            autoscroll=True
-        )
-        # Event handlers
-        init_btn.click(
-            fn=initialize_system,
-            outputs=[init_status]
-        )
-        # FastRTC stream processing
         audio_input.stream(
-            fn=process_realtime_audio,
             inputs=[audio_input, change_threshold, max_speakers],
-            outputs=[transcript_output, current_status],
-            time_limit=30  # Process in 30-second chunks
         )
         clear_btn.click(
-            fn=clear_conversation,
-            outputs=[transcript_output, current_status]
         )
-        # Instructions
-        with gr.Accordion("📋 Instructions", open=False):
-            gr.Markdown("""
-            ## How to Use:
-            1. **Initialize**: Click "🚀 Initialize System" to load the AI models (this may take a moment)
-            2. **Allow Microphone**: Your browser will ask for microphone permission - please allow it
-            3. **Adjust Settings**:
-               - **Speaker Change Threshold**:
-                 - Lower (0.3-0.5) for speakers with different voices
-                 - Higher (0.6-0.8) for speakers with similar voices
-               - **Max Speakers**: Set expected number of speakers (2-10)
-            4. **Start Speaking**: The system will automatically transcribe and identify speakers
-            5. **View Results**: See real-time transcript with speaker labels (Speaker 1, Speaker 2, etc.)
-            6. **Clear**: Use "Clear Conversation" to reset and start fresh
-            ## Features:
-            - ✅ Real-time audio processing via FastRTC
-            - ✅ Automatic speech recognition with Whisper
-            - ✅ Speaker diarization with ECAPA-TDNN
-            - ✅ Live transcript with speaker labels
-            - ✅ Configurable sensitivity settings
-            - ✅ Support for up to 10 speakers
-            ## Tips:
-            - Speak clearly and allow brief pauses between speakers
-            - The system learns speaker characteristics over time
-            - Better results with distinct speaker voices
-            - Ensure good microphone quality for best performance
-            """)
-    return demo
 if __name__ == "__main__":
-    # Create and launch the Gradio interface
-    demo = create_gradio_interface()
-    demo.launch(
-        share=True,
         server_name="0.0.0.0",
         server_port=7860,
-        show_error=True
     )

 import time
 import os
 import urllib.request
 import queue
+import threading
+from scipy.spatial.distance import cosine
+from RealtimeSTT import AudioToTextRecorder
+# Configuration parameters (kept same as original)
+SILENCE_THRESHS = [0, 0.4]
 FINAL_TRANSCRIPTION_MODEL = "distil-large-v3"
 FINAL_BEAM_SIZE = 5
 REALTIME_TRANSCRIPTION_MODEL = "distil-small.en"
 MIN_SEGMENT_DURATION = 1.0
 DEFAULT_MAX_SPEAKERS = 4
 ABSOLUTE_MAX_SPEAKERS = 10
+# Audio parameters
+FAST_SENTENCE_END = True
 SAMPLE_RATE = 16000
+BUFFER_SIZE = 512
+CHANNELS = 1
+# Speaker colors for HTML display
+SPEAKER_COLORS = [
+    "#FFFF00", "#FF0000", "#00FF00", "#00FFFF", "#FF00FF",
+    "#0000FF", "#FF8000", "#00FF80", "#8000FF", "#FFFFFF"
+]
+SPEAKER_COLOR_NAMES = [
+    "Yellow", "Red", "Green", "Cyan", "Magenta",
+    "Blue", "Orange", "Spring Green", "Purple", "White"
+]
 class SpeechBrainEncoder:
     """ECAPA-TDNN encoder from SpeechBrain for speaker embeddings"""
         self.cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "speechbrain")
         os.makedirs(self.cache_dir, exist_ok=True)
+    def _download_model(self):
+        """Download pre-trained SpeechBrain ECAPA-TDNN model if not present"""
+        model_url = "https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb/resolve/main/embedding_model.ckpt"
+        model_path = os.path.join(self.cache_dir, "embedding_model.ckpt")
+        if not os.path.exists(model_path):
+            print(f"Downloading ECAPA-TDNN model to {model_path}...")
+            urllib.request.urlretrieve(model_url, model_path)
+        return model_path
     def load_model(self):
         """Load the ECAPA-TDNN model"""
         try:
             from speechbrain.pretrained import EncoderClassifier
+            model_path = self._download_model()
             self.model = EncoderClassifier.from_hparams(
                 source="speechbrain/spkrec-ecapa-voxceleb",
                 savedir=self.cache_dir,
             return np.zeros(self.embedding_dim)
+class AudioProcessor:
+    """Processes audio data to extract speaker embeddings"""
+    def __init__(self, encoder):
+        self.encoder = encoder
+    def extract_embedding(self, audio_int16):
+        try:
+            float_audio = audio_int16.astype(np.float32) / 32768.0
+            if np.abs(float_audio).max() > 1.0:
+                float_audio = float_audio / np.abs(float_audio).max()
+            embedding = self.encoder.embed_utterance(float_audio)
+            return embedding
+        except Exception as e:
+            print(f"Embedding extraction error: {e}")
+            return np.zeros(self.encoder.embedding_dim)
 class SpeakerChangeDetector:
+    """Speaker change detector with configurable number of speakers"""
     def __init__(self, embedding_dim=192, change_threshold=DEFAULT_CHANGE_THRESHOLD, max_speakers=DEFAULT_MAX_SPEAKERS):
         self.embedding_dim = embedding_dim
         self.change_threshold = change_threshold
             )
         return self.current_speaker, similarity
+    def get_color_for_speaker(self, speaker_id):
+        """Return color for speaker ID"""
+        if 0 <= speaker_id < len(SPEAKER_COLORS):
+            return SPEAKER_COLORS[speaker_id]
+        return "#FFFFFF"
+class RealtimeASRDiarization:
+    """Main class for real-time ASR with speaker diarization"""
+    def __init__(self):
         self.encoder = None
         self.audio_processor = None
         self.speaker_detector = None
+        self.recorder = None
+        self.is_recording = False
+        self.full_sentences = []
+        self.sentence_speakers = []
+        self.pending_sentences = []
+        self.last_realtime_text = ""
+        self.sentence_queue = queue.Queue()
+        self.change_threshold = DEFAULT_CHANGE_THRESHOLD
+        self.max_speakers = DEFAULT_MAX_SPEAKERS
+        # Initialize model
+        self.initialize_model()
+    def initialize_model(self):
+        """Initialize the speaker encoder model"""
         try:
             device_str = "cuda" if torch.cuda.is_available() else "cpu"
+            print(f"Using device: {device_str}")
             self.encoder = SpeechBrainEncoder(device=device_str)
             success = self.encoder.load_model()
+            if success:
+                print("ECAPA-TDNN model loaded successfully!")
+                self.audio_processor = AudioProcessor(self.encoder)
+                self.speaker_detector = SpeakerChangeDetector(
+                    embedding_dim=self.encoder.embedding_dim,
+                    change_threshold=self.change_threshold,
+                    max_speakers=self.max_speakers
+                )
+                # Start sentence processing thread
+                self.sentence_thread = threading.Thread(target=self.process_sentences, daemon=True)
+                self.sentence_thread.start()
+            else:
+                print("Failed to load ECAPA-TDNN model")
+        except Exception as e:
+            print(f"Model initialization error: {e}")
+    def process_sentences(self):
+        """Process sentences in background thread"""
+        while True:
+            try:
+                text, audio_bytes = self.sentence_queue.get(timeout=1)
+                self.process_sentence(text, audio_bytes)
+            except queue.Empty:
+                continue
+    def process_sentence(self, text, audio_bytes):
+        """Process a sentence with speaker diarization"""
+        if self.audio_processor is None or self.speaker_detector is None:
+            return
+        try:
+            # Convert audio data to int16
+            audio_int16 = np.int16(audio_bytes * 32767)
+            # Extract speaker embedding
+            speaker_embedding = self.audio_processor.extract_embedding(audio_int16)
+            # Store sentence and embedding
+            self.full_sentences.append((text, speaker_embedding))
+            # Fill in any missing speaker assignments
+            while len(self.sentence_speakers) < len(self.full_sentences) - 1:
+                self.sentence_speakers.append(0)
+            # Detect speaker changes
+            speaker_id, similarity = self.speaker_detector.add_embedding(speaker_embedding)
+            self.sentence_speakers.append(speaker_id)
+            # Remove from pending
+            if text in self.pending_sentences:
+                self.pending_sentences.remove(text)
+        except Exception as e:
+            print(f"Error processing sentence: {e}")
+    def setup_recorder(self):
+        """Setup the audio recorder"""
+        try:
+            recorder_config = {
+                'spinner': False,
+                'use_microphone': False,
+                'model': FINAL_TRANSCRIPTION_MODEL,
+                'language': TRANSCRIPTION_LANGUAGE,
+                'silero_sensitivity': SILERO_SENSITIVITY,
+                'webrtc_sensitivity': WEBRTC_SENSITIVITY,
+                'post_speech_silence_duration': SILENCE_THRESHS[1],
+                'min_length_of_recording': MIN_LENGTH_OF_RECORDING,
+                'pre_recording_buffer_duration': PRE_RECORDING_BUFFER_DURATION,
+                'min_gap_between_recordings': 0,
+                'enable_realtime_transcription': True,
+                'realtime_processing_pause': 0,
+                'realtime_model_type': REALTIME_TRANSCRIPTION_MODEL,
+                'on_realtime_transcription_update': self.live_text_detected,
+                'beam_size': FINAL_BEAM_SIZE,
+                'beam_size_realtime': REALTIME_BEAM_SIZE,
+                'buffer_size': BUFFER_SIZE,
+                'sample_rate': SAMPLE_RATE,
+            }
+            self.recorder = AudioToTextRecorder(**recorder_config)
             return True
         except Exception as e:
+            print(f"Error setting up recorder: {e}")
             return False
+    def live_text_detected(self, text):
+        """Handle live text detection"""
+        text = text.strip()
+        if not text:
+            return
+        sentence_delimiters = '.?!。'
+        prob_sentence_end = (
+            len(self.last_realtime_text) > 0
+            and text[-1] in sentence_delimiters
+            and self.last_realtime_text[-1] in sentence_delimiters
+        )
+        self.last_realtime_text = text
+        if prob_sentence_end:
+            if FAST_SENTENCE_END:
+                self.recorder.stop()
+            else:
+                self.recorder.post_speech_silence_duration = SILENCE_THRESHS[0]
+        else:
+            self.recorder.post_speech_silence_duration = SILENCE_THRESHS[1]
+    def process_audio_chunk(self, audio_chunk):
+        """Process incoming audio chunk from FastRTC"""
+        if self.recorder is None:
+            if not self.setup_recorder():
+                return "Failed to setup recorder"
         try:
+            # Convert audio to the format expected by the recorder
+            if isinstance(audio_chunk, tuple):
+                sample_rate, audio_data = audio_chunk
             else:
+                audio_data = audio_chunk
+                sample_rate = SAMPLE_RATE
+            # Ensure audio is in the right format
+            if audio_data.dtype != np.int16:
+                if audio_data.dtype == np.float32 or audio_data.dtype == np.float64:
+                    audio_data = (audio_data * 32767).astype(np.int16)
+                else:
+                    audio_data = audio_data.astype(np.int16)
+            # Convert to bytes and feed to recorder
+            audio_bytes = audio_data.tobytes()
+            self.recorder.feed_audio(audio_bytes)
+            # Process final text if available
+            def process_final_text(text):
+                text = text.strip()
+                if text:
+                    self.pending_sentences.append(text)
+                    audio_bytes = self.recorder.last_transcription_bytes
+                    self.sentence_queue.put((text, audio_bytes))
+            # Get transcription
+            self.recorder.text(process_final_text)
+            return self.get_formatted_transcript()
         except Exception as e:
+            print(f"Error processing audio: {e}")
+            return f"Error: {e}"
+    def get_formatted_transcript(self):
+        """Get formatted transcript with speaker labels"""
         try:
+            transcript_parts = []
+            # Add completed sentences with speaker labels
+            for i, (sentence_text, _) in enumerate(self.full_sentences):
+                if i < len(self.sentence_speakers):
+                    speaker_id = self.sentence_speakers[i]
+                    speaker_label = f"Speaker {speaker_id + 1}"
+                    transcript_parts.append(f"{speaker_label}: {sentence_text}")
+            # Add pending sentences
+            for pending in self.pending_sentences:
+                transcript_parts.append(f"[Processing]: {pending}")
+            # Add current live text
+            if self.last_realtime_text:
+                transcript_parts.append(f"[Live]: {self.last_realtime_text}")
+            return "\n".join(transcript_parts)
         except Exception as e:
+            print(f"Error formatting transcript: {e}")
+            return "Error formatting transcript"
+    def update_settings(self, change_threshold, max_speakers):
+        """Update diarization settings"""
+        self.change_threshold = change_threshold
+        self.max_speakers = max_speakers
+        if self.speaker_detector:
+            self.speaker_detector.set_change_threshold(change_threshold)
+            self.speaker_detector.set_max_speakers(max_speakers)
     def clear_transcript(self):
+        """Clear all transcript data"""
+        self.full_sentences = []
+        self.sentence_speakers = []
+        self.pending_sentences = []
+        self.last_realtime_text = ""
         if self.speaker_detector:
             self.speaker_detector = SpeakerChangeDetector(
                 embedding_dim=self.encoder.embedding_dim,
                 change_threshold=self.change_threshold,
                 max_speakers=self.max_speakers
             )
 # Global instance
+asr_diarization = RealtimeASRDiarization()
+def process_audio_stream(audio_chunk, change_threshold, max_speakers):
+    """Process audio stream and return transcript"""
+    # Update settings if changed
+    asr_diarization.update_settings(change_threshold, max_speakers)
+    # Process audio
+    transcript = asr_diarization.process_audio_chunk(audio_chunk)
+    return transcript
+def clear_transcript():
+    """Clear the transcript"""
+    asr_diarization.clear_transcript()
+    return "Transcript cleared. Ready for new input..."
+def create_interface():
+    """Create Gradio interface with FastRTC"""
+    with gr.Blocks(title="Real-time Speaker Diarization") as iface:
+        gr.Markdown("# Real-time ASR with Speaker Diarization")
+        gr.Markdown("Speak into your microphone to see real-time transcription with speaker labels!")
         with gr.Row():
+            with gr.Column(scale=3):
+                # Audio input with FastRTC
+                audio_input = gr.Audio(
+                    sources=["microphone"],
+                    streaming=True,
+                    label="Microphone Input"
+                )
+                # Transcript output
+                transcript_output = gr.Textbox(
+                    label="Live Transcript with Speaker Labels",
+                    lines=15,
+                    max_lines=20,
+                    value="Ready to start transcription...",
+                    interactive=False
+                )
+            with gr.Column(scale=1):
+                gr.Markdown("### Settings")
+                # Speaker change threshold
                 change_threshold = gr.Slider(
+                    minimum=0.1,
+                    maximum=0.95,
                     value=DEFAULT_CHANGE_THRESHOLD,
                     step=0.05,
                     label="Speaker Change Threshold",
                     info="Lower values = more sensitive to speaker changes"
                 )
+                # Max speakers
                 max_speakers = gr.Slider(
                     minimum=2,
                     maximum=ABSOLUTE_MAX_SPEAKERS,
                     value=DEFAULT_MAX_SPEAKERS,
                     step=1,
+                    label="Maximum Speakers",
                     info="Maximum number of speakers to detect"
                 )
+                # Clear button
+                clear_btn = gr.Button("Clear Transcript", variant="secondary")
+                gr.Markdown("### Speaker Colors")
+                color_info = "\\n".join([
+                    f"Speaker {i+1}: {SPEAKER_COLOR_NAMES[i]}"
+                    for i in range(min(DEFAULT_MAX_SPEAKERS, len(SPEAKER_COLOR_NAMES)))
+                ])
+                gr.Markdown(color_info)
+        # Set up streaming
         audio_input.stream(
+            fn=process_audio_stream,
             inputs=[audio_input, change_threshold, max_speakers],
+            outputs=[transcript_output],
+            show_progress=False
         )
+        # Clear button functionality
         clear_btn.click(
+            fn=clear_transcript,
+            outputs=[transcript_output]
         )
+        gr.Markdown("""
+        ### Instructions:
+        1. Allow microphone access when prompted
+        2. Start speaking - transcription will appear in real-time
+        3. Different speakers will be automatically detected and labeled
+        4. Adjust the threshold if speaker changes aren't detected properly
+        5. Use the clear button to reset the transcript
+        ### Notes:
+        - The system works best with clear audio and distinct speakers
+        - It may take a moment to load the speaker recognition model on first use
+        - Lower threshold values make the system more sensitive to speaker changes
+        """)
+    return iface
 if __name__ == "__main__":
+    # Create and launch the interface
+    iface = create_interface()
+    iface.launch(
         server_name="0.0.0.0",
         server_port=7860,
+        share=True
     )