Real_Time_diarization

Sleeping

App Files Files Community

Saiyaswanth007 commited on 20 days ago

Commit

c263c26

1 Parent(s): 29b89b3

Code error correction

Browse files

Files changed (1) hide show

app.py +213 -170

app.py CHANGED Viewed

@@ -9,8 +9,13 @@ import urllib.request
 import torchaudio
 from scipy.spatial.distance import cosine
 import json
-import io
-import wave
 # Simplified configuration parameters
 SILENCE_THRESHS = [0, 0.4]
@@ -34,8 +39,9 @@ ABSOLUTE_MAX_SPEAKERS = 10
 # Global variables
 FAST_SENTENCE_END = True
 SAMPLE_RATE = 16000
-BUFFER_SIZE = 512
 CHANNELS = 1
 # Speaker colors
 SPEAKER_COLORS = [
@@ -73,7 +79,7 @@ class SpeechBrainEncoder:
         model_path = os.path.join(self.cache_dir, "embedding_model.ckpt")
         if not os.path.exists(model_path):
-            print(f"Downloading ECAPA-TDNN model to {model_path}...")
             urllib.request.urlretrieve(model_url, model_path)
         return model_path
@@ -94,7 +100,7 @@ class SpeechBrainEncoder:
             self.model_loaded = True
             return True
         except Exception as e:
-            print(f"Error loading ECAPA-TDNN model: {e}")
             return False
     def embed_utterance(self, audio, sr=16000):
@@ -116,7 +122,7 @@ class SpeechBrainEncoder:
             return embedding.squeeze().cpu().numpy()
         except Exception as e:
-            print(f"Error extracting embedding: {e}")
             return np.zeros(self.embedding_dim)
@@ -135,7 +141,7 @@ class AudioProcessor:
             return embedding
         except Exception as e:
-            print(f"Embedding extraction error: {e}")
             return np.zeros(self.encoder.embedding_dim)
@@ -270,83 +276,105 @@ class SpeakerChangeDetector:
 class WhisperTranscriber:
-    """Simple Whisper transcriber for audio chunks"""
     def __init__(self, model_name="distil-large-v3"):
         self.model = None
         self.processor = None
         self.model_name = model_name
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
     def load_model(self):
         """Load Whisper model"""
         try:
             from transformers import WhisperProcessor, WhisperForConditionalGeneration
-            self.processor = WhisperProcessor.from_pretrained(f"distil-whisper/{self.model_name}")
-            self.model = WhisperForConditionalGeneration.from_pretrained(f"distil-whisper/{self.model_name}")
-            self.model.to(self.device)
             return True
         except Exception as e:
-            print(f"Error loading Whisper model: {e}")
             return False
     def transcribe(self, audio_array, sample_rate=16000):
         """Transcribe audio array"""
         try:
-            if self.model is None:
                 return ""
-            # Ensure audio is the right sample rate
             if sample_rate != 16000:
-                audio_array = torchaudio.functional.resample(
-                    torch.tensor(audio_array).float(),
-                    orig_freq=sample_rate,
-                    new_freq=16000
-                ).numpy()
-            # Process audio
-            inputs = self.processor(audio_array, sampling_rate=16000, return_tensors="pt")
-            inputs = inputs.to(self.device)
-            # Generate transcription
             with torch.no_grad():
-                predicted_ids = self.model.generate(inputs["input_features"])
-            # Decode transcription
-            transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)
-            return transcription[0] if transcription else ""
         except Exception as e:
-            print(f"Transcription error: {e}")
             return ""
-class RealtimeSpeakerDiarization:
     def __init__(self):
         self.encoder = None
         self.audio_processor = None
         self.speaker_detector = None
         self.transcriber = None
-        self.audio_buffer = []
         self.processing_thread = None
-        self.sentence_queue = queue.Queue()
         self.full_sentences = []
         self.sentence_speakers = []
-        self.pending_sentences = []
-        self.displayed_text = ""
         self.is_running = False
         self.change_threshold = DEFAULT_CHANGE_THRESHOLD
         self.max_speakers = DEFAULT_MAX_SPEAKERS
-        self.audio_chunks = []
-        self.chunk_counter = 0
     def initialize_models(self):
         """Initialize the speaker encoder and transcription models"""
         try:
             device_str = "cuda" if torch.cuda.is_available() else "cpu"
-            print(f"Using device: {device_str}")
             # Initialize speaker encoder
             self.encoder = SpeechBrainEncoder(device=device_str)
@@ -363,124 +391,131 @@ class RealtimeSpeakerDiarization:
                     change_threshold=self.change_threshold,
                     max_speakers=self.max_speakers
                 )
-                print("Models loaded successfully!")
                 return True
             else:
-                print("Failed to load models")
                 return False
         except Exception as e:
-            print(f"Model initialization error: {e}")
             return False
-    def process_audio_stream(self, audio_data, sample_rate):
-        """Process incoming audio stream data"""
-        if not self.is_running or self.encoder is None:
             return
         try:
-            # Convert audio data to numpy array if needed
-            if isinstance(audio_data, tuple):
-                sample_rate, audio_array = audio_data
-            else:
-                audio_array = audio_data
-            # Ensure audio is float32 and normalized
-            if audio_array.dtype != np.float32:
-                if audio_array.dtype == np.int16:
-                    audio_array = audio_array.astype(np.float32) / 32768.0
-                else:
-                    audio_array = audio_array.astype(np.float32)
-            # Ensure mono audio
-            if len(audio_array.shape) > 1 and audio_array.shape[1] > 1:
-                audio_array = np.mean(audio_array, axis=1)
-            # Add to buffer
-            self.audio_buffer.extend(audio_array.flatten())
-            # Process when we have enough audio (about 2 seconds)
-            target_length = int(sample_rate * 2.0)
-            if len(self.audio_buffer) >= target_length:
-                self.process_audio_chunk()
-        except Exception as e:
-            print(f"Error processing audio stream: {e}")
-    def process_audio_chunk(self):
-        """Process accumulated audio chunk"""
-        try:
-            if len(self.audio_buffer) < SAMPLE_RATE:  # Need at least 1 second
-                return
-            # Get audio chunk
-            audio_chunk = np.array(self.audio_buffer[:int(SAMPLE_RATE * 2)])
-            self.audio_buffer = self.audio_buffer[int(SAMPLE_RATE * 1.5):]  # Keep some overlap
-            # Transcribe audio
-            transcription = self.transcriber.transcribe(audio_chunk, SAMPLE_RATE)
-            if transcription.strip():
-                # Extract speaker embedding
-                speaker_embedding = self.audio_processor.extract_embedding(audio_chunk)
-                # Add to queue for processing
-                self.sentence_queue.put((transcription.strip(), speaker_embedding))
         except Exception as e:
-            print(f"Error processing audio chunk: {e}")
-    def process_sentence_queue(self):
-        """Process sentences in the queue for speaker detection"""
         while self.is_running:
             try:
-                text, speaker_embedding = self.sentence_queue.get(timeout=1)
-                # Store sentence and embedding
-                self.full_sentences.append((text, speaker_embedding))
-                # Fill in missing speaker assignments
-                while len(self.sentence_speakers) < len(self.full_sentences) - 1:
-                    self.sentence_speakers.append(0)
-                # Detect speaker changes
-                speaker_id, similarity = self.speaker_detector.add_embedding(speaker_embedding)
-                self.sentence_speakers.append(speaker_id)
             except queue.Empty:
                 continue
             except Exception as e:
-                print(f"Error processing sentence: {e}")
     def start_recording(self):
-        """Start the recording and transcription process"""
-        if self.encoder is None:
             return "Please initialize models first!"
         try:
-            # Start sentence processing thread
             self.is_running = True
-            self.processing_thread = threading.Thread(target=self.process_sentence_queue, daemon=True)
             self.processing_thread.start()
-            return "Recording started successfully! Start speaking into your microphone."
         except Exception as e:
             return f"Error starting recording: {e}"
     def stop_recording(self):
         """Stop the recording process"""
         self.is_running = False
-        self.audio_buffer = []
         return "Recording stopped!"
     def clear_conversation(self):
         """Clear all conversation data"""
         self.full_sentences = []
         self.sentence_speakers = []
-        self.pending_sentences = []
-        self.displayed_text = ""
         self.audio_buffer = []
         if self.speaker_detector:
             self.speaker_detector = SpeakerChangeDetector(
                 embedding_dim=self.encoder.embedding_dim,
@@ -504,26 +539,24 @@ class RealtimeSpeakerDiarization:
     def get_formatted_conversation(self):
         """Get the formatted conversation with speaker colors"""
         try:
             sentences_with_style = []
-            # Process completed sentences
-            for i, sentence in enumerate(self.full_sentences):
-                sentence_text, _ = sentence
                 if i >= len(self.sentence_speakers):
                     color = "#FFFFFF"
-                    speaker_name = "Speaker ?"
                 else:
-                    speaker_id = self.sentence_speakers[i]
                     color = self.speaker_detector.get_color_for_speaker(speaker_id)
                     speaker_name = f"Speaker {speaker_id + 1}"
                 sentences_with_style.append(
-                    f'<span style="color:{color};"><b>{speaker_name}:</b> {sentence_text}</span>')
-            if sentences_with_style:
-                return "<br><br>".join(sentences_with_style)
-            else:
-                return "Waiting for speech input..."
         except Exception as e:
             return f"Error formatting conversation: {e}"
@@ -535,6 +568,7 @@ class RealtimeSpeakerDiarization:
         try:
             status = self.speaker_detector.get_status_info()
             status_lines = [
                 f"**Current Speaker:** {status['current_speaker'] + 1}",
@@ -542,7 +576,8 @@ class RealtimeSpeakerDiarization:
                 f"**Last Similarity:** {status['last_similarity']:.3f}",
                 f"**Change Threshold:** {status['threshold']:.2f}",
                 f"**Total Sentences:** {len(self.full_sentences)}",
-                f"**Audio Buffer Size:** {len(self.audio_buffer)}",
                 "",
                 "**Speaker Segment Counts:**"
             ]
@@ -558,7 +593,7 @@ class RealtimeSpeakerDiarization:
 # Global instance
-diarization_system = RealtimeSpeakerDiarization()
 def initialize_system():
@@ -600,49 +635,56 @@ def get_status():
     return diarization_system.get_status_info()
-def process_audio(audio_data):
-    """Process audio from Gradio audio input"""
-    if audio_data is not None:
-        sample_rate, audio_array = audio_data
-        diarization_system.process_audio_stream(audio_array, sample_rate)
     return get_conversation(), get_status()
-# Create Gradio interface
 def create_interface():
-    with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Monochrome()) as app:
-        gr.Markdown("# 🎤 Real-time Speech Recognition with Speaker Diarization")
-        gr.Markdown("This app performs real-time speech recognition with automatic speaker identification and color-coding using your browser's microphone.")
         with gr.Row():
             with gr.Column(scale=2):
-                # Audio input
                 audio_input = gr.Audio(
-                    source="microphone",
                     type="numpy",
                     streaming=True,
-                    label="🎙️ Microphone Input"
                 )
                 # Main conversation display
                 conversation_output = gr.HTML(
-                    value="<i>Click 'Initialize System' to start...</i>",
-                    label="Live Conversation"
                 )
                 # Control buttons
                 with gr.Row():
-                    init_btn = gr.Button("🔧 Initialize System", variant="secondary")
-                    start_btn = gr.Button("🎙️ Start Recording", variant="primary", interactive=False)
-                    stop_btn = gr.Button("⏹️ Stop Recording", variant="stop", interactive=False)
-                    clear_btn = gr.Button("🗑️ Clear Conversation", interactive=False)
                 # Status display
                 status_output = gr.Textbox(
                     label="System Status",
                     value="System not initialized",
                     lines=10,
-                    interactive=False
                 )
             with gr.Column(scale=1):
@@ -655,7 +697,7 @@ def create_interface():
                     step=0.05,
                     value=DEFAULT_CHANGE_THRESHOLD,
                     label="Speaker Change Sensitivity",
-                    info="Lower values = more sensitive to speaker changes"
                 )
                 max_speakers_slider = gr.Slider(
@@ -666,26 +708,23 @@ def create_interface():
                     label="Maximum Number of Speakers"
                 )
-                update_settings_btn = gr.Button("Update Settings")
                 # Speaker color legend
                 gr.Markdown("## 🎨 Speaker Colors")
                 color_info = []
                 for i, (color, name) in enumerate(zip(SPEAKER_COLORS, SPEAKER_COLOR_NAMES)):
-                    color_info.append(f'<span style="color:{color};">■</span> Speaker {i+1} ({name})')
                 gr.HTML("<br>".join(color_info[:DEFAULT_MAX_SPEAKERS]))
-                # Instructions
                 gr.Markdown("""
-                ## 📋 Instructions
-                1. **Initialize System** - Load AI models
-                2. **Allow microphone access** when prompted
-                3. **Start Recording** - Begin real-time processing
-                4. **Speak naturally** - The system will detect different speakers
-                5. **Stop Recording** when done
-                **Note:** Processing happens in real-time with ~2 second chunks for better accuracy.
                 """)
         # Event handlers
@@ -693,25 +732,25 @@ def create_interface():
             result = initialize_system()
             if "successfully" in result:
                 return (
-                    result,
                     gr.update(interactive=True),   # start_btn
                     gr.update(interactive=True),   # clear_btn
-                    get_conversation(),
-                    get_status()
                 )
             else:
                 return (
-                    result,
                     gr.update(interactive=False),  # start_btn
                     gr.update(interactive=False),  # clear_btn
-                    get_conversation(),
-                    get_status()
                 )
         def on_start():
             result = start_recording()
             return (
-                result,
                 gr.update(interactive=False),  # start_btn
                 gr.update(interactive=True),   # stop_btn
             )
@@ -719,11 +758,15 @@ def create_interface():
         def on_stop():
             result = stop_recording()
             return (
-                result,
                 gr.update(interactive=True),   # start_btn
                 gr.update(interactive=False),  # stop_btn
             )
         # Connect event handlers
         init_btn.click(
             on_initialize,
@@ -751,19 +794,19 @@ def create_interface():
             outputs=[status_output]
         )
-        # Process streaming audio
         audio_input.stream(
-            process_audio,
             inputs=[audio_input],
             outputs=[conversation_output, status_output],
-            time_limit=60,
-            stream_every=0.5
         )
-        # Auto-refresh every 3 seconds
-        refresh_timer = gr.Timer(3.0)
         refresh_timer.tick(
-            lambda: (get_conversation(), get_status()),
             outputs=[conversation_output, status_output]
         )

 import torchaudio
 from scipy.spatial.distance import cosine
 import json
+import asyncio
+from typing import Iterator
+import logging
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 # Simplified configuration parameters
 SILENCE_THRESHS = [0, 0.4]
 # Global variables
 FAST_SENTENCE_END = True
 SAMPLE_RATE = 16000
+BUFFER_SIZE = 1024
 CHANNELS = 1
+CHUNK_DURATION_MS = 100  # 100ms chunks for FastRTC
 # Speaker colors
 SPEAKER_COLORS = [
         model_path = os.path.join(self.cache_dir, "embedding_model.ckpt")
         if not os.path.exists(model_path):
+            logger.info(f"Downloading ECAPA-TDNN model to {model_path}...")
             urllib.request.urlretrieve(model_url, model_path)
         return model_path
             self.model_loaded = True
             return True
         except Exception as e:
+            logger.error(f"Error loading ECAPA-TDNN model: {e}")
             return False
     def embed_utterance(self, audio, sr=16000):
             return embedding.squeeze().cpu().numpy()
         except Exception as e:
+            logger.error(f"Error extracting embedding: {e}")
             return np.zeros(self.embedding_dim)
             return embedding
         except Exception as e:
+            logger.error(f"Embedding extraction error: {e}")
             return np.zeros(self.encoder.embedding_dim)
 class WhisperTranscriber:
+    """Whisper transcriber using transformers with FastRTC optimization"""
     def __init__(self, model_name="distil-large-v3"):
         self.model = None
         self.processor = None
         self.model_name = model_name
+        self.model_loaded = False
     def load_model(self):
         """Load Whisper model"""
         try:
             from transformers import WhisperProcessor, WhisperForConditionalGeneration
+            model_id = f"distil-whisper/distil-{self.model_name}" if "distil" in self.model_name else f"openai/whisper-{self.model_name}"
+            self.processor = WhisperProcessor.from_pretrained(model_id)
+            self.model = WhisperForConditionalGeneration.from_pretrained(
+                model_id,
+                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                low_cpu_mem_usage=True,
+                use_safetensors=True
+            )
+            if torch.cuda.is_available():
+                self.model = self.model.cuda()
+            self.model_loaded = True
             return True
         except Exception as e:
+            logger.error(f"Error loading Whisper model: {e}")
             return False
     def transcribe(self, audio_array, sample_rate=16000):
         """Transcribe audio array"""
+        if not self.model_loaded:
+            return ""
         try:
+            # Ensure audio is the right length and format
+            if len(audio_array) < 1600:  # Less than 0.1 seconds
                 return ""
+            # Resample if needed
             if sample_rate != 16000:
+                import torchaudio.functional as F
+                audio_tensor = torch.tensor(audio_array, dtype=torch.float32)
+                audio_array = F.resample(audio_tensor, sample_rate, 16000).numpy()
+            # Process with Whisper
+            inputs = self.processor(
+                audio_array,
+                sampling_rate=16000,
+                return_tensors="pt",
+                truncation=False,
+                padding=True
+            )
+            if torch.cuda.is_available():
+                inputs = {k: v.cuda() for k, v in inputs.items()}
             with torch.no_grad():
+                predicted_ids = self.model.generate(
+                    inputs["input_features"],
+                    max_length=448,
+                    num_beams=1,
+                    do_sample=False,
+                    use_cache=True
+                )
+            transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+            return transcription.strip()
         except Exception as e:
+            logger.error(f"Transcription error: {e}")
             return ""
+class FastRTCSpeakerDiarization:
     def __init__(self):
         self.encoder = None
         self.audio_processor = None
         self.speaker_detector = None
         self.transcriber = None
+        self.audio_queue = queue.Queue(maxsize=100)
         self.processing_thread = None
         self.full_sentences = []
         self.sentence_speakers = []
         self.is_running = False
         self.change_threshold = DEFAULT_CHANGE_THRESHOLD
         self.max_speakers = DEFAULT_MAX_SPEAKERS
+        self.audio_buffer = []
+        self.buffer_duration = 3.0  # seconds
+        self.last_transcription_time = time.time()
+        self.chunk_size = int(SAMPLE_RATE * CHUNK_DURATION_MS / 1000)
     def initialize_models(self):
         """Initialize the speaker encoder and transcription models"""
         try:
             device_str = "cuda" if torch.cuda.is_available() else "cpu"
+            logger.info(f"Using device: {device_str}")
             # Initialize speaker encoder
             self.encoder = SpeechBrainEncoder(device=device_str)
                     change_threshold=self.change_threshold,
                     max_speakers=self.max_speakers
                 )
+                logger.info("Models loaded successfully!")
                 return True
             else:
+                logger.error("Failed to load models")
                 return False
         except Exception as e:
+            logger.error(f"Model initialization error: {e}")
             return False
+    def process_audio_chunk(self, audio_chunk: np.ndarray, sample_rate: int):
+        """Process individual audio chunk from FastRTC"""
+        if not self.is_running or audio_chunk is None:
             return
         try:
+            # Ensure audio chunk is in correct format
+            if isinstance(audio_chunk, np.ndarray):
+                # Ensure mono audio
+                if len(audio_chunk.shape) > 1:
+                    audio_chunk = audio_chunk.mean(axis=1)
+                # Normalize audio
+                if audio_chunk.dtype != np.float32:
+                    audio_chunk = audio_chunk.astype(np.float32)
+                if np.abs(audio_chunk).max() > 1.0:
+                    audio_chunk = audio_chunk / np.abs(audio_chunk).max()
+                # Add to buffer
+                self.audio_buffer.extend(audio_chunk)
+                # Keep buffer to specified duration
+                max_buffer_length = int(self.buffer_duration * sample_rate)
+                if len(self.audio_buffer) > max_buffer_length:
+                    self.audio_buffer = self.audio_buffer[-max_buffer_length:]
+                # Process if enough audio accumulated and enough time passed
+                current_time = time.time()
+                if (current_time - self.last_transcription_time > 1.5 and
+                    len(self.audio_buffer) > sample_rate * 0.8):  # At least 0.8 seconds
+                    if not self.audio_queue.full():
+                        self.audio_queue.put((np.array(self.audio_buffer[-int(sample_rate * 2):]), sample_rate))
+                        self.last_transcription_time = current_time
         except Exception as e:
+            logger.error(f"Audio chunk processing error: {e}")
+    def process_audio_queue(self):
+        """Process audio from the queue"""
         while self.is_running:
             try:
+                audio_data, sample_rate = self.audio_queue.get(timeout=1)
+                if len(audio_data) < 1600:  # Skip very short audio
+                    continue
+                # Transcribe audio
+                transcription = self.transcriber.transcribe(audio_data, sample_rate)
+                if transcription and len(transcription.strip()) > 0:
+                    # Extract speaker embedding
+                    speaker_embedding = self.audio_processor.extract_embedding(audio_data)
+                    # Detect speaker
+                    speaker_id, similarity = self.speaker_detector.add_embedding(speaker_embedding)
+                    # Store results
+                    self.full_sentences.append(transcription.strip())
+                    self.sentence_speakers.append(speaker_id)
+                    logger.info(f"Processed: Speaker {speaker_id + 1}: {transcription.strip()[:50]}...")
             except queue.Empty:
                 continue
             except Exception as e:
+                logger.error(f"Error processing audio queue: {e}")
     def start_recording(self):
+        """Start the recording and processing"""
+        if self.encoder is None or self.transcriber is None:
             return "Please initialize models first!"
         try:
             self.is_running = True
+            self.audio_buffer = []
+            self.last_transcription_time = time.time()
+            # Clear the queue
+            while not self.audio_queue.empty():
+                try:
+                    self.audio_queue.get_nowait()
+                except queue.Empty:
+                    break
+            # Start processing thread
+            self.processing_thread = threading.Thread(target=self.process_audio_queue, daemon=True)
             self.processing_thread.start()
+            logger.info("Recording started successfully!")
+            return "Recording started successfully!"
         except Exception as e:
+            logger.error(f"Error starting recording: {e}")
             return f"Error starting recording: {e}"
     def stop_recording(self):
         """Stop the recording process"""
         self.is_running = False
+        logger.info("Recording stopped!")
         return "Recording stopped!"
     def clear_conversation(self):
         """Clear all conversation data"""
         self.full_sentences = []
         self.sentence_speakers = []
         self.audio_buffer = []
+        # Clear the queue
+        while not self.audio_queue.empty():
+            try:
+                self.audio_queue.get_nowait()
+            except queue.Empty:
+                break
         if self.speaker_detector:
             self.speaker_detector = SpeakerChangeDetector(
                 embedding_dim=self.encoder.embedding_dim,
     def get_formatted_conversation(self):
         """Get the formatted conversation with speaker colors"""
         try:
+            if not self.full_sentences:
+                return "Waiting for speech input... 🎤"
             sentences_with_style = []
+            for i, sentence in enumerate(self.full_sentences[-10:]):  # Show last 10 sentences
                 if i >= len(self.sentence_speakers):
                     color = "#FFFFFF"
+                    speaker_name = "Unknown"
                 else:
+                    speaker_id = self.sentence_speakers[-(10-i) if len(self.sentence_speakers) >= 10 else i]
                     color = self.speaker_detector.get_color_for_speaker(speaker_id)
                     speaker_name = f"Speaker {speaker_id + 1}"
                 sentences_with_style.append(
+                    f'<p><span style="color:{color}; font-weight: bold;">{speaker_name}:</span> {sentence}</p>')
+            return "".join(sentences_with_style)
         except Exception as e:
             return f"Error formatting conversation: {e}"
         try:
             status = self.speaker_detector.get_status_info()
+            queue_size = self.audio_queue.qsize()
             status_lines = [
                 f"**Current Speaker:** {status['current_speaker'] + 1}",
                 f"**Last Similarity:** {status['last_similarity']:.3f}",
                 f"**Change Threshold:** {status['threshold']:.2f}",
                 f"**Total Sentences:** {len(self.full_sentences)}",
+                f"**Buffer Length:** {len(self.audio_buffer)} samples",
+                f"**Queue Size:** {queue_size}",
                 "",
                 "**Speaker Segment Counts:**"
             ]
 # Global instance
+diarization_system = FastRTCSpeakerDiarization()
 def initialize_system():
     return diarization_system.get_status_info()
+def process_audio_stream(audio_stream):
+    """Process streaming audio from FastRTC"""
+    if audio_stream is not None and diarization_system.is_running:
+        sample_rate, audio_data = audio_stream
+        diarization_system.process_audio_chunk(audio_data, sample_rate)
     return get_conversation(), get_status()
+# Create Gradio interface with FastRTC
 def create_interface():
+    with gr.Blocks(title="FastRTC Real-time Speaker Diarization", theme=gr.themes.Soft()) as app:
+        gr.Markdown("# 🎤 FastRTC Real-time Speech Recognition with Speaker Diarization")
+        gr.Markdown("This app uses Hugging Face FastRTC for real-time audio streaming with automatic speaker identification and color-coding.")
         with gr.Row():
             with gr.Column(scale=2):
+                # FastRTC Audio input for real-time streaming
                 audio_input = gr.Audio(
+                    sources=["microphone"],
                     type="numpy",
                     streaming=True,
+                    label="🎙️ FastRTC Microphone Input",
+                    format="wav",
+                    show_download_button=False,
+                    container=True,
+                    elem_id="fastrtc_audio"
                 )
                 # Main conversation display
                 conversation_output = gr.HTML(
+                    value="<i>Click 'Initialize System' and then 'Start Recording' to begin...</i>",
+                    label="Live Conversation",
+                    elem_id="conversation_display"
                 )
                 # Control buttons
                 with gr.Row():
+                    init_btn = gr.Button("🔧 Initialize System", variant="secondary", size="lg")
+                    start_btn = gr.Button("🎙️ Start Recording", variant="primary", interactive=False, size="lg")
+                    stop_btn = gr.Button("⏹️ Stop Recording", variant="stop", interactive=False, size="lg")
+                    clear_btn = gr.Button("🗑️ Clear", interactive=False, size="lg")
                 # Status display
                 status_output = gr.Textbox(
                     label="System Status",
                     value="System not initialized",
                     lines=10,
+                    interactive=False,
+                    show_copy_button=True
                 )
             with gr.Column(scale=1):
                     step=0.05,
                     value=DEFAULT_CHANGE_THRESHOLD,
                     label="Speaker Change Sensitivity",
+                    info="Lower = more sensitive to changes"
                 )
                 max_speakers_slider = gr.Slider(
                     label="Maximum Number of Speakers"
                 )
+                update_settings_btn = gr.Button("Update Settings", variant="secondary")
                 # Speaker color legend
                 gr.Markdown("## 🎨 Speaker Colors")
                 color_info = []
                 for i, (color, name) in enumerate(zip(SPEAKER_COLORS, SPEAKER_COLOR_NAMES)):
+                    color_info.append(f'<span style="color:{color}; font-size: 16px;">●</span> Speaker {i+1} ({name})')
                 gr.HTML("<br>".join(color_info[:DEFAULT_MAX_SPEAKERS]))
+                # Performance info
+                gr.Markdown("## 📊 Performance")
                 gr.Markdown("""
+                - **FastRTC**: Low-latency audio streaming
+                - **Whisper**: distil-large-v3 for transcription
+                - **ECAPA-TDNN**: Speaker embeddings
+                - **Real-time**: ~100ms processing chunks
                 """)
         # Event handlers
             result = initialize_system()
             if "successfully" in result:
                 return (
+                    result,  # status_output
                     gr.update(interactive=True),   # start_btn
                     gr.update(interactive=True),   # clear_btn
+                    get_conversation(),  # conversation_output
+                    get_status()  # status_output update
                 )
             else:
                 return (
+                    result,  # status_output
                     gr.update(interactive=False),  # start_btn
                     gr.update(interactive=False),  # clear_btn
+                    get_conversation(),  # conversation_output
+                    get_status()  # status_output update
                 )
         def on_start():
             result = start_recording()
             return (
+                result,  # status_output
                 gr.update(interactive=False),  # start_btn
                 gr.update(interactive=True),   # stop_btn
             )
         def on_stop():
             result = stop_recording()
             return (
+                result,  # status_output
                 gr.update(interactive=True),   # start_btn
                 gr.update(interactive=False),  # stop_btn
             )
+        # Auto-refresh function
+        def refresh_display():
+            return get_conversation(), get_status()
         # Connect event handlers
         init_btn.click(
             on_initialize,
             outputs=[status_output]
         )
+        # FastRTC streaming audio processing
         audio_input.stream(
+            process_audio_stream,
             inputs=[audio_input],
             outputs=[conversation_output, status_output],
+            stream_every=0.1,  # Process every 100ms
+            time_limit=None
         )
+        # Auto-refresh timer
+        refresh_timer = gr.Timer(2.0)
         refresh_timer.tick(
+            refresh_display,
             outputs=[conversation_output, status_output]
         )