Real_Time_diarization

Sleeping

App Files Files Community

Saiyaswanth007 commited on May 23

Commit

10008f1

1 Parent(s): fd289b1

Reverting

Browse files

Files changed (1) hide show

app.py +430 -236

app.py CHANGED Viewed

@@ -1,16 +1,19 @@
 import gradio as gr
 import numpy as np
 import torch
-import torchaudio
 import time
 import os
 import urllib.request
-import queue
-import threading
 from scipy.spatial.distance import cosine
 from RealtimeSTT import AudioToTextRecorder
-# Configuration parameters (kept same as original)
 SILENCE_THRESHS = [0, 0.4]
 FINAL_TRANSCRIPTION_MODEL = "distil-large-v3"
 FINAL_BEAM_SIZE = 5
@@ -29,20 +32,28 @@ MIN_SEGMENT_DURATION = 1.0
 DEFAULT_MAX_SPEAKERS = 4
 ABSOLUTE_MAX_SPEAKERS = 10
-# Audio parameters
 FAST_SENTENCE_END = True
 SAMPLE_RATE = 16000
 BUFFER_SIZE = 512
 CHANNELS = 1
-# Speaker colors for HTML display
 SPEAKER_COLORS = [
-    "#FFFF00", "#FF0000", "#00FF00", "#00FFFF", "#FF00FF",
-    "#0000FF", "#FF8000", "#00FF80", "#8000FF", "#FFFFFF"
 ]
 SPEAKER_COLOR_NAMES = [
-    "Yellow", "Red", "Green", "Cyan", "Magenta",
     "Blue", "Orange", "Spring Green", "Purple", "White"
 ]
@@ -131,7 +142,7 @@ class AudioProcessor:
 class SpeakerChangeDetector:
-    """Speaker change detector with configurable number of speakers"""
     def __init__(self, embedding_dim=192, change_threshold=DEFAULT_CHANGE_THRESHOLD, max_speakers=DEFAULT_MAX_SPEAKERS):
         self.embedding_dim = embedding_dim
         self.change_threshold = change_threshold
@@ -245,28 +256,87 @@ class SpeakerChangeDetector:
         if 0 <= speaker_id < len(SPEAKER_COLORS):
             return SPEAKER_COLORS[speaker_id]
         return "#FFFFFF"
-class RealtimeASRDiarization:
-    """Main class for real-time ASR with speaker diarization"""
     def __init__(self):
         self.encoder = None
         self.audio_processor = None
         self.speaker_detector = None
         self.recorder = None
-        self.is_recording = False
         self.full_sentences = []
         self.sentence_speakers = []
         self.pending_sentences = []
         self.last_realtime_text = ""
-        self.sentence_queue = queue.Queue()
         self.change_threshold = DEFAULT_CHANGE_THRESHOLD
         self.max_speakers = DEFAULT_MAX_SPEAKERS
-        # Initialize model
-        self.initialize_model()
-    def initialize_model(self):
         """Initialize the speaker encoder model"""
         try:
             device_str = "cuda" if torch.cuda.is_available() else "cpu"
@@ -276,69 +346,95 @@ class RealtimeASRDiarization:
             success = self.encoder.load_model()
             if success:
-                print("ECAPA-TDNN model loaded successfully!")
                 self.audio_processor = AudioProcessor(self.encoder)
                 self.speaker_detector = SpeakerChangeDetector(
                     embedding_dim=self.encoder.embedding_dim,
                     change_threshold=self.change_threshold,
                     max_speakers=self.max_speakers
                 )
-                # Start sentence processing thread
-                self.sentence_thread = threading.Thread(target=self.process_sentences, daemon=True)
-                self.sentence_thread.start()
             else:
                 print("Failed to load ECAPA-TDNN model")
         except Exception as e:
             print(f"Model initialization error: {e}")
-    def process_sentences(self):
-        """Process sentences in background thread"""
-        while True:
             try:
-                text, audio_bytes = self.sentence_queue.get(timeout=1)
-                self.process_sentence(text, audio_bytes)
-            except queue.Empty:
-                continue
-    def process_sentence(self, text, audio_bytes):
-        """Process a sentence with speaker diarization"""
-        if self.audio_processor is None or self.speaker_detector is None:
-            return
-        try:
-            # Convert audio data to int16
-            audio_int16 = np.int16(audio_bytes * 32767)
-            # Extract speaker embedding
-            speaker_embedding = self.audio_processor.extract_embedding(audio_int16)
-            # Store sentence and embedding
-            self.full_sentences.append((text, speaker_embedding))
-            # Fill in any missing speaker assignments
-            while len(self.sentence_speakers) < len(self.full_sentences) - 1:
-                self.sentence_speakers.append(0)
-            # Detect speaker changes
-            speaker_id, similarity = self.speaker_detector.add_embedding(speaker_embedding)
-            self.sentence_speakers.append(speaker_id)
-            # Remove from pending
-            if text in self.pending_sentences:
-                self.pending_sentences.remove(text)
-        except Exception as e:
-            print(f"Error processing sentence: {e}")
-    def setup_recorder(self):
-        """Setup the audio recorder"""
         try:
             recorder_config = {
                 'spinner': False,
-                'use_microphone': False,
                 'model': FINAL_TRANSCRIPTION_MODEL,
                 'language': TRANSCRIPTION_LANGUAGE,
                 'silero_sensitivity': SILERO_SENSITIVITY,
@@ -356,119 +452,44 @@ class RealtimeASRDiarization:
                 'buffer_size': BUFFER_SIZE,
                 'sample_rate': SAMPLE_RATE,
             }
             self.recorder = AudioToTextRecorder(**recorder_config)
-            return True
-        except Exception as e:
-            print(f"Error setting up recorder: {e}")
-            return False
-    def live_text_detected(self, text):
-        """Handle live text detection"""
-        text = text.strip()
-        if not text:
-            return
-        sentence_delimiters = '.?!。'
-        prob_sentence_end = (
-            len(self.last_realtime_text) > 0
-            and text[-1] in sentence_delimiters
-            and self.last_realtime_text[-1] in sentence_delimiters
-        )
-        self.last_realtime_text = text
-        if prob_sentence_end:
-            if FAST_SENTENCE_END:
-                self.recorder.stop()
-            else:
-                self.recorder.post_speech_silence_duration = SILENCE_THRESHS[0]
-        else:
-            self.recorder.post_speech_silence_duration = SILENCE_THRESHS[1]
-    def process_audio_chunk(self, audio_chunk):
-        """Process incoming audio chunk from FastRTC"""
-        if self.recorder is None:
-            if not self.setup_recorder():
-                return "Failed to setup recorder"
-        try:
-            # Convert audio to the format expected by the recorder
-            if isinstance(audio_chunk, tuple):
-                sample_rate, audio_data = audio_chunk
-            else:
-                audio_data = audio_chunk
-                sample_rate = SAMPLE_RATE
-            # Ensure audio is in the right format
-            if audio_data.dtype != np.int16:
-                if audio_data.dtype == np.float32 or audio_data.dtype == np.float64:
-                    audio_data = (audio_data * 32767).astype(np.int16)
-                else:
-                    audio_data = audio_data.astype(np.int16)
-            # Convert to bytes and feed to recorder
-            audio_bytes = audio_data.tobytes()
-            self.recorder.feed_audio(audio_bytes)
-            # Process final text if available
-            def process_final_text(text):
-                text = text.strip()
-                if text:
-                    self.pending_sentences.append(text)
-                    audio_bytes = self.recorder.last_transcription_bytes
-                    self.sentence_queue.put((text, audio_bytes))
-            # Get transcription
-            self.recorder.text(process_final_text)
-            return self.get_formatted_transcript()
         except Exception as e:
-            print(f"Error processing audio: {e}")
-            return f"Error: {e}"
-    def get_formatted_transcript(self):
-        """Get formatted transcript with speaker labels"""
         try:
-            transcript_parts = []
-            # Add completed sentences with speaker labels
-            for i, (sentence_text, _) in enumerate(self.full_sentences):
-                if i < len(self.sentence_speakers):
-                    speaker_id = self.sentence_speakers[i]
-                    speaker_label = f"Speaker {speaker_id + 1}"
-                    transcript_parts.append(f"{speaker_label}: {sentence_text}")
-            # Add pending sentences
-            for pending in self.pending_sentences:
-                transcript_parts.append(f"[Processing]: {pending}")
-            # Add current live text
-            if self.last_realtime_text:
-                transcript_parts.append(f"[Live]: {self.last_realtime_text}")
-            return "\n".join(transcript_parts)
         except Exception as e:
-            print(f"Error formatting transcript: {e}")
-            return "Error formatting transcript"
-    def update_settings(self, change_threshold, max_speakers):
-        """Update diarization settings"""
-        self.change_threshold = change_threshold
-        self.max_speakers = max_speakers
-        if self.speaker_detector:
-            self.speaker_detector.set_change_threshold(change_threshold)
-            self.speaker_detector.set_max_speakers(max_speakers)
-    def clear_transcript(self):
-        """Clear all transcript data"""
         self.full_sentences = []
         self.sentence_speakers = []
         self.pending_sentences = []
         self.last_realtime_text = ""
         if self.speaker_detector:
@@ -477,122 +498,295 @@ class RealtimeASRDiarization:
                 change_threshold=self.change_threshold,
                 max_speakers=self.max_speakers
             )
 # Global instance
-asr_diarization = RealtimeASRDiarization()
-def process_audio_stream(audio_chunk, change_threshold, max_speakers):
-    """Process audio stream and return transcript"""
-    # Update settings if changed
-    asr_diarization.update_settings(change_threshold, max_speakers)
-    # Process audio
-    transcript = asr_diarization.process_audio_chunk(audio_chunk)
-    return transcript
-def clear_transcript():
-    """Clear the transcript"""
-    asr_diarization.clear_transcript()
-    return "Transcript cleared. Ready for new input..."
 def create_interface():
-    """Create Gradio interface with FastRTC"""
-    with gr.Blocks(title="Real-time Speaker Diarization") as iface:
-        gr.Markdown("# Real-time ASR with Speaker Diarization")
-        gr.Markdown("Speak into your microphone to see real-time transcription with speaker labels!")
         with gr.Row():
-            with gr.Column(scale=3):
-                # Audio input with FastRTC
                 audio_input = gr.Audio(
                     sources=["microphone"],
                     streaming=True,
-                    label="Microphone Input"
                 )
-                # Transcript output
-                transcript_output = gr.Textbox(
-                    label="Live Transcript with Speaker Labels",
-                    lines=15,
-                    max_lines=20,
-                    value="Ready to start transcription...",
-                    interactive=False
                 )
             with gr.Column(scale=1):
-                gr.Markdown("### Settings")
-                # Speaker change threshold
-                change_threshold = gr.Slider(
                     minimum=0.1,
                     maximum=0.95,
-                    value=DEFAULT_CHANGE_THRESHOLD,
                     step=0.05,
-                    label="Speaker Change Threshold",
                     info="Lower values = more sensitive to speaker changes"
                 )
-                # Max speakers
-                max_speakers = gr.Slider(
                     minimum=2,
                     maximum=ABSOLUTE_MAX_SPEAKERS,
-                    value=DEFAULT_MAX_SPEAKERS,
                     step=1,
-                    label="Maximum Speakers",
-                    info="Maximum number of speakers to detect"
                 )
-                # Clear button
-                clear_btn = gr.Button("Clear Transcript", variant="secondary")
-                gr.Markdown("### Speaker Colors")
-                color_info = "\\n".join([
-                    f"Speaker {i+1}: {SPEAKER_COLOR_NAMES[i]}"
-                    for i in range(min(DEFAULT_MAX_SPEAKERS, len(SPEAKER_COLOR_NAMES)))
-                ])
-                gr.Markdown(color_info)
-        # Set up streaming
-        audio_input.stream(
-            fn=process_audio_stream,
-            inputs=[audio_input, change_threshold, max_speakers],
-            outputs=[transcript_output],
-            show_progress=False
         )
-        # Clear button functionality
         clear_btn.click(
-            fn=clear_transcript,
-            outputs=[transcript_output]
         )
-        gr.Markdown("""
-        ### Instructions:
-        1. Allow microphone access when prompted
-        2. Start speaking - transcription will appear in real-time
-        3. Different speakers will be automatically detected and labeled
-        4. Adjust the threshold if speaker changes aren't detected properly
-        5. Use the clear button to reset the transcript
-        ### Notes:
-        - The system works best with clear audio and distinct speakers
-        - It may take a moment to load the speaker recognition model on first use
-        - Lower threshold values make the system more sensitive to speaker changes
-        """)
-    return iface
 if __name__ == "__main__":
-    # Create and launch the interface
-    iface = create_interface()
-    iface.launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=True

 import gradio as gr
 import numpy as np
+import queue
 import torch
 import time
+import threading
 import os
 import urllib.request
+import torchaudio
 from scipy.spatial.distance import cosine
 from RealtimeSTT import AudioToTextRecorder
+import json
+import io
+import wave
+# Simplified configuration parameters
 SILENCE_THRESHS = [0, 0.4]
 FINAL_TRANSCRIPTION_MODEL = "distil-large-v3"
 FINAL_BEAM_SIZE = 5
 DEFAULT_MAX_SPEAKERS = 4
 ABSOLUTE_MAX_SPEAKERS = 10
+# Global variables
 FAST_SENTENCE_END = True
 SAMPLE_RATE = 16000
 BUFFER_SIZE = 512
 CHANNELS = 1
+# Speaker colors
 SPEAKER_COLORS = [
+    "#FFFF00",  # Yellow
+    "#FF0000",  # Red
+    "#00FF00",  # Green
+    "#00FFFF",  # Cyan
+    "#FF00FF",  # Magenta
+    "#0000FF",  # Blue
+    "#FF8000",  # Orange
+    "#00FF80",  # Spring Green
+    "#8000FF",  # Purple
+    "#FFFFFF",  # White
 ]
 SPEAKER_COLOR_NAMES = [
+    "Yellow", "Red", "Green", "Cyan", "Magenta",
     "Blue", "Orange", "Spring Green", "Purple", "White"
 ]
 class SpeakerChangeDetector:
+    """Speaker change detector that supports a configurable number of speakers"""
     def __init__(self, embedding_dim=192, change_threshold=DEFAULT_CHANGE_THRESHOLD, max_speakers=DEFAULT_MAX_SPEAKERS):
         self.embedding_dim = embedding_dim
         self.change_threshold = change_threshold
         if 0 <= speaker_id < len(SPEAKER_COLORS):
             return SPEAKER_COLORS[speaker_id]
         return "#FFFFFF"
+    def get_status_info(self):
+        """Snapshot the detector's current state as a plain dict for status displays."""
+        # One count per speaker slot, indexed 0 .. max_speakers - 1.
+        counts = []
+        for slot in range(self.max_speakers):
+            counts.append(len(self.speaker_embeddings[slot]))
+        return dict(
+            current_speaker=self.current_speaker,
+            speaker_counts=counts,
+            active_speakers=len(self.active_speakers),
+            max_speakers=self.max_speakers,
+            last_similarity=self.last_similarity,
+            threshold=self.change_threshold,
+        )
+class WebRTCAudioProcessor:
+    """Buffer streamed audio chunks and forward them to the ASR recorder.
+
+    Incoming chunks are normalised to mono int16 PCM, accumulated, and handed
+    to ``diarization_system.recorder.feed_audio`` in pieces of ``sample_rate``
+    samples (one second). Every sample is fed exactly once.
+    """
+    def __init__(self, diarization_system):
+        self.diarization_system = diarization_system
+        self.audio_buffer = []  # pending int16 samples not yet fed to the recorder
+        self.buffer_lock = threading.Lock()
+        self.processing_thread = None
+        self.is_processing = False
+    @staticmethod
+    def _to_int16(samples):
+        """Return *samples* (an ndarray) as int16, scaling only floating-point input."""
+        if samples.dtype == np.int16:
+            return samples
+        if np.issubdtype(samples.dtype, np.floating):
+            # Float PCM is nominally in [-1.0, 1.0]; clip so out-of-range
+            # samples saturate instead of wrapping around in the cast.
+            return (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
+        # Other integer widths: multiplying by 32767 (the old behaviour)
+        # overflowed, so saturate to the int16 range instead.
+        return np.clip(samples, -32768, 32767).astype(np.int16)
+    def process_audio(self, audio_data, sample_rate):
+        """Normalise one incoming chunk and feed buffered audio to the recorder.
+
+        Args:
+            audio_data: raw int16 PCM ``bytes``, a ``(sample_rate, samples)``
+                tuple, or any sequence convertible to an int16 array.
+            sample_rate: samples per second; replaced by the tuple's own rate
+                when ``audio_data`` is a tuple.
+
+        Errors are printed rather than raised so one bad chunk does not break
+        the stream.
+        """
+        try:
+            if isinstance(audio_data, bytes):
+                audio_array = np.frombuffer(audio_data, dtype=np.int16)
+            elif isinstance(audio_data, tuple):
+                # Tuple format is (sample_rate, audio_array)
+                sample_rate, audio_array = audio_data
+                if isinstance(audio_array, np.ndarray):
+                    audio_array = self._to_int16(audio_array)
+                else:
+                    audio_array = np.array(audio_array, dtype=np.int16)
+            else:
+                audio_array = np.array(audio_data, dtype=np.int16)
+            # Ensure mono audio by keeping the first channel
+            if len(audio_array.shape) > 1:
+                audio_array = audio_array[:, 0]
+            chunk_len = int(sample_rate)
+            if chunk_len <= 0:
+                raise ValueError(f"invalid sample rate: {sample_rate}")
+            # NOTE(review): audio is forwarded at its incoming rate with no
+            # resampling, while the recorder is configured with SAMPLE_RATE.
+            # Confirm the stream really arrives at that rate.
+            with self.buffer_lock:
+                self.audio_buffer.extend(audio_array.tolist())
+                # Feed whole one-second chunks, inline on the calling thread.
+                # Consumed samples are removed from the buffer: the recorder is
+                # a streaming ASR, so re-feeding an overlap would duplicate
+                # audio and therefore transcribed text.
+                while len(self.audio_buffer) >= chunk_len:
+                    chunk = np.array(self.audio_buffer[:chunk_len], dtype=np.int16)
+                    del self.audio_buffer[:chunk_len]
+                    recorder = self.diarization_system.recorder
+                    if recorder:
+                        recorder.feed_audio(chunk.tobytes())
+        except Exception as e:
+            print(f"Error processing WebRTC audio: {e}")
+class RealtimeSpeakerDiarization:
     def __init__(self):
+        """Create an idle system; models and recorder are set up by later calls."""
+        # Components, all None until initialised elsewhere in the class.
         self.encoder = None
         self.audio_processor = None
         self.speaker_detector = None
         self.recorder = None
+        self.webrtc_processor = None
+        # (text, audio) tuples waiting for speaker identification.
+        self.sentence_queue = queue.Queue()
+        # full_sentences holds (text, embedding) tuples; sentence_speakers
+        # holds the speaker id for the sentence at the same index.
         self.full_sentences = []
         self.sentence_speakers = []
+        # Texts that were transcribed but not yet assigned a speaker.
         self.pending_sentences = []
+        # Only reset in the visible code, never read.
+        self.displayed_text = ""
         self.last_realtime_text = ""
+        # Gates the background loops; set True on start, False on stop.
+        self.is_running = False
         self.change_threshold = DEFAULT_CHANGE_THRESHOLD
         self.max_speakers = DEFAULT_MAX_SPEAKERS
+    def initialize_models(self):
         """Initialize the speaker encoder model"""
         try:
             device_str = "cuda" if torch.cuda.is_available() else "cpu"
             success = self.encoder.load_model()
             if success:
                 self.audio_processor = AudioProcessor(self.encoder)
                 self.speaker_detector = SpeakerChangeDetector(
                     embedding_dim=self.encoder.embedding_dim,
                     change_threshold=self.change_threshold,
                     max_speakers=self.max_speakers
                 )
+                self.webrtc_processor = WebRTCAudioProcessor(self)
+                print("ECAPA-TDNN model loaded successfully!")
+                return True
             else:
                 print("Failed to load ECAPA-TDNN model")
+                return False
         except Exception as e:
             print(f"Model initialization error: {e}")
+            return False
+    def live_text_detected(self, text):
+        """React to a partial transcription by tuning how soon the recorder finalises.
+
+        A sentence is considered probably finished when both this update and
+        the previous one end in a sentence delimiter.
+        """
+        cleaned = text.strip()
+        if not cleaned:
+            return
+        enders = '.?!。'
+        previous = self.last_realtime_text
+        looks_finished = (
+            bool(previous)
+            and cleaned[-1] in enders
+            and previous[-1] in enders
+        )
+        self.last_realtime_text = cleaned
+        if not looks_finished:
+            self.recorder.post_speech_silence_duration = SILENCE_THRESHS[1]
+        elif FAST_SENTENCE_END:
+            self.recorder.stop()
+        else:
+            self.recorder.post_speech_silence_duration = SILENCE_THRESHS[0]
+    def process_final_text(self, text):
+        """Queue a finalised sentence for speaker identification.
+
+        Blank text is ignored. Errors are printed, not raised, because this
+        runs as a recorder callback.
+        """
+        text = text.strip()
+        if text:
             try:
+                bytes_data = self.recorder.last_transcription_bytes
+                # Mark the sentence as pending BEFORE queueing it. The worker
+                # removes it from pending_sentences when done; if it finished
+                # before the append, the entry would never be removed.
+                self.pending_sentences.append(text)
+                self.sentence_queue.put((text, bytes_data))
+            except Exception as e:
+                print(f"Error processing final text: {e}")
+    def process_sentence_queue(self):
+        """Worker loop: assign a speaker to each queued sentence until stopped."""
+        while self.is_running:
+            try:
+                sentence, samples = self.sentence_queue.get(timeout=1)
+                # Scale the audio to int16 and embed it
+                pcm = np.int16(samples * 32767)
+                embedding = self.audio_processor.extract_embedding(pcm)
+                self.full_sentences.append((sentence, embedding))
+                # Pad speaker assignments so the new id lands at this sentence's index
+                missing = len(self.full_sentences) - 1 - len(self.sentence_speakers)
+                if missing > 0:
+                    self.sentence_speakers.extend([0] * missing)
+                speaker_id, similarity = self.speaker_detector.add_embedding(embedding)
+                self.sentence_speakers.append(speaker_id)
+                # The sentence is no longer pending
+                if sentence in self.pending_sentences:
+                    self.pending_sentences.remove(sentence)
+            except queue.Empty:
+                # Nothing queued within the timeout; re-check is_running
+                continue
+            except Exception as e:
+                print(f"Error processing sentence: {e}")
+    def start_recording(self):
+        """Start the recording and transcription process"""
+        if self.encoder is None:
+            return "Please initialize models first!"
         try:
+            # Setup recorder configuration for WebRTC input
             recorder_config = {
                 'spinner': False,
+                'use_microphone': False,  # We'll feed audio manually
                 'model': FINAL_TRANSCRIPTION_MODEL,
                 'language': TRANSCRIPTION_LANGUAGE,
                 'silero_sensitivity': SILERO_SENSITIVITY,
                 'buffer_size': BUFFER_SIZE,
                 'sample_rate': SAMPLE_RATE,
             }
             self.recorder = AudioToTextRecorder(**recorder_config)
+            # Start sentence processing thread
+            self.is_running = True
+            self.sentence_thread = threading.Thread(target=self.process_sentence_queue, daemon=True)
+            self.sentence_thread.start()
+            # Start transcription thread
+            self.transcription_thread = threading.Thread(target=self.run_transcription, daemon=True)
+            self.transcription_thread.start()
+            return "Recording started successfully! WebRTC audio input ready."
         except Exception as e:
+            return f"Error starting recording: {e}"
+    def run_transcription(self):
+        """Repeatedly ask the recorder for text until ``is_running`` is cleared.
+
+        Each finalised text is passed to ``process_final_text``. Any exception
+        ends the loop and is printed.
+        """
         try:
+            while self.is_running:
+                self.recorder.text(self.process_final_text)
         except Exception as e:
+            print(f"Transcription error: {e}")
+    def stop_recording(self):
+        """Signal the background loops to end and stop the recorder, if any."""
+        self.is_running = False
+        recorder = self.recorder
+        if recorder:
+            recorder.stop()
+        return "Recording stopped!"
+    def clear_conversation(self):
+        """Clear all conversation data"""
         self.full_sentences = []
         self.sentence_speakers = []
         self.pending_sentences = []
+        self.displayed_text = ""
         self.last_realtime_text = ""
         if self.speaker_detector:
                 change_threshold=self.change_threshold,
                 max_speakers=self.max_speakers
             )
+        return "Conversation cleared!"
+    def update_settings(self, threshold, max_speakers):
+        """Store new detection settings and push them to the detector if it exists.
+
+        Returns a human-readable confirmation string.
+        """
+        self.change_threshold, self.max_speakers = threshold, max_speakers
+        detector = self.speaker_detector
+        if detector:
+            detector.set_change_threshold(threshold)
+            detector.set_max_speakers(max_speakers)
+        return f"Settings updated: Threshold={threshold:.2f}, Max Speakers={max_speakers}"
+    def get_formatted_conversation(self):
+        """Render the conversation as HTML, one coloured span per sentence.
+
+        Completed sentences are labelled with their speaker. Sentences with no
+        speaker assigned yet are shown in white as "Speaker ?". Pending
+        sentences are appended with a "Processing:" label.
+
+        Returns the HTML string, a placeholder when there is nothing to show,
+        or an error message if formatting fails.
+        """
+        import html  # local import: only needed for escaping here
+        try:
+            sentences_with_style = []
+            # Process completed sentences
+            for i, (sentence_text, _) in enumerate(self.full_sentences):
+                if i < len(self.sentence_speakers):
+                    speaker_id = self.sentence_speakers[i]
+                    color = self.speaker_detector.get_color_for_speaker(speaker_id)
+                    speaker_name = f"Speaker {speaker_id + 1}"
+                else:
+                    # No speaker assigned yet. speaker_name used to be left
+                    # unset here, giving a NameError on the first sentence or
+                    # reusing the previous sentence's label.
+                    color = "#FFFFFF"
+                    speaker_name = "Speaker ?"
+                # Escape transcript text so characters like '<' or '&' cannot
+                # break the surrounding markup.
+                sentences_with_style.append(
+                    f'<span style="color:{color};"><b>{speaker_name}:</b> {html.escape(sentence_text)}</span>')
+            # Add pending sentences
+            for pending_sentence in self.pending_sentences:
+                sentences_with_style.append(
+                    f'<span style="color:#60FFFF;"><b>Processing:</b> {html.escape(pending_sentence)}</span>')
+            if sentences_with_style:
+                return "<br><br>".join(sentences_with_style)
+            return "Waiting for speech input..."
+        except Exception as e:
+            return f"Error formatting conversation: {e}"
+    def get_status_info(self):
+        """Build a multi-line, Markdown-style status report for the UI.
+
+        Returns a fixed message when no detector exists and an error message
+        if the report cannot be built.
+        """
+        if not self.speaker_detector:
+            return "Speaker detector not initialized"
+        try:
+            status = self.speaker_detector.get_status_info()
+            status_lines = []
+            status_lines.append(f"**Current Speaker:** {status['current_speaker'] + 1}")
+            status_lines.append(f"**Active Speakers:** {status['active_speakers']} of {status['max_speakers']}")
+            status_lines.append(f"**Last Similarity:** {status['last_similarity']:.3f}")
+            status_lines.append(f"**Change Threshold:** {status['threshold']:.2f}")
+            status_lines.append(f"**Total Sentences:** {len(self.full_sentences)}")
+            status_lines.append("")
+            status_lines.append("**Speaker Segment Counts:**")
+            # One line per speaker slot, falling back to a generic label when
+            # there is no colour name for the slot.
+            for i in range(status['max_speakers']):
+                if i < len(SPEAKER_COLOR_NAMES):
+                    color_name = SPEAKER_COLOR_NAMES[i]
+                else:
+                    color_name = f"Speaker {i+1}"
+                status_lines.append(f"Speaker {i+1} ({color_name}): {status['speaker_counts'][i]}")
+            return "\n".join(status_lines)
+        except Exception as e:
+            return f"Error getting status: {e}"
 # Global instance
+# Shared by the module-level callback functions below.
+diarization_system = RealtimeSpeakerDiarization()
+def initialize_system():
+    """Load the models on the global system and report the outcome as a message."""
+    if diarization_system.initialize_models():
+        return "✅ System initialized successfully! Models loaded."
+    return "❌ Failed to initialize system. Please check the logs."
+def start_recording():
+    """Start recording and transcription on the global system.
+
+    Returns the status message produced by ``diarization_system.start_recording()``.
+    """
+    return diarization_system.start_recording()
+def stop_recording():
+    """Stop recording and transcription on the global system.
+
+    Returns the status message produced by ``diarization_system.stop_recording()``.
+    """
+    return diarization_system.stop_recording()
+def clear_conversation():
+    """Clear the conversation held by the global system.
+
+    Returns the status message produced by ``diarization_system.clear_conversation()``.
+    """
+    return diarization_system.clear_conversation()
+def update_settings(threshold, max_speakers):
+    """Apply a new change threshold and speaker limit to the global system.
+
+    Returns the confirmation message produced by
+    ``diarization_system.update_settings()``.
+    """
+    return diarization_system.update_settings(threshold, max_speakers)
+def get_conversation():
+    """Return the current conversation as formatted by the global system."""
+    return diarization_system.get_formatted_conversation()
+def get_status():
+    """Return the global system's status report as text."""
+    return diarization_system.get_status_info()
+def process_audio_stream(audio):
+    """Forward one streamed audio chunk to the processor while recording is active.
+
+    Always returns None; chunks arriving before initialisation or while
+    stopped are dropped.
+    """
+    processor = diarization_system.webrtc_processor
+    if not (processor and diarization_system.is_running):
+        return None
+    processor.process_audio(audio, SAMPLE_RATE)
+    return None
+# Create Gradio interface
 def create_interface():
+    """Build and return the Gradio Blocks app for live diarization.
+
+    The layout has a streaming microphone input, the conversation view,
+    control buttons and a status box on the left, and settings, instructions
+    and the speaker colour legend on the right.
+    """
+    # gr.themes has no "Dark" theme, so the previous gr.themes.Dark() call
+    # raised AttributeError before the UI could be built. Use the default.
+    with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Default()) as app:
+        gr.Markdown("# 🎤 Real-time Speech Recognition with Speaker Diarization")
+        gr.Markdown("This app performs real-time speech recognition with automatic speaker identification and color-coding using WebRTC.")
         with gr.Row():
+            with gr.Column(scale=2):
+                # WebRTC Audio Input
                 audio_input = gr.Audio(
                     sources=["microphone"],
                     streaming=True,
+                    label="🎙️ Microphone Input",
+                    type="numpy"
                 )
+                # Main conversation display
+                conversation_output = gr.HTML(
+                    value="<i>Click 'Initialize System' to start...</i>",
+                    label="Live Conversation"
                 )
+                # Control buttons
+                with gr.Row():
+                    init_btn = gr.Button("🔧 Initialize System", variant="secondary")
+                    start_btn = gr.Button("🎙️ Start Recording", variant="primary", interactive=False)
+                    stop_btn = gr.Button("⏹️ Stop Recording", variant="stop", interactive=False)
+                    clear_btn = gr.Button("🗑️ Clear Conversation", interactive=False)
+                # Status display
+                status_output = gr.Textbox(
+                    label="System Status",
+                    value="System not initialized",
+                    lines=8,
+                    interactive=False
+                )
             with gr.Column(scale=1):
+                # Settings panel
+                gr.Markdown("## ⚙️ Settings")
+                threshold_slider = gr.Slider(
                     minimum=0.1,
                     maximum=0.95,
                     step=0.05,
+                    value=DEFAULT_CHANGE_THRESHOLD,
+                    label="Speaker Change Sensitivity",
                     info="Lower values = more sensitive to speaker changes"
                 )
+                max_speakers_slider = gr.Slider(
                     minimum=2,
                     maximum=ABSOLUTE_MAX_SPEAKERS,
                     step=1,
+                    value=DEFAULT_MAX_SPEAKERS,
+                    label="Maximum Number of Speakers"
                 )
+                update_settings_btn = gr.Button("Update Settings")
+                # Instructions
+                gr.Markdown("## 📝 Instructions")
+                gr.Markdown("""
+                1. Click **Initialize System** to load models
+                2. Click **Start Recording** to begin processing
+                3. Allow microphone access when prompted
+                4. Speak into your microphone
+                5. Watch real-time transcription with speaker labels
+                6. Adjust settings as needed
+                """)
+                # Speaker color legend
+                gr.Markdown("## 🎨 Speaker Colors")
+                color_info = []
+                for i, (color, name) in enumerate(zip(SPEAKER_COLORS, SPEAKER_COLOR_NAMES)):
+                    color_info.append(f'<span style="color:{color};">■</span> Speaker {i+1} ({name})')
+                gr.HTML("<br>".join(color_info[:DEFAULT_MAX_SPEAKERS]))
+        # Auto-refresh conversation and status
+        def refresh_display():
+            return get_conversation(), get_status()
+        # Event handlers
+        def on_initialize():
+            result = initialize_system()
+            ready = "successfully" in result
+            # status_output appears once in the outputs below. It used to be
+            # listed twice, so the init result was overwritten by the status
+            # report and a failure message was never shown.
+            return (
+                f"{result}\n\n{get_status()}",
+                gr.update(interactive=ready),  # start_btn
+                gr.update(interactive=ready),  # clear_btn
+                get_conversation(),
+            )
+        def on_start():
+            result = start_recording()
+            return (
+                result,
+                gr.update(interactive=False),  # start_btn
+                gr.update(interactive=True),   # stop_btn
+            )
+        def on_stop():
+            result = stop_recording()
+            return (
+                result,
+                gr.update(interactive=True),   # start_btn
+                gr.update(interactive=False),  # stop_btn
+            )
+        # Connect event handlers
+        init_btn.click(
+            on_initialize,
+            outputs=[status_output, start_btn, clear_btn, conversation_output]
+        )
+        start_btn.click(
+            on_start,
+            outputs=[status_output, start_btn, stop_btn]
+        )
+        stop_btn.click(
+            on_stop,
+            outputs=[status_output, start_btn, stop_btn]
         )
         clear_btn.click(
+            clear_conversation,
+            outputs=[status_output]
+        )
+        update_settings_btn.click(
+            update_settings,
+            inputs=[threshold_slider, max_speakers_slider],
+            outputs=[status_output]
         )
+        # Connect WebRTC audio stream to processing
+        audio_input.stream(
+            process_audio_stream,
+            inputs=[audio_input],
+            outputs=[]
+        )
+        # Auto-refresh every 2 seconds. The timer ticks whether or not
+        # recording is active, so it also replaces any message the buttons
+        # wrote to status_output.
+        refresh_timer = gr.Timer(2.0)
+        refresh_timer.tick(
+            refresh_display,
+            outputs=[conversation_output, status_output]
+        )
+    return app
 if __name__ == "__main__":
+    app = create_interface()
+    app.launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=True