Real_Time_diarization

Sleeping

App Files Files Community

Saiyaswanth007 committed 17 days ago

Commit

91b17d7

1 Parent(s): 89ba8a1

Check point 4

Browse files

Files changed (1) hide show

app.py +93 -85

app.py CHANGED Viewed

@@ -330,10 +330,47 @@ class RealtimeSpeakerDiarization:
             logger.error(f"Model initialization error: {e}")
             return False
     def live_text_detected(self, text):
         """Callback for real-time transcription updates"""
         with self.transcription_lock:
             self.last_transcription = text.strip()
     def process_final_text(self, text):
         """Process final transcribed text with speaker embedding"""
@@ -419,7 +456,7 @@ class RealtimeSpeakerDiarization:
             # Setup recorder configuration
             recorder_config = {
                 'spinner': False,
-                'use_microphone': False,  # Must be False since we're using FastRTC
                 'model': FINAL_TRANSCRIPTION_MODEL,
                 'language': TRANSCRIPTION_LANGUAGE,
                 'silero_sensitivity': SILERO_SENSITIVITY,
@@ -429,7 +466,7 @@ class RealtimeSpeakerDiarization:
                 'pre_recording_buffer_duration': PRE_RECORDING_BUFFER_DURATION,
                 'min_gap_between_recordings': 0,
                 'enable_realtime_transcription': True,
-                'realtime_processing_pause': 0.05,  # Faster updates for live transcription
                 'realtime_model_type': REALTIME_TRANSCRIPTION_MODEL,
                 'on_realtime_transcription_update': self.live_text_detected,
                 'beam_size': FINAL_BEAM_SIZE,
@@ -447,8 +484,7 @@ class RealtimeSpeakerDiarization:
             self.transcription_thread = threading.Thread(target=self.run_transcription, daemon=True)
             self.transcription_thread.start()
-            logger.info("Recording started with FastRTC integration")
-            return "Recording started successfully! Speak now..."
         except Exception as e:
             logger.error(f"Error starting recording: {e}")
@@ -587,26 +623,35 @@ class DiarizationHandler(AsyncStreamHandler):
                 return
             # Extract audio data
-            if hasattr(frame, 'data'):
-                audio_data = frame.data
-            else:
-                audio_data = frame
-            # Convert to numpy array
-            if isinstance(audio_data, bytes):
-                audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
-            elif isinstance(audio_data, tuple) and len(audio_data) >= 2:
-                sample_rate, data = audio_data
-                audio_array = np.array(data, dtype=np.float32)
-            elif isinstance(audio_data, (list, tuple)):
-                audio_array = np.array(audio_data, dtype=np.float32)
             else:
-                audio_array = np.array(audio_data, dtype=np.float32)
             # Ensure 1D
             if len(audio_array.shape) > 1:
                 audio_array = audio_array.flatten()
             # Buffer audio chunks
             self.audio_buffer.extend(audio_array)
@@ -615,16 +660,8 @@ class DiarizationHandler(AsyncStreamHandler):
                 chunk = np.array(self.audio_buffer[:self.buffer_size])
                 self.audio_buffer = self.audio_buffer[self.buffer_size:]
-                # Process both for speaker detection and feed to the recorder for transcription
                 await self.process_audio_async(chunk)
-                # If recorder exists, feed audio for transcription
-                if self.diarization_system.recorder:
-                    # Convert to bytes for the recorder's audio buffer
-                    audio_bytes = (chunk * 32768.0).astype(np.int16).tobytes()
-                    if hasattr(self.diarization_system.recorder, '_handle_audio'):
-                        # Send audio to the recorder's audio buffer
-                        self.diarization_system.recorder._handle_audio(audio_bytes)
         except Exception as e:
             logger.error(f"Error in FastRTC receive: {e}")
@@ -643,17 +680,18 @@ class DiarizationHandler(AsyncStreamHandler):
             logger.error(f"Error in async audio processing: {e}")
     async def start_up(self):
-        """Called when stream starts"""
-        logger.info("FastRTC stream handler started")
     async def shutdown(self):
-        """Called when stream ends"""
-        logger.info("FastRTC stream handler shutdown")
 # Global instances
 diarization_system = RealtimeSpeakerDiarization()
-audio_handler = None
 def initialize_system():
     """Initialize the diarization system"""
@@ -661,14 +699,20 @@ def initialize_system():
     try:
         success = diarization_system.initialize_models()
         if success:
-            # Create a fresh handler that uses our diarization system
             handler = DiarizationHandler(diarization_system)
             # Update the Stream's handler
-            stream.handler = handler
-            logger.info("FastRTC handler initialized successfully")
-            return "✅ System initialized successfully! Click 'Start' to begin recording."
         else:
             return "❌ Failed to initialize system. Check logs for details."
     except Exception as e:
@@ -685,8 +729,7 @@ def start_recording():
 def on_start():
     result = start_recording()
-    # When starting recording, update UI and return WebRTC component with autostart=True
-    return result, gr.update(interactive=False), gr.update(interactive=True), gr.update(autostart=True)
 def stop_recording():
     """Stop recording and transcription"""
@@ -726,15 +769,6 @@ def get_status():
     except Exception as e:
         return f"Error getting status: {str(e)}"
-def refresh_conversation():
-    """Get the current conversation and update live transcription status"""
-    has_live = diarization_system.last_transcription != ""
-    status = "🟢 **Live Transcription Status:** Active" if has_live else "🟠 **Live Transcription Status:** Ready (No speech detected)"
-    if not diarization_system.is_running:
-        status = "🔴 **Live Transcription Status:** Not running"
-    return get_conversation(), status
 # Create Gradio interface
 def create_interface():
     with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Soft()) as interface:
@@ -743,17 +777,12 @@ def create_interface():
         with gr.Row():
             with gr.Column(scale=2):
-                # Replace standard Gradio audio with FastRTC WebRTC component
                 audio_component = WebRTC(
-                    stream=stream,
-                    label="Audio Input (FastRTC)",
-                    show_audio_waveform=True,
-                    autostart=False,
-                )
-                # Add live transcription status indicator
-                live_transcription_status = gr.Markdown(
-                    "🔴 **Live Transcription Status:** Waiting to initialize...",
                 )
                 # Conversation display
@@ -829,8 +858,7 @@ def create_interface():
         def on_start():
             result = start_recording()
-            # When starting recording, update UI and return WebRTC component with autostart=True
-            return result, gr.update(interactive=False), gr.update(interactive=True), gr.update(autostart=True)
         def on_stop():
             result = stop_recording()
@@ -858,7 +886,7 @@ def create_interface():
         start_btn.click(
             fn=on_start,
-            outputs=[status_output, start_btn, stop_btn, audio_component]
         )
         stop_btn.click(
@@ -879,7 +907,7 @@ def create_interface():
         # Auto-refresh conversation display every 1 second
         conversation_timer = gr.Timer(1)
-        conversation_timer.tick(refresh_conversation, outputs=[conversation_output, live_transcription_status])
         # Auto-refresh status every 2 seconds
         status_timer = gr.Timer(2)
@@ -891,29 +919,9 @@ def create_interface():
 # FastAPI setup for FastRTC integration
 app = FastAPI()
-# Create a placeholder handler - will be properly initialized later
-class DefaultHandler(AsyncStreamHandler):
-    def __init__(self):
-        super().__init__()
-    async def receive(self, frame):
-        pass
-    async def emit(self):
-        return None
-    def copy(self):
-        return DefaultHandler()
-    async def shutdown(self):
-        pass
-    async def start_up(self):
-        pass
-# Initialize with placeholder handler
-stream = Stream(handler=DefaultHandler(), modality="audio", mode="send-receive")
-stream.mount(app)
 @app.get("/")
 async def root():

             logger.error(f"Model initialization error: {e}")
             return False
+    def feed_audio(self, audio_data):
+        """Feed audio data directly to the recorder for live transcription"""
+        if not self.is_running or not self.recorder:
+            return
+        try:
+            # Normalize if needed
+            if isinstance(audio_data, np.ndarray):
+                if audio_data.dtype != np.float32:
+                    audio_data = audio_data.astype(np.float32)
+                # Convert to int16 for the recorder
+                audio_int16 = (audio_data * 32767).astype(np.int16)
+                audio_bytes = audio_int16.tobytes()
+                # Feed to recorder
+                self.recorder.feed_audio(audio_bytes)
+                # Also process for speaker detection
+                self.process_audio_chunk(audio_data)
+            elif isinstance(audio_data, bytes):
+                # Feed raw bytes directly
+                self.recorder.feed_audio(audio_data)
+                # Convert to float for speaker detection
+                audio_int16 = np.frombuffer(audio_data, dtype=np.int16)
+                audio_float = audio_int16.astype(np.float32) / 32768.0
+                self.process_audio_chunk(audio_float)
+            logger.debug("Audio fed to recorder")
+        except Exception as e:
+            logger.error(f"Error feeding audio: {e}")
     def live_text_detected(self, text):
         """Callback for real-time transcription updates"""
         with self.transcription_lock:  # hold the lock only while last_transcription is replaced
             self.last_transcription = text.strip()
+        # Refresh the display on every new transcription; this runs after the lock is released
+        self.update_conversation_display()
     def process_final_text(self, text):
         """Process final transcribed text with speaker embedding"""
             # Setup recorder configuration
             recorder_config = {
                 'spinner': False,
+                'use_microphone': False,  # Change to False for Hugging Face Spaces
                 'model': FINAL_TRANSCRIPTION_MODEL,
                 'language': TRANSCRIPTION_LANGUAGE,
                 'silero_sensitivity': SILERO_SENSITIVITY,
                 'pre_recording_buffer_duration': PRE_RECORDING_BUFFER_DURATION,
                 'min_gap_between_recordings': 0,
                 'enable_realtime_transcription': True,
+                'realtime_processing_pause': 0.1,
                 'realtime_model_type': REALTIME_TRANSCRIPTION_MODEL,
                 'on_realtime_transcription_update': self.live_text_detected,
                 'beam_size': FINAL_BEAM_SIZE,
             self.transcription_thread = threading.Thread(target=self.run_transcription, daemon=True)
             self.transcription_thread.start()
+            return "Recording started successfully!"
         except Exception as e:
             logger.error(f"Error starting recording: {e}")
                 return
             # Extract audio data
+            audio_data = getattr(frame, 'data', frame)
+            # Check if this is a tuple (sample_rate, audio_array)
+            if isinstance(audio_data, tuple) and len(audio_data) >= 2:
+                sample_rate, audio_array = audio_data
             else:
+                # If not a tuple, assume it's raw audio bytes/array
+                sample_rate = SAMPLE_RATE  # Use default sample rate
+                # Convert to numpy array
+                if isinstance(audio_data, bytes):
+                    audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
+                elif isinstance(audio_data, (list, tuple)):
+                    audio_array = np.array(audio_data, dtype=np.float32)
+                else:
+                    audio_array = np.array(audio_data, dtype=np.float32)
             # Ensure 1D
             if len(audio_array.shape) > 1:
                 audio_array = audio_array.flatten()
+            # Send audio to recorder for live transcription
+            if self.diarization_system.recorder:
+                try:
+                    self.diarization_system.recorder.feed_audio(audio_array)
+                    logger.info("Fed audio to recorder")
+                except Exception as e:
+                    logger.error(f"Error feeding audio to recorder: {e}")
             # Buffer audio chunks
             self.audio_buffer.extend(audio_array)
                 chunk = np.array(self.audio_buffer[:self.buffer_size])
                 self.audio_buffer = self.audio_buffer[self.buffer_size:]
+                # Process asynchronously
                 await self.process_audio_async(chunk)
         except Exception as e:
             logger.error(f"Error in FastRTC receive: {e}")
             logger.error(f"Error in async audio processing: {e}")
     async def start_up(self):  # start-up hook; the previous revision documented it as "Called when stream starts"
+        logger.info("DiarizationHandler started")  # only logs; no setup work is done here
     async def shutdown(self):  # shutdown hook; the previous revision documented it as "Called when stream ends"
+        logger.info("DiarizationHandler shutdown")  # only logs; no teardown work is done here
 # Global instances
 diarization_system = RealtimeSpeakerDiarization()
+# Placeholder only: the Stream is constructed inside initialize_system()
+# after the models have loaded successfully
+stream = None  # NOTE(review): confirm initialize_system() declares `global stream`; otherwise this name stays None
 def initialize_system():
     """Initialize the diarization system"""
     try:
         success = diarization_system.initialize_models()
         if success:
+            # Create a DiarizationHandler linked to our system
             handler = DiarizationHandler(diarization_system)
             # Update the Stream's handler
+            stream = Stream(
+                handler=handler,
+                modality="audio",
+                mode="send-receive",
+                stream_name="audio_stream"  # Match the stream_name in WebRTC component
+            )
+            # Mount the stream to the FastAPI app
+            stream.mount(app)
+            return "✅ System initialized successfully!"
         else:
             return "❌ Failed to initialize system. Check logs for details."
     except Exception as e:
 def on_start():
     result = start_recording()
+    return result, gr.update(interactive=False), gr.update(interactive=True)
 def stop_recording():
     """Stop recording and transcription"""
     except Exception as e:
         return f"Error getting status: {str(e)}"
 # Create Gradio interface
 def create_interface():
     with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Soft()) as interface:
         with gr.Row():
             with gr.Column(scale=2):
+                # Replace standard Audio with WebRTC component
                 audio_component = WebRTC(
+                    label="Audio Input",
+                    stream_name="audio_stream",
+                    modality="audio",
+                    mode="send-receive"
                 )
                 # Conversation display
         def on_start():
             result = start_recording()
+            return result, gr.update(interactive=False), gr.update(interactive=True)
         def on_stop():
             result = stop_recording()
         start_btn.click(
             fn=on_start,
+            outputs=[status_output, start_btn, stop_btn]
         )
         stop_btn.click(
         # Auto-refresh conversation display every 1 second
         conversation_timer = gr.Timer(1)
+        conversation_timer.tick(refresh_conversation, outputs=[conversation_output])
         # Auto-refresh status every 2 seconds
         status_timer = gr.Timer(2)
 # FastAPI setup for FastRTC integration
 app = FastAPI()  # initialize_system() mounts the stream onto this app
+# Placeholder only: the Stream is constructed inside initialize_system()
+# after the models have loaded successfully
+stream = None  # NOTE(review): `stream` is also set to None next to diarization_system above; one of the two is redundant
 @app.get("/")
 async def root():