Commit 876be23
Parent(s): 534a53d

Check point 4

app.py CHANGED
@@ -10,12 +10,13 @@ import torchaudio
 from scipy.spatial.distance import cosine
 from RealtimeSTT import AudioToTextRecorder
 from fastapi import FastAPI, APIRouter
-from fastrtc import Stream, AsyncStreamHandler
+from fastrtc import Stream, AsyncStreamHandler
 import json
 import asyncio
 import uvicorn
 from queue import Queue
 import logging
+from gradio_webrtc import WebRTC
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -330,10 +331,47 @@ class RealtimeSpeakerDiarization:
             logger.error(f"Model initialization error: {e}")
             return False
 
+    def feed_audio(self, audio_data):
+        """Feed audio data directly to the recorder for live transcription"""
+        if not self.is_running or not self.recorder:
+            return
+
+        try:
+            # Normalize if needed
+            if isinstance(audio_data, np.ndarray):
+                if audio_data.dtype != np.float32:
+                    audio_data = audio_data.astype(np.float32)
+
+                # Convert to int16 for the recorder
+                audio_int16 = (audio_data * 32767).astype(np.int16)
+                audio_bytes = audio_int16.tobytes()
+
+                # Feed to recorder
+                self.recorder.feed_audio(audio_bytes)
+
+                # Also process for speaker detection
+                self.process_audio_chunk(audio_data)
+
+            elif isinstance(audio_data, bytes):
+                # Feed raw bytes directly
+                self.recorder.feed_audio(audio_data)
+
+                # Convert to float for speaker detection
+                audio_int16 = np.frombuffer(audio_data, dtype=np.int16)
+                audio_float = audio_int16.astype(np.float32) / 32768.0
+                self.process_audio_chunk(audio_float)
+
+            logger.debug("Audio fed to recorder")
+        except Exception as e:
+            logger.error(f"Error feeding audio: {e}")
+
     def live_text_detected(self, text):
         """Callback for real-time transcription updates"""
         with self.transcription_lock:
             self.last_transcription = text.strip()
+
+            # Update the display immediately on new transcription
+            self.update_conversation_display()
 
     def process_final_text(self, text):
         """Process final transcribed text with speaker embedding"""
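Note on the new feed_audio method: it bridges two audio representations, float32 arrays in [-1.0, 1.0] (the WebRTC side) and raw int16 PCM bytes (what the RealtimeSTT recorder consumes). The round trip can be sanity-checked in isolation; a minimal sketch in plain NumPy that mirrors the two conversion paths above (the 440 Hz test tone and 16 kHz rate are illustrative assumptions, not values from this commit):

import numpy as np

# A 100 ms, 440 Hz test tone in float32, standing in for a WebRTC chunk (assumed 16 kHz).
sr = 16000
t = np.arange(int(0.1 * sr)) / sr
chunk = (0.5 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)

# float32 [-1.0, 1.0] -> int16 bytes, as done before self.recorder.feed_audio(...)
audio_int16 = (chunk * 32767).astype(np.int16)
raw = audio_int16.tobytes()

# bytes -> float32, the inverse path used for speaker detection
restored = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0

# Mixing the 32767/32768 conventions costs at most about one quantization step.
assert np.abs(restored - chunk).max() < 2.0 / 32768

The asymmetric constants (multiply by 32767, divide by 32768) are both common conventions; mixing them, as the method does, loses at most roughly one quantization step per sample, as the assertion shows.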
@@ -652,7 +690,9 @@ class DiarizationHandler(AsyncStreamHandler):
 # Global instances
 diarization_system = RealtimeSpeakerDiarization()
 
-# We'll initialize the stream
+# We'll initialize the stream in initialize_system()
+# For now, just create a placeholder
+stream = None
 
 def initialize_system():
     """Initialize the diarization system"""
@@ -666,9 +706,12 @@ def initialize_system():
         stream = Stream(
             handler=handler,
             modality="audio",
-            mode="send-receive"
-            stream_name="audio_stream"  # Match the stream_name in WebRTC component
+            mode="send-receive"
         )
+
+        # Mount the stream to the FastAPI app
+        stream.mount(app)
+
         return "✅ System initialized successfully!"
     else:
         return "❌ Failed to initialize system. Check logs for details."
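One caveat on the flow above: initialize_system() assigns stream = Stream(...) in its local scope, so without a global stream declaration the module-level stream = None placeholder added in the previous hunk is never replaced. A minimal sketch of the intended shape; diarization_system and DiarizationHandler are the instance and class defined elsewhere in app.py, and the initialize() success check is a hypothetical stand-in for the condition elided from the hunk:

from fastapi import FastAPI
from fastrtc import Stream

app = FastAPI()
stream = None  # module-level placeholder, as in the diff


def initialize_system():
    """Initialize the diarization system and mount the WebRTC stream."""
    global stream  # without this, the assignment below only binds a local name
    if diarization_system.initialize():  # hypothetical check; real condition not shown in the hunk
        handler = DiarizationHandler(diarization_system)
        stream = Stream(handler=handler, modality="audio", mode="send-receive")
        stream.mount(app)  # attach the FastRTC WebRTC endpoints to the FastAPI app
        return "✅ System initialized successfully!"
    else:
        return "❌ Failed to initialize system. Check logs for details."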
@@ -737,7 +780,6 @@ def create_interface():
         # Replace standard Audio with WebRTC component
         audio_component = WebRTC(
             label="Audio Input",
-            stream_name="audio_stream",
             modality="audio",
             mode="send-receive"
         )
@@ -869,6 +911,21 @@ def create_interface():
         # Auto-refresh status every 2 seconds
         status_timer = gr.Timer(2)
         status_timer.tick(refresh_status, outputs=[status_output])
+
+        # Connect the WebRTC component to our processing function
+        def process_webrtc_audio(audio_data):
+            if audio_data is not None and diarization_system.is_running:
+                try:
+                    # Feed audio to our diarization system
+                    diarization_system.feed_audio(audio_data)
+                except Exception as e:
+                    logger.error(f"Error processing WebRTC audio: {e}")
+            return get_conversation()
+
+        audio_component.stream(
+            fn=process_webrtc_audio,
+            outputs=[conversation_output]
+        )
 
     return interface
 
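A note on the .stream() wiring above: the call registers outputs but no inputs, so process_webrtc_audio may never be handed audio frames. The usage documented for gradio_webrtc passes the WebRTC component itself as the input. A hedged sketch of that shape (time_limit is an arbitrary assumed value, and documented send-receive examples usually route outputs back to the component itself rather than to a separate textbox):

audio_component.stream(
    fn=process_webrtc_audio,
    inputs=[audio_component],       # the component supplies the incoming audio frames
    outputs=[conversation_output],  # as in the diff; a separate output may need the library's additional-outputs mechanism
    time_limit=60,                  # assumed per-session cap in seconds; optional
)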
@@ -876,29 +933,9 @@ def create_interface():
 # FastAPI setup for FastRTC integration
 app = FastAPI()
 
-#
-
-
-        super().__init__()
-
-    async def receive(self, frame):
-        pass
-
-    async def emit(self):
-        return None
-
-    def copy(self):
-        return DefaultHandler()
-
-    async def shutdown(self):
-        pass
-
-    async def start_up(self):
-        pass
-
-# Initialize with placeholder handler
-stream = Stream(handler=DefaultHandler(), modality="audio", mode="send-receive")
-stream.mount(app)
+# We'll initialize the stream in initialize_system()
+# For now, just create a placeholder
+stream = None
 
 @app.get("/")
 async def root():
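The block removed above was already broken: method bodies (super().__init__(), receive, emit, and so on) with no enclosing class statement, so replacing it with the stream = None placeholder is a clean-up. Note, though, that the same three placeholder lines now appear twice in the file, here and after the global instances in the earlier hunk. For reference, a syntactically complete no-op handler built from exactly the methods visible in the removed lines would look like the sketch below; the precise AsyncStreamHandler interface is an assumption:

from fastrtc import AsyncStreamHandler


class DefaultHandler(AsyncStreamHandler):
    """No-op placeholder handler, reconstructed from the removed block."""

    def __init__(self):
        super().__init__()

    async def start_up(self):
        pass  # nothing to initialize

    async def receive(self, frame):
        pass  # drop incoming frames

    async def emit(self):
        return None  # nothing to send back

    def copy(self):
        return DefaultHandler()

    async def shutdown(self):
        pass  # nothing to release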