Real_Time_diarization

Sleeping

App Files Community

Saiyaswanth007 committed 16 days ago

Commit

a5c083c

1 Parent(s): a3ec320

Check point 4

Browse files

Files changed (1) hide show

app.py +69 -325

app.py CHANGED Viewed

@@ -10,13 +10,12 @@ import torchaudio
 from scipy.spatial.distance import cosine
 from RealtimeSTT import AudioToTextRecorder
 from fastapi import FastAPI, APIRouter
-from fastrtc import Stream, AsyncStreamHandler
 import json
 import asyncio
 import uvicorn
 from queue import Queue
 import logging
-from gradio_webrtc import WebRTC
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -330,7 +329,7 @@ class RealtimeSpeakerDiarization:
         except Exception as e:
             logger.error(f"Model initialization error: {e}")
             return False
     def feed_audio(self, audio_data):
         """Feed audio data directly to the recorder for live transcription"""
         if not self.is_running or not self.recorder:
@@ -601,117 +600,47 @@ class RealtimeSpeakerDiarization:
             logger.error(f"Error processing audio chunk: {e}")
-# FastRTC Audio Handler
-class DiarizationHandler(AsyncStreamHandler):
     def __init__(self, diarization_system):
         super().__init__()
         self.diarization_system = diarization_system
-        self.audio_buffer = []
-        self.buffer_size = BUFFER_SIZE
-    def copy(self):
-        """Return a fresh handler for each new stream connection"""
-        return DiarizationHandler(self.diarization_system)
-    async def emit(self):
-        """Not used - we only receive audio"""
-        return None
-    async def receive(self, frame):
-        """Receive audio data from FastRTC"""
         try:
-            if not self.diarization_system.is_running:
-                return
             # Extract audio data
-            audio_data = getattr(frame, 'data', frame)
-            # Check if this is a tuple (sample_rate, audio_array)
-            if isinstance(audio_data, tuple) and len(audio_data) >= 2:
-                sample_rate, audio_array = audio_data
-            else:
-                # If not a tuple, assume it's raw audio bytes/array
-                sample_rate = SAMPLE_RATE  # Use default sample rate
-                # Convert to numpy array
-                if isinstance(audio_data, bytes):
-                    audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
-                elif isinstance(audio_data, (list, tuple)):
-                    audio_array = np.array(audio_data, dtype=np.float32)
-                else:
-                    audio_array = np.array(audio_data, dtype=np.float32)
-            # Ensure 1D
-            if len(audio_array.shape) > 1:
-                audio_array = audio_array.flatten()
-            # Send audio to recorder for live transcription
-            if self.diarization_system.recorder:
-                try:
-                    self.diarization_system.recorder.feed_audio(audio_array)
-                    logger.info("Fed audio to recorder")
-                except Exception as e:
-                    logger.error(f"Error feeding audio to recorder: {e}")
-            # Buffer audio chunks
-            self.audio_buffer.extend(audio_array)
-            # Process in chunks
-            while len(self.audio_buffer) >= self.buffer_size:
-                chunk = np.array(self.audio_buffer[:self.buffer_size])
-                self.audio_buffer = self.audio_buffer[self.buffer_size:]
-                # Process asynchronously
-                await self.process_audio_async(chunk)
         except Exception as e:
-            logger.error(f"Error in FastRTC receive: {e}")
-    async def process_audio_async(self, audio_data):
-        """Process audio data asynchronously"""
-        try:
-            loop = asyncio.get_event_loop()
-            await loop.run_in_executor(
-                None,
-                self.diarization_system.process_audio_chunk,
-                audio_data,
-                SAMPLE_RATE
-            )
-        except Exception as e:
-            logger.error(f"Error in async audio processing: {e}")
-    async def start_up(self):
-        logger.info("DiarizationHandler started")
-    async def shutdown(self):
-        logger.info("DiarizationHandler shutdown")
-# Global instances
 diarization_system = RealtimeSpeakerDiarization()
-# We'll initialize the stream in initialize_system()
-# For now, just create a placeholder
-stream = None
 def initialize_system():
     """Initialize the diarization system"""
-    global stream
     try:
         success = diarization_system.initialize_models()
         if success:
-            # Create a DiarizationHandler linked to our system
-            handler = DiarizationHandler(diarization_system)
-            # Update the Stream's handler
-            stream = Stream(
-                handler=handler,
-                modality="audio",
-                mode="send-receive"
-            )
-            # Mount the stream to the FastAPI app
-            stream.mount(app)
             return "✅ System initialized successfully!"
         else:
             return "❌ Failed to initialize system. Check logs for details."
@@ -727,10 +656,6 @@ def start_recording():
     except Exception as e:
         return f"❌ Failed to start recording: {str(e)}"
-def on_start():
-    result = start_recording()
-    return result, gr.update(interactive=False), gr.update(interactive=True)
 def stop_recording():
     """Stop recording and transcription"""
     try:
@@ -769,232 +694,52 @@ def get_status():
     except Exception as e:
         return f"Error getting status: {str(e)}"
-# Create Gradio interface
-def create_interface():
-    with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Soft()) as interface:
-        gr.Markdown("# 🎤 Real-time Speech Recognition with Speaker Diarization")
-        gr.Markdown("Live transcription with automatic speaker identification using FastRTC audio streaming.")
-        with gr.Row():
-            with gr.Column(scale=2):
-                # Replace standard Audio with WebRTC component
-                audio_component = WebRTC(
-                    label="Audio Input",
-                    modality="audio",
-                    mode="send-receive"
-                )
-                # Conversation display
-                conversation_output = gr.HTML(
-                    value="<div style='padding: 20px; background: #f8f9fa; border-radius: 10px; min-height: 300px;'><i>Click 'Initialize System' to start...</i></div>",
-                    label="Live Conversation"
-                )
-                # Control buttons
-                with gr.Row():
-                    init_btn = gr.Button("🔧 Initialize System", variant="secondary", size="lg")
-                    start_btn = gr.Button("🎙️ Start", variant="primary", size="lg", interactive=False)
-                    stop_btn = gr.Button("⏹️ Stop", variant="stop", size="lg", interactive=False)
-                    clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="lg", interactive=False)
-                # Status display
-                status_output = gr.Textbox(
-                    label="System Status",
-                    value="Ready to initialize...",
-                    lines=8,
-                    interactive=False
-                )
-            with gr.Column(scale=1):
-                # Settings
-                gr.Markdown("## ⚙️ Settings")
-                threshold_slider = gr.Slider(
-                    minimum=0.3,
-                    maximum=0.9,
-                    step=0.05,
-                    value=DEFAULT_CHANGE_THRESHOLD,
-                    label="Speaker Change Sensitivity",
-                    info="Lower = more sensitive"
-                )
-                max_speakers_slider = gr.Slider(
-                    minimum=2,
-                    maximum=ABSOLUTE_MAX_SPEAKERS,
-                    step=1,
-                    value=DEFAULT_MAX_SPEAKERS,
-                    label="Maximum Speakers"
-                )
-                update_btn = gr.Button("Update Settings", variant="secondary")
-                # Instructions
-                gr.Markdown("""
-                ## 📋 Instructions
-                1. **Initialize** the system (loads AI models)
-                2. **Start** recording
-                3. **Speak** - system will transcribe and identify speakers
-                4. **Monitor** real-time results below
-                ## 🎨 Speaker Colors
-                - 🔴 Speaker 1 (Red)
-                - 🟢 Speaker 2 (Teal)
-                - 🔵 Speaker 3 (Blue)
-                - 🟡 Speaker 4 (Green)
-                - 🟣 Speaker 5 (Yellow)
-                - 🟤 Speaker 6 (Plum)
-                - 🟫 Speaker 7 (Mint)
-                - 🟨 Speaker 8 (Gold)
-                """)
-        # Event handlers
-        def on_initialize():
-            result = initialize_system()
-            if "✅" in result:
-                return result, gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)
-            else:
-                return result, gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)
-        def on_start():
-            result = start_recording()
-            return result, gr.update(interactive=False), gr.update(interactive=True)
-        def on_stop():
-            result = stop_recording()
-            return result, gr.update(interactive=True), gr.update(interactive=False)
-        def on_clear():
-            result = clear_conversation()
-            return result
-        def on_update_settings(threshold, max_speakers):
-            result = update_settings(threshold, int(max_speakers))
-            return result
-        def refresh_conversation():
-            return get_conversation()
-        def refresh_status():
-            return get_status()
-        # Button click handlers
-        init_btn.click(
-            fn=on_initialize,
-            outputs=[status_output, start_btn, stop_btn, clear_btn]
-        )
-        start_btn.click(
-            fn=on_start,
-            outputs=[status_output, start_btn, stop_btn]
-        )
-        stop_btn.click(
-            fn=on_stop,
-            outputs=[status_output, start_btn, stop_btn]
-        )
-        clear_btn.click(
-            fn=on_clear,
-            outputs=[status_output]
-        )
-        update_btn.click(
-            fn=on_update_settings,
-            inputs=[threshold_slider, max_speakers_slider],
-            outputs=[status_output]
-        )
-        # Auto-refresh conversation display every 1 second
-        conversation_timer = gr.Timer(1)
-        conversation_timer.tick(refresh_conversation, outputs=[conversation_output])
-        # Auto-refresh status every 2 seconds
-        status_timer = gr.Timer(2)
-        status_timer.tick(refresh_status, outputs=[status_output])
-        # Connect the WebRTC component to our processing function
-        def process_webrtc_audio(audio_data):
-            if audio_data is not None and diarization_system.is_running:
-                try:
-                    # Feed audio to our diarization system
-                    diarization_system.feed_audio(audio_data)
-                except Exception as e:
-                    logger.error(f"Error processing WebRTC audio: {e}")
-            return get_conversation()
-        audio_component.stream(
-            fn=process_webrtc_audio,
-            outputs=[conversation_output]
-        )
-    return interface
-# FastAPI setup for FastRTC integration
-app = FastAPI()
-# We'll initialize the stream in initialize_system()
-# For now, just create a placeholder
-stream = None
-@app.get("/")
-async def root():
-    return {"message": "Real-time Speaker Diarization API"}
-@app.get("/health")
-async def health_check():
-    return {"status": "healthy", "system_running": diarization_system.is_running}
-@app.post("/initialize")
-async def api_initialize():
-    result = initialize_system()
-    return {"result": result, "success": "✅" in result}
-@app.post("/start")
-async def api_start():
-    result = start_recording()
-    return {"result": result, "success": "🎙️" in result}
-@app.post("/stop")
-async def api_stop():
-    result = stop_recording()
-    return {"result": result, "success": "⏹️" in result}
-@app.post("/clear")
-async def api_clear():
-    result = clear_conversation()
-    return {"result": result}
-@app.get("/conversation")
-async def api_get_conversation():
-    return {"conversation": get_conversation()}
-@app.get("/status")
-async def api_get_status():
-    return {"status": get_status()}
-@app.post("/settings")
-async def api_update_settings(threshold: float, max_speakers: int):
-    result = update_settings(threshold, max_speakers)
-    return {"result": result}
 # Main execution
 if __name__ == "__main__":
     import argparse
     parser = argparse.ArgumentParser(description="Real-time Speaker Diarization System")
-    parser.add_argument("--mode", choices=["gradio", "api", "both"], default="gradio",
-                       help="Run mode: gradio interface, API only, or both")
     parser.add_argument("--host", default="0.0.0.0", help="Host to bind to")
     parser.add_argument("--port", type=int, default=7860, help="Port to bind to")
     parser.add_argument("--api-port", type=int, default=8000, help="API port (when running both)")
     args = parser.parse_args()
-    if args.mode == "gradio":
-        # Run Gradio interface only
-        interface = create_interface()
-        interface.launch(
             server_name=args.host,
             server_port=args.port,
             share=True,
@@ -1003,6 +748,8 @@ if __name__ == "__main__":
     elif args.mode == "api":
         # Run FastAPI only
         uvicorn.run(
             app,
             host=args.host,
@@ -1011,20 +758,12 @@ if __name__ == "__main__":
         )
     elif args.mode == "both":
-        # Run both Gradio and FastAPI
-        import multiprocessing
         import threading
-        def run_gradio():
-            interface = create_interface()
-            interface.launch(
-                server_name=args.host,
-                server_port=args.port,
-                share=True,
-                show_error=True
-            )
         def run_fastapi():
             uvicorn.run(
                 app,
                 host=args.host,
@@ -1036,5 +775,10 @@ if __name__ == "__main__":
         api_thread = threading.Thread(target=run_fastapi, daemon=True)
         api_thread.start()
-        # Start Gradio in main thread
-        run_gradio()

 from scipy.spatial.distance import cosine
 from RealtimeSTT import AudioToTextRecorder
 from fastapi import FastAPI, APIRouter
+from fastrtc import Stream, ReplyOnPause, AudioStreamHandler
 import json
 import asyncio
 import uvicorn
 from queue import Queue
 import logging
 # Set up logging
 logging.basicConfig(level=logging.INFO)
         except Exception as e:
             logger.error(f"Model initialization error: {e}")
             return False
     def feed_audio(self, audio_data):
         """Feed audio data directly to the recorder for live transcription"""
         if not self.is_running or not self.recorder:
             logger.error(f"Error processing audio chunk: {e}")
+# Create diarization handler for FastRTC
+class DiarizationAudioHandler(AudioStreamHandler):
     def __init__(self, diarization_system):
         super().__init__()
         self.diarization_system = diarization_system
+    def receive(self, frame):
+        """Process incoming audio frame"""
+        if not self.diarization_system.is_running:
+            return
         try:
             # Extract audio data
+            sample_rate, audio_array = frame
+            # Send audio to diarization system for processing
+            self.diarization_system.feed_audio(audio_array)
         except Exception as e:
+            logger.error(f"Error processing FastRTC audio: {e}")
+    def copy(self):
+        """Return a fresh handler instance"""
+        return DiarizationAudioHandler(self.diarization_system)
+    def shutdown(self):
+        """Clean up resources"""
+        pass
+    def start_up(self):
+        """Initialize resources"""
+        logger.info("DiarizationAudioHandler started")
+# Global diarization system instance
 diarization_system = RealtimeSpeakerDiarization()
 def initialize_system():
     """Initialize the diarization system"""
     try:
         success = diarization_system.initialize_models()
         if success:
             return "✅ System initialized successfully!"
         else:
             return "❌ Failed to initialize system. Check logs for details."
     except Exception as e:
         return f"❌ Failed to start recording: {str(e)}"
 def stop_recording():
     """Stop recording and transcription"""
     try:
     except Exception as e:
         return f"Error getting status: {str(e)}"
+# Create handler wrapper function for FastRTC
+def diarization_handler(audio_data):
+    """Handler function for FastRTC stream"""
+    try:
+        # Process the audio data
+        diarization_system.process_audio_chunk(audio_data[1], audio_data[0])
+        # Just yield the original audio back (echo)
+        # This can be changed to just return None since we don't need echo
+        yield audio_data
+    except Exception as e:
+        logger.error(f"Error in diarization handler: {e}")
+# Create FastRTC stream with ReplyOnPause pattern
+stream = Stream(
+    handler=ReplyOnPause(diarization_handler),
+    modality="audio",
+    mode="send-receive",
+    ui_args={
+        "title": "Real-time Speaker Diarization",
+        "description": "Live transcription with automatic speaker identification"
+    }
+)
 # Main execution
 if __name__ == "__main__":
     import argparse
     parser = argparse.ArgumentParser(description="Real-time Speaker Diarization System")
+    parser.add_argument("--mode", choices=["ui", "api", "both"], default="ui",
+                       help="Run mode: FastRTC UI, API only, or both")
     parser.add_argument("--host", default="0.0.0.0", help="Host to bind to")
     parser.add_argument("--port", type=int, default=7860, help="Port to bind to")
     parser.add_argument("--api-port", type=int, default=8000, help="API port (when running both)")
     args = parser.parse_args()
+    # Initialize the system before running anything
+    initialize_system()
+    start_recording()
+    if args.mode == "ui":
+        # Launch the FastRTC built-in UI
+        stream.ui.launch(
             server_name=args.host,
             server_port=args.port,
             share=True,
     elif args.mode == "api":
         # Run FastAPI only
+        app = FastAPI()
+        stream.mount(app)
         uvicorn.run(
             app,
             host=args.host,
         )
     elif args.mode == "both":
+        # Run both FastRTC UI and API
         import threading
         def run_fastapi():
+            app = FastAPI()
+            stream.mount(app)
             uvicorn.run(
                 app,
                 host=args.host,
         api_thread = threading.Thread(target=run_fastapi, daemon=True)
         api_thread.start()
+        # Start FastRTC UI in main thread
+        stream.ui.launch(
+            server_name=args.host,
+            server_port=args.port,
+            share=True,
+            show_error=True
+        )