Real_Time_diarization

Sleeping

App Files Files Community

Saiyaswanth007 committed on May 24

Commit

57c1aba

1 Parent(s): ff9d9e6

Revert portg

Browse files

Files changed (1) hide show

app.py +176 -121

app.py CHANGED Viewed

@@ -291,6 +291,7 @@ class RealtimeSpeakerDiarization:
         self.change_threshold = DEFAULT_CHANGE_THRESHOLD
         self.max_speakers = DEFAULT_MAX_SPEAKERS
         self.current_conversation = ""
     def initialize_models(self):
         """Initialize the speaker encoder model"""
@@ -389,7 +390,7 @@ class RealtimeSpeakerDiarization:
             return "Please initialize models first!"
         try:
-            # Setup recorder configuration for WebRTC input
             recorder_config = {
                 'spinner': False,
                 'use_microphone': False,  # We'll feed audio manually
@@ -530,30 +531,37 @@ class RealtimeSpeakerDiarization:
         except Exception as e:
             return f"Error getting status: {e}"
-    def process_audio(self, audio_data):
-        """Process audio data from FastRTC"""
         if not self.is_running or not self.recorder:
             return
         try:
-            # Extract audio data from FastRTC format (sample_rate, numpy_array)
-            sample_rate, audio_array = audio_data
-            # Convert to int16 format
-            if audio_array.dtype != np.int16:
-                audio_array = (audio_array * 32767).astype(np.int16)
-            # Convert to bytes and feed to recorder
-            audio_bytes = audio_array.tobytes()
             self.recorder.feed_audio(audio_bytes)
         except Exception as e:
-            print(f"Error processing FastRTC audio: {e}")
 # FastRTC Audio Handler
 class DiarizationHandler(AsyncStreamHandler):
     def __init__(self, diarization_system):
-        super().__init__()
         self.diarization_system = diarization_system
     def copy(self):
@@ -564,10 +572,21 @@ class DiarizationHandler(AsyncStreamHandler):
         """Not used in this implementation"""
         return None
-    async def receive(self, data):
         """Receive audio data from FastRTC and process it"""
-        if self.diarization_system.is_running:
-            self.diarization_system.process_audio(data)
 # Global instance
@@ -613,61 +632,6 @@ def get_status():
     return diarization_system.get_status_info()
-# Get Cloudflare TURN credentials for FastRTC
-async def get_cloudflare_credentials():
-    # Check if HF_TOKEN is set in environment
-    hf_token = os.environ.get("HF_TOKEN")
-    # If not set, use a default Hugging Face token if available
-    if not hf_token:
-        # Log a warning that user should set their own token
-        print("Warning: HF_TOKEN environment variable not set. Please set your own Hugging Face token.")
-        # Try to use the Hugging Face token from the environment
-        from huggingface_hub import HfApi
-        try:
-            api = HfApi()
-            hf_token = api.token
-            if not hf_token:
-                print("Error: No Hugging Face token available. TURN relay may not work properly.")
-        except:
-            print("Error: Failed to get Hugging Face token. TURN relay may not work properly.")
-    # Get Cloudflare TURN credentials using the Hugging Face token
-    if hf_token:
-        try:
-            return await get_cloudflare_turn_credentials_async(hf_token=hf_token)
-        except Exception as e:
-            print(f"Error getting Cloudflare TURN credentials: {e}")
-    # Fallback to a default configuration that may not work
-    return {
-        "iceServers": [
-            {
-                "urls": "stun:stun.l.google.com:19302"
-            }
-        ]
-    }
-# Setup FastRTC stream handler with TURN server configuration
-def setup_fastrtc_handler():
-    """Set up FastRTC audio stream handler with TURN server configuration"""
-    handler = DiarizationHandler(diarization_system)
-    # Get server-side credentials (longer TTL)
-    server_credentials = get_cloudflare_turn_credentials(ttl=360000)
-    stream = Stream(
-        handler=handler,
-        modality="audio",
-        mode="receive",
-        rtc_configuration=get_cloudflare_credentials,  # Async function for client-side credentials
-        server_rtc_configuration=server_credentials    # Server-side credentials with longer TTL
-    )
-    return stream
 # Create Gradio interface
 def create_interface():
     with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Monochrome()) as interface:
@@ -676,32 +640,6 @@ def create_interface():
         with gr.Row():
             with gr.Column(scale=2):
-                # FastRTC Audio Component
-                fastrtc_html = gr.HTML("""
-                <div class="fastrtc-container" style="margin-bottom: 20px;">
-                    <h3>🎙️ FastRTC Audio Input</h3>
-                    <p>Click the button below to start the audio stream:</p>
-                    <button id="start-fastrtc" style="background: #3498db; color: white; padding: 10px 20px; border: none; border-radius: 5px; cursor: pointer;">
-                        Start FastRTC Audio
-                    </button>
-                    <div id="fastrtc-status" style="margin-top: 10px; font-style: italic;">Not connected</div>
-                    <script>
-                        document.getElementById('start-fastrtc').addEventListener('click', function() {
-                            document.getElementById('fastrtc-status').textContent = 'Connecting...';
-                            // FastRTC will initialize the connection
-                            fetch('/start-rtc', { method: 'POST' })
-                                .then(response => response.text())
-                                .then(data => {
-                                    document.getElementById('fastrtc-status').textContent = 'Connected! Speak now...';
-                                })
-                                .catch(error => {
-                                    document.getElementById('fastrtc-status').textContent = 'Connection error: ' + error;
-                                });
-                        });
-                    </script>
-                </div>
-                """)
                 # Main conversation display
                 conversation_output = gr.HTML(
                     value="<i>Click 'Initialize System' to start...</i>",
@@ -751,7 +689,7 @@ def create_interface():
                 gr.Markdown("""
                 1. Click **Initialize System** to load models
                 2. Click **Start Recording** to begin processing
-                3. Click **Start FastRTC Audio** to connect your microphone
                 4. Allow microphone access when prompted
                 5. Speak into your microphone
                 6. Watch real-time transcription with speaker labels
@@ -772,13 +710,6 @@ def create_interface():
                 This app uses FastRTC for low-latency audio streaming.
                 For optimal performance, use a modern browser and allow microphone access when prompted.
                 """)
-                # Hugging Face Token Information
-                gr.Markdown("""
-                ## 🔑 Hugging Face Token
-                This app uses Cloudflare TURN server via Hugging Face integration.
-                If audio connection fails, set your HF_TOKEN environment variable in the Space settings.
-                """)
         # Auto-refresh conversation and status
         def refresh_display():
@@ -857,25 +788,149 @@ def create_interface():
     return interface
-# 1) Create FastAPI app
-app = FastAPI()
-# 2) Create Gradio interface
-gradio_interface = create_interface()
-# 3) Mount Gradio onto FastAPI at root
-app = gr.mount_gradio_app(app, gradio_interface, path="/")
-# 4) Initialize and mount FastRTC stream on the same app
-rtc_stream = setup_fastrtc_handler()
-rtc_stream.mount(app)
-# 5) Expose an endpoint to trigger the client-side RTC handshake
-@app.post("/start-rtc")
-async def start_rtc():
-    await rtc_stream.start_client()
-    return {"status": "success"}
-# 6) Local dev via uvicorn; HF Spaces will auto-detect 'app' and ignore this
 if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=7860)

         self.change_threshold = DEFAULT_CHANGE_THRESHOLD
         self.max_speakers = DEFAULT_MAX_SPEAKERS
         self.current_conversation = ""
+        self.audio_buffer = []
     def initialize_models(self):
         """Initialize the speaker encoder model"""
             return "Please initialize models first!"
         try:
+            # Setup recorder configuration for manual audio input
             recorder_config = {
                 'spinner': False,
                 'use_microphone': False,  # We'll feed audio manually
         except Exception as e:
             return f"Error getting status: {e}"
+    def feed_audio_data(self, audio_data):
+        """Feed one chunk of audio to the recorder as 16-bit PCM bytes.
+
+        Args:
+            audio_data: A NumPy array of samples, or any other object, which
+                is handed to the recorder unchanged (presumably already
+                16-bit PCM bytes -- confirm against callers).
+
+        Floating-point arrays are treated as normalised to [-1.0, 1.0] and
+        scaled to int16; other non-int16 arrays are cast to int16 directly.
+        Does nothing while recording is inactive or no recorder exists.
+        Errors are printed rather than raised.
+        """
         if not self.is_running or not self.recorder:
             return
         try:
+            if isinstance(audio_data, np.ndarray):
+                if np.issubdtype(audio_data.dtype, np.floating):
+                    # Clip before scaling: a sample outside [-1, 1] would
+                    # otherwise wrap around in the int16 cast and come out
+                    # as a loud click of the opposite sign.
+                    clipped = np.clip(audio_data, -1.0, 1.0)
+                    audio_data = (clipped * 32767).astype(np.int16)
+                elif audio_data.dtype != np.int16:
+                    audio_data = audio_data.astype(np.int16)
+                # Convert to bytes
+                audio_bytes = audio_data.tobytes()
+            else:
+                audio_bytes = audio_data
+            # Feed to recorder
             self.recorder.feed_audio(audio_bytes)
         except Exception as e:
+            print(f"Error feeding audio data: {e}")
 # FastRTC Audio Handler
 class DiarizationHandler(AsyncStreamHandler):
     def __init__(self, diarization_system):
+        super().__init__(reply_on_pause=ReplyOnPause.NEVER)
         self.diarization_system = diarization_system
     def copy(self):
         """Not used in this implementation"""
         return None
+    async def receive(self, frame):
         """Receive audio data from FastRTC and process it"""
+        # Accepted frame shapes:
+        #   * a (sample_rate, samples) tuple, the FastRTC audio frame format;
+        #   * a bare NumPy array of samples;
+        #   * a wrapper object exposing its samples as ``.data``;
+        #   * anything else, which is forwarded unchanged.
+        # NOTE(review): the sample rate is not checked here; this assumes the
+        # stream already matches the rate the recorder expects -- confirm.
+        try:
+            if not self.diarization_system.is_running:
+                return
+            if isinstance(frame, tuple) and len(frame) == 2:
+                _sample_rate, audio_data = frame
+            elif isinstance(frame, np.ndarray):
+                # Checked before the ``.data`` probe: every ndarray has a
+                # ``.data`` attribute (a raw memoryview), and forwarding that
+                # would skip the dtype conversion in feed_audio_data().
+                audio_data = frame
+            elif hasattr(frame, 'data'):
+                audio_data = frame.data
+            else:
+                audio_data = frame
+            # Feed audio data to the diarization system
+            self.diarization_system.feed_audio_data(audio_data)
+        except Exception as e:
+            print(f"Error in FastRTC handler: {e}")
 # Global instance
     return diarization_system.get_status_info()
 # Create Gradio interface
 def create_interface():
     with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Monochrome()) as interface:
         with gr.Row():
             with gr.Column(scale=2):
                 # Main conversation display
                 conversation_output = gr.HTML(
                     value="<i>Click 'Initialize System' to start...</i>",
                 gr.Markdown("""
                 1. Click **Initialize System** to load models
                 2. Click **Start Recording** to begin processing
+                3. Use the FastRTC interface below to connect your microphone
                 4. Allow microphone access when prompted
                 5. Speak into your microphone
                 6. Watch real-time transcription with speaker labels
                 This app uses FastRTC for low-latency audio streaming.
                 For optimal performance, use a modern browser and allow microphone access when prompted.
                 """)
         # Auto-refresh conversation and status
         def refresh_display():
     return interface
+# Main application setup
+def _build_rtc_configuration():
+    """Return the WebRTC ICE configuration for the FastRTC stream.
+
+    Uses Cloudflare TURN credentials when HF_TOKEN is set and the lookup
+    succeeds; otherwise falls back to a public STUN server only.
+    """
+    stun_only = {"iceServers": [{"urls": "stun:stun.l.google.com:19302"}]}
+    hf_token = os.environ.get("HF_TOKEN")
+    if not hf_token:
+        print("Warning: HF_TOKEN not set. Audio streaming may not work properly.")
+        return stun_only
+    try:
+        # The helper's return value is used as the RTC configuration as-is
+        # (presumably already {"iceServers": [...]} -- confirm against the
+        # FastRTC docs), so it is not re-wrapped or indexed here.
+        return get_cloudflare_turn_credentials(hf_token=hf_token)
+    except Exception as e:
+        print(f"Warning: Could not get Cloudflare TURN credentials: {e}")
+        return stun_only
+def create_app():
+    """Create and configure the FastAPI app with Gradio and FastRTC.
+
+    Registration order matters: routes are matched in the order they are
+    added, and the Gradio UI is mounted at "/", which matches every path.
+    The REST endpoints and the FastRTC stream are therefore registered
+    first and Gradio is mounted last.
+
+    Returns:
+        The FastAPI application returned by ``gr.mount_gradio_app``.
+    """
+    # Create FastAPI app
+    app = FastAPI(
+        title="Real-time Speaker Diarization",
+        description="Real-time speech recognition with speaker diarization using FastRTC",
+        version="1.0.0",
+    )
+    # Bind the REST handlers to this app instance. They are plain module-level
+    # functions because no module-level ``app`` exists to decorate them with.
+    app.add_api_route("/health", health_check, methods=["GET"])
+    app.add_api_route("/api/conversation", get_conversation_api, methods=["GET"])
+    app.add_api_route("/api/control/{action}", control_recording, methods=["POST"])
+    # Setup FastRTC stream (best effort: the UI still starts without it)
+    try:
+        stream = Stream(
+            handler=DiarizationHandler(diarization_system),
+            modality="audio",
+            # NOTE(review): "send-receive" is assumed to be the mode in which
+            # the browser microphone reaches handler.receive() -- confirm.
+            mode="send-receive",
+            rtc_configuration=_build_rtc_configuration(),
+        )
+        # Add FastRTC endpoints under the /stream prefix
+        stream.mount(app, path="/stream")
+        print("FastRTC stream configured successfully!")
+    except Exception as e:
+        print(f"Warning: Failed to setup FastRTC stream: {e}")
+        print("Audio streaming will not be available.")
+    # Mount Gradio interface last so its catch-all "/" mount cannot shadow
+    # the routes registered above.
+    app = gr.mount_gradio_app(app, create_interface(), path="/")
+    return app
+# Health check endpoint (GET /health, registered in create_app)
+async def health_check():
+    """Report liveness plus whether models are loaded and recording is on."""
+    return {
+        "status": "healthy",
+        "timestamp": time.time(),
+        "system_initialized": diarization_system.encoder is not None,
+        "recording_active": diarization_system.is_running
+    }
+# API endpoint to get conversation (GET /api/conversation, registered in create_app)
+async def get_conversation_api():
+    """Return the formatted conversation, status text and recording flag."""
+    return {
+        "conversation": diarization_system.get_formatted_conversation(),
+        "status": diarization_system.get_status_info(),
+        "is_recording": diarization_system.is_running
+    }
+# API endpoint to control recording (POST /api/control/{action}, registered in create_app)
+async def control_recording(action: str):
+    """Run one control action: start, stop, clear or initialize.
+
+    Returns the action's result and the current recording flag, or an
+    ``error`` payload when the action name is not recognised.
+    """
+    actions = {
+        "start": diarization_system.start_recording,
+        "stop": diarization_system.stop_recording,
+        "clear": diarization_system.clear_conversation,
+        "initialize": initialize_system,
+    }
+    run_action = actions.get(action)
+    if run_action is None:
+        return {"error": "Invalid action. Use: start, stop, clear, or initialize"}
+    result = run_action()
+    return {"result": result, "is_recording": diarization_system.is_running}
+# Main entry point
 if __name__ == "__main__":
+    # Create the app
+    app = create_app()
+    # Configuration (overridable through the HOST and PORT environment variables)
+    host = os.environ.get("HOST", "0.0.0.0")
+    port = int(os.environ.get("PORT", 7860))
+    # The banner is an f-string, so the literal braces around "action" are
+    # doubled; a single pair would be evaluated as an undefined variable.
+    print(f"""
+    🎤 Real-time Speaker Diarization Server
+    =====================================
+    Starting server on: http://{host}:{port}
+    Features:
+    - Real-time speech recognition
+    - Speaker diarization with color coding
+    - FastRTC low-latency audio streaming
+    - Web interface for easy interaction
+    Make sure to:
+    1. Set HF_TOKEN environment variable for TURN server access
+    2. Allow microphone access in your browser
+    3. Use a modern browser for best performance
+    API Endpoints:
+    - GET  /health - Health check
+    - GET  /api/conversation - Get current conversation
+    - POST /api/control/{{action}} - Control recording (start/stop/clear/initialize)
+    - WS   /stream - FastRTC audio stream endpoint
+    """)
+    # Run the server
+    uvicorn.run(
+        app,
+        host=host,
+        port=port,
+        log_level="info",
+        access_log=True
+    )