Saiyaswanth007 committed on
Commit
3e9ecf3
1 Parent(s): 1dd5469
Files changed (1)
  1. ui.py +472 -496
ui.py CHANGED
@@ -1,530 +1,506 @@
1
  import gradio as gr
2
- import asyncio
3
- import websockets
4
- import json
5
- import logging
6
- import time
7
- from typing import Dict, Any, Optional
8
- import threading
9
- from queue import Queue
10
- import base64
11
- import numpy as np
12
- import os
13
-
14
- # Configure logging
15
- logging.basicConfig(level=logging.INFO)
16
- logger = logging.getLogger(__name__)
17
-
18
- # Environment-configurable HF Space URL (matching backend.py)
19
- HF_SPACE_URL = os.getenv("HF_SPACE_URL", "https://androidguy-speaker-diarization.hf.space")
20
- API_WS = f"wss://{HF_SPACE_URL}/ws_inference"
21
-
22
- class TranscriptionWebSocketServer:
23
- """WebSocket server that receives audio from backend and returns transcription results"""
24
-
25
- def __init__(self):
26
- self.connected_clients = set()
27
- self.is_running = False
28
- self.websocket_server = None
29
- self.conversation_history = []
30
- self.processing_stats = {
31
- "total_audio_chunks": 0,
32
- "total_transcriptions": 0,
33
- "last_audio_received": None,
34
- "server_start_time": time.time(),
35
- "backend_url": HF_SPACE_URL
36
- }
37
 
38
- async def handle_client_connection(self, websocket, path):
39
- """Handle incoming WebSocket connections from the backend"""
40
- client_addr = websocket.remote_address
41
- logger.info(f"Backend client connected from {client_addr}")
42
 
43
- self.connected_clients.add(websocket)
 
44
 
45
- try:
46
- # Send initial connection acknowledgment
47
- await websocket.send(json.dumps({
48
- "type": "connection_ack",
49
- "status": "connected",
50
- "timestamp": time.time(),
51
- "message": "HuggingFace transcription service ready"
52
- }))
53
-
54
- # Handle incoming messages/audio data
55
- async for message in websocket:
56
- try:
57
- if isinstance(message, bytes):
58
- # Handle binary audio data
59
- await self.process_audio_data(message, websocket)
60
- else:
61
- # Handle text messages (JSON)
62
- await self.handle_text_message(message, websocket)
63
-
64
- except Exception as e:
65
- logger.error(f"Error processing message: {e}")
66
- await self.send_error(websocket, f"Processing error: {str(e)}")
67
-
68
- except websockets.exceptions.ConnectionClosed:
69
- logger.info("Backend client disconnected")
70
- except Exception as e:
71
- logger.error(f"Client connection error: {e}")
72
- finally:
73
- self.connected_clients.discard(websocket)
74
- logger.info(f"Client removed. Active connections: {len(self.connected_clients)}")
75
-
76
- async def process_audio_data(self, audio_data: bytes, websocket):
77
- """Process incoming audio data and return transcription results"""
78
- try:
79
- self.processing_stats["total_audio_chunks"] += 1
80
- self.processing_stats["last_audio_received"] = time.time()
81
-
82
- logger.debug(f"Received {len(audio_data)} bytes of audio data")
83
-
84
- # Try to import and use your inference functions
85
- try:
86
- from inference import transcribe_audio, identify_speakers
87
-
88
- # Process the audio for transcription
89
- transcription_result = await transcribe_audio(audio_data)
90
-
91
- if transcription_result:
92
- # Process for speaker diarization if available
93
- try:
94
- speaker_info = await identify_speakers(audio_data)
95
- transcription_result.update(speaker_info)
96
- except Exception as e:
97
- logger.warning(f"Speaker diarization failed: {e}")
98
- transcription_result["speaker"] = "Unknown"
99
-
100
- # Update conversation history
101
- self.update_conversation_history(transcription_result)
102
 
103
- # Send result back to backend
104
- response = {
105
- "type": "processing_result",
106
- "timestamp": time.time(),
107
- "data": transcription_result
 
 
 
108
  }
109
 
110
- await websocket.send(json.dumps(response))
111
- self.processing_stats["total_transcriptions"] += 1
112
-
113
- logger.info(f"Sent transcription result: {transcription_result.get('text', '')[:50]}...")
114
-
115
- except ImportError:
116
- # Fallback if inference module is not available
117
- logger.warning("Inference module not found, using mock transcription")
118
-
119
- # Try to use shared.py for processing if available
120
- try:
121
- from shared import RealtimeSpeakerDiarization
122
-
123
- # Initialize if not already initialized
124
- if not hasattr(self, 'diarization_system'):
125
- self.diarization_system = RealtimeSpeakerDiarization()
126
- await asyncio.to_thread(self.diarization_system.initialize_models)
127
- await asyncio.to_thread(self.diarization_system.start_recording)
128
-
129
- # Process the audio chunk
130
- result = await asyncio.to_thread(self.diarization_system.process_audio_chunk, audio_data)
131
-
132
- # Format result for response
133
- if result and result["status"] != "error":
134
- mock_result = {
135
- "text": result.get("text", f"[Processing {len(audio_data)} bytes]"),
136
- "speaker": f"Speaker_{result.get('speaker_id', 0) + 1}",
137
- "confidence": result.get("similarity", 0.85),
138
- "timestamp": time.time()
139
  }
140
- else:
141
- # Fallback mock result
142
- mock_result = {
143
- "text": f"[Mock transcription - {len(audio_data)} bytes processed]",
144
- "speaker": "Speaker_1",
145
- "confidence": 0.85,
146
- "timestamp": time.time()
147
  }
148
-
149
- # Update conversation history
150
- self.update_conversation_history(mock_result)
151
-
152
- response = {
153
- "type": "processing_result",
154
- "timestamp": time.time(),
155
- "data": mock_result
156
  }
157
 
158
- await websocket.send(json.dumps(response))
159
- self.processing_stats["total_transcriptions"] += 1
160
-
161
- except Exception as e:
162
- logger.warning(f"Failed to use shared module: {e}")
163
-
164
- # Basic mock transcription as last resort
165
- mock_result = {
166
- "text": f"[Mock transcription - {len(audio_data)} bytes processed]",
167
- "speaker": "Speaker_1",
168
- "confidence": 0.85,
169
- "timestamp": time.time()
170
  }
171
 
172
- self.update_conversation_history(mock_result)
173
-
174
- response = {
175
- "type": "processing_result",
176
- "timestamp": time.time(),
177
- "data": mock_result
178
  }
179
 
180
- await websocket.send(json.dumps(response))
181
-
182
- except Exception as e:
183
- logger.error(f"Audio processing error: {e}")
184
- await self.send_error(websocket, f"Audio processing failed: {str(e)}")
185
 
186
- async def handle_text_message(self, message: str, websocket):
187
- """Handle text-based messages from backend"""
188
- try:
189
- data = json.loads(message)
190
- message_type = data.get("type", "unknown")
191
-
192
- logger.info(f"Received message type: {message_type}")
193
-
194
- if message_type == "ping":
195
- # Respond to ping with pong
196
- await websocket.send(json.dumps({
197
- "type": "pong",
198
- "timestamp": time.time()
199
- }))
200
 
201
- elif message_type == "config":
202
- # Handle configuration updates
203
- logger.info(f"Configuration update: {data}")
 
 
204
 
205
- # Apply configuration settings if available
206
- settings = data.get("settings", {})
207
- if "max_speakers" in settings:
208
- max_speakers = settings.get("max_speakers")
209
- logger.info(f"Setting max_speakers to {max_speakers}")
210
 
211
- if "threshold" in settings:
212
- threshold = settings.get("threshold")
213
- logger.info(f"Setting speaker change threshold to {threshold}")
214
 
215
- # Send acknowledgment
216
- await websocket.send(json.dumps({
217
- "type": "config_ack",
218
- "message": "Configuration received",
219
- "timestamp": time.time()
220
- }))
 
221
 
222
- elif message_type == "status_request":
223
- # Send status information
224
- await websocket.send(json.dumps({
225
- "type": "status_response",
226
- "data": self.get_processing_stats(),
227
- "timestamp": time.time()
228
- }))
229
 
230
- else:
231
- logger.warning(f"Unknown message type: {message_type}")
232
 
233
- except json.JSONDecodeError:
234
- logger.error(f"Invalid JSON received: {message}")
235
- await self.send_error(websocket, "Invalid JSON format")
236
-
237
- async def send_error(self, websocket, error_message: str):
238
- """Send error message to client"""
239
- try:
240
- await websocket.send(json.dumps({
241
- "type": "error",
242
- "message": error_message,
243
- "timestamp": time.time()
244
- }))
245
- except Exception as e:
246
- logger.error(f"Failed to send error message: {e}")
247
-
248
- def update_conversation_history(self, transcription_result: Dict[str, Any]):
249
- """Update conversation history with new transcription"""
250
- history_entry = {
251
- "timestamp": time.time(),
252
- "text": transcription_result.get("text", ""),
253
- "speaker": transcription_result.get("speaker", "Unknown"),
254
- "confidence": transcription_result.get("confidence", 0.0)
255
- }
256
-
257
- self.conversation_history.append(history_entry)
258
-
259
- # Keep only last 50 entries to prevent memory issues
260
- if len(self.conversation_history) > 50:
261
- self.conversation_history = self.conversation_history[-50:]
262
-
263
- def get_processing_stats(self):
264
- """Get processing statistics"""
265
- return {
266
- "connected_clients": len(self.connected_clients),
267
- "total_audio_chunks": self.processing_stats["total_audio_chunks"],
268
- "total_transcriptions": self.processing_stats["total_transcriptions"],
269
- "last_audio_received": self.processing_stats["last_audio_received"],
270
- "server_uptime": time.time() - self.processing_stats["server_start_time"],
271
- "conversation_entries": len(self.conversation_history),
272
- "backend_url": self.processing_stats.get("backend_url", HF_SPACE_URL)
273
- }
274
-
275
- async def start_server(self, host="0.0.0.0", port=7860):
276
- """Start the WebSocket server"""
277
- try:
278
- # Start WebSocket server on /ws_inference endpoint
279
- self.websocket_server = await websockets.serve(
280
- self.handle_client_connection,
281
- host,
282
- port,
283
- subprotocols=[],
284
- path="/ws_inference"
285
- )
286
-
287
- self.is_running = True
288
- logger.info(f"WebSocket server started on ws://{host}:{port}/ws_inference")
289
-
290
- # Keep the server running
291
- await self.websocket_server.wait_closed()
292
-
293
- except Exception as e:
294
- logger.error(f"Failed to start WebSocket server: {e}")
295
- self.is_running = False
296
-
297
- # Initialize the WebSocket server
298
- ws_server = TranscriptionWebSocketServer()
299
-
300
- def create_gradio_interface():
301
- """Create Gradio interface for monitoring and testing"""
302
-
303
- def get_server_status():
304
- """Get current server status"""
305
- stats = ws_server.get_processing_stats()
306
 
307
- status_text = f"""
308
- ### Server Status
309
- - **WebSocket Server**: {'🟢 Running' if ws_server.is_running else '🔴 Stopped'}
310
- - **Connected Clients**: {stats['connected_clients']}
311
- - **Server Uptime**: {stats['server_uptime']:.1f} seconds
312
-
313
- ### Processing Statistics
314
- - **Audio Chunks Processed**: {stats['total_audio_chunks']}
315
- - **Transcriptions Generated**: {stats['total_transcriptions']}
316
- - **Last Audio Received**: {time.ctime(stats['last_audio_received']) if stats['last_audio_received'] else 'Never'}
317
-
318
- ### Conversation
319
- - **History Entries**: {stats['conversation_entries']}
320
- """
321
-
322
- return status_text
323
-
324
- def get_recent_transcriptions():
325
- """Get recent transcription results"""
326
- if not ws_server.conversation_history:
327
- return "No transcriptions yet. Waiting for audio data from backend..."
328
-
329
- recent_entries = ws_server.conversation_history[-10:] # Last 10 entries
330
 
331
- formatted_text = "### Recent Transcriptions\n\n"
332
- for entry in recent_entries:
333
- timestamp = time.strftime("%H:%M:%S", time.localtime(entry['timestamp']))
334
- speaker = entry['speaker']
335
- text = entry['text']
336
- confidence = entry['confidence']
337
-
338
- # Extract speaker number for color matching with shared.py
339
- speaker_num = 0
340
- if speaker.startswith("Speaker_"):
341
- try:
342
- speaker_num = int(speaker.split("_")[1]) - 1
343
- except (ValueError, IndexError):
344
- speaker_num = 0
345
-
346
- # Use colors from shared.py if possible
347
  try:
348
- from shared import SPEAKER_COLORS
349
- color = SPEAKER_COLORS[speaker_num % len(SPEAKER_COLORS)]
350
- except (ImportError, IndexError):
351
- # Fallback colors
352
- colors = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4", "#FFEAA7", "#DDA0DD", "#98D8C8", "#F7DC6F"]
353
- color = colors[speaker_num % len(colors)]
354
-
355
- formatted_text += f"<span style='color:{color};font-weight:bold;'>[{timestamp}] {speaker}</span> (confidence: {confidence:.2f})\n"
356
- formatted_text += f"{text}\n\n"
357
 
358
- return formatted_text
359
-
360
- def clear_conversation_history():
361
- """Clear conversation history"""
362
- ws_server.conversation_history.clear()
363
- return "Conversation history cleared!"
364
-
365
- # Create Gradio interface
366
- with gr.Blocks(
367
- title="Real-time Audio Transcription Service",
368
- theme=gr.themes.Soft()
369
- ) as demo:
370
-
371
- gr.Markdown("# 🎤 Real-time Audio Transcription Service")
372
- gr.Markdown("This HuggingFace Space receives audio from your backend and returns transcription results with speaker diarization.")
373
-
374
- with gr.Tab("📊 Server Status"):
375
- status_display = gr.Markdown(get_server_status())
376
-
377
- with gr.Row():
378
- refresh_status_btn = gr.Button("🔄 Refresh Status", variant="primary")
379
-
380
- refresh_status_btn.click(
381
- fn=get_server_status,
382
- outputs=status_display,
383
- every=None
384
- )
385
-
386
- with gr.Tab("📝 Live Transcription"):
387
- transcription_display = gr.Markdown(get_recent_transcriptions())
388
-
389
- with gr.Row():
390
- refresh_transcription_btn = gr.Button("🔄 Refresh Transcriptions", variant="primary")
391
- clear_history_btn = gr.Button("🗑️ Clear History", variant="secondary")
392
-
393
- refresh_transcription_btn.click(
394
- fn=get_recent_transcriptions,
395
- outputs=transcription_display
396
- )
397
-
398
- clear_history_btn.click(
399
- fn=clear_conversation_history,
400
- outputs=gr.Markdown()
401
- )
402
-
403
- with gr.Tab("🔧 Connection Info"):
404
- gr.Markdown(f"""
405
- ### WebSocket Connection Details
406
-
407
- **WebSocket Endpoint**: `wss://{HF_SPACE_URL}/ws_inference`
408
-
409
- ### Backend Connection
410
- Your backend should connect to this WebSocket endpoint and:
411
-
412
- 1. **Send Audio Data**: Stream raw audio bytes to this endpoint
413
- 2. **Receive Results**: Get JSON responses with transcription results
414
 
415
- ### Expected Message Flow
416
-
417
- **Backend → HuggingFace**:
418
- - Raw audio bytes (binary data)
419
- - Configuration messages (JSON)
420
-
421
- **HuggingFace → Backend**:
422
- ```json
423
- {{
424
- "type": "processing_result",
425
- "timestamp": 1234567890.123,
426
- "data": {{
427
- "text": "transcribed text here",
428
- "speaker": "Speaker_1",
429
- "confidence": 0.95
430
- }}
431
- }}
432
- ```
433
-
434
- ### Test Connection
435
- Your backend is configured to connect to: `{ws_server.processing_stats.get('backend_url', HF_SPACE_URL)}`
436
- """)
437
 
438
- with gr.Tab("🚀 API Documentation"):
439
- gr.Markdown("""
440
- ### WebSocket API Reference
441
-
442
- #### Endpoint
443
- - **URL**: `/ws_inference`
444
- - **Protocol**: WebSocket
445
- - **Accepts**: Binary audio data + JSON messages
446
-
447
- #### Message Types
448
-
449
- ##### 1. Audio Processing
450
- - **Input**: Raw audio bytes (binary)
451
- - **Output**: Processing result (JSON)
452
-
453
- ##### 2. Configuration
454
- - **Input**:
455
- ```json
456
- {
457
- "type": "config",
458
- "settings": {
459
- "language": "en",
460
- "enable_diarization": true,
461
- "max_speakers": 4,
462
- "threshold": 0.65
463
- }
464
- }
465
- ```
466
-
467
- ##### 3. Status Check
468
- - **Input**: `{"type": "status_request"}`
469
- - **Output**: Server statistics
470
-
471
- ##### 4. Ping/Pong
472
- - **Input**: `{"type": "ping"}`
473
- - **Output**: `{"type": "pong", "timestamp": 1234567890}`
474
-
475
- #### Error Handling
476
- All errors are returned as:
477
- ```json
478
- {
479
- "type": "error",
480
- "message": "Error description",
481
- "timestamp": 1234567890.123
482
- }
483
- ```
484
- """)
485
-
486
  return demo
487
 
488
- def run_websocket_server():
489
- """Run WebSocket server in background thread"""
490
- loop = asyncio.new_event_loop()
491
- asyncio.set_event_loop(loop)
492
-
493
- try:
494
- logger.info("Starting WebSocket server thread...")
495
- loop.run_until_complete(ws_server.start_server())
496
- except Exception as e:
497
- logger.error(f"WebSocket server error: {e}")
498
- finally:
499
- loop.close()
500
-
501
- # Mount UI to inference.py
502
- def mount_ui(app):
503
- """Mount Gradio interface to FastAPI app"""
504
- try:
505
- demo = create_gradio_interface()
506
- # Mount without starting server (FastAPI will handle it)
507
- demo.mount_to_app(app)
508
- logger.info("Gradio UI mounted to FastAPI app")
509
- return True
510
- except Exception as e:
511
- logger.error(f"Error mounting UI: {e}")
512
- return False
513
-
514
- # Start WebSocket server in background
515
- logger.info("Initializing WebSocket server...")
516
- websocket_thread = threading.Thread(target=run_websocket_server, daemon=True)
517
- websocket_thread.start()
518
 
519
- # Give server time to start
520
- time.sleep(2)
 
521
 
522
- # Create and launch Gradio interface
523
  if __name__ == "__main__":
524
- demo = create_gradio_interface()
525
- demo.launch(
526
- server_name="0.0.0.0",
527
- server_port=7860,
528
- share=True,
529
- show_error=True
530
- )
 
1
  import gradio as gr
2
+ from fastapi import FastAPI
3
+ from shared import DEFAULT_CHANGE_THRESHOLD, DEFAULT_MAX_SPEAKERS, ABSOLUTE_MAX_SPEAKERS, FINAL_TRANSCRIPTION_MODEL, REALTIME_TRANSCRIPTION_MODEL
4
+ print(gr.__version__)
5
+ # Connection configuration (separate signaling server from model server)
6
+ # These will be replaced at deployment time with the correct URLs
7
+ RENDER_SIGNALING_URL = "wss://render-signal-audio.onrender.com/stream"
8
+ HF_SPACE_URL = "https://androidguy-speaker-diarization.hf.space"
9
+
10
+ def build_ui():
11
+ """Build Gradio UI for speaker diarization"""
12
+ with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Soft()) as demo:
13
+ # Add configuration variables to page using custom component
14
+ gr.HTML(
15
+ f"""
16
+ <!-- Configuration parameters -->
17
+ <script>
18
+ window.RENDER_SIGNALING_URL = "{RENDER_SIGNALING_URL}";
19
+ window.HF_SPACE_URL = "{HF_SPACE_URL}";
20
+ </script>
21
+ """
22
+ )
23
 
24
+ # Header and description
25
+ gr.Markdown("# 🎤 Live Speaker Diarization")
26
+ gr.Markdown(f"Real-time speech recognition with automatic speaker identification")
 
27
 
28
+ # Add transcription model info
29
+ gr.Markdown(f"**Using Models:** Final: {FINAL_TRANSCRIPTION_MODEL}, Realtime: {REALTIME_TRANSCRIPTION_MODEL}")
30
 
31
+ # Status indicator
32
+ connection_status = gr.HTML(
33
+ """<div class="status-indicator">
34
+ <span id="status-text" style="color:#888;">Waiting to connect...</span>
35
+ <span id="status-icon" style="width:10px; height:10px; display:inline-block;
36
+ background-color:#888; border-radius:50%; margin-left:5px;"></span>
37
+ </div>"""
38
+ )
39
+
40
+ with gr.Row():
41
+ with gr.Column(scale=2):
42
+ # Conversation display with embedded JavaScript for WebRTC and audio handling
43
+ conversation_display = gr.HTML(
44
+ """
45
+ <div class='output' id="conversation" style='padding:20px; background:#111; border-radius:10px;
46
+ min-height:400px; font-family:Arial; font-size:16px; line-height:1.5; overflow-y:auto;'>
47
+ <i>Click 'Start Listening' to begin...</i>
48
+ </div>
49
+
50
+ <script>
51
+ // Global variables
52
+ let rtcConnection;
53
+ let mediaStream;
54
+ let wsConnection;
55
+ let statusUpdateInterval;
56
 
57
+ // Check connection to HF space
58
+ async function checkHfConnection() {
59
+ try {
60
+ let response = await fetch(`${window.HF_SPACE_URL}/health`);
61
+ return response.ok;
62
+ } catch (err) {
63
+ return false;
64
+ }
65
  }
66
 
67
+ // Start the connection and audio streaming
68
+ async function startStreaming() {
69
+ try {
70
+ // Update status
71
+ updateStatus('connecting');
72
+
73
+ // Request microphone access
74
+ mediaStream = await navigator.mediaDevices.getUserMedia({audio: {
75
+ echoCancellation: true,
76
+ noiseSuppression: true,
77
+ autoGainControl: true
78
+ }});
79
+
80
+ // Set up WebRTC connection to Render signaling server
81
+ await setupWebRTC();
82
+
83
+ // Also connect WebSocket directly to HF Space for conversation updates
84
+ setupWebSocket();
85
+
86
+ // Start status update interval
87
+ statusUpdateInterval = setInterval(updateConnectionInfo, 5000);
88
+
89
+ // Update status
90
+ updateStatus('connected');
91
+
92
+ document.getElementById("conversation").innerHTML = "<i>Connected! Start speaking...</i>";
93
+ } catch (err) {
94
+ console.error('Error starting stream:', err);
95
+ updateStatus('error', err.message);
96
  }
97
+ }
98
+
99
+ // Set up WebRTC connection to Render signaling server
100
+ async function setupWebRTC() {
101
+ try {
102
+ if (rtcConnection) {
103
+ rtcConnection.close();
104
+ }
105
+
106
+ // Use FastRTC's connection approach
107
+ const pc = new RTCPeerConnection({
108
+ iceServers: [{ urls: 'stun:stun.l.google.com:19302' }]
109
+ });
110
+
111
+ // Add audio track
112
+ mediaStream.getAudioTracks().forEach(track => {
113
+ pc.addTrack(track, mediaStream);
114
+ });
115
+
116
+ // Connect to FastRTC signaling via WebSocket
117
+ const signalWs = new WebSocket(window.RENDER_SIGNALING_URL);
118
+
119
+ // Handle signaling messages
120
+ signalWs.onmessage = async (event) => {
121
+ const message = JSON.parse(event.data);
122
+
123
+ if (message.type === 'offer') {
124
+ await pc.setRemoteDescription(new RTCSessionDescription(message));
125
+ const answer = await pc.createAnswer();
126
+ await pc.setLocalDescription(answer);
127
+ signalWs.send(JSON.stringify(pc.localDescription));
128
+ } else if (message.type === 'candidate') {
129
+ if (message.candidate) {
130
+ await pc.addIceCandidate(new RTCIceCandidate(message));
131
+ }
132
+ }
133
+ };
134
+
135
+ // Send ICE candidates
136
+ pc.onicecandidate = (event) => {
137
+ if (event.candidate) {
138
+ signalWs.send(JSON.stringify({
139
+ type: 'candidate',
140
+ candidate: event.candidate
141
+ }));
142
+ }
143
+ };
144
+
145
+ // Keep connection reference
146
+ rtcConnection = pc;
147
+
148
+ // Wait for connection to be established
149
+ await new Promise((resolve, reject) => {
150
+ const timeout = setTimeout(() => reject(new Error("WebRTC connection timeout")), 10000);
151
+ pc.onconnectionstatechange = () => {
152
+ if (pc.connectionState === 'connected') {
153
+ clearTimeout(timeout);
154
+ resolve();
155
+ } else if (pc.connectionState === 'failed' || pc.connectionState === 'disconnected') {
156
+ clearTimeout(timeout);
157
+ reject(new Error("WebRTC connection failed"));
158
+ }
159
+ };
160
+ });
161
+
162
+ updateStatus('connected');
163
+ } catch (err) {
164
+ console.error('WebRTC setup error:', err);
165
+ updateStatus('error', 'WebRTC setup failed: ' + err.message);
166
  }
167
  }
168
 
169
+ // Set up WebSocket connection to HF Space for conversation updates
170
+ function setupWebSocket() {
171
+ const wsUrl = window.RENDER_SIGNALING_URL.replace('stream', 'ws_relay');
172
+ wsConnection = new WebSocket(wsUrl);
173
+
174
+ wsConnection.onopen = () => {
175
+ console.log('WebSocket connection established');
176
+ };
177
+
178
+ wsConnection.onmessage = (event) => {
179
+ try {
180
+ // Parse the JSON message
181
+ const message = JSON.parse(event.data);
182
+
183
+ // Process different message types
184
+ switch(message.type) {
185
+ case 'transcription':
186
+ // Handle transcription data
187
+ if (message && message.data && typeof message.data === 'object') {
188
+ document.getElementById("conversation").innerHTML = message.data.conversation_html ||
189
+ JSON.stringify(message.data);
190
+ }
191
+ break;
192
+
193
+ case 'processing_result':
194
+ // Handle individual audio chunk processing result
195
+ console.log('Processing result:', message.data);
196
+
197
+ // Update status info if needed
198
+ if (message.data && message.data.status === "processed") {
199
+ const statusElem = document.getElementById('status-text');
200
+ if (statusElem) {
201
+ const speakerId = message.data.speaker_id !== undefined ?
202
+ `Speaker ${message.data.speaker_id + 1}` : '';
203
+
204
+ if (speakerId) {
205
+ statusElem.textContent = `Connected - ${speakerId} active`;
206
+ }
207
+ }
208
+ } else if (message.data && message.data.status === "error") {
209
+ updateStatus('error', message.data.message || 'Processing error');
210
+ }
211
+ break;
212
+
213
+ case 'connection':
214
+ console.log('Connection status:', message.status);
215
+ updateStatus(message.status === 'connected' ? 'connected' : 'warning');
216
+ break;
217
+
218
+ case 'connection_established':
219
+ console.log('Connection established:', message);
220
+ updateStatus('connected');
221
+
222
+ // If initial conversation is provided, display it
223
+ if (message.conversation) {
224
+ document.getElementById("conversation").innerHTML = message.conversation;
225
+ }
226
+ break;
227
+
228
+ case 'conversation_update':
229
+ if (message.conversation_html) {
230
+ document.getElementById("conversation").innerHTML = message.conversation_html;
231
+ }
232
+ break;
233
+
234
+ case 'conversation_cleared':
235
+ document.getElementById("conversation").innerHTML =
236
+ "<i>Conversation cleared. Start speaking again...</i>";
237
+ break;
238
+
239
+ case 'error':
240
+ console.error('Error message from server:', message.message);
241
+ updateStatus('warning', message.message);
242
+ break;
243
+
244
+ default:
245
+ // If it's just HTML content without proper JSON structure (legacy format)
246
+ document.getElementById("conversation").innerHTML = event.data;
247
+ }
248
+
249
+ // Auto-scroll to bottom
250
+ const container = document.getElementById("conversation");
251
+ container.scrollTop = container.scrollHeight;
252
+ } catch (e) {
253
+ // Fallback for non-JSON messages (legacy format)
254
+ document.getElementById("conversation").innerHTML = event.data;
255
+
256
+ // Auto-scroll to bottom
257
+ const container = document.getElementById("conversation");
258
+ container.scrollTop = container.scrollHeight;
259
+ }
260
+ };
261
+
262
+ wsConnection.onerror = (error) => {
263
+ console.error('WebSocket error:', error);
264
+ updateStatus('warning', 'WebSocket error');
265
+ };
266
+
267
+ wsConnection.onclose = () => {
268
+ console.log('WebSocket connection closed');
269
+ // Try to reconnect after a delay
270
+ setTimeout(setupWebSocket, 3000);
271
+ };
272
  }
273
 
274
+ // Update connection info in the UI
275
+ async function updateConnectionInfo() {
276
+ try {
277
+ const hfConnected = await checkHfConnection();
278
+ if (!hfConnected) {
279
+ updateStatus('warning', 'HF Space connection issue');
280
+ } else if (rtcConnection?.connectionState === 'connected' ||
281
+ rtcConnection?.iceConnectionState === 'connected') {
282
+ updateStatus('connected');
283
+ } else {
284
+ updateStatus('warning', 'Connection unstable');
285
+ }
286
+ } catch (err) {
287
+ console.error('Error updating connection info:', err);
288
+ }
289
  }
290
 
291
+ // Update status indicator
292
+ function updateStatus(status, message = '') {
293
+ const statusText = document.getElementById('status-text');
294
+ const statusIcon = document.getElementById('status-icon');
295
+
296
+ switch(status) {
297
+ case 'connected':
298
+ statusText.textContent = 'Connected';
299
+ statusIcon.style.backgroundColor = '#4CAF50';
300
+ break;
301
+ case 'connecting':
302
+ statusText.textContent = 'Connecting...';
303
+ statusIcon.style.backgroundColor = '#FFC107';
304
+ break;
305
+ case 'disconnected':
306
+ statusText.textContent = 'Disconnected';
307
+ statusIcon.style.backgroundColor = '#9E9E9E';
308
+ break;
309
+ case 'error':
310
+ statusText.textContent = 'Error: ' + message;
311
+ statusIcon.style.backgroundColor = '#F44336';
312
+ break;
313
+ case 'warning':
314
+ statusText.textContent = 'Warning: ' + message;
315
+ statusIcon.style.backgroundColor = '#FF9800';
316
+ break;
317
+ default:
318
+ statusText.textContent = 'Unknown';
319
+ statusIcon.style.backgroundColor = '#9E9E9E';
320
+ }
321
+ }
322
 
323
+ // Stop streaming and clean up
324
+ function stopStreaming() {
325
+ // Close WebRTC connection
326
+ if (rtcConnection) {
327
+ rtcConnection.close();
328
+ rtcConnection = null;
329
+ }
330
+
331
+ // Close WebSocket
332
+ if (wsConnection) {
333
+ wsConnection.close();
334
+ wsConnection = null;
335
+ }
336
+
337
+ // Stop all tracks in media stream
338
+ if (mediaStream) {
339
+ mediaStream.getTracks().forEach(track => track.stop());
340
+ mediaStream = null;
341
+ }
342
+
343
+ // Clear interval
344
+ if (statusUpdateInterval) {
345
+ clearInterval(statusUpdateInterval);
346
+ statusUpdateInterval = null;
347
+ }
348
+
349
+ // Update status
350
+ updateStatus('disconnected');
351
+ }
352
+
353
+ // Set up event listeners when the DOM is loaded
354
+ document.addEventListener('DOMContentLoaded', () => {
355
+ updateStatus('disconnected');
356
+ });
357
+ </script>
358
+ """,
359
+ label="Live Conversation"
360
+ )
361
 
362
+ # Control buttons
363
+ with gr.Row():
364
+ start_btn = gr.Button("▶️ Start Listening", variant="primary", size="lg")
365
+ stop_btn = gr.Button("⏹️ Stop", variant="stop", size="lg")
366
+ clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="lg")
367
 
368
+ # Status display
369
+ status_output = gr.Markdown(
370
+ """
371
+ ## System Status
372
+ Waiting to connect...
373
+
374
+ *Click Start Listening to begin*
375
+ """,
376
+ label="Status Information"
377
+ )
378
+
379
+ with gr.Column(scale=1):
380
+ # Settings
381
+ gr.Markdown("## ⚙️ Settings")
382
 
383
+ threshold_slider = gr.Slider(
384
+ minimum=0.3,
385
+ maximum=0.9,
386
+ step=0.05,
387
+ value=DEFAULT_CHANGE_THRESHOLD,
388
+ label="Speaker Change Sensitivity",
389
+ info="Lower = more sensitive (more speaker changes)"
390
+ )
391
 
392
+ max_speakers_slider = gr.Slider(
393
+ minimum=2,
394
+ maximum=ABSOLUTE_MAX_SPEAKERS,
395
+ step=1,
396
+ value=DEFAULT_MAX_SPEAKERS,
397
+ label="Maximum Speakers"
398
+ )
399
 
400
+ update_btn = gr.Button("Update Settings", variant="secondary")
401
 
402
+ # Instructions
403
+ gr.Markdown("""
404
+ ## 📋 Instructions
405
+ 1. **Start Listening** - grant the browser access to your microphone
406
+ 2. **Speak** - the system will transcribe and identify speakers
407
+ 3. **Stop** when finished
408
+ 4. **Clear** to reset conversation
409
 
410
+ ## 🎨 Speaker Colors
411
+ - 🔴 Speaker 1 (Red)
412
+ - 🟢 Speaker 2 (Teal)
413
+ - 🔵 Speaker 3 (Blue)
414
+ - 🟡 Speaker 4 (Green)
415
+ - Speaker 5 (Yellow)
416
+ - 🟣 Speaker 6 (Plum)
417
+ - 🟤 Speaker 7 (Mint)
418
+ - 🟠 Speaker 8 (Gold)
419
+ """)
420
 
421
+ # JavaScript to connect buttons to the script functions
422
+ gr.HTML("""
423
+ <script>
424
+ // Wait for Gradio to fully load
425
+ document.addEventListener('DOMContentLoaded', () => {
426
+ // Wait a bit for Gradio buttons to be created
427
+ setTimeout(() => {
428
+ // Get the buttons
429
+ const startBtn = document.querySelector('button[aria-label="Start Listening"]');
430
+ const stopBtn = document.querySelector('button[aria-label="Stop"]');
431
+ const clearBtn = document.querySelector('button[aria-label="Clear"]');
432
+
433
+ if (startBtn) startBtn.onclick = () => startStreaming();
434
+ if (stopBtn) stopBtn.onclick = () => stopStreaming();
435
+ if (clearBtn) clearBtn.onclick = () => {
436
+ // Make API call to clear conversation
437
+ fetch(`${window.HF_SPACE_URL}/clear`, {
438
+ method: 'POST'
439
+ }).then(resp => resp.json())
440
+ .then(data => {
441
+ document.getElementById("conversation").innerHTML =
442
+ "<i>Conversation cleared. Start speaking again...</i>";
443
+ });
444
+ }
445
+
446
+ // Set up settings update
447
+ const updateBtn = document.querySelector('button[aria-label="Update Settings"]');
448
+ if (updateBtn) updateBtn.onclick = () => {
449
+ const threshold = document.querySelector('input[aria-label="Speaker Change Sensitivity"]').value;
450
+ const maxSpeakers = document.querySelector('input[aria-label="Maximum Speakers"]').value;
451
+
452
+ fetch(`${window.HF_SPACE_URL}/settings?threshold=${threshold}&max_speakers=${maxSpeakers}`, {
453
+ method: 'POST'
454
+ }).then(resp => resp.json())
455
+ .then(data => {
456
+ const statusOutput = document.querySelector('.prose');
457
+ if (statusOutput) {
458
+ statusOutput.innerHTML = `
459
+ <h2>System Status</h2>
460
+ <p>Settings updated:</p>
461
+ <ul>
462
+ <li>Threshold: ${threshold}</li>
463
+ <li>Max Speakers: ${maxSpeakers}</li>
464
+ </ul>
465
+ <p>Transcription Models:</p>
466
+ <ul>
467
+ <li>Final: ${window.FINAL_TRANSCRIPTION_MODEL || "distil-large-v3"}</li>
468
+ <li>Realtime: ${window.REALTIME_TRANSCRIPTION_MODEL || "distil-small.en"}</li>
469
+ </ul>
470
+ `;
471
+ }
472
+ });
473
+ }
474
+ }, 1000);
475
+ });
476
+ </script>
477
+ """)
478
 
479
+ # Set up periodic status updates
480
+ def get_status():
481
+ """API call to get system status - called periodically"""
482
+ import requests
483
  try:
484
+ resp = requests.get(f"{HF_SPACE_URL}/status")
485
+ if resp.status_code == 200:
486
+ return resp.json().get('status', 'No status information')
487
+ return "Error getting status"
488
+ except Exception as e:
489
+ return f"Connection error: {str(e)}"
 
 
 
490
 
491
+ status_timer = gr.Timer(5)
492
+ status_timer.tick(fn=get_status, outputs=status_output)
493
494
495
  return demo
496
 
497
+ # Create Gradio interface
498
+ demo = build_ui()
499
 
500
+ def mount_ui(app: FastAPI):
501
+ """Mount Gradio app to FastAPI"""
502
+ return gr.mount_gradio_app(app, demo, path="/ui")
503
 
504
+ # For standalone testing
505
  if __name__ == "__main__":
506
+ demo.launch()
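
For reference, here is a minimal sketch of how the new `mount_ui` helper could be wired into the Space's FastAPI entry point. The file name `app.py`, the FastAPI instance, and the `/health` and `/status` handlers below are illustrative assumptions, not part of this commit; they simply mirror the endpoints the UI's JavaScript and `get_status` poller already call.

```python
# app.py - illustrative sketch only; not part of this commit.
# Assumes ui.py sits next to this file and exposes mount_ui(app).
from fastapi import FastAPI

from ui import mount_ui

app = FastAPI()

@app.get("/health")
def health():
    # The UI's checkHfConnection() fetches this endpoint.
    return {"status": "ok"}

@app.get("/status")
def status():
    # The UI's gr.Timer polls this every 5 seconds and displays the string.
    return {"status": "Diarization service running"}

# Serve the Gradio Blocks UI under /ui on the same server.
mount_ui(app)

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
```

With `mount_ui` mounting via `gr.mount_gradio_app`, the interface would be reachable under `/ui` on the same port as the REST endpoints the page polls.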