Saiyaswanth007 committed on
Commit
42eafc4
·
1 Parent(s): b37c0fc

Code fixing

Browse files
Files changed (1) hide show
  1. app.py +244 -190
app.py CHANGED
@@ -8,14 +8,20 @@ import os
8
  import urllib.request
9
  import torchaudio
10
  from scipy.spatial.distance import cosine
 
 
11
  import json
12
  import io
13
  import wave
14
- from fastrtc import Stream, ReplyOnPause, AsyncStreamHandler, get_stt_model
15
 
16
  # Simplified configuration parameters
17
  SILENCE_THRESHS = [0, 0.4]
18
- FINAL_TRANSCRIPTION_MODEL = "moonshine/base" # Using FastRTC's moonshine model
 
 
 
 
19
  SILERO_SENSITIVITY = 0.4
20
  WEBRTC_SENSITIVITY = 3
21
  MIN_LENGTH_OF_RECORDING = 0.7
@@ -267,65 +273,12 @@ class SpeakerChangeDetector:
267
  }
268
 
269
 
270
- class DiarizationStreamHandler(AsyncStreamHandler):
271
- """FastRTC stream handler for real-time diarization"""
272
- def __init__(self, diarization_system):
273
- super().__init__(input_sample_rate=16000)
274
- self.diarization_system = diarization_system
275
- self.stt_model = get_stt_model(model=FINAL_TRANSCRIPTION_MODEL)
276
- self.current_text = ""
277
- self.current_audio_buffer = []
278
- self.transcript_queue = queue.Queue()
279
-
280
- def copy(self):
281
- return DiarizationStreamHandler(self.diarization_system)
282
-
283
- async def start_up(self):
284
- """Initialize the stream handler"""
285
- pass
286
-
287
- async def receive(self, frame):
288
- """Process incoming audio frame"""
289
- # Extract audio data
290
- sample_rate, audio_data = frame
291
-
292
- # Convert to numpy array if needed
293
- if isinstance(audio_data, torch.Tensor):
294
- audio_data = audio_data.numpy()
295
-
296
- # Add to buffer
297
- self.current_audio_buffer.append(audio_data)
298
-
299
- # If buffer is large enough, process it
300
- if len(self.current_audio_buffer) > 3: # Process ~1.5 seconds of audio
301
- # Concatenate audio data
302
- combined_audio = np.concatenate(self.current_audio_buffer)
303
-
304
- # Run speech-to-text
305
- text = self.stt_model.stt((16000, combined_audio))
306
-
307
- if text and text.strip():
308
- # Save text and audio for processing
309
- self.transcript_queue.put((text, combined_audio))
310
- self.current_text = text
311
-
312
- # Reset buffer but keep some overlap
313
- if len(self.current_audio_buffer) > 5:
314
- self.current_audio_buffer = self.current_audio_buffer[-2:]
315
-
316
- async def emit(self):
317
- """Emit processed data"""
318
- # Return current text as dummy; actual processing is done in background
319
- return self.current_text
320
-
321
-
322
  class RealtimeSpeakerDiarization:
323
  def __init__(self):
324
  self.encoder = None
325
  self.audio_processor = None
326
  self.speaker_detector = None
327
- self.stream = None
328
- self.stream_handler = None
329
  self.sentence_queue = queue.Queue()
330
  self.full_sentences = []
331
  self.sentence_speakers = []
@@ -335,6 +288,7 @@ class RealtimeSpeakerDiarization:
335
  self.is_running = False
336
  self.change_threshold = DEFAULT_CHANGE_THRESHOLD
337
  self.max_speakers = DEFAULT_MAX_SPEAKERS
 
338
 
339
  def initialize_models(self):
340
  """Initialize the speaker encoder model"""
@@ -361,69 +315,45 @@ class RealtimeSpeakerDiarization:
361
  print(f"Model initialization error: {e}")
362
  return False
363
 
364
- def start_stream(self, app):
365
- """Start the FastRTC stream"""
366
- if self.encoder is None:
367
- return "Please initialize models first!"
368
-
369
- try:
370
- # Create a FastRTC stream handler
371
- self.stream_handler = DiarizationStreamHandler(self)
372
-
373
- # Create FastRTC stream
374
- self.stream = Stream(
375
- handler=self.stream_handler,
376
- modality="audio",
377
- mode="send-receive"
378
  )
379
-
380
- # Mount the stream to the provided FastAPI app
381
- self.stream.mount(app)
382
-
383
- # Start sentence processing thread
384
- self.is_running = True
385
- self.sentence_thread = threading.Thread(target=self.process_sentence_queue, daemon=True)
386
- self.sentence_thread.start()
387
-
388
- # Start diarization processor thread
389
- self.diarization_thread = threading.Thread(target=self.process_transcript_queue, daemon=True)
390
- self.diarization_thread.start()
391
-
392
- return "Stream started successfully! Ready for audio input."
393
-
394
- except Exception as e:
395
- return f"Error starting stream: {e}"
396
 
397
- def process_transcript_queue(self):
398
- """Process transcripts from the stream handler"""
399
- while self.is_running:
 
400
  try:
401
- if self.stream_handler and not self.stream_handler.transcript_queue.empty():
402
- text, audio_data = self.stream_handler.transcript_queue.get(timeout=1)
403
-
404
- # Add to sentence queue for diarization
405
- self.pending_sentences.append(text)
406
- self.sentence_queue.put((text, audio_data))
407
- except queue.Empty:
408
- time.sleep(0.1) # Short sleep to prevent CPU hogging
409
  except Exception as e:
410
- print(f"Error processing transcript queue: {e}")
411
- time.sleep(0.5) # Slightly longer sleep on error
412
 
413
  def process_sentence_queue(self):
414
  """Process sentences in the queue for speaker detection"""
415
  while self.is_running:
416
  try:
417
- text, audio_data = self.sentence_queue.get(timeout=1)
418
 
419
  # Convert audio data to int16
420
- if isinstance(audio_data, np.ndarray):
421
- if audio_data.dtype != np.int16:
422
- audio_int16 = (audio_data * 32767).astype(np.int16)
423
- else:
424
- audio_int16 = audio_data
425
- else:
426
- audio_int16 = np.int16(audio_data * 32767)
427
 
428
  # Extract speaker embedding
429
  speaker_embedding = self.audio_processor.extract_embedding(audio_int16)
@@ -442,16 +372,73 @@ class RealtimeSpeakerDiarization:
442
  # Remove from pending
443
  if text in self.pending_sentences:
444
  self.pending_sentences.remove(text)
 
 
 
445
 
446
  except queue.Empty:
447
  continue
448
  except Exception as e:
449
  print(f"Error processing sentence: {e}")
450
 
451
- def stop_stream(self):
452
- """Stop the stream and processing"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
453
  self.is_running = False
454
- return "Stream stopped!"
 
 
455
 
456
  def clear_conversation(self):
457
  """Clear all conversation data"""
@@ -460,6 +447,7 @@ class RealtimeSpeakerDiarization:
460
  self.pending_sentences = []
461
  self.displayed_text = ""
462
  self.last_realtime_text = ""
 
463
 
464
  if self.speaker_detector:
465
  self.speaker_detector = SpeakerChangeDetector(
@@ -491,6 +479,7 @@ class RealtimeSpeakerDiarization:
491
  sentence_text, _ = sentence
492
  if i >= len(self.sentence_speakers):
493
  color = "#FFFFFF"
 
494
  else:
495
  speaker_id = self.sentence_speakers[i]
496
  color = self.speaker_detector.get_color_for_speaker(speaker_id)
@@ -539,38 +528,130 @@ class RealtimeSpeakerDiarization:
539
  except Exception as e:
540
  return f"Error getting status: {e}"
541
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
542
 
543
  # Global instance
544
  diarization_system = RealtimeSpeakerDiarization()
545
 
546
 
547
- # Create Gradio interface with FastAPI app integrated
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
548
  def create_interface():
549
- app = gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Monochrome())
550
-
551
- with app:
552
  gr.Markdown("# 🎤 Real-time Speech Recognition with Speaker Diarization")
553
- gr.Markdown("This app performs real-time speech recognition with automatic speaker identification and color-coding using FastRTC.")
554
 
555
  with gr.Row():
556
  with gr.Column(scale=2):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
557
  # Main conversation display
558
  conversation_output = gr.HTML(
559
- value="<i>Click 'Initialize System' and then 'Start Stream' to begin...</i>",
560
  label="Live Conversation"
561
  )
562
 
563
- # FastRTC microphone widget for visualization only (the real audio comes through FastRTC stream)
564
- audio_widget = gr.Audio(
565
- label="🎙️ Microphone Input (Click Start Stream to enable)",
566
- type="microphone"
567
- )
568
-
569
  # Control buttons
570
  with gr.Row():
571
  init_btn = gr.Button("🔧 Initialize System", variant="secondary")
572
- start_btn = gr.Button("🎙️ Start Stream", variant="primary", interactive=False)
573
- stop_btn = gr.Button("⏹️ Stop Stream", variant="stop", interactive=False)
574
  clear_btn = gr.Button("🗑️ Clear Conversation", interactive=False)
575
 
576
  # Status display
@@ -608,28 +689,12 @@ def create_interface():
608
  gr.Markdown("## 📝 Instructions")
609
  gr.Markdown("""
610
  1. Click **Initialize System** to load models
611
- 2. Click **Start Stream** to begin processing
612
- 3. Allow microphone access when prompted
613
- 4. Speak into your microphone
614
- 5. Watch real-time transcription with speaker labels
615
- 6. Adjust settings as needed
616
- """)
617
-
618
- # QR code for mobile access
619
- gr.Markdown("## 📱 Mobile Access")
620
- gr.Markdown("Scan this QR code to access from mobile device:")
621
- qr_code = gr.HTML("""
622
- <div id="qrcode" style="text-align: center;"></div>
623
- <script src="https://cdn.jsdelivr.net/npm/qrcode-generator@1.4.4/qrcode.min.js"></script>
624
- <script>
625
- setTimeout(function() {
626
- var currentUrl = window.location.href;
627
- var qr = qrcode(0, 'M');
628
- qr.addData(currentUrl);
629
- qr.make();
630
- document.getElementById('qrcode').innerHTML = qr.createImgTag(5);
631
- }, 1000);
632
- </script>
633
  """)
634
 
635
  # Speaker color legend
@@ -639,10 +704,17 @@ def create_interface():
639
  color_info.append(f'<span style="color:{color};">■</span> Speaker {i+1} ({name})')
640
 
641
  gr.HTML("<br>".join(color_info[:DEFAULT_MAX_SPEAKERS]))
 
 
 
 
 
 
 
642
 
643
  # Auto-refresh conversation and status
644
  def refresh_display():
645
- return get_formatted_conversation(), get_status()
646
 
647
  # Event handlers
648
  def on_initialize():
@@ -652,7 +724,7 @@ def create_interface():
652
  result,
653
  gr.update(interactive=True), # start_btn
654
  gr.update(interactive=True), # clear_btn
655
- get_formatted_conversation(),
656
  get_status()
657
  )
658
  else:
@@ -660,58 +732,26 @@ def create_interface():
660
  result,
661
  gr.update(interactive=False), # start_btn
662
  gr.update(interactive=False), # clear_btn
663
- get_formatted_conversation(),
664
  get_status()
665
  )
666
 
667
- def on_start_stream():
668
- result = start_stream(app)
669
  return (
670
  result,
671
  gr.update(interactive=False), # start_btn
672
  gr.update(interactive=True), # stop_btn
673
  )
674
 
675
- def on_stop_stream():
676
- result = stop_stream()
677
  return (
678
  result,
679
  gr.update(interactive=True), # start_btn
680
  gr.update(interactive=False), # stop_btn
681
  )
682
 
683
- def initialize_system():
684
- """Initialize the diarization system"""
685
- success = diarization_system.initialize_models()
686
- if success:
687
- return "✅ System initialized successfully! Models loaded."
688
- else:
689
- return "❌ Failed to initialize system. Please check the logs."
690
-
691
- def start_stream(app):
692
- """Start the FastRTC stream"""
693
- return diarization_system.start_stream(app)
694
-
695
- def stop_stream():
696
- """Stop the FastRTC stream"""
697
- return diarization_system.stop_stream()
698
-
699
- def clear_conversation():
700
- """Clear the conversation"""
701
- return diarization_system.clear_conversation()
702
-
703
- def update_settings(threshold, max_speakers):
704
- """Update system settings"""
705
- return diarization_system.update_settings(threshold, max_speakers)
706
-
707
- def get_formatted_conversation():
708
- """Get the current conversation"""
709
- return diarization_system.get_formatted_conversation()
710
-
711
- def get_status():
712
- """Get system status"""
713
- return diarization_system.get_status_info()
714
-
715
  # Connect event handlers
716
  init_btn.click(
717
  on_initialize,
@@ -719,12 +759,12 @@ def create_interface():
719
  )
720
 
721
  start_btn.click(
722
- on_start_stream,
723
  outputs=[status_output, start_btn, stop_btn]
724
  )
725
 
726
  stop_btn.click(
727
- on_stop_stream,
728
  outputs=[status_output, start_btn, stop_btn]
729
  )
730
 
@@ -739,7 +779,7 @@ def create_interface():
739
  outputs=[status_output]
740
  )
741
 
742
- # Auto-refresh every 2 seconds when streaming
743
  refresh_timer = gr.Timer(2.0)
744
  refresh_timer.tick(
745
  refresh_display,
@@ -749,10 +789,24 @@ def create_interface():
749
  return app
750
 
751
 
752
- if __name__ == "__main__":
 
 
 
 
753
  app = create_interface()
 
 
 
 
 
754
  app.launch(
755
  server_name="0.0.0.0",
756
  server_port=7860,
757
  share=True
758
  )
 
 
 
 
 
 
8
  import urllib.request
9
  import torchaudio
10
  from scipy.spatial.distance import cosine
11
+ from RealtimeSTT import AudioToTextRecorder
12
+ from fastrtc import Stream, AsyncStreamHandler, ReplyOnPause
13
  import json
14
  import io
15
  import wave
16
+ import asyncio
17
 
18
  # Simplified configuration parameters
19
  SILENCE_THRESHS = [0, 0.4]
20
+ FINAL_TRANSCRIPTION_MODEL = "distil-large-v3"
21
+ FINAL_BEAM_SIZE = 5
22
+ REALTIME_TRANSCRIPTION_MODEL = "distil-small.en"
23
+ REALTIME_BEAM_SIZE = 5
24
+ TRANSCRIPTION_LANGUAGE = "en"
25
  SILERO_SENSITIVITY = 0.4
26
  WEBRTC_SENSITIVITY = 3
27
  MIN_LENGTH_OF_RECORDING = 0.7
 
273
  }
274
 
275
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
  class RealtimeSpeakerDiarization:
277
  def __init__(self):
278
  self.encoder = None
279
  self.audio_processor = None
280
  self.speaker_detector = None
281
+ self.recorder = None
 
282
  self.sentence_queue = queue.Queue()
283
  self.full_sentences = []
284
  self.sentence_speakers = []
 
288
  self.is_running = False
289
  self.change_threshold = DEFAULT_CHANGE_THRESHOLD
290
  self.max_speakers = DEFAULT_MAX_SPEAKERS
291
+ self.current_conversation = ""
292
 
293
  def initialize_models(self):
294
  """Initialize the speaker encoder model"""
 
315
  print(f"Model initialization error: {e}")
316
  return False
317
 
318
+ def live_text_detected(self, text):
319
+ """Callback for real-time transcription updates"""
320
+ text = text.strip()
321
+ if text:
322
+ sentence_delimiters = '.?!。'
323
+ prob_sentence_end = (
324
+ len(self.last_realtime_text) > 0
325
+ and text[-1] in sentence_delimiters
326
+ and self.last_realtime_text[-1] in sentence_delimiters
 
 
 
 
 
327
  )
328
+
329
+ self.last_realtime_text = text
330
+
331
+ if prob_sentence_end and FAST_SENTENCE_END:
332
+ self.recorder.stop()
333
+ elif prob_sentence_end:
334
+ self.recorder.post_speech_silence_duration = SILENCE_THRESHS[0]
335
+ else:
336
+ self.recorder.post_speech_silence_duration = SILENCE_THRESHS[1]
 
 
 
 
 
 
 
 
337
 
338
+ def process_final_text(self, text):
339
+ """Process final transcribed text with speaker embedding"""
340
+ text = text.strip()
341
+ if text:
342
  try:
343
+ bytes_data = self.recorder.last_transcription_bytes
344
+ self.sentence_queue.put((text, bytes_data))
345
+ self.pending_sentences.append(text)
 
 
 
 
 
346
  except Exception as e:
347
+ print(f"Error processing final text: {e}")
 
348
 
349
  def process_sentence_queue(self):
350
  """Process sentences in the queue for speaker detection"""
351
  while self.is_running:
352
  try:
353
+ text, bytes_data = self.sentence_queue.get(timeout=1)
354
 
355
  # Convert audio data to int16
356
+ audio_int16 = np.frombuffer(bytes_data, dtype=np.int16)
 
 
 
 
 
 
357
 
358
  # Extract speaker embedding
359
  speaker_embedding = self.audio_processor.extract_embedding(audio_int16)
 
372
  # Remove from pending
373
  if text in self.pending_sentences:
374
  self.pending_sentences.remove(text)
375
+
376
+ # Update conversation display
377
+ self.current_conversation = self.get_formatted_conversation()
378
 
379
  except queue.Empty:
380
  continue
381
  except Exception as e:
382
  print(f"Error processing sentence: {e}")
383
 
384
+ def start_recording(self):
385
+ """Start the recording and transcription process"""
386
+ if self.encoder is None:
387
+ return "Please initialize models first!"
388
+
389
+ try:
390
+ # Setup recorder configuration for WebRTC input
391
+ recorder_config = {
392
+ 'spinner': False,
393
+ 'use_microphone': False, # We'll feed audio manually
394
+ 'model': FINAL_TRANSCRIPTION_MODEL,
395
+ 'language': TRANSCRIPTION_LANGUAGE,
396
+ 'silero_sensitivity': SILERO_SENSITIVITY,
397
+ 'webrtc_sensitivity': WEBRTC_SENSITIVITY,
398
+ 'post_speech_silence_duration': SILENCE_THRESHS[1],
399
+ 'min_length_of_recording': MIN_LENGTH_OF_RECORDING,
400
+ 'pre_recording_buffer_duration': PRE_RECORDING_BUFFER_DURATION,
401
+ 'min_gap_between_recordings': 0,
402
+ 'enable_realtime_transcription': True,
403
+ 'realtime_processing_pause': 0,
404
+ 'realtime_model_type': REALTIME_TRANSCRIPTION_MODEL,
405
+ 'on_realtime_transcription_update': self.live_text_detected,
406
+ 'beam_size': FINAL_BEAM_SIZE,
407
+ 'beam_size_realtime': REALTIME_BEAM_SIZE,
408
+ 'buffer_size': BUFFER_SIZE,
409
+ 'sample_rate': SAMPLE_RATE,
410
+ }
411
+
412
+ self.recorder = AudioToTextRecorder(**recorder_config)
413
+
414
+ # Start sentence processing thread
415
+ self.is_running = True
416
+ self.sentence_thread = threading.Thread(target=self.process_sentence_queue, daemon=True)
417
+ self.sentence_thread.start()
418
+
419
+ # Start transcription thread
420
+ self.transcription_thread = threading.Thread(target=self.run_transcription, daemon=True)
421
+ self.transcription_thread.start()
422
+
423
+ return "Recording started successfully! FastRTC audio input ready."
424
+
425
+ except Exception as e:
426
+ return f"Error starting recording: {e}"
427
+
428
+ def run_transcription(self):
429
+ """Run the transcription loop"""
430
+ try:
431
+ while self.is_running:
432
+ self.recorder.text(self.process_final_text)
433
+ except Exception as e:
434
+ print(f"Transcription error: {e}")
435
+
436
+ def stop_recording(self):
437
+ """Stop the recording process"""
438
  self.is_running = False
439
+ if self.recorder:
440
+ self.recorder.stop()
441
+ return "Recording stopped!"
442
 
443
  def clear_conversation(self):
444
  """Clear all conversation data"""
 
447
  self.pending_sentences = []
448
  self.displayed_text = ""
449
  self.last_realtime_text = ""
450
+ self.current_conversation = "Conversation cleared!"
451
 
452
  if self.speaker_detector:
453
  self.speaker_detector = SpeakerChangeDetector(
 
479
  sentence_text, _ = sentence
480
  if i >= len(self.sentence_speakers):
481
  color = "#FFFFFF"
482
+ speaker_name = "Unknown"
483
  else:
484
  speaker_id = self.sentence_speakers[i]
485
  color = self.speaker_detector.get_color_for_speaker(speaker_id)
 
528
  except Exception as e:
529
  return f"Error getting status: {e}"
530
 
531
+ def process_audio(self, audio_data):
532
+ """Process audio data from FastRTC"""
533
+ if not self.is_running or not self.recorder:
534
+ return
535
+
536
+ try:
537
+ # Extract audio data from FastRTC format (sample_rate, numpy_array)
538
+ sample_rate, audio_array = audio_data
539
+
540
+ # Convert to int16 format
541
+ if audio_array.dtype != np.int16:
542
+ audio_array = (audio_array * 32767).astype(np.int16)
543
+
544
+ # Convert to bytes and feed to recorder
545
+ audio_bytes = audio_array.tobytes()
546
+ self.recorder.feed_audio(audio_bytes)
547
+ except Exception as e:
548
+ print(f"Error processing FastRTC audio: {e}")
549
+
550
+
551
+ # FastRTC Audio Handler
552
class DiarizationHandler(AsyncStreamHandler):
    """FastRTC stream handler that forwards incoming audio to the diarization system."""

    def __init__(self, diarization_system):
        """Bind the handler to the diarization system that consumes the audio.

        Args:
            diarization_system: Object exposing ``is_running`` and
                ``process_audio(frame)``.
        """
        # Request frames at the rate the recorder is configured with, so the
        # audio fed to it is not interpreted at the wrong speed.
        # NOTE(review): assumes SAMPLE_RATE is the rate the recorder expects
        # for fed audio; confirm.
        super().__init__(input_sample_rate=SAMPLE_RATE)
        self.diarization_system = diarization_system

    def copy(self):
        """Return a fresh handler bound to the same diarization system.

        FastRTC creates a handler copy per connection.
        """
        return DiarizationHandler(self.diarization_system)

    async def emit(self):
        """Not used in this implementation; nothing is sent back to the client."""
        return None

    async def receive(self, data):
        """Receive audio data from FastRTC and pass it on while recording is active."""
        if self.diarization_system.is_running:
            self.diarization_system.process_audio(data)
565
+
566
 
567
# Global instance shared by the module-level wrapper functions and the FastRTC handler
diarization_system = RealtimeSpeakerDiarization()
569
 
570
 
571
def initialize_system():
    """Load the diarization models and report the outcome as a status string."""
    if diarization_system.initialize_models():
        return "✅ System initialized successfully! Models loaded."
    return "❌ Failed to initialize system. Please check the logs."
578
+
579
+
580
def start_recording():
    """Start recording on the shared diarization system and return its status message."""
    status = diarization_system.start_recording()
    return status
583
+
584
+
585
def stop_recording():
    """Stop recording on the shared diarization system and return its status message."""
    status = diarization_system.stop_recording()
    return status
588
+
589
+
590
def clear_conversation():
    """Clear the shared diarization system's conversation and return its result."""
    outcome = diarization_system.clear_conversation()
    return outcome
593
+
594
+
595
def update_settings(threshold, max_speakers):
    """Forward new settings to the shared diarization system and return its result.

    Args:
        threshold: Passed through as the first argument of ``update_settings``.
        max_speakers: Passed through as the second argument.
    """
    outcome = diarization_system.update_settings(threshold, max_speakers)
    return outcome
598
+
599
+
600
def get_conversation():
    """Return the formatted conversation from the shared diarization system."""
    formatted = diarization_system.get_formatted_conversation()
    return formatted
603
+
604
+
605
def get_status():
    """Return the status information reported by the shared diarization system."""
    info = diarization_system.get_status_info()
    return info
608
+
609
+
610
+ # Setup FastRTC stream handler
611
def setup_fastrtc_handler():
    """Build the FastRTC audio stream wired to the shared diarization system.

    Returns:
        The configured ``Stream``.
    """
    handler = DiarizationHandler(diarization_system)
    # The handler consumes microphone audio sent by the client. "receive"
    # streams server -> client only, so the handler would never be given any
    # audio; "send-receive" keeps the client -> server direction open.
    stream = Stream(handler=handler, modality="audio", mode="send-receive")
    return stream
616
+
617
+
618
+ # Create Gradio interface
619
  def create_interface():
620
+ with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Monochrome()) as app:
 
 
621
  gr.Markdown("# 🎤 Real-time Speech Recognition with Speaker Diarization")
622
+ gr.Markdown("This app performs real-time speech recognition with automatic speaker identification and color-coding.")
623
 
624
  with gr.Row():
625
  with gr.Column(scale=2):
626
+ # FastRTC Audio Component
627
+ fastrtc_html = gr.HTML("""
628
+ <div class="fastrtc-container" style="margin-bottom: 20px;">
629
+ <h3>🎙️ FastRTC Audio Input</h3>
630
+ <p>Click the button below to start the audio stream:</p>
631
+ <button id="start-fastrtc" style="background: #3498db; color: white; padding: 10px 20px; border: none; border-radius: 5px; cursor: pointer;">
632
+ Start FastRTC Audio
633
+ </button>
634
+ <div id="fastrtc-status" style="margin-top: 10px; font-style: italic;">Not connected</div>
635
+ <script>
636
+ document.getElementById('start-fastrtc').addEventListener('click', function() {
637
+ document.getElementById('fastrtc-status').textContent = 'Connecting...';
638
+ // FastRTC will be initialized here by the middleware
639
+ });
640
+ </script>
641
+ </div>
642
+ """)
643
+
644
  # Main conversation display
645
  conversation_output = gr.HTML(
646
+ value="<i>Click 'Initialize System' to start...</i>",
647
  label="Live Conversation"
648
  )
649
 
 
 
 
 
 
 
650
  # Control buttons
651
  with gr.Row():
652
  init_btn = gr.Button("🔧 Initialize System", variant="secondary")
653
+ start_btn = gr.Button("🎙️ Start Recording", variant="primary", interactive=False)
654
+ stop_btn = gr.Button("⏹️ Stop Recording", variant="stop", interactive=False)
655
  clear_btn = gr.Button("🗑️ Clear Conversation", interactive=False)
656
 
657
  # Status display
 
689
  gr.Markdown("## 📝 Instructions")
690
  gr.Markdown("""
691
  1. Click **Initialize System** to load models
692
+ 2. Click **Start Recording** to begin processing
693
+ 3. Click **Start FastRTC Audio** to connect your microphone
694
+ 4. Allow microphone access when prompted
695
+ 5. Speak into your microphone
696
+ 6. Watch real-time transcription with speaker labels
697
+ 7. Adjust settings as needed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
698
  """)
699
 
700
  # Speaker color legend
 
704
  color_info.append(f'<span style="color:{color};">■</span> Speaker {i+1} ({name})')
705
 
706
  gr.HTML("<br>".join(color_info[:DEFAULT_MAX_SPEAKERS]))
707
+
708
+ # FastRTC Integration Notice
709
+ gr.Markdown("""
710
+ ## ℹ️ About FastRTC
711
+ This app uses FastRTC for low-latency audio streaming.
712
+ For optimal performance, use a modern browser and allow microphone access when prompted.
713
+ """)
714
 
715
  # Auto-refresh conversation and status
716
  def refresh_display():
717
+ return diarization_system.get_formatted_conversation(), diarization_system.get_status_info()
718
 
719
  # Event handlers
720
  def on_initialize():
 
724
  result,
725
  gr.update(interactive=True), # start_btn
726
  gr.update(interactive=True), # clear_btn
727
+ get_conversation(),
728
  get_status()
729
  )
730
  else:
 
732
  result,
733
  gr.update(interactive=False), # start_btn
734
  gr.update(interactive=False), # clear_btn
735
+ get_conversation(),
736
  get_status()
737
  )
738
 
739
+ def on_start():
740
+ result = start_recording()
741
  return (
742
  result,
743
  gr.update(interactive=False), # start_btn
744
  gr.update(interactive=True), # stop_btn
745
  )
746
 
747
+ def on_stop():
748
+ result = stop_recording()
749
  return (
750
  result,
751
  gr.update(interactive=True), # start_btn
752
  gr.update(interactive=False), # stop_btn
753
  )
754
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
755
  # Connect event handlers
756
  init_btn.click(
757
  on_initialize,
 
759
  )
760
 
761
  start_btn.click(
762
+ on_start,
763
  outputs=[status_output, start_btn, stop_btn]
764
  )
765
 
766
  stop_btn.click(
767
+ on_stop,
768
  outputs=[status_output, start_btn, stop_btn]
769
  )
770
 
 
779
  outputs=[status_output]
780
  )
781
 
782
+ # Auto-refresh every 2 seconds when recording
783
  refresh_timer = gr.Timer(2.0)
784
  refresh_timer.tick(
785
  refresh_display,
 
789
  return app
790
 
791
 
792
async def main():
    """Build the FastRTC stream and the Gradio app, mount one on the other, and launch.

    NOTE(review): nothing is awaited in this coroutine and ``launch()`` is
    called from inside the running event loop; confirm this is intended.
    """
    # Setup FastRTC stream
    rtc_stream = setup_fastrtc_handler()

    # Create Gradio app
    demo = create_interface()

    # Mount FastRTC stream to the Gradio app
    # NOTE(review): verify that the stream's mount() accepts the object
    # returned by create_interface().
    rtc_stream.mount(demo)

    # Launch the app
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    demo.launch(**launch_options)
808
+
809
+
810
if __name__ == "__main__":
    # Script entry point: main() is a coroutine function, so it is driven with asyncio.run().
    asyncio.run(main())