Saiyaswanth007 committed
Commit 57c1aba · 1 Parent(s): ff9d9e6

Revert portg

Files changed (1):
  1. app.py +176 -121
app.py CHANGED
@@ -291,6 +291,7 @@ class RealtimeSpeakerDiarization:
         self.change_threshold = DEFAULT_CHANGE_THRESHOLD
         self.max_speakers = DEFAULT_MAX_SPEAKERS
         self.current_conversation = ""
+        self.audio_buffer = []
 
     def initialize_models(self):
         """Initialize the speaker encoder model"""
@@ -389,7 +390,7 @@ class RealtimeSpeakerDiarization:
             return "Please initialize models first!"
 
         try:
-            # Setup recorder configuration for WebRTC input
+            # Setup recorder configuration for manual audio input
             recorder_config = {
                 'spinner': False,
                 'use_microphone': False,  # We'll feed audio manually
@@ -530,30 +531,37 @@ class RealtimeSpeakerDiarization:
         except Exception as e:
             return f"Error getting status: {e}"
 
-    def process_audio(self, audio_data):
-        """Process audio data from FastRTC"""
+    def feed_audio_data(self, audio_data):
+        """Feed audio data to the recorder"""
         if not self.is_running or not self.recorder:
             return
 
         try:
-            # Extract audio data from FastRTC format (sample_rate, numpy_array)
-            sample_rate, audio_array = audio_data
-
-            # Convert to int16 format
-            if audio_array.dtype != np.int16:
-                audio_array = (audio_array * 32767).astype(np.int16)
+            # Ensure audio is in the correct format (16-bit PCM)
+            if isinstance(audio_data, np.ndarray):
+                if audio_data.dtype != np.int16:
+                    # Convert float to int16
+                    if audio_data.dtype == np.float32 or audio_data.dtype == np.float64:
+                        audio_data = (audio_data * 32767).astype(np.int16)
+                    else:
+                        audio_data = audio_data.astype(np.int16)
+
+                # Convert to bytes
+                audio_bytes = audio_data.tobytes()
+            else:
+                audio_bytes = audio_data
 
-            # Convert to bytes and feed to recorder
-            audio_bytes = audio_array.tobytes()
+            # Feed to recorder
             self.recorder.feed_audio(audio_bytes)
+
         except Exception as e:
-            print(f"Error processing FastRTC audio: {e}")
+            print(f"Error feeding audio data: {e}")
 
 
 # FastRTC Audio Handler
 class DiarizationHandler(AsyncStreamHandler):
     def __init__(self, diarization_system):
-        super().__init__()
+        super().__init__(reply_on_pause=ReplyOnPause.NEVER)
         self.diarization_system = diarization_system
 
     def copy(self):
@@ -564,10 +572,21 @@ class DiarizationHandler(AsyncStreamHandler):
         """Not used in this implementation"""
         return None
 
-    async def receive(self, data):
+    async def receive(self, frame):
         """Receive audio data from FastRTC and process it"""
-        if self.diarization_system.is_running:
-            self.diarization_system.process_audio(data)
+        try:
+            if self.diarization_system.is_running:
+                # Frame should be a numpy array of audio data
+                if hasattr(frame, 'data'):
+                    audio_data = frame.data
+                else:
+                    audio_data = frame
+
+                # Feed audio data to the diarization system
+                self.diarization_system.feed_audio_data(audio_data)
+
+        except Exception as e:
+            print(f"Error in FastRTC handler: {e}")
 
 
 # Global instance
@@ -613,61 +632,6 @@ def get_status():
     return diarization_system.get_status_info()
 
 
-# Get Cloudflare TURN credentials for FastRTC
-async def get_cloudflare_credentials():
-    # Check if HF_TOKEN is set in environment
-    hf_token = os.environ.get("HF_TOKEN")
-
-    # If not set, use a default Hugging Face token if available
-    if not hf_token:
-        # Log a warning that user should set their own token
-        print("Warning: HF_TOKEN environment variable not set. Please set your own Hugging Face token.")
-        # Try to use the Hugging Face token from the environment
-        from huggingface_hub import HfApi
-        try:
-            api = HfApi()
-            hf_token = api.token
-            if not hf_token:
-                print("Error: No Hugging Face token available. TURN relay may not work properly.")
-        except:
-            print("Error: Failed to get Hugging Face token. TURN relay may not work properly.")
-
-    # Get Cloudflare TURN credentials using the Hugging Face token
-    if hf_token:
-        try:
-            return await get_cloudflare_turn_credentials_async(hf_token=hf_token)
-        except Exception as e:
-            print(f"Error getting Cloudflare TURN credentials: {e}")
-
-    # Fallback to a default configuration that may not work
-    return {
-        "iceServers": [
-            {
-                "urls": "stun:stun.l.google.com:19302"
-            }
-        ]
-    }
-
-
-# Setup FastRTC stream handler with TURN server configuration
-def setup_fastrtc_handler():
-    """Set up FastRTC audio stream handler with TURN server configuration"""
-    handler = DiarizationHandler(diarization_system)
-
-    # Get server-side credentials (longer TTL)
-    server_credentials = get_cloudflare_turn_credentials(ttl=360000)
-
-    stream = Stream(
-        handler=handler,
-        modality="audio",
-        mode="receive",
-        rtc_configuration=get_cloudflare_credentials,  # Async function for client-side credentials
-        server_rtc_configuration=server_credentials  # Server-side credentials with longer TTL
-    )
-
-    return stream
-
-
 # Create Gradio interface
 def create_interface():
     with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Monochrome()) as interface:
@@ -676,32 +640,6 @@ def create_interface():
 
         with gr.Row():
             with gr.Column(scale=2):
-                # FastRTC Audio Component
-                fastrtc_html = gr.HTML("""
-                <div class="fastrtc-container" style="margin-bottom: 20px;">
-                    <h3>🎙️ FastRTC Audio Input</h3>
-                    <p>Click the button below to start the audio stream:</p>
-                    <button id="start-fastrtc" style="background: #3498db; color: white; padding: 10px 20px; border: none; border-radius: 5px; cursor: pointer;">
-                        Start FastRTC Audio
-                    </button>
-                    <div id="fastrtc-status" style="margin-top: 10px; font-style: italic;">Not connected</div>
-                    <script>
-                        document.getElementById('start-fastrtc').addEventListener('click', function() {
-                            document.getElementById('fastrtc-status').textContent = 'Connecting...';
-                            // FastRTC will initialize the connection
-                            fetch('/start-rtc', { method: 'POST' })
-                                .then(response => response.text())
-                                .then(data => {
-                                    document.getElementById('fastrtc-status').textContent = 'Connected! Speak now...';
-                                })
-                                .catch(error => {
-                                    document.getElementById('fastrtc-status').textContent = 'Connection error: ' + error;
-                                });
-                        });
-                    </script>
-                </div>
-                """)
-
                 # Main conversation display
                 conversation_output = gr.HTML(
                     value="<i>Click 'Initialize System' to start...</i>",
@@ -751,7 +689,7 @@ def create_interface():
                 gr.Markdown("""
                 1. Click **Initialize System** to load models
                 2. Click **Start Recording** to begin processing
-                3. Click **Start FastRTC Audio** to connect your microphone
+                3. Use the FastRTC interface below to connect your microphone
                 4. Allow microphone access when prompted
                 5. Speak into your microphone
                 6. Watch real-time transcription with speaker labels
@@ -772,13 +710,6 @@ def create_interface():
                 This app uses FastRTC for low-latency audio streaming.
                 For optimal performance, use a modern browser and allow microphone access when prompted.
                 """)
-
-                # Hugging Face Token Information
-                gr.Markdown("""
-                ## 🔑 Hugging Face Token
-                This app uses Cloudflare TURN server via Hugging Face integration.
-                If audio connection fails, set your HF_TOKEN environment variable in the Space settings.
-                """)
 
         # Auto-refresh conversation and status
        def refresh_display():
@@ -857,25 +788,149 @@ def create_interface():
     return interface
 
 
-# 1) Create FastAPI app
-app = FastAPI()
+# Main application setup
+def create_app():
+    """Create and configure the FastAPI app with Gradio and FastRTC"""
+    # Create FastAPI app
+    app = FastAPI(
+        title="Real-time Speaker Diarization",
+        description="Real-time speech recognition with speaker diarization using FastRTC",
+        version="1.0.0"
+    )
+
+    # Create Gradio interface
+    gradio_interface = create_interface()
+
+    # Mount Gradio interface
+    app = gr.mount_gradio_app(app, gradio_interface, path="/")
+
+    # Setup FastRTC stream
+    try:
+        # Create the handler
+        handler = DiarizationHandler(diarization_system)
+
+        # Get TURN credentials
+        hf_token = os.environ.get("HF_TOKEN")
+        if not hf_token:
+            print("Warning: HF_TOKEN not set. Audio streaming may not work properly.")
+            # Use basic STUN server as fallback
+            rtc_config = {
+                "iceServers": [{"urls": "stun:stun.l.google.com:19302"}]
+            }
+        else:
+            # Get Cloudflare TURN credentials
+            turn_credentials = get_cloudflare_turn_credentials(hf_token)
+            rtc_config = {
+                "iceServers": [
+                    {"urls": "stun:stun.l.google.com:19302"},
+                    {
+                        "urls": f"turn:{turn_credentials['urls'][0]}",
+                        "username": turn_credentials["username"],
+                        "credential": turn_credentials["credential"]
+                    }
+                ]
+            }
+
+        # Create FastRTC stream
+        stream = Stream(
+            handler=handler,
+            rtc_config=rtc_config,
+            audio_sample_rate=SAMPLE_RATE,
+            audio_channels=CHANNELS
+        )
+
+        # Add FastRTC endpoints
+        app.mount("/stream", stream.app)
+
+        print("FastRTC stream configured successfully!")
+
+    except Exception as e:
+        print(f"Warning: Failed to setup FastRTC stream: {e}")
+        print("Audio streaming will not be available.")
+
+    return app
+
+
+# Health check endpoint
+@app.get("/health")
+async def health_check():
+    """Health check endpoint"""
+    return {
+        "status": "healthy",
+        "timestamp": time.time(),
+        "system_initialized": diarization_system.encoder is not None,
+        "recording_active": diarization_system.is_running
+    }
 
-# 2) Create Gradio interface
-gradio_interface = create_interface()
 
-# 3) Mount Gradio onto FastAPI at root
-app = gr.mount_gradio_app(app, gradio_interface, path="/")
+# API endpoint to get conversation
+@app.get("/api/conversation")
+async def get_conversation_api():
+    """API endpoint to get current conversation"""
+    return {
+        "conversation": diarization_system.get_formatted_conversation(),
+        "status": diarization_system.get_status_info(),
+        "is_recording": diarization_system.is_running
+    }
+
 
-# 4) Initialize and mount FastRTC stream on the same app
-rtc_stream = setup_fastrtc_handler()
-rtc_stream.mount(app)
+# API endpoint to control recording
+@app.post("/api/control/{action}")
+async def control_recording(action: str):
+    """API endpoint to control recording (start/stop/clear)"""
+    if action == "start":
+        result = diarization_system.start_recording()
+    elif action == "stop":
+        result = diarization_system.stop_recording()
+    elif action == "clear":
+        result = diarization_system.clear_conversation()
+    elif action == "initialize":
+        result = initialize_system()
+    else:
+        return {"error": "Invalid action. Use: start, stop, clear, or initialize"}
+
+    return {"result": result, "is_recording": diarization_system.is_running}
 
-# 5) Expose an endpoint to trigger the client-side RTC handshake
-@app.post("/start-rtc")
-async def start_rtc():
-    await rtc_stream.start_client()
-    return {"status": "success"}
 
-# 6) Local dev via uvicorn; HF Spaces will auto-detect 'app' and ignore this
+# Main entry point
 if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=7860)
+    # Create the app
+    app = create_app()
+
+    # Configuration
+    host = os.environ.get("HOST", "0.0.0.0")
+    port = int(os.environ.get("PORT", 7860))
+
+    print(f"""
+    🎤 Real-time Speaker Diarization Server
+    =====================================
+
+    Starting server on: http://{host}:{port}
+
+    Features:
+    - Real-time speech recognition
+    - Speaker diarization with color coding
+    - FastRTC low-latency audio streaming
+    - Web interface for easy interaction
+
+    Make sure to:
+    1. Set HF_TOKEN environment variable for TURN server access
+    2. Allow microphone access in your browser
+    3. Use a modern browser for best performance
+
+    API Endpoints:
+    - GET /health - Health check
+    - GET /api/conversation - Get current conversation
+    - POST /api/control/{action} - Control recording (start/stop/clear/initialize)
+    - WS /stream - FastRTC audio stream endpoint
+
+    """)
+
+    # Run the server
+    uvicorn.run(
+        app,
+        host=host,
+        port=port,
+        log_level="info",
+        access_log=True
    )