Saiyaswanth007 committed on
Commit
691302d
·
1 Parent(s): 7177b58

Revert portg

Browse files
Files changed (1) hide show
  1. app.py +347 -231
app.py CHANGED
@@ -560,106 +560,177 @@ class RealtimeSpeakerDiarization:
560
 
561
 
562
  # FastRTC Audio Handler
563
- class DiarizationHandler(AsyncStreamHandler):
 
 
 
 
 
 
 
 
 
 
 
 
564
  def __init__(self, diarization_system):
565
- super().__init__()
566
  self.diarization_system = diarization_system
 
 
567
 
568
  def copy(self):
569
  # Return a fresh handler for each new stream connection
570
  return DiarizationHandler(self.diarization_system)
571
 
572
- async def emit(self):
573
- """Not used in this implementation"""
574
- return None
575
-
576
- async def receive(self, frame):
577
- """Receive audio data from FastRTC and process it"""
578
  try:
579
- if self.diarization_system.is_running:
580
- # Frame should be a numpy array of audio data
581
- if hasattr(frame, 'data'):
582
- audio_data = frame.data
 
 
 
583
  else:
584
- audio_data = frame
 
 
 
 
585
 
586
- # Feed audio data to the diarization system
587
- self.diarization_system.feed_audio_data(audio_data)
 
588
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
589
  except Exception as e:
590
- print(f"Error in FastRTC handler: {e}")
591
 
592
 
593
  # Global instance
594
  diarization_system = RealtimeSpeakerDiarization()
 
595
 
596
 
597
  def initialize_system():
598
  """Initialize the diarization system"""
599
- success = diarization_system.initialize_models()
600
- if success:
601
- return "✅ System initialized successfully! Models loaded."
602
- else:
603
- return "❌ Failed to initialize system. Please check the logs."
 
 
 
 
 
604
 
605
 
606
  def start_recording():
607
  """Start recording and transcription"""
608
- return diarization_system.start_recording()
 
 
 
 
609
 
610
 
611
  def stop_recording():
612
  """Stop recording and transcription"""
613
- return diarization_system.stop_recording()
 
 
 
 
614
 
615
 
616
  def clear_conversation():
617
  """Clear the conversation"""
618
- return diarization_system.clear_conversation()
 
 
 
 
619
 
620
 
621
  def update_settings(threshold, max_speakers):
622
  """Update system settings"""
623
- return diarization_system.update_settings(threshold, max_speakers)
 
 
 
 
624
 
625
 
626
  def get_conversation():
627
  """Get the current conversation"""
628
- return diarization_system.get_formatted_conversation()
 
 
 
629
 
630
 
631
  def get_status():
632
  """Get system status"""
633
- return diarization_system.get_status_info()
 
 
 
634
 
635
 
636
  # Create Gradio interface
637
  def create_interface():
638
- with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Monochrome()) as interface:
639
  gr.Markdown("# 🎤 Real-time Speech Recognition with Speaker Diarization")
640
- gr.Markdown("This app performs real-time speech recognition with automatic speaker identification and color-coding.")
641
 
642
  with gr.Row():
643
  with gr.Column(scale=2):
644
  # Main conversation display
645
  conversation_output = gr.HTML(
646
- value="<i>Click 'Initialize System' to start...</i>",
647
- label="Live Conversation"
 
648
  )
649
 
650
  # Control buttons
651
  with gr.Row():
652
- init_btn = gr.Button("🔧 Initialize System", variant="secondary")
653
- start_btn = gr.Button("🎙️ Start Recording", variant="primary", interactive=False)
654
- stop_btn = gr.Button("⏹️ Stop Recording", variant="stop", interactive=False)
655
- clear_btn = gr.Button("🗑️ Clear Conversation", interactive=False)
 
 
 
 
 
 
 
656
 
657
  # Status display
658
  status_output = gr.Textbox(
659
  label="System Status",
660
- value="System not initialized",
661
- lines=8,
662
- interactive=False
 
663
  )
664
 
665
  with gr.Column(scale=1):
@@ -670,276 +741,321 @@ def create_interface():
670
  minimum=0.1,
671
  maximum=0.95,
672
  step=0.05,
673
- value=DEFAULT_CHANGE_THRESHOLD,
674
  label="Speaker Change Sensitivity",
675
- info="Lower values = more sensitive to speaker changes"
676
  )
677
 
678
  max_speakers_slider = gr.Slider(
679
  minimum=2,
680
- maximum=ABSOLUTE_MAX_SPEAKERS,
681
  step=1,
682
- value=DEFAULT_MAX_SPEAKERS,
683
  label="Maximum Number of Speakers"
684
  )
685
 
686
- update_settings_btn = gr.Button("Update Settings")
 
 
 
 
 
 
 
 
 
 
687
 
688
  # Instructions
689
- gr.Markdown("## 📝 Instructions")
690
  gr.Markdown("""
691
- 1. Click **Initialize System** to load models
692
- 2. Click **Start Recording** to begin processing
693
- 3. Use the FastRTC interface below to connect your microphone
694
- 4. Allow microphone access when prompted
695
- 5. Speak into your microphone
696
- 6. Watch real-time transcription with speaker labels
697
- 7. Adjust settings as needed
698
  """)
699
 
700
  # Speaker color legend
701
  gr.Markdown("## 🎨 Speaker Colors")
702
- color_info = []
703
- for i, (color, name) in enumerate(zip(SPEAKER_COLORS, SPEAKER_COLOR_NAMES)):
704
- color_info.append(f'<span style="color:{color};">■</span> Speaker {i+1} ({name})')
 
 
 
 
 
 
 
705
 
706
- gr.HTML("<br>".join(color_info[:DEFAULT_MAX_SPEAKERS]))
 
 
707
 
708
- # FastRTC Integration Notice
709
- gr.Markdown("""
710
- ## ℹ️ About FastRTC
711
- This app uses FastRTC for low-latency audio streaming.
712
- For optimal performance, use a modern browser and allow microphone access when prompted.
713
- """)
714
 
715
  # Auto-refresh conversation and status
716
  def refresh_display():
717
- return diarization_system.get_formatted_conversation(), diarization_system.get_status_info()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
718
 
719
  # Event handlers
720
  def on_initialize():
721
- result = initialize_system()
722
- if "successfully" in result:
 
 
 
 
723
  return (
724
- result,
725
- gr.update(interactive=True), # start_btn
726
- gr.update(interactive=True), # clear_btn
727
- get_conversation(),
728
- get_status()
729
  )
730
- else:
 
731
  return (
732
- result,
733
- gr.update(interactive=False), # start_btn
734
- gr.update(interactive=False), # clear_btn
735
- get_conversation(),
736
- get_status()
737
  )
738
 
739
  def on_start():
740
- result = start_recording()
741
- return (
742
- result,
743
- gr.update(interactive=False), # start_btn
744
- gr.update(interactive=True), # stop_btn
745
- )
 
 
 
 
 
 
 
 
 
 
 
 
746
 
747
  def on_stop():
748
- result = stop_recording()
749
- return (
750
- result,
751
- gr.update(interactive=True), # start_btn
752
- gr.update(interactive=False), # stop_btn
753
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
754
 
755
  # Connect event handlers
756
  init_btn.click(
757
  on_initialize,
758
- outputs=[status_output, start_btn, clear_btn, conversation_output, status_output]
759
  )
760
 
761
  start_btn.click(
762
  on_start,
763
- outputs=[status_output, start_btn, stop_btn]
764
  )
765
 
766
  stop_btn.click(
767
  on_stop,
768
- outputs=[status_output, start_btn, stop_btn]
769
  )
770
 
771
  clear_btn.click(
772
- clear_conversation,
773
- outputs=[status_output]
774
  )
775
 
776
  update_settings_btn.click(
777
- update_settings,
778
  inputs=[threshold_slider, max_speakers_slider],
779
  outputs=[status_output]
780
  )
781
 
782
- # Auto-refresh every 2 seconds when recording
783
  refresh_timer = gr.Timer(2.0)
784
  refresh_timer.tick(
785
  refresh_display,
786
- outputs=[conversation_output, status_output]
787
  )
788
 
789
  return interface
790
 
791
 
792
- # Create API router for endpoints
793
- router = APIRouter()
794
-
795
- # Health check endpoint
796
- @router.get("/health")
797
- async def health_check():
798
- """Health check endpoint"""
799
- return {
800
- "status": "healthy",
801
- "timestamp": time.time(),
802
- "system_initialized": diarization_system.encoder is not None,
803
- "recording_active": diarization_system.is_running
804
- }
805
-
806
-
807
- # API endpoint to get conversation
808
- @router.get("/api/conversation")
809
- async def get_conversation_api():
810
- """API endpoint to get current conversation"""
811
- return {
812
- "conversation": diarization_system.get_formatted_conversation(),
813
- "status": diarization_system.get_status_info(),
814
- "is_recording": diarization_system.is_running
815
- }
816
-
817
-
818
- # API endpoint to control recording
819
- @router.post("/api/control/{action}")
820
- async def control_recording(action: str):
821
- """API endpoint to control recording (start/stop/clear/initialize)"""
822
- if action == "start":
823
- result = diarization_system.start_recording()
824
- elif action == "stop":
825
- result = diarization_system.stop_recording()
826
- elif action == "clear":
827
- result = diarization_system.clear_conversation()
828
- elif action == "initialize":
829
- result = initialize_system()
830
- else:
831
- return {"error": "Invalid action. Use: start, stop, clear, or initialize"}
832
-
833
- return {"result": result, "is_recording": diarization_system.is_running}
834
-
835
-
836
- # Main application setup
837
- def create_app():
838
- """Create and configure the FastAPI app with Gradio and FastRTC"""
839
- # Create FastAPI app
840
  app = FastAPI(
841
  title="Real-time Speaker Diarization",
842
  description="Real-time speech recognition with speaker diarization using FastRTC",
843
  version="1.0.0"
844
  )
845
 
846
- # Include API routes
847
- app.include_router(router)
848
 
849
- # Create Gradio interface
850
- gradio_interface = create_interface()
 
 
 
 
 
 
 
851
 
852
- # Mount Gradio interface
853
- app = gr.mount_gradio_app(app, gradio_interface, path="/")
 
 
 
 
 
 
 
 
 
 
854
 
855
- # Setup FastRTC stream
856
- try:
857
- # Create the handler
858
- handler = DiarizationHandler(diarization_system)
859
-
860
- # Get TURN credentials
861
- hf_token = os.environ.get("HF_TOKEN")
862
- if not hf_token:
863
- print("Warning: HF_TOKEN not set. Audio streaming may not work properly.")
864
- # Use basic STUN server as fallback
865
- rtc_config = {
866
- "iceServers": [{"urls": "stun:stun.l.google.com:19302"}]
 
 
 
 
 
 
 
867
  }
868
- else:
869
- # Get Cloudflare TURN credentials
870
- try:
871
- turn_credentials = get_cloudflare_turn_credentials(hf_token)
872
-
873
- # Safely extract credentials from the response
874
- ice_servers = []
875
-
876
- # Always add STUN server
877
- ice_servers.append({"urls": "stun:stun.l.google.com:19302"})
878
-
879
- # Check for and add TURN server if available
880
- if turn_credentials and isinstance(turn_credentials, dict):
881
- # Handle different possible structures
882
- if 'iceServers' in turn_credentials:
883
- # If credentials already have iceServers, use them directly
884
- rtc_config = turn_credentials
885
- elif 'urls' in turn_credentials and isinstance(turn_credentials['urls'], list) and turn_credentials['urls']:
886
- # Structure: {urls: [...], username: "...", credential: "..."}
887
- ice_servers.append({
888
- "urls": [f"turn:{url}" for url in turn_credentials["urls"]],
889
- "username": turn_credentials.get("username", ""),
890
- "credential": turn_credentials.get("credential", "")
891
- })
892
- rtc_config = {"iceServers": ice_servers}
893
- elif 'url' in turn_credentials:
894
- # Structure with single URL
895
- ice_servers.append({
896
- "urls": f"turn:{turn_credentials['url']}",
897
- "username": turn_credentials.get("username", ""),
898
- "credential": turn_credentials.get("credential", "")
899
- })
900
- rtc_config = {"iceServers": ice_servers}
901
- else:
902
- print("Warning: Unexpected TURN credentials format. Using STUN only.")
903
- rtc_config = {"iceServers": ice_servers}
904
- else:
905
- print("Warning: Could not get TURN credentials. Using STUN only.")
906
- rtc_config = {"iceServers": ice_servers}
907
- except Exception as e:
908
- print(f"Warning: Error getting TURN credentials: {e}. Using STUN only.")
909
- rtc_config = {
910
- "iceServers": [{"urls": "stun:stun.l.google.com:19302"}]
911
- }
912
-
913
- # Create FastRTC stream
914
- stream = Stream(
915
- handler=handler,
916
- rtc_configuration=rtc_config,
917
- modality="audio",
918
- mode="send-receive"
919
- )
920
-
921
- # Add FastRTC endpoints
922
- app.mount("/stream", stream)
923
-
924
- print("FastRTC stream configured successfully!")
925
 
926
- except Exception as e:
927
- print(f"Warning: Failed to setup FastRTC stream: {e}")
928
- print("Audio streaming will not be available.")
 
 
 
 
 
 
 
 
 
 
 
929
 
 
930
  return app
931
 
932
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
933
 
934
 
935
- # Main entry point
936
  if __name__ == "__main__":
937
- # Create the app
938
- app = create_app()
939
- interface = create_interface()
940
- # Simple launch - HF Spaces will handle host/port automatically
941
- interface.launch(
942
- share=False, # Not needed in HF Spaces
943
- server_name="0.0.0.0", # Required for HF Spaces
944
- # Don't specify server_port - let HF Spaces handle it
945
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
560
 
561
 
562
  # FastRTC Audio Handler
563
+ # FastRTC Audio Handler for Real-time Diarization
564
+ import asyncio
565
+ import numpy as np
566
+ from fastrtc import FastRTCClient, AudioFrame
567
+ from fastapi import FastAPI, APIRouter
568
+ import gradio as gr
569
+ import time
570
+ import os
571
+ import threading
572
+ from queue import Queue
573
+ import json
574
+
class DiarizationHandler:
    """Forward streamed audio frames to the diarization system.

    Each incoming frame is converted to a mono ``float32`` NumPy array and
    passed, together with its sample rate, to
    ``diarization_system.feed_audio_data`` on the event loop's default
    executor so the coroutine itself does not block.
    """

    def __init__(self, diarization_system):
        """Store the diarization system that will receive the audio."""
        self.diarization_system = diarization_system
        # Neither attribute is read inside this class; both are kept because
        # they are part of the instance's public state.
        self.audio_queue = Queue()
        self.is_processing = False

    def copy(self):
        """Return a fresh handler for each new stream connection."""
        return DiarizationHandler(self.diarization_system)

    @staticmethod
    def _frame_to_mono_float32(frame):
        """Convert ``frame`` to a 1-D ``float32`` array.

        Raw bytes are interpreted as 16-bit PCM. int16 samples (from bytes or
        from an array payload) are scaled by 1/32768; other dtypes are only
        cast to ``float32``.
        """
        payload = frame.data
        if isinstance(payload, (bytes, bytearray)):
            # Ignore a trailing odd byte instead of letting frombuffer raise
            # and the whole frame be dropped.
            audio = np.frombuffer(payload, dtype=np.int16, count=len(payload) // 2)
        elif hasattr(frame, "to_ndarray"):
            audio = frame.to_ndarray()
        else:
            # Keep the payload's own dtype so int16 arrays are still
            # recognised (and scaled) below.
            audio = np.asarray(payload)

        if audio.dtype == np.int16:
            audio = audio.astype(np.float32) / 32768.0
        else:
            audio = audio.astype(np.float32, copy=False)

        if audio.size == 0:
            return audio.reshape(0)

        if audio.ndim == 2:
            # Average across whichever axis is shorter: that is the channel
            # axis for both (samples, channels) and (channels, samples)
            # layouts. Ties keep the original axis=1 behaviour.
            n_rows, n_cols = audio.shape
            audio = audio.mean(axis=0 if n_rows < n_cols else 1)
        elif audio.ndim > 2:
            audio = np.mean(audio, axis=1)
        return audio

    async def on_audio_frame(self, frame: "AudioFrame"):
        """Handle one incoming audio frame.

        The frame is ignored unless the diarization system is running and the
        frame carries data. Any exception is reported with ``print`` and not
        re-raised, so one bad frame does not end the stream.
        """
        try:
            if not self.diarization_system.is_running or frame.data is None:
                return
            audio_data = self._frame_to_mono_float32(frame)
            # Fall back to the same default process_audio_async uses.
            sample_rate = getattr(frame, "sample_rate", 16000)
            await self.process_audio_async(audio_data, sample_rate)
        except Exception as e:
            print(f"Error processing audio frame: {e}")

    async def process_audio_async(self, audio_data, sample_rate=16000):
        """Run ``feed_audio_data(audio_data, sample_rate)`` in the default executor.

        Exceptions are printed and swallowed.
        """
        try:
            # get_running_loop is the supported way to obtain the loop from
            # inside a coroutine.
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(
                None,
                self.diarization_system.feed_audio_data,
                audio_data,
                sample_rate,
            )
        except Exception as e:
            print(f"Error in async audio processing: {e}")
625
 
626
 
627
  # Global instance
628
  diarization_system = RealtimeSpeakerDiarization()
629
+ audio_handler = None
630
 
631
 
def initialize_system():
    """Load the diarization models and report the outcome as a status string.

    On success the module-level ``audio_handler`` is replaced with a new
    ``DiarizationHandler`` bound to ``diarization_system``. Exceptions are
    converted into an error string instead of propagating.
    """
    global audio_handler
    try:
        if diarization_system.initialize_models():
            audio_handler = DiarizationHandler(diarization_system)
            message = "✅ System initialized successfully! Models loaded and FastRTC handler ready."
        else:
            message = "❌ Failed to initialize system. Please check the logs."
    except Exception as exc:
        message = f"❌ Initialization error: {str(exc)}"
    return message
644
 
645
 
def start_recording():
    """Start recording via ``diarization_system`` and return a status string.

    Exceptions are reported in the returned string rather than raised.
    """
    try:
        message = f"🎙️ {diarization_system.start_recording()} - FastRTC audio streaming is active."
    except Exception as exc:
        message = f"❌ Failed to start recording: {str(exc)}"
    return message
653
 
654
 
def stop_recording():
    """Stop recording via ``diarization_system`` and return a status string.

    Exceptions are reported in the returned string rather than raised.
    """
    try:
        message = f"⏹️ {diarization_system.stop_recording()}"
    except Exception as exc:
        message = f"❌ Failed to stop recording: {str(exc)}"
    return message
662
 
663
 
def clear_conversation():
    """Clear the conversation via ``diarization_system`` and return a status string.

    Exceptions are reported in the returned string rather than raised.
    """
    try:
        message = f"🗑️ {diarization_system.clear_conversation()}"
    except Exception as exc:
        message = f"❌ Failed to clear conversation: {str(exc)}"
    return message
671
 
672
 
def update_settings(threshold, max_speakers):
    """Pass ``threshold`` and ``max_speakers`` to ``diarization_system.update_settings``.

    Returns the system's reply prefixed with a gear icon, or an error string
    if the call raises.
    """
    try:
        message = f"⚙️ {diarization_system.update_settings(threshold, max_speakers)}"
    except Exception as exc:
        message = f"❌ Failed to update settings: {str(exc)}"
    return message
680
 
681
 
def get_conversation():
    """Return the formatted conversation from ``diarization_system``.

    If the call raises, an italic HTML error snippet is returned instead.
    """
    try:
        rendered = diarization_system.get_formatted_conversation()
    except Exception as exc:
        rendered = f"<i>Error getting conversation: {str(exc)}</i>"
    return rendered
688
 
689
 
def get_status():
    """Return the status info from ``diarization_system``.

    If the call raises, a plain-text error message is returned instead.
    """
    try:
        report = diarization_system.get_status_info()
    except Exception as exc:
        report = f"Error getting status: {str(exc)}"
    return report
696
 
697
 
698
  # Create Gradio interface
699
  def create_interface():
700
+ with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Soft()) as interface:
701
  gr.Markdown("# 🎤 Real-time Speech Recognition with Speaker Diarization")
702
+ gr.Markdown("This app performs real-time speech recognition with automatic speaker identification using FastRTC for low-latency audio streaming.")
703
 
704
  with gr.Row():
705
  with gr.Column(scale=2):
706
  # Main conversation display
707
  conversation_output = gr.HTML(
708
+ value="<div style='padding: 20px; background: #f5f5f5; border-radius: 10px;'><i>Click 'Initialize System' to start...</i></div>",
709
+ label="Live Conversation",
710
+ elem_id="conversation_display"
711
  )
712
 
713
  # Control buttons
714
  with gr.Row():
715
+ init_btn = gr.Button("🔧 Initialize System", variant="secondary", size="lg")
716
+ start_btn = gr.Button("🎙️ Start Recording", variant="primary", size="lg", interactive=False)
717
+ stop_btn = gr.Button("⏹️ Stop Recording", variant="stop", size="lg", interactive=False)
718
+ clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="lg", interactive=False)
719
+
720
+ # Audio connection status
721
+ with gr.Row():
722
+ connection_status = gr.HTML(
723
+ value="<div style='padding: 10px; background: #fff3cd; border-radius: 5px;'>🔌 FastRTC: Not connected</div>",
724
+ label="Connection Status"
725
+ )
726
 
727
  # Status display
728
  status_output = gr.Textbox(
729
  label="System Status",
730
+ value="System not initialized. Please click 'Initialize System' to begin.",
731
+ lines=6,
732
+ interactive=False,
733
+ show_copy_button=True
734
  )
735
 
736
  with gr.Column(scale=1):
 
741
  minimum=0.1,
742
  maximum=0.95,
743
  step=0.05,
744
+ value=0.5, # DEFAULT_CHANGE_THRESHOLD
745
  label="Speaker Change Sensitivity",
746
+ info="Lower = more sensitive to speaker changes"
747
  )
748
 
749
  max_speakers_slider = gr.Slider(
750
  minimum=2,
751
+ maximum=10, # ABSOLUTE_MAX_SPEAKERS
752
  step=1,
753
+ value=4, # DEFAULT_MAX_SPEAKERS
754
  label="Maximum Number of Speakers"
755
  )
756
 
757
+ update_settings_btn = gr.Button("Update Settings", variant="secondary")
758
+
759
+ # Audio settings
760
+ gr.Markdown("## 🔊 Audio Settings")
761
+ gr.Markdown("""
762
+ **Recommended settings:**
763
+ - Use a good quality microphone
764
+ - Ensure stable internet connection
765
+ - Speak clearly and avoid background noise
766
+ - Position microphone 6-12 inches from mouth
767
+ """)
768
 
769
  # Instructions
770
+ gr.Markdown("## 📝 How to Use")
771
  gr.Markdown("""
772
+ 1. **Initialize**: Click "Initialize System" to load AI models
773
+ 2. **Connect**: Allow microphone access when prompted
774
+ 3. **Start**: Click "Start Recording" to begin processing
775
+ 4. **Speak**: Talk into your microphone naturally
776
+ 5. **Monitor**: Watch real-time transcription with speaker labels
777
+ 6. **Adjust**: Fine-tune settings as needed
 
778
  """)
779
 
780
  # Speaker color legend
781
  gr.Markdown("## 🎨 Speaker Colors")
782
+ speaker_colors = [
783
+ ("#FF6B6B", "Red"),
784
+ ("#4ECDC4", "Teal"),
785
+ ("#45B7D1", "Blue"),
786
+ ("#96CEB4", "Green"),
787
+ ("#FFEAA7", "Yellow"),
788
+ ("#DDA0DD", "Plum"),
789
+ ("#98D8C8", "Mint"),
790
+ ("#F7DC6F", "Gold")
791
+ ]
792
 
793
+ color_html = ""
794
+ for i, (color, name) in enumerate(speaker_colors[:4]):
795
+ color_html += f'<div style="display: inline-block; margin: 5px;"><span style="color:{color}; font-size: 20px;">●</span> Speaker {i+1} ({name})</div><br>'
796
 
797
+ gr.HTML(color_html)
 
 
 
 
 
798
 
799
  # Auto-refresh conversation and status
800
  def refresh_display():
801
+ try:
802
+ conversation = get_conversation()
803
+ status = get_status()
804
+
805
+ # Update connection status based on system state
806
+ if diarization_system.is_running:
807
+ conn_status = "<div style='padding: 10px; background: #d4edda; border-radius: 5px;'>🟢 FastRTC: Connected & Recording</div>"
808
+ elif hasattr(diarization_system, 'encoder') and diarization_system.encoder is not None:
809
+ conn_status = "<div style='padding: 10px; background: #d1ecf1; border-radius: 5px;'>🔵 FastRTC: Ready to connect</div>"
810
+ else:
811
+ conn_status = "<div style='padding: 10px; background: #f8d7da; border-radius: 5px;'>🔴 FastRTC: System not initialized</div>"
812
+
813
+ return conversation, status, conn_status
814
+ except Exception as e:
815
+ error_msg = f"Error refreshing display: {str(e)}"
816
+ return f"<i>{error_msg}</i>", error_msg, "<div style='padding: 10px; background: #f8d7da; border-radius: 5px;'>❌ FastRTC: Error</div>"
817
 
818
  # Event handlers
819
  def on_initialize():
820
+ try:
821
+ result = initialize_system()
822
+ success = "successfully" in result.lower()
823
+
824
+ conversation, status, conn_status = refresh_display()
825
+
826
  return (
827
+ result, # status_output
828
+ gr.update(interactive=success), # start_btn
829
+ gr.update(interactive=success), # clear_btn
830
+ conversation, # conversation_output
831
+ conn_status # connection_status
832
  )
833
+ except Exception as e:
834
+ error_msg = f"❌ Initialization failed: {str(e)}"
835
  return (
836
+ error_msg,
837
+ gr.update(interactive=False),
838
+ gr.update(interactive=False),
839
+ "<i>System not ready</i>",
840
+ "<div style='padding: 10px; background: #f8d7da; border-radius: 5px;'>❌ FastRTC: Initialization failed</div>"
841
  )
842
 
843
  def on_start():
844
+ try:
845
+ result = start_recording()
846
+ conversation, status, conn_status = refresh_display()
847
+
848
+ return (
849
+ result, # status_output
850
+ gr.update(interactive=False), # start_btn
851
+ gr.update(interactive=True), # stop_btn
852
+ conn_status # connection_status
853
+ )
854
+ except Exception as e:
855
+ error_msg = f"❌ Failed to start: {str(e)}"
856
+ return (
857
+ error_msg,
858
+ gr.update(interactive=True),
859
+ gr.update(interactive=False),
860
+ "<div style='padding: 10px; background: #f8d7da; border-radius: 5px;'>❌ FastRTC: Start failed</div>"
861
+ )
862
 
863
  def on_stop():
864
+ try:
865
+ result = stop_recording()
866
+ conversation, status, conn_status = refresh_display()
867
+
868
+ return (
869
+ result, # status_output
870
+ gr.update(interactive=True), # start_btn
871
+ gr.update(interactive=False), # stop_btn
872
+ conn_status # connection_status
873
+ )
874
+ except Exception as e:
875
+ error_msg = f"❌ Failed to stop: {str(e)}"
876
+ return (
877
+ error_msg,
878
+ gr.update(interactive=False),
879
+ gr.update(interactive=True),
880
+ "<div style='padding: 10px; background: #f8d7da; border-radius: 5px;'>❌ FastRTC: Stop failed</div>"
881
+ )
882
+
883
+ def on_clear():
884
+ try:
885
+ result = clear_conversation()
886
+ conversation, status, conn_status = refresh_display()
887
+ return result, conversation
888
+ except Exception as e:
889
+ error_msg = f"❌ Failed to clear: {str(e)}"
890
+ return error_msg, "<i>Error clearing conversation</i>"
891
+
892
+ def on_update_settings(threshold, max_speakers):
893
+ try:
894
+ result = update_settings(threshold, max_speakers)
895
+ return result
896
+ except Exception as e:
897
+ return f"❌ Failed to update settings: {str(e)}"
898
 
899
  # Connect event handlers
900
  init_btn.click(
901
  on_initialize,
902
+ outputs=[status_output, start_btn, clear_btn, conversation_output, connection_status]
903
  )
904
 
905
  start_btn.click(
906
  on_start,
907
+ outputs=[status_output, start_btn, stop_btn, connection_status]
908
  )
909
 
910
  stop_btn.click(
911
  on_stop,
912
+ outputs=[status_output, start_btn, stop_btn, connection_status]
913
  )
914
 
915
  clear_btn.click(
916
+ on_clear,
917
+ outputs=[status_output, conversation_output]
918
  )
919
 
920
  update_settings_btn.click(
921
+ on_update_settings,
922
  inputs=[threshold_slider, max_speakers_slider],
923
  outputs=[status_output]
924
  )
925
 
926
+ # Auto-refresh every 2 seconds when active
927
  refresh_timer = gr.Timer(2.0)
928
  refresh_timer.tick(
929
  refresh_display,
930
+ outputs=[conversation_output, status_output, connection_status]
931
  )
932
 
933
  return interface
934
 
935
 
936
+ # FastAPI setup for HuggingFace Spaces
def create_fastapi_app():
    """Create the FastAPI app and register its HTTP and WebSocket routes.

    Routes:
        GET  /health                 -- liveness plus initialisation/recording flags
        GET  /api/conversation       -- current conversation and status
        POST /api/control/{action}   -- start, stop, clear or initialize
        WS   /ws/audio               -- binary audio frames fed to ``audio_handler``

    Returns:
        The configured ``FastAPI`` instance.
    """
    # Imported here because the block needs names beyond the FastAPI and
    # APIRouter it already uses.
    from fastapi import WebSocket, WebSocketDisconnect

    app = FastAPI(
        title="Real-time Speaker Diarization",
        description="Real-time speech recognition with speaker diarization using FastRTC",
        version="1.0.0"
    )

    router = APIRouter()

    def _is_recording():
        """Return ``diarization_system.is_running``, or False if it is absent."""
        return getattr(diarization_system, "is_running", False)

    @router.get("/health")
    async def health_check():
        """Health check endpoint"""
        return {
            "status": "healthy",
            "timestamp": time.time(),
            "system_initialized": getattr(diarization_system, "encoder", None) is not None,
            "recording_active": _is_recording(),
        }

    @router.get("/api/conversation")
    async def get_conversation_api():
        """Get current conversation"""
        try:
            return {
                "conversation": get_conversation(),
                "status": get_status(),
                "is_recording": _is_recording(),
                "timestamp": time.time(),
            }
        except Exception as e:
            return {"error": str(e), "timestamp": time.time()}

    @router.post("/api/control/{action}")
    async def control_recording(action: str):
        """Control recording actions"""
        try:
            handlers = {
                "start": start_recording,
                "stop": stop_recording,
                "clear": clear_conversation,
                "initialize": initialize_system,
            }
            handler = handlers.get(action)
            if handler is None:
                return {"error": "Invalid action. Use: start, stop, clear, or initialize"}

            return {
                "result": handler(),
                "is_recording": _is_recording(),
                "timestamp": time.time(),
            }
        except Exception as e:
            return {"error": str(e), "timestamp": time.time()}

    @router.websocket("/ws/audio")
    async def websocket_audio_endpoint(websocket: WebSocket):
        """WebSocket endpoint for FastRTC audio streaming"""
        # The WebSocket annotation is what makes FastAPI inject the connection
        # object; without it the parameter is not treated as the socket.
        await websocket.accept()

        try:
            while True:
                data = await websocket.receive_bytes()

                # Frames are dropped until the system is initialised and recording.
                if audio_handler and diarization_system.is_running:
                    frame = AudioFrame(data=data, sample_rate=16000)
                    await audio_handler.on_audio_frame(frame)
        except WebSocketDisconnect:
            # The peer already closed the socket; closing again would raise.
            return
        except Exception as e:
            print(f"WebSocket error: {e}")
            try:
                await websocket.close()
            except RuntimeError:
                # The connection was already closed underneath us.
                pass

    app.include_router(router)
    return app
1017
 
1018
 
1019
+ # Main application entry point
def create_app():
    """Build the FastAPI app with the Gradio interface mounted at ``/``.

    Returns:
        A ``(app, gradio_interface)`` tuple: the object returned by
        ``gr.mount_gradio_app`` and the Gradio interface itself.
    """
    api = create_fastapi_app()
    ui = create_interface()
    return gr.mount_gradio_app(api, ui, path="/"), ui
1033
 
1034
 
1035
+ # Entry point for HuggingFace Spaces
if __name__ == "__main__":
    try:
        # Build the FastAPI app (with Gradio mounted) and the Gradio interface.
        app, interface = create_app()

        # NOTE(review): only the Gradio interface is launched below; `app` is
        # assigned but never handed to a server in this block -- confirm the
        # FastAPI routes are meant to be reachable in this launch mode.
        primary_options = {
            "server_name": "0.0.0.0",
            "server_port": int(os.environ.get("PORT", 7860)),
            "share": False,
            "show_error": True,
            "quiet": False,
        }
        interface.launch(**primary_options)

    except Exception as launch_error:
        print(f"Failed to launch application: {launch_error}")
        # Second attempt: a freshly built Gradio interface on its own.
        try:
            interface = create_interface()
            fallback_options = {
                "server_name": "0.0.0.0",
                "server_port": int(os.environ.get("PORT", 7860)),
                "share": False,
            }
            interface.launch(**fallback_options)
        except Exception as fallback_error:
            print(f"Fallback launch also failed: {fallback_error}")