Saiyaswanth007 committed
Commit 88f78ff · 1 Parent(s): 7609dee

Updated code

Files changed (2):
  1. app.py +130 -286
  2. realtime_diarize.py +0 -581
app.py CHANGED
@@ -1,6 +1,5 @@
 import gradio as gr
 import numpy as np
-import soundcard as sc
 import queue
 import torch
 import time
@@ -9,8 +8,9 @@ import os
 import urllib.request
 import torchaudio
 from scipy.spatial.distance import cosine
-from RealtimeSTT import AudioToTextRecorder
 import json
+import io
+import wave

 # Simplified configuration parameters
 SILENCE_THRESHS = [0, 0.4]
@@ -33,7 +33,6 @@ ABSOLUTE_MAX_SPEAKERS = 10

 # Global variables
 FAST_SENTENCE_END = True
-USE_MICROPHONE = False
 SAMPLE_RATE = 16000
 BUFFER_SIZE = 512
 CHANNELS = 1
@@ -58,6 +57,9 @@ SPEAKER_COLOR_NAMES = [
 ]


+
+
+
 class SpeechBrainEncoder:
     """ECAPA-TDNN encoder from SpeechBrain for speaker embeddings"""
     def __init__(self, device="cpu"):
@@ -68,24 +70,11 @@ class SpeechBrainEncoder:
         self.cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "speechbrain")
         os.makedirs(self.cache_dir, exist_ok=True)

-    def _download_model(self):
-        """Download pre-trained SpeechBrain ECAPA-TDNN model if not present"""
-        model_url = "https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb/resolve/main/embedding_model.ckpt"
-        model_path = os.path.join(self.cache_dir, "embedding_model.ckpt")
-
-        if not os.path.exists(model_path):
-            print(f"Downloading ECAPA-TDNN model to {model_path}...")
-            urllib.request.urlretrieve(model_url, model_path)
-
-        return model_path
-
     def load_model(self):
         """Load the ECAPA-TDNN model"""
         try:
             from speechbrain.pretrained import EncoderClassifier

-            model_path = self._download_model()
-
             self.model = EncoderClassifier.from_hparams(
                 source="speechbrain/spkrec-ecapa-voxceleb",
                 savedir=self.cache_dir,
@@ -93,9 +82,10 @@ class SpeechBrainEncoder:
             )

             self.model_loaded = True
+            print("ECAPA-TDNN model loaded successfully!")
             return True
         except Exception as e:
-            print(f"Error loading ECAPA-TDNN model: {e}")
+            print(f"SpeechBrain not available: {e}")
             return False

     def embed_utterance(self, audio, sr=16000):
@@ -126,16 +116,21 @@ class AudioProcessor:
     def __init__(self, encoder):
         self.encoder = encoder

-    def extract_embedding(self, audio_int16):
+    def extract_embedding(self, audio_data, sample_rate=16000):
         try:
-            float_audio = audio_int16.astype(np.float32) / 32768.0
+            # Ensure audio is float32 and normalized
+            if audio_data.dtype == np.int16:
+                float_audio = audio_data.astype(np.float32) / 32768.0
+            else:
+                float_audio = audio_data.astype(np.float32)

+            # Normalize if needed
             if np.abs(float_audio).max() > 1.0:
                 float_audio = float_audio / np.abs(float_audio).max()

-            embedding = self.encoder.embed_utterance(float_audio)
-
+            embedding = self.encoder.embed_utterance(float_audio, sample_rate)
             return embedding
+
         except Exception as e:
             print(f"Embedding extraction error: {e}")
             return np.zeros(self.encoder.embedding_dim)
@@ -271,20 +266,14 @@ class SpeakerChangeDetector:
         }


-class RealtimeSpeakerDiarization:
+class GradioSpeakerDiarization:
     def __init__(self):
         self.encoder = None
         self.audio_processor = None
         self.speaker_detector = None
-        self.recorder = None
-        self.recording_thread = None
-        self.sentence_queue = queue.Queue()
         self.full_sentences = []
         self.sentence_speakers = []
-        self.pending_sentences = []
-        self.displayed_text = ""
-        self.last_realtime_text = ""
-        self.is_running = False
+        self.is_initialized = False
         self.change_threshold = DEFAULT_CHANGE_THRESHOLD
         self.max_speakers = DEFAULT_MAX_SPEAKERS

@@ -294,6 +283,7 @@ class RealtimeSpeakerDiarization:
         device_str = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"Using device: {device_str}")

+        # Load SpeechBrain encoder
         self.encoder = SpeechBrainEncoder(device=device_str)
         success = self.encoder.load_model()

@@ -304,170 +294,62 @@ class RealtimeSpeakerDiarization:
                     change_threshold=self.change_threshold,
                     max_speakers=self.max_speakers
                 )
-                print("ECAPA-TDNN model loaded successfully!")
+                self.is_initialized = True
                 return True
             else:
-                print("Failed to load ECAPA-TDNN model")
                 return False
+
         except Exception as e:
             print(f"Model initialization error: {e}")
             return False

-    def live_text_detected(self, text):
-        """Callback for real-time transcription updates"""
-        text = text.strip()
-        if text:
-            sentence_delimiters = '.?!。'
-            prob_sentence_end = (
-                len(self.last_realtime_text) > 0
-                and text[-1] in sentence_delimiters
-                and self.last_realtime_text[-1] in sentence_delimiters
-            )
-
-            self.last_realtime_text = text
-
-            if prob_sentence_end and FAST_SENTENCE_END:
-                self.recorder.stop()
-            elif prob_sentence_end:
-                self.recorder.post_speech_silence_duration = SILENCE_THRESHS[0]
-            else:
-                self.recorder.post_speech_silence_duration = SILENCE_THRESHS[1]
-
-    def process_final_text(self, text):
-        """Process final transcribed text with speaker embedding"""
-        text = text.strip()
-        if text:
-            try:
-                bytes_data = self.recorder.last_transcription_bytes
-                self.sentence_queue.put((text, bytes_data))
-                self.pending_sentences.append(text)
-            except Exception as e:
-                print(f"Error processing final text: {e}")
-
-    def process_sentence_queue(self):
-        """Process sentences in the queue for speaker detection"""
-        while self.is_running:
-            try:
-                text, bytes_data = self.sentence_queue.get(timeout=1)
-
-                # Convert audio data to int16
-                audio_int16 = np.int16(bytes_data * 32767)
-
-                # Extract speaker embedding
-                speaker_embedding = self.audio_processor.extract_embedding(audio_int16)
-
-                # Store sentence and embedding
-                self.full_sentences.append((text, speaker_embedding))
-
-                # Fill in missing speaker assignments
-                while len(self.sentence_speakers) < len(self.full_sentences) - 1:
-                    self.sentence_speakers.append(0)
-
-                # Detect speaker changes
-                speaker_id, similarity = self.speaker_detector.add_embedding(speaker_embedding)
-                self.sentence_speakers.append(speaker_id)
-
-                # Remove from pending
-                if text in self.pending_sentences:
-                    self.pending_sentences.remove(text)
-
-            except queue.Empty:
-                continue
-            except Exception as e:
-                print(f"Error processing sentence: {e}")
-
-    def start_recording(self):
-        """Start the recording and transcription process"""
-        if self.encoder is None:
-            return "Please initialize models first!"
+    def transcribe_audio(self, audio_input):
+        """Process audio input and perform transcription with speaker diarization"""
+        if not self.is_initialized:
+            return "❌ Please initialize the system first!", self.get_formatted_conversation(), self.get_status_info()
+
+        if audio_input is None:
+            return "No audio received", self.get_formatted_conversation(), self.get_status_info()

         try:
-            # Setup recorder configuration
-            recorder_config = {
-                'spinner': False,
-                'use_microphone': USE_MICROPHONE,
-                'model': FINAL_TRANSCRIPTION_MODEL,
-                'language': TRANSCRIPTION_LANGUAGE,
-                'silero_sensitivity': SILERO_SENSITIVITY,
-                'webrtc_sensitivity': WEBRTC_SENSITIVITY,
-                'post_speech_silence_duration': SILENCE_THRESHS[1],
-                'min_length_of_recording': MIN_LENGTH_OF_RECORDING,
-                'pre_recording_buffer_duration': PRE_RECORDING_BUFFER_DURATION,
-                'min_gap_between_recordings': 0,
-                'enable_realtime_transcription': True,
-                'realtime_processing_pause': 0,
-                'realtime_model_type': REALTIME_TRANSCRIPTION_MODEL,
-                'on_realtime_transcription_update': self.live_text_detected,
-                'beam_size': FINAL_BEAM_SIZE,
-                'beam_size_realtime': REALTIME_BEAM_SIZE,
-                'buffer_size': BUFFER_SIZE,
-                'sample_rate': SAMPLE_RATE,
-            }
-
-            self.recorder = AudioToTextRecorder(**recorder_config)
+            # Handle different audio input formats
+            if isinstance(audio_input, tuple):
+                sample_rate, audio_data = audio_input
+            else:
+                # Assume it's a file path
+                import librosa
+                audio_data, sample_rate = librosa.load(audio_input, sr=16000)

-            # Start sentence processing thread
-            self.is_running = True
-            self.sentence_thread = threading.Thread(target=self.process_sentence_queue, daemon=True)
-            self.sentence_thread.start()
+            # Ensure audio is in the right format
+            if len(audio_data.shape) > 1:
+                audio_data = audio_data.mean(axis=1)  # Convert to mono

-            # Start audio capture thread
-            self.audio_thread = threading.Thread(target=self.capture_audio, daemon=True)
-            self.audio_thread.start()
+            # Perform simple transcription (placeholder - you'd want to integrate with Whisper or similar)
+            # For now, we'll just do speaker diarization
+            transcription = f"Audio segment {len(self.full_sentences) + 1} (duration: {len(audio_data)/sample_rate:.1f}s)"

-            # Start transcription thread
-            self.transcription_thread = threading.Thread(target=self.run_transcription, daemon=True)
-            self.transcription_thread.start()
+            # Extract speaker embedding
+            speaker_embedding = self.audio_processor.extract_embedding(audio_data, sample_rate)

-            return "Recording started successfully!"
+            # Store sentence and embedding
+            self.full_sentences.append((transcription, speaker_embedding))

-        except Exception as e:
-            return f"Error starting recording: {e}"
-
-    def capture_audio(self):
-        """Capture audio from default speaker/microphone"""
-        try:
-            device_id = str(sc.default_speaker().name if not USE_MICROPHONE else sc.default_microphone().name)
-            include_loopback = not USE_MICROPHONE
-
-            with sc.get_microphone(id=device_id, include_loopback=include_loopback).recorder(
-                samplerate=SAMPLE_RATE, blocksize=BUFFER_SIZE
-            ) as mic:
-                while self.is_running:
-                    audio_data = mic.record(numframes=BUFFER_SIZE)
-
-                    if audio_data.shape[1] > 1 and CHANNELS == 1:
-                        audio_data = audio_data[:, 0]
-
-                    audio_int16 = (audio_data.flatten() * 32767).astype(np.int16)
-                    audio_bytes = audio_int16.tobytes()
-                    self.recorder.feed_audio(audio_bytes)
-
-        except Exception as e:
-            print(f"Audio capture error: {e}")
-
-    def run_transcription(self):
-        """Run the transcription loop"""
-        try:
-            while self.is_running:
-                self.recorder.text(self.process_final_text)
+            # Detect speaker changes
+            speaker_id, similarity = self.speaker_detector.add_embedding(speaker_embedding)
+            self.sentence_speakers.append(speaker_id)
+
+            status_msg = f"✅ Processed audio segment. Detected as Speaker {speaker_id + 1} (similarity: {similarity:.3f})"
+
+            return status_msg, self.get_formatted_conversation(), self.get_status_info()
+
         except Exception as e:
-            print(f"Transcription error: {e}")
-
-    def stop_recording(self):
-        """Stop the recording process"""
-        self.is_running = False
-        if self.recorder:
-            self.recorder.stop()
-        return "Recording stopped!"
+            error_msg = f"❌ Error processing audio: {str(e)}"
+            return error_msg, self.get_formatted_conversation(), self.get_status_info()

     def clear_conversation(self):
         """Clear all conversation data"""
         self.full_sentences = []
         self.sentence_speakers = []
-        self.pending_sentences = []
-        self.displayed_text = ""
-        self.last_realtime_text = ""

         if self.speaker_detector:
             self.speaker_detector = SpeakerChangeDetector(
@@ -476,7 +358,7 @@ class RealtimeSpeakerDiarization:
                 max_speakers=self.max_speakers
             )

-        return "Conversation cleared!"
+        return "Conversation cleared!", self.get_formatted_conversation(), self.get_status_info()

     def update_settings(self, threshold, max_speakers):
         """Update speaker detection settings"""
@@ -487,18 +369,22 @@
             self.speaker_detector.set_change_threshold(threshold)
             self.speaker_detector.set_max_speakers(max_speakers)

-        return f"Settings updated: Threshold={threshold:.2f}, Max Speakers={max_speakers}"
+        status_msg = f"Settings updated: Threshold={threshold:.2f}, Max Speakers={max_speakers}"
+        return status_msg, self.get_formatted_conversation(), self.get_status_info()

     def get_formatted_conversation(self):
         """Get the formatted conversation with speaker colors"""
         try:
+            if not self.full_sentences:
+                return "No audio processed yet. Upload an audio file or record using the microphone."
+
             sentences_with_style = []

-            # Process completed sentences
             for i, sentence in enumerate(self.full_sentences):
                 sentence_text, _ = sentence
                 if i >= len(self.sentence_speakers):
                     color = "#FFFFFF"
+                    speaker_name = "Unknown"
                 else:
                     speaker_id = self.sentence_speakers[i]
                     color = self.speaker_detector.get_color_for_speaker(speaker_id)
@@ -507,15 +393,7 @@
                 sentences_with_style.append(
                     f'<span style="color:{color};"><b>{speaker_name}:</b> {sentence_text}</span>')

-            # Add pending sentences
-            for pending_sentence in self.pending_sentences:
-                sentences_with_style.append(
-                    f'<span style="color:#60FFFF;"><b>Processing:</b> {pending_sentence}</span>')
-
-            if sentences_with_style:
-                return "<br><br>".join(sentences_with_style)
-            else:
-                return "Waiting for speech input..."
+            return "<br><br>".join(sentences_with_style)

         except Exception as e:
             return f"Error formatting conversation: {e}"
@@ -533,7 +411,7 @@
             f"**Active Speakers:** {status['active_speakers']} of {status['max_speakers']}",
             f"**Last Similarity:** {status['last_similarity']:.3f}",
             f"**Change Threshold:** {status['threshold']:.2f}",
-            f"**Total Sentences:** {len(self.full_sentences)}",
+            f"**Total Segments:** {len(self.full_sentences)}",
             "",
             "**Speaker Segment Counts:**"
         ]
@@ -549,26 +427,21 @@


 # Global instance
-diarization_system = RealtimeSpeakerDiarization()
+diarization_system = GradioSpeakerDiarization()


 def initialize_system():
     """Initialize the diarization system"""
     success = diarization_system.initialize_models()
     if success:
-        return "✅ System initialized successfully! Models loaded."
+        return "✅ System initialized successfully! Models loaded.", "", ""
     else:
-        return "❌ Failed to initialize system. Please check the logs."
-
+        return "❌ Failed to initialize system. Please check the logs.", "", ""

-def start_recording():
-    """Start recording and transcription"""
-    return diarization_system.start_recording()

-
-def stop_recording():
-    """Stop recording and transcription"""
-    return diarization_system.stop_recording()
+def process_audio(audio):
+    """Process uploaded or recorded audio"""
+    return diarization_system.transcribe_audio(audio)


 def clear_conversation():
@@ -581,44 +454,52 @@ def update_settings(threshold, max_speakers):
     return diarization_system.update_settings(threshold, max_speakers)


-def get_conversation():
-    """Get the current conversation"""
-    return diarization_system.get_formatted_conversation()
-
-
-def get_status():
-    """Get system status"""
-    return diarization_system.get_status_info()
-
-
 # Create Gradio interface
 def create_interface():
-    with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Dark()) as app:
-        gr.Markdown("# 🎤 Real-time Speech Recognition with Speaker Diarization")
-        gr.Markdown("This app performs real-time speech recognition with automatic speaker identification and color-coding.")
+    with gr.Blocks(title="Speaker Diarization", theme=gr.themes.Soft()) as app:
+        gr.Markdown("# 🎤 Audio Speaker Diarization")
+        gr.Markdown("Upload audio files or record directly to identify different speakers using voice characteristics.")

         with gr.Row():
             with gr.Column(scale=2):
-                # Main conversation display
-                conversation_output = gr.HTML(
-                    value="<i>Click 'Initialize System' to start...</i>",
-                    label="Live Conversation"
-                )
-
-                # Control buttons
+                # Initialize button
                 with gr.Row():
-                    init_btn = gr.Button("🔧 Initialize System", variant="secondary")
-                    start_btn = gr.Button("🎙️ Start Recording", variant="primary", interactive=False)
-                    stop_btn = gr.Button("⏹️ Stop Recording", variant="stop", interactive=False)
-                    clear_btn = gr.Button("🗑️ Clear Conversation", interactive=False)
+                    init_btn = gr.Button("🔧 Initialize System", variant="primary", size="lg")
+
+                # Audio input options
+                gr.Markdown("### 📁 Audio Input")
+                with gr.Tab("Upload Audio File"):
+                    audio_file = gr.Audio(
+                        label="Upload Audio File",
+                        type="filepath",
+                        sources=["upload"]
+                    )
+                    process_file_btn = gr.Button("Process Audio File", variant="secondary")

-                # Status display
+                with gr.Tab("Record Audio"):
+                    audio_mic = gr.Audio(
+                        label="Record Audio",
+                        type="numpy",
+                        sources=["microphone"]
+                    )
+                    process_mic_btn = gr.Button("Process Recording", variant="secondary")
+
+                # Results display
                 status_output = gr.Textbox(
-                    label="System Status",
-                    value="System not initialized",
-                    lines=8,
+                    label="Status",
+                    value="Click 'Initialize System' to start...",
+                    lines=2,
                     interactive=False
                 )
+
+                conversation_output = gr.HTML(
+                    value="<i>System not initialized...</i>",
+                    label="Speaker Analysis Results"
+                )
+
+                # Control buttons
+                with gr.Row():
+                    clear_btn = gr.Button("🗑️ Clear Results", variant="stop")

             with gr.Column(scale=1):
                 # Settings panel
@@ -630,7 +511,7 @@ def create_interface():
                     step=0.05,
                     value=DEFAULT_CHANGE_THRESHOLD,
                     label="Speaker Change Sensitivity",
-                    info="Lower values = more sensitive to speaker changes"
+                    info="Lower = more sensitive to speaker changes"
                 )

                 max_speakers_slider = gr.Slider(
@@ -641,88 +522,51 @@
                     label="Maximum Number of Speakers"
                 )

-                update_settings_btn = gr.Button("Update Settings")
+                update_settings_btn = gr.Button("Update Settings", variant="secondary")
+
+                # System status
+                system_status = gr.Textbox(
+                    label="System Status",
+                    value="System not initialized",
+                    lines=12,
+                    interactive=False
+                )

                 # Speaker color legend
                 gr.Markdown("## 🎨 Speaker Colors")
                 color_info = []
-                for i, (color, name) in enumerate(zip(SPEAKER_COLORS, SPEAKER_COLOR_NAMES)):
-                    color_info.append(f'<span style="color:{color};">■</span> Speaker {i+1} ({name})')
+                for i, (color, name) in enumerate(zip(SPEAKER_COLORS[:DEFAULT_MAX_SPEAKERS], SPEAKER_COLOR_NAMES[:DEFAULT_MAX_SPEAKERS])):
+                    color_info.append(f'<span style="color:{color};">●</span> Speaker {i+1} ({name})')

-                gr.HTML("<br>".join(color_info[:DEFAULT_MAX_SPEAKERS]))
-
-        # Auto-refresh conversation and status
-        def refresh_display():
-            return get_conversation(), get_status()
+                gr.HTML("<br>".join(color_info))

         # Event handlers
-        def on_initialize():
-            result = initialize_system()
-            if "successfully" in result:
-                return (
-                    result,
-                    gr.update(interactive=True),   # start_btn
-                    gr.update(interactive=True),   # clear_btn
-                    get_conversation(),
-                    get_status()
-                )
-            else:
-                return (
-                    result,
-                    gr.update(interactive=False),  # start_btn
-                    gr.update(interactive=False),  # clear_btn
-                    get_conversation(),
-                    get_status()
-                )
-
-        def on_start():
-            result = start_recording()
-            return (
-                result,
-                gr.update(interactive=False),  # start_btn
-                gr.update(interactive=True),   # stop_btn
-            )
-
-        def on_stop():
-            result = stop_recording()
-            return (
-                result,
-                gr.update(interactive=True),   # start_btn
-                gr.update(interactive=False),  # stop_btn
-            )
-
-        # Connect event handlers
         init_btn.click(
-            on_initialize,
-            outputs=[status_output, start_btn, clear_btn, conversation_output, status_output]
+            initialize_system,
+            outputs=[status_output, conversation_output, system_status]
        )

-        start_btn.click(
-            on_start,
-            outputs=[status_output, start_btn, stop_btn]
+        process_file_btn.click(
+            process_audio,
+            inputs=[audio_file],
+            outputs=[status_output, conversation_output, system_status]
        )

-        stop_btn.click(
-            on_stop,
-            outputs=[status_output, start_btn, stop_btn]
+        process_mic_btn.click(
+            process_audio,
+            inputs=[audio_mic],
+            outputs=[status_output, conversation_output, system_status]
        )

         clear_btn.click(
             clear_conversation,
-            outputs=[status_output]
+            outputs=[status_output, conversation_output, system_status]
        )

         update_settings_btn.click(
             update_settings,
             inputs=[threshold_slider, max_speakers_slider],
-            outputs=[status_output]
-        )
-
-        # Auto-refresh every 2 seconds when recording
-        refresh_timer = gr.Timer(2.0)
-        refresh_timer.tick(
-            refresh_display,
-            outputs=[conversation_output, status_output]
+            outputs=[status_output, conversation_output, system_status]
        )

     return app
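
Note on the reworked input path: the commit drops the soundcard/RealtimeSTT capture threads and instead lets Gradio hand each finished clip to transcribe_audio(), either as a (sample_rate, int16 ndarray) tuple from the microphone tab (type="numpy") or as a file path from the upload tab (type="filepath"). The sketch below condenses that handling into a standalone helper for testing outside the app; to_float_mono is an illustrative name, not a function in this repo.

# Minimal sketch of the Gradio audio handling that transcribe_audio() relies on.
# Assumes numpy and librosa are installed; to_float_mono is hypothetical.
import numpy as np

def to_float_mono(audio_input):
    """Coerce either Gradio audio format to mono float32 plus a sample rate."""
    if isinstance(audio_input, tuple):
        # Microphone tab (type="numpy"): (sample_rate, int16 samples)
        sample_rate, audio_data = audio_input
        audio_data = np.asarray(audio_data)
    else:
        # Upload tab (type="filepath"): a path string; resample to 16 kHz
        import librosa  # lazy import, mirroring the commit
        audio_data, sample_rate = librosa.load(audio_input, sr=16000)

    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)  # stereo -> mono
    if audio_data.dtype == np.int16:
        audio_data = audio_data.astype(np.float32) / 32768.0  # same scaling as app.py
    return audio_data.astype(np.float32), sample_rate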
realtime_diarize.py DELETED
@@ -1,581 +0,0 @@
-import gradio as gr
-import numpy as np
-import queue
-import torch
-import time
-import threading
-import os
-import urllib.request
-import torchaudio
-from scipy.spatial.distance import cosine
-import json
-import io
-import wave
-
-# Simplified configuration parameters
-SILENCE_THRESHS = [0, 0.4]
-FINAL_TRANSCRIPTION_MODEL = "distil-large-v3"
-FINAL_BEAM_SIZE = 5
-REALTIME_TRANSCRIPTION_MODEL = "distil-small.en"
-REALTIME_BEAM_SIZE = 5
-TRANSCRIPTION_LANGUAGE = "en"
-SILERO_SENSITIVITY = 0.4
-WEBRTC_SENSITIVITY = 3
-MIN_LENGTH_OF_RECORDING = 0.7
-PRE_RECORDING_BUFFER_DURATION = 0.35
-
-# Speaker change detection parameters
-DEFAULT_CHANGE_THRESHOLD = 0.7
-EMBEDDING_HISTORY_SIZE = 5
-MIN_SEGMENT_DURATION = 1.0
-DEFAULT_MAX_SPEAKERS = 4
-ABSOLUTE_MAX_SPEAKERS = 10
-
-# Global variables
-FAST_SENTENCE_END = True
-SAMPLE_RATE = 16000
-BUFFER_SIZE = 512
-CHANNELS = 1
-
-# Speaker colors
-SPEAKER_COLORS = [
-    "#FFFF00",  # Yellow
-    "#FF0000",  # Red
-    "#00FF00",  # Green
-    "#00FFFF",  # Cyan
-    "#FF00FF",  # Magenta
-    "#0000FF",  # Blue
-    "#FF8000",  # Orange
-    "#00FF80",  # Spring Green
-    "#8000FF",  # Purple
-    "#FFFFFF",  # White
-]
-
-SPEAKER_COLOR_NAMES = [
-    "Yellow", "Red", "Green", "Cyan", "Magenta",
-    "Blue", "Orange", "Spring Green", "Purple", "White"
-]
-
-
-
-
-
-class SpeechBrainEncoder:
-    """ECAPA-TDNN encoder from SpeechBrain for speaker embeddings"""
-    def __init__(self, device="cpu"):
-        self.device = device
-        self.model = None
-        self.embedding_dim = 192
-        self.model_loaded = False
-        self.cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "speechbrain")
-        os.makedirs(self.cache_dir, exist_ok=True)
-
-    def load_model(self):
-        """Load the ECAPA-TDNN model"""
-        try:
-            from speechbrain.pretrained import EncoderClassifier
-
-            self.model = EncoderClassifier.from_hparams(
-                source="speechbrain/spkrec-ecapa-voxceleb",
-                savedir=self.cache_dir,
-                run_opts={"device": self.device}
-            )
-
-            self.model_loaded = True
-            print("ECAPA-TDNN model loaded successfully!")
-            return True
-        except Exception as e:
-            print(f"SpeechBrain not available: {e}")
-            return False
-
-    def embed_utterance(self, audio, sr=16000):
-        """Extract speaker embedding from audio"""
-        if not self.model_loaded:
-            raise ValueError("Model not loaded. Call load_model() first.")
-
-        try:
-            if isinstance(audio, np.ndarray):
-                waveform = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)
-            else:
-                waveform = audio.unsqueeze(0)
-
-            if sr != 16000:
-                waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)
-
-            with torch.no_grad():
-                embedding = self.model.encode_batch(waveform)
-
-            return embedding.squeeze().cpu().numpy()
-        except Exception as e:
-            print(f"Error extracting embedding: {e}")
-            return np.zeros(self.embedding_dim)
-
-
-class AudioProcessor:
-    """Processes audio data to extract speaker embeddings"""
-    def __init__(self, encoder):
-        self.encoder = encoder
-
-    def extract_embedding(self, audio_data, sample_rate=16000):
-        try:
-            # Ensure audio is float32 and normalized
-            if audio_data.dtype == np.int16:
-                float_audio = audio_data.astype(np.float32) / 32768.0
-            else:
-                float_audio = audio_data.astype(np.float32)
-
-            # Normalize if needed
-            if np.abs(float_audio).max() > 1.0:
-                float_audio = float_audio / np.abs(float_audio).max()
-
-            embedding = self.encoder.embed_utterance(float_audio, sample_rate)
-            return embedding
-
-        except Exception as e:
-            print(f"Embedding extraction error: {e}")
-            return np.zeros(self.encoder.embedding_dim)
-
-
-class SpeakerChangeDetector:
-    """Speaker change detector that supports a configurable number of speakers"""
-    def __init__(self, embedding_dim=192, change_threshold=DEFAULT_CHANGE_THRESHOLD, max_speakers=DEFAULT_MAX_SPEAKERS):
-        self.embedding_dim = embedding_dim
-        self.change_threshold = change_threshold
-        self.max_speakers = min(max_speakers, ABSOLUTE_MAX_SPEAKERS)
-        self.current_speaker = 0
-        self.previous_embeddings = []
-        self.last_change_time = time.time()
-        self.mean_embeddings = [None] * self.max_speakers
-        self.speaker_embeddings = [[] for _ in range(self.max_speakers)]
-        self.last_similarity = 0.0
-        self.active_speakers = set([0])
-
-    def set_max_speakers(self, max_speakers):
-        """Update the maximum number of speakers"""
-        new_max = min(max_speakers, ABSOLUTE_MAX_SPEAKERS)
-
-        if new_max < self.max_speakers:
-            for speaker_id in list(self.active_speakers):
-                if speaker_id >= new_max:
-                    self.active_speakers.discard(speaker_id)
-
-            if self.current_speaker >= new_max:
-                self.current_speaker = 0
-
-        if new_max > self.max_speakers:
-            self.mean_embeddings.extend([None] * (new_max - self.max_speakers))
-            self.speaker_embeddings.extend([[] for _ in range(new_max - self.max_speakers)])
-        else:
-            self.mean_embeddings = self.mean_embeddings[:new_max]
-            self.speaker_embeddings = self.speaker_embeddings[:new_max]
-
-        self.max_speakers = new_max
-
-    def set_change_threshold(self, threshold):
-        """Update the threshold for detecting speaker changes"""
-        self.change_threshold = max(0.1, min(threshold, 0.99))
-
-    def add_embedding(self, embedding, timestamp=None):
-        """Add a new embedding and check if there's a speaker change"""
-        current_time = timestamp or time.time()
-
-        if not self.previous_embeddings:
-            self.previous_embeddings.append(embedding)
-            self.speaker_embeddings[self.current_speaker].append(embedding)
-            if self.mean_embeddings[self.current_speaker] is None:
-                self.mean_embeddings[self.current_speaker] = embedding.copy()
-            return self.current_speaker, 1.0
-
-        current_mean = self.mean_embeddings[self.current_speaker]
-        if current_mean is not None:
-            similarity = 1.0 - cosine(embedding, current_mean)
-        else:
-            similarity = 1.0 - cosine(embedding, self.previous_embeddings[-1])
-
-        self.last_similarity = similarity
-
-        time_since_last_change = current_time - self.last_change_time
-        is_speaker_change = False
-
-        if time_since_last_change >= MIN_SEGMENT_DURATION:
-            if similarity < self.change_threshold:
-                best_speaker = self.current_speaker
-                best_similarity = similarity
-
-                for speaker_id in range(self.max_speakers):
-                    if speaker_id == self.current_speaker:
-                        continue
-
-                    speaker_mean = self.mean_embeddings[speaker_id]
-
-                    if speaker_mean is not None:
-                        speaker_similarity = 1.0 - cosine(embedding, speaker_mean)
-
-                        if speaker_similarity > best_similarity:
-                            best_similarity = speaker_similarity
-                            best_speaker = speaker_id
-
-                if best_speaker != self.current_speaker:
-                    is_speaker_change = True
-                    self.current_speaker = best_speaker
-                elif len(self.active_speakers) < self.max_speakers:
-                    for new_id in range(self.max_speakers):
-                        if new_id not in self.active_speakers:
-                            is_speaker_change = True
-                            self.current_speaker = new_id
-                            self.active_speakers.add(new_id)
-                            break
-
-        if is_speaker_change:
-            self.last_change_time = current_time
-
-        self.previous_embeddings.append(embedding)
-        if len(self.previous_embeddings) > EMBEDDING_HISTORY_SIZE:
-            self.previous_embeddings.pop(0)
-
-        self.speaker_embeddings[self.current_speaker].append(embedding)
-        self.active_speakers.add(self.current_speaker)
-
-        if len(self.speaker_embeddings[self.current_speaker]) > 30:
-            self.speaker_embeddings[self.current_speaker] = self.speaker_embeddings[self.current_speaker][-30:]
-
-        if self.speaker_embeddings[self.current_speaker]:
-            self.mean_embeddings[self.current_speaker] = np.mean(
-                self.speaker_embeddings[self.current_speaker], axis=0
-            )
-
-        return self.current_speaker, similarity
-
-    def get_color_for_speaker(self, speaker_id):
-        """Return color for speaker ID"""
-        if 0 <= speaker_id < len(SPEAKER_COLORS):
-            return SPEAKER_COLORS[speaker_id]
-        return "#FFFFFF"
-
-    def get_status_info(self):
-        """Return status information about the speaker change detector"""
-        speaker_counts = [len(self.speaker_embeddings[i]) for i in range(self.max_speakers)]
-
-        return {
-            "current_speaker": self.current_speaker,
-            "speaker_counts": speaker_counts,
-            "active_speakers": len(self.active_speakers),
-            "max_speakers": self.max_speakers,
-            "last_similarity": self.last_similarity,
-            "threshold": self.change_threshold
-        }
-
-
-class GradioSpeakerDiarization:
-    def __init__(self):
-        self.encoder = None
-        self.audio_processor = None
-        self.speaker_detector = None
-        self.full_sentences = []
-        self.sentence_speakers = []
-        self.is_initialized = False
-        self.change_threshold = DEFAULT_CHANGE_THRESHOLD
-        self.max_speakers = DEFAULT_MAX_SPEAKERS
-
-    def initialize_models(self):
-        """Initialize the speaker encoder model"""
-        try:
-            device_str = "cuda" if torch.cuda.is_available() else "cpu"
-            print(f"Using device: {device_str}")
-
-            # Load SpeechBrain encoder
-            self.encoder = SpeechBrainEncoder(device=device_str)
-            success = self.encoder.load_model()
-
-            if success:
-                self.audio_processor = AudioProcessor(self.encoder)
-                self.speaker_detector = SpeakerChangeDetector(
-                    embedding_dim=self.encoder.embedding_dim,
-                    change_threshold=self.change_threshold,
-                    max_speakers=self.max_speakers
-                )
-                self.is_initialized = True
-                return True
-            else:
-                return False
-
-        except Exception as e:
-            print(f"Model initialization error: {e}")
-            return False
-
-    def transcribe_audio(self, audio_input):
-        """Process audio input and perform transcription with speaker diarization"""
-        if not self.is_initialized:
-            return "❌ Please initialize the system first!", self.get_formatted_conversation(), self.get_status_info()
-
-        if audio_input is None:
-            return "No audio received", self.get_formatted_conversation(), self.get_status_info()
-
-        try:
-            # Handle different audio input formats
-            if isinstance(audio_input, tuple):
-                sample_rate, audio_data = audio_input
-            else:
-                # Assume it's a file path
-                import librosa
-                audio_data, sample_rate = librosa.load(audio_input, sr=16000)
-
-            # Ensure audio is in the right format
-            if len(audio_data.shape) > 1:
-                audio_data = audio_data.mean(axis=1)  # Convert to mono
-
-            # Perform simple transcription (placeholder - you'd want to integrate with Whisper or similar)
-            # For now, we'll just do speaker diarization
-            transcription = f"Audio segment {len(self.full_sentences) + 1} (duration: {len(audio_data)/sample_rate:.1f}s)"
-
-            # Extract speaker embedding
-            speaker_embedding = self.audio_processor.extract_embedding(audio_data, sample_rate)
-
-            # Store sentence and embedding
-            self.full_sentences.append((transcription, speaker_embedding))
-
-            # Detect speaker changes
-            speaker_id, similarity = self.speaker_detector.add_embedding(speaker_embedding)
-            self.sentence_speakers.append(speaker_id)
-
-            status_msg = f"✅ Processed audio segment. Detected as Speaker {speaker_id + 1} (similarity: {similarity:.3f})"
-
-            return status_msg, self.get_formatted_conversation(), self.get_status_info()
-
-        except Exception as e:
-            error_msg = f"❌ Error processing audio: {str(e)}"
-            return error_msg, self.get_formatted_conversation(), self.get_status_info()
-
-    def clear_conversation(self):
-        """Clear all conversation data"""
-        self.full_sentences = []
-        self.sentence_speakers = []
-
-        if self.speaker_detector:
-            self.speaker_detector = SpeakerChangeDetector(
-                embedding_dim=self.encoder.embedding_dim,
-                change_threshold=self.change_threshold,
-                max_speakers=self.max_speakers
-            )
-
-        return "Conversation cleared!", self.get_formatted_conversation(), self.get_status_info()
-
-    def update_settings(self, threshold, max_speakers):
-        """Update speaker detection settings"""
-        self.change_threshold = threshold
-        self.max_speakers = max_speakers
-
-        if self.speaker_detector:
-            self.speaker_detector.set_change_threshold(threshold)
-            self.speaker_detector.set_max_speakers(max_speakers)
-
-        status_msg = f"Settings updated: Threshold={threshold:.2f}, Max Speakers={max_speakers}"
-        return status_msg, self.get_formatted_conversation(), self.get_status_info()
-
-    def get_formatted_conversation(self):
-        """Get the formatted conversation with speaker colors"""
-        try:
-            if not self.full_sentences:
-                return "No audio processed yet. Upload an audio file or record using the microphone."
-
-            sentences_with_style = []
-
-            for i, sentence in enumerate(self.full_sentences):
-                sentence_text, _ = sentence
-                if i >= len(self.sentence_speakers):
-                    color = "#FFFFFF"
-                    speaker_name = "Unknown"
-                else:
-                    speaker_id = self.sentence_speakers[i]
-                    color = self.speaker_detector.get_color_for_speaker(speaker_id)
-                    speaker_name = f"Speaker {speaker_id + 1}"
-
-                sentences_with_style.append(
-                    f'<span style="color:{color};"><b>{speaker_name}:</b> {sentence_text}</span>')
-
-            return "<br><br>".join(sentences_with_style)
-
-        except Exception as e:
-            return f"Error formatting conversation: {e}"
-
-    def get_status_info(self):
-        """Get current status information"""
-        if not self.speaker_detector:
-            return "Speaker detector not initialized"
-
-        try:
-            status = self.speaker_detector.get_status_info()
-
-            status_lines = [
-                f"**Current Speaker:** {status['current_speaker'] + 1}",
-                f"**Active Speakers:** {status['active_speakers']} of {status['max_speakers']}",
-                f"**Last Similarity:** {status['last_similarity']:.3f}",
-                f"**Change Threshold:** {status['threshold']:.2f}",
-                f"**Total Segments:** {len(self.full_sentences)}",
-                "",
-                "**Speaker Segment Counts:**"
-            ]
-
-            for i in range(status['max_speakers']):
-                color_name = SPEAKER_COLOR_NAMES[i] if i < len(SPEAKER_COLOR_NAMES) else f"Speaker {i+1}"
-                status_lines.append(f"Speaker {i+1} ({color_name}): {status['speaker_counts'][i]}")
-
-            return "\n".join(status_lines)
-
-        except Exception as e:
-            return f"Error getting status: {e}"
-
-
-# Global instance
-diarization_system = GradioSpeakerDiarization()
-
-
-def initialize_system():
-    """Initialize the diarization system"""
-    success = diarization_system.initialize_models()
-    if success:
-        return "✅ System initialized successfully! Models loaded.", "", ""
-    else:
-        return "❌ Failed to initialize system. Please check the logs.", "", ""
-
-
-def process_audio(audio):
-    """Process uploaded or recorded audio"""
-    return diarization_system.transcribe_audio(audio)
-
-
-def clear_conversation():
-    """Clear the conversation"""
-    return diarization_system.clear_conversation()
-
-
-def update_settings(threshold, max_speakers):
-    """Update system settings"""
-    return diarization_system.update_settings(threshold, max_speakers)
-
-
-# Create Gradio interface
-def create_interface():
-    with gr.Blocks(title="Speaker Diarization", theme=gr.themes.Soft()) as app:
-        gr.Markdown("# 🎤 Audio Speaker Diarization")
-        gr.Markdown("Upload audio files or record directly to identify different speakers using voice characteristics.")
-
-        with gr.Row():
-            with gr.Column(scale=2):
-                # Initialize button
-                with gr.Row():
-                    init_btn = gr.Button("🔧 Initialize System", variant="primary", size="lg")
-
-                # Audio input options
-                gr.Markdown("### 📁 Audio Input")
-                with gr.Tab("Upload Audio File"):
-                    audio_file = gr.Audio(
-                        label="Upload Audio File",
-                        type="filepath",
-                        sources=["upload"]
-                    )
-                    process_file_btn = gr.Button("Process Audio File", variant="secondary")
-
-                with gr.Tab("Record Audio"):
-                    audio_mic = gr.Audio(
-                        label="Record Audio",
-                        type="numpy",
-                        sources=["microphone"]
-                    )
-                    process_mic_btn = gr.Button("Process Recording", variant="secondary")
-
-                # Results display
-                status_output = gr.Textbox(
-                    label="Status",
-                    value="Click 'Initialize System' to start...",
-                    lines=2,
-                    interactive=False
-                )
-
-                conversation_output = gr.HTML(
-                    value="<i>System not initialized...</i>",
-                    label="Speaker Analysis Results"
-                )
-
-                # Control buttons
-                with gr.Row():
-                    clear_btn = gr.Button("🗑️ Clear Results", variant="stop")
-
-            with gr.Column(scale=1):
-                # Settings panel
-                gr.Markdown("## ⚙️ Settings")
-
-                threshold_slider = gr.Slider(
-                    minimum=0.1,
-                    maximum=0.95,
-                    step=0.05,
-                    value=DEFAULT_CHANGE_THRESHOLD,
-                    label="Speaker Change Sensitivity",
-                    info="Lower = more sensitive to speaker changes"
-                )
-
-                max_speakers_slider = gr.Slider(
-                    minimum=2,
-                    maximum=ABSOLUTE_MAX_SPEAKERS,
-                    step=1,
-                    value=DEFAULT_MAX_SPEAKERS,
-                    label="Maximum Number of Speakers"
-                )
-
-                update_settings_btn = gr.Button("Update Settings", variant="secondary")
-
-                # System status
-                system_status = gr.Textbox(
-                    label="System Status",
-                    value="System not initialized",
-                    lines=12,
-                    interactive=False
-                )
-
-                # Speaker color legend
-                gr.Markdown("## 🎨 Speaker Colors")
-                color_info = []
-                for i, (color, name) in enumerate(zip(SPEAKER_COLORS[:DEFAULT_MAX_SPEAKERS], SPEAKER_COLOR_NAMES[:DEFAULT_MAX_SPEAKERS])):
-                    color_info.append(f'<span style="color:{color};">●</span> Speaker {i+1} ({name})')
-
-                gr.HTML("<br>".join(color_info))
-
-        # Event handlers
-        init_btn.click(
-            initialize_system,
-            outputs=[status_output, conversation_output, system_status]
-        )
-
-        process_file_btn.click(
-            process_audio,
-            inputs=[audio_file],
-            outputs=[status_output, conversation_output, system_status]
-        )
-
-        process_mic_btn.click(
-            process_audio,
-            inputs=[audio_mic],
-            outputs=[status_output, conversation_output, system_status]
-        )
-
-        clear_btn.click(
-            clear_conversation,
-            outputs=[status_output, conversation_output, system_status]
-        )
-
-        update_settings_btn.click(
-            update_settings,
-            inputs=[threshold_slider, max_speakers_slider],
-            outputs=[status_output, conversation_output, system_status]
-        )
-
-    return app
-
-
-if __name__ == "__main__":
-    app = create_interface()
-    app.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=True
-    )
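
Both the old and the new file share the clustering rule in SpeakerChangeDetector.add_embedding: each segment's ECAPA-TDNN embedding is scored against the running mean of the current speaker as 1 - cosine distance, and a similarity below change_threshold triggers a switch to the best-matching known speaker or, failing that, a fresh speaker slot. A self-contained toy reduction of that rule follows; assign() and the synthetic voices are illustrative only, and the real method additionally enforces MIN_SEGMENT_DURATION, EMBEDDING_HISTORY_SIZE, and the max_speakers cap.

# Toy reduction of the add_embedding decision rule (illustrative, not repo code).
import numpy as np
from scipy.spatial.distance import cosine

rng = np.random.default_rng(0)
means = {0: rng.normal(size=192)}  # running mean embedding per speaker id
threshold = 0.7                    # mirrors DEFAULT_CHANGE_THRESHOLD
current = 0

def assign(embedding):
    """Return (speaker_id, similarity) for a new segment embedding."""
    global current
    similarity = 1.0 - cosine(embedding, means[current])
    if similarity < threshold:
        # Prefer an existing speaker whose mean matches better...
        best, best_sim = current, similarity
        for spk, mean in means.items():
            sim = 1.0 - cosine(embedding, mean)
            if sim > best_sim:
                best, best_sim = spk, sim
        # ...otherwise open a new slot (the app caps this at max_speakers).
        current = best if best != current else len(means)
        means.setdefault(current, embedding.copy())
    return current, similarity

same_voice = means[0] + 0.05 * rng.normal(size=192)
new_voice = -means[0] + 0.05 * rng.normal(size=192)
print(assign(same_voice))  # stays speaker 0, similarity near 1.0
print(assign(new_voice))   # similarity far below 0.7 -> becomes speaker 1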