Saiyaswanth007 committed
Commit 10008f1 · 1 Parent(s): fd289b1
Files changed (1)
  1. app.py +430 -236
app.py CHANGED
@@ -1,16 +1,19 @@
  import gradio as gr
  import numpy as np
  import torch
- import torchaudio
  import time
  import os
  import urllib.request
- import queue
- import threading
  from scipy.spatial.distance import cosine
  from RealtimeSTT import AudioToTextRecorder

- # Configuration parameters (kept same as original)
  SILENCE_THRESHS = [0, 0.4]
  FINAL_TRANSCRIPTION_MODEL = "distil-large-v3"
  FINAL_BEAM_SIZE = 5
@@ -29,20 +32,28 @@ MIN_SEGMENT_DURATION = 1.0
  DEFAULT_MAX_SPEAKERS = 4
  ABSOLUTE_MAX_SPEAKERS = 10

- # Audio parameters
  FAST_SENTENCE_END = True
  SAMPLE_RATE = 16000
  BUFFER_SIZE = 512
  CHANNELS = 1

- # Speaker colors for HTML display
  SPEAKER_COLORS = [
-     "#FFFF00", "#FF0000", "#00FF00", "#00FFFF", "#FF00FF",
-     "#0000FF", "#FF8000", "#00FF80", "#8000FF", "#FFFFFF"
  ]

  SPEAKER_COLOR_NAMES = [
-     "Yellow", "Red", "Green", "Cyan", "Magenta",
      "Blue", "Orange", "Spring Green", "Purple", "White"
  ]

@@ -131,7 +142,7 @@ class AudioProcessor:


  class SpeakerChangeDetector:
-     """Speaker change detector with configurable number of speakers"""
      def __init__(self, embedding_dim=192, change_threshold=DEFAULT_CHANGE_THRESHOLD, max_speakers=DEFAULT_MAX_SPEAKERS):
          self.embedding_dim = embedding_dim
          self.change_threshold = change_threshold
@@ -245,28 +256,87 @@ class SpeakerChangeDetector:
          if 0 <= speaker_id < len(SPEAKER_COLORS):
              return SPEAKER_COLORS[speaker_id]
          return "#FFFFFF"


- class RealtimeASRDiarization:
-     """Main class for real-time ASR with speaker diarization"""
      def __init__(self):
          self.encoder = None
          self.audio_processor = None
          self.speaker_detector = None
          self.recorder = None
-         self.is_recording = False
          self.full_sentences = []
          self.sentence_speakers = []
          self.pending_sentences = []
          self.last_realtime_text = ""
-         self.sentence_queue = queue.Queue()
          self.change_threshold = DEFAULT_CHANGE_THRESHOLD
          self.max_speakers = DEFAULT_MAX_SPEAKERS

-         # Initialize model
-         self.initialize_model()
-
-     def initialize_model(self):
          """Initialize the speaker encoder model"""
          try:
              device_str = "cuda" if torch.cuda.is_available() else "cpu"
@@ -276,69 +346,95 @@ class RealtimeASRDiarization:
              success = self.encoder.load_model()

              if success:
-                 print("ECAPA-TDNN model loaded successfully!")
                  self.audio_processor = AudioProcessor(self.encoder)
                  self.speaker_detector = SpeakerChangeDetector(
                      embedding_dim=self.encoder.embedding_dim,
                      change_threshold=self.change_threshold,
                      max_speakers=self.max_speakers
                  )
-
-                 # Start sentence processing thread
-                 self.sentence_thread = threading.Thread(target=self.process_sentences, daemon=True)
-                 self.sentence_thread.start()
-
              else:
                  print("Failed to load ECAPA-TDNN model")
-
          except Exception as e:
              print(f"Model initialization error: {e}")

-     def process_sentences(self):
-         """Process sentences in background thread"""
-         while True:
              try:
-                 text, audio_bytes = self.sentence_queue.get(timeout=1)
-                 self.process_sentence(text, audio_bytes)
-             except queue.Empty:
-                 continue

-     def process_sentence(self, text, audio_bytes):
-         """Process a sentence with speaker diarization"""
-         if self.audio_processor is None or self.speaker_detector is None:
-             return
-
-         try:
-             # Convert audio data to int16
-             audio_int16 = np.int16(audio_bytes * 32767)
-
-             # Extract speaker embedding
-             speaker_embedding = self.audio_processor.extract_embedding(audio_int16)
-
-             # Store sentence and embedding
-             self.full_sentences.append((text, speaker_embedding))
-
-             # Fill in any missing speaker assignments
-             while len(self.sentence_speakers) < len(self.full_sentences) - 1:
-                 self.sentence_speakers.append(0)
-
-             # Detect speaker changes
-             speaker_id, similarity = self.speaker_detector.add_embedding(speaker_embedding)
-             self.sentence_speakers.append(speaker_id)
-
-             # Remove from pending
-             if text in self.pending_sentences:
-                 self.pending_sentences.remove(text)

-         except Exception as e:
-             print(f"Error processing sentence: {e}")

-     def setup_recorder(self):
-         """Setup the audio recorder"""
          try:
              recorder_config = {
                  'spinner': False,
-                 'use_microphone': False,
                  'model': FINAL_TRANSCRIPTION_MODEL,
                  'language': TRANSCRIPTION_LANGUAGE,
                  'silero_sensitivity': SILERO_SENSITIVITY,
@@ -356,119 +452,44 @@ class RealtimeASRDiarization:
                  'buffer_size': BUFFER_SIZE,
                  'sample_rate': SAMPLE_RATE,
              }
-
              self.recorder = AudioToTextRecorder(**recorder_config)
-             return True
-
-         except Exception as e:
-             print(f"Error setting up recorder: {e}")
-             return False
-
-     def live_text_detected(self, text):
-         """Handle live text detection"""
-         text = text.strip()
-         if not text:
-             return

-         sentence_delimiters = '.?!。'
-         prob_sentence_end = (
-             len(self.last_realtime_text) > 0
-             and text[-1] in sentence_delimiters
-             and self.last_realtime_text[-1] in sentence_delimiters
-         )
-
-         self.last_realtime_text = text
-
-         if prob_sentence_end:
-             if FAST_SENTENCE_END:
-                 self.recorder.stop()
-             else:
-                 self.recorder.post_speech_silence_duration = SILENCE_THRESHS[0]
-         else:
-             self.recorder.post_speech_silence_duration = SILENCE_THRESHS[1]
-
-     def process_audio_chunk(self, audio_chunk):
-         """Process incoming audio chunk from FastRTC"""
-         if self.recorder is None:
-             if not self.setup_recorder():
-                 return "Failed to setup recorder"
-
-         try:
-             # Convert audio to the format expected by the recorder
-             if isinstance(audio_chunk, tuple):
-                 sample_rate, audio_data = audio_chunk
-             else:
-                 audio_data = audio_chunk
-                 sample_rate = SAMPLE_RATE
-
-             # Ensure audio is in the right format
-             if audio_data.dtype != np.int16:
-                 if audio_data.dtype == np.float32 or audio_data.dtype == np.float64:
-                     audio_data = (audio_data * 32767).astype(np.int16)
-                 else:
-                     audio_data = audio_data.astype(np.int16)
-
-             # Convert to bytes and feed to recorder
-             audio_bytes = audio_data.tobytes()
-             self.recorder.feed_audio(audio_bytes)
-
-             # Process final text if available
-             def process_final_text(text):
-                 text = text.strip()
-                 if text:
-                     self.pending_sentences.append(text)
-                     audio_bytes = self.recorder.last_transcription_bytes
-                     self.sentence_queue.put((text, audio_bytes))

-             # Get transcription
-             self.recorder.text(process_final_text)

-             return self.get_formatted_transcript()

          except Exception as e:
-             print(f"Error processing audio: {e}")
-             return f"Error: {e}"

-     def get_formatted_transcript(self):
-         """Get formatted transcript with speaker labels"""
          try:
-             transcript_parts = []
-
-             # Add completed sentences with speaker labels
-             for i, (sentence_text, _) in enumerate(self.full_sentences):
-                 if i < len(self.sentence_speakers):
-                     speaker_id = self.sentence_speakers[i]
-                     speaker_label = f"Speaker {speaker_id + 1}"
-                     transcript_parts.append(f"{speaker_label}: {sentence_text}")
-
-             # Add pending sentences
-             for pending in self.pending_sentences:
-                 transcript_parts.append(f"[Processing]: {pending}")
-
-             # Add current live text
-             if self.last_realtime_text:
-                 transcript_parts.append(f"[Live]: {self.last_realtime_text}")
-
-             return "\n".join(transcript_parts)
-
          except Exception as e:
-             print(f"Error formatting transcript: {e}")
-             return "Error formatting transcript"

-     def update_settings(self, change_threshold, max_speakers):
-         """Update diarization settings"""
-         self.change_threshold = change_threshold
-         self.max_speakers = max_speakers
-
-         if self.speaker_detector:
-             self.speaker_detector.set_change_threshold(change_threshold)
-             self.speaker_detector.set_max_speakers(max_speakers)

-     def clear_transcript(self):
-         """Clear all transcript data"""
          self.full_sentences = []
          self.sentence_speakers = []
          self.pending_sentences = []
          self.last_realtime_text = ""

          if self.speaker_detector:
@@ -477,122 +498,295 @@ class RealtimeASRDiarization:
                  change_threshold=self.change_threshold,
                  max_speakers=self.max_speakers
              )


  # Global instance
- asr_diarization = RealtimeASRDiarization()


- def process_audio_stream(audio_chunk, change_threshold, max_speakers):
-     """Process audio stream and return transcript"""
-     # Update settings if changed
-     asr_diarization.update_settings(change_threshold, max_speakers)
-
-     # Process audio
-     transcript = asr_diarization.process_audio_chunk(audio_chunk)
-
-     return transcript


- def clear_transcript():
-     """Clear the transcript"""
-     asr_diarization.clear_transcript()
-     return "Transcript cleared. Ready for new input..."


  def create_interface():
-     """Create Gradio interface with FastRTC"""
-
-     with gr.Blocks(title="Real-time Speaker Diarization") as iface:
-         gr.Markdown("# Real-time ASR with Speaker Diarization")
-         gr.Markdown("Speak into your microphone to see real-time transcription with speaker labels!")

          with gr.Row():
-             with gr.Column(scale=3):
-                 # Audio input with FastRTC
                  audio_input = gr.Audio(
                      sources=["microphone"],
                      streaming=True,
-                     label="Microphone Input"
                  )

-                 # Transcript output
-                 transcript_output = gr.Textbox(
-                     label="Live Transcript with Speaker Labels",
-                     lines=15,
-                     max_lines=20,
-                     value="Ready to start transcription...",
-                     interactive=False
                  )

              with gr.Column(scale=1):
-                 gr.Markdown("### Settings")

-                 # Speaker change threshold
-                 change_threshold = gr.Slider(
                      minimum=0.1,
                      maximum=0.95,
-                     value=DEFAULT_CHANGE_THRESHOLD,
                      step=0.05,
-                     label="Speaker Change Threshold",
                      info="Lower values = more sensitive to speaker changes"
                  )

-                 # Max speakers
-                 max_speakers = gr.Slider(
                      minimum=2,
                      maximum=ABSOLUTE_MAX_SPEAKERS,
-                     value=DEFAULT_MAX_SPEAKERS,
                      step=1,
-                     label="Maximum Speakers",
-                     info="Maximum number of speakers to detect"
                  )

-                 # Clear button
-                 clear_btn = gr.Button("Clear Transcript", variant="secondary")

-                 gr.Markdown("### Speaker Colors")
-                 color_info = "\n".join([
-                     f"Speaker {i+1}: {SPEAKER_COLOR_NAMES[i]}"
-                     for i in range(min(DEFAULT_MAX_SPEAKERS, len(SPEAKER_COLOR_NAMES)))
-                 ])
-                 gr.Markdown(color_info)
-
-         # Set up streaming
-         audio_input.stream(
-             fn=process_audio_stream,
-             inputs=[audio_input, change_threshold, max_speakers],
-             outputs=[transcript_output],
-             show_progress=False
          )

-         # Clear button functionality
          clear_btn.click(
-             fn=clear_transcript,
-             outputs=[transcript_output]
          )

-         gr.Markdown("""
-         ### Instructions:
-         1. Allow microphone access when prompted
-         2. Start speaking - transcription will appear in real-time
-         3. Different speakers will be automatically detected and labeled
-         4. Adjust the threshold if speaker changes aren't detected properly
-         5. Use the clear button to reset the transcript
-
-         ### Notes:
-         - The system works best with clear audio and distinct speakers
-         - It may take a moment to load the speaker recognition model on first use
-         - Lower threshold values make the system more sensitive to speaker changes
-         """)

-     return iface


  if __name__ == "__main__":
-     # Create and launch the interface
-     iface = create_interface()
-     iface.launch(
          server_name="0.0.0.0",
          server_port=7860,
          share=True
  import gradio as gr
  import numpy as np
+ import queue
  import torch
  import time
+ import threading
  import os
  import urllib.request
+ import torchaudio
  from scipy.spatial.distance import cosine
  from RealtimeSTT import AudioToTextRecorder
+ import json
+ import io
+ import wave

+ # Simplified configuration parameters
  SILENCE_THRESHS = [0, 0.4]
  FINAL_TRANSCRIPTION_MODEL = "distil-large-v3"
  FINAL_BEAM_SIZE = 5

  DEFAULT_MAX_SPEAKERS = 4
  ABSOLUTE_MAX_SPEAKERS = 10

+ # Global variables
  FAST_SENTENCE_END = True
  SAMPLE_RATE = 16000
  BUFFER_SIZE = 512
  CHANNELS = 1

+ # Speaker colors
  SPEAKER_COLORS = [
+     "#FFFF00",  # Yellow
+     "#FF0000",  # Red
+     "#00FF00",  # Green
+     "#00FFFF",  # Cyan
+     "#FF00FF",  # Magenta
+     "#0000FF",  # Blue
+     "#FF8000",  # Orange
+     "#00FF80",  # Spring Green
+     "#8000FF",  # Purple
+     "#FFFFFF",  # White
  ]

  SPEAKER_COLOR_NAMES = [
+     "Yellow", "Red", "Green", "Cyan", "Magenta",
      "Blue", "Orange", "Spring Green", "Purple", "White"
  ]



  class SpeakerChangeDetector:
+     """Speaker change detector that supports a configurable number of speakers"""
      def __init__(self, embedding_dim=192, change_threshold=DEFAULT_CHANGE_THRESHOLD, max_speakers=DEFAULT_MAX_SPEAKERS):
          self.embedding_dim = embedding_dim
          self.change_threshold = change_threshold

          if 0 <= speaker_id < len(SPEAKER_COLORS):
              return SPEAKER_COLORS[speaker_id]
          return "#FFFFFF"
+
+     def get_status_info(self):
+         """Return status information about the speaker change detector"""
+         speaker_counts = [len(self.speaker_embeddings[i]) for i in range(self.max_speakers)]
+
+         return {
+             "current_speaker": self.current_speaker,
+             "speaker_counts": speaker_counts,
+             "active_speakers": len(self.active_speakers),
+             "max_speakers": self.max_speakers,
+             "last_similarity": self.last_similarity,
+             "threshold": self.change_threshold
+         }
+
+
+ class WebRTCAudioProcessor:
+     """Processes WebRTC audio streams for speaker diarization"""
+     def __init__(self, diarization_system):
+         self.diarization_system = diarization_system
+         self.audio_buffer = []
+         self.buffer_lock = threading.Lock()
+         self.processing_thread = None
+         self.is_processing = False
+
+     def process_audio(self, audio_data, sample_rate):
+         """Process incoming audio data from WebRTC"""
+         try:
+             # Convert audio data to numpy array if needed
+             if isinstance(audio_data, bytes):
+                 audio_array = np.frombuffer(audio_data, dtype=np.int16)
+             elif isinstance(audio_data, tuple):
+                 # Handle tuple format (sample_rate, audio_array)
+                 sample_rate, audio_array = audio_data
+                 if isinstance(audio_array, np.ndarray):
+                     if audio_array.dtype != np.int16:
+                         audio_array = (audio_array * 32767).astype(np.int16)
+                 else:
+                     audio_array = np.array(audio_array, dtype=np.int16)
+             else:
+                 audio_array = np.array(audio_data, dtype=np.int16)
+
+             # Ensure mono audio
+             if len(audio_array.shape) > 1:
+                 audio_array = audio_array[:, 0]
+
+             # Add to buffer
+             with self.buffer_lock:
+                 self.audio_buffer.extend(audio_array)
+
+                 # Process buffer when it's large enough (1 second of audio)
+                 if len(self.audio_buffer) >= sample_rate:
+                     buffer_to_process = np.array(self.audio_buffer[:sample_rate])
+                     self.audio_buffer = self.audio_buffer[sample_rate//2:]  # Keep 50% overlap
+
+                     # Feed to recorder in separate thread
+                     if self.diarization_system.recorder:
+                         audio_bytes = buffer_to_process.tobytes()
+                         self.diarization_system.recorder.feed_audio(audio_bytes)
+
+         except Exception as e:
+             print(f"Error processing WebRTC audio: {e}")


+ class RealtimeSpeakerDiarization:
      def __init__(self):
          self.encoder = None
          self.audio_processor = None
          self.speaker_detector = None
          self.recorder = None
+         self.webrtc_processor = None
+         self.sentence_queue = queue.Queue()
          self.full_sentences = []
          self.sentence_speakers = []
          self.pending_sentences = []
+         self.displayed_text = ""
          self.last_realtime_text = ""
+         self.is_running = False
          self.change_threshold = DEFAULT_CHANGE_THRESHOLD
          self.max_speakers = DEFAULT_MAX_SPEAKERS

+     def initialize_models(self):
          """Initialize the speaker encoder model"""
          try:
              device_str = "cuda" if torch.cuda.is_available() else "cpu"

              success = self.encoder.load_model()

              if success:
                  self.audio_processor = AudioProcessor(self.encoder)
                  self.speaker_detector = SpeakerChangeDetector(
                      embedding_dim=self.encoder.embedding_dim,
                      change_threshold=self.change_threshold,
                      max_speakers=self.max_speakers
                  )
+                 self.webrtc_processor = WebRTCAudioProcessor(self)
+                 print("ECAPA-TDNN model loaded successfully!")
+                 return True
              else:
                  print("Failed to load ECAPA-TDNN model")
+                 return False
          except Exception as e:
              print(f"Model initialization error: {e}")
+             return False

+     def live_text_detected(self, text):
+         """Callback for real-time transcription updates"""
+         text = text.strip()
+         if text:
+             sentence_delimiters = '.?!。'
+             prob_sentence_end = (
+                 len(self.last_realtime_text) > 0
+                 and text[-1] in sentence_delimiters
+                 and self.last_realtime_text[-1] in sentence_delimiters
+             )
+
+             self.last_realtime_text = text
+
+             if prob_sentence_end and FAST_SENTENCE_END:
+                 self.recorder.stop()
+             elif prob_sentence_end:
+                 self.recorder.post_speech_silence_duration = SILENCE_THRESHS[0]
+             else:
+                 self.recorder.post_speech_silence_duration = SILENCE_THRESHS[1]
+
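
The two-delimiter check above means a single sentence-final update is not enough: both the previous realtime update and the current one must end in a delimiter before the recorder is stopped. A standalone illustration of the predicate (the strings are hypothetical):

    delims = '.?!。'

    def prob_sentence_end(prev, cur):
        return len(prev) > 0 and cur[-1] in delims and prev[-1] in delims

    assert prob_sentence_end("How are you?", "How are you?")
    assert not prob_sentence_end("How are", "How are you?")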
+     def process_final_text(self, text):
+         """Process final transcribed text with speaker embedding"""
+         text = text.strip()
+         if text:
              try:
+                 bytes_data = self.recorder.last_transcription_bytes
+                 self.sentence_queue.put((text, bytes_data))
+                 self.pending_sentences.append(text)
+             except Exception as e:
+                 print(f"Error processing final text: {e}")

+     def process_sentence_queue(self):
+         """Process sentences in the queue for speaker detection"""
+         while self.is_running:
+             try:
+                 text, bytes_data = self.sentence_queue.get(timeout=1)
+
+                 # Convert audio data to int16
+                 audio_int16 = np.int16(bytes_data * 32767)
+
+                 # Extract speaker embedding
+                 speaker_embedding = self.audio_processor.extract_embedding(audio_int16)
+
+                 # Store sentence and embedding
+                 self.full_sentences.append((text, speaker_embedding))
+
+                 # Fill in missing speaker assignments
+                 while len(self.sentence_speakers) < len(self.full_sentences) - 1:
+                     self.sentence_speakers.append(0)
+
+                 # Detect speaker changes
+                 speaker_id, similarity = self.speaker_detector.add_embedding(speaker_embedding)
+                 self.sentence_speakers.append(speaker_id)
+
+                 # Remove from pending
+                 if text in self.pending_sentences:
+                     self.pending_sentences.remove(text)
+
+             except queue.Empty:
+                 continue
+             except Exception as e:
+                 print(f"Error processing sentence: {e}")

+     def start_recording(self):
+         """Start the recording and transcription process"""
+         if self.encoder is None:
+             return "Please initialize models first!"
+
          try:
+             # Setup recorder configuration for WebRTC input
              recorder_config = {
                  'spinner': False,
+                 'use_microphone': False,  # We'll feed audio manually
                  'model': FINAL_TRANSCRIPTION_MODEL,
                  'language': TRANSCRIPTION_LANGUAGE,
                  'silero_sensitivity': SILERO_SENSITIVITY,

                  'buffer_size': BUFFER_SIZE,
                  'sample_rate': SAMPLE_RATE,
              }
+
              self.recorder = AudioToTextRecorder(**recorder_config)

+             # Start sentence processing thread
+             self.is_running = True
+             self.sentence_thread = threading.Thread(target=self.process_sentence_queue, daemon=True)
+             self.sentence_thread.start()

+             # Start transcription thread
+             self.transcription_thread = threading.Thread(target=self.run_transcription, daemon=True)
+             self.transcription_thread.start()

+             return "Recording started successfully! WebRTC audio input ready."

          except Exception as e:
+             return f"Error starting recording: {e}"

+     def run_transcription(self):
+         """Run the transcription loop"""
          try:
+             while self.is_running:
+                 self.recorder.text(self.process_final_text)
          except Exception as e:
+             print(f"Transcription error: {e}")

+     def stop_recording(self):
+         """Stop the recording process"""
+         self.is_running = False
+         if self.recorder:
+             self.recorder.stop()
+         return "Recording stopped!"

+     def clear_conversation(self):
+         """Clear all conversation data"""
          self.full_sentences = []
          self.sentence_speakers = []
          self.pending_sentences = []
+         self.displayed_text = ""
          self.last_realtime_text = ""

          if self.speaker_detector:

                  change_threshold=self.change_threshold,
                  max_speakers=self.max_speakers
              )
+
+         return "Conversation cleared!"
+
+     def update_settings(self, threshold, max_speakers):
+         """Update speaker detection settings"""
+         self.change_threshold = threshold
+         self.max_speakers = max_speakers
+
+         if self.speaker_detector:
+             self.speaker_detector.set_change_threshold(threshold)
+             self.speaker_detector.set_max_speakers(max_speakers)
+
+         return f"Settings updated: Threshold={threshold:.2f}, Max Speakers={max_speakers}"
+
+     def get_formatted_conversation(self):
+         """Get the formatted conversation with speaker colors"""
+         try:
+             sentences_with_style = []
+
+             # Process completed sentences
+             for i, sentence in enumerate(self.full_sentences):
+                 sentence_text, _ = sentence
+                 if i >= len(self.sentence_speakers):
+                     color = "#FFFFFF"
+                     speaker_name = "Speaker ?"  # fallback while diarization is still pending
+                 else:
+                     speaker_id = self.sentence_speakers[i]
+                     color = self.speaker_detector.get_color_for_speaker(speaker_id)
+                     speaker_name = f"Speaker {speaker_id + 1}"
+
+                 sentences_with_style.append(
+                     f'<span style="color:{color};"><b>{speaker_name}:</b> {sentence_text}</span>')
+
+             # Add pending sentences
+             for pending_sentence in self.pending_sentences:
+                 sentences_with_style.append(
+                     f'<span style="color:#60FFFF;"><b>Processing:</b> {pending_sentence}</span>')
+
+             if sentences_with_style:
+                 return "<br><br>".join(sentences_with_style)
+             else:
+                 return "Waiting for speech input..."
+
+         except Exception as e:
+             return f"Error formatting conversation: {e}"
+
+     def get_status_info(self):
+         """Get current status information"""
+         if not self.speaker_detector:
+             return "Speaker detector not initialized"
+
+         try:
+             status = self.speaker_detector.get_status_info()
+
+             status_lines = [
+                 f"**Current Speaker:** {status['current_speaker'] + 1}",
+                 f"**Active Speakers:** {status['active_speakers']} of {status['max_speakers']}",
+                 f"**Last Similarity:** {status['last_similarity']:.3f}",
+                 f"**Change Threshold:** {status['threshold']:.2f}",
+                 f"**Total Sentences:** {len(self.full_sentences)}",
+                 "",
+                 "**Speaker Segment Counts:**"
+             ]
+
+             for i in range(status['max_speakers']):
+                 color_name = SPEAKER_COLOR_NAMES[i] if i < len(SPEAKER_COLOR_NAMES) else f"Speaker {i+1}"
+                 status_lines.append(f"Speaker {i+1} ({color_name}): {status['speaker_counts'][i]}")
+
+             return "\n".join(status_lines)
+
+         except Exception as e:
+             return f"Error getting status: {e}"


  # Global instance
+ diarization_system = RealtimeSpeakerDiarization()


+ def initialize_system():
+     """Initialize the diarization system"""
+     success = diarization_system.initialize_models()
+     if success:
+         return "✅ System initialized successfully! Models loaded."
+     else:
+         return "❌ Failed to initialize system. Please check the logs."
+
+
+ def start_recording():
+     """Start recording and transcription"""
+     return diarization_system.start_recording()
+
+
+ def stop_recording():
+     """Stop recording and transcription"""
+     return diarization_system.stop_recording()
+
+
+ def clear_conversation():
+     """Clear the conversation"""
+     return diarization_system.clear_conversation()
+
+
+ def update_settings(threshold, max_speakers):
+     """Update system settings"""
+     return diarization_system.update_settings(threshold, max_speakers)
+
+
+ def get_conversation():
+     """Get the current conversation"""
+     return diarization_system.get_formatted_conversation()
+
+
+ def get_status():
+     """Get system status"""
+     return diarization_system.get_status_info()
+
+
+ def process_audio_stream(audio):
+     """Process audio stream from WebRTC"""
+     if diarization_system.webrtc_processor and diarization_system.is_running:
+         diarization_system.webrtc_processor.process_audio(audio, SAMPLE_RATE)
+     return None
+
+
+ # Create Gradio interface
  def create_interface():
+     with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Dark()) as app:
+         gr.Markdown("# 🎤 Real-time Speech Recognition with Speaker Diarization")
+         gr.Markdown("This app performs real-time speech recognition with automatic speaker identification and color-coding using WebRTC.")

          with gr.Row():
+             with gr.Column(scale=2):
+                 # WebRTC Audio Input
                  audio_input = gr.Audio(
                      sources=["microphone"],
                      streaming=True,
+                     label="🎙️ Microphone Input",
+                     type="numpy"
                  )

+                 # Main conversation display
+                 conversation_output = gr.HTML(
+                     value="<i>Click 'Initialize System' to start...</i>",
+                     label="Live Conversation"
                  )

+                 # Control buttons
+                 with gr.Row():
+                     init_btn = gr.Button("🔧 Initialize System", variant="secondary")
+                     start_btn = gr.Button("🎙️ Start Recording", variant="primary", interactive=False)
+                     stop_btn = gr.Button("⏹️ Stop Recording", variant="stop", interactive=False)
+                     clear_btn = gr.Button("🗑️ Clear Conversation", interactive=False)
+
+                 # Status display
+                 status_output = gr.Textbox(
+                     label="System Status",
+                     value="System not initialized",
+                     lines=8,
+                     interactive=False
+                 )
+
              with gr.Column(scale=1):
+                 # Settings panel
+                 gr.Markdown("## ⚙️ Settings")

+                 threshold_slider = gr.Slider(
                      minimum=0.1,
                      maximum=0.95,
                      step=0.05,
+                     value=DEFAULT_CHANGE_THRESHOLD,
+                     label="Speaker Change Sensitivity",
                      info="Lower values = more sensitive to speaker changes"
                  )

+                 max_speakers_slider = gr.Slider(
                      minimum=2,
                      maximum=ABSOLUTE_MAX_SPEAKERS,
                      step=1,
+                     value=DEFAULT_MAX_SPEAKERS,
+                     label="Maximum Number of Speakers"
                  )

+                 update_settings_btn = gr.Button("Update Settings")

+                 # Instructions
+                 gr.Markdown("## 📝 Instructions")
+                 gr.Markdown("""
+                 1. Click **Initialize System** to load models
+                 2. Click **Start Recording** to begin processing
+                 3. Allow microphone access when prompted
+                 4. Speak into your microphone
+                 5. Watch real-time transcription with speaker labels
+                 6. Adjust settings as needed
+                 """)
+
+                 # Speaker color legend
+                 gr.Markdown("## 🎨 Speaker Colors")
+                 color_info = []
+                 for i, (color, name) in enumerate(zip(SPEAKER_COLORS, SPEAKER_COLOR_NAMES)):
+                     color_info.append(f'<span style="color:{color};">■</span> Speaker {i+1} ({name})')
+
+                 gr.HTML("<br>".join(color_info[:DEFAULT_MAX_SPEAKERS]))
+
+         # Auto-refresh conversation and status
+         def refresh_display():
+             return get_conversation(), get_status()
+
+ # Event handlers
708
+ def on_initialize():
709
+ result = initialize_system()
710
+ if "successfully" in result:
711
+ return (
712
+ result,
713
+ gr.update(interactive=True), # start_btn
714
+ gr.update(interactive=True), # clear_btn
715
+ get_conversation(),
716
+ get_status()
717
+ )
718
+ else:
719
+ return (
720
+ result,
721
+ gr.update(interactive=False), # start_btn
722
+ gr.update(interactive=False), # clear_btn
723
+ get_conversation(),
724
+ get_status()
725
+ )
726
+
727
+ def on_start():
728
+ result = start_recording()
729
+ return (
730
+ result,
731
+ gr.update(interactive=False), # start_btn
732
+ gr.update(interactive=True), # stop_btn
733
+ )
734
+
735
+ def on_stop():
736
+ result = stop_recording()
737
+ return (
738
+ result,
739
+ gr.update(interactive=True), # start_btn
740
+ gr.update(interactive=False), # stop_btn
741
+ )
742
+
743
+ # Connect event handlers
744
+ init_btn.click(
745
+ on_initialize,
746
+ outputs=[status_output, start_btn, clear_btn, conversation_output, status_output]
747
+ )
748
+
749
+ start_btn.click(
750
+ on_start,
751
+ outputs=[status_output, start_btn, stop_btn]
752
+ )
753
+
754
+ stop_btn.click(
755
+ on_stop,
756
+ outputs=[status_output, start_btn, stop_btn]
757
  )
758
 
 
759
  clear_btn.click(
760
+ clear_conversation,
761
+ outputs=[status_output]
762
+ )
763
+
764
+ update_settings_btn.click(
765
+ update_settings,
766
+ inputs=[threshold_slider, max_speakers_slider],
767
+ outputs=[status_output]
768
  )
769
 
770
+ # Connect WebRTC audio stream to processing
771
+ audio_input.stream(
772
+ process_audio_stream,
773
+ inputs=[audio_input],
774
+ outputs=[]
775
+ )
776
+
777
+ # Auto-refresh every 2 seconds when recording
778
+ refresh_timer = gr.Timer(2.0)
779
+ refresh_timer.tick(
780
+ refresh_display,
781
+ outputs=[conversation_output, status_output]
782
+ )
783
 
784
+ return app
785
 
786
 
787
  if __name__ == "__main__":
788
+ app = create_interface()
789
+ app.launch(
 
790
  server_name="0.0.0.0",
791
  server_port=7860,
792
  share=True
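
For orientation, a minimal headless sketch of the new control flow, assuming the new app.py is importable as `app` (the silent chunk is a stand-in for real WebRTC audio):

    import numpy as np
    from app import RealtimeSpeakerDiarization, SAMPLE_RATE

    system = RealtimeSpeakerDiarization()
    if system.initialize_models():                     # loads the ECAPA-TDNN encoder
        print(system.start_recording())                # spawns transcription + queue threads
        chunk = np.zeros(SAMPLE_RATE, dtype=np.int16)  # 1 s of silence
        system.webrtc_processor.process_audio(chunk, SAMPLE_RATE)
        print(system.get_formatted_conversation())
        system.stop_recording()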