Commit 117eca9 (parent: acd1802): Adding fastrtc

app.py CHANGED
@@ -8,7 +8,6 @@ import os
 import urllib.request
 import torchaudio
 from scipy.spatial.distance import cosine
-from RealtimeSTT import AudioToTextRecorder
 import json
 import io
 import wave
@@ -126,14 +125,13 @@ class AudioProcessor:
     def __init__(self, encoder):
         self.encoder = encoder
 
-    def extract_embedding(self,
         try:
-
 
-
-            float_audio = float_audio / np.abs(float_audio).max()
-
-            embedding = self.encoder.embed_utterance(float_audio)
 
             return embedding
         except Exception as e:
@@ -271,52 +269,58 @@ class SpeakerChangeDetector:
         }
 
 
-class
-    """
-    def __init__(self,
-        self.
-        self.
-        self.
-        self.
-        self.is_processing = False
 
-    def
-        """
         try:
-
-            if isinstance(audio_data, bytes):
-                audio_array = np.frombuffer(audio_data, dtype=np.int16)
-            elif isinstance(audio_data, tuple):
-                # Handle tuple format (sample_rate, audio_array)
-                sample_rate, audio_array = audio_data
-                if isinstance(audio_array, np.ndarray):
-                    if audio_array.dtype != np.int16:
-                        audio_array = (audio_array * 32767).astype(np.int16)
-                else:
-                    audio_array = np.array(audio_array, dtype=np.int16)
-            else:
-                audio_array = np.array(audio_data, dtype=np.int16)
 
-
-
-
 
-            # Add to buffer
-            with self.buffer_lock:
-                self.audio_buffer.extend(audio_array)
-
-            # Process buffer when it's large enough (1 second of audio)
-            if len(self.audio_buffer) >= sample_rate:
-                buffer_to_process = np.array(self.audio_buffer[:sample_rate])
-                self.audio_buffer = self.audio_buffer[sample_rate//2:]  # Keep 50% overlap
-
-                # Feed to recorder in separate thread
-                if self.diarization_system.recorder:
-                    audio_bytes = buffer_to_process.tobytes()
-                    self.diarization_system.recorder.feed_audio(audio_bytes)
-
         except Exception as e:
-            print(f"
 
 
 class RealtimeSpeakerDiarization:
@@ -324,86 +328,112 @@ class RealtimeSpeakerDiarization:
         self.encoder = None
         self.audio_processor = None
         self.speaker_detector = None
-        self.
-        self.
         self.sentence_queue = queue.Queue()
         self.full_sentences = []
         self.sentence_speakers = []
         self.pending_sentences = []
         self.displayed_text = ""
-        self.last_realtime_text = ""
         self.is_running = False
         self.change_threshold = DEFAULT_CHANGE_THRESHOLD
         self.max_speakers = DEFAULT_MAX_SPEAKERS
 
     def initialize_models(self):
-        """Initialize the speaker encoder
         try:
             device_str = "cuda" if torch.cuda.is_available() else "cpu"
             print(f"Using device: {device_str}")
 
             self.encoder = SpeechBrainEncoder(device=device_str)
-
 
-            if
                 self.audio_processor = AudioProcessor(self.encoder)
                 self.speaker_detector = SpeakerChangeDetector(
                     embedding_dim=self.encoder.embedding_dim,
                     change_threshold=self.change_threshold,
                     max_speakers=self.max_speakers
                 )
-
-                print("ECAPA-TDNN model loaded successfully!")
                 return True
             else:
-                print("Failed to load
                 return False
         except Exception as e:
             print(f"Model initialization error: {e}")
             return False
 
-    def
-        """
-
-
-
-
-
-
-
-        )
-
-        self.last_realtime_text = text
-
-        if prob_sentence_end and FAST_SENTENCE_END:
-            self.recorder.stop()
-        elif prob_sentence_end:
-            self.recorder.post_speech_silence_duration = SILENCE_THRESHS[0]
         else:
-
 
-    def
-        """Process
-
-
-
-
-
-
-
-
 
     def process_sentence_queue(self):
         """Process sentences in the queue for speaker detection"""
         while self.is_running:
             try:
-                text,
-
-                # Convert audio data to int16
-                audio_int16 = np.int16(bytes_data * 32767)
-
-                # Extract speaker embedding
-                speaker_embedding = self.audio_processor.extract_embedding(audio_int16)
 
                 # Store sentence and embedding
                 self.full_sentences.append((text, speaker_embedding))
@@ -416,10 +446,6 @@ class RealtimeSpeakerDiarization:
                 speaker_id, similarity = self.speaker_detector.add_embedding(speaker_embedding)
                 self.sentence_speakers.append(speaker_id)
 
-                # Remove from pending
-                if text in self.pending_sentences:
-                    self.pending_sentences.remove(text)
-
             except queue.Empty:
                 continue
             except Exception as e:
@@ -431,57 +457,20 @@ class RealtimeSpeakerDiarization:
             return "Please initialize models first!"
 
         try:
-            # Setup recorder configuration for WebRTC input
-            recorder_config = {
-                'spinner': False,
-                'use_microphone': False,  # We'll feed audio manually
-                'model': FINAL_TRANSCRIPTION_MODEL,
-                'language': TRANSCRIPTION_LANGUAGE,
-                'silero_sensitivity': SILERO_SENSITIVITY,
-                'webrtc_sensitivity': WEBRTC_SENSITIVITY,
-                'post_speech_silence_duration': SILENCE_THRESHS[1],
-                'min_length_of_recording': MIN_LENGTH_OF_RECORDING,
-                'pre_recording_buffer_duration': PRE_RECORDING_BUFFER_DURATION,
-                'min_gap_between_recordings': 0,
-                'enable_realtime_transcription': True,
-                'realtime_processing_pause': 0,
-                'realtime_model_type': REALTIME_TRANSCRIPTION_MODEL,
-                'on_realtime_transcription_update': self.live_text_detected,
-                'beam_size': FINAL_BEAM_SIZE,
-                'beam_size_realtime': REALTIME_BEAM_SIZE,
-                'buffer_size': BUFFER_SIZE,
-                'sample_rate': SAMPLE_RATE,
-            }
-
-            self.recorder = AudioToTextRecorder(**recorder_config)
-
             # Start sentence processing thread
             self.is_running = True
-            self.
-            self.
-
-            # Start transcription thread
-            self.transcription_thread = threading.Thread(target=self.run_transcription, daemon=True)
-            self.transcription_thread.start()
 
-            return "Recording started successfully!
 
         except Exception as e:
             return f"Error starting recording: {e}"
 
-    def run_transcription(self):
-        """Run the transcription loop"""
-        try:
-            while self.is_running:
-                self.recorder.text(self.process_final_text)
-        except Exception as e:
-            print(f"Transcription error: {e}")
-
     def stop_recording(self):
         """Stop the recording process"""
         self.is_running = False
-
-        self.recorder.stop()
         return "Recording stopped!"
 
     def clear_conversation(self):
@@ -490,7 +479,7 @@ class RealtimeSpeakerDiarization:
         self.sentence_speakers = []
         self.pending_sentences = []
         self.displayed_text = ""
-        self.
 
         if self.speaker_detector:
             self.speaker_detector = SpeakerChangeDetector(
@@ -522,6 +511,7 @@ class RealtimeSpeakerDiarization:
             sentence_text, _ = sentence
             if i >= len(self.sentence_speakers):
                 color = "#FFFFFF"
             else:
                 speaker_id = self.sentence_speakers[i]
                 color = self.speaker_detector.get_color_for_speaker(speaker_id)
@@ -530,11 +520,6 @@ class RealtimeSpeakerDiarization:
             sentences_with_style.append(
                 f'<span style="color:{color};"><b>{speaker_name}:</b> {sentence_text}</span>')
 
-        # Add pending sentences
-        for pending_sentence in self.pending_sentences:
-            sentences_with_style.append(
-                f'<span style="color:#60FFFF;"><b>Processing:</b> {pending_sentence}</span>')
-
         if sentences_with_style:
             return "<br><br>".join(sentences_with_style)
         else:
@@ -557,6 +542,7 @@ class RealtimeSpeakerDiarization:
            f"**Last Similarity:** {status['last_similarity']:.3f}",
            f"**Change Threshold:** {status['threshold']:.2f}",
            f"**Total Sentences:** {len(self.full_sentences)}",
            "",
            "**Speaker Segment Counts:**"
        ]
@@ -614,27 +600,28 @@ def get_status():
     return diarization_system.get_status_info()
 
 
-def
-    """Process audio
-    if
-
-
 
 
 # Create Gradio interface
 def create_interface():
-    with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.
         gr.Markdown("# 🎤 Real-time Speech Recognition with Speaker Diarization")
-        gr.Markdown("This app performs real-time speech recognition with automatic speaker identification and color-coding using
 
         with gr.Row():
             with gr.Column(scale=2):
-                #
                 audio_input = gr.Audio(
-
                     streaming=True,
-                    label="🎙️ Microphone Input"
-                    type="numpy"
                 )
 
                 # Main conversation display
|
|
654 |
status_output = gr.Textbox(
|
655 |
label="System Status",
|
656 |
value="System not initialized",
|
657 |
-
lines=
|
658 |
interactive=False
|
659 |
)
|
660 |
|
@@ -681,17 +668,6 @@ def create_interface():
|
|
681 |
|
682 |
update_settings_btn = gr.Button("Update Settings")
|
683 |
|
684 |
-
# Instructions
|
685 |
-
gr.Markdown("## 📝 Instructions")
|
686 |
-
gr.Markdown("""
|
687 |
-
1. Click **Initialize System** to load models
|
688 |
-
2. Click **Start Recording** to begin processing
|
689 |
-
3. Allow microphone access when prompted
|
690 |
-
4. Speak into your microphone
|
691 |
-
5. Watch real-time transcription with speaker labels
|
692 |
-
6. Adjust settings as needed
|
693 |
-
""")
|
694 |
-
|
695 |
# Speaker color legend
|
696 |
gr.Markdown("## 🎨 Speaker Colors")
|
697 |
color_info = []
|
@@ -699,10 +675,18 @@ def create_interface():
             color_info.append(f'<span style="color:{color};">■</span> Speaker {i+1} ({name})')
 
         gr.HTML("<br>".join(color_info[:DEFAULT_MAX_SPEAKERS]))
-
-
-
-
 
         # Event handlers
         def on_initialize():
@@ -767,17 +751,19 @@ def create_interface():
             outputs=[status_output]
         )
 
-        #
         audio_input.stream(
-
             inputs=[audio_input],
-            outputs=[]
         )
 
-        # Auto-refresh every
-        refresh_timer = gr.Timer(
         refresh_timer.tick(
-
             outputs=[conversation_output, status_output]
         )
 
     def __init__(self, encoder):
         self.encoder = encoder
 
+    def extract_embedding(self, audio_float):
         try:
+            # Ensure audio is in the right format
+            if np.abs(audio_float).max() > 1.0:
+                audio_float = audio_float / np.abs(audio_float).max()
 
+            embedding = self.encoder.embed_utterance(audio_float)
 
             return embedding
         except Exception as e:
         }
 
 
+class WhisperTranscriber:
+    """Simple Whisper transcriber for audio chunks"""
+    def __init__(self, model_name="distil-large-v3"):
+        self.model = None
+        self.processor = None
+        self.model_name = model_name
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
 
+    def load_model(self):
+        """Load Whisper model"""
         try:
+            from transformers import WhisperProcessor, WhisperForConditionalGeneration
 
+            self.processor = WhisperProcessor.from_pretrained(f"distil-whisper/{self.model_name}")
+            self.model = WhisperForConditionalGeneration.from_pretrained(f"distil-whisper/{self.model_name}")
+            self.model.to(self.device)
+
+            return True
+        except Exception as e:
+            print(f"Error loading Whisper model: {e}")
+            return False
+
+    def transcribe(self, audio_array, sample_rate=16000):
+        """Transcribe audio array"""
+        try:
+            if self.model is None:
+                return ""
+
+            # Ensure audio is the right sample rate
+            if sample_rate != 16000:
+                audio_array = torchaudio.functional.resample(
+                    torch.tensor(audio_array).float(),
+                    orig_freq=sample_rate,
+                    new_freq=16000
+                ).numpy()
+
+            # Process audio
+            inputs = self.processor(audio_array, sampling_rate=16000, return_tensors="pt")
+            inputs = inputs.to(self.device)
+
+            # Generate transcription
+            with torch.no_grad():
+                predicted_ids = self.model.generate(inputs["input_features"])
+
+            # Decode transcription
+            transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)
+
+            return transcription[0] if transcription else ""
 
         except Exception as e:
+            print(f"Transcription error: {e}")
+            return ""
 
 
 class RealtimeSpeakerDiarization:
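For reference, a minimal way to exercise the WhisperTranscriber added above (hypothetical usage, not part of the commit; assumes torch and transformers are installed and that the input is a 16 kHz mono float32 array):

import numpy as np

transcriber = WhisperTranscriber("distil-large-v3")
if transcriber.load_model():
    # Two seconds of silence as a stand-in for a real microphone chunk
    chunk = np.zeros(2 * 16000, dtype=np.float32)
    print(transcriber.transcribe(chunk, sample_rate=16000) or "(no speech)")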
         self.encoder = None
         self.audio_processor = None
         self.speaker_detector = None
+        self.transcriber = None
+        self.audio_buffer = []
+        self.processing_thread = None
         self.sentence_queue = queue.Queue()
         self.full_sentences = []
         self.sentence_speakers = []
         self.pending_sentences = []
         self.displayed_text = ""
         self.is_running = False
         self.change_threshold = DEFAULT_CHANGE_THRESHOLD
         self.max_speakers = DEFAULT_MAX_SPEAKERS
+        self.audio_chunks = []
+        self.chunk_counter = 0
 
     def initialize_models(self):
+        """Initialize the speaker encoder and transcription models"""
         try:
             device_str = "cuda" if torch.cuda.is_available() else "cpu"
             print(f"Using device: {device_str}")
 
+            # Initialize speaker encoder
             self.encoder = SpeechBrainEncoder(device=device_str)
+            encoder_success = self.encoder.load_model()
+
+            # Initialize transcriber
+            self.transcriber = WhisperTranscriber(FINAL_TRANSCRIPTION_MODEL)
+            transcriber_success = self.transcriber.load_model()
 
+            if encoder_success and transcriber_success:
                 self.audio_processor = AudioProcessor(self.encoder)
                 self.speaker_detector = SpeakerChangeDetector(
                     embedding_dim=self.encoder.embedding_dim,
                     change_threshold=self.change_threshold,
                     max_speakers=self.max_speakers
                 )
+                print("Models loaded successfully!")
                 return True
             else:
+                print("Failed to load models")
                 return False
         except Exception as e:
             print(f"Model initialization error: {e}")
             return False
 
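A hypothetical start-up sequence for this class (illustration only; in app.py the module-level instance `diarization_system` plays this role and is driven by the Gradio callbacks):

system = RealtimeSpeakerDiarization()
if system.initialize_models():         # loads the SpeechBrain encoder and the distil-Whisper model
    print(system.start_recording())    # starts the background sentence-processing thread
    ...                                 # stream microphone chunks via process_audio_stream(...)
    print(system.stop_recording())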
+    def process_audio_stream(self, audio_data, sample_rate):
+        """Process incoming audio stream data"""
+        if not self.is_running or self.encoder is None:
+            return
+
+        try:
+            # Convert audio data to numpy array if needed
+            if isinstance(audio_data, tuple):
+                sample_rate, audio_array = audio_data
             else:
+                audio_array = audio_data
+
+            # Ensure audio is float32 and normalized
+            if audio_array.dtype != np.float32:
+                if audio_array.dtype == np.int16:
+                    audio_array = audio_array.astype(np.float32) / 32768.0
+                else:
+                    audio_array = audio_array.astype(np.float32)
+
+            # Ensure mono audio
+            if len(audio_array.shape) > 1 and audio_array.shape[1] > 1:
+                audio_array = np.mean(audio_array, axis=1)
+
+            # Add to buffer
+            self.audio_buffer.extend(audio_array.flatten())
+
+            # Process when we have enough audio (about 2 seconds)
+            target_length = int(sample_rate * 2.0)
+            if len(self.audio_buffer) >= target_length:
+                self.process_audio_chunk()
+
+        except Exception as e:
+            print(f"Error processing audio stream: {e}")
 
+    def process_audio_chunk(self):
+        """Process accumulated audio chunk"""
+        try:
+            if len(self.audio_buffer) < SAMPLE_RATE:  # Need at least 1 second
+                return
+
+            # Get audio chunk
+            audio_chunk = np.array(self.audio_buffer[:int(SAMPLE_RATE * 2)])
+            self.audio_buffer = self.audio_buffer[int(SAMPLE_RATE * 1.5):]  # Keep some overlap
+
+            # Transcribe audio
+            transcription = self.transcriber.transcribe(audio_chunk, SAMPLE_RATE)
+
+            if transcription.strip():
+                # Extract speaker embedding
+                speaker_embedding = self.audio_processor.extract_embedding(audio_chunk)
+
+                # Add to queue for processing
+                self.sentence_queue.put((transcription.strip(), speaker_embedding))
+
+        except Exception as e:
+            print(f"Error processing audio chunk: {e}")
 
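The buffering arithmetic behind process_audio_chunk(), sketched with SAMPLE_RATE assumed to be 16000 (the constant is defined elsewhere in app.py; 16 kHz is an assumption here):

SAMPLE_RATE = 16000                  # assumed value of the module-level constant
chunk_len = int(SAMPLE_RATE * 2)     # 32000 samples -> 2.0 s transcribed per chunk
advance = int(SAMPLE_RATE * 1.5)     # 24000 samples dropped from the front of the buffer
overlap = chunk_len - advance        # 8000 samples -> 0.5 s of audio shared with the next chunk
print(chunk_len, advance, overlap)   # 32000 24000 8000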
     def process_sentence_queue(self):
         """Process sentences in the queue for speaker detection"""
         while self.is_running:
             try:
+                text, speaker_embedding = self.sentence_queue.get(timeout=1)
 
                 # Store sentence and embedding
                 self.full_sentences.append((text, speaker_embedding))
 
                 speaker_id, similarity = self.speaker_detector.add_embedding(speaker_embedding)
                 self.sentence_speakers.append(speaker_id)
 
             except queue.Empty:
                 continue
             except Exception as e:
             return "Please initialize models first!"
 
         try:
             # Start sentence processing thread
             self.is_running = True
+            self.processing_thread = threading.Thread(target=self.process_sentence_queue, daemon=True)
+            self.processing_thread.start()
 
+            return "Recording started successfully! Start speaking into your microphone."
 
         except Exception as e:
             return f"Error starting recording: {e}"
 
     def stop_recording(self):
         """Stop the recording process"""
         self.is_running = False
+        self.audio_buffer = []
         return "Recording stopped!"
 
     def clear_conversation(self):
         self.sentence_speakers = []
         self.pending_sentences = []
         self.displayed_text = ""
+        self.audio_buffer = []
 
         if self.speaker_detector:
             self.speaker_detector = SpeakerChangeDetector(
             sentence_text, _ = sentence
             if i >= len(self.sentence_speakers):
                 color = "#FFFFFF"
+                speaker_name = "Speaker ?"
             else:
                 speaker_id = self.sentence_speakers[i]
                 color = self.speaker_detector.get_color_for_speaker(speaker_id)
             sentences_with_style.append(
                 f'<span style="color:{color};"><b>{speaker_name}:</b> {sentence_text}</span>')
 
         if sentences_with_style:
             return "<br><br>".join(sentences_with_style)
         else:
            f"**Last Similarity:** {status['last_similarity']:.3f}",
            f"**Change Threshold:** {status['threshold']:.2f}",
            f"**Total Sentences:** {len(self.full_sentences)}",
+            f"**Audio Buffer Size:** {len(self.audio_buffer)}",
            "",
            "**Speaker Segment Counts:**"
        ]
     return diarization_system.get_status_info()
 
 
+def process_audio(audio_data):
+    """Process audio from Gradio audio input"""
+    if audio_data is not None:
+        sample_rate, audio_array = audio_data
+        diarization_system.process_audio_stream(audio_array, sample_rate)
+    return get_conversation(), get_status()
 
 
 # Create Gradio interface
 def create_interface():
+    with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Dark()) as app:
         gr.Markdown("# 🎤 Real-time Speech Recognition with Speaker Diarization")
+        gr.Markdown("This app performs real-time speech recognition with automatic speaker identification and color-coding using your browser's microphone.")
 
         with gr.Row():
             with gr.Column(scale=2):
+                # Audio input
                 audio_input = gr.Audio(
+                    source="microphone",
+                    type="numpy",
                     streaming=True,
+                    label="🎙️ Microphone Input"
                 )
 
                 # Main conversation display
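For orientation, the value Gradio hands to process_audio() from a streaming gr.Audio(type="numpy") component is a (sample_rate, samples) tuple, with samples typically an int16 array (mono or shaped (n, channels)); the concrete numbers below are illustrative only:

import numpy as np

example_chunk = (48000, np.zeros(24000, dtype=np.int16))   # ~0.5 s of silence at 48 kHz
sample_rate, audio_array = example_chunk
audio_float = audio_array.astype(np.float32) / 32768.0     # same conversion process_audio_stream() applies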
                 status_output = gr.Textbox(
                     label="System Status",
                     value="System not initialized",
+                    lines=10,
                     interactive=False
                 )
 
 
                 update_settings_btn = gr.Button("Update Settings")
 
         # Speaker color legend
         gr.Markdown("## 🎨 Speaker Colors")
         color_info = []
 
             color_info.append(f'<span style="color:{color};">■</span> Speaker {i+1} ({name})')
 
         gr.HTML("<br>".join(color_info[:DEFAULT_MAX_SPEAKERS]))
+
+        # Instructions
+        gr.Markdown("""
+        ## 📋 Instructions
+        1. **Initialize System** - Load AI models
+        2. **Allow microphone access** when prompted
+        3. **Start Recording** - Begin real-time processing
+        4. **Speak naturally** - The system will detect different speakers
+        5. **Stop Recording** when done
+
+        **Note:** Processing happens in real-time with ~2 second chunks for better accuracy.
+        """)
 
         # Event handlers
         def on_initialize():
             outputs=[status_output]
         )
 
+        # Process streaming audio
         audio_input.stream(
+            process_audio,
             inputs=[audio_input],
+            outputs=[conversation_output, status_output],
+            time_limit=60,
+            stream_every=0.5
         )
 
+        # Auto-refresh every 3 seconds
+        refresh_timer = gr.Timer(3.0)
         refresh_timer.tick(
+            lambda: (get_conversation(), get_status()),
             outputs=[conversation_output, status_output]
         )
 
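The launch block falls outside this diff; a typical wiring, assuming create_interface() returns the Blocks object bound to `app` above, would look like this (hypothetical, not part of the commit):

if __name__ == "__main__":
    demo = create_interface()
    demo.queue().launch()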