Saiyaswanth007 committed on
Commit
4611564
·
1 Parent(s): 21bc664

Code Update

Browse files
Files changed (2) hide show
  1. realtime_diarize.py +523 -0
  2. requirements.txt +184 -0
realtime_diarize.py ADDED
@@ -0,0 +1,523 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import time
4
+ import queue
5
+ import threading
6
+ import signal
7
+ import atexit
8
+ from contextlib import contextmanager
9
+ import warnings
10
+ warnings.filterwarnings("ignore", category=UserWarning)
11
+
12
+ import numpy as np
13
+ import torch
14
+ import torchaudio
15
+ from scipy.spatial.distance import cosine
16
+
17
+ try:
18
+ import soundcard as sc
19
+ except ImportError:
20
+ print("soundcard not found. Install with: pip install soundcard")
21
+ sys.exit(1)
22
+
23
+ try:
24
+ from RealtimeSTT import AudioToTextRecorder
25
+ except ImportError:
26
+ print("RealtimeSTT not found. Install with: pip install RealtimeSTT")
27
+ sys.exit(1)
28
+
29
+ # Configuration
30
class Config:
    """Static tuning values shared by the capture, transcription and speaker stages."""

    # --- Audio capture -------------------------------------------------
    SAMPLE_RATE = 16_000  # passed as the recorder samplerate and the MFCC sample_rate
    BUFFER_SIZE = 1024    # frames requested per captured block
    CHANNELS = 1

    # --- Transcription (forwarded to AudioToTextRecorder) --------------
    FINAL_MODEL = "distil-large-v3"       # `model`
    REALTIME_MODEL = "distil-small.en"    # `realtime_model_type`
    LANGUAGE = "en"
    BEAM_SIZE = 5                         # `beam_size`
    REALTIME_BEAM_SIZE = 3                # `beam_size_realtime`

    # --- Voice activity detection (forwarded to AudioToTextRecorder) ---
    SILENCE_THRESHOLD = 0.4      # `post_speech_silence_duration`
    MIN_RECORDING_LENGTH = 0.5   # `min_length_of_recording`
    PRE_RECORDING_BUFFER = 0.2   # `pre_recording_buffer_duration`
    SILERO_SENSITIVITY = 0.4     # `silero_sensitivity`
    WEBRTC_SENSITIVITY = 3       # `webrtc_sensitivity`

    # --- Speaker detection ---------------------------------------------
    CHANGE_THRESHOLD = 0.65      # default similarity threshold of SpeakerDetector
    MAX_SPEAKERS = 4             # default number of speaker slots
    MIN_SEGMENT_DURATION = 1.0   # compared with elapsed time.time() since the last change
    EMBEDDING_HISTORY_SIZE = 3   # NOTE(review): not referenced by the visible code
    SPEAKER_MEMORY_SIZE = 20     # embeddings retained per speaker before trimming
56
+
57
# ANSI colour escape sequences, one per speaker slot, in this order:
# yellow, red, green, cyan, magenta, blue, white, orange.
COLORS = [f'\033[{code}m' for code in (93, 91, 92, 96, 95, 94, 97, 33)]
RESET = '\033[0m'        # appended after coloured text
LIVE_COLOR = '\033[90m'  # colour of the in-progress "[Live]" line
70
+
71
class SpeakerEncoder:
    """Simplified speaker encoder using torchaudio transforms.

    An utterance is summarised by the per-coefficient mean, standard
    deviation, maximum and minimum of its MFCCs, then zero-padded (or
    truncated) to ``embedding_dim`` values.
    """

    def __init__(self, device="cpu"):
        """Create the encoder and build the MFCC transform on ``device``."""
        self.device = device
        self.embedding_dim = 128
        self.model_loaded = False
        self._setup_model()

    def _setup_model(self):
        """Setup a simple MFCC-based feature extractor.

        Sets ``model_loaded`` to True on success and False on any failure.
        """
        try:
            self.mfcc_transform = torchaudio.transforms.MFCC(
                sample_rate=Config.SAMPLE_RATE,
                n_mfcc=13,
                melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": 23},
            ).to(self.device)
            self.model_loaded = True
            print("Simple MFCC-based encoder initialized")
        except Exception as e:
            print(f"Error setting up encoder: {e}")
            self.model_loaded = False

    def extract_embedding(self, audio):
        """Extract speaker embedding from audio.

        Args:
            audio: 1-D (samples) or 2-D (batch, samples) waveform given as a
                numpy array, torch tensor or sequence of floats.

        Returns:
            A numpy array of length ``embedding_dim``. An all-zero vector is
            returned when the encoder is not loaded, the input is empty, or
            extraction fails.
        """
        if not self.model_loaded:
            return np.zeros(self.embedding_dim)

        try:
            # Convert once and place the waveform on the same device as the
            # transform; otherwise a non-CPU encoder fails with a device mismatch.
            audio = torch.as_tensor(audio, dtype=torch.float32).to(self.device)
            if audio.numel() == 0:
                return np.zeros(self.embedding_dim)

            # Peak-normalise (skipped for pure silence to avoid dividing by zero).
            peak = audio.abs().max()
            if peak > 0:
                audio = audio / peak

            # Add batch dimension if needed
            if audio.dim() == 1:
                audio = audio.unsqueeze(0)

            with torch.no_grad():
                mfcc = self.mfcc_transform(audio)
                centre = mfcc.mean(dim=2)
                # std over a single frame is NaN; treat it as "no spread".
                if mfcc.size(2) > 1:
                    spread = mfcc.std(dim=2)
                else:
                    spread = torch.zeros_like(centre)
                embedding = torch.cat([
                    centre.flatten(),
                    spread.flatten(),
                    mfcc.max(dim=2)[0].flatten(),
                    mfcc.min(dim=2)[0].flatten(),
                ])

                # Pad or truncate to fixed size
                missing = self.embedding_dim - embedding.size(0)
                if missing < 0:
                    embedding = embedding[:self.embedding_dim]
                elif missing > 0:
                    # Padding must live on the embedding's device and dtype.
                    padding = torch.zeros(
                        missing, device=embedding.device, dtype=embedding.dtype
                    )
                    embedding = torch.cat([embedding, padding])

                return embedding.cpu().numpy()

        except Exception as e:
            print(f"Error extracting embedding: {e}")
            return np.zeros(self.embedding_dim)
135
+
136
class SpeakerDetector:
    """Speaker change detection using embeddings.

    Each speaker slot keeps a bounded list of recent embeddings and their
    mean (the centroid). A new embedding is compared with the current
    speaker's centroid by cosine similarity; when it falls below the
    threshold the detector switches to a better-matching known speaker or
    opens a new slot if one is free.
    """

    def __init__(self, threshold=Config.CHANGE_THRESHOLD, max_speakers=Config.MAX_SPEAKERS):
        """Create a detector.

        Args:
            threshold: similarity below which the current speaker is
                considered to have changed.
            max_speakers: number of speaker slots available.
        """
        self.threshold = threshold
        self.max_speakers = max_speakers
        self.current_speaker = 0
        self.speaker_embeddings = [[] for _ in range(max_speakers)]
        self.speaker_centroids = [None] * max_speakers
        self.last_change_time = time.time()
        self.active_speakers = {0}

    @staticmethod
    def _similarity(a, b):
        """Return the cosine similarity of ``a`` and ``b``.

        Returns 0.0 instead of NaN when either vector has zero or
        non-finite norm.
        """
        a = np.asarray(a, dtype=float).ravel()
        b = np.asarray(b, dtype=float).ravel()
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if not np.isfinite(denom) or denom == 0.0:
            return 0.0
        return float(np.dot(a, b) / denom)

    def detect_speaker(self, embedding):
        """Detect current speaker from embedding.

        Args:
            embedding: 1-D numpy array describing the latest audio.

        Returns:
            ``(speaker_id, similarity)`` where ``speaker_id`` is the slot
            index now considered active.
        """
        current_time = time.time()
        embedding = np.asarray(embedding)

        # An all-zero or non-finite embedding carries no speaker information
        # (the encoder returns zeros on failure). Do not let it start or
        # pollute a speaker model.
        if not np.all(np.isfinite(embedding)) or not np.any(embedding):
            return self.current_speaker, 0.0

        # Initialize first speaker
        if not self.speaker_embeddings[0]:
            self.speaker_embeddings[0].append(embedding)
            self.speaker_centroids[0] = embedding.copy()
            return 0, 1.0

        # Calculate similarity with current speaker
        current_centroid = self.speaker_centroids[self.current_speaker]
        if current_centroid is not None:
            similarity = self._similarity(embedding, current_centroid)
        else:
            similarity = 0.0

        # Too soon after the previous change: stay with the current speaker.
        if current_time - self.last_change_time < Config.MIN_SEGMENT_DURATION:
            self._update_speaker_model(self.current_speaker, embedding)
            return self.current_speaker, similarity

        if similarity < self.threshold:
            # Find best matching existing speaker
            best_speaker = self.current_speaker
            best_similarity = similarity

            for speaker_id in self.active_speakers:
                if speaker_id == self.current_speaker:
                    continue

                centroid = self.speaker_centroids[speaker_id]
                if centroid is not None:
                    sim = self._similarity(embedding, centroid)
                    if sim > best_similarity and sim > self.threshold:
                        best_similarity = sim
                        best_speaker = speaker_id

            # Create new speaker if no good match and slots available
            if (best_speaker == self.current_speaker and
                    len(self.active_speakers) < self.max_speakers):
                for new_id in range(self.max_speakers):
                    if new_id not in self.active_speakers:
                        best_speaker = new_id
                        best_similarity = 0.0
                        self.active_speakers.add(new_id)
                        break

            # Update current speaker if changed
            if best_speaker != self.current_speaker:
                self.current_speaker = best_speaker
                self.last_change_time = current_time
                similarity = best_similarity

        # Update speaker model
        self._update_speaker_model(self.current_speaker, embedding)
        return self.current_speaker, similarity

    def _update_speaker_model(self, speaker_id, embedding):
        """Append ``embedding`` to the speaker's history and refresh its centroid."""
        history = self.speaker_embeddings[speaker_id]
        history.append(embedding)

        # Keep only recent embeddings
        if len(history) > Config.SPEAKER_MEMORY_SIZE:
            history = history[-Config.SPEAKER_MEMORY_SIZE:]
            self.speaker_embeddings[speaker_id] = history

        # Update centroid
        self.speaker_centroids[speaker_id] = np.mean(history, axis=0)
221
+
222
class AudioRecorder:
    """Handles audio recording from system audio.

    Capture runs on a daemon thread and pushes mono sample blocks onto
    ``audio_queue``. System (loopback) audio is tried first; the default
    microphone is used if that fails.
    """

    def __init__(self, audio_queue):
        """Store the destination queue; recording starts with ``start()``."""
        self.audio_queue = audio_queue
        self.running = False
        self.thread = None

    def start(self):
        """Start recording"""
        self.running = True
        self.thread = threading.Thread(target=self._record_loop, daemon=True)
        self.thread.start()
        print("Audio recording started")

    def stop(self):
        """Stop recording and wait up to 2 seconds for the capture thread."""
        self.running = False
        if self.thread and self.thread.is_alive():
            self.thread.join(timeout=2)

    def _enqueue(self, samples):
        """Put ``samples`` on the queue without ever blocking.

        When the queue is full the oldest block is discarded, so a stalled
        consumer cannot hang the capture thread (and therefore ``stop()``).
        """
        try:
            self.audio_queue.put_nowait(samples)
            return
        except queue.Full:
            pass
        try:
            self.audio_queue.get_nowait()
        except queue.Empty:
            pass
        try:
            self.audio_queue.put_nowait(samples)
        except queue.Full:
            # Another producer refilled the slot; drop this block.
            pass

    def _capture(self, device, banner):
        """Record from ``device`` until ``running`` is cleared."""
        with device.recorder(
            samplerate=Config.SAMPLE_RATE,
            blocksize=Config.BUFFER_SIZE,
            channels=Config.CHANNELS
        ) as recorder:
            print(banner)
            while self.running:
                data = recorder.record(numframes=Config.BUFFER_SIZE)
                if data is not None and len(data) > 0:
                    # Convert to mono if needed
                    # (assumes data is (frames, channels) — TODO confirm)
                    if data.ndim > 1:
                        data = data[:, 0]
                    self._enqueue(data.flatten())

    def _record_loop(self):
        """Main recording loop"""
        try:
            # Try to use system audio (loopback)
            try:
                speaker = sc.default_speaker()
                # A speaker object cannot record; loopback capture is exposed
                # as a microphone looked up with include_loopback=True.
                loopback = sc.get_microphone(
                    id=str(speaker.name), include_loopback=True
                )
                self._capture(loopback, f"Recording from: {speaker.name}")

            except Exception as e:
                print(f"Loopback recording failed: {e}")
                print("Falling back to microphone...")

                # Fallback to microphone
                mic = sc.default_microphone()
                self._capture(mic, f"Recording from microphone: {mic.name}")

        except Exception as e:
            print(f"Recording error: {e}")
            self.running = False
285
+
286
class TranscriptionProcessor:
    """Handles transcription and speaker detection.

    Captured blocks are read from ``audio_queue``, forwarded exactly once to
    the transcription recorder, and mirrored into a rolling buffer holding
    the most recent ``Config.SAMPLE_RATE`` samples. That buffer is what
    speaker detection uses when a final transcription arrives.
    """

    def __init__(self):
        """Create the encoder, detector, queue and capture helper."""
        self.encoder = SpeakerEncoder()
        self.detector = SpeakerDetector()
        self.recorder = None
        self.audio_queue = queue.Queue(maxsize=100)
        self.audio_recorder = AudioRecorder(self.audio_queue)
        self.processing_thread = None
        self.transcription_thread = None
        self.running = False
        # Rolling window of recent samples, shared between the processing
        # thread (writer) and the transcription thread (reader).
        self._recent_audio = np.zeros(0, dtype=np.float32)
        self._recent_lock = threading.Lock()

    def setup(self):
        """Setup transcription recorder.

        Returns:
            True when the recorder was created, False otherwise.
        """
        try:
            self.recorder = AudioToTextRecorder(
                spinner=False,
                use_microphone=False,
                model=Config.FINAL_MODEL,
                language=Config.LANGUAGE,
                silero_sensitivity=Config.SILERO_SENSITIVITY,
                webrtc_sensitivity=Config.WEBRTC_SENSITIVITY,
                post_speech_silence_duration=Config.SILENCE_THRESHOLD,
                min_length_of_recording=Config.MIN_RECORDING_LENGTH,
                pre_recording_buffer_duration=Config.PRE_RECORDING_BUFFER,
                enable_realtime_transcription=True,
                realtime_model_type=Config.REALTIME_MODEL,
                beam_size=Config.BEAM_SIZE,
                beam_size_realtime=Config.REALTIME_BEAM_SIZE,
                on_realtime_transcription_update=self._on_live_text,
            )
            print("Transcription recorder setup complete")
            return True
        except Exception as e:
            print(f"Transcription setup failed: {e}")
            return False

    def start(self):
        """Start capture, audio processing and transcription.

        Returns:
            True when everything was started, False if setup failed.
        """
        if not self.setup():
            return False

        self.running = True

        # Start audio recording
        self.audio_recorder.start()

        # Start audio processing thread
        self.processing_thread = threading.Thread(target=self._process_audio, daemon=True)
        self.processing_thread.start()

        # Start transcription
        self._start_transcription()

        return True

    def stop(self):
        """Stop processing"""
        print("\nStopping transcription...")
        self.running = False

        if self.audio_recorder:
            self.audio_recorder.stop()

        if self.processing_thread and self.processing_thread.is_alive():
            self.processing_thread.join(timeout=2)

        if self.recorder:
            try:
                self.recorder.shutdown()
            except Exception as e:
                # Best effort: report instead of silently swallowing.
                print(f"Recorder shutdown error: {e}")

    @staticmethod
    def _to_pcm16(samples):
        """Convert float samples in [-1, 1] to little-endian int16 bytes.

        Values outside the range are clipped first so they cannot wrap
        around when cast to int16.
        """
        clipped = np.clip(np.asarray(samples, dtype=np.float32), -1.0, 1.0)
        return (clipped * 32767).astype(np.int16).tobytes()

    def _remember_audio(self, samples):
        """Append ``samples`` to the rolling window, keeping the newest second."""
        with self._recent_lock:
            merged = np.concatenate((self._recent_audio, samples))
            self._recent_audio = merged[-Config.SAMPLE_RATE:]

    def _process_audio(self):
        """Forward captured audio to the recorder and the rolling window.

        Each block is fed to the transcription recorder exactly once and in
        arrival order; feeding overlapping windows would make the
        transcriber hear the same audio twice.
        """
        while self.running:
            try:
                chunk = self.audio_queue.get(timeout=0.1)
            except queue.Empty:
                continue

            try:
                samples = np.asarray(chunk, dtype=np.float32).ravel()
                self._remember_audio(samples)

                # Feed to transcription recorder
                if self.recorder:
                    self.recorder.feed_audio(self._to_pcm16(samples))
            except Exception as e:
                if self.running:
                    print(f"Audio processing error: {e}")

    def _start_transcription(self):
        """Start transcription loop"""
        def transcription_loop():
            while self.running:
                try:
                    text = self.recorder.text()
                    if text and text.strip():
                        self._process_final_text(text)
                except Exception as e:
                    if self.running:
                        print(f"Transcription error: {e}")
                    break

        self.transcription_thread = threading.Thread(target=transcription_loop, daemon=True)
        self.transcription_thread.start()

    def _on_live_text(self, text):
        """Handle live transcription updates"""
        if text and text.strip():
            print(f"\r{LIVE_COLOR}[Live] {text}{RESET}", end="", flush=True)

    def _process_final_text(self, text):
        """Process final transcription with speaker detection.

        Uses a snapshot of the rolling window rather than draining the
        shared queue, so captured audio is neither reordered nor taken away
        from the processing thread.
        """
        # Clear live text line
        print("\r" + " " * 80 + "\r", end="")

        try:
            with self._recent_lock:
                recent_audio = self._recent_audio.copy()

            # Extract speaker embedding if we have audio
            # NOTE(review): this is the newest second of audio, which may
            # trail the transcribed utterance — confirm this is acceptable.
            if recent_audio.size:
                audio_tensor = torch.from_numpy(recent_audio)
                embedding = self.encoder.extract_embedding(audio_tensor)
                speaker_id, _ = self.detector.detect_speaker(embedding)
            else:
                speaker_id = 0

            # Display with speaker color
            color = COLORS[speaker_id % len(COLORS)]
            print(f"{color}Speaker {speaker_id + 1}: {text}{RESET}")

        except Exception as e:
            print(f"Error processing text: {e}")
            print(f"Text: {text}")
449
+
450
class RealTimeSpeakerDetection:
    """Application shell: installs shutdown hooks and drives the processor."""

    def __init__(self):
        """Initialise state and register SIGINT/SIGTERM/atexit cleanup hooks."""
        self.processor = None
        self.running = False

        # Route both termination signals through the same handler.
        for signum in (signal.SIGINT, signal.SIGTERM):
            signal.signal(signum, self._signal_handler)
        atexit.register(self.cleanup)

    def _signal_handler(self, signum, frame):
        """Announce the received signal, then stop the application."""
        print(f"\nReceived signal {signum}, shutting down...")
        self.stop()

    def start(self):
        """Run the application until it is stopped.

        Returns:
            False if the processor could not be started, True once the
            wait loop has ended.
        """
        print("=== Real-time Speaker Detection and Transcription ===")
        print("Initializing...")

        self.processor = TranscriptionProcessor()
        started = self.processor.start()
        if not started:
            print("Failed to start. Check your audio setup and dependencies.")
            return False

        self.running = True

        rule = "=" * 60
        banner = (
            rule,
            "System ready! Listening for audio...",
            "Different speakers will be shown in different colors.",
            "Press Ctrl+C to stop.",
            rule,
        )
        for line in banner:
            print(line)

        # Park the main thread; the workers are daemon threads.
        try:
            while self.running:
                time.sleep(1)
        except KeyboardInterrupt:
            pass

        return True

    def stop(self):
        """Stop the processor once; later calls are no-ops."""
        if self.running:
            self.running = False

            if self.processor:
                self.processor.stop()

            print("System stopped.")

    def cleanup(self):
        """Release resources by delegating to ``stop()``."""
        self.stop()
510
+
511
def main():
    """Build the application, run it, and always clean up afterwards."""
    detector_app = RealTimeSpeakerDetection()

    try:
        detector_app.start()
    except Exception as exc:
        print(f"Application error: {exc}")
    finally:
        # Runs on normal exit and on errors alike.
        detector_app.cleanup()


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ aiohttp==3.9.3
3
+ aiosignal==1.3.1
4
+ annotated-types==0.6.0
5
+ anyascii==0.3.2
6
+ anyio==4.3.0
7
+ asttokens==2.4.1
8
+ attrs==23.2.0
9
+ audioread==3.0.1
10
+ av==11.0.0
11
+ azure-cognitiveservices-speech==1.36.0
12
+ Babel==2.14.0
13
+ bangla==0.0.2
14
+ blinker==1.7.0
15
+ blis==0.7.11
16
+ bnnumerizer==0.0.2
17
+ bnunicodenormalizer==0.1.6
18
+ catalogue==2.0.10
19
+ certifi==2024.2.2
20
+ cffi==1.16.0
21
+ charset-normalizer==3.3.2
22
+ click==8.1.7
23
+ cloudpathlib==0.16.0
24
+ colorama==0.4.6
25
+ coloredlogs==15.0.1
26
+ comtypes==1.3.1
27
+ confection==0.1.4
28
+ contourpy==1.2.0
29
+ coqpit==0.0.17
30
+ ctranslate2==4.1.0
31
+ cycler==0.12.1
32
+ cymem==2.0.8
33
+ Cython==3.0.9
34
+ dateparser==1.1.8
35
+ decorator==5.1.1
36
+ distro==1.9.0
37
+ docopt==0.6.2
38
+ einops==0.7.0
39
+ elevenlabs==0.2.27
40
+ emoji==2.8.0
41
+ encodec==0.1.1
42
+ enum34==1.1.10
43
+ executing==2.0.1
44
+ faster-whisper==1.0.1
45
+ ffmpeg-python==0.2.0
46
+ filelock==3.9.0
47
+ Flask==3.0.2
48
+ flatbuffers==24.3.25
49
+ fonttools==4.50.0
50
+ frozenlist==1.4.1
51
+ fsspec==2024.3.1
52
+ future==1.0.0
53
+ g2pkk==0.1.2
54
+ grpcio==1.62.1
55
+ gruut==2.2.3
56
+ gruut-ipa==0.13.0
57
+ gruut_lang_de==2.0.0
58
+ gruut_lang_en==2.0.0
59
+ gruut_lang_es==2.0.0
60
+ gruut_lang_fr==2.0.2
61
+ h11==0.14.0
62
+ halo==0.0.31
63
+ hangul-romanize==0.1.0
64
+ httpcore==1.0.5
65
+ httpx==0.27.0
66
+ huggingface-hub==0.22.2
67
+ humanfriendly==10.0
68
+ idna==3.6
69
+ inflect==7.0.0
70
+ ipython==8.22.2
71
+ itsdangerous==2.1.2
72
+ jamo==0.4.1
73
+ jedi==0.19.1
74
+ jieba==0.42.1
75
+ Jinja2==3.1.2
76
+ joblib==1.3.2
77
+ jsonlines==1.2.0
78
+ kiwisolver==1.4.5
79
+ langcodes==3.3.0
80
+ lazy_loader==0.3
81
+ librosa==0.10.1
82
+ llvmlite==0.42.0
83
+ log-symbols==0.0.14
84
+ Markdown==3.6
85
+ MarkupSafe==2.1.3
86
+ matplotlib==3.8.3
87
+ matplotlib-inline==0.1.6
88
+ more-itertools==10.2.0
89
+ mpmath==1.3.0
90
+ msgpack==1.0.8
91
+ multidict==6.0.5
92
+ murmurhash==1.0.10
93
+ networkx==2.8.8
94
+ nltk==3.8.1
95
+ num2words==0.5.13
96
+ numba==0.59.1
97
+ numpy==1.26.4
98
+ onnxruntime==1.17.1
99
+ openai==1.13.3
100
+ openai-whisper==20231117
101
+ packaging==24.0
102
+ pandas==1.5.3
103
+ parso==0.8.3
104
+ pillow==10.2.0
105
+ platformdirs==4.2.0
106
+ pooch==1.8.1
107
+ preshed==3.0.9
108
+ prompt-toolkit==3.0.43
109
+ protobuf==5.26.1
110
+ psutil==5.9.8
111
+ pure-eval==0.2.2
112
+ pvporcupine==1.9.5
113
+ pyannote-audio==3.1.1
114
+ PyAudio==0.2.14
115
+ pycparser==2.22
116
+ pydantic==2.6.4
117
+ pydantic_core==2.16.3
118
+ pydub==0.25.1
119
+ Pygments==2.17.2
120
+ pynndescent==0.5.12
121
+ pyparsing==3.1.2
122
+ pypinyin==0.51.0
123
+ pypiwin32==223
124
+ pyreadline3==3.4.1
125
+ pysbd==0.3.4
126
+ python-crfsuite==0.9.10
127
+ python-dateutil==2.9.0.post0
128
+ pyttsx3==2.90
129
+ pytz==2024.1
130
+ pywin32==306
131
+ PyYAML==6.0.1
132
+ RealTimeSTT==0.1.13
133
+ RealTimeTTS==0.3.44
134
+ regex==2023.12.25
135
+ requests==2.31.0
136
+ safetensors==0.4.2
137
+ scikit-learn==1.4.1.post1
138
+ scipy==1.12.0
139
+ six==1.16.0
140
+ smart-open==6.4.0
141
+ sniffio==1.3.1
142
+ soundfile==0.12.1
143
+ soxr==0.3.7
144
+ spacy==3.7.4
145
+ spacy-legacy==3.0.12
146
+ spacy-loggers==1.0.5
147
+ spinners==0.0.24
148
+ srsly==2.4.8
149
+ stable-ts==2.15.10
150
+ stack-data==0.6.3
151
+ stanza==1.6.1
152
+ stream2sentence==0.2.3
153
+ SudachiDict-core==20240109
154
+ SudachiPy==0.6.8
155
+ sympy==1.12
156
+ tensorboard==2.16.2
157
+ tensorboard-data-server==0.7.2
158
+ termcolor==2.4.0
159
+ thinc==8.2.3
160
+ threadpoolctl==3.4.0
161
+ tiktoken==0.6.0
162
+ tokenizers==0.15.2
163
+ torch==2.2.2+cu118
164
+ torchaudio==2.2.2+cu118
165
+ tqdm==4.66.2
166
+ trainer==0.0.36
167
+ traitlets==5.14.2
168
+ transformers==4.39.2
169
+ TTS==0.22.0
170
+ typer==0.9.4
171
+ typing_extensions==4.8.0
172
+ tzdata==2024.1
173
+ tzlocal==5.2
174
+ umap-learn==0.5.5
175
+ Unidecode==1.3.8
176
+ urllib3==2.2.1
177
+ wasabi==1.1.2
178
+ wcwidth==0.2.13
179
+ weasel==0.3.4
180
+ webrtcvad==2.0.10
181
+ websockets==12.0
182
+ Werkzeug==3.0.1
183
+ yarl==1.9.4
184
+ yt-dlp==2024.3.10