Commit 4e75e2b · Parent(s): 8acaa5d
Updated html to json
shared.py
CHANGED
```diff
@@ -455,43 +455,100 @@ class RealtimeSpeakerDiarization:
         return f"Settings updated: Threshold={threshold:.2f}, Max Speakers={max_speakers}"
 
     def get_formatted_conversation(self):
-        """Get the formatted conversation"""
-
+        """Get the formatted conversation with structured data"""
+        try:
+            # Create conversation HTML format as before
+            html_content = self.current_conversation
+
+            # Create structured data
+            structured_data = {
+                "html_content": html_content,
+                "sentences": [],
+                "current_transcript": self.last_transcription,
+                "current_speaker": self.speaker_detector.current_speaker if self.speaker_detector else 0
+            }
+
+            # Add sentence data
+            for sentence_text, speaker_id in self.full_sentences:
+                color = self.speaker_detector.get_color_for_speaker(speaker_id) if self.speaker_detector else "#FFFFFF"
+                structured_data["sentences"].append({
+                    "text": sentence_text,
+                    "speaker_id": speaker_id,
+                    "speaker_name": f"Speaker {speaker_id + 1}",
+                    "color": color
+                })
+
+            return html_content
+        except Exception as e:
+            logger.error(f"Error formatting conversation: {e}")
+            return f"<i>Error formatting conversation: {str(e)}</i>"
 
     def get_status_info(self):
-        """Get current status information"""
+        """Get current status information as structured data"""
         if not self.speaker_detector:
-            return "Speaker detector not initialized"
+            return {"error": "Speaker detector not initialized"}
 
         try:
-
-
+            speaker_status = self.speaker_detector.get_status_info()
+
+            # Format speaker activity
+            speaker_activity = []
+            for i in range(speaker_status['max_speakers']):
+                color_name = SPEAKER_COLOR_NAMES[i] if i < len(SPEAKER_COLOR_NAMES) else f"Speaker {i+1}"
+                count = speaker_status['speaker_counts'][i]
+                active = count > 0
+                speaker_activity.append({
+                    "id": i,
+                    "name": f"Speaker {i+1}",
+                    "color": SPEAKER_COLORS[i] if i < len(SPEAKER_COLORS) else "#FFFFFF",
+                    "color_name": color_name,
+                    "segment_count": count,
+                    "active": active
+                })
+
+            # Create structured status object
+            status = {
+                "current_speaker": speaker_status['current_speaker'],
+                "current_speaker_name": f"Speaker {speaker_status['current_speaker'] + 1}",
+                "active_speakers_count": speaker_status['active_speakers'],
+                "max_speakers": speaker_status['max_speakers'],
+                "last_similarity": speaker_status['last_similarity'],
+                "change_threshold": speaker_status['threshold'],
+                "total_sentences": len(self.full_sentences),
+                "segments_processed": speaker_status['segment_counter'],
+                "speaker_activity": speaker_activity,
+                "timestamp": time.time()
+            }
+
+            # Also create a formatted text version for UI display
             status_lines = [
                 f"**Current Speaker:** {status['current_speaker'] + 1}",
-                f"**Active Speakers:** {status['
+                f"**Active Speakers:** {status['active_speakers_count']} of {status['max_speakers']}",
                 f"**Last Similarity:** {status['last_similarity']:.3f}",
-                f"**Change Threshold:** {status['
-                f"**Total Sentences:** {
-                f"**Segments Processed:** {status['
+                f"**Change Threshold:** {status['change_threshold']:.2f}",
+                f"**Total Sentences:** {status['total_sentences']}",
+                f"**Segments Processed:** {status['segments_processed']}",
                 "",
                 "**Speaker Activity:**"
             ]
 
-            for
-
-
-                active = "🟢" if count > 0 else "⚫"
-                status_lines.append(f"{active} Speaker {i+1} ({color_name}): {count} segments")
+            for speaker in status["speaker_activity"]:
+                active = "🟢" if speaker["active"] else "⚫"
+                status_lines.append(f"{active} Speaker {speaker['id']+1} ({speaker['color_name']}): {speaker['segment_count']} segments")
 
-
+            status["formatted_text"] = "\n".join(status_lines)
+
+            return status
 
         except Exception as e:
-
+            error_msg = f"Error getting status: {e}"
+            logger.error(error_msg)
+            return {"error": error_msg, "formatted_text": error_msg}
 
     def process_audio_chunk(self, audio_data, sample_rate=16000):
         """Process audio chunk from WebSocket input"""
         if not self.is_running or self.audio_processor is None:
-            return
+            return {"status": "not_running"}
 
         try:
             # Convert bytes to numpy array if needed
```
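In this hunk, get_status_info changes its contract: it now returns a structured dict (with a formatted_text markdown fallback) instead of a markdown string, and get_formatted_conversation builds a structured_data payload alongside the HTML it still returns. Below is a minimal sketch of a caller serializing the new status dict for a WebSocket client, assuming only the key names visible above; the diarizer variable and the status_update message type are illustrative, not part of this commit:

```python
import json

# Illustrative only: serialize the structured status for a WebSocket client.
# `diarizer` is assumed to be an initialized RealtimeSpeakerDiarization.
def status_message(diarizer) -> str:
    status = diarizer.get_status_info()
    if "error" in status:
        return json.dumps({"type": "error", "message": status["error"]})
    return json.dumps({
        "type": "status_update",           # hypothetical message type
        "speaker": status["current_speaker_name"],
        "active": status["active_speakers_count"],
        "threshold": status["change_threshold"],
        "text": status["formatted_text"],  # markdown fallback for simple UIs
        "timestamp": status["timestamp"],
    })
```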
```diff
@@ -517,6 +574,10 @@ class RealtimeSpeakerDiarization:
             self.audio_processor.add_audio_chunk(audio_data)
 
             # Periodically extract embeddings for speaker detection
+            embedding = None
+            speaker_id = self.speaker_detector.current_speaker
+            similarity = 1.0
+
             if len(self.audio_processor.audio_buffer) % (SAMPLE_RATE // 2) == 0:  # Every 0.5 seconds
                 embedding = self.audio_processor.extract_embedding_from_buffer()
                 if embedding is not None:
```
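Note that the `% (SAMPLE_RATE // 2)` check above only fires when the buffer length lands exactly on a half-second boundary (8,000 samples at 16 kHz), which implicitly assumes incoming chunk sizes divide that interval evenly. A quick sketch of the arithmetic, with SAMPLE_RATE assumed to be 16000 as in the rest of the file:

```python
SAMPLE_RATE = 16000  # assumed, matching the constant used elsewhere

def should_extract(buffer_len: int) -> bool:
    # Mirrors the check above: true only on exact half-second boundaries.
    return buffer_len % (SAMPLE_RATE // 2) == 0

print(should_extract(8000))   # True  — exactly 0.5 s of audio buffered
print(should_extract(16000))  # True  — 1.0 s
print(should_extract(8100))   # False — a chunk stepped past the boundary
```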
```diff
@@ -527,9 +588,18 @@
                 with self.transcription_lock:
                     self.full_sentences.append((f"[Audio segment {self.speaker_detector.segment_counter}]", speaker_id))
                     self.update_conversation_display()
+
+            # Return processing result
+            return {
+                "status": "processed",
+                "buffer_size": len(self.audio_processor.audio_buffer),
+                "speaker_id": speaker_id,
+                "similarity": similarity if embedding is not None else None
+            }
 
         except Exception as e:
             logger.error(f"Error processing audio chunk: {e}")
+            return {"status": "error", "message": str(e)}
 
     def resample_audio(self, audio_bytes, from_rate, to_rate):
         """Resample audio to target sample rate"""
```
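After these hunks, process_audio_chunk reports an outcome dict ("processed", "not_running", or "error") instead of returning None, so a transport layer can acknowledge every chunk. A sketch of hypothetical glue code using that return value, written for a websockets-style connection object; none of this handler appears in the commit:

```python
import json

# Hypothetical transport glue (e.g. a handler for the third-party
# `websockets` library); `diarizer` is assumed to be a running
# RealtimeSpeakerDiarization instance shared with the server.
async def audio_handler(websocket, diarizer):
    async for frame in websocket:
        if isinstance(frame, bytes):
            # Binary frames carry raw PCM; the return dict doubles as an ack.
            result = diarizer.process_audio_chunk(frame, sample_rate=16000)
            await websocket.send(json.dumps({"type": "ack", **result}))
```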
ui.py
CHANGED
```diff
@@ -173,10 +173,57 @@ def build_ui():
         };
 
         wsConnection.onmessage = (event) => {
-
-
-
-
+            try {
+                // Parse the JSON message
+                const message = JSON.parse(event.data);
+
+                // Process different message types
+                switch(message.type) {
+                    case 'transcription':
+                        // Handle transcription data
+                        if (message.data && typeof message.data === 'object') {
+                            document.getElementById("conversation").innerHTML = message.data.conversation_html ||
+                                JSON.stringify(message.data);
+                        }
+                        break;
+
+                    case 'connection':
+                        console.log('Connection status:', message.status);
+                        updateStatus(message.status === 'connected' ? 'connected' : 'warning');
+                        break;
+
+                    case 'conversation_update':
+                        if (message.conversation_html) {
+                            document.getElementById("conversation").innerHTML = message.conversation_html;
+                        }
+                        break;
+
+                    case 'conversation_cleared':
+                        document.getElementById("conversation").innerHTML =
+                            "<i>Conversation cleared. Start speaking again...</i>";
+                        break;
+
+                    case 'error':
+                        console.error('Error message from server:', message.message);
+                        updateStatus('warning', message.message);
+                        break;
+
+                    default:
+                        // If it's just HTML content without proper JSON structure (legacy format)
+                        document.getElementById("conversation").innerHTML = event.data;
+                }
+
+                // Auto-scroll to bottom
+                const container = document.getElementById("conversation");
+                container.scrollTop = container.scrollHeight;
+            } catch (e) {
+                // Fallback for non-JSON messages (legacy format)
+                document.getElementById("conversation").innerHTML = event.data;
+
+                // Auto-scroll to bottom
+                const container = document.getElementById("conversation");
+                container.scrollTop = container.scrollHeight;
+            }
         };
 
         wsConnection.onerror = (error) => {
```
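The switch statement above fixes the wire protocol: every server push is a JSON object whose type is one of transcription, connection, conversation_update, conversation_cleared, or error, with type-specific fields, and anything that fails JSON.parse falls through to the legacy raw-HTML path. A sketch of matching server-side payload builders; the helper names are illustrative, but the type values and field names mirror the handler above:

```python
import json

# Illustrative envelope builders matching ui.py's onmessage switch.
def conversation_update(html: str) -> str:
    return json.dumps({"type": "conversation_update", "conversation_html": html})

def connection_status(connected: bool) -> str:
    return json.dumps({"type": "connection",
                       "status": "connected" if connected else "disconnected"})

def error_message(text: str) -> str:
    return json.dumps({"type": "error", "message": text})

def conversation_cleared() -> str:
    return json.dumps({"type": "conversation_cleared"})
```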