Spaces:

Seicas
/

VoiceToWrite

Sleeping

App Files Community

Seicas committed on Jun 16

Commit

432e60e

verified ·

1 Parent(s): bbd2003

Update diarization.py

Browse files

Files changed (1) hide show

diarization.py +92 -85

diarization.py CHANGED Viewed

@@ -1,94 +1,101 @@
 from pyannote.audio import Pipeline
-from typing import List, Dict, Any
-import torch
-import os
-from config import settings  # Düzeltildi
-# HuggingFace token'ı ayarla
-os.environ["HF_TOKEN"] = "hf_your_token_here"  # Bu satırın yorumunu kaldırın ve kendi token'ınızı ekleyin
-_diarization_pipeline = None
-def get_diarization_pipeline():
-    """Diarization pipeline singleton"""
-    global _diarization_pipeline
-    if _diarization_pipeline is None:
-        # GPU varsa kullan
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        _diarization_pipeline = Pipeline.from_pretrained(
-            "pyannote/speaker-diarization-3.0",
-            use_auth_token=os.environ.get("HF_TOKEN"),
-            device=device
-        )
-    return _diarization_pipeline
-def rename_speakers_for_pediatrics(segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-    """
-    Konuşmacıları pediatri bağlamına göre yeniden isimlendirir
-    """
-    # Konuşmacıları basit bir şekilde yeniden isimlendiriyoruz
-    # Gerçek bir uygulamada ses özellikleri analizi ile daha sofistike olabilir
-    renamed_segments = []
-    speaker_mapping = {}
-    for segment in segments:
-        speaker = segment["speaker"]
-        if speaker not in speaker_mapping:
-            # İlk konuşmacıyı bölüm başkanı olarak kabul ediyoruz
-            if len(speaker_mapping) == 0:
-                speaker_mapping[speaker] = "Bölüm_Başkanı"
-            # İkinci konuşmacıyı hekim olarak kabul ediyoruz
-            elif len(speaker_mapping) == 1:
-                speaker_mapping[speaker] = "Hekim"
-            # Üçüncü konuşmacıyı asistan olarak kabul ediyoruz
-            elif len(speaker_mapping) == 2:
-                speaker_mapping[speaker] = "Asistan"
-            # Diğer konuşmacılar
-            else:
-                speaker_mapping[speaker] = f"Konuşmacı_{len(speaker_mapping) + 1}"
-        # Segment kopyası oluştur ve konuşmacı ismini güncelle
-        new_segment = segment.copy()
-        new_segment["speaker"] = speaker_mapping[speaker]
-        renamed_segments.append(new_segment)
-    return renamed_segments
-def diarize_segments(audio_file: str, is_pediatrics: bool = True) -> List[Dict[str, Any]]:
     """
-    Ses dosyasındaki konuşmacıları ayırt eder
-    Args:
-        audio_file: Ses dosyasının yolu
     Returns:
-        Konuşmacı segmentleri listesi
-        [
-            {"speaker": "speaker_0", "start": 0.5, "end": 2.3, "text": "..."},
-            {"speaker": "speaker_1", "start": 2.4, "end": 5.1, "text": "..."},
-            ...
-        ]
     """
-    # Pipeline'ı al
-    pipeline = get_diarization_pipeline()
-    # Diyarizasyon gerçekleştir
-    diarization = pipeline(audio_file)
-    # Sonuçları formatlayalım
-    results = []
-    for turn, _, speaker in diarization.itertracks(yield_label=True):
-        segment = {
-            "speaker": speaker,
-            "start": turn.start,
-            "end": turn.end,
-            "text": ""  # Bu alanı transcribe işlemi sonrası dolduracağız
-        }
-        results.append(segment)
-    # Pediatri bağlamı için konuşmacı isimlerini güncelle
-    if is_pediatrics:
-        results = rename_speakers_for_pediatrics(results)
-    return results

 from pyannote.audio import Pipeline
+from config import settings
+_diar_pipeline = Pipeline.from_pretrained(settings.DIAR_MODEL)
+def diarize_segments(audio_path, segments):
     """
+    Konuşma segmentlerini konuşmacılara göre ayırır
+    Parameters:
+    -----------
+    audio_path : str
+        Ses dosyasının tam yolu
+    segments : list
+        ASR tarafından oluşturulan zaman damgalı segmentler
     Returns:
+    --------
+    list
+        Konuşmacı bilgisi eklenmiş segmentler
     """
+    try:
+        # Pyannote modelini yükle
+        import os
+        import torch
+        from pyannote.audio import Pipeline
+        # HuggingFace token kontrolü
+        token = os.environ.get("HF_TOKEN")
+        if not token:
+            print("UYARI: HF_TOKEN bulunamadı, diyarizasyon atlanıyor.")
+            # Tüm segmentleri Konuşmacı 1 olarak işaretle
+            for seg in segments:
+                seg["speaker"] = "Konuşmacı 1"
+            return segments
+        # Pyannote pipeline'ını oluştur
+        pipeline = Pipeline.from_pretrained(
+            settings.DIAR_MODEL,
+            use_auth_token=token
+        )
+        # GPU varsa kullan
+        if torch.cuda.is_available():
+            pipeline = pipeline.to(torch.device("cuda"))
+        # Diyarizasyon yap
+        diarization = pipeline(audio_path)
+        # Konuşmacı etiketlerini oluştur (Hekim, Asistan, vs.)
+        speaker_labels = {}
+        # Segmentlere konuşmacı bilgisi ekle
+        for seg in segments:
+            seg_start = seg["start"]
+            seg_end = seg["end"]
+            # Bu segment için en çok konuşan kişiyi bul
+            speaker_times = {}
+            for turn, _, speaker in diarization.itertracks(yield_label=True):
+                # Segment ve konuşma turu arasındaki çakışmayı hesapla
+                overlap_start = max(seg_start, turn.start)
+                overlap_end = min(seg_end, turn.end)
+                if overlap_end > overlap_start:  # Çakışma varsa
+                    duration = overlap_end - overlap_start
+                    speaker_times[speaker] = speaker_times.get(speaker, 0) + duration
+            # En uzun konuşan kişiyi bul
+            if speaker_times:
+                max_speaker = max(speaker_times, key=speaker_times.get)
+                # Konuşmacıyı etiketle
+                if max_speaker not in speaker_labels:
+                    # Yeni konuşmacı tespit edildi
+                    # Pediatri bağlamında etiketler belirle (ilk konuşmacı genelde hekim)
+                    idx = len(speaker_labels) + 1
+                    if idx == 1:
+                        label = "Hekim"
+                    elif idx == 2:
+                        label = "Asistan"
+                    elif idx == 3:
+                        label = "Ebeveyn"
+                    else:
+                        label = f"Konuşmacı {idx}"
+                    speaker_labels[max_speaker] = label
+                seg["speaker"] = speaker_labels[max_speaker]
+            else:
+                # Diyarizasyon bilgisi yoksa
+                seg["speaker"] = "Bilinmeyen Konuşmacı"
+        return segments
+    except Exception as e:
+        # Hata durumunda basit etiketleme yap
+        print(f"Diyarizasyon hatası: {str(e)}")
+        for seg in segments:
+            seg["speaker"] = "Konuşmacı"
+        return segments