File size: 1,387 Bytes
99da818
017628c
 
 
 
 
99da818
5ef62cb
 
99da818
5ef62cb
 
 
 
017628c
 
 
 
 
99da818
017628c
 
 
 
 
 
 
 
 
 
 
 
 
99da818
5ef62cb
017628c
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import gradio as gr
import torch
from pyannote.audio import Inference
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os

# Access token for the Hugging Face Hub, read from the HF_TOKEN environment
# variable (set as a Space secret); None when the variable is absent.
hf_token = os.environ.get("HF_TOKEN")

# Speaker-embedding model, loaded with the token above.
# NOTE(review): window="whole" is documented by pyannote as producing a single
# embedding for the entire input rather than a sliding window - confirm against
# the installed pyannote.audio version.
model = Inference(
    "pyannote/embedding",
    window="whole",
    use_auth_token=hf_token,
)

# Reference set: one embedding per WAV file found in known_speakers/, keyed by
# the file name without its extension (e.g. "alice.wav" -> "alice").
# A missing directory still raises FileNotFoundError at start-up on purpose:
# an app without any reference voices could never identify anyone.
speaker_embeddings = {}
# Sorted so the registry order - and therefore which speaker wins an exact
# score tie in identify_speaker - does not depend on filesystem listing order.
for speaker in sorted(os.listdir("known_speakers")):
    # splitext removes only the trailing extension; str.replace(".wav", "")
    # would also delete ".wav" occurring in the middle of a file name.
    speaker_name, extension = os.path.splitext(speaker)
    if extension.lower() != ".wav":
        continue
    emb = model(os.path.join("known_speakers", speaker))
    speaker_embeddings[speaker_name] = emb

def _to_row_vector(embedding):
    """Return *embedding* as a 2-D NumPy array with exactly one row.

    Accepts a NumPy array (or anything ``np.asarray`` understands) as well as
    a torch-style tensor exposing ``detach()``/``cpu()``/``numpy()``.
    """
    if hasattr(embedding, "detach"):
        # Tensor-like: move off any accelerator and drop autograd state first.
        embedding = embedding.detach().cpu().numpy()
    return np.asarray(embedding).reshape(1, -1)


def identify_speaker(audio):
    """Report which known speaker best matches a voice recording.

    The clip is embedded with the module-level ``model`` and compared by
    cosine similarity against every entry of ``speaker_embeddings``; the
    highest-scoring name wins (the first one encountered on an exact tie).

    Args:
        audio: The clip as handed over by the Gradio audio input (configured
            with ``type="filepath"``), or ``None``/empty when the user
            submitted without recording anything.

    Returns:
        A two-line text report with the best-matching speaker name and the
        similarity score formatted to two decimals. When no known speaker
        scores above -1 (for instance, none are loaded) the name is
        ``"Unknown"`` and the score ``-1.00``. When no audio was supplied, a
        short prompt asking for a recording is returned instead.
    """
    if not audio:
        # Gradio passes None for an empty submission; don't feed it to the model.
        return "No audio received. Please record a voice clip and try again."

    # NOTE(review): pyannote documents Inference(..., window="whole") as
    # returning a NumPy array, which has no .numpy() method; the helper copes
    # with both arrays and tensors. Converted once, outside the loop.
    query = _to_row_vector(model(audio))

    best_score = -1
    best_speaker = "Unknown"

    for name, emb in speaker_embeddings.items():
        score = cosine_similarity(query, _to_row_vector(emb))[0][0]
        if score > best_score:
            best_score = score
            best_speaker = name

    return f"🧍 Identified Speaker: {best_speaker}\nπŸ§ͺ Similarity Score: {best_score:.2f}"

# Assemble the Gradio UI and start serving it.
# Audio input: captured from the microphone and passed on with type="filepath".
voice_input = gr.Audio(
    source="microphone",
    type="filepath",
    label="πŸŽ™οΈ Upload or record voice",
)

# Single-function interface: audio in, plain-text verdict out.
demo = gr.Interface(
    fn=identify_speaker,
    inputs=voice_input,
    outputs="text",
    title="🎀 Speaker Identification App",
    description="Upload a voice clip to identify the speaker.",
)
demo.launch()