File size: 1,288 Bytes
35ebfd2
fa8a5a0
 
 
35ebfd2
fa8a5a0
 
 
 
 
 
 
 
 
 
 
eb7474e
 
fa8a5a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35ebfd2
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import gradio as gr
import torch
import torchaudio
from transformers import AutoFeatureExtractor, ASTForAudioClassification

# Hugging Face Hub checkpoint shared by the model and its feature extractor.
model_name = "MIT/ast-finetuned-audioset-10-10-0.4593"

# Inference is pinned to the CPU.
device = torch.device("cpu")

# Both objects are loaded once at import time and reused for every request.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
model = ASTForAudioClassification.from_pretrained(model_name).to(device)

def classify_sound(file_path):
    """Classify an audio file and return its most probable labels.

    Args:
        file_path: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        A dict mapping label name to probability (rounded to 4 decimals)
        for the five most probable classes (fewer if the model has fewer
        than five labels).

    Raises:
        gr.Error: If no file was provided.
    """
    if not file_path:
        # Gradio passes None when the user submits without uploading a clip.
        raise gr.Error("Please upload an audio file first.")

    waveform, sample_rate = torchaudio.load(file_path)

    # Down-mix multi-channel audio to a single channel.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # The feature extractor only accepts audio at its own sampling rate, so
    # resample from whatever rate the file actually has instead of claiming
    # a fixed rate that may not match the data.
    target_rate = feature_extractor.sampling_rate
    if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(
            waveform, orig_freq=sample_rate, new_freq=target_rate
        )

    # squeeze(0) drops only the channel axis; a bare squeeze() would also
    # collapse a one-sample clip into a 0-d array.
    inputs = feature_extractor(
        waveform.squeeze(0).numpy(),
        sampling_rate=target_rate,
        return_tensors="pt",
    )
    # Keep the inputs on the same device as the model weights.
    inputs = inputs.to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits

    probs = torch.softmax(logits, dim=-1)[0]
    top = torch.topk(probs, k=min(5, probs.shape[-1]))

    return {
        model.config.id2label[idx]: round(prob, 4)
        for idx, prob in zip(top.indices.tolist(), top.values.tolist())
    }

# Web UI: one audio upload in, the top-5 predicted labels out.
demo = gr.Interface(
    fn=classify_sound,
    # The component class is `gr.Audio`; `gr.audio` does not exist.
    # NOTE(review): `source=` is the Gradio 3.x spelling; Gradio 4+ expects
    # `sources=["upload"]` — confirm against the pinned Gradio version.
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Label(num_top_classes=5),
    title="Audio Classification with AST",
    # Report the real number of labels from the model config rather than a
    # hard-coded figure.
    description=(
        "Upload an audio clip (speech, music, ambient sound, etc.). "
        "Model: MIT AST fine-tuned on AudioSet "
        f"({len(model.config.id2label)} classes)."
    ),
    live=False,
)

demo.launch()