xljesse committed on
Commit
d5ff3d0
·
verified ·
1 Parent(s): 40ab142

Upload 2 files

Files changed (2)
  1. app.py +508 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,508 @@
+ import gradio as gr
+ import torch
+ import numpy as np
+ import librosa
+ import os
+ from transformers import Wav2Vec2BertModel, AutoFeatureExtractor, HubertModel
+ import torch.nn as nn
+ from typing import Optional, Tuple
+ from transformers.file_utils import ModelOutput
+ from dataclasses import dataclass
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+ @dataclass
+ class SpeechClassifierOutput(ModelOutput):
+     loss: Optional[torch.FloatTensor] = None
+     logits: torch.FloatTensor = None
+     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+     attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+ from transformers.models.wav2vec2.modeling_wav2vec2 import (
+     Wav2Vec2PreTrainedModel,
+     Wav2Vec2Model
+ )
+ class Wav2Vec2ClassificationHead(nn.Module):
+     """Head for wav2vec classification task."""
+
+     def __init__(self, config):
+         super().__init__()
+         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+         self.dropout = nn.Dropout(config.final_dropout)
+         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+     def forward(self, features, **kwargs):
+         x = features
+         x = self.dropout(x)
+         x = self.dense(x)
+         x = torch.tanh(x)
+         x = self.dropout(x)
+         x = self.out_proj(x)
+         return x
+
+
+ class Wav2Vec2ForSpeechClassification(nn.Module):
+     def __init__(self, model_name):
+         super().__init__()
+         self.num_labels = 2
+         self.pooling_mode = 'mean'
+         self.wav2vec2bert = Wav2Vec2BertModel.from_pretrained(model_name)
+         self.config = self.wav2vec2bert.config
+         self.classifier = Wav2Vec2ClassificationHead(self.wav2vec2bert.config)
+
+     def merged_strategy(self, hidden_states, mode="mean"):
+         if mode == "mean":
+             outputs = torch.mean(hidden_states, dim=1)
+         elif mode == "sum":
+             outputs = torch.sum(hidden_states, dim=1)
+         elif mode == "max":
+             outputs = torch.max(hidden_states, dim=1)[0]
+         else:
+             raise Exception(
+                 "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")
+
+         return outputs
+
+     def forward(self, input_features, attention_mask=None, output_attentions=None, output_hidden_states=None, return_dict=None, labels=None):
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+         outputs = self.wav2vec2bert(
+             input_features,
+             attention_mask=attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+         hidden_states = outputs.last_hidden_state
+         hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
+         logits = self.classifier(hidden_states)
+
+         loss = None
+         if labels is not None:
+             if self.config.problem_type is None:
+                 if self.num_labels == 1:
+                     self.config.problem_type = "regression"
+                 elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                     self.config.problem_type = "single_label_classification"
+                 else:
+                     self.config.problem_type = "multi_label_classification"
+
+             if self.config.problem_type == "regression":
+                 loss_fct = MSELoss()
+                 loss = loss_fct(logits.view(-1, self.num_labels), labels)
+             elif self.config.problem_type == "single_label_classification":
+                 loss_fct = CrossEntropyLoss()
+                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+             elif self.config.problem_type == "multi_label_classification":
+                 loss_fct = BCEWithLogitsLoss()
+                 loss = loss_fct(logits, labels)
+
+         if not return_dict:
+             output = (logits,) + outputs[2:]
+             return ((loss,) + output) if loss is not None else output
+
+         return SpeechClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.last_hidden_state,
+             attentions=outputs.attentions,
+         )
+
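+ # HuBERT mirrors the Wav2Vec2ForSpeechClassification wrapper above, but wraps HubertModel and takes raw
+ # waveforms (input_values) rather than the input_features produced by the Wav2Vec2-BERT feature extractor.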
+ class HuBERT(nn.Module):
+     def __init__(self, model_name):
+         super().__init__()
+         self.num_labels = 2
+         self.pooling_mode = 'mean'
+         self.wav2vec2 = HubertModel.from_pretrained(model_name)
+         self.config = self.wav2vec2.config
+         self.classifier = Wav2Vec2ClassificationHead(self.wav2vec2.config)
+
+     def merged_strategy(self, hidden_states, mode="mean"):
+         if mode == "mean":
+             outputs = torch.mean(hidden_states, dim=1)
+         elif mode == "sum":
+             outputs = torch.sum(hidden_states, dim=1)
+         elif mode == "max":
+             outputs = torch.max(hidden_states, dim=1)[0]
+         else:
+             raise Exception(
+                 "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")
+
+         return outputs
+
+     def forward(self, input_values, attention_mask=None, output_attentions=None, output_hidden_states=None,
+                 return_dict=None, labels=None):
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+         outputs = self.wav2vec2(
+             input_values,
+             attention_mask=attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+         hidden_states = outputs.last_hidden_state
+         hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
+         logits = self.classifier(hidden_states)
+
+         loss = None
+         if labels is not None:
+             if self.config.problem_type is None:
+                 if self.num_labels == 1:
+                     self.config.problem_type = "regression"
+                 elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                     self.config.problem_type = "single_label_classification"
+                 else:
+                     self.config.problem_type = "multi_label_classification"
+
+             if self.config.problem_type == "regression":
+                 loss_fct = MSELoss()
+                 loss = loss_fct(logits.view(-1, self.num_labels), labels)
+             elif self.config.problem_type == "single_label_classification":
+                 loss_fct = CrossEntropyLoss()
+                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+             elif self.config.problem_type == "multi_label_classification":
+                 loss_fct = BCEWithLogitsLoss()
+                 loss = loss_fct(logits, labels)
+
+         if not return_dict:
+             output = (logits,) + outputs[2:]
+             return ((loss,) + output) if loss is not None else output
+
+         return SpeechClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.last_hidden_state,
+             attentions=outputs.attentions,
+         )
+
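+ # pad() fixes every clip to 64000 samples (4 s at 16 kHz): longer clips get a random 4 s crop,
+ # shorter clips are zero-padded at the end.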
+ def pad(x, max_len=64000):
+     x_len = x.shape[0]
+     if x_len > max_len:
+         stt = np.random.randint(x_len - max_len)
+         return x[stt:stt + max_len]
+         # return x[:max_len]
+
+     # num_repeats = int(max_len / x_len) + 1
+     # padded_x = np.tile(x, (num_repeats))[:max_len]
+     pad_length = max_len - x_len
+     padded_x = np.concatenate([x, np.zeros(pad_length)], axis=0)
+     return padded_x
+
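+ # AudioDeepfakeDetector lazily loads one backbone per selected model type, caches it in self.models,
+ # and pulls the fine-tuned checkpoints from the TrustSafeAI/AudioDeepfakeDetectors repo on the Hub.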
+ class AudioDeepfakeDetector:
+     def __init__(self):
+         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+         self.models = {}
+         self.feature_extractors = {}
+         self.current_model = None
+         # model_name = 'facebook/w2v-bert-2.0'
+         # self.feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
+         # self.model = Wav2Vec2ForSpeechClassification(model_name).to(self.device)
+         # ckpt = torch.load("wave2vec2bert_wavefake.pth", map_location=self.device)
+         # self.model.load_state_dict(ckpt)
+
+         print(f"Using device: {self.device}")
+         print("Audio deepfake detector initialized")
+
+
+     def load_model(self, model_type):
+         """Load the specified model type"""
+         if model_type in self.models:
+             self.current_model = model_type
+             return
+
+         try:
+             print(f"🔄 Loading {model_type} model...")
+
+             if model_type == "Wave2Vec2BERT":
+                 model_name = 'facebook/w2v-bert-2.0'
+                 self.feature_extractors[model_type] = AutoFeatureExtractor.from_pretrained(model_name)
+                 self.models[model_type] = Wav2Vec2ForSpeechClassification(model_name).to(self.device)
+                 # checkpoint_path = "wave2vec2bert_wavefake.pth"
+                 # if os.path.exists(checkpoint_path):
+                 #     ckpt = torch.load(checkpoint_path, map_location=self.device)
+                 #     self.models[model_type].load_state_dict(ckpt)
+                 #     print(f"✅ Loaded checkpoint for {model_type}")
+                 # else:
+                 #     print(f"⚠️ Checkpoint not found for {model_type}, using pretrained weights only")
+
+                 try:
+                     from huggingface_hub import hf_hub_download
+                     checkpoint_path = hf_hub_download(
+                         repo_id="TrustSafeAI/AudioDeepfakeDetectors",
+                         filename="wave2vec2bert_wavefake.pth",
+                         cache_dir="./models"
+                     )
+                     ckpt = torch.load(checkpoint_path, map_location=self.device)
+                     self.models[model_type].load_state_dict(ckpt)
+                     print(f"✅ Loaded checkpoint for {model_type}")
+                 except Exception as e:
+                     print(f"⚠️ Could not load checkpoint for {model_type}: {e}")
+                     print("Using pretrained weights only")
+
+             elif model_type == "HuBERT":
+                 model_name = 'facebook/hubert-large-ls960-ft'
+                 self.feature_extractors[model_type] = AutoFeatureExtractor.from_pretrained(model_name)
+                 self.models[model_type] = HuBERT(model_name).to(self.device)
+
+                 # checkpoint_path = "hubert_large_wavefake.pth"
+                 # if os.path.exists(checkpoint_path):
+                 #     ckpt = torch.load(checkpoint_path, map_location=self.device)
+                 #     self.models[model_type].load_state_dict(ckpt)
+                 #     print(f"✅ Loaded checkpoint for {model_type}")
+                 # else:
+                 #     print(f"⚠️ Checkpoint not found for {model_type}, using pretrained weights only")
+                 try:
+                     from huggingface_hub import hf_hub_download
+                     checkpoint_path = hf_hub_download(
+                         repo_id="TrustSafeAI/AudioDeepfakeDetectors",  # replace with your model repo
+                         filename="hubert_large_wavefake.pth",
+                         cache_dir="./models"
+                     )
+                     ckpt = torch.load(checkpoint_path, map_location=self.device)
+                     self.models[model_type].load_state_dict(ckpt)
+                     print(f"✅ Loaded checkpoint for {model_type}")
+                 except Exception as e:
+                     print(f"⚠️ Could not load checkpoint for {model_type}: {e}")
+                     print("Using pretrained weights only")
+
+             self.current_model = model_type
+             print(f"✅ {model_type} model loaded successfully")
+
+         except Exception as e:
+             print(f"❌ Error loading {model_type} model: {str(e)}")
+             raise
+
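+     # preprocess_audio() resamples the file to 16 kHz with librosa, then crops/pads it to a fixed
+     # 4 s window via pad() and adds a batch dimension.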
+     def preprocess_audio(self, audio_path, target_sr=16000, max_length=4):
+         try:
+             print(f"📁 Loading audio file: {os.path.basename(audio_path)}")
+
+             audio, sr = librosa.load(audio_path, sr=target_sr)
+             original_duration = len(audio) / sr
+
+             audio = pad(audio).reshape(-1)
+             audio = audio[np.newaxis, :]
+
+
+             print(f"✅ Audio loaded successfully: {original_duration:.2f}s, {sr}Hz")
+             return audio, sr
+
+         except Exception as e:
+             print(f"❌ Audio processing error: {str(e)}")
+             raise
+
+     def extract_features(self, audio, sr, model_type):
+         print("🔍 Extracting audio features...")
+         feature_extractor = self.feature_extractors[model_type]
+
+         inputs = feature_extractor(audio, sampling_rate=sr, return_attention_mask=True, padding_value=0, return_tensors="pt").to(self.device)
+         print("✅ Feature extraction completed")
+         return inputs
+
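+     # classifier() runs a no-grad forward pass and reads the softmax score at index 0 as the fake
+     # probability (label index 0 is taken to be the "fake" class in the released checkpoints).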
+     def classifier(self, features, model_type):
+         model = self.models[model_type]
+         with torch.no_grad():
+             outputs = model(**features)
+             prob = outputs.logits.softmax(dim=-1)
+             fake_prob = prob[0][0].item()
+
+         return fake_prob
+
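+     # predict() chains the pipeline (load model -> preprocess -> extract features -> classify) and flags
+     # the clip as fake when the fake probability exceeds a 0.5 threshold.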
+     def predict(self, audio_path, model_type):
+         try:
+             print("🎵 Start analyzing...")
+             self.load_model(model_type)
+             audio, sr = self.preprocess_audio(audio_path)
+
+             features = self.extract_features(audio, sr, model_type)
+
+             fake_probability = self.classifier(features, model_type)
+             real_probability = 1 - fake_probability
+
+             threshold = 0.5
+             if fake_probability > threshold:
+                 status = "SUSPICIOUS"
+                 prediction = "🚨 Likely fake audio"
+                 confidence = fake_probability
+                 color = "red"
+             else:
+                 status = "LIKELY_REAL"
+                 prediction = "✅ Likely real audio"
+                 confidence = real_probability
+                 color = "green"
+
+             print(f"\n{'='*50}")
+             print(f"🎯 Result: {prediction}")
+             print(f"📊 Confidence: {confidence:.1%}")
+             print(f"📈 Real Probability: {real_probability:.1%}")
+             print(f"📉 Fake Probability: {fake_probability:.1%}")
+             print(f"{'='*50}")
+
+             duration = audio.shape[-1] / sr  # audio is (1, n_samples) after preprocessing, so use the sample axis
+             file_size = os.path.getsize(audio_path) / 1024
+
+             result_data = {
+                 "status": status,
+                 "prediction": prediction,
+                 "confidence": confidence,
+                 "real_probability": real_probability,
+                 "fake_probability": fake_probability,
+                 "duration": duration,
+                 "sample_rate": sr,
+                 "file_size_kb": file_size,
+                 "model_used": model_type
+             }
+
+             return result_data
+
+         except Exception as e:
+             print(f"❌ Failed: {str(e)}")
+             return {"error": str(e)}
+
+
+ detector = AudioDeepfakeDetector()
+
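+ # Gradio callback: runs the detector on the uploaded file and returns an HTML verdict card plus a JSON summary.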
+ def analyze_uploaded_audio(audio_file, model_choice):
+     if audio_file is None:
+         return "Please upload audio", {}
+
+     try:
+         result = detector.predict(audio_file, model_choice)
+
+         if "error" in result:
+             return f"Error: {result['error']}", {}
+
+         status_color = "#ff4444" if result['status'] == "SUSPICIOUS" else "#44ff44"
+
+         result_html = f"""
+         <div style="padding: 20px; border-radius: 10px; background-color: {status_color}20; border: 2px solid {status_color};">
+             <h3 style="color: {status_color}; margin-top: 0;">{result['prediction']}</h3>
+             <p><strong>Status:</strong> {result['status']}</p>
+             <p><strong>Confidence:</strong> {result['confidence']:.1%}</p>
+         </div>
+         """
+
+         analysis_data = {
+             "status": result['status'],
+             "real_probability": f"{result['real_probability']:.1%}",
+             "fake_probability": f"{result['fake_probability']:.1%}",
+         }
+
+         return result_html, analysis_data
+
+     except Exception as e:
+         error_html = f"""
+         <div style="padding: 20px; border-radius: 10px; background-color: #ff444420; border: 2px solid #ff4444;">
+             <h3 style="color: #ff4444;">❌ Processing error</h3>
+             <p>{str(e)}</p>
+         </div>
+         """
+         return error_html, {"error": str(e)}
+
+ def create_audio_interface():
+     with gr.Blocks(title="Audio Deepfake Detection", theme=gr.themes.Soft()) as interface:
+         gr.Markdown("""
+         <div style="text-align: center; margin-bottom: 30px;">
+             <h1 style="font-size: 28px; font-weight: bold; margin-bottom: 20px; color: #333;">
+                 Measuring the Robustness of Audio Deepfake Detection under Real-World Corruptions
+             </h1>
+             <p style="font-size: 16px; color: #666; margin-bottom: 15px;">
+                 Audio deepfake detectors based on Wave2Vec2BERT and HuBERT speech foundation models (fine-tuned on the WaveFake dataset).
+             </p>
+             <div style="font-size: 14px; color: #555; line-height: 1.8; text-align: left;">
+                 <p><strong>Paper:</strong> <a href="https://arxiv.org/pdf/2503.17577" target="_blank" style="color: #4285f4; text-decoration: none;">https://arxiv.org/pdf/2503.17577</a></p>
+                 <p><strong>Project Page:</strong> <a href="https://huggingface.co/spaces/TrustSafeAI/AudioPerturber" target="_blank" style="color: #4285f4; text-decoration: none;">https://huggingface.co/spaces/TrustSafeAI/AudioPerturber</a></p>
+                 <p><strong>Checkpoint and model card (to be added):</strong> <a href="https://huggingface.co/TrustSafeAI/Wave2Vec2BERT" target="_blank" style="color: #4285f4; text-decoration: none;">https://huggingface.co/TrustSafeAI/Wave2Vec2BERT</a></p>
+                 <p><strong>GitHub Codebase:</strong> <a href="https://github.com/Jessegator/Audio_robustness_evaluation" target="_blank" style="color: #4285f4; text-decoration: none;">https://github.com/Jessegator/Audio_robustness_evaluation</a></p>
+             </div>
+         </div>
+         <hr style="margin: 30px 0; border: none; border-top: 1px solid #e0e0e0;">
+         """)
+
+         gr.Markdown("""
+         # Audio Deepfake Detection
+
+         **Supported Formats**: .wav, .mp3, .flac, .m4a, etc.
+         """)
+
+         with gr.Row():
+             # model_choice = gr.Dropdown(
+             #     choices=["Wave2Vec2BERT", "HuBERT"],
+             #     value="Wave2Vec2BERT",
+             #     label="🤖 Select Model",
+             #     info="Choose the foundation model for detection"
+             # )
+
+             with gr.Column(scale=1):
+                 model_choice = gr.Dropdown(
+                     choices=["Wave2Vec2BERT", "HuBERT"],
+                     value="Wave2Vec2BERT",
+                     label="🤖 Select Model",
+                     info="Choose the foundation model for detection"
+                 )
+
+                 audio_input = gr.Audio(
+                     label="📁 Upload audio file",
+                     type="filepath",
+                     show_label=True,
+                     interactive=True
+                 )
+
+                 analyze_btn = gr.Button(
+                     "🔍 Start analyzing",
+                     variant="primary",
+                     size="lg"
+                 )
+
+                 gr.Markdown("### 🔊 Play uploaded audio")
+                 audio_player = gr.Audio(
+                     label="Audio Player",
+                     interactive=False,
+                     show_label=False
+                 )
+
+             with gr.Column(scale=1):
+                 result_display = gr.HTML(
+                     label="🎯 Results",
+                     value="<p style='text-align: center; color: #666;'>Waiting for upload...</p>"
+                 )
+
+                 analysis_json = gr.JSON(
+                     label="📊 Detailed analysis",
+                     value={}
+                 )
+
+         def update_player_and_analyze(audio_file, model_type):
+             if audio_file is not None:
+                 result_html, result_data = analyze_uploaded_audio(audio_file, model_type)
+                 return audio_file, result_html, result_data
+             else:
+                 return None, "<p style='text-align: center; color: #666;'>Waiting for upload...</p>", {}
+
+         audio_input.change(
+             fn=update_player_and_analyze,
+             inputs=[audio_input, model_choice],
+             outputs=[audio_player, result_display, analysis_json]
+         )
+
+         analyze_btn.click(
+             fn=analyze_uploaded_audio,
+             inputs=[audio_input, model_choice],
+             outputs=[result_display, analysis_json]
+         )
+
+         model_choice.change(
+             fn=lambda audio_file, model_type: analyze_uploaded_audio(audio_file, model_type) if audio_file is not None else ("Please upload audio first", {}),
+             inputs=[audio_input, model_choice],
+             outputs=[result_display, analysis_json]
+         )
+
+     return interface
+
+ if __name__ == "__main__":
+     print("🚀 Creating interface...")
+     demo = create_audio_interface()
+
+     print("📱 Launching...")
+     demo.launch(
+         share=False,
+         debug=True,
+         show_error=True
+     )
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ gradio
+ torch
+ numpy
+ librosa
+ matplotlib
+ transformers
+ huggingface_hub
+