import gradio as gr
from transformers import SpeechEncoderDecoderModel, Speech2Text2Processor
import torch
import numpy as np

# Load the pre-trained model and processor.
# This checkpoint is a speech-translation model (English audio -> Arabic text),
# so it is loaded with the encoder-decoder classes it was trained with rather than CTC.
model_name = "facebook/s2t-wav2vec2-large-en-ar"
model = SpeechEncoderDecoderModel.from_pretrained(model_name)
processor = Speech2Text2Processor.from_pretrained(model_name)

TARGET_SAMPLING_RATE = 16_000  # the model's feature extractor expects 16 kHz audio

# Define a function that turns the recorded audio into text
def transcribe(audio):
    # gr.Audio(type="numpy") passes a (sampling_rate, data) tuple
    sampling_rate, data = audio

    # Convert to mono float32 in [-1, 1]
    if np.issubdtype(data.dtype, np.integer):
        data = data.astype(np.float32) / np.iinfo(data.dtype).max
    else:
        data = data.astype(np.float32)
    if data.ndim > 1:
        data = data.mean(axis=1)

    # The model expects 16 kHz audio; resample with simple linear interpolation if needed
    if sampling_rate != TARGET_SAMPLING_RATE:
        target_length = int(len(data) * TARGET_SAMPLING_RATE / sampling_rate)
        data = np.interp(
            np.linspace(0, len(data), target_length, endpoint=False),
            np.arange(len(data)),
            data,
        ).astype(np.float32)

    # Process the audio
    inputs = processor(data, sampling_rate=TARGET_SAMPLING_RATE, return_tensors="pt")

    # Generate the output token ids and decode them to text
    with torch.no_grad():
        generated_ids = model.generate(
            inputs["input_values"], attention_mask=inputs.get("attention_mask")
        )
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return transcription

# Define the Gradio interface
interface = gr.Interface(fn=transcribe, inputs=gr.Audio(type="numpy"), outputs="text")

# Launch the Gradio interface
interface.launch()