import gradio as gr | |
from transformers import pipeline | |
from librosa import resample | |
import numpy as np | |
def transcribe(input_audio):
    """Transcribe an audio clip to text using the module-level ASR pipeline.

    Parameters
    ----------
    input_audio : tuple[int, np.ndarray]
        ``(sampling_rate, samples)`` as produced by ``gr.Audio``; samples
        may be integer PCM or float, mono or multi-channel.

    Returns
    -------
    str
        The recognized text.
    """
    sr, speech = input_audio
    # Gradio delivers integer PCM (e.g. int16); scale to [-1, 1] float32,
    # which is what wav2vec2-style models expect. A bare astype(float32)
    # would leave sample values in the thousands and wreck recognition.
    # Do this BEFORE the mono downmix: mean() on an int array promotes to
    # float64 and loses the original integer dtype.
    if np.issubdtype(speech.dtype, np.integer):
        speech = speech.astype(np.float32) / np.iinfo(speech.dtype).max
    elif speech.dtype != np.float32:
        speech = speech.astype(np.float32)
    # Down-mix stereo (or any multi-channel) to mono.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)
    # The model consumes 16 kHz audio; resample anything else.
    if sr != 16000:
        speech = resample(speech, orig_sr=sr, target_sr=16000)
    # Chunked inference keeps memory bounded on long clips.
    return pipe(speech, chunk_length_s=30, stride_length_s=5)["text"]
# ASR checkpoint — per the model id, a Sámi-finetuned wav2vec2-large.
ASR_MODEL_ID = "GetmanY1/wav2vec2-large-sami-cont-pt-22k-finetuned"

# Shared speech-recognition pipeline, deliberately pinned to CPU.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_ID,
    device="cpu",
)
# Minimal UI: audio in (file upload or microphone), plain text out.
gradio_app = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["upload", "microphone"]),
    outputs="text",
)
# Launch the web UI only when executed as a script, not when imported.
if __name__ == "__main__":
    gradio_app.launch()