import gradio as gr
from transformers import Wav2Vec2ForCTC, AutoProcessor
import torch
import numpy as np
import librosa
import json
with open('ISO_codes.json', 'r') as file:
    iso_codes = json.load(file)
languages = list(iso_codes.keys())
model_id = "facebook/mms-1b-all"
processor = AutoProcessor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)
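
# MMS-1B-all is a single wav2vec 2.0 checkpoint that covers 1,000+ languages
# through small per-language adapters, swapped in on demand inside transcribe().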
def transcribe(audio_file=None, language="English (eng)", progress=gr.Progress()):
    # gr.Interface passes its inputs positionally (audio, language), so a
    # single audio argument covers both microphone and uploaded recordings.
    if not audio_file:
        return "Please upload an audio file or record one"
    progress(0, desc="Starting")

    # Make sure audio is 16kHz. sr=None keeps the file's native rate;
    # librosa.load would otherwise resample everything to 22,050 Hz and the
    # check below would never see the true sample rate.
    speech, sample_rate = librosa.load(audio_file, sr=None)
    if sample_rate != 16000:
        progress(0.01, desc="Resampling")
        speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)

    # Cut speech into 30-second chunks
    chunk_size = 30 * 16000  # 30s * 16000Hz
    chunks = np.split(speech, np.arange(chunk_size, len(speech), chunk_size))
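    # np.split at every multiple of chunk_size yields equal 30 s segments plus
    # a shorter final remainder, so no trailing audio is dropped.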

    # Load the model adapter for this language
    language_code = iso_codes[language]
    processor.tokenizer.set_target_lang(language_code)
    model.load_adapter(language_code)
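    # set_target_lang() switches the tokenizer vocabulary and load_adapter()
    # swaps in the language-specific adapter weights; both take the same
    # ISO 639-3 code, so switching languages never reloads the full model.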

    transcriptions = []
    progress(0.02, desc="Transcribing")
    for chunk in progress.tqdm(chunks, desc="Transcribing"):
        inputs = processor(chunk, sampling_rate=16_000, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs).logits
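        # Greedy CTC decoding: argmax picks the most likely token per frame,
        # then decode() collapses repeated tokens and strips CTC blanks.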
        ids = torch.argmax(outputs, dim=-1)[0]
        transcription = processor.decode(ids)
        transcriptions.append(transcription)
    return ' '.join(transcriptions)
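
# Example clips are assumed to ship alongside this file in the Space repo;
# each pairs an audio file with its matching dropdown label.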
examples = [
    ["balinese.mp3", "Bali (ban)"],
    ["madura.mp3", "Madura (mad)"],
    ["toba_batak.mp3", "Batak Toba (bbc)"],
    ["minangkabau.mp3", "Minangkabau (min)"],
]
description = '''Automatic Speech Recognition with [MMS](https://ai.facebook.com/blog/multilingual-model-speech-recognition/) (Massively Multilingual Speech) by Meta.'''
demo = gr.Interface(
    transcribe,
    inputs=[
        gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio"),
        gr.Dropdown(choices=languages, label="Language", value="English (eng)"),
    ],
    outputs=gr.Textbox(label="Transcription"),
    # examples=examples,
    description=description,
)
if __name__ == "__main__":
    demo.launch()