import gradio as gr
from transformers import Wav2Vec2ForCTC, AutoProcessor
import torch
import numpy as np
import librosa
import json

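# ISO_codes.json maps display names such as "English (eng)" to the ISO 639-3
# codes that identify MMS language adapters (the file is assumed to ship
# alongside this app)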
with open('ISO_codes.json', 'r') as file:
    iso_codes = json.load(file)

languages = list(iso_codes.keys())

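# facebook/mms-1b-all is a ~1B-parameter wav2vec 2.0 CTC checkpoint that
# covers 1,000+ languages by swapping small per-language adapter weights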
model_id = "facebook/mms-1b-all"
processor = AutoProcessor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)

def transcribe(audio_file=None, language="English (eng)", progress=gr.Progress()):
    # The Interface below passes two inputs (audio, language), so the function
    # takes a single audio path covering both microphone and upload sources
    if not audio_file:
        return "Please upload an audio file or record one"

    progress(0, desc="Starting")

    # Load at the native sample rate; librosa's default would silently
    # resample to 22,050 Hz and defeat the 16 kHz check below
    speech, sample_rate = librosa.load(audio_file, sr=None)
    if sample_rate != 16000:
        progress(0.01, desc="Resampling")
        speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)

    # Split the signal into 30-second chunks so long recordings fit in memory;
    # np.split cuts at every multiple of chunk_size
    chunk_size = 30 * 16000  # 30 s * 16,000 samples/s
    chunks = np.split(speech, np.arange(chunk_size, len(speech), chunk_size))

    # Switch the tokenizer vocabulary and load the adapter weights for this language
    language_code = iso_codes[language]
    processor.tokenizer.set_target_lang(language_code)
    model.load_adapter(language_code)

    transcriptions = []
    progress(0.02, desc="Transcribing")
    for chunk in progress.tqdm(chunks, desc="Transcribing"):
        inputs = processor(chunk, sampling_rate=16_000, return_tensors="pt")

        # Forward pass: per-frame logits over the adapter's character vocabulary
        with torch.no_grad():
            outputs = model(**inputs).logits

        # Greedy CTC decoding: take the most likely token per frame, then let
        # the processor collapse repeats and strip blank tokens
        ids = torch.argmax(outputs, dim=-1)[0]
        transcription = processor.decode(ids)
        transcriptions.append(transcription)

    return ' '.join(transcriptions)

# Two entries per example (audio, language) to match the Interface inputs;
# the former None placeholder belonged to the removed second audio input
examples = [
    ["balinese.mp3", "Bali (ban)"],
    ["madura.mp3", "Madura (mad)"],
    ["toba_batak.mp3", "Batak Toba (bbc)"],
    ["minangkabau.mp3", "Minangkabau (min)"],
]

description = '''Automatic Speech Recognition with [MMS](https://ai.facebook.com/blog/multilingual-model-speech-recognition/) (Massively Multilingual Speech) by Meta.'''

demo = gr.Interface(
    transcribe,
    inputs=[
        gr.Audio(sources=["upload", "microphone"], type="filepath", label="Record or Upload Audio"),
        gr.Dropdown(choices=languages, label="Language", value="English (eng)")
    ],
    outputs=gr.Textbox(label="Transcription"),
    # examples=examples,
    description=description
)

if __name__ == "__main__":
    demo.launch()