File size: 4,894 Bytes
b3db0b0 927d6f8 b3db0b0 927d6f8 b3db0b0 927d6f8 b3db0b0 28fa904 b3db0b0 927d6f8 b3db0b0 927d6f8 b3db0b0 927d6f8 b3db0b0 927d6f8 b3db0b0 927d6f8 b3db0b0 927d6f8 b3db0b0 927d6f8 b3db0b0 927d6f8 b3db0b0 927d6f8 b3db0b0 927d6f8 b3db0b0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
"""
Description:
This script presents a Gradio demo for the ASR/ST FAMA models developed at FBK.
Dependencies:
all the necessary dependencies are listed in requirements.txt
Usage:
The demo can be run locally by installing all necessary dependencies in a Python virtual env, or it can be run in a HuggingFace Space
Author: Lorenzo Concina
Date: 4/6/2025
"""
import os
import torch
import librosa as lb
import gradio as gr
from transformers import AutoProcessor, pipeline
from datasets import load_dataset
def load_fama(model_id, input_lang, task_type):
    """
    Build the speech pipeline for a FAMA model, forcing the output language.

    Args:
        model_id: Model identifier passed to ``AutoProcessor.from_pretrained``
            and to ``pipeline``.
        input_lang: Language code of the spoken utterance ("en" or "it").
        task_type: "ASR" to produce text in the utterance language, "ST" to
            produce text in the other supported language.

    Returns:
        The "automatic-speech-recognition" pipeline configured with the
        generation arguments for the requested task.

    Raises:
        ValueError: If no output language can be derived from the given
            task type and utterance language.
    """
    # Select the output language depending on the utterance language and the
    # task type. This is done before anything is loaded so that an invalid
    # request fails immediately instead of after fetching the model.
    if task_type == "ASR":
        output_lang = input_lang
    elif task_type == "ST":
        output_lang = {"it": "en", "en": "it"}.get(input_lang)
    else:
        output_lang = None
    if not output_lang:
        raise ValueError(
            f"Unsupported task/language combination: "
            f"task_type={task_type!r}, input_lang={input_lang!r}"
        )

    processor = AutoProcessor.from_pretrained(model_id)
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Force the model to start with the language tag
    lang_tag = f"<lang:{output_lang}>"
    lang_tag_id = processor.tokenizer.convert_tokens_to_ids(lang_tag)
    generate_kwargs = {
        "num_beams": 5,
        "no_repeat_ngram_size": 5,
        "forced_bos_token_id": lang_tag_id,
    }

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_id,
        trust_remote_code=True,
        torch_dtype=torch.float32,
        device=device,
        return_timestamps=False,
        generate_kwargs=generate_kwargs,
        chunk_length_s=60,
        stride_length_s=1,
    )
    return pipe
def load_audio_file(audio_path):
    """
    Read an audio file with librosa, requesting a 16 kHz mono signal.

    Only the samples are returned; the sample rate that librosa reports
    alongside them is discarded.
    """
    samples, _ = lb.load(audio_path, sr=16000, mono=True)
    return samples
def transcribe(audio, task_type, model_id, input_lang):
    """
    Function called by the Gradio interface. It runs model inference on an audio sample.

    Args:
        audio: Value produced by the audio component; when it is the path of
            an existing file, the file is decoded with librosa first.
        task_type: "ASR" or "ST", forwarded to ``load_fama``.
        model_id: Identifier of the FAMA model to load.
        input_lang: Language code of the utterance, forwarded to ``load_fama``.

    Returns:
        The ``"text"`` entry of the pipeline result.

    Raises:
        gr.Error: If no audio was provided.
    """
    # Reject a missing input before paying for the model load.
    # NOTE(review): assumes Gradio passes None when the audio component is empty.
    if audio is None:
        raise gr.Error("Please upload or record an audio sample first.")

    # Named asr_pipe so the imported `pipeline` factory is not shadowed.
    asr_pipe = load_fama(model_id, input_lang, task_type)

    if isinstance(audio, str) and os.path.isfile(audio):
        # Load the audio with librosa
        model_input = load_audio_file(audio)
    else:
        # Anything that is not an existing file path is handed to the
        # pipeline unchanged.
        model_input = audio

    result = asr_pipe(model_input)
    return result["text"]
def update_model_options(task_type):
    """
    Compute the component updates to apply when the selected task changes.

    For "ST" only the two general models are offered and the labels read
    "Translate"/"Translation"; for any other task the two ASR-only models
    are offered as well and the labels read "Transcribe"/"Transcription".

    Returns:
        A 3-tuple of ``gr.update`` values: the model selector (choices and
        selected value), the button (value) and the output textbox (label).
    """
    is_translation = task_type == "ST"

    available_models = ["FBK-MT/fama-small", "FBK-MT/fama-medium"]
    if not is_translation:
        available_models += ["FBK-MT/fama-small-asr", "FBK-MT/fama-medium-asr"]

    if is_translation:
        action_label, result_label = "Translate", "Translation"
    else:
        action_label, result_label = "Transcribe", "Transcription"

    model_update = gr.update(choices=available_models, value="FBK-MT/fama-small")
    button_update = gr.update(value=action_label)
    textbox_update = gr.update(label=result_label)
    return (model_update, button_update, textbox_update)
# Language options offered in the "Utterance Language" dropdown
# (languages supported by FAMA models)
language_choices = ["en", "it"]
if __name__ == "__main__":
    # Build the demo UI; every component is declared inside the Blocks context.
    with gr.Blocks() as iface:
        gr.Markdown(
            "## FAMA ASR and ST\n"
            "Simple Automatic Speech Recognition and Speech Translation demo for English and Italian "
            "powered by FAMA models, developed at FBK. "
            "More informations about FAMA models can be found here: "
            "https://huggingface.co/collections/FBK-MT/fama-683425df3fb2b3171e0cdc9e"
        )

        # Inputs: the audio sample, its language, the task and the model.
        audio_input = gr.Audio(type="filepath", label="Upload or record audio")
        lang_input = gr.Dropdown(choices=language_choices, value="it", label="Utterance Language")
        task_type_input = gr.Radio(choices=["ASR", "ST"], value="ASR", label="Select task type")
        model_input = gr.Radio(
            choices=[
                "FBK-MT/fama-small",
                "FBK-MT/fama-medium",
                "FBK-MT/fama-small-asr",
                "FBK-MT/fama-medium-asr",
            ],
            value="FBK-MT/fama-small",
            label="Select a FAMA model",
        )

        # Output box and the button that triggers inference.
        output = gr.Textbox(label="Transcription")
        transcribe_btn = gr.Button("Transcribe")

        # Dynamically update the model list, button text and output label
        # when the task changes.
        task_type_input.change(
            fn=update_model_options,
            inputs=task_type_input,
            outputs=[model_input, transcribe_btn, output],
        )
        transcribe_btn.click(
            fn=transcribe,
            inputs=[audio_input, task_type_input, model_input, lang_input],
            outputs=output,
        )

        # The steps are numbered 1-5 (the last step used to repeat "4").
        gr.Markdown(
            " ### Instructions: \n\n"
            "1 - Load an audio file or record yourself talking with a microphone \n\n"
            "2 - Specify the language of the utterance (FAMA supports English and Italian)\n\n"
            "3 - Select the task to run: Speech recognition or Speech Translation. \n\n"
            "4 - Select a FAMA model among the available ones \n\n"
            "5 - Click on Transcribe/Translate\n"
        )

    iface.launch()
|