File size: 4,894 Bytes
b3db0b0
 
927d6f8
b3db0b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
927d6f8
b3db0b0
 
 
 
 
927d6f8
 
 
 
 
 
 
 
 
b3db0b0
 
 
 
 
 
 
 
 
 
 
 
 
28fa904
 
 
b3db0b0
 
 
 
 
 
 
927d6f8
b3db0b0
 
 
927d6f8
b3db0b0
 
 
 
 
 
 
 
 
 
927d6f8
b3db0b0
 
927d6f8
 
 
 
b3db0b0
927d6f8
b3db0b0
 
 
 
927d6f8
 
 
 
 
 
 
 
 
 
 
 
b3db0b0
 
 
 
 
 
 
927d6f8
b3db0b0
927d6f8
 
 
 
 
 
b3db0b0
 
 
 
 
 
 
 
 
 
 
927d6f8
 
 
b3db0b0
927d6f8
 
 
 
 
 
 
 
b3db0b0
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""
Description:
    This script presents a Gradio demo for the ASR/ST FAMA models developed at FBK. 

Dependencies:
    all the necessary dependencies are listed in requirements.txt

Usage:
    The demo can be run locally by installing all necessary dependencies in a Python virtual env, or it can be run in a Hugging Face Space

Author: Lorenzo Concina
Date: 4/6/2025
"""
import os
import torch
import librosa as lb
import gradio as gr
from transformers import AutoProcessor, pipeline
from datasets import load_dataset

def load_fama(model_id, input_lang, task_type):
    """
    Build a transformers ASR/ST pipeline for the given FAMA model.

    Args:
        model_id: Hugging Face model identifier (e.g. "FBK-MT/fama-small").
        input_lang: language of the utterance ("en" or "it").
        task_type: "ASR" (transcription) or "ST" (speech translation).

    Returns:
        A configured `automatic-speech-recognition` pipeline whose output
        language matches the requested task.
    """
    processor = AutoProcessor.from_pretrained(model_id)

    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Select the output language depending on the utterance language and
    # the task type: ASR keeps the input language, ST translates en <-> it.
    # Default to the input language so we never build an invalid "<lang:>"
    # tag for unexpected combinations.
    output_lang = input_lang
    if task_type == "ST" and input_lang == "it":
        output_lang = "en"
    elif task_type == "ST" and input_lang == "en":
        output_lang = "it"

    # Force the model to start generation with the target-language tag.
    lang_tag = "<lang:{}>".format(output_lang)
    lang_tag_id = processor.tokenizer.convert_tokens_to_ids(lang_tag)

    generate_kwargs = {"num_beams": 5, "no_repeat_ngram_size": 5, "forced_bos_token_id": lang_tag_id}

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_id,
        trust_remote_code=True,
        torch_dtype=torch.float32,
        device=device,
        return_timestamps=False,
        generate_kwargs=generate_kwargs,
        chunk_length_s=60,
        stride_length_s=1
    )
    return pipe

def load_audio_file(audio_path):
    """Load an audio file as 16 kHz mono samples, discarding the sample rate."""
    samples, _sr = lb.load(audio_path, sr=16000, mono=True)
    return samples

def transcribe(audio, task_type, model_id, input_lang):
    """
    Gradio callback: run model inference on an audio sample.

    Args:
        audio: file path (upload) or raw audio (microphone input).
        task_type: "ASR" or "ST".
        model_id: Hugging Face identifier of the FAMA model to use.
        input_lang: language of the utterance ("en" or "it").

    Returns:
        The transcribed or translated text.
    """
    # NOTE: renamed from `pipeline` to avoid shadowing the
    # `transformers.pipeline` factory imported at module level.
    asr_pipe = load_fama(model_id, input_lang, task_type)

    if isinstance(audio, str) and os.path.isfile(audio):
        # Uploaded file: load and resample it with librosa first.
        utterance = load_audio_file(audio)
        result = asr_pipe(utterance)
    else:
        # Microphone input: pass through as-is.
        result = asr_pipe(audio)
    return result["text"]


def update_model_options(task_type):
    """
    Adapt the model list, button label and output label to the selected task.

    Args:
        task_type: "ASR" or "ST".

    Returns:
        A 3-tuple of `gr.update` objects for the model radio, the action
        button and the output textbox, in that order.
    """
    if task_type == "ST":
        # ASR-only checkpoints cannot translate, so hide them.
        choices = ["FBK-MT/fama-small", "FBK-MT/fama-medium"]
        action, box_label = "Translate", "Translation"
    else:
        choices = [
            "FBK-MT/fama-small",
            "FBK-MT/fama-medium",
            "FBK-MT/fama-small-asr",
            "FBK-MT/fama-medium-asr"
        ]
        action, box_label = "Transcribe", "Transcription"

    return (
        gr.update(choices=choices, value="FBK-MT/fama-small"),
        gr.update(value=action),
        gr.update(label=box_label)
    )


# Utterance languages supported by the FAMA models (ISO 639-1 codes).
language_choices = ["en", "it"]


if __name__ == "__main__":

    # Build the Gradio UI: audio input, task/language/model selectors,
    # an output textbox and an action button wired to `transcribe`.
    with gr.Blocks() as iface:
        gr.Markdown("""## FAMA ASR and ST\nSimple Automatic Speech Recognition and Speech Translation demo for English and Italian powered by FAMA models, developed at FBK. \
                    More information about FAMA models can be found here: https://huggingface.co/collections/FBK-MT/fama-683425df3fb2b3171e0cdc9e""")
        audio_input = gr.Audio(type="filepath", label="Upload or record audio")

        lang_input = gr.Dropdown(choices=language_choices, value="it", label="Utterance Language")
        task_type_input = gr.Radio(choices=["ASR", "ST"], value="ASR", label="Select task type")

        model_input = gr.Radio(choices=[
            "FBK-MT/fama-small",
            "FBK-MT/fama-medium",
            "FBK-MT/fama-small-asr",
            "FBK-MT/fama-medium-asr"
        ], value="FBK-MT/fama-small", label="Select a FAMA model")

        output = gr.Textbox(label="Transcription")

        transcribe_btn = gr.Button("Transcribe")
        # Dynamically update the model list, button label and output label
        # whenever the task type changes.
        task_type_input.change(fn=update_model_options, inputs=task_type_input, outputs=[model_input, transcribe_btn, output])

        transcribe_btn.click(fn=transcribe, inputs=[audio_input, task_type_input, model_input, lang_input], outputs=output)

        gr.Markdown(""" ### Instructions: \n
        1 - Load an audio file or record yourself talking with a microphone \n
        2 - Specify the language of the utterance (FAMA supports English and Italian)\n
        3 - Select the task to run: Speech recognition or Speech Translation. \n
        4 - Select a FAMA model among the available ones \n
        5 - Click on Transcribe/Translate
        """)

    iface.launch()