"""
Description:
    This script presents a Gradio demo for the ASR/ST FAMA models developed at FBK

Dependencies:
    all the necessary dependencies are listed in requirements.txt

Usage:
    The demo can be runned locally by installing all necessary dependencies in a python virtual env or it can be run in an HuggingFace Space

Author: Lorenzo Concina
Date: 4/6/2025
"""
import os
import torch
import librosa as lb
import gradio as gr
from transformers import AutoProcessor, pipeline

def load_fama(model_id, output_lang):
    """Build an ASR/ST pipeline for a FAMA model, forcing the output language."""
    processor = AutoProcessor.from_pretrained(model_id)

    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Force the model to start with the language tag, e.g. "<lang:it>"
    lang_tag = "<lang:{}>".format(output_lang)
    lang_tag_id = processor.tokenizer.convert_tokens_to_ids(lang_tag)

    generate_kwargs = {"num_beams": 5, "no_repeat_ngram_size": 5, "forced_bos_token_id": lang_tag_id}

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_id,
        trust_remote_code=True,
        torch_dtype=torch.float32,
        device=device,
        return_timestamps=False,
        generate_kwargs=generate_kwargs,
    )
    return pipe
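# A hypothetical direct use of load_fama, outside the Gradio UI
# ("sample.wav" is a placeholder path, not part of this demo):
#
#   pipe = load_fama("FBK-MT/fama-small", "en")
#   print(pipe("sample.wav")["text"])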

def load_audio_file(audio_path):
    # FAMA models expect 16 kHz mono audio, so resample and downmix on load
    y, sr = lb.load(audio_path, sr=16000, mono=True)
    return y
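# Note: the raw array is passed to the pipeline as-is below; the transformers
# ASR pipeline also accepts an explicit sampling rate via its dict input
# format, e.g. (a sketch, not used in this demo):
#
#   result = pipe({"raw": utterance, "sampling_rate": 16000})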

def transcribe(audio, task_type, model_id, output_lang):
    """
    Function called by the Gradio interface. It runs model inference on an audio sample.
    """
    cache_key = (model_id, output_lang)
    if cache_key not in model_cache:
        model_cache[cache_key] = load_fama(model_id, output_lang)

    # avoid shadowing the imported transformers.pipeline function
    pipe = model_cache[cache_key]

    if isinstance(audio, str) and os.path.isfile(audio):
        # load the audio from disk with librosa
        utterance = load_audio_file(audio)
        result = pipe(utterance)
    else:
        # the audio came from the microphone
        result = pipe(audio)
    return result["text"]

# Update the available models according to the selected task:
# the ASR-only checkpoints (*-asr) are hidden when ST is selected
def update_model_options(task_type):
    if task_type == "ST":
        return gr.update(choices=["FBK-MT/fama-small", "FBK-MT/fama-medium"], value="FBK-MT/fama-small")
    else:
        return gr.update(choices=[
            "FBK-MT/fama-small",
            "FBK-MT/fama-medium",
            "FBK-MT/fama-small-asr",
            "FBK-MT/fama-medium-asr"
        ], value="FBK-MT/fama-small")
    
# Language options (languages supported by FAMA models)
language_choices = ["en", "it"]

# Cache loaded models to avoid reloading
model_cache = {}

if __name__ == "__main__":

    with gr.Blocks() as iface:
        gr.Markdown("""## FAMA ASR and ST\nSimple Automatic Speech Recognition and Speech Translation demo powered by FAMA models, developed at FBK. \
                    More information about FAMA models can be found here: https://huggingface.co/collections/FBK-MT/fama-683425df3fb2b3171e0cdc9e""")
        
        with gr.Row():
            audio_input = gr.Audio(type="filepath", label="Upload or record audio")
            task_type_input = gr.Radio(choices=["ASR", "ST"], value="ASR", label="Select task type")

        model_input = gr.Radio(choices=[
            "FBK-MT/fama-small",
            "FBK-MT/fama-medium",
            "FBK-MT/fama-small-asr",
            "FBK-MT/fama-medium-asr"
        ], value="FBK-MT/fama-small", label="Select a FAMA model")

        lang_input = gr.Dropdown(choices=language_choices, value="it", label="Output language")

        output = gr.Textbox(label="Transcription")

        task_type_input.change(fn=update_model_options, inputs=task_type_input, outputs=model_input)

        transcribe_btn = gr.Button("Transcribe")
        transcribe_btn.click(fn=transcribe, inputs=[audio_input, task_type_input, model_input, lang_input], outputs=output)
        
    iface.launch()