Spaces:

SpeechTek
/

FAMA-demo

Running

App Files Files Community

Lorenzoncina committed on 24 days ago

Commit

b3db0b0

1 Parent(s): 85a5ac9

First version of FAMA models demo

Browse files

Files changed (2) hide show

app.py +111 -0
requirements.txt +111 -0

app.py ADDED Viewed

	@@ -0,0 +1,111 @@

+"""
+Description:
+    This script presents a Gradio demo for the ASR/ST FAMA models developed at FBK
+Dependencies:
+    all the necessary dependencies are listed in requirements.txt
+Usage:
+    The demo can be run locally by installing all necessary dependencies in a Python virtual env, or it can be run in a Hugging Face Space
+Author: Lorenzo Concina
+Date: 4/6/2025
+"""
+import os
+import torch
+import librosa as lb
+import gradio as gr
+from transformers import AutoProcessor, pipeline
+from datasets import load_dataset
+def load_fama(model_id, output_lang):
+    processor = AutoProcessor.from_pretrained(model_id)
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    tgt_lang = "it"
+    # Force the model to start with the language tag
+    lang_tag = "<lang:{}>".format(output_lang)
+    lang_tag_id = processor.tokenizer.convert_tokens_to_ids(lang_tag)
+    generate_kwargs = {"num_beams": 5, "no_repeat_ngram_size": 5, "forced_bos_token_id": lang_tag_id}
+    pipe = pipeline(
+        "automatic-speech-recognition",
+        model=model_id,
+        trust_remote_code=True,
+        torch_dtype=torch.float32,
+        device=device,
+        return_timestamps=False,
+        generate_kwargs=generate_kwargs
+    )
+    return pipe
+def load_audio_file(audio_path):
+    y, sr = lb.load(audio_path, sr=16000, mono=True)
+    return y
+def transcribe(audio, task_type, model_id, output_lang):
+    """
+    Function called by gradio interface. It runs model inference on an audio sample
+    """
+    cache_key = (model_id, output_lang)
+    if cache_key not in model_cache:
+        model_cache[cache_key] = load_fama(model_id, output_lang)
+    pipeline = model_cache[cache_key]
+    if isinstance(audio, str) and os.path.isfile(audio):
+        #load the audio with Librosa
+        utterance = load_audio_file(audio)
+        result = pipeline(utterance)
+    else:
+        #user used the mic
+        result = pipeline(audio)
+    return result["text"]
+#available models
+def update_model_options(task_type):
+    if task_type == "ST":
+        return gr.update(choices=["FBK-MT/fama-small", "FBK-MT/fama-medium"], value="FBK-MT/fama-small")
+    else:
+        return gr.update(choices=[
+            "FBK-MT/fama-small",
+            "FBK-MT/fama-medium",
+            "FBK-MT/fama-small-asr",
+            "FBK-MT/fama-medium-asr"
+        ], value="FBK-MT/fama-small")
+# Language options shown in the UI dropdown (languages supported by FAMA models)
+language_choices = ["en", "it"]
+# Cache loaded models to avoid reloading; transcribe() keys it by (model_id, output_lang)
+model_cache = {}
+# Build the Gradio interface and start it only when this file is executed as a script
+if __name__ == "__main__":
+    with gr.Blocks() as iface:
+        # Page header with a short description and a link to the model collection
+        gr.Markdown("""## FAMA ASR and ST\nSimple Automatic Speech Recognition and Speech Translation demo powered by FAMA models, developed at FBK. \
+                    More informations about FAMA models can be found here: https://huggingface.co/collections/FBK-MT/fama-683425df3fb2b3171e0cdc9e""")
+        # Audio input and task selector share one row
+        with gr.Row():
+            audio_input = gr.Audio(type="filepath", label="Upload or record audio")
+            task_type_input = gr.Radio(choices=["ASR", "ST"], value="ASR", label="Select task type")
+        # Initial model list matches the default "ASR" task (same list update_model_options returns for non-ST)
+        model_input = gr.Radio(choices=[
+            "FBK-MT/fama-small",
+            "FBK-MT/fama-medium",
+            "FBK-MT/fama-small-asr",
+            "FBK-MT/fama-medium-asr"
+        ], value="FBK-MT/fama-small", label="Select a FAMA model")
+        lang_input = gr.Dropdown(choices=language_choices, value="it", label="Transcription language")
+        output = gr.Textbox(label="Transcription")
+        # Changing the task refreshes the selectable models
+        task_type_input.change(fn=update_model_options, inputs=task_type_input, outputs=model_input)
+        transcribe_btn = gr.Button("Transcribe")
+        # The button runs transcribe() on the current inputs and writes the returned text to the textbox
+        transcribe_btn.click(fn=transcribe, inputs=[audio_input, task_type_input, model_input, lang_input], outputs=output)
+    iface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,111 @@

+aiofiles==24.1.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.7
+aiosignal==1.3.2
+annotated-types==0.7.0
+anyio==4.9.0
+attrs==25.3.0
+audioread==3.0.1
+certifi==2025.4.26
+cffi==1.17.1
+charset-normalizer==3.4.2
+click==8.2.1
+datasets==3.6.0
+decorator==5.2.1
+dill==0.3.8
+fastapi==0.115.12
+ffmpy==0.6.0
+filelock==3.18.0
+frozenlist==1.6.0
+fsspec==2025.3.0
+gradio==5.32.1
+gradio_client==1.10.2
+groovy==0.1.2
+h11==0.16.0
+hf-xet==1.1.2
+httpcore==1.0.9
+httpx==0.28.1
+huggingface-hub==0.32.4
+idna==3.10
+Jinja2==3.1.6
+joblib==1.5.1
+lazy_loader==0.4
+librosa==0.11.0
+llvmlite==0.44.0
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+mpmath==1.3.0
+msgpack==1.1.0
+multidict==6.4.4
+multiprocess==0.70.16
+networkx==3.4.2
+numba==0.61.2
+numpy==2.2.6
+nvidia-cublas-cu12==12.6.4.1
+nvidia-cuda-cupti-cu12==12.6.80
+nvidia-cuda-nvrtc-cu12==12.6.77
+nvidia-cuda-runtime-cu12==12.6.77
+nvidia-cudnn-cu12==9.5.1.17
+nvidia-cufft-cu12==11.3.0.4
+nvidia-cufile-cu12==1.11.1.6
+nvidia-curand-cu12==10.3.7.77
+nvidia-cusolver-cu12==11.7.1.2
+nvidia-cusparse-cu12==12.5.4.2
+nvidia-cusparselt-cu12==0.6.3
+nvidia-nccl-cu12==2.26.2
+nvidia-nvjitlink-cu12==12.6.85
+nvidia-nvtx-cu12==12.6.77
+orjson==3.10.18
+packaging==25.0
+pandas==2.2.3
+pillow==11.2.1
+platformdirs==4.3.8
+pooch==1.8.2
+propcache==0.3.1
+pyarrow==20.0.0
+pycparser==2.22
+pydantic==2.11.5
+pydantic_core==2.33.2
+pydub==0.25.1
+Pygments==2.19.1
+python-dateutil==2.9.0.post0
+python-multipart==0.0.20
+pytz==2025.2
+PyYAML==6.0.2
+regex==2024.11.6
+requests==2.32.3
+rich==14.0.0
+ruff==0.11.12
+safehttpx==0.1.6
+safetensors==0.5.3
+scikit-learn==1.6.1
+scipy==1.15.3
+semantic-version==2.10.0
+sentencepiece==0.2.0
+setuptools==80.9.0
+shellingham==1.5.4
+six==1.17.0
+sniffio==1.3.1
+soundfile==0.13.1
+soxr==0.5.0.post1
+starlette==0.46.2
+sympy==1.14.0
+threadpoolctl==3.6.0
+tokenizers==0.21.1
+tomlkit==0.13.2
+torch==2.7.0
+torchaudio==2.7.0
+torchvision==0.22.0
+tqdm==4.67.1
+transformers==4.48.1
+triton==3.3.0
+typer==0.16.0
+typing-inspection==0.4.1
+typing_extensions==4.14.0
+tzdata==2025.2
+urllib3==2.4.0
+uvicorn==0.34.3
+websockets==15.0.1
+xxhash==3.5.0
+yarl==1.20.0