Spaces:

lorenzoncina
/

FAMA-ASR

Sleeping

App Files Files Community

Lorenzoncina commited on Jun 4

Commit

0421e65

1 Parent(s): 9934c70

new demo interface

Browse files

Files changed (2) hide show

app.py +78 -15
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -1,16 +1,31 @@
 import torch
 import gradio as gr
 from transformers import AutoProcessor, pipeline
 from datasets import load_dataset
-def load_fama(model_id):
     processor = AutoProcessor.from_pretrained(model_id)
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
-    tgt_lang = "en"
     # Force the model to start with the language tag
-    lang_tag = "<lang:{}>".format(tgt_lang)
     lang_tag_id = processor.tokenizer.convert_tokens_to_ids(lang_tag)
     generate_kwargs = {"num_beams": 5, "no_repeat_ngram_size": 5, "forced_bos_token_id": lang_tag_id}
@@ -26,23 +41,71 @@ def load_fama(model_id):
     )
     return pipe
-#load fama model
-model_id = "FBK-MT/fama-small"
-pipeline = load_fama(model_id)
-def transcribe(audio):
     """
     Function called by gradio interface. It runs model inference on an audio sample
     """
-    dataset = load_dataset("distil-whisper/librispeech_asr_dummy", "clean", split="validation")
-    sample = dataset[0]["audio"]
-    result = pipeline(sample)
     return result["text"]
 if __name__ == "__main__":
-    #run gradio interface
-    iface = gr.Interface(fn=transcribe, inputs=gr.Audio(type="filepath"), outputs="text")
-    iface.launch()

+"""
+Description:
+    This script presents a Gradio demo for the ASR/ST FAMA models developed at FBK
+Dependencies:
+    all the necessary dependencies are listed in requirements.txt
+Usage:
+    The demo can be run locally by installing all necessary dependencies in a Python virtual environment, or it can be run in a Hugging Face Space
+Author: Lorenzo Concina
+Date: 4/6/2025
+"""
+import os
 import torch
+import librosa as lb
 import gradio as gr
 from transformers import AutoProcessor, pipeline
 from datasets import load_dataset
+def load_fama(model_id, output_lang):
     processor = AutoProcessor.from_pretrained(model_id)
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    tgt_lang = "it"
     # Force the model to start with the language tag
+    lang_tag = "<lang:{}>".format(output_lang)
     lang_tag_id = processor.tokenizer.convert_tokens_to_ids(lang_tag)
     generate_kwargs = {"num_beams": 5, "no_repeat_ngram_size": 5, "forced_bos_token_id": lang_tag_id}
     )
     return pipe
+def load_audio_file(audio_path):
+    y, sr = lb.load(audio_path, sr=16000, mono=True)
+    return y
+def transcribe(audio, task_type, model_id, output_lang):
     """
     Function called by gradio interface. It runs model inference on an audio sample
     """
+    cache_key = (model_id, output_lang)
+    if cache_key not in model_cache:
+        model_cache[cache_key] = load_fama(model_id, output_lang)
+    pipeline = model_cache[cache_key]
+    if isinstance(audio, str) and os.path.isfile(audio):
+        #load the audio with Librosa
+        utterance = load_audio_file(audio)
+        result = pipeline(utterance)
+    else:
+        #user used the mic
+        result = pipeline(audio)
     return result["text"]
+#available models
+def update_model_options(task_type):
+    if task_type == "ST":
+        return gr.update(choices=["FBK-MT/fama-small", "FBK-MT/fama-medium"], value="FBK-MT/fama-small")
+    else:
+        return gr.update(choices=[
+            "FBK-MT/fama-small",
+            "FBK-MT/fama-medium",
+            "FBK-MT/fama-small-asr",
+            "FBK-MT/fama-medium-asr"
+        ], value="FBK-MT/fama-small")
+# Language codes offered in the UI dropdown (languages supported by FAMA models);
+# the selected code is inserted into the "<lang:...>" tag built by load_fama.
+language_choices = ["en", "it"]
+# Cache of loaded pipelines, keyed by (model_id, output_lang), so that each
+# model/language combination is loaded only once instead of on every request.
+model_cache = {}
 if __name__ == "__main__":
+    # Build the Gradio Blocks UI; this only happens when the file is run as a script.
+    with gr.Blocks() as iface:
+        gr.Markdown("""## FAMA ASR and ST\nSimple Automatic Speech Recognition and Speech Translation demo powered by FAMA models, developed at FBK. \
+                    More informations about FAMA models can be found here: https://huggingface.co/collections/FBK-MT/fama-683425df3fb2b3171e0cdc9e""")
+        with gr.Row():
+            # type="filepath": the callback receives the audio as a path on disk
+            audio_input = gr.Audio(type="filepath", label="Upload or record audio")
+            task_type_input = gr.Radio(choices=["ASR", "ST"], value="ASR", label="Select task type")
+        # Initial choices match the default "ASR" task; update_model_options
+        # rewrites them whenever the task selection changes.
+        model_input = gr.Radio(choices=[
+            "FBK-MT/fama-small",
+            "FBK-MT/fama-medium",
+            "FBK-MT/fama-small-asr",
+            "FBK-MT/fama-medium-asr"
+        ], value="FBK-MT/fama-small", label="Select a FAMA model")
+        lang_input = gr.Dropdown(choices=language_choices, value="it", label="Transcription language")
+        output = gr.Textbox(label="Transcription")
+        # Refresh the list of selectable models when the task radio changes
+        task_type_input.change(fn=update_model_options, inputs=task_type_input, outputs=model_input)
+        transcribe_btn = gr.Button("Transcribe")
+        # Inputs are passed positionally, in the order of transcribe's parameters
+        transcribe_btn.click(fn=transcribe, inputs=[audio_input, task_type_input, model_input, lang_input], outputs=output)
+    # Launch the app once the Blocks context has been closed
+    iface.launch()

requirements.txt CHANGED Viewed

@@ -39,7 +39,7 @@ mpmath==1.3.0
 msgpack==1.1.0
 multidict==6.4.4
 multiprocess==0.70.16
-networkx==3.4.2
 numba==0.61.2
 numpy==2.2.6
 nvidia-cublas-cu12==12.6.4.1

 msgpack==1.1.0
 multidict==6.4.4
 multiprocess==0.70.16
+networkx==3.5
 numba==0.61.2
 numpy==2.2.6
 nvidia-cublas-cu12==12.6.4.1