Spaces:

SpeechTek
/

FAMA-demo

Running

App Files Files Community

Lorenzoncina committed 6 days ago

Commit

927d6f8

1 Parent(s): 28fa904

ST feedbacks implemented

Browse files

Files changed (2) hide show

.gitignore +1 -0
app.py +49 -24

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ local_venv

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 """
 Description:
-    This script presents a Gradio demo for the ASR/ST FAMA models developed at FBK
 Dependencies:
     all the necessary dependencies are listed in requirements.txt
@@ -18,12 +18,21 @@ import gradio as gr
 from transformers import AutoProcessor, pipeline
 from datasets import load_dataset
-def load_fama(model_id, output_lang):
     processor = AutoProcessor.from_pretrained(model_id)
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     tgt_lang = "it"
     # Force the model to start with the language tag
     lang_tag = "<lang:{}>".format(output_lang)
     lang_tag_id = processor.tokenizer.convert_tokens_to_ids(lang_tag)
@@ -47,15 +56,11 @@ def load_audio_file(audio_path):
     y, sr = lb.load(audio_path, sr=16000, mono=True)
     return y
-def transcribe(audio, task_type, model_id, output_lang):
     """
     Function called by gradio interface. It runs model inference on an audio sample
     """
-    cache_key = (model_id, output_lang)
-    if cache_key not in model_cache:
-        model_cache[cache_key] = load_fama(model_id, output_lang)
-    pipeline = model_cache[cache_key]
     if isinstance(audio, str) and os.path.isfile(audio):
         #load the audio with Librosa
@@ -66,33 +71,46 @@ def transcribe(audio, task_type, model_id, output_lang):
         result = pipeline(audio)
     return result["text"]
-#available models
 def update_model_options(task_type):
     if task_type == "ST":
-        return gr.update(choices=["FBK-MT/fama-small", "FBK-MT/fama-medium"], value="FBK-MT/fama-small")
     else:
-        return gr.update(choices=[
             "FBK-MT/fama-small",
             "FBK-MT/fama-medium",
             "FBK-MT/fama-small-asr",
             "FBK-MT/fama-medium-asr"
-        ], value="FBK-MT/fama-small")
 # Language options (languages supported by FAMA models)
 language_choices = ["en", "it"]
-# Cache loaded models to avoid reloading
-model_cache = {}
 if __name__ == "__main__":
     with gr.Blocks() as iface:
-        gr.Markdown("""## FAMA ASR and ST\nSimple Automatic Speech Recognition and Speech Translation demo powered by FAMA models, developed at FBK. \
                     More informations about FAMA models can be found here: https://huggingface.co/collections/FBK-MT/fama-683425df3fb2b3171e0cdc9e""")
-        with gr.Row():
-            audio_input = gr.Audio(type="filepath", label="Upload or record audio")
-            task_type_input = gr.Radio(choices=["ASR", "ST"], value="ASR", label="Select task type")
         model_input = gr.Radio(choices=[
             "FBK-MT/fama-small",
@@ -101,13 +119,20 @@ if __name__ == "__main__":
             "FBK-MT/fama-medium-asr"
         ], value="FBK-MT/fama-small", label="Select a FAMA model")
-        lang_input = gr.Dropdown(choices=language_choices, value="it", label="Transcription language")
         output = gr.Textbox(label="Transcription")
-        task_type_input.change(fn=update_model_options, inputs=task_type_input, outputs=model_input)
         transcribe_btn = gr.Button("Transcribe")
         transcribe_btn.click(fn=transcribe, inputs=[audio_input, task_type_input, model_input, lang_input], outputs=output)
     iface.launch()

 """
 Description:
+    This script presents a Gradio demo for the ASR/ST FAMA models developed at FBK.
 Dependencies:
     all the necessary dependencies are listed in requirements.txt
 from transformers import AutoProcessor, pipeline
 from datasets import load_dataset
+def load_fama(model_id, input_lang, task_type):
     processor = AutoProcessor.from_pretrained(model_id)
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     tgt_lang = "it"
+    # Select the output language based on the utterance language and the task type
+    output_lang = ""
+    if task_type  == "ASR":
+        output_lang = input_lang
+    elif task_type  == "ST" and input_lang == "it":
+        output_lang = "en"
+    elif task_type  == "ST" and input_lang == "en":
+        output_lang = "it"
     # Force the model to start with the language tag
     lang_tag = "<lang:{}>".format(output_lang)
     lang_tag_id = processor.tokenizer.convert_tokens_to_ids(lang_tag)
     y, sr = lb.load(audio_path, sr=16000, mono=True)
     return y
+def transcribe(audio, task_type, model_id, input_lang):
     """
     Function called by gradio interface. It runs model inference on an audio sample
     """
+    pipeline = load_fama(model_id, input_lang, task_type)
     if isinstance(audio, str) and os.path.isfile(audio):
         #load the audio with Librosa
         result = pipeline(audio)
     return result["text"]
 def update_model_options(task_type):
     if task_type == "ST":
+        model_choices = ["FBK-MT/fama-small", "FBK-MT/fama-medium"]
+        default_model = "FBK-MT/fama-small"
+        button_label = "Translate"
+        textbox_label = "Translation"
     else:
+        model_choices = [
             "FBK-MT/fama-small",
             "FBK-MT/fama-medium",
             "FBK-MT/fama-small-asr",
             "FBK-MT/fama-medium-asr"
+        ]
+        default_model = "FBK-MT/fama-small"
+        button_label = "Transcribe"
+        textbox_label = "Transcription"
+    return (
+        gr.update(choices=model_choices, value=default_model),
+        gr.update(value=button_label),
+        gr.update(label=textbox_label)
+    )
 # Language options (languages supported by FAMA models)
 language_choices = ["en", "it"]
 if __name__ == "__main__":
     with gr.Blocks() as iface:
+        gr.Markdown("""## FAMA ASR and ST\nSimple Automatic Speech Recognition and Speech Translation demo for English and Italian powered by FAMA models, developed at FBK. \
                     More informations about FAMA models can be found here: https://huggingface.co/collections/FBK-MT/fama-683425df3fb2b3171e0cdc9e""")
+        #with gr.Row():
+        audio_input = gr.Audio(type="filepath", label="Upload or record audio")
+        #task_type_input = gr.Radio(choices=["ASR", "ST"], value="ASR", label="Select task type")
+        lang_input = gr.Dropdown(choices=language_choices, value="it", label="Utterance Language")
+        task_type_input = gr.Radio(choices=["ASR", "ST"], value="ASR", label="Select task type")
         model_input = gr.Radio(choices=[
             "FBK-MT/fama-small",
             "FBK-MT/fama-medium-asr"
         ], value="FBK-MT/fama-small", label="Select a FAMA model")
         output = gr.Textbox(label="Transcription")
         transcribe_btn = gr.Button("Transcribe")
+        # Dynamically update the model choices, button text and output label when the task changes
+        task_type_input.change(fn=update_model_options, inputs=task_type_input, outputs=[model_input, transcribe_btn, output])
         transcribe_btn.click(fn=transcribe, inputs=[audio_input, task_type_input, model_input, lang_input], outputs=output)
+        gr.Markdown(""" ### Instructions: \n
+        1 - Load an audio file or record yourself talking with a microphone \n
+        2 - Specify the language of the utterance (FAMA supports English and Italian)\n
+        3 - Select the task to run: Speech recognition or Speech Translation. \n
+        4 - Select a FAMA model among the available ones \n
+        4 - Click on Transcribe/Translate
+        """)
     iface.launch()