Lorenzoncina committed on
Commit
927d6f8
·
1 Parent(s): 28fa904

ST feedbacks implemented

Browse files
Files changed (2) hide show
  1. .gitignore +1 -0
  2. app.py +49 -24
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ local_venv
app.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
  Description:
3
- This script presents a Gradio demo for the ASR/ST FAMA models developed at FBK
4
 
5
  Dependencies:
6
  all the necessary dependencies are listed in requirements.txt
@@ -18,12 +18,21 @@ import gradio as gr
18
  from transformers import AutoProcessor, pipeline
19
  from datasets import load_dataset
20
 
21
- def load_fama(model_id, output_lang):
22
  processor = AutoProcessor.from_pretrained(model_id)
23
 
24
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
25
  tgt_lang = "it"
26
 
 
 
 
 
 
 
 
 
 
27
  # Force the model to start with the language tag
28
  lang_tag = "<lang:{}>".format(output_lang)
29
  lang_tag_id = processor.tokenizer.convert_tokens_to_ids(lang_tag)
@@ -47,15 +56,11 @@ def load_audio_file(audio_path):
47
  y, sr = lb.load(audio_path, sr=16000, mono=True)
48
  return y
49
 
50
- def transcribe(audio, task_type, model_id, output_lang):
51
  """
52
  Function called by gradio interface. It runs model inference on an audio sample
53
  """
54
- cache_key = (model_id, output_lang)
55
- if cache_key not in model_cache:
56
- model_cache[cache_key] = load_fama(model_id, output_lang)
57
-
58
- pipeline = model_cache[cache_key]
59
 
60
  if isinstance(audio, str) and os.path.isfile(audio):
61
  #load the audio with Librosa
@@ -66,33 +71,46 @@ def transcribe(audio, task_type, model_id, output_lang):
66
  result = pipeline(audio)
67
  return result["text"]
68
 
69
- #available models
70
  def update_model_options(task_type):
71
  if task_type == "ST":
72
- return gr.update(choices=["FBK-MT/fama-small", "FBK-MT/fama-medium"], value="FBK-MT/fama-small")
 
 
 
73
  else:
74
- return gr.update(choices=[
75
  "FBK-MT/fama-small",
76
  "FBK-MT/fama-medium",
77
  "FBK-MT/fama-small-asr",
78
  "FBK-MT/fama-medium-asr"
79
- ], value="FBK-MT/fama-small")
80
-
 
 
 
 
 
 
 
 
 
 
81
  # Language options (languages supported by FAMA models)
82
  language_choices = ["en", "it"]
83
 
84
- # Cache loaded models to avoid reloading
85
- model_cache = {}
86
 
87
  if __name__ == "__main__":
88
 
89
  with gr.Blocks() as iface:
90
- gr.Markdown("""## FAMA ASR and ST\nSimple Automatic Speech Recognition and Speech Translation demo powered by FAMA models, developed at FBK. \
91
  More informations about FAMA models can be found here: https://huggingface.co/collections/FBK-MT/fama-683425df3fb2b3171e0cdc9e""")
92
-
93
- with gr.Row():
94
- audio_input = gr.Audio(type="filepath", label="Upload or record audio")
95
- task_type_input = gr.Radio(choices=["ASR", "ST"], value="ASR", label="Select task type")
 
 
96
 
97
  model_input = gr.Radio(choices=[
98
  "FBK-MT/fama-small",
@@ -101,13 +119,20 @@ if __name__ == "__main__":
101
  "FBK-MT/fama-medium-asr"
102
  ], value="FBK-MT/fama-small", label="Select a FAMA model")
103
 
104
- lang_input = gr.Dropdown(choices=language_choices, value="it", label="Transcription language")
105
-
106
  output = gr.Textbox(label="Transcription")
107
 
108
- task_type_input.change(fn=update_model_options, inputs=task_type_input, outputs=model_input)
109
-
110
  transcribe_btn = gr.Button("Transcribe")
 
 
 
111
  transcribe_btn.click(fn=transcribe, inputs=[audio_input, task_type_input, model_input, lang_input], outputs=output)
 
 
 
 
 
 
 
 
112
 
113
  iface.launch()
 
1
  """
2
  Description:
3
+ This script presents a Gradio demo for the ASR/ST FAMA models developed at FBK.
4
 
5
  Dependencies:
6
  all the necessary dependencies are listed in requirements.txt
 
18
  from transformers import AutoProcessor, pipeline
19
  from datasets import load_dataset
20
 
21
+ def load_fama(model_id, input_lang, task_type):
22
  processor = AutoProcessor.from_pretrained(model_id)
23
 
24
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
25
  tgt_lang = "it"
26
 
27
+ # Select the output language based on the utterance language and the task type
28
+ output_lang = ""
29
+ if task_type == "ASR":
30
+ output_lang = input_lang
31
+ elif task_type == "ST" and input_lang == "it":
32
+ output_lang = "en"
33
+ elif task_type == "ST" and input_lang == "en":
34
+ output_lang = "it"
35
+
36
  # Force the model to start with the language tag
37
  lang_tag = "<lang:{}>".format(output_lang)
38
  lang_tag_id = processor.tokenizer.convert_tokens_to_ids(lang_tag)
 
56
  y, sr = lb.load(audio_path, sr=16000, mono=True)
57
  return y
58
 
59
+ def transcribe(audio, task_type, model_id, input_lang):
60
  """
61
  Function called by gradio interface. It runs model inference on an audio sample
62
  """
63
+ pipeline = load_fama(model_id, input_lang, task_type)
 
 
 
 
64
 
65
  if isinstance(audio, str) and os.path.isfile(audio):
66
  #load the audio with Librosa
 
71
  result = pipeline(audio)
72
  return result["text"]
73
 
74
+
75
  def update_model_options(task_type):
76
  if task_type == "ST":
77
+ model_choices = ["FBK-MT/fama-small", "FBK-MT/fama-medium"]
78
+ default_model = "FBK-MT/fama-small"
79
+ button_label = "Translate"
80
+ textbox_label = "Translation"
81
  else:
82
+ model_choices = [
83
  "FBK-MT/fama-small",
84
  "FBK-MT/fama-medium",
85
  "FBK-MT/fama-small-asr",
86
  "FBK-MT/fama-medium-asr"
87
+ ]
88
+ default_model = "FBK-MT/fama-small"
89
+ button_label = "Transcribe"
90
+ textbox_label = "Transcription"
91
+
92
+ return (
93
+ gr.update(choices=model_choices, value=default_model),
94
+ gr.update(value=button_label),
95
+ gr.update(label=textbox_label)
96
+ )
97
+
98
+
99
  # Language options (languages supported by FAMA models)
100
  language_choices = ["en", "it"]
101
 
 
 
102
 
103
  if __name__ == "__main__":
104
 
105
  with gr.Blocks() as iface:
106
+ gr.Markdown("""## FAMA ASR and ST\nSimple Automatic Speech Recognition and Speech Translation demo for English and Italian powered by FAMA models, developed at FBK. \
107
  More informations about FAMA models can be found here: https://huggingface.co/collections/FBK-MT/fama-683425df3fb2b3171e0cdc9e""")
108
+ #with gr.Row():
109
+ audio_input = gr.Audio(type="filepath", label="Upload or record audio")
110
+ #task_type_input = gr.Radio(choices=["ASR", "ST"], value="ASR", label="Select task type")
111
+
112
+ lang_input = gr.Dropdown(choices=language_choices, value="it", label="Utterance Language")
113
+ task_type_input = gr.Radio(choices=["ASR", "ST"], value="ASR", label="Select task type")
114
 
115
  model_input = gr.Radio(choices=[
116
  "FBK-MT/fama-small",
 
119
  "FBK-MT/fama-medium-asr"
120
  ], value="FBK-MT/fama-small", label="Select a FAMA model")
121
 
 
 
122
  output = gr.Textbox(label="Transcription")
123
 
 
 
124
  transcribe_btn = gr.Button("Transcribe")
125
+ # Dynamically update the model choices, button text and output label when the task changes
126
+ task_type_input.change(fn=update_model_options, inputs=task_type_input, outputs=[model_input, transcribe_btn, output])
127
+
128
  transcribe_btn.click(fn=transcribe, inputs=[audio_input, task_type_input, model_input, lang_input], outputs=output)
129
+
130
+ gr.Markdown(""" ### Instructions: \n
131
+ 1 - Load an audio file or record yourself talking with a microphone \n
132
+ 2 - Specify the language of the utterance (FAMA supports English and Italian)\n
133
+ 3 - Select the task to run: Speech recognition or Speech Translation. \n
134
+ 4 - Select a FAMA model among the available ones \n
135
+ 4 - Click on Transcribe/Translate
136
+ """)
137
 
138
  iface.launch()