Lorenzoncina committed on
Commit
0421e65
·
1 Parent(s): 9934c70

new demo interface

Browse files
Files changed (2) hide show
  1. app.py +78 -15
  2. requirements.txt +1 -1
app.py CHANGED
@@ -1,16 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
 
2
  import gradio as gr
3
  from transformers import AutoProcessor, pipeline
4
  from datasets import load_dataset
5
 
6
- def load_fama(model_id):
7
  processor = AutoProcessor.from_pretrained(model_id)
8
 
9
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
10
- tgt_lang = "en"
11
 
12
  # Force the model to start with the language tag
13
- lang_tag = "<lang:{}>".format(tgt_lang)
14
  lang_tag_id = processor.tokenizer.convert_tokens_to_ids(lang_tag)
15
 
16
  generate_kwargs = {"num_beams": 5, "no_repeat_ngram_size": 5, "forced_bos_token_id": lang_tag_id}
@@ -26,23 +41,71 @@ def load_fama(model_id):
26
  )
27
  return pipe
28
 
29
- #load fama model
30
- model_id = "FBK-MT/fama-small"
31
- pipeline = load_fama(model_id)
32
-
33
 
34
- def transcribe(audio):
35
  """
36
  Function called by gradio interface. It runs model inference on an audio sample
37
  """
38
- dataset = load_dataset("distil-whisper/librispeech_asr_dummy", "clean", split="validation")
39
- sample = dataset[0]["audio"]
40
- result = pipeline(sample)
 
 
 
 
 
 
 
 
 
 
41
  return result["text"]
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
  if __name__ == "__main__":
45
-
46
- #run gradio interface
47
- iface = gr.Interface(fn=transcribe, inputs=gr.Audio(type="filepath"), outputs="text")
48
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Description:
3
+ This script presents a Gradio demo for the ASR/ST FAMA models developed at FBK
4
+
5
+ Dependencies:
6
+ all the necessary dependencies are listed in requirements.txt
7
+
8
+ Usage:
9
+ The demo can be run locally by installing all necessary dependencies in a Python virtual env, or it can be run in a Hugging Face Space
10
+
11
+ Author: Lorenzo Concina
12
+ Date: 4/6/2025
13
+ """
14
+ import os
15
  import torch
16
+ import librosa as lb
17
  import gradio as gr
18
  from transformers import AutoProcessor, pipeline
19
  from datasets import load_dataset
20
 
21
+ def load_fama(model_id, output_lang):
22
  processor = AutoProcessor.from_pretrained(model_id)
23
 
24
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
25
+ tgt_lang = "it"
26
 
27
  # Force the model to start with the language tag
28
+ lang_tag = "<lang:{}>".format(output_lang)
29
  lang_tag_id = processor.tokenizer.convert_tokens_to_ids(lang_tag)
30
 
31
  generate_kwargs = {"num_beams": 5, "no_repeat_ngram_size": 5, "forced_bos_token_id": lang_tag_id}
 
41
  )
42
  return pipe
43
 
44
def load_audio_file(audio_path):
    """Decode ``audio_path`` with librosa and return the audio samples.

    The file is loaded with ``sr=16000`` and ``mono=True``; the sample rate
    that librosa returns alongside the samples is discarded.
    """
    waveform, _ = lb.load(audio_path, sr=16000, mono=True)
    return waveform
 
47
 
48
def transcribe(audio, task_type, model_id, output_lang):
    """
    Function called by gradio interface. It runs model inference on an audio sample.

    Args:
        audio: Value received from the audio component. A string pointing to
            an existing file is decoded with ``load_audio_file``; any other
            non-``None`` value is forwarded to the pipeline unchanged.
        task_type: Task selected in the UI. Currently unused by this function.
        model_id: Model identifier forwarded to ``load_fama``.
        output_lang: Output language code forwarded to ``load_fama``.

    Returns:
        The ``"text"`` entry of the pipeline result.

    Raises:
        gr.Error: If no audio was provided.
    """
    if audio is None:
        # Nothing was uploaded or recorded: report it to the user up front
        # instead of loading a model and handing None to the pipeline.
        raise gr.Error("Please upload or record an audio clip before transcribing.")

    # One pipeline per (model, language) pair, built only on first use.
    cache_key = (model_id, output_lang)
    if cache_key not in model_cache:
        model_cache[cache_key] = load_fama(model_id, output_lang)
    # Named `asr_pipe` so the imported `pipeline` factory is not shadowed.
    asr_pipe = model_cache[cache_key]

    if isinstance(audio, str) and os.path.isfile(audio):
        # Load the audio with librosa.
        result = asr_pipe(load_audio_file(audio))
    else:
        # Not a path to an existing file: pass the value through as received.
        # NOTE(review): the original comment said "user used the mic", but a
        # filepath-typed audio input presumably yields a path for recordings
        # too - confirm whether this branch is still reachable from the UI.
        result = asr_pipe(audio)
    return result["text"]
66
 
67
+ #available models
68
def update_model_options(task_type):
    """Return a Gradio update with the model choices for ``task_type``.

    For ``"ST"`` only the two general FAMA checkpoints are offered; for any
    other value the two ``-asr`` checkpoints are offered as well. The
    selection is always reset to ``"FBK-MT/fama-small"``.
    """
    choices = ["FBK-MT/fama-small", "FBK-MT/fama-medium"]
    if task_type != "ST":
        choices += ["FBK-MT/fama-small-asr", "FBK-MT/fama-medium-asr"]
    return gr.update(choices=choices, value="FBK-MT/fama-small")
78
+
79
# Output languages offered in the UI (languages supported by FAMA models).
language_choices = ["en", "it"]

# Pipelines already loaded, keyed by (model_id, output_lang), so that a given
# combination is only built once.
model_cache = {}
84
 
85
if __name__ == "__main__":
    # NOTE(review): this block was recovered from a flattened diff view, so
    # the nesting below (which components sit inside the Row, and whether
    # launch() is inside the Blocks context) is reconstructed - confirm it
    # against the real app.py before applying.
    all_models = [
        "FBK-MT/fama-small",
        "FBK-MT/fama-medium",
        "FBK-MT/fama-small-asr",
        "FBK-MT/fama-medium-asr",
    ]

    with gr.Blocks() as iface:
        # Page header.
        gr.Markdown("""## FAMA ASR and ST\nSimple Automatic Speech Recognition and Speech Translation demo powered by FAMA models, developed at FBK. \
        More informations about FAMA models can be found here: https://huggingface.co/collections/FBK-MT/fama-683425df3fb2b3171e0cdc9e""")

        # Audio input next to the task selector.
        with gr.Row():
            audio_input = gr.Audio(type="filepath", label="Upload or record audio")
            task_type_input = gr.Radio(choices=["ASR", "ST"], value="ASR", label="Select task type")

        model_input = gr.Radio(choices=all_models, value="FBK-MT/fama-small", label="Select a FAMA model")
        lang_input = gr.Dropdown(choices=language_choices, value="it", label="Transcription language")
        output = gr.Textbox(label="Transcription")
        transcribe_btn = gr.Button("Transcribe")

        # Changing the task refreshes the list of selectable models.
        task_type_input.change(fn=update_model_options, inputs=task_type_input, outputs=model_input)

        # The button runs inference and writes the text into the output box.
        transcribe_btn.click(
            fn=transcribe,
            inputs=[audio_input, task_type_input, model_input, lang_input],
            outputs=output,
        )

    iface.launch()
requirements.txt CHANGED
@@ -39,7 +39,7 @@ mpmath==1.3.0
39
  msgpack==1.1.0
40
  multidict==6.4.4
41
  multiprocess==0.70.16
42
- networkx==3.4.2
43
  numba==0.61.2
44
  numpy==2.2.6
45
  nvidia-cublas-cu12==12.6.4.1
 
39
  msgpack==1.1.0
40
  multidict==6.4.4
41
  multiprocess==0.70.16
42
+ networkx==3.5
43
  numba==0.61.2
44
  numpy==2.2.6
45
  nvidia-cublas-cu12==12.6.4.1