storresbusquets committed on
Commit
a9bc837
·
1 Parent(s): cad98a9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +122 -49
app.py CHANGED
@@ -1,11 +1,11 @@
1
  import gradio as gr
2
  import whisper
3
  from pytube import YouTube
4
- from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
5
  from wordcloud import WordCloud
6
- import matplotlib.pyplot as plt
7
 
8
- class GradioInference():
 
9
  def __init__(self):
10
  self.sizes = list(whisper._MODELS.keys())
11
  self.langs = ["none"] + sorted(list(whisper.tokenizer.LANGUAGES.values()))
@@ -13,10 +13,14 @@ class GradioInference():
13
  self.loaded_model = whisper.load_model(self.current_size)
14
  self.yt = None
15
  self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
16
-
17
  # Initialize VoiceLabT5 model and tokenizer
18
- self.keyword_model = T5ForConditionalGeneration.from_pretrained("Voicelab/vlt5-base-keywords")
19
- self.keyword_tokenizer = T5Tokenizer.from_pretrained("Voicelab/vlt5-base-keywords")
 
 
 
 
20
 
21
  # Sentiment Classifier
22
  self.classifier = pipeline("text-classification")
@@ -32,35 +36,46 @@ class GradioInference():
32
  if size != self.current_size:
33
  self.loaded_model = whisper.load_model(size)
34
  self.current_size = size
35
-
36
  results = self.loaded_model.transcribe(path, language=lang)
37
-
38
  # Perform summarization on the transcription
39
- transcription_summary = self.summarizer(results["text"], max_length=130, min_length=30, do_sample=False)
 
 
40
 
41
  # Extract keywords using VoiceLabT5
42
  task_prefix = "Keywords: "
43
  input_sequence = task_prefix + results["text"]
44
- input_ids = self.keyword_tokenizer(input_sequence, return_tensors="pt", truncation=False).input_ids
45
- output = self.keyword_model.generate(input_ids, no_repeat_ngram_size=3, num_beams=4)
 
 
 
 
46
  predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
47
- keywords = [x.strip() for x in predicted.split(',') if x.strip()]
48
 
49
  label = self.classifier(results["text"])[0]["label"]
50
  wordcloud = WordCloud().generate(results["text"])
51
  wordcloud_image = wordcloud.to_image()
52
-
53
- return results["text"], transcription_summary[0]["summary_text"], keywords, label, wordcloud_image
 
 
 
 
 
 
54
 
55
  def populate_metadata(self, link):
56
  self.yt = YouTube(link)
57
  return self.yt.thumbnail_url, self.yt.title
58
 
59
-
60
  def from_audio_input(self, lang, size, audio_file):
61
  if lang == "none":
62
  lang = None
63
-
64
  if size != self.current_size:
65
  self.loaded_model = whisper.load_model(size)
66
  self.current_size = size
@@ -68,21 +83,35 @@ class GradioInference():
68
  results = self.loaded_model.transcribe(audio_file, language=lang)
69
 
70
  # Perform summarization on the transcription
71
- transcription_summary = self.summarizer(results["text"], max_length=130, min_length=30, do_sample=False)
 
 
72
 
73
  # Extract keywords using VoiceLabT5
74
  task_prefix = "Keywords: "
75
  input_sequence = task_prefix + results["text"]
76
- input_ids = self.keyword_tokenizer(input_sequence, return_tensors="pt", truncation=False).input_ids
77
- output = self.keyword_model.generate(input_ids, no_repeat_ngram_size=3, num_beams=4)
 
 
 
 
78
  predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
79
- keywords = [x.strip() for x in predicted.split(',') if x.strip()]
80
 
81
  label = self.classifier(results["text"])[0]["label"]
82
- wordcloud = WordCloud(width=800, height=400, background_color='white').generate(results["text"])
 
 
83
  wordcloud_image = wordcloud.to_image()
84
-
85
- return results["text"], transcription_summary[0]["summary_text"], keywords, label, wordcloud_image
 
 
 
 
 
 
86
 
87
 
88
  gio = GradioInference()
@@ -107,62 +136,106 @@ with block as demo:
107
  with gr.Tab("From YouTube"):
108
  with gr.Box():
109
  with gr.Row().style(equal_height=True):
110
- size = gr.Dropdown(label="Model Size", choices=gio.sizes, value='base')
111
- lang = gr.Dropdown(label="Language (Optional)", choices=gio.langs, value="none")
112
- link = gr.Textbox(label="YouTube Link", placeholder="Enter YouTube link...")
 
 
 
 
 
 
113
  title = gr.Label(label="Video Title")
114
  with gr.Row().style(equal_height=True):
115
  img = gr.Image(label="Thumbnail")
116
- text = gr.Textbox(label="Transcription", placeholder="Transcription Output...", lines=10).style(show_copy_button=True, container=True)
 
 
 
 
117
  with gr.Row().style(equal_height=True):
118
- summary = gr.Textbox(label="Summary", placeholder="Summary Output...", lines=5).style(show_copy_button=True, container=True)
119
- keywords = gr.Textbox(label="Keywords", placeholder="Keywords Output...", lines=5).style(show_copy_button=True, container=True)
 
 
 
 
120
  label = gr.Label(label="Sentiment Analysis")
121
- with gr.Row().style(equal_height=True):
122
  # Display the Word Cloud
123
  wordcloud_image = gr.Image()
124
  with gr.Row().style(equal_height=True):
125
- clear = gr.ClearButton([link, title, img, text, summary, keywords, label], scale=1)
126
- btn = gr.Button("Get video insights", variant='primary', scale=1)
127
- btn.click(gio, inputs=[link, lang, size], outputs=[text, summary, keywords, label, wordcloud_image])
 
 
 
 
 
 
128
  link.change(gio.populate_metadata, inputs=[link], outputs=[img, title])
129
 
130
  with gr.Tab("From Audio file"):
131
  with gr.Box():
132
  with gr.Row().style(equal_height=True):
133
- size = gr.Dropdown(label="Model Size", choices=gio.sizes, value='base')
134
- lang = gr.Dropdown(label="Language (Optional)", choices=gio.langs, value="none")
 
 
 
 
135
  audio_file = gr.Audio(type="filepath")
136
  with gr.Row().style(equal_height=True):
137
- text = gr.Textbox(label="Transcription", placeholder="Transcription Output...", lines=10).style(show_copy_button=True, container=False)
 
 
 
 
138
  with gr.Row().style(equal_height=True):
139
- summary = gr.Textbox(label="Summary", placeholder="Summary Output", lines=5)
140
- keywords = gr.Textbox(label="Keywords", placeholder="Keywords Output", lines=5)
 
 
 
 
141
  label = gr.Label(label="Sentiment Analysis")
142
  with gr.Row().style(equal_height=True):
143
  clear = gr.ClearButton([text], scale=1)
144
- btn = gr.Button("Get video insights", variant='primary', scale=1) # Updated button label
145
- btn.click(gio.from_audio_input, inputs=[lang, size, audio_file], outputs=[text, summary, keywords, label, wordcloud_image])
146
-
 
 
 
 
 
 
147
 
148
  with block:
149
  gr.Markdown("### Video Examples")
150
  gr.Examples(["https://www.youtube.com/shorts/xDNzz8yAH7I"], inputs=link)
151
-
152
  gr.Markdown("About the app:")
153
-
154
  with gr.Accordion("What is YouTube Insights?", open=False):
155
- gr.Markdown("YouTube Insights is a tool developed with academic purposes only, that creates summaries, keywords and sentiments analysis based on YouTube videos or user audio files.")
156
-
 
 
157
  with gr.Accordion("How does it work?", open=False):
158
- gr.Markdown("Works by using OpenAI's Whisper, DistilBART for summarization and VoiceLabT5 for Keyword Extraction.")
 
 
159
 
160
- gr.HTML("""
 
161
  <div style="text-align: center; max-width: 500px; margin: 0 auto;">
162
  <p style="margin-bottom: 10px; font-size: 96%">
163
  2023 Master in Big Data & Data Science - Universidad Complutense de Madrid
164
  </p>
165
  </div>
166
- """)
 
167
 
168
- demo.launch()
 
1
  import gradio as gr
2
  import whisper
3
  from pytube import YouTube
4
+ from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
5
  from wordcloud import WordCloud
 
6
 
7
+
8
+ class GradioInference:
9
  def __init__(self):
10
  self.sizes = list(whisper._MODELS.keys())
11
  self.langs = ["none"] + sorted(list(whisper.tokenizer.LANGUAGES.values()))
 
13
  self.loaded_model = whisper.load_model(self.current_size)
14
  self.yt = None
15
  self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
16
+
17
  # Initialize VoiceLabT5 model and tokenizer
18
+ self.keyword_model = T5ForConditionalGeneration.from_pretrained(
19
+ "Voicelab/vlt5-base-keywords"
20
+ )
21
+ self.keyword_tokenizer = T5Tokenizer.from_pretrained(
22
+ "Voicelab/vlt5-base-keywords"
23
+ )
24
 
25
  # Sentiment Classifier
26
  self.classifier = pipeline("text-classification")
 
36
  if size != self.current_size:
37
  self.loaded_model = whisper.load_model(size)
38
  self.current_size = size
39
+
40
  results = self.loaded_model.transcribe(path, language=lang)
41
+
42
  # Perform summarization on the transcription
43
+ transcription_summary = self.summarizer(
44
+ results["text"], max_length=130, min_length=30, do_sample=False
45
+ )
46
 
47
  # Extract keywords using VoiceLabT5
48
  task_prefix = "Keywords: "
49
  input_sequence = task_prefix + results["text"]
50
+ input_ids = self.keyword_tokenizer(
51
+ input_sequence, return_tensors="pt", truncation=False
52
+ ).input_ids
53
+ output = self.keyword_model.generate(
54
+ input_ids, no_repeat_ngram_size=3, num_beams=4
55
+ )
56
  predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
57
+ keywords = [x.strip() for x in predicted.split(",") if x.strip()]
58
 
59
  label = self.classifier(results["text"])[0]["label"]
60
  wordcloud = WordCloud().generate(results["text"])
61
  wordcloud_image = wordcloud.to_image()
62
+
63
+ return (
64
+ results["text"],
65
+ transcription_summary[0]["summary_text"],
66
+ keywords,
67
+ label,
68
+ wordcloud_image,
69
+ )
70
 
71
  def populate_metadata(self, link):
72
  self.yt = YouTube(link)
73
  return self.yt.thumbnail_url, self.yt.title
74
 
 
75
  def from_audio_input(self, lang, size, audio_file):
76
  if lang == "none":
77
  lang = None
78
+
79
  if size != self.current_size:
80
  self.loaded_model = whisper.load_model(size)
81
  self.current_size = size
 
83
  results = self.loaded_model.transcribe(audio_file, language=lang)
84
 
85
  # Perform summarization on the transcription
86
+ transcription_summary = self.summarizer(
87
+ results["text"], max_length=130, min_length=30, do_sample=False
88
+ )
89
 
90
  # Extract keywords using VoiceLabT5
91
  task_prefix = "Keywords: "
92
  input_sequence = task_prefix + results["text"]
93
+ input_ids = self.keyword_tokenizer(
94
+ input_sequence, return_tensors="pt", truncation=False
95
+ ).input_ids
96
+ output = self.keyword_model.generate(
97
+ input_ids, no_repeat_ngram_size=3, num_beams=4
98
+ )
99
  predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
100
+ keywords = [x.strip() for x in predicted.split(",") if x.strip()]
101
 
102
  label = self.classifier(results["text"])[0]["label"]
103
+ wordcloud = WordCloud(width=800, height=400, background_color="white").generate(
104
+ results["text"]
105
+ )
106
  wordcloud_image = wordcloud.to_image()
107
+
108
+ return (
109
+ results["text"],
110
+ transcription_summary[0]["summary_text"],
111
+ keywords,
112
+ label,
113
+ wordcloud_image,
114
+ )
115
 
116
 
117
  gio = GradioInference()
 
136
  with gr.Tab("From YouTube"):
137
  with gr.Box():
138
  with gr.Row().style(equal_height=True):
139
+ size = gr.Dropdown(
140
+ label="Model Size", choices=gio.sizes, value="base"
141
+ )
142
+ lang = gr.Dropdown(
143
+ label="Language (Optional)", choices=gio.langs, value="none"
144
+ )
145
+ link = gr.Textbox(
146
+ label="YouTube Link", placeholder="Enter YouTube link..."
147
+ )
148
  title = gr.Label(label="Video Title")
149
  with gr.Row().style(equal_height=True):
150
  img = gr.Image(label="Thumbnail")
151
+ text = gr.Textbox(
152
+ label="Transcription",
153
+ placeholder="Transcription Output...",
154
+ lines=10,
155
+ ).style(show_copy_button=True, container=True)
156
  with gr.Row().style(equal_height=True):
157
+ summary = gr.Textbox(
158
+ label="Summary", placeholder="Summary Output...", lines=5
159
+ ).style(show_copy_button=True, container=True)
160
+ keywords = gr.Textbox(
161
+ label="Keywords", placeholder="Keywords Output...", lines=5
162
+ ).style(show_copy_button=True, container=True)
163
  label = gr.Label(label="Sentiment Analysis")
164
+ with gr.Row().style(equal_height=True):
165
  # Display the Word Cloud
166
  wordcloud_image = gr.Image()
167
  with gr.Row().style(equal_height=True):
168
+ clear = gr.ClearButton(
169
+ [link, title, img, text, summary, keywords, label], scale=1
170
+ )
171
+ btn = gr.Button("Get video insights", variant="primary", scale=1)
172
+ btn.click(
173
+ gio,
174
+ inputs=[link, lang, size],
175
+ outputs=[text, summary, keywords, label, wordcloud_image],
176
+ )
177
  link.change(gio.populate_metadata, inputs=[link], outputs=[img, title])
178
 
179
  with gr.Tab("From Audio file"):
180
  with gr.Box():
181
  with gr.Row().style(equal_height=True):
182
+ size = gr.Dropdown(
183
+ label="Model Size", choices=gio.sizes, value="base"
184
+ )
185
+ lang = gr.Dropdown(
186
+ label="Language (Optional)", choices=gio.langs, value="none"
187
+ )
188
  audio_file = gr.Audio(type="filepath")
189
  with gr.Row().style(equal_height=True):
190
+ text = gr.Textbox(
191
+ label="Transcription",
192
+ placeholder="Transcription Output...",
193
+ lines=10,
194
+ ).style(show_copy_button=True, container=False)
195
  with gr.Row().style(equal_height=True):
196
+ summary = gr.Textbox(
197
+ label="Summary", placeholder="Summary Output", lines=5
198
+ )
199
+ keywords = gr.Textbox(
200
+ label="Keywords", placeholder="Keywords Output", lines=5
201
+ )
202
  label = gr.Label(label="Sentiment Analysis")
203
  with gr.Row().style(equal_height=True):
204
  clear = gr.ClearButton([text], scale=1)
205
+ btn = gr.Button(
206
+ "Get video insights", variant="primary", scale=1
207
+ ) # Updated button label
208
+ btn.click(
209
+ gio.from_audio_input,
210
+ inputs=[lang, size, audio_file],
211
+ outputs=[text, summary, keywords, label, wordcloud_image],
212
+ )
213
+
214
 
215
  with block:
216
  gr.Markdown("### Video Examples")
217
  gr.Examples(["https://www.youtube.com/shorts/xDNzz8yAH7I"], inputs=link)
218
+
219
  gr.Markdown("About the app:")
220
+
221
  with gr.Accordion("What is YouTube Insights?", open=False):
222
+ gr.Markdown(
223
+ "YouTube Insights is a tool developed with academic purposes only, that creates summaries, keywords and sentiments analysis based on YouTube videos or user audio files."
224
+ )
225
+
226
  with gr.Accordion("How does it work?", open=False):
227
+ gr.Markdown(
228
+ "Works by using OpenAI's Whisper, DistilBART for summarization and VoiceLabT5 for Keyword Extraction."
229
+ )
230
 
231
+ gr.HTML(
232
+ """
233
  <div style="text-align: center; max-width: 500px; margin: 0 auto;">
234
  <p style="margin-bottom: 10px; font-size: 96%">
235
  2023 Master in Big Data & Data Science - Universidad Complutense de Madrid
236
  </p>
237
  </div>
238
+ """
239
+ )
240
 
241
+ demo.launch()