Joshua1808 committed on
Commit
11adebb
1 Parent(s): b5a92d0

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +311 -0
app.py ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tweepy as tw
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import torch
5
+ import numpy as np
6
+ import regex as re
7
+ import pysentimiento
8
+ import geopy
9
+
10
+ from pysentimiento.preprocessing import preprocess_tweet
11
+ from geopy.geocoders import Nominatim
12
+
13
+ from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
14
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification,AdamW
15
# Hub identifier of the fine-tuned sexism classifier; the tokenizer and the
# model weights are both loaded from this same checkpoint.
MODEL_NAME = 'hackathon-pln-es/twitter_sexismo-finetuned-robertuito-exist2021'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
17
+
18
+ import torch
19
# Select the device used for inference: the first CUDA GPU when one is
# available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('I will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# The prediction loops below move their input tensors to `device`, so the
# model has to live on the same device; otherwise a GPU run fails with a
# device-mismatch error on the first forward pass.
model.to(device)
26
+
27
+
28
# Twitter API credentials are read from the environment instead of being
# committed to the repository. A missing variable raises KeyError at start-up,
# naming the variable that has to be configured.
# NOTE(review): the keys and tokens that were previously hard-coded here have
# been published and should be treated as compromised (revoke and regenerate).
import os

consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]
access_token = os.environ["TWITTER_ACCESS_TOKEN"]
access_token_secret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]

auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth, wait_on_rate_limit=True)
35
+
36
def preprocess(text):
    """Normalise a raw tweet into plain ASCII text.

    Steps, in order: drop hyperlinks, decode a few HTML entities, drop
    @mentions, drop non-ASCII characters, strip most punctuation (keeping
    ``.``, ``!`` and ``?``), collapse repeated ``!``/``?``/``.``, remove
    quotes and parentheses, and collapse whitespace.

    Args:
        text: The raw tweet text.

    Returns:
        The cleaned text with single spaces between words.
    """
    # Remove only the URL itself; the text that follows it on the same line
    # is kept.
    text = re.sub(r'https?://\S+', '', text)
    # Decode HTML entities. '&amp;' needs a regex substitution because of the
    # optional ';' (str.replace would look for the pattern text literally).
    text = re.sub(r'&amp;?', 'and', text)
    text = text.replace('&lt;', '<').replace('&gt;', '>')
    # Remove mentions.
    text = re.sub(r'@\w+', '', text)
    # Remove non-ASCII characters.
    # NOTE(review): this also strips accented Spanish letters (á, ñ, ...).
    text = text.encode('ascii', errors='ignore').decode()
    # Remove punctuation except . ! ? -- the hyphen is escaped so that ',-/'
    # is not read as a character range (which would also delete '.').
    text = re.sub(r'[:"#$%&*+,\-/;<=>@\\^_`{|}~]+', '', text)
    # Collapse runs of the punctuation marks that are kept.
    text = re.sub(r'!+', '!', text)
    text = re.sub(r'\?+', '?', text)
    text = re.sub(r'\.+', '.', text)
    # Remove apostrophes and parentheses.
    text = re.sub(r"['()]", '', text)
    # Collapse any whitespace run into a single space and trim the ends.
    return ' '.join(text.split())
62
+
63
+
64
def highlight_survived(s):
    """Return one CSS background style per element of row ``s``.

    Every cell is styled red when ``s.Sexista`` equals 1 and green otherwise.
    """
    if s.Sexista == 1:
        cell_style = 'background-color: red'
    else:
        cell_style = 'background-color: green'
    return [cell_style] * len(s)
66
+
67
def color_survived(val):
    """Return the CSS background style for a single prediction cell.

    The cell is red when ``val`` is the label 'Sexista' and white for any
    other value.
    """
    if val == 'Sexista':
        return 'background-color: red'
    return 'background-color: white'
70
+
71
+
72
# Page-level configuration: wide layout and a global background colour.
st.set_page_config(layout="wide")
st.markdown('<style>body{background-color: Blue;}</style>', unsafe_allow_html=True)

# Two-column header; the title goes in the wider right-hand column.
# NOTE(review): the source lost its indentation, so the exact extent of the
# `with` body is reconstructed -- confirm against the deployed app.
colT1, colT2 = st.columns([2, 8])
with colT2:
    # CSS class used by the page title.
    st.markdown(""" <style> .font {
    font-size:40px ; font-family: 'Cooper Black'; color: #06bf69;}
    </style> """, unsafe_allow_html=True)
    # The title text had been mis-decoded ("An谩lisis"); restored to proper Spanish.
    st.markdown('<p class="font">Análisis de comentarios sexistas en Twitter</p>', unsafe_allow_html=True)

    # Additional CSS classes made available to the rest of the page.
    st.markdown(""" <style> .font1 {
    font-size:28px ; font-family: 'Times New Roman'; color: #8d33ff;}
    </style> """, unsafe_allow_html=True)

    st.markdown(""" <style> .font2 {
    font-size:16px ; font-family: 'Times New Roman'; color: #3358ff;}
    </style> """, unsafe_allow_html=True)
90
+
91
+
92
+
93
+
94
+
95
def analizar_tweets(search_words, number_of_tweets):
    """Classify a user's recent tweets and render them as a coloured table.

    Args:
        search_words: Twitter screen name whose timeline is fetched.
        number_of_tweets: Number of tweets requested from the timeline.

    Returns:
        The Streamlit element created by ``st.table``, or ``None`` when the
        timeline returned no tweets.
    """
    tweets = api.user_timeline(screen_name=search_words, count=number_of_tweets)
    tweet_texts = [preprocess_tweet(tweet.text) for tweet in tweets]
    if not tweet_texts:
        # Nothing to classify: report it instead of failing further down.
        st.warning('No se han encontrado tweets para analizar')
        return None

    encoded = tokenizer.batch_encode_plus(
        tweet_texts,
        max_length=128,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        truncation=True,
    )
    dataset = TensorDataset(
        torch.tensor(encoded["input_ids"]),
        torch.tensor(encoded["attention_mask"]),
    )
    dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=25)

    model.eval()
    batch_logits = []
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for input_ids, attention_mask in dataloader:
            # Send the inputs to wherever the model's weights currently live.
            outputs = model(
                input_ids.to(model.device),
                token_type_ids=None,
                attention_mask=attention_mask.to(model.device),
            )
            batch_logits.append(outputs[0].cpu())

    # Use the logits of ALL batches (not just the last one) so every tweet
    # gets its own score, and turn them into real probabilities with softmax.
    probabilities = torch.softmax(torch.cat(batch_logits), dim=1)
    predicted = probabilities.argmax(dim=1).numpy()
    confidence = probabilities.max(dim=1).values.numpy()

    df = pd.DataFrame({
        'Tweets': tweet_texts,
        'Prediccion': predicted,
        'Probabilidad': confidence,  # probability of the predicted label
    })
    df['Prediccion'] = np.where(df['Prediccion'] == 0, 'No Sexista', 'Sexista')
    # 'RT|@' is a regular expression; state it explicitly because the pandas
    # default for `regex` differs between versions.
    df['Tweets'] = df['Tweets'].str.replace('RT|@', '', regex=True)

    tabla = st.table(
        df.reset_index(drop=True).head(30).style.applymap(color_survived, subset=['Prediccion'])
    )
    return tabla
147
+
148
def analizar_frase(frase):
    """Classify a single phrase and render the result as a one-row table.

    Args:
        frase: The text to classify.

    Returns:
        The Streamlit element created by ``st.table``, or ``None`` when
        ``frase`` is empty or only whitespace.
    """
    if not frase or not frase.strip():
        st.warning('Introduzca una frase para analizar')
        return None

    encoded = tokenizer.batch_encode_plus(
        [frase],
        max_length=128,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        truncation=True,
    )
    # Send the inputs to wherever the model's weights currently live.
    input_ids = torch.tensor(encoded["input_ids"]).to(model.device)
    attention_mask = torch.tensor(encoded["attention_mask"]).to(model.device)

    model.eval()
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
    logits = outputs[0].cpu()

    # Label and score both come from this single forward pass, so they always
    # agree. Softmax turns the logits into probabilities; the reported value
    # is the probability of the predicted label.
    probabilities = torch.softmax(logits, dim=1)[0]
    prediccion = int(probabilities.argmax())
    probabilidad = float(probabilities.max())

    # Build the result table with plain scalars in each cell.
    text = pd.DataFrame({
        'Frase': [frase],
        'Prediccion': [prediccion],
        'Probabilidad': [probabilidad],
    })
    text['Prediccion'] = np.where(text['Prediccion'] == 0, 'No Sexista', 'Sexista')

    tabla = st.table(
        text.reset_index(drop=True).head(30).style.applymap(color_survived, subset=['Prediccion'])
    )
    return tabla
203
+
204
def tweets_localidad(buscar_localidad):
    """Classify recent Spanish tweets posted near a location.

    The location name is geocoded with Nominatim, up to 50 tweets within a
    200 km radius are requested, each one is classified, and the results are
    rendered as a coloured table followed by a bar chart of label counts.

    Args:
        buscar_localidad: Free-text name of the place to search around.

    Returns:
        A DataFrame with columns 'Tweets', 'Prediccion' and 'Probabilidad';
        it is empty when the place cannot be geocoded or no tweets are found.
    """
    columnas = ['Tweets', 'Prediccion', 'Probabilidad']

    geolocator = Nominatim(user_agent="nombre_del_usuario")
    location = geolocator.geocode(buscar_localidad)
    if location is None:
        # The geocoder found no match for the given text.
        st.warning('No se ha encontrado la localidad indicada')
        return pd.DataFrame(columns=columnas)

    radius = "200km"
    tweets = api.search(
        lang="es",
        geocode=f"{location.latitude},{location.longitude},{radius}",
        count=50,
    )
    tweet_texts = [preprocess_tweet(tweet.text) for tweet in tweets]
    if not tweet_texts:
        st.warning('No se han encontrado tweets para analizar')
        return pd.DataFrame(columns=columnas)

    encoded = tokenizer.batch_encode_plus(
        tweet_texts,
        max_length=128,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        truncation=True,
    )
    dataset = TensorDataset(
        torch.tensor(encoded["input_ids"]),
        torch.tensor(encoded["attention_mask"]),
    )
    dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=25)

    model.eval()
    batch_logits = []
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for input_ids, attention_mask in dataloader:
            # Send the inputs to wherever the model's weights currently live.
            outputs = model(
                input_ids.to(model.device),
                token_type_ids=None,
                attention_mask=attention_mask.to(model.device),
            )
            batch_logits.append(outputs[0].cpu())

    # Use the logits of ALL batches (not just the last one) so every tweet
    # gets its own score, and turn them into real probabilities with softmax.
    probabilities = torch.softmax(torch.cat(batch_logits), dim=1)
    predicted = probabilities.argmax(dim=1).numpy()
    confidence = probabilities.max(dim=1).values.numpy()

    df = pd.DataFrame({
        'Tweets': tweet_texts,
        'Prediccion': predicted,
        'Probabilidad': confidence,  # probability of the predicted label
    })
    df['Prediccion'] = np.where(df['Prediccion'] == 0, 'No Sexista', 'Sexista')

    st.table(
        df.reset_index(drop=True).head(50).style.applymap(color_survived, subset=['Prediccion'])
    )

    # Count tweets per label from the 'Prediccion' column; reindex so both
    # labels are always present, even when one of them has zero tweets.
    conteo = (
        df['Prediccion']
        .value_counts()
        .reindex(['Sexista', 'No Sexista'], fill_value=0)
        .rename('Cantidad de tweets')
        .to_frame()
    )
    # Draw the chart with Streamlit itself: the file imports no plotting
    # library, and a chart has to be rendered through Streamlit to appear on
    # the page.
    st.write('Cantidad de tweets sexistas y no sexistas')
    st.bar_chart(conteo)

    return df
277
+
278
+
279
+
280
+
281
def run():
    """Render the input form and dispatch to the selected analysis.

    The user types a text and ticks exactly one of three checkboxes:
    'Término' classifies the text itself, 'Usuario' classifies that user's
    recent tweets, and 'Localidad' classifies tweets posted near that place.
    An error message is shown when zero or more than one checkbox is ticked,
    or when the text field is empty.
    """
    with st.form("my_form"):
        col, _, _ = st.columns([2, 2, 1])
        st.write("Escoja una Opción")
        search_words = col.text_input("Introduzca el termino, usuario o localidad para analizar y pulse el check correspondiente")
        number_of_tweets = col.number_input('Introduzca número de tweets a analizar. Máximo 50', 0, 50, 10)
        termino = st.checkbox('Término')
        usuario = st.checkbox('Usuario')
        localidad = st.checkbox('Localidad')
        submit_button = col.form_submit_button(label='Analizar')

        if not submit_button:
            return

        # Exactly one checkbox must be ticked. Counting them also rejects
        # the case of two ticked boxes, not only all three.
        seleccionados = sum([termino, usuario, localidad])
        if seleccionados == 0:
            st.text('Error no se ha seleccionado ningun check')
        elif seleccionados > 1:
            st.text('Error se han seleccionado varios check')
        elif not search_words.strip():
            st.text('Error no se ha introducido ningun texto')
        elif termino:
            analizar_frase(search_words)
        elif usuario:
            analizar_tweets(search_words, number_of_tweets)
        else:
            tweets_localidad(search_words)


run()