T Le committed on
Commit c837e02 · Parent: d4ac1c4

Upload updated files

images/bidirected.png ADDED
images/burst.png ADDED
images/coconut-web.jpg ADDED
images/download_bertopic.jpg ADDED
images/download_bidirected.jpg ADDED
images/download_biterm.jpg ADDED
images/download_sentiment.png ADDED
images/downloadtable.png ADDED
images/lemma.png ADDED
images/scattertext.png ADDED
images/sentiment.png ADDED
images/sentitable.png ADDED
images/sunburst.png ADDED
images/tablenetwork.png ADDED
images/topicmodeling.png ADDED
pages/.DS_Store ADDED
Binary file (6.15 kB)
 
pages/0 FileChecker.py CHANGED
@@ -1,5 +1,7 @@
 import streamlit as st
 import pandas as pd
+import json
+from tools import sourceformat as sf
 
 #===config===
 st.set_page_config(
@@ -34,10 +36,23 @@ def get_ext(extype):
 @st.cache_data(ttl=3600)
 def upload(extype):
     keywords = pd.read_csv(uploaded_file)
+    if "dimensions" in uploaded_file.name.lower():
+        keywords = sf.dim(keywords)
+        col_dict = {'MeSH terms': 'Keywords',
+                    'PubYear': 'Year',
+                    'Times cited': 'Cited by',
+                    'Publication Type': 'Document Type'
+                    }
+        keywords.rename(columns=col_dict, inplace=True)
     return keywords
 
 @st.cache_data(ttl=3600)
 def conv_txt(extype):
+    if("PMID" in (uploaded_file.read()).decode()):
+        uploaded_file.seek(0)
+        papers = sf.medline(uploaded_file)
+        print(papers)
+        return papers
     col_dict = {'TI': 'Title',
                 'SO': 'Source title',
                 'DE': 'Author Keywords',
@@ -45,16 +60,48 @@ def conv_txt(extype):
                 'AB': 'Abstract',
                 'TC': 'Cited by',
                 'PY': 'Year',
-                'ID': 'Keywords Plus'}
-    keywords = pd.read_csv(uploaded_file, sep='\t', lineterminator='\r')
-    keywords.rename(columns=col_dict, inplace=True)
+                'ID': 'Keywords Plus',
+                'rights_date_used': 'Year'}
+    uploaded_file.seek(0)
+    papers = pd.read_csv(uploaded_file, sep='\t')
+    if("htid" in papers.columns):
+        papers = sf.htrc(papers)
+    papers.rename(columns=col_dict, inplace=True)
+    print(papers)
+    return papers
+
+
+@st.cache_data(ttl=3600)
+def conv_json(extype):
+    col_dict={'title': 'title',
+              'rights_date_used': 'Year',
+              'content_provider_code':'Source title'
+              }
+
+    data = json.load(uploaded_file)
+    hathifile = data['gathers']
+    keywords = pd.DataFrame.from_records(hathifile)
+
+    keywords = sf.htrc(keywords)
+    keywords['Cited by'] = keywords.groupby(['Keywords'])['Keywords'].transform('size')
+    keywords.rename(columns=col_dict,inplace=True)
+    return keywords
+
+@st.cache_data(ttl=3600)
+def conv_pub(extype):
+    if (get_ext(extype)).endswith('.tar.gz'):
+        bytedata = extype.read()
+        keywords = sf.readPub(bytedata)
+    elif (get_ext(extype)).endswith('.xml'):
+        bytedata = extype.read()
+        keywords = sf.readxml(bytedata)
     return keywords
 
 st.header('File Checker', anchor=False)
 st.subheader('Put your file here...', anchor=False)
 
 #===read data===
-uploaded_file = st.file_uploader('', type=['csv','txt'], on_change=reset_data)
+uploaded_file = st.file_uploader('', type=['csv','txt','json', 'tar.gz', 'xml'], on_change=reset_data)
 
 if uploaded_file is not None:
     extype = get_ext(uploaded_file)
@@ -64,8 +111,15 @@ if uploaded_file is not None:
     elif extype.endswith('.txt'):
         data = conv_txt(extype)
 
-    col1, col2, col3 = st.columns(3)
+    elif extype.endswith('.json'):
+        data = conv_json(extype)
 
+    elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
+        data = conv_pub(uploaded_file)
+
+
+    col1, col2, col3 = st.columns(3)
+
     with col1:
         #===check keywords===
         keycheck = list(data.columns)
@@ -134,4 +188,4 @@ if uploaded_file is not None:
             container6.write("Unfortunately, you don't have a column containing object in your data. Please check again.")
         else:
            container6.subheader('✔️ Scattertext', divider='blue', anchor=False)
-            container6.write('Congratulations! You can use Scattertext')
+            container6.write('Congratulations! You can use Scattertext')
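
A minimal sketch (not part of the commit) of how the format dispatch introduced above can be exercised outside Streamlit, e.g. in a quick local test. It assumes the repository's tools.sourceformat helpers (sf.dim, sf.medline, sf.htrc) behave as they are used in this diff; the file names below are hypothetical.

import json
import pandas as pd
from tools import sourceformat as sf   # repository helper module used by this page

def load_any(path):
    # CSV: plain export, or a Dimensions export detected by file name
    if path.endswith('.csv'):
        df = pd.read_csv(path)
        if 'dimensions' in path.lower():
            df = sf.dim(df)
            df.rename(columns={'MeSH terms': 'Keywords', 'PubYear': 'Year',
                               'Times cited': 'Cited by',
                               'Publication Type': 'Document Type'}, inplace=True)
        return df
    # TXT: MEDLINE/PubMed export if a PMID field is present, otherwise tab-separated
    if path.endswith('.txt'):
        with open(path, 'rb') as fh:
            if b'PMID' in fh.read():
                fh.seek(0)
                return sf.medline(fh)
        return pd.read_csv(path, sep='\t')
    # JSON: HathiTrust collection file with a 'gathers' record list
    if path.endswith('.json'):
        with open(path, encoding='utf-8') as fh:
            records = json.load(fh)['gathers']
        return sf.htrc(pd.DataFrame.from_records(records))
    raise ValueError('unsupported file type: ' + path)

# e.g. load_any('dimensions_export.csv') or load_any('pubmed_result.txt')
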
pages/1 Scattertext.py CHANGED
@@ -10,6 +10,8 @@ nltk.download('stopwords')
 from nltk.corpus import stopwords
 import time
 import sys
+import json
+from tools import sourceformat as sf
 
 #===config===
 st.set_page_config(
@@ -37,6 +39,9 @@ with st.popover("🔗 Menu"):
     st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
     st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
     st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
+    st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
+    st.page_link("pages/8 Shifterator.py", label="Shifterator", icon="8️⃣")
+    st.page_link("pages/9 Summarization.py", label = "Summarization",icon ="9️⃣")
 
 st.header("Scattertext", anchor=False)
 st.subheader('Put your file here...', anchor=False)
@@ -57,19 +62,66 @@ def upload(extype):
     if 'Publication Year' in papers.columns:
         papers.rename(columns={'Publication Year': 'Year', 'Citing Works Count': 'Cited by',
                                'Publication Type': 'Document Type', 'Source Title': 'Source title'}, inplace=True)
+
+    elif "About the data" in papers.columns[0]:
+        papers = sf.dim(papers)
+        col_dict = {'MeSH terms': 'Keywords',
+                    'PubYear': 'Year',
+                    'Times cited': 'Cited by',
+                    'Publication Type': 'Document Type'
+                    }
+        papers.rename(columns=col_dict, inplace=True)
+
     return papers
 
 @st.cache_data(ttl=3600)
 def conv_txt(extype):
+    if("PMID" in (uploaded_file.read()).decode()):
+        uploaded_file.seek(0)
+        papers = sf.medline(uploaded_file)
+        print(papers)
+        return papers
     col_dict = {'TI': 'Title',
                 'SO': 'Source title',
+                'DE': 'Author Keywords',
                 'DT': 'Document Type',
                 'AB': 'Abstract',
-                'PY': 'Year'}
-    papers = pd.read_csv(uploaded_file, sep='\t', lineterminator='\r')
+                'TC': 'Cited by',
+                'PY': 'Year',
+                'ID': 'Keywords Plus',
+                'rights_date_used': 'Year'}
+    uploaded_file.seek(0)
+    papers = pd.read_csv(uploaded_file, sep='\t')
+    if("htid" in papers.columns):
+        papers = sf.htrc(papers)
     papers.rename(columns=col_dict, inplace=True)
+    print(papers)
     return papers
 
+@st.cache_data(ttl=3600)
+def conv_json(extype):
+    col_dict={'title': 'title',
+              'rights_date_used': 'Year',
+              }
+
+    data = json.load(uploaded_file)
+    hathifile = data['gathers']
+    keywords = pd.DataFrame.from_records(hathifile)
+
+    keywords = sf.htrc(keywords)
+    keywords.rename(columns=col_dict,inplace=True)
+    return keywords
+
+@st.cache_data(ttl=3600)
+def conv_pub(extype):
+    if (get_ext(extype)).endswith('.tar.gz'):
+        bytedata = extype.read()
+        keywords = sf.readPub(bytedata)
+    elif (get_ext(extype)).endswith('.xml'):
+        bytedata = extype.read()
+        keywords = sf.readxml(bytedata)
+    return keywords
+
 @st.cache_data(ttl=3600)
 def get_data(extype):
     df_col = sorted(papers.select_dtypes(include=['object']).columns.tolist())
@@ -122,18 +174,24 @@ def clean_csv(extype):
 
     #===lemmatize===
     lemmatizer = WordNetLemmatizer()
+
+    @st.cache_data(ttl=3600)
     def lemmatize_words(text):
         words = text.split()
         words = [lemmatizer.lemmatize(word) for word in words]
         return ' '.join(words)
+
     paper[ColCho] = paper[ColCho].apply(lemmatize_words)
 
     words_rmv = [word.strip() for word in words_to_remove.split(";")]
     remove_set = set(words_rmv)
+
+    @st.cache_data(ttl=3600)
     def remove_words(text):
         words = text.split()
         cleaned_words = [word for word in words if word not in remove_set]
         return ' '.join(cleaned_words)
+
     paper[ColCho] = paper[ColCho].apply(remove_words)
 
     return paper
@@ -155,7 +213,11 @@ def running_scattertext(cat_col, catname, noncatname):
                                   nlp = stx.whitespace_nlp_with_sentences,
                                   ).build().get_unigram_corpus().remove_infrequent_words(minimum_term_count = min_term)
 
-    st.toast('Building corpus completed', icon='🎉')
+    #table results
+    disp = stx.Dispersion(corpus)
+    disp_df = disp.get_df()
+
+    disp_csv = disp_df.to_csv(index=False).encode('utf-8')
 
     try:
         html = stx.produce_scattertext_explorer(corpus,
@@ -175,11 +237,8 @@ def running_scattertext(cat_col, catname, noncatname):
                                                 width_in_pixels = 900,
                                                 minimum_term_frequency = 0,
                                                 save_svg_button=True)
-
-        st.toast('Process completed', icon='🎉')
-        time.sleep(1)
-        st.toast('Visualizing', icon='⏳')
-        components.html(html, height = 1200, scrolling = True)
+
+        return disp_csv, html
 
     except ValueError:
         st.warning('Please decrease the Minimum term count in the advanced settings.', icon="⚠️")
@@ -226,7 +285,7 @@ def df_years(first_range, second_range):
     return filtered_df
 
 #===Read data===
-uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
+uploaded_file = st.file_uploader('', type=['csv', 'txt', 'json', 'tar.gz','xml'], on_change=reset_all)
 
 if uploaded_file is not None:
     try:
@@ -236,6 +295,10 @@ if uploaded_file is not None:
             papers = upload(extype)
         elif extype.endswith('.txt'):
            papers = conv_txt(extype)
+        elif extype.endswith('.json'):
+            papers = conv_json(extype)
+        elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
+            papers = conv_pub(uploaded_file)
 
         df_col, selected_cols = get_data(extype)
         comparison = check_comparison(extype)
@@ -264,7 +327,7 @@ if uploaded_file is not None:
 
         paper = clean_csv(extype)
 
-        tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
+        tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
 
         with tab1:
             #===visualization===
@@ -286,7 +349,7 @@ if uploaded_file is not None:
                         st.warning(f'We cannot find {text2} in your document.', icon="⚠️")
                     else:
                         with st.spinner('Processing. Please wait until the visualization comes up'):
-                            running_scattertext('Topic', 'First Term', 'Second Term')
+                            disp_df, html = running_scattertext('Topic', 'First Term', 'Second Term')
 
             elif compare == 'Manual label':
                 col1, col2, col3 = st.columns(3)
@@ -313,7 +376,7 @@ if uploaded_file is not None:
                     filtered_df = paper[paper[column_selected].isin([label1, label2])].reset_index(drop=True)
 
                     with st.spinner('Processing. Please wait until the visualization comes up'):
-                        running_scattertext(column_selected, label1, label2)
+                        disp_df, html = running_scattertext(column_selected, label1, label2)
 
             elif compare == 'Sources':
                 col1, col2, col3 = st.columns([4,0.1,4])
@@ -334,7 +397,7 @@ if uploaded_file is not None:
                     filtered_df = df_sources(stitle1, stitle2)
 
                    with st.spinner('Processing. Please wait until the visualization comes up'):
-                        running_scattertext('Source title', stitle1, stitle2)
+                        disp_df, html = running_scattertext('Source title', stitle1, stitle2)
 
             elif compare == 'Years':
                 col1, col2, col3 = st.columns([4,0.1,4])
@@ -348,19 +411,44 @@ if uploaded_file is not None:
                    filtered_df = df_years(first_range, second_range)
 
                    with st.spinner('Processing. Please wait until the visualization comes up'):
-                        running_scattertext('Topic Range', 'First range', 'Second range')
-
+                        disp_df, html = running_scattertext('Topic Range', 'First range', 'Second range')
+
            else:
                st.write('You only have data in ', (MAX))
+
+            if html:
+                st.toast('Process completed', icon='🎉')
+                time.sleep(1)
+                st.toast('Visualizing', icon='⏳')
+                components.html(html, height = 1200, scrolling = True)
+
+                st.download_button(
+                    "📥 Click to download result",
+                    disp_df,
+                    "scattertext_dataframe.csv",
+                    "text/csv",
+                    on_click="ignore")
 
         with tab2:
            st.markdown('**Jason Kessler. 2017. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. In Proceedings of ACL 2017, System Demonstrations, pages 85–90, Vancouver, Canada. Association for Computational Linguistics.** https://doi.org/10.48550/arXiv.1703.00565')
 
        with tab3:
+            st.markdown('**Sánchez-Franco, M. J., & Rey-Tienda, S. (2023). The role of user-generated content in tourism decision-making: an exemplary study of Andalusia, Spain. Management Decision, 62(7).** https://doi.org/10.1108/md-06-2023-0966')
            st.markdown('**Marrone, M., & Linnenluecke, M.K. (2020). Interdisciplinary Research Maps: A new technique for visualizing research topics. PLoS ONE, 15.** https://doi.org/10.1371/journal.pone.0242283')
            st.markdown('**Moreno, A., & Iglesias, C.A. (2021). Understanding Customers’ Transport Services with Topic Clustering and Sentiment Analysis. Applied Sciences.** https://doi.org/10.3390/app112110169')
-            st.markdown('**Sánchez-Franco, M.J., & Rey-Tienda, S. (2023). The role of user-generated content in tourism decision-making: an exemplary study of Andalusia, Spain. Management Decision.** https://doi.org/10.1108/MD-06-2023-0966')
+            st.markdown('**Santosa, F. A. (2025). Artificial Intelligence in Library Studies: A Textual Analysis. JLIS.It, 16(1).** https://doi.org/10.36253/jlis.it-626')
 
-    except:
+        with tab4:
+            st.subheader(':blue[Image]', anchor=False)
+            st.write("Click the :blue[Download SVG] on the right side.")
+            st.divider()
+            st.subheader(':blue[Scattertext Dataframe]', anchor=False)
+            st.button('📥 Click to download result')
+            st.text("Click the Download button to get the CSV result.")
+
+    except NameError:
+        pass
+
+    except Exception as e:
        st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
-        st.stop()
+        st.stop()
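
A minimal sketch (not in the commit) of the dispersion-export pattern that running_scattertext now returns: build a corpus, take the per-term dispersion table via stx.Dispersion as the diff does, and encode it as CSV bytes suitable for st.download_button. The two-row DataFrame is made-up toy data, and the exact columns of get_df() depend on the installed scattertext version.

import scattertext as stx
import pandas as pd

# Toy two-category corpus, standing in for the user's uploaded data
df = pd.DataFrame({
    'text': ['libraries curate digital collections',
             'archives preserve rare manuscripts'],
    'Topic': ['First Term', 'Second Term'],
})

corpus = (stx.CorpusFromPandas(df, category_col='Topic', text_col='text',
                               nlp=stx.whitespace_nlp_with_sentences)
          .build()
          .get_unigram_corpus())

disp_df = stx.Dispersion(corpus).get_df()               # per-term dispersion statistics
disp_csv = disp_df.to_csv(index=False).encode('utf-8')  # bytes payload for st.download_button
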
pages/2 Topic Modeling.py CHANGED
@@ -1,445 +1,659 @@
1
- #import module
2
- import streamlit as st
3
- import streamlit.components.v1 as components
4
- import pandas as pd
5
- import numpy as np
6
- import re
7
- import nltk
8
- nltk.download('wordnet')
9
- from nltk.stem import WordNetLemmatizer
10
- nltk.download('stopwords')
11
- from nltk.corpus import stopwords
12
- #from scipy import triu
13
- import gensim
14
- import gensim.corpora as corpora
15
- from gensim.corpora import Dictionary
16
- from gensim.models.coherencemodel import CoherenceModel
17
- from gensim.models.ldamodel import LdaModel
18
- from pprint import pprint
19
- import pickle
20
- import pyLDAvis
21
- import pyLDAvis.gensim_models as gensimvis
22
- from io import StringIO
23
- from ipywidgets.embed import embed_minimal_html
24
- from nltk.stem.snowball import SnowballStemmer
25
- from bertopic import BERTopic
26
- import plotly.express as px
27
- from sklearn.cluster import KMeans
28
- import bitermplus as btm
29
- import tmplot as tmp
30
- import tomotopy
31
- import sys
32
- import spacy
33
- import en_core_web_md
34
- import pipeline
35
- from html2image import Html2Image
36
- from umap import UMAP
37
- import os
38
- import time
39
-
40
-
41
- #===config===
42
- st.set_page_config(
43
- page_title="Coconut",
44
- page_icon="🥥",
45
- layout="wide",
46
- initial_sidebar_state="collapsed"
47
- )
48
-
49
- hide_streamlit_style = """
50
- <style>
51
- #MainMenu
52
- {visibility: hidden;}
53
- footer {visibility: hidden;}
54
- [data-testid="collapsedControl"] {display: none}
55
- </style>
56
- """
57
- st.markdown(hide_streamlit_style, unsafe_allow_html=True)
58
-
59
- with st.popover("🔗 Menu"):
60
- st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
61
- st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
62
- st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
63
- st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
64
- st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
65
- st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
66
- st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
67
-
68
- st.header("Topic Modeling", anchor=False)
69
- st.subheader('Put your file here...', anchor=False)
70
-
71
- #========unique id========
72
- @st.cache_resource(ttl=3600)
73
- def create_list():
74
- l = [1, 2, 3]
75
- return l
76
-
77
- l = create_list()
78
- first_list_value = l[0]
79
- l[0] = first_list_value + 1
80
- uID = str(l[0])
81
-
82
- @st.cache_data(ttl=3600)
83
- def get_ext(uploaded_file):
84
- extype = uID+uploaded_file.name
85
- return extype
86
-
87
- #===clear cache===
88
-
89
- def reset_biterm():
90
- try:
91
- biterm_map.clear()
92
- biterm_bar.clear()
93
- except NameError:
94
- biterm_topic.clear()
95
-
96
- def reset_all():
97
- st.cache_data.clear()
98
-
99
- #===avoiding deadlock===
100
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
101
-
102
- #===upload file===
103
- @st.cache_data(ttl=3600)
104
- def upload(file):
105
- papers = pd.read_csv(uploaded_file)
106
- return papers
107
-
108
- @st.cache_data(ttl=3600)
109
- def conv_txt(extype):
110
- col_dict = {'TI': 'Title',
111
- 'SO': 'Source title',
112
- 'DT': 'Document Type',
113
- 'AB': 'Abstract',
114
- 'PY': 'Year'}
115
- papers = pd.read_csv(uploaded_file, sep='\t', lineterminator='\r')
116
- papers.rename(columns=col_dict, inplace=True)
117
- return papers
118
-
119
-
120
- #===Read data===
121
- uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
122
-
123
- if uploaded_file is not None:
124
- try:
125
- extype = get_ext(uploaded_file)
126
-
127
- if extype.endswith('.csv'):
128
- papers = upload(extype)
129
- elif extype.endswith('.txt'):
130
- papers = conv_txt(extype)
131
-
132
- coldf = sorted(papers.select_dtypes(include=['object']).columns.tolist())
133
-
134
- c1, c2 = st.columns([3,4])
135
- method = c1.selectbox(
136
- 'Choose method',
137
- ('Choose...', 'pyLDA', 'Biterm', 'BERTopic'), on_change=reset_all)
138
- num_cho = c1.number_input('Choose number of topics', min_value=2, max_value=30, value=5)
139
- ColCho = c2.selectbox(
140
- 'Choose column',
141
- (coldf), on_change=reset_all)
142
- words_to_remove = c2.text_input("Remove specific words. Separate words by semicolons (;)")
143
- rem_copyright = c1.toggle('Remove copyright statement', value=True, on_change=reset_all)
144
- rem_punc = c2.toggle('Remove punctuation', value=True, on_change=reset_all)
145
-
146
- #===clean csv===
147
- @st.cache_data(ttl=3600, show_spinner=False)
148
- def clean_csv(extype):
149
- paper = papers.dropna(subset=[ColCho])
150
-
151
- #===mapping===
152
- paper['Abstract_pre'] = paper[ColCho].map(lambda x: x.lower())
153
- if rem_punc:
154
- paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: re.sub('[,:;\.!-?•=]', ' ', x))
155
- paper['Abstract_pre'] = paper['Abstract_pre'].str.replace('\u201c|\u201d', '', regex=True)
156
- if rem_copyright:
157
- paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: re.sub('©.*', '', x))
158
-
159
- #===stopword removal===
160
- stop = stopwords.words('english')
161
- paper['Abstract_stop'] = paper['Abstract_pre'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
162
-
163
- #===lemmatize===
164
- lemmatizer = WordNetLemmatizer()
165
- def lemmatize_words(text):
166
- words = text.split()
167
- words = [lemmatizer.lemmatize(word) for word in words]
168
- return ' '.join(words)
169
- paper['Abstract_lem'] = paper['Abstract_stop'].apply(lemmatize_words)
170
-
171
- words_rmv = [word.strip() for word in words_to_remove.split(";")]
172
- remove_dict = {word: None for word in words_rmv}
173
- def remove_words(text):
174
- words = text.split()
175
- cleaned_words = [word for word in words if word not in remove_dict]
176
- return ' '.join(cleaned_words)
177
- paper['Abstract_lem'] = paper['Abstract_lem'].map(remove_words)
178
-
179
- topic_abs = paper.Abstract_lem.values.tolist()
180
- return topic_abs, paper
181
-
182
- d1, d2 = st.columns([7,3])
183
- d2.info("Don't do anything during the computing", icon="⚠️")
184
- topic_abs, paper=clean_csv(extype)
185
-
186
- #===advance settings===
187
- with d1.expander("🧮 Show advance settings"):
188
- t1, t2 = st.columns([5,5])
189
- if method == 'pyLDA':
190
- py_random_state = t1.number_input('Random state', min_value=0, max_value=None, step=1)
191
- py_chunksize = t2.number_input('Chunk size', value=100 , min_value=10, max_value=None, step=1)
192
- elif method == 'Biterm':
193
- btm_seed = t1.number_input('Random state seed', value=100 , min_value=1, max_value=None, step=1)
194
- btm_iterations = t2.number_input('Iterations number', value=20 , min_value=2, max_value=None, step=1)
195
- elif method == 'BERTopic':
196
- bert_top_n_words = t1.number_input('top_n_words', value=5 , min_value=5, max_value=25, step=1)
197
- bert_random_state = t1.number_input('random_state', value=42 , min_value=1, max_value=None, step=1)
198
- bert_n_components = t2.number_input('n_components', value=5 , min_value=1, max_value=None, step=1)
199
- bert_n_neighbors = t2.number_input('n_neighbors', value=15 , min_value=1, max_value=None, step=1)
200
- bert_embedding_model = st.radio(
201
- "embedding_model",
202
- ["all-MiniLM-L6-v2", "paraphrase-multilingual-MiniLM-L12-v2", "en_core_web_md"], index=0, horizontal=True)
203
- else:
204
- st.write('Please choose your preferred method')
205
- if st.button("Submit", on_click=reset_all):
206
- num_topic = num_cho
207
-
208
- if method == 'BERTopic':
209
- st.info('BERTopic is an expensive process when dealing with a large volume of text with our existing resources. Please kindly wait until the visualization appears.', icon="ℹ️")
210
-
211
- #===topic===
212
- if method == 'Choose...':
213
- st.write('')
214
-
215
- elif method == 'pyLDA':
216
- tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
217
-
218
- with tab1:
219
- #===visualization===
220
- @st.cache_data(ttl=3600, show_spinner=False)
221
- def pylda(extype):
222
- topic_abs_LDA = [t.split(' ') for t in topic_abs]
223
- id2word = Dictionary(topic_abs_LDA)
224
- corpus = [id2word.doc2bow(text) for text in topic_abs_LDA]
225
- #===LDA===
226
- lda_model = LdaModel(corpus=corpus,
227
- id2word=id2word,
228
- num_topics=num_topic,
229
- random_state=py_random_state,
230
- chunksize=py_chunksize,
231
- alpha='auto',
232
- per_word_topics=True)
233
-
234
- pprint(lda_model.print_topics())
235
- doc_lda = lda_model[corpus]
236
-
237
- #===visualization===
238
- coherence_model_lda = CoherenceModel(model=lda_model, texts=topic_abs_LDA, dictionary=id2word, coherence='c_v')
239
- coherence_lda = coherence_model_lda.get_coherence()
240
- vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
241
- py_lda_vis_html = pyLDAvis.prepared_data_to_html(vis)
242
- return py_lda_vis_html, coherence_lda, vis
243
-
244
- with st.spinner('Performing computations. Please wait ...'):
245
- try:
246
- py_lda_vis_html, coherence_lda, vis = pylda(extype)
247
- st.write('Coherence score: ', coherence_lda)
248
- components.html(py_lda_vis_html, width=1500, height=800)
249
- st.markdown('Copyright (c) 2015, Ben Mabey. https://github.com/bmabey/pyLDAvis')
250
-
251
- @st.cache_data(ttl=3600, show_spinner=False)
252
- def img_lda(vis):
253
- pyLDAvis.save_html(vis, 'output.html')
254
- hti = Html2Image()
255
- hti.browser.flags = ['--default-background-color=ffffff', '--hide-scrollbars']
256
- css = "body {background: white;}"
257
- hti.screenshot(
258
- other_file='output.html', css_str=css, size=(1500, 800),
259
- save_as='ldavis_img.png'
260
- )
261
-
262
- img_lda(vis)
263
- with open("ldavis_img.png", "rb") as file:
264
- btn = st.download_button(
265
- label="Download image",
266
- data=file,
267
- file_name="ldavis_img.png",
268
- mime="image/png"
269
- )
270
-
271
- except NameError:
272
- st.warning('🖱️ Please click Submit')
273
-
274
- with tab2:
275
- st.markdown('**Sievert, C., & Shirley, K. (2014). LDAvis: A method for visualizing and interpreting topics. Proceedings of the Workshop on Interactive Language Learning, Visualization, and Interfaces.** https://doi.org/10.3115/v1/w14-3110')
276
-
277
- with tab3:
278
- st.markdown('**Chen, X., & Wang, H. (2019, January). Automated chat transcript analysis using topic modeling for library reference services. Proceedings of the Association for Information Science and Technology, 56(1), 368–371.** https://doi.org/10.1002/pra2.31')
279
- st.markdown('**Joo, S., Ingram, E., & Cahill, M. (2021, December 15). Exploring Topics and Genres in Storytime Books: A Text Mining Approach. Evidence Based Library and Information Practice, 16(4), 41–62.** https://doi.org/10.18438/eblip29963')
280
- st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Topic Modeling. Text Mining for Information Professionals, 105–137.** https://doi.org/10.1007/978-3-030-85085-2_4')
281
- st.markdown('**Lamba, M., & Madhusudhan, M. (2019, June 7). Mapping of topics in DESIDOC Journal of Library and Information Technology, India: a study. Scientometrics, 120(2), 477–505.** https://doi.org/10.1007/s11192-019-03137-5')
282
-
283
- #===Biterm===
284
- elif method == 'Biterm':
285
-
286
- #===optimize Biterm===
287
- @st.cache_data(ttl=3600, show_spinner=False)
288
- def biterm_topic(extype):
289
- X, vocabulary, vocab_dict = btm.get_words_freqs(topic_abs)
290
- tf = np.array(X.sum(axis=0)).ravel()
291
- docs_vec = btm.get_vectorized_docs(topic_abs, vocabulary)
292
- docs_lens = list(map(len, docs_vec))
293
- biterms = btm.get_biterms(docs_vec)
294
- model = btm.BTM(X, vocabulary, seed=btm_seed, T=num_topic, M=20, alpha=50/8, beta=0.01)
295
- model.fit(biterms, iterations=btm_iterations)
296
- p_zd = model.transform(docs_vec)
297
- coherence = model.coherence_
298
- phi = tmp.get_phi(model)
299
- topics_coords = tmp.prepare_coords(model)
300
- totaltop = topics_coords.label.values.tolist()
301
- perplexity = model.perplexity_
302
- return topics_coords, phi, totaltop, perplexity
303
-
304
- tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
305
- with tab1:
306
- try:
307
- with st.spinner('Performing computations. Please wait ...'):
308
- topics_coords, phi, totaltop, perplexity = biterm_topic(extype)
309
- col1, col2 = st.columns([4,6])
310
-
311
- @st.cache_data(ttl=3600)
312
- def biterm_map(extype):
313
- btmvis_coords = tmp.plot_scatter_topics(topics_coords, size_col='size', label_col='label', topic=numvis)
314
- return btmvis_coords
315
-
316
- @st.cache_data(ttl=3600)
317
- def biterm_bar(extype):
318
- terms_probs = tmp.calc_terms_probs_ratio(phi, topic=numvis, lambda_=1)
319
- btmvis_probs = tmp.plot_terms(terms_probs, font_size=12)
320
- return btmvis_probs
321
-
322
- with col1:
323
- st.write('Perplexity score: ', perplexity)
324
- st.write('')
325
- numvis = st.selectbox(
326
- 'Choose topic',
327
- (totaltop), on_change=reset_biterm)
328
- btmvis_coords = biterm_map(extype)
329
- st.altair_chart(btmvis_coords)
330
- with col2:
331
- btmvis_probs = biterm_bar(extype)
332
- st.altair_chart(btmvis_probs, use_container_width=True)
333
-
334
- except ValueError:
335
- st.error('🙇‍♂️ Please raise the number of topics and click submit')
336
- except NameError:
337
- st.warning('🖱️ Please click Submit')
338
-
339
- with tab2:
340
- st.markdown('**Yan, X., Guo, J., Lan, Y., & Cheng, X. (2013, May 13). A biterm topic model for short texts. Proceedings of the 22nd International Conference on World Wide Web.** https://doi.org/10.1145/2488388.2488514')
341
- with tab3:
342
- st.markdown('**Cai, M., Shah, N., Li, J., Chen, W. H., Cuomo, R. E., Obradovich, N., & Mackey, T. K. (2020, August 26). Identification and characterization of tweets related to the 2015 Indiana HIV outbreak: A retrospective infoveillance study. PLOS ONE, 15(8), e0235150.** https://doi.org/10.1371/journal.pone.0235150')
343
- st.markdown('**Chen, Y., Dong, T., Ban, Q., & Li, Y. (2021). What Concerns Consumers about Hypertension? A Comparison between the Online Health Community and the Q&A Forum. International Journal of Computational Intelligence Systems, 14(1), 734.** https://doi.org/10.2991/ijcis.d.210203.002')
344
- st.markdown('**George, Crissandra J., "AMBIGUOUS APPALACHIANNESS: A LINGUISTIC AND PERCEPTUAL INVESTIGATION INTO ARC-LABELED PENNSYLVANIA COUNTIES" (2022). Theses and Dissertations-- Linguistics. 48.** https://doi.org/10.13023/etd.2022.217')
345
- st.markdown('**Li, J., Chen, W. H., Xu, Q., Shah, N., Kohler, J. C., & Mackey, T. K. (2020). Detection of self-reported experiences with corruption on twitter using unsupervised machine learning. Social Sciences & Humanities Open, 2(1), 100060.** https://doi.org/10.1016/j.ssaho.2020.100060')
346
-
347
- #===BERTopic===
348
- elif method == 'BERTopic':
349
- @st.cache_data(ttl=3600, show_spinner=False)
350
- def bertopic_vis(extype):
351
- umap_model = UMAP(n_neighbors=bert_n_neighbors, n_components=bert_n_components,
352
- min_dist=0.0, metric='cosine', random_state=bert_random_state)
353
- cluster_model = KMeans(n_clusters=num_topic)
354
- if bert_embedding_model == 'all-MiniLM-L6-v2':
355
- emb_mod = 'all-MiniLM-L6-v2'
356
- lang = 'en'
357
- elif bert_embedding_model == 'en_core_web_md':
358
- emb_mod = en_core_web_md.load(exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
359
- lang = 'en'
360
- elif bert_embedding_model == 'paraphrase-multilingual-MiniLM-L12-v2':
361
- emb_mod = 'paraphrase-multilingual-MiniLM-L12-v2'
362
- lang = 'multilingual'
363
- topic_model = BERTopic(embedding_model=emb_mod, hdbscan_model=cluster_model, language=lang, umap_model=umap_model, top_n_words=bert_top_n_words)
364
- topics, probs = topic_model.fit_transform(topic_abs)
365
- return topic_model, topics, probs
366
-
367
- @st.cache_data(ttl=3600, show_spinner=False)
368
- def Vis_Topics(extype):
369
- fig1 = topic_model.visualize_topics()
370
- return fig1
371
-
372
- @st.cache_data(ttl=3600, show_spinner=False)
373
- def Vis_Documents(extype):
374
- fig2 = topic_model.visualize_documents(topic_abs)
375
- return fig2
376
-
377
- @st.cache_data(ttl=3600, show_spinner=False)
378
- def Vis_Hierarchy(extype):
379
- fig3 = topic_model.visualize_hierarchy(top_n_topics=num_topic)
380
- return fig3
381
-
382
- @st.cache_data(ttl=3600, show_spinner=False)
383
- def Vis_Heatmap(extype):
384
- global topic_model
385
- fig4 = topic_model.visualize_heatmap(n_clusters=num_topic-1, width=1000, height=1000)
386
- return fig4
387
-
388
- @st.cache_data(ttl=3600, show_spinner=False)
389
- def Vis_Barchart(extype):
390
- fig5 = topic_model.visualize_barchart(top_n_topics=num_topic)
391
- return fig5
392
-
393
- tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
394
- with tab1:
395
- try:
396
- with st.spinner('Performing computations. Please wait ...'):
397
-
398
- topic_model, topics, probs = bertopic_vis(extype)
399
- time.sleep(.5)
400
- st.toast('Visualize Topics', icon='🏃')
401
- fig1 = Vis_Topics(extype)
402
-
403
- time.sleep(.5)
404
- st.toast('Visualize Document', icon='🏃')
405
- fig2 = Vis_Documents(extype)
406
-
407
- time.sleep(.5)
408
- st.toast('Visualize Document Hierarchy', icon='🏃')
409
- fig3 = Vis_Hierarchy(extype)
410
-
411
- time.sleep(.5)
412
- st.toast('Visualize Topic Similarity', icon='🏃')
413
- fig4 = Vis_Heatmap(extype)
414
-
415
- time.sleep(.5)
416
- st.toast('Visualize Terms', icon='🏃')
417
- fig5 = Vis_Barchart(extype)
418
-
419
- with st.expander("Visualize Topics"):
420
- st.write(fig1)
421
- with st.expander("Visualize Terms"):
422
- st.write(fig5)
423
- with st.expander("Visualize Documents"):
424
- st.write(fig2)
425
- with st.expander("Visualize Document Hierarchy"):
426
- st.write(fig3)
427
- with st.expander("Visualize Topic Similarity"):
428
- st.write(fig4)
429
-
430
- except ValueError:
431
- st.error('🙇‍♂️ Please raise the number of topics and click submit')
432
-
433
- except NameError:
434
- st.warning('🖱️ Please click Submit')
435
-
436
- with tab2:
437
- st.markdown('**Grootendorst, M. (2022). BERTopic: Neural topic modeling with a class-based TF-IDF procedure. arXiv preprint arXiv:2203.05794.** https://doi.org/10.48550/arXiv.2203.05794')
438
-
439
- with tab3:
440
- st.markdown('**Jeet Rawat, A., Ghildiyal, S., & Dixit, A. K. (2022, December 1). Topic modelling of legal documents using NLP and bidirectional encoder representations from transformers. Indonesian Journal of Electrical Engineering and Computer Science, 28(3), 1749.** https://doi.org/10.11591/ijeecs.v28.i3.pp1749-1755')
441
- st.markdown('**Yao, L. F., Ferawati, K., Liew, K., Wakamiya, S., & Aramaki, E. (2023, April 20). Disruptions in the Cystic Fibrosis Community’s Experiences and Concerns During the COVID-19 Pandemic: Topic Modeling and Time Series Analysis of Reddit Comments. Journal of Medical Internet Research, 25, e45249.** https://doi.org/10.2196/45249')
442
-
443
- except:
444
- st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
445
- st.stop()
1
+ #import module
2
+ import streamlit as st
3
+ import streamlit.components.v1 as components
4
+ import pandas as pd
5
+ import numpy as np
6
+ import re
7
+ import string
8
+ import nltk
9
+ nltk.download('wordnet')
10
+ from nltk.stem import WordNetLemmatizer
11
+ nltk.download('stopwords')
12
+ from nltk.corpus import stopwords
13
+ import gensim
14
+ import gensim.corpora as corpora
15
+ from gensim.corpora import Dictionary
16
+ from gensim.models.coherencemodel import CoherenceModel
17
+ from gensim.models.ldamodel import LdaModel
18
+ from gensim.models import Phrases
19
+ from gensim.models.phrases import Phraser
20
+ from pprint import pprint
21
+ import pickle
22
+ import pyLDAvis
23
+ import pyLDAvis.gensim_models as gensimvis
24
+ from io import StringIO
25
+ from ipywidgets.embed import embed_minimal_html
26
+ from nltk.stem.snowball import SnowballStemmer
27
+ from bertopic import BERTopic
28
+ from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, TextGeneration
29
+ import plotly.express as px
30
+ from sklearn.cluster import KMeans
31
+ from sklearn.feature_extraction.text import CountVectorizer
32
+ import bitermplus as btm
33
+ import tmplot as tmp
34
+ import tomotopy
35
+ import sys
36
+ import spacy
37
+ import en_core_web_sm
38
+ import pipeline
39
+ from html2image import Html2Image
40
+ from umap import UMAP
41
+ import os
42
+ import time
43
+ import json
44
+ from tools import sourceformat as sf
45
+ import datamapplot
46
+ from sentence_transformers import SentenceTransformer
47
+ import openai
48
+ from transformers import pipeline
49
+
50
+ #===config===
51
+ st.set_page_config(
52
+ page_title="Coconut",
53
+ page_icon="🥥",
54
+ layout="wide",
55
+ initial_sidebar_state="collapsed"
56
+ )
57
+
58
+ hide_streamlit_style = """
59
+ <style>
60
+ #MainMenu
61
+ {visibility: hidden;}
62
+ footer {visibility: hidden;}
63
+ [data-testid="collapsedControl"] {display: none}
64
+ </style>
65
+ """
66
+ st.markdown(hide_streamlit_style, unsafe_allow_html=True)
67
+
68
+ with st.popover("🔗 Menu"):
69
+ st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
70
+ st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
71
+ st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
72
+ st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
73
+ st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
74
+ st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
75
+ st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
76
+ st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
77
+
78
+ st.header("Topic Modeling", anchor=False)
79
+ st.subheader('Put your file here...', anchor=False)
80
+
81
+ #========unique id========
82
+ @st.cache_resource(ttl=3600)
83
+ def create_list():
84
+ l = [1, 2, 3]
85
+ return l
86
+
87
+ l = create_list()
88
+ first_list_value = l[0]
89
+ l[0] = first_list_value + 1
90
+ uID = str(l[0])
91
+
92
+ @st.cache_data(ttl=3600)
93
+ def get_ext(uploaded_file):
94
+ extype = uID+uploaded_file.name
95
+ return extype
96
+
97
+ #===clear cache===
98
+
99
+ def reset_biterm():
100
+ try:
101
+ biterm_map.clear()
102
+ biterm_bar.clear()
103
+ except NameError:
104
+ biterm_topic.clear()
105
+
106
+ def reset_all():
107
+ st.cache_data.clear()
108
+
109
+ #===avoiding deadlock===
110
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
111
+
112
+ #===upload file===
113
+ @st.cache_data(ttl=3600)
114
+ def upload(file):
115
+ papers = pd.read_csv(uploaded_file)
116
+ if "About the data" in papers.columns[0]:
117
+ papers = sf.dim(papers)
118
+ col_dict = {'MeSH terms': 'Keywords',
119
+ 'PubYear': 'Year',
120
+ 'Times cited': 'Cited by',
121
+ 'Publication Type': 'Document Type'
122
+ }
123
+ papers.rename(columns=col_dict, inplace=True)
124
+
125
+ return papers
126
+
127
+ @st.cache_data(ttl=3600)
128
+ def conv_txt(extype):
129
+ if("PMID" in (uploaded_file.read()).decode()):
130
+ uploaded_file.seek(0)
131
+ papers = sf.medline(uploaded_file)
132
+ print(papers)
133
+ return papers
134
+ col_dict = {'TI': 'Title',
135
+ 'SO': 'Source title',
136
+ 'DE': 'Author Keywords',
137
+ 'DT': 'Document Type',
138
+ 'AB': 'Abstract',
139
+ 'TC': 'Cited by',
140
+ 'PY': 'Year',
141
+ 'ID': 'Keywords Plus',
142
+ 'rights_date_used': 'Year'}
143
+ uploaded_file.seek(0)
144
+ papers = pd.read_csv(uploaded_file, sep='\t')
145
+ if("htid" in papers.columns):
146
+ papers = sf.htrc(papers)
147
+ papers.rename(columns=col_dict, inplace=True)
148
+ print(papers)
149
+ return papers
150
+
151
+
152
+ @st.cache_data(ttl=3600)
153
+ def conv_json(extype):
154
+ col_dict={'title': 'title',
155
+ 'rights_date_used': 'Year',
156
+ }
157
+
158
+ data = json.load(uploaded_file)
159
+ hathifile = data['gathers']
160
+ keywords = pd.DataFrame.from_records(hathifile)
161
+
162
+ keywords = sf.htrc(keywords)
163
+ keywords.rename(columns=col_dict,inplace=True)
164
+ return keywords
165
+
166
+ @st.cache_resource(ttl=3600)
167
+ def conv_pub(extype):
168
+ if (get_ext(extype)).endswith('.tar.gz'):
169
+ bytedata = extype.read()
170
+ keywords = sf.readPub(bytedata)
171
+ elif (get_ext(extype)).endswith('.xml'):
172
+ bytedata = extype.read()
173
+ keywords = sf.readxml(bytedata)
174
+ return keywords
175
+
176
+ #===Read data===
177
+ uploaded_file = st.file_uploader('', type=['csv', 'txt','json','tar.gz','xml'], on_change=reset_all)
178
+
179
+ if uploaded_file is not None:
180
+ try:
181
+ extype = get_ext(uploaded_file)
182
+
183
+ if extype.endswith('.csv'):
184
+ papers = upload(extype)
185
+ elif extype.endswith('.txt'):
186
+ papers = conv_txt(extype)
187
+
188
+ elif extype.endswith('.json'):
189
+ papers = conv_json(extype)
190
+ elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
191
+ papers = conv_pub(uploaded_file)
192
+
193
+ coldf = sorted(papers.select_dtypes(include=['object']).columns.tolist())
194
+
195
+ c1, c2, c3 = st.columns([3,3,4])
196
+ method = c1.selectbox(
197
+ 'Choose method',
198
+ ('Choose...', 'pyLDA', 'Biterm', 'BERTopic'))
199
+ ColCho = c2.selectbox('Choose column', (coldf))
200
+ num_cho = c3.number_input('Choose number of topics', min_value=2, max_value=30, value=5)
201
+
202
+ d1, d2 = st.columns([3,7])
203
+ xgram = d1.selectbox("N-grams", ("1", "2", "3"))
204
+ xgram = int(xgram)
205
+ words_to_remove = d2.text_input("Remove specific words. Separate words by semicolons (;)")
206
+
207
+ rem_copyright = d1.toggle('Remove copyright statement', value=True)
208
+ rem_punc = d2.toggle('Remove punctuation', value=True)
209
+
210
+ #===advance settings===
211
+ with st.expander("🧮 Show advance settings"):
212
+ t1, t2, t3 = st.columns([3,3,4])
213
+ if method == 'pyLDA':
214
+ py_random_state = t1.number_input('Random state', min_value=0, max_value=None, step=1)
215
+ py_chunksize = t2.number_input('Chunk size', value=100 , min_value=10, max_value=None, step=1)
216
+ opt_threshold = t3.number_input('Threshold', value=100 , min_value=1, max_value=None, step=1)
217
+
218
+ elif method == 'Biterm':
219
+ btm_seed = t1.number_input('Random state seed', value=100 , min_value=1, max_value=None, step=1)
220
+ btm_iterations = t2.number_input('Iterations number', value=20 , min_value=2, max_value=None, step=1)
221
+ opt_threshold = t3.number_input('Threshold', value=100 , min_value=1, max_value=None, step=1)
222
+
223
+ elif method == 'BERTopic':
224
+ u1, u2 = st.columns([5,5])
225
+
226
+ bert_top_n_words = u1.number_input('top_n_words', value=5 , min_value=5, max_value=25, step=1)
227
+ bert_random_state = u2.number_input('random_state', value=42 , min_value=1, max_value=None, step=1)
228
+ bert_n_components = u1.number_input('n_components', value=5 , min_value=1, max_value=None, step=1)
229
+ bert_n_neighbors = u2.number_input('n_neighbors', value=15 , min_value=1, max_value=None, step=1)
230
+ bert_embedding_model = st.radio(
231
+ "embedding_model",
232
+ ["all-MiniLM-L6-v2", "paraphrase-multilingual-MiniLM-L12-v2", "en_core_web_sm"], index=0, horizontal=True)
233
+
234
+ fine_tuning = st.toggle("Use Fine-tuning")
235
+ if fine_tuning:
236
+ topic_labelling = st.toggle("Automatic topic labelling")
237
+ if topic_labelling:
238
+ llm_provider = st.selectbox("Provider",["OpenAI","HuggingFace"])
239
+ if llm_provider == "OpenAI":
240
+ api_key = st.text_input("API Key")
241
+
242
+ else:
243
+ st.write('Please choose your preferred method')
244
+
245
+ #===clean csv===
246
+ @st.cache_data(ttl=3600, show_spinner=False)
247
+ def clean_csv(extype):
248
+ paper = papers.dropna(subset=[ColCho])
249
+
250
+ #===mapping===
251
+ paper['Abstract_pre'] = paper[ColCho].map(lambda x: x.lower())
252
+ if rem_punc:
253
+ paper['Abstract_pre'] = paper['Abstract_pre'].map(
254
+ lambda x: re.sub(f"[{re.escape(string.punctuation)}]", " ", x)
255
+ ).map(lambda x: re.sub(r"\s+", " ", x).strip())
256
+ paper['Abstract_pre'] = paper['Abstract_pre'].str.replace('[\u2018\u2019\u201c\u201d]', '', regex=True)
257
+ if rem_copyright:
258
+ paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: re.sub('©.*', '', x))
259
+
260
+ #===stopword removal===
261
+ stop = stopwords.words('english')
262
+ paper['Abstract_stop'] = paper['Abstract_pre'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
263
+
264
+ #===lemmatize===
265
+ lemmatizer = WordNetLemmatizer()
266
+
267
+ @st.cache_resource(ttl=3600)
268
+ def lemmatize_words(text):
269
+ words = text.split()
270
+ words = [lemmatizer.lemmatize(word) for word in words]
271
+ return ' '.join(words)
272
+ paper['Abstract_lem'] = paper['Abstract_stop'].apply(lemmatize_words)
273
+
274
+ words_rmv = [word.strip() for word in words_to_remove.split(";")]
275
+ remove_dict = {word: None for word in words_rmv}
276
+
277
+ @st.cache_resource(ttl=3600)
278
+ def remove_words(text):
279
+ words = text.split()
280
+ cleaned_words = [word for word in words if word not in remove_dict]
281
+ return ' '.join(cleaned_words)
282
+ paper['Abstract_lem'] = paper['Abstract_lem'].map(remove_words)
283
+
284
+ topic_abs = paper.Abstract_lem.values.tolist()
285
+ return topic_abs, paper
286
+
287
+ topic_abs, paper=clean_csv(extype)
288
+
289
+ if st.button("Submit", on_click=reset_all):
290
+ num_topic = num_cho
291
+
292
+ if method == 'BERTopic':
293
+ st.info('BERTopic is an expensive process when dealing with a large volume of text with our existing resources. Please kindly wait until the visualization appears.', icon="ℹ️")
294
+
295
+ #===topic===
296
+ if method == 'Choose...':
297
+ st.write('')
298
+
299
+ elif method == 'pyLDA':
300
+ tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
301
+
302
+ with tab1:
303
+ #===visualization===
304
+ @st.cache_data(ttl=3600, show_spinner=False)
305
+ def pylda(extype):
306
+ topic_abs_LDA = [t.split(' ') for t in topic_abs]
307
+
308
+ bigram = Phrases(topic_abs_LDA, min_count=xgram, threshold=opt_threshold)
309
+ trigram = Phrases(bigram[topic_abs_LDA], threshold=opt_threshold)
310
+ bigram_mod = Phraser(bigram)
311
+ trigram_mod = Phraser(trigram)
312
+
313
+ topic_abs_LDA = [trigram_mod[bigram_mod[doc]] for doc in topic_abs_LDA]
314
+
315
+ id2word = Dictionary(topic_abs_LDA)
316
+ corpus = [id2word.doc2bow(text) for text in topic_abs_LDA]
317
+ #===LDA===
318
+ lda_model = LdaModel(corpus=corpus,
319
+ id2word=id2word,
320
+ num_topics=num_topic,
321
+ random_state=py_random_state,
322
+ chunksize=py_chunksize,
323
+ alpha='auto',
324
+ per_word_topics=False)
325
+ pprint(lda_model.print_topics())
326
+ doc_lda = lda_model[corpus]
327
+ topics = lda_model.show_topics(num_words = 30,formatted=False)
328
+
329
+ #===visualization===
330
+ coherence_model_lda = CoherenceModel(model=lda_model, texts=topic_abs_LDA, dictionary=id2word, coherence='c_v')
331
+ coherence_lda = coherence_model_lda.get_coherence()
332
+ vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
333
+ py_lda_vis_html = pyLDAvis.prepared_data_to_html(vis)
334
+ return py_lda_vis_html, coherence_lda, vis, topics
335
+
336
+ with st.spinner('Performing computations. Please wait ...'):
337
+ try:
338
+ py_lda_vis_html, coherence_lda, vis, topics = pylda(extype)
339
+ st.write('Coherence score: ', coherence_lda)
340
+ components.html(py_lda_vis_html, width=1500, height=800)
341
+ st.markdown('Copyright (c) 2015, Ben Mabey. https://github.com/bmabey/pyLDAvis')
342
+
343
+ @st.cache_data(ttl=3600, show_spinner=False)
344
+ def img_lda(vis):
345
+ pyLDAvis.save_html(vis, 'output.html')
346
+ hti = Html2Image()
347
+ hti.browser.flags = ['--default-background-color=ffffff', '--hide-scrollbars']
348
+ hti.browser.use_new_headless = None
349
+ css = "body {background: white;}"
350
+ hti.screenshot(
351
+ other_file='output.html', css_str=css, size=(1500, 800),
352
+ save_as='ldavis_img.png'
353
+ )
354
+
355
+ img_lda(vis)
356
+
357
+ d1, d2 = st.columns(2)
358
+ with open("ldavis_img.png", "rb") as file:
359
+ btn = d1.download_button(
360
+ label="Download image",
361
+ data=file,
362
+ file_name="ldavis_img.png",
363
+ mime="image/png"
364
+ )
365
+
366
+ #===download results===#
367
+ resultf = pd.DataFrame(topics)
368
+ #formatting
369
+ resultf = resultf.transpose()
370
+ resultf = resultf.drop([0])
371
+ resultf = resultf.explode(list(range(len(resultf.columns))), ignore_index=False)
372
+
373
+ resultcsv = resultf.to_csv().encode("utf-8")
374
+ d2.download_button(
375
+ label = "Download Results",
376
+ data=resultcsv,
377
+ file_name="results.csv",
378
+ mime="text/csv",
379
+ on_click="ignore")
380
+
381
+ except NameError as f:
382
+ st.warning('🖱️ Please click Submit')
383
+
384
+ with tab2:
385
+ st.markdown('**Sievert, C., & Shirley, K. (2014). LDAvis: A method for visualizing and interpreting topics. Proceedings of the Workshop on Interactive Language Learning, Visualization, and Interfaces.** https://doi.org/10.3115/v1/w14-3110')
386
+
387
+ with tab3:
388
+ st.markdown('**Chen, X., & Wang, H. (2019, January). Automated chat transcript analysis using topic modeling for library reference services. Proceedings of the Association for Information Science and Technology, 56(1), 368–371.** https://doi.org/10.1002/pra2.31')
389
+ st.markdown('**Joo, S., Ingram, E., & Cahill, M. (2021, December 15). Exploring Topics and Genres in Storytime Books: A Text Mining Approach. Evidence Based Library and Information Practice, 16(4), 41–62.** https://doi.org/10.18438/eblip29963')
390
+ st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Topic Modeling. Text Mining for Information Professionals, 105–137.** https://doi.org/10.1007/978-3-030-85085-2_4')
391
+ st.markdown('**Lamba, M., & Madhusudhan, M. (2019, June 7). Mapping of topics in DESIDOC Journal of Library and Information Technology, India: a study. Scientometrics, 120(2), 477–505.** https://doi.org/10.1007/s11192-019-03137-5')
392
+
393
+ with tab4:
394
+ st.subheader(':blue[pyLDA]', anchor=False)
395
+ st.button('Download image')
396
+ st.text("Click Download Image button.")
397
+ st.divider()
398
+ st.subheader(':blue[Downloading CSV Results]', anchor=False)
399
+ st.button("Download Results")
400
+ st.text("Click Download results button at bottom of page")
401
+
402
+ #===Biterm===
403
+ elif method == 'Biterm':
404
+
405
+ #===optimize Biterm===
406
+ @st.cache_data(ttl=3600, show_spinner=False)
407
+ def biterm_topic(extype):
408
+ tokenized_abs = [t.split(' ') for t in topic_abs]
409
+
410
+ bigram = Phrases(tokenized_abs, min_count=xgram, threshold=opt_threshold)
411
+ trigram = Phrases(bigram[tokenized_abs], threshold=opt_threshold)
412
+ bigram_mod = Phraser(bigram)
413
+ trigram_mod = Phraser(trigram)
414
+
415
+ topic_abs_ngram = [trigram_mod[bigram_mod[doc]] for doc in tokenized_abs]
416
+
417
+ topic_abs_str = [' '.join(doc) for doc in topic_abs_ngram]
418
+
419
+
420
+ X, vocabulary, vocab_dict = btm.get_words_freqs(topic_abs_str)
421
+ tf = np.array(X.sum(axis=0)).ravel()
422
+ docs_vec = btm.get_vectorized_docs(topic_abs, vocabulary)
423
+ docs_lens = list(map(len, docs_vec))
424
+ biterms = btm.get_biterms(docs_vec)
425
+
426
+ model = btm.BTM(X, vocabulary, seed=btm_seed, T=num_topic, M=20, alpha=50/8, beta=0.01)
427
+ model.fit(biterms, iterations=btm_iterations)
428
+
429
+ p_zd = model.transform(docs_vec)
430
+ coherence = model.coherence_
431
+ phi = tmp.get_phi(model)
432
+ topics_coords = tmp.prepare_coords(model)
433
+ totaltop = topics_coords.label.values.tolist()
434
+ perplexity = model.perplexity_
435
+ top_topics = model.df_words_topics_
436
+
437
+ return topics_coords, phi, totaltop, perplexity, top_topics
438
+
439
+ tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
440
+ with tab1:
441
+ try:
442
+ with st.spinner('Performing computations. Please wait ...'):
443
+ topics_coords, phi, totaltop, perplexity, top_topics = biterm_topic(extype)
444
+ col1, col2 = st.columns([4,6])
445
+
446
+ @st.cache_data(ttl=3600)
447
+ def biterm_map(extype):
448
+ btmvis_coords = tmp.plot_scatter_topics(topics_coords, size_col='size', label_col='label', topic=numvis)
449
+ return btmvis_coords
450
+
451
+ @st.cache_data(ttl=3600)
452
+ def biterm_bar(extype):
453
+ terms_probs = tmp.calc_terms_probs_ratio(phi, topic=numvis, lambda_=1)
454
+ btmvis_probs = tmp.plot_terms(terms_probs, font_size=12)
455
+ return btmvis_probs
456
+
457
+ with col1:
458
+ st.write('Perplexity score: ', perplexity)
459
+ st.write('')
460
+ numvis = st.selectbox(
461
+ 'Choose topic',
462
+ (totaltop), on_change=reset_biterm)
463
+ btmvis_coords = biterm_map(extype)
464
+ st.altair_chart(btmvis_coords)
465
+ with col2:
466
+ btmvis_probs = biterm_bar(extype)
467
+ st.altair_chart(btmvis_probs, use_container_width=True)
468
+
469
+ #===download results===#
470
+ resultcsv = top_topics.to_csv().encode("utf-8")
471
+                     st.download_button(label = "Download Results", data=resultcsv, file_name="results.csv", mime="text/csv", on_click="ignore")
472
+
473
+ except ValueError as g:
474
+ st.error('🙇‍♂️ Please raise the number of topics and click submit')
475
+
476
+ except NameError as f:
477
+ st.warning('🖱️ Please click Submit')
478
+
479
+ with tab2:
480
+ st.markdown('**Yan, X., Guo, J., Lan, Y., & Cheng, X. (2013, May 13). A biterm topic model for short texts. Proceedings of the 22nd International Conference on World Wide Web.** https://doi.org/10.1145/2488388.2488514')
481
+ with tab3:
482
+ st.markdown('**Cai, M., Shah, N., Li, J., Chen, W. H., Cuomo, R. E., Obradovich, N., & Mackey, T. K. (2020, August 26). Identification and characterization of tweets related to the 2015 Indiana HIV outbreak: A retrospective infoveillance study. PLOS ONE, 15(8), e0235150.** https://doi.org/10.1371/journal.pone.0235150')
483
+ st.markdown('**Chen, Y., Dong, T., Ban, Q., & Li, Y. (2021). What Concerns Consumers about Hypertension? A Comparison between the Online Health Community and the Q&A Forum. International Journal of Computational Intelligence Systems, 14(1), 734.** https://doi.org/10.2991/ijcis.d.210203.002')
484
+ st.markdown('**George, Crissandra J., "AMBIGUOUS APPALACHIANNESS: A LINGUISTIC AND PERCEPTUAL INVESTIGATION INTO ARC-LABELED PENNSYLVANIA COUNTIES" (2022). Theses and Dissertations-- Linguistics. 48.** https://doi.org/10.13023/etd.2022.217')
485
+ st.markdown('**Li, J., Chen, W. H., Xu, Q., Shah, N., Kohler, J. C., & Mackey, T. K. (2020). Detection of self-reported experiences with corruption on twitter using unsupervised machine learning. Social Sciences & Humanities Open, 2(1), 100060.** https://doi.org/10.1016/j.ssaho.2020.100060')
486
+ with tab4:
487
+ st.subheader(':blue[Biterm]', anchor=False)
488
+         st.text("Click the three dots at the top right, then select the desired format.")
489
+ st.markdown("![Downloading visualization](https://raw.githubusercontent.com/faizhalas/library-tools/main/images/download_biterm.jpg)")
490
+ st.divider()
491
+ st.subheader(':blue[Downloading CSV Results]', anchor=False)
492
+ st.button("Download Results")
493
+         st.text("Click the Download Results button at the bottom of the page.")
494
+
495
+
496
+ #===BERTopic===
497
+ elif method == 'BERTopic':
498
+ @st.cache_resource(ttl = 3600, show_spinner=False)
499
+ #@st.cache_data(ttl=3600, show_spinner=False)
500
+ def bertopic_vis(extype):
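+             # Reduce embeddings with UMAP, then cluster with KMeans so the number of topics is fixed at num_topic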
501
+ umap_model = UMAP(n_neighbors=bert_n_neighbors, n_components=bert_n_components,
502
+ min_dist=0.0, metric='cosine', random_state=bert_random_state)
503
+ cluster_model = KMeans(n_clusters=num_topic)
504
+ if bert_embedding_model == 'all-MiniLM-L6-v2':
505
+ model = SentenceTransformer('all-MiniLM-L6-v2')
506
+ lang = 'en'
507
+ embeddings = model.encode(topic_abs, show_progress_bar=True)
508
+
509
+ elif bert_embedding_model == 'en_core_web_sm':
510
+ nlp = en_core_web_sm.load(exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
511
+ model = nlp
512
+ lang = 'en'
513
+ embeddings = np.array([nlp(text).vector for text in topic_abs])
514
+
515
+ elif bert_embedding_model == 'paraphrase-multilingual-MiniLM-L12-v2':
516
+ model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
517
+ lang = 'multilingual'
518
+ embeddings = model.encode(topic_abs, show_progress_bar=True)
519
+
520
+ representation_model = ""
521
+
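+             # Optionally refine topic keywords with KeyBERT-inspired and MMR representations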
522
+ if fine_tuning:
523
+ keybert = KeyBERTInspired()
524
+ mmr = MaximalMarginalRelevance(diversity=0.3)
525
+ representation_model = {
526
+ "KeyBERT": keybert,
527
+ "MMR": mmr,
528
+ }
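+                 # When topic labelling is enabled, add an LLM-backed representation ("test") used later to set readable topic labels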
529
+ if topic_labelling:
530
+ if llm_provider == "OpenAI":
531
+ client = openai.OpenAI(api_key=api_key)
532
+ representation_model = {
533
+ "KeyBERT": keybert,
534
+ "MMR": mmr,
535
+ "test": OpenAI(client, model = "gpt-4o-mini", delay_in_seconds=10)
536
+ }
537
+ elif llm_provider == "HuggingFace":
538
+ gennie = pipeline("text2text-generation", model = "google/flan-t5-base")
539
+ clientmod = TextGeneration(gennie)
540
+ representation_model = {
541
+ "KeyBERT": keybert,
542
+ "MMR": mmr,
543
+ "test": clientmod
544
+ }
545
+
546
+ vectorizer_model = CountVectorizer(ngram_range=(1, xgram), stop_words='english')
547
+ topic_model = BERTopic(representation_model = representation_model, embedding_model=model, hdbscan_model=cluster_model, language=lang, umap_model=umap_model, vectorizer_model=vectorizer_model, top_n_words=bert_top_n_words)
548
+ topics, probs = topic_model.fit_transform(topic_abs, embeddings=embeddings)
549
+
550
+ if(fine_tuning and topic_labelling):
551
+ generated_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["test"].values()]
552
+ topic_model.set_topic_labels(generated_labels)
553
+
554
+ return topic_model, topics, probs, embeddings
555
+
556
+ @st.cache_resource(ttl = 3600, show_spinner=False)
557
+ def Vis_Topics(extype):
558
+ fig1 = topic_model.visualize_topics()
559
+ return fig1
560
+ @st.cache_resource(ttl = 3600, show_spinner=False)
561
+ def Vis_Documents(extype):
562
+ fig2 = topic_model.visualize_document_datamap(topic_abs, embeddings=embeddings, custom_labels = True)
563
+ return fig2
564
+ @st.cache_resource(ttl = 3600, show_spinner=False)
565
+ def Vis_Hierarchy(extype):
566
+ fig3 = topic_model.visualize_hierarchy(top_n_topics=num_topic, custom_labels = True)
567
+ return fig3
568
+ @st.cache_resource(ttl = 3600, show_spinner=False)
569
+ def Vis_Heatmap(extype):
570
+ global topic_model
571
+ fig4 = topic_model.visualize_heatmap(n_clusters=num_topic-1, width=1000, height=1000, custom_labels = True)
572
+ return fig4
573
+ @st.cache_resource(ttl = 3600, show_spinner=False)
574
+ def Vis_Barchart(extype):
575
+ fig5 = topic_model.visualize_barchart(top_n_topics=num_topic, custom_labels = True)
576
+ return fig5
577
+
578
+ tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
579
+ with tab1:
580
+ try:
581
+ with st.spinner('Performing computations. Please wait ...'):
582
+
583
+ topic_model, topics, probs, embeddings = bertopic_vis(extype)
584
+ time.sleep(.5)
585
+ st.toast('Visualize Topics', icon='🏃')
586
+ fig1 = Vis_Topics(extype)
587
+
588
+ time.sleep(.5)
589
+ st.toast('Visualize Document', icon='🏃')
590
+ fig2 = Vis_Documents(extype)
591
+
592
+ time.sleep(.5)
593
+ st.toast('Visualize Document Hierarchy', icon='🏃')
594
+ fig3 = Vis_Hierarchy(extype)
595
+
596
+ time.sleep(.5)
597
+ st.toast('Visualize Topic Similarity', icon='🏃')
598
+ fig4 = Vis_Heatmap(extype)
599
+
600
+ time.sleep(.5)
601
+ st.toast('Visualize Terms', icon='🏃')
602
+ fig5 = Vis_Barchart(extype)
603
+
604
+ bertab1, bertab2, bertab3, bertab4, bertab5 = st.tabs(["Visualize Topics", "Visualize Terms", "Visualize Documents",
605
+ "Visualize Document Hierarchy", "Visualize Topic Similarity"])
606
+
607
+ with bertab1:
608
+ st.plotly_chart(fig1, use_container_width=True)
609
+ with bertab2:
610
+ st.plotly_chart(fig5, use_container_width=True)
611
+ with bertab3:
612
+ st.plotly_chart(fig2, use_container_width=True)
613
+ with bertab4:
614
+ st.plotly_chart(fig3, use_container_width=True)
615
+ with bertab5:
616
+ st.plotly_chart(fig4, use_container_width=True)
617
+
618
+ #===download results===#
619
+ results = topic_model.get_topic_info()
620
+ resultf = pd.DataFrame(results)
621
+ resultcsv = resultf.to_csv().encode("utf-8")
622
+ st.download_button(
623
+ label = "Download Results",
624
+ data=resultcsv,
625
+ file_name="results.csv",
626
+                     mime="text/csv",
627
+ on_click="ignore",
628
+ )
629
+
630
+ except ValueError as e:
631
+ st.write(e)
632
+ st.error('🙇‍♂️ Please raise the number of topics and click submit')
633
+
634
+
635
+ except NameError as e:
636
+ st.warning('🖱️ Please click Submit')
637
+ st.write(e)
638
+
639
+ with tab2:
640
+ st.markdown('**Grootendorst, M. (2022). BERTopic: Neural topic modeling with a class-based TF-IDF procedure. arXiv preprint arXiv:2203.05794.** https://doi.org/10.48550/arXiv.2203.05794')
641
+
642
+ with tab3:
643
+ st.markdown('**Jeet Rawat, A., Ghildiyal, S., & Dixit, A. K. (2022, December 1). Topic modelling of legal documents using NLP and bidirectional encoder representations from transformers. Indonesian Journal of Electrical Engineering and Computer Science, 28(3), 1749.** https://doi.org/10.11591/ijeecs.v28.i3.pp1749-1755')
644
+ st.markdown('**Yao, L. F., Ferawati, K., Liew, K., Wakamiya, S., & Aramaki, E. (2023, April 20). Disruptions in the Cystic Fibrosis Community’s Experiences and Concerns During the COVID-19 Pandemic: Topic Modeling and Time Series Analysis of Reddit Comments. Journal of Medical Internet Research, 25, e45249.** https://doi.org/10.2196/45249')
645
+
646
+ with tab4:
647
+ st.divider()
648
+ st.subheader(':blue[BERTopic]', anchor=False)
649
+ st.text("Click the camera icon on the top right menu")
650
+ st.markdown("![Downloading visualization](https://raw.githubusercontent.com/faizhalas/library-tools/main/images/download_bertopic.jpg)")
651
+ st.divider()
652
+ st.subheader(':blue[Downloading CSV Results]', anchor=False)
653
+ st.button("Download Results")
654
+         st.text("Click the Download Results button at the bottom of the page.")
655
+
656
+ except Exception as e:
657
+ st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
658
+ st.write(e)
659
+ st.stop()
pages/3 Bidirected Network.py CHANGED
@@ -1,276 +1,370 @@
1
- #import module
2
- import streamlit as st
3
- import pandas as pd
4
- import re
5
- import nltk
6
- nltk.download('punkt')
7
- from nltk.tokenize import word_tokenize
8
- from mlxtend.preprocessing import TransactionEncoder
9
- te = TransactionEncoder()
10
- from mlxtend.frequent_patterns import fpgrowth
11
- from mlxtend.frequent_patterns import association_rules
12
- from streamlit_agraph import agraph, Node, Edge, Config
13
- import nltk
14
- nltk.download('wordnet')
15
- from nltk.stem import WordNetLemmatizer
16
- nltk.download('stopwords')
17
- from nltk.corpus import stopwords
18
- from nltk.stem.snowball import SnowballStemmer
19
- import sys
20
- import time
21
-
22
- #===config===
23
- st.set_page_config(
24
- page_title="Coconut",
25
- page_icon="🥥",
26
- layout="wide",
27
- initial_sidebar_state="collapsed"
28
- )
29
-
30
- hide_streamlit_style = """
31
- <style>
32
- #MainMenu
33
- {visibility: hidden;}
34
- footer {visibility: hidden;}
35
- [data-testid="collapsedControl"] {display: none}
36
- </style>
37
- """
38
- st.markdown(hide_streamlit_style, unsafe_allow_html=True)
39
-
40
- with st.popover("🔗 Menu"):
41
- st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
42
- st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
43
- st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
44
- st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
45
- st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
46
- st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
47
- st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
48
-
49
- st.header("Bidirected Network", anchor=False)
50
- st.subheader('Put your file here...', anchor=False)
51
-
52
- #===clear cache===
53
- def reset_all():
54
- st.cache_data.clear()
55
-
56
- #===check type===
57
- @st.cache_data(ttl=3600)
58
- def get_ext(extype):
59
- extype = uploaded_file.name
60
- return extype
61
-
62
- @st.cache_data(ttl=3600)
63
- def upload(extype):
64
- papers = pd.read_csv(uploaded_file)
65
- return papers
66
-
67
- @st.cache_data(ttl=3600)
68
- def conv_txt(extype):
69
- col_dict = {'TI': 'Title',
70
- 'SO': 'Source title',
71
- 'DT': 'Document Type',
72
- 'DE': 'Author Keywords',
73
- 'ID': 'Keywords Plus'}
74
- papers = pd.read_csv(uploaded_file, sep='\t', lineterminator='\r')
75
- papers.rename(columns=col_dict, inplace=True)
76
- return papers
77
-
78
- #===Read data===
79
- uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
80
-
81
- if uploaded_file is not None:
82
- try:
83
- extype = get_ext(uploaded_file)
84
- if extype.endswith('.csv'):
85
- papers = upload(extype)
86
- elif extype.endswith('.txt'):
87
- papers = conv_txt(extype)
88
-
89
- @st.cache_data(ttl=3600)
90
- def get_data_arul(extype):
91
- list_of_column_key = list(papers.columns)
92
- list_of_column_key = [k for k in list_of_column_key if 'Keyword' in k]
93
- return papers, list_of_column_key
94
-
95
- papers, list_of_column_key = get_data_arul(extype)
96
-
97
- col1, col2 = st.columns(2)
98
- with col1:
99
- method = st.selectbox(
100
- 'Choose method',
101
- ('Lemmatization', 'Stemming'), on_change=reset_all)
102
- with col2:
103
- keyword = st.selectbox(
104
- 'Choose column',
105
- (list_of_column_key), on_change=reset_all)
106
-
107
-
108
- #===body===
109
- @st.cache_data(ttl=3600)
110
- def clean_arul(extype):
111
- global keyword, papers
112
- try:
113
- arul = papers.dropna(subset=[keyword])
114
- except KeyError:
115
- st.error('Error: Please check your Author/Index Keywords column.')
116
- sys.exit(1)
117
- arul[keyword] = arul[keyword].map(lambda x: re.sub('-—–', ' ', x))
118
- arul[keyword] = arul[keyword].map(lambda x: re.sub('; ', ' ; ', x))
119
- arul[keyword] = arul[keyword].map(lambda x: x.lower())
120
- arul[keyword] = arul[keyword].dropna()
121
- return arul
122
-
123
- arul = clean_arul(extype)
124
-
125
- #===stem/lem===
126
- @st.cache_data(ttl=3600)
127
- def lemma_arul(extype):
128
- lemmatizer = WordNetLemmatizer()
129
- def lemmatize_words(text):
130
- words = text.split()
131
- words = [lemmatizer.lemmatize(word) for word in words]
132
- return ' '.join(words)
133
- arul[keyword] = arul[keyword].apply(lemmatize_words)
134
- return arul
135
-
136
- @st.cache_data(ttl=3600)
137
- def stem_arul(extype):
138
- stemmer = SnowballStemmer("english")
139
- def stem_words(text):
140
- words = text.split()
141
- words = [stemmer.stem(word) for word in words]
142
- return ' '.join(words)
143
- arul[keyword] = arul[keyword].apply(stem_words)
144
- return arul
145
-
146
- if method is 'Lemmatization':
147
- arul = lemma_arul(extype)
148
- else:
149
- arul = stem_arul(extype)
150
-
151
- @st.cache_data(ttl=3600)
152
- def arm(extype):
153
- arule = arul[keyword].str.split(' ; ')
154
- arule_list = arule.values.tolist()
155
- te_ary = te.fit(arule_list).transform(arule_list)
156
- df = pd.DataFrame(te_ary, columns=te.columns_)
157
- return df
158
- df = arm(extype)
159
-
160
- col1, col2, col3 = st.columns(3)
161
- with col1:
162
- supp = st.slider(
163
- 'Select value of Support',
164
- 0.001, 1.000, (0.010), on_change=reset_all)
165
- with col2:
166
- conf = st.slider(
167
- 'Select value of Confidence',
168
- 0.001, 1.000, (0.050), on_change=reset_all)
169
- with col3:
170
- maxlen = st.slider(
171
- 'Maximum length of the itemsets generated',
172
- 2, 8, (2), on_change=reset_all)
173
-
174
- tab1, tab2, tab3 = st.tabs(["📈 Result & Generate visualization", "📃 Reference", "📓 Recommended Reading"])
175
-
176
- with tab1:
177
- #===Association rules===
178
- @st.cache_data(ttl=3600)
179
- def freqitem(extype):
180
- freq_item = fpgrowth(df, min_support=supp, use_colnames=True, max_len=maxlen)
181
- return freq_item
182
-
183
- freq_item = freqitem(extype)
184
- col1, col2 = st.columns(2)
185
- with col1:
186
- st.write('🚨 The more data you have, the longer you will have to wait.')
187
- with col2:
188
- showall = st.checkbox('Show all nodes', value=True, on_change=reset_all)
189
-
190
- @st.cache_data(ttl=3600)
191
- def arm_table(extype):
192
- restab = association_rules(freq_item, metric='confidence', min_threshold=conf)
193
- restab = restab[['antecedents', 'consequents', 'antecedent support', 'consequent support', 'support', 'confidence', 'lift', 'conviction']]
194
- restab['antecedents'] = restab['antecedents'].apply(lambda x: ', '.join(list(x))).astype('unicode')
195
- restab['consequents'] = restab['consequents'].apply(lambda x: ', '.join(list(x))).astype('unicode')
196
- if showall:
197
- restab['Show'] = True
198
- else:
199
- restab['Show'] = False
200
- return restab
201
-
202
- if freq_item.empty:
203
- st.error('Please lower your value.', icon="🚨")
204
- else:
205
- restab = arm_table(extype)
206
- restab = st.data_editor(restab, use_container_width=True)
207
- res = restab[restab['Show'] == True]
208
-
209
- #===visualize===
210
-
211
- if st.button('📈 Generate network visualization', on_click=reset_all):
212
- with st.spinner('Visualizing, please wait ....'):
213
- @st.cache_data(ttl=3600)
214
- def map_node(extype):
215
- res['to'] = res['antecedents'] + '' + res['consequents'] + '\n Support = ' + res['support'].astype(str) + '\n Confidence = ' + res['confidence'].astype(str) + '\n Conviction = ' + res['conviction'].astype(str)
216
- res_ant = res[['antecedents','antecedent support']].rename(columns={'antecedents': 'node', 'antecedent support': 'size'})
217
- res_con = res[['consequents','consequent support']].rename(columns={'consequents': 'node', 'consequent support': 'size'})
218
- res_node = pd.concat([res_ant, res_con]).drop_duplicates(keep='first')
219
- return res_node, res
220
-
221
- res_node, res = map_node(extype)
222
-
223
- @st.cache_data(ttl=3600)
224
- def arul_network(extype):
225
- nodes = []
226
- edges = []
227
-
228
- for w,x in zip(res_node['size'], res_node['node']):
229
- nodes.append( Node(id=x,
230
- label=x,
231
- size=50*w+10,
232
- shape="dot",
233
- labelHighlightBold=True,
234
- group=x,
235
- opacity=10,
236
- mass=1)
237
- )
238
-
239
- for y,z,a,b in zip(res['antecedents'],res['consequents'],res['confidence'],res['to']):
240
- edges.append( Edge(source=y,
241
- target=z,
242
- title=b,
243
- width=a*2,
244
- physics=True,
245
- smooth=True
246
- )
247
- )
248
- return nodes, edges
249
-
250
- nodes, edges = arul_network(extype)
251
- config = Config(width=1200,
252
- height=800,
253
- directed=True,
254
- physics=True,
255
- hierarchical=False,
256
- maxVelocity=5
257
- )
258
-
259
- return_value = agraph(nodes=nodes,
260
- edges=edges,
261
- config=config)
262
- time.sleep(1)
263
- st.toast('Process completed', icon='📈')
264
-
265
- with tab2:
266
- st.markdown('**Santosa, F. A. (2023). Adding Perspective to the Bibliometric Mapping Using Bidirected Graph. Open Information Science, 7(1), 20220152.** https://doi.org/10.1515/opis-2022-0152')
267
-
268
- with tab3:
269
- st.markdown('**Agrawal, R., Imieliński, T., & Swami, A. (1993). Mining association rules between sets of items in large databases. In ACM SIGMOD Record (Vol. 22, Issue 2, pp. 207–216). Association for Computing Machinery (ACM).** https://doi.org/10.1145/170036.170072')
270
- st.markdown('**Brin, S., Motwani, R., Ullman, J. D., & Tsur, S. (1997). Dynamic itemset counting and implication rules for market basket data. ACM SIGMOD Record, 26(2), 255–264.** https://doi.org/10.1145/253262.253325')
271
- st.markdown('**Edmonds, J., & Johnson, E. L. (2003). Matching: A Well-Solved Class of Integer Linear Programs. Combinatorial Optimization — Eureka, You Shrink!, 27–30.** https://doi.org/10.1007/3-540-36478-1_3')
272
- st.markdown('**Li, M. (2016, August 23). An exploration to visualise the emerging trends of technology foresight based on an improved technique of co-word analysis and relevant literature data of WOS. Technology Analysis & Strategic Management, 29(6), 655–671.** https://doi.org/10.1080/09537325.2016.1220518')
273
-
274
- except:
275
- st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
276
- st.stop()
 
1
+ #import module
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import re
5
+ import nltk
6
+ nltk.download('punkt')
7
+ from nltk.tokenize import word_tokenize
8
+ from mlxtend.preprocessing import TransactionEncoder
9
+ te = TransactionEncoder()
10
+ from mlxtend.frequent_patterns import fpgrowth
11
+ from mlxtend.frequent_patterns import association_rules
12
+ from streamlit_agraph import agraph, Node, Edge, Config
13
+ import nltk
14
+ nltk.download('wordnet')
15
+ from nltk.stem import WordNetLemmatizer
16
+ nltk.download('stopwords')
17
+ from nltk.corpus import stopwords
18
+ from nltk.stem.snowball import SnowballStemmer
19
+ import sys
20
+ import time
21
+ import json
22
+ from tools import sourceformat as sf
23
+
24
+ import networkx as nx
25
+ import matplotlib.pyplot as plt
26
+ import plotly.graph_objects as go
27
+
28
+ import altair as alt
29
+ import altair_nx as anx
30
+
31
+ #===config===
32
+ st.set_page_config(
33
+ page_title="Coconut",
34
+ page_icon="🥥",
35
+ layout="wide",
36
+ initial_sidebar_state="collapsed"
37
+ )
38
+
39
+ hide_streamlit_style = """
40
+ <style>
41
+ #MainMenu
42
+ {visibility: hidden;}
43
+ footer {visibility: hidden;}
44
+ [data-testid="collapsedControl"] {display: none}
45
+ </style>
46
+ """
47
+ st.markdown(hide_streamlit_style, unsafe_allow_html=True)
48
+
49
+ with st.popover("🔗 Menu"):
50
+ st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
51
+ st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
52
+ st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
53
+ st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
54
+ st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
55
+ st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
56
+ st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
57
+ st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
58
+ st.page_link("pages/8 Shifterator.py", label="Shifterator", icon="8️⃣")
59
+
60
+ st.header("Bidirected Network", anchor=False)
61
+ st.subheader('Put your file here...', anchor=False)
62
+
63
+ #===clear cache===
64
+ def reset_all():
65
+ st.cache_data.clear()
66
+
67
+ #===check type===
68
+ @st.cache_data(ttl=3600)
69
+ def get_ext(extype):
70
+ extype = uploaded_file.name
71
+ return extype
72
+
73
+ @st.cache_data(ttl=3600)
74
+ def upload(extype):
75
+ papers = pd.read_csv(uploaded_file)
76
+
77
+ if "About the data" in papers.columns[0]:
78
+ papers = sf.dim(papers)
79
+ col_dict = {'MeSH terms': 'Keywords',
80
+ 'PubYear': 'Year',
81
+ 'Times cited': 'Cited by',
82
+ 'Publication Type': 'Document Type'
83
+ }
84
+ papers.rename(columns=col_dict, inplace=True)
85
+
86
+ return papers
87
+
88
+ return papers
89
+
90
+ @st.cache_data(ttl=3600)
91
+ def conv_txt(extype):
92
+ if("PMID" in (uploaded_file.read()).decode()):
93
+ uploaded_file.seek(0)
94
+ papers = sf.medline(uploaded_file)
95
+ print(papers)
96
+ return papers
97
+ col_dict = {'TI': 'Title',
98
+ 'SO': 'Source title',
99
+ 'DE': 'Author Keywords',
100
+ 'DT': 'Document Type',
101
+ 'AB': 'Abstract',
102
+ 'TC': 'Cited by',
103
+ 'PY': 'Year',
104
+ 'ID': 'Keywords Plus',
105
+ 'rights_date_used': 'Year'}
106
+ uploaded_file.seek(0)
107
+ papers = pd.read_csv(uploaded_file, sep='\t')
108
+ if("htid" in papers.columns):
109
+ papers = sf.htrc(papers)
110
+ papers.rename(columns=col_dict, inplace=True)
111
+ print(papers)
112
+ return papers
113
+
114
+
115
+ @st.cache_data(ttl=3600)
116
+ def conv_json(extype):
117
+ col_dict={'title': 'title',
118
+ 'rights_date_used': 'Year',
119
+ }
120
+
121
+ data = json.load(uploaded_file)
122
+ hathifile = data['gathers']
123
+ keywords = pd.DataFrame.from_records(hathifile)
124
+
125
+ keywords = sf.htrc(keywords)
126
+ keywords.rename(columns=col_dict,inplace=True)
127
+ return keywords
128
+
129
+ @st.cache_data(ttl=3600)
130
+ def conv_pub(extype):
131
+ if (get_ext(extype)).endswith('.tar.gz'):
132
+ bytedata = extype.read()
133
+ keywords = sf.readPub(bytedata)
134
+ elif (get_ext(extype)).endswith('.xml'):
135
+ bytedata = extype.read()
136
+ keywords = sf.readxml(bytedata)
137
+ return keywords
138
+
139
+ #===Read data===
140
+ uploaded_file = st.file_uploader('', type=['csv', 'txt','json','tar.gz', 'xml'], on_change=reset_all)
141
+
142
+ if uploaded_file is not None:
143
+ try:
144
+ extype = get_ext(uploaded_file)
145
+ if extype.endswith('.csv'):
146
+ papers = upload(extype)
147
+ elif extype.endswith('.txt'):
148
+ papers = conv_txt(extype)
149
+ elif extype.endswith('.json'):
150
+ papers = conv_json(extype)
151
+ elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
152
+ papers = conv_pub(uploaded_file)
153
+
154
+ @st.cache_data(ttl=3600)
155
+ def get_data_arul(extype):
156
+ list_of_column_key = list(papers.columns)
157
+ list_of_column_key = [k for k in list_of_column_key if 'Keyword' in k]
158
+ return papers, list_of_column_key
159
+
160
+ papers, list_of_column_key = get_data_arul(extype)
161
+
162
+ col1, col2 = st.columns(2)
163
+ with col1:
164
+ method = st.selectbox(
165
+ 'Choose method',
166
+ ('Lemmatization', 'Stemming'), on_change=reset_all)
167
+ layout = st.selectbox(
168
+ 'Choose graph layout',
169
+ ['Circular','Kamada Kawai','Random','Spring','Shell']
170
+ )
171
+ with col2:
172
+ keyword = st.selectbox(
173
+ 'Choose column',
174
+ (list_of_column_key), on_change=reset_all)
175
+
176
+
177
+ #===body===
178
+ @st.cache_data(ttl=3600)
179
+ def clean_arul(extype):
180
+ global keyword, papers
181
+ try:
182
+ arul = papers.dropna(subset=[keyword])
183
+ except KeyError:
184
+ st.error('Error: Please check your Author/Index Keywords column.')
185
+ sys.exit(1)
186
+         arul[keyword] = arul[keyword].map(lambda x: re.sub('[-—–]', ' ', x))
187
+ arul[keyword] = arul[keyword].map(lambda x: re.sub('; ', ' ; ', x))
188
+ arul[keyword] = arul[keyword].map(lambda x: x.lower())
189
+ arul[keyword] = arul[keyword].dropna()
190
+ return arul
191
+
192
+ arul = clean_arul(extype)
193
+
194
+ #===stem/lem===
195
+ @st.cache_data(ttl=3600)
196
+ def lemma_arul(extype):
197
+ lemmatizer = WordNetLemmatizer()
198
+ def lemmatize_words(text):
199
+ words = text.split()
200
+ words = [lemmatizer.lemmatize(word) for word in words]
201
+ return ' '.join(words)
202
+ arul[keyword] = arul[keyword].apply(lemmatize_words)
203
+ return arul
204
+
205
+ @st.cache_data(ttl=3600)
206
+ def stem_arul(extype):
207
+ stemmer = SnowballStemmer("english")
208
+ def stem_words(text):
209
+ words = text.split()
210
+ words = [stemmer.stem(word) for word in words]
211
+ return ' '.join(words)
212
+ arul[keyword] = arul[keyword].apply(stem_words)
213
+ return arul
214
+
215
+ if method is 'Lemmatization':
216
+ arul = lemma_arul(extype)
217
+ else:
218
+ arul = stem_arul(extype)
219
+
220
+ @st.cache_data(ttl=3600)
221
+ def arm(extype):
222
+ arule = arul[keyword].str.split(' ; ')
223
+ arule_list = arule.values.tolist()
224
+ te_ary = te.fit(arule_list).transform(arule_list)
225
+ df = pd.DataFrame(te_ary, columns=te.columns_)
226
+ return df
227
+ df = arm(extype)
228
+
229
+ col1, col2, col3 = st.columns(3)
230
+ with col1:
231
+ supp = st.slider(
232
+ 'Select value of Support',
233
+ 0.001, 1.000, (0.010), on_change=reset_all)
234
+ with col2:
235
+ conf = st.slider(
236
+ 'Select value of Confidence',
237
+ 0.001, 1.000, (0.050), on_change=reset_all)
238
+ with col3:
239
+ maxlen = st.slider(
240
+ 'Maximum length of the itemsets generated',
241
+ 2, 8, (2), on_change=reset_all)
242
+
243
+ tab1, tab2, tab3, tab4 = st.tabs(["📈 Result & Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
244
+
245
+ with tab1:
246
+ #===Association rules===
247
+ @st.cache_data(ttl=3600)
248
+ def freqitem(extype):
249
+ freq_item = fpgrowth(df, min_support=supp, use_colnames=True, max_len=maxlen)
250
+ return freq_item
251
+
252
+ freq_item = freqitem(extype)
253
+ col1, col2 = st.columns(2)
254
+ with col1:
255
+ st.write('🚨 The more data you have, the longer you will have to wait.')
256
+ with col2:
257
+ showall = st.checkbox('Show all nodes', value=True, on_change=reset_all)
258
+
259
+ @st.cache_data(ttl=3600)
260
+ def arm_table(extype):
261
+ restab = association_rules(freq_item, metric='confidence', min_threshold=conf)
262
+ restab = restab[['antecedents', 'consequents', 'antecedent support', 'consequent support', 'support', 'confidence', 'lift', 'conviction']]
263
+ restab['antecedents'] = restab['antecedents'].apply(lambda x: ', '.join(list(x))).astype('unicode')
264
+ restab['consequents'] = restab['consequents'].apply(lambda x: ', '.join(list(x))).astype('unicode')
265
+ if showall:
266
+ restab['Show'] = True
267
+ else:
268
+ restab['Show'] = False
269
+ return restab
270
+
271
+ if freq_item.empty:
272
+ st.error('Please lower your value.', icon="🚨")
273
+ else:
274
+ restab = arm_table(extype)
275
+ restab = st.data_editor(restab, use_container_width=True)
276
+ res = restab[restab['Show'] == True]
277
+
278
+ #===visualize===
279
+
280
+ if st.button('📈 Generate network visualization', on_click=reset_all):
281
+ with st.spinner('Visualizing, please wait ....'):
282
+ @st.cache_data(ttl=3600)
283
+ def map_node(extype):
284
+ res['to'] = res['antecedents'] + ' → ' + res['consequents'] + '\n Support = ' + res['support'].astype(str) + '\n Confidence = ' + res['confidence'].astype(str) + '\n Conviction = ' + res['conviction'].astype(str)
285
+ res_ant = res[['antecedents','antecedent support']].rename(columns={'antecedents': 'node', 'antecedent support': 'size'})
286
+ res_con = res[['consequents','consequent support']].rename(columns={'consequents': 'node', 'consequent support': 'size'})
287
+ res_node = pd.concat([res_ant, res_con]).drop_duplicates(keep='first')
288
+ return res_node, res
289
+
290
+ res_node, res = map_node(extype)
291
+ ___='''
292
+ @st.cache_data(ttl=3600)
293
+ def arul_net(extype):
294
+ nodes = []
295
+ edges = []
296
+
297
+ for w,x in zip(res_node['size'], res_node['node']):
298
+ nodes.append(x)
299
+ for y,z,a,b in zip(res['antecedents'],res['consequents'],res['confidence'],res['to']):
300
+ edge = (y,z)
301
+
302
+ edges.append(edge)
303
+
304
+ return nodes, edges
305
+
306
+ #nodes, edges = arul_net(res)
307
+ '''
308
+
309
+ @st.cache_data(ttl=3600)
310
+ def graphmaker(__netgraph):
311
+
312
+ #add nodes, w is weight, x is node label
313
+ for w,x in zip(res_node['size'], res_node['node']):
314
+ __netgraph.add_node(x, size = w)
315
+ #add edges, y is startpoint, z is endpoint, a is edge weight, b is title
316
+ for y,z,a,b in zip(res['antecedents'],res['consequents'],res['confidence'],res['to']):
317
+ __netgraph.add_edge(y,z, weight = int(a*100))
318
+
319
+
320
+ #Make graph with NetworkX
321
+
322
+ G=nx.DiGraph()
323
+
324
+ graphmaker(G)
325
+
326
+ #G.add_edges_from(edges) ##### remove this later
327
+
328
+ #Graph layout
329
+ if(layout=="Spring"):
330
+ pos=nx.spring_layout(G)
331
+ elif(layout == "Kamada Kawai"):
332
+ pos=nx.kamada_kawai_layout(G)
333
+ elif(layout == "Circular"):
334
+ pos = nx.circular_layout(G)
335
+ elif(layout=="Random"):
336
+ pos = nx.random_layout(G)
337
+ elif(layout=="Shell"):
338
+ pos=nx.shell_layout(G)
339
+
340
+ graph = anx.draw_networkx(G,pos, node_label = 'node',
341
+ edge_width = 'weight',
342
+ node_size = 'size',
343
+ curved_edges = True,
344
+ node_font_size=12,
345
+ chart_width=1920,
346
+ chart_height=1080).interactive()
347
+
348
+ st.altair_chart(graph)
349
+
350
+
351
+ with tab2:
352
+ st.markdown('**Santosa, F. A. (2023). Adding Perspective to the Bibliometric Mapping Using Bidirected Graph. Open Information Science, 7(1), 20220152.** https://doi.org/10.1515/opis-2022-0152')
353
+
354
+ with tab3:
355
+ st.markdown('**Agrawal, R., Imieliński, T., & Swami, A. (1993). Mining association rules between sets of items in large databases. In ACM SIGMOD Record (Vol. 22, Issue 2, pp. 207–216). Association for Computing Machinery (ACM).** https://doi.org/10.1145/170036.170072')
356
+ st.markdown('**Brin, S., Motwani, R., Ullman, J. D., & Tsur, S. (1997). Dynamic itemset counting and implication rules for market basket data. ACM SIGMOD Record, 26(2), 255–264.** https://doi.org/10.1145/253262.253325')
357
+ st.markdown('**Edmonds, J., & Johnson, E. L. (2003). Matching: A Well-Solved Class of Integer Linear Programs. Combinatorial Optimization — Eureka, You Shrink!, 27–30.** https://doi.org/10.1007/3-540-36478-1_3')
358
+ st.markdown('**Li, M. (2016, August 23). An exploration to visualise the emerging trends of technology foresight based on an improved technique of co-word analysis and relevant literature data of WOS. Technology Analysis & Strategic Management, 29(6), 655–671.** https://doi.org/10.1080/09537325.2016.1220518')
359
+ with tab4:
360
+ st.subheader("Download visualization")
361
+ st.text("Zoom in, zoom out, or shift the nodes as desired, then right-click and select Save image as ...")
362
+ st.markdown("![Downloading graph](https://raw.githubusercontent.com/faizhalas/library-tools/main/images/download_bidirected.jpg)")
363
+ st.subheader("Download table as CSV")
364
+             st.text("Hover the cursor over the table, then click the download arrow.")
365
+ st.image("images/tablenetwork.png")
366
+
367
+ except Exception as e:
368
+ st.write(e)
369
+ st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
370
+ st.stop()
pages/4 Sunburst.py CHANGED
@@ -4,6 +4,9 @@ import pandas as pd
4
  import plotly.express as px
5
  import numpy as np
6
  import sys
 
 
 
7
 
8
  #===config===
9
  st.set_page_config(
@@ -31,6 +34,7 @@ with st.popover("🔗 Menu"):
31
  st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
32
  st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
33
  st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
 
34
 
35
  st.header("Sunburst Visualization", anchor=False)
36
  st.subheader('Put your file here...', anchor=False)
@@ -52,24 +56,71 @@ def upload(extype):
52
  if 'Publication Year' in papers.columns:
53
  papers.rename(columns={'Publication Year': 'Year', 'Citing Works Count': 'Cited by',
54
  'Publication Type': 'Document Type', 'Source Title': 'Source title'}, inplace=True)
 
 
 
 
 
 
 
 
 
55
  return papers
56
 
57
  @st.cache_data(ttl=3600)
58
  def conv_txt(extype):
 
 
 
 
 
59
  col_dict = {'TI': 'Title',
60
  'SO': 'Source title',
61
- 'DT': 'Document Type',
62
  'DE': 'Author Keywords',
63
- 'ID': 'Keywords Plus',
64
  'AB': 'Abstract',
65
  'TC': 'Cited by',
66
- 'PY': 'Year',}
67
- papers = pd.read_csv(uploaded_file, sep='\t', lineterminator='\r')
 
 
 
 
 
68
  papers.rename(columns=col_dict, inplace=True)
 
69
  return papers
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  #===Read data===
72
- uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
73
 
74
  if uploaded_file is not None:
75
  try:
@@ -79,36 +130,45 @@ if uploaded_file is not None:
79
 
80
  elif extype.endswith('.txt'):
81
  papers = conv_txt(extype)
82
-
 
 
 
 
83
  @st.cache_data(ttl=3600)
84
  def get_minmax(extype):
85
  extype = extype
86
  MIN = int(papers['Year'].min())
87
  MAX = int(papers['Year'].max())
 
 
88
  GAP = MAX - MIN
89
- return papers, MIN, MAX, GAP
90
-
91
- tab1, tab2 = st.tabs(["📈 Generate visualization", "📓 Recommended Reading"])
92
 
93
  with tab1:
94
  #===sunburst===
95
  try:
96
- papers, MIN, MAX, GAP = get_minmax(extype)
97
  except KeyError:
98
  st.error('Error: Please check again your columns.')
99
  sys.exit(1)
100
 
101
  if (GAP != 0):
102
  YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX), on_change=reset_all)
 
103
  else:
104
  st.write('You only have data in ', (MAX))
105
  YEAR = (MIN, MAX)
106
-
107
  @st.cache_data(ttl=3600)
108
  def listyear(extype):
109
  global papers
110
  years = list(range(YEAR[0],YEAR[1]+1))
 
111
  papers = papers.loc[papers['Year'].isin(years)]
 
112
  return years, papers
113
 
114
  @st.cache_data(ttl=3600)
@@ -118,19 +178,23 @@ if uploaded_file is not None:
118
  vis[['doctype','source','citby','year']] = papers[['Document Type','Source title','Cited by','Year']]
119
  viz=vis.groupby(['doctype', 'source', 'year'])['citby'].agg(['sum','count']).reset_index()
120
  viz.rename(columns={'sum': 'cited by', 'count': 'total docs'}, inplace=True)
121
-
 
 
122
  fig = px.sunburst(viz, path=['doctype', 'source', 'year'], values='total docs',
123
  color='cited by',
124
  color_continuous_scale='RdBu',
125
  color_continuous_midpoint=np.average(viz['cited by'], weights=viz['total docs']))
126
  fig.update_layout(height=800, width=1200)
127
- return fig
128
 
129
  years, papers = listyear(extype)
130
 
131
  if {'Document Type','Source title','Cited by','Year'}.issubset(papers.columns):
132
- fig = vis_sunbrust(extype)
133
  st.plotly_chart(fig, height=800, width=1200) #use_container_width=True)
 
 
134
 
135
  else:
136
  st.error('We require these columns: Document Type, Source title, Cited by, Year', icon="🚨")
@@ -138,7 +202,10 @@ if uploaded_file is not None:
138
  with tab2:
139
  st.markdown('**numpy.average — NumPy v1.24 Manual. (n.d.). Numpy.Average — NumPy v1.24 Manual.** https://numpy.org/doc/stable/reference/generated/numpy.average.html')
140
  st.markdown('**Sunburst. (n.d.). Sunburst Charts in Python.** https://plotly.com/python/sunburst-charts/')
141
-
 
 
 
142
  except:
143
  st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
144
- st.stop()
 
4
  import plotly.express as px
5
  import numpy as np
6
  import sys
7
+ import json
8
+ from tools import sourceformat as sf
9
+
10
 
11
  #===config===
12
  st.set_page_config(
 
34
  st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
35
  st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
36
  st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
37
+ st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
38
 
39
  st.header("Sunburst Visualization", anchor=False)
40
  st.subheader('Put your file here...', anchor=False)
 
56
  if 'Publication Year' in papers.columns:
57
  papers.rename(columns={'Publication Year': 'Year', 'Citing Works Count': 'Cited by',
58
  'Publication Type': 'Document Type', 'Source Title': 'Source title'}, inplace=True)
59
+ if "About the data" in papers.columns[0]:
60
+ papers = sf.dim(papers)
61
+ col_dict = {'MeSH terms': 'Keywords',
62
+ 'PubYear': 'Year',
63
+ 'Times cited': 'Cited by',
64
+ 'Publication Type': 'Document Type'
65
+ }
66
+ papers.rename(columns=col_dict, inplace=True)
67
+
68
  return papers
69
 
70
  @st.cache_data(ttl=3600)
71
  def conv_txt(extype):
72
+ if("PMID" in (uploaded_file.read()).decode()):
73
+ uploaded_file.seek(0)
74
+ papers = sf.medline(uploaded_file)
75
+ print(papers)
76
+ return papers
77
  col_dict = {'TI': 'Title',
78
  'SO': 'Source title',
 
79
  'DE': 'Author Keywords',
80
+ 'DT': 'Document Type',
81
  'AB': 'Abstract',
82
  'TC': 'Cited by',
83
+ 'PY': 'Year',
84
+ 'ID': 'Keywords Plus',
85
+ 'rights_date_used': 'Year'}
86
+ uploaded_file.seek(0)
87
+ papers = pd.read_csv(uploaded_file, sep='\t')
88
+ if("htid" in papers.columns):
89
+ papers = sf.htrc(papers)
90
  papers.rename(columns=col_dict, inplace=True)
91
+ print(papers)
92
  return papers
93
 
94
+ @st.cache_data(ttl=3600)
95
+ def conv_json(extype):
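+     # Load the JSON export, keep the 'gathers' records, and rename columns via the HTRC formatter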
96
+ col_dict={'title': 'title',
97
+ 'rights_date_used': 'Year',
98
+ 'content_provider_code': 'Document Type',
99
+ 'Keywords':'Source title'
100
+ }
101
+
102
+ data = json.load(uploaded_file)
103
+ hathifile = data['gathers']
104
+ keywords = pd.DataFrame.from_records(hathifile)
105
+
106
+ keywords = sf.htrc(keywords)
107
+ keywords['Cited by'] = keywords.groupby(['Keywords'])['Keywords'].transform('size')
108
+ keywords.rename(columns=col_dict,inplace=True)
109
+ return keywords
110
+
111
+ def conv_pub(extype):
112
+ if (get_ext(extype)).endswith('.tar.gz'):
113
+ bytedata = extype.read()
114
+ keywords = sf.readPub(bytedata)
115
+ elif (get_ext(extype)).endswith('.xml'):
116
+ bytedata = extype.read()
117
+ keywords = sf.readxml(bytedata)
118
+ keywords['Cited by'] = keywords.groupby(['Keywords'])['Keywords'].transform('size')
119
+ st.write(keywords)
120
+ return keywords
121
+
122
  #===Read data===
123
+ uploaded_file = st.file_uploader('', type=['csv', 'txt','json','tar.gz', 'xml'], on_change=reset_all)
124
 
125
  if uploaded_file is not None:
126
  try:
 
130
 
131
  elif extype.endswith('.txt'):
132
  papers = conv_txt(extype)
133
+ elif extype.endswith('.json'):
134
+ papers = conv_json(extype)
135
+ elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
136
+ papers = conv_pub(uploaded_file)
137
+
138
  @st.cache_data(ttl=3600)
139
  def get_minmax(extype):
140
  extype = extype
141
  MIN = int(papers['Year'].min())
142
  MAX = int(papers['Year'].max())
143
+ MIN1 = int(papers['Cited by'].min())
144
+ MAX1 = int(papers['Cited by'].max())
145
  GAP = MAX - MIN
146
+ return papers, MIN, MAX, GAP, MIN1, MAX1
147
+
148
+ tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📓 Recommended Reading", "⬇️ Download Help"])
149
 
150
  with tab1:
151
  #===sunburst===
152
  try:
153
+ papers, MIN, MAX, GAP, MIN1, MAX1 = get_minmax(extype)
154
  except KeyError:
155
  st.error('Error: Please check again your columns.')
156
  sys.exit(1)
157
 
158
  if (GAP != 0):
159
  YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX), on_change=reset_all)
160
+ KEYLIM = st.slider('Cited By Count',min_value = MIN1, max_value = MAX1, value = (MIN1,MAX1), on_change=reset_all)
161
  else:
162
  st.write('You only have data in ', (MAX))
163
  YEAR = (MIN, MAX)
164
+ KEYLIM = (MIN1,MAX1)
165
  @st.cache_data(ttl=3600)
166
  def listyear(extype):
167
  global papers
168
  years = list(range(YEAR[0],YEAR[1]+1))
169
+ cited = list(range(KEYLIM[0],KEYLIM[1]+1))
170
  papers = papers.loc[papers['Year'].isin(years)]
171
+ papers = papers.loc[papers['Cited by'].isin(cited)]
172
  return years, papers
173
 
174
  @st.cache_data(ttl=3600)
 
178
  vis[['doctype','source','citby','year']] = papers[['Document Type','Source title','Cited by','Year']]
179
  viz=vis.groupby(['doctype', 'source', 'year'])['citby'].agg(['sum','count']).reset_index()
180
  viz.rename(columns={'sum': 'cited by', 'count': 'total docs'}, inplace=True)
181
+
182
+
183
+
184
  fig = px.sunburst(viz, path=['doctype', 'source', 'year'], values='total docs',
185
  color='cited by',
186
  color_continuous_scale='RdBu',
187
  color_continuous_midpoint=np.average(viz['cited by'], weights=viz['total docs']))
188
  fig.update_layout(height=800, width=1200)
189
+ return fig, viz
190
 
191
  years, papers = listyear(extype)
192
 
193
  if {'Document Type','Source title','Cited by','Year'}.issubset(papers.columns):
194
+ fig, viz = vis_sunbrust(extype)
195
  st.plotly_chart(fig, height=800, width=1200) #use_container_width=True)
196
+
197
+ st.dataframe(viz)
198
 
199
  else:
200
  st.error('We require these columns: Document Type, Source title, Cited by, Year', icon="🚨")
 
202
  with tab2:
203
  st.markdown('**numpy.average — NumPy v1.24 Manual. (n.d.). Numpy.Average — NumPy v1.24 Manual.** https://numpy.org/doc/stable/reference/generated/numpy.average.html')
204
  st.markdown('**Sunburst. (n.d.). Sunburst Charts in Python.** https://plotly.com/python/sunburst-charts/')
205
+
206
+ with tab3:
207
+ st.text("Click the camera icon on the top right menu (you may need to hover your cursor within the visualization)")
208
+ st.markdown("![Downloading visualization](https://raw.githubusercontent.com/faizhalas/library-tools/main/images/download_bertopic.jpg)")
209
  except:
210
  st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
211
+ st.stop()
pages/5 Burst Detection.py CHANGED
@@ -15,6 +15,9 @@ import plotly.graph_objects as go
15
  from plotly.subplots import make_subplots
16
  import plotly.io as pio
17
  import sys
 
 
 
18
 
19
  #===config===
20
  st.set_page_config(
@@ -42,6 +45,8 @@ with st.popover("🔗 Menu"):
42
  st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
43
  st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
44
  st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
 
 
45
 
46
  st.header("Burst Detection", anchor=False)
47
  st.subheader('Put your file here...', anchor=False)
@@ -51,7 +56,7 @@ def reset_all():
51
  st.cache_data.clear()
52
 
53
  # Initialize NLP model
54
- nlp = spacy.load("en_core_web_md")
55
 
56
  @st.cache_data(ttl=3600)
57
  def upload(extype):
@@ -60,6 +65,15 @@ def upload(extype):
60
  if 'Publication Year' in df.columns:
61
  df.rename(columns={'Publication Year': 'Year', 'Citing Works Count': 'Cited by',
62
  'Publication Type': 'Document Type', 'Source Title': 'Source title'}, inplace=True)
 
 
 
 
 
 
 
 
 
63
  return df
64
 
65
  @st.cache_data(ttl=3600)
@@ -76,14 +90,49 @@ def get_minmax(df):
76
 
77
  @st.cache_data(ttl=3600)
78
  def conv_txt(extype):
 
 
 
 
 
79
  col_dict = {'TI': 'Title',
80
  'SO': 'Source title',
 
81
  'DT': 'Document Type',
82
  'AB': 'Abstract',
83
- 'PY': 'Year'}
84
- df = pd.read_csv(uploaded_file, sep='\t', lineterminator='\r')
85
- df.rename(columns=col_dict, inplace=True)
86
- return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
  # Helper Functions
89
  @st.cache_data(ttl=3600)
@@ -107,6 +156,10 @@ def load_data(uploaded_file):
107
  df = upload(extype)
108
  elif extype.endswith('.txt'):
109
  df = conv_txt(extype)
 
 
 
 
110
 
111
  df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
112
  df = df.dropna(subset=['Year'])
@@ -133,23 +186,29 @@ def clean_data(df):
133
 
134
  # Preprocess text
135
  df['processed'] = df.apply(lambda row: preprocess_text(f"{row.get(col_name, '')}"), axis=1)
 
 
136
 
137
  # Vectorize processed text
138
  if count_method == "Document Frequency":
139
- vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split(), binary=True)
140
  else:
141
- vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split())
142
  X = vectorizer.fit_transform(df['processed'].tolist())
143
 
144
  # Create DataFrame from the Document-Term Matrix (DTM)
145
  dtm = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=df['Year'].values)
146
  yearly_term_frequency = dtm.groupby(dtm.index).sum()
147
 
148
- # User inputs for top words analysis and exclusions
149
- excluded_words = [word.strip() for word in excluded_words_input.split(',')]
 
 
150
 
151
- # Identify top words, excluding specified words
152
- filtered_words = [word for word in yearly_term_frequency.columns if word not in excluded_words]
 
 
153
  top_words = yearly_term_frequency[filtered_words].sum().nlargest(top_n).index.tolist()
154
 
155
  return yearly_term_frequency, top_words
@@ -205,27 +264,38 @@ def convert_df(df):
205
  return df.to_csv().encode("utf-8")
206
 
207
  @st.cache_data(ttl=3600)
208
- def scattervis(bursts, freq_data):
209
- freq_data.reset_index(inplace=True)
210
  freq_data.rename(columns={"index": "Year"}, inplace=True)
211
-
212
  freq_data_melted = freq_data.melt(id_vars=["Year"], var_name="Category", value_name="Value")
213
  freq_data_melted = freq_data_melted[freq_data_melted["Value"] > 0]
214
- wordlist = freq_data_melted["Category"].unique()
215
 
 
216
  years = freq_data["Year"].tolist()
 
217
  bursts["begin"] = bursts["begin"].apply(lambda x: years[min(x, len(years) - 1)] if x < len(years) else None)
218
  bursts["end"] = bursts["end"].apply(lambda x: years[min(x, len(years) - 1)] if x < len(years) else None)
 
219
  burst_points = []
220
-
221
  for _, row in bursts.iterrows():
222
  for year in range(row["begin"], row["end"] + 1):
223
  burst_points.append((year, row["label"], row["weight"]))
224
-
225
  burst_points_df = pd.DataFrame(burst_points, columns=["Year", "Category", "Weight"])
226
-
 
 
 
 
 
 
 
 
 
 
 
227
  fig = go.Figure()
228
-
229
  # scatter trace for burst points
230
  fig.add_trace(go.Scatter(
231
  x=burst_points_df["Year"],
@@ -233,14 +303,15 @@ def scattervis(bursts, freq_data):
233
  mode='markers',
234
  marker=dict(
235
  symbol='square',
236
- size=40,
237
  color='red',
238
- opacity=0.5),
 
239
  hoverinfo='text',
240
  text=burst_points_df["Weight"],
241
  showlegend=False
242
  ))
243
-
244
  # scatter trace for freq_data
245
  fig.add_trace(go.Scatter(
246
  x=freq_data_melted["Year"],
@@ -251,26 +322,43 @@ def scattervis(bursts, freq_data):
251
  size=30,
252
  color=freq_data_melted["Value"],
253
  colorscale='Blues',
254
- showscale=False),
 
255
  text=freq_data_melted["Value"],
256
  textposition="middle center",
257
  textfont=dict(
258
  size=16,
259
- color=['white' if value > freq_data_melted["Value"].max()/2 else 'black' for value in freq_data_melted["Value"]])
 
 
260
  ))
261
-
262
- min_year = min(years)
263
- max_year = max(years)
264
-
265
  fig.update_layout(
266
- xaxis=dict(tickmode='linear', dtick=1, range=[(min_year-1), (max_year+1)], tickfont = dict(size=16), automargin=True, showgrid=False, zeroline=False),
267
- yaxis=dict(tickvals=wordlist, ticktext=wordlist, tickmode='array', tickfont = dict(size=16), automargin=True, showgrid=False, zeroline=False),
268
- plot_bgcolor='white',
269
- paper_bgcolor='white',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  showlegend=False,
271
- margin=dict(l=1, r=1, t=1, b=1),
272
- height=top_n*50+2,
273
- width=(max_year-min_year)*52+100,
274
  autosize=False
275
  )
276
 
@@ -289,15 +377,13 @@ def linegraph(bursts, freq_data):
289
  line_shape='linear',
290
  hoverinfo='text',
291
  hovertext=[f"Year: {index}<br>Frequency: {freq}" for index, freq in zip(freq_data.index, freq_data[column])],
292
- text=freq_data[column],
293
  textposition='top center'
294
  ), row=row, col=col)
295
-
296
  # Add area charts
297
  for _, row_data in bursts[bursts['label'] == column].iterrows():
298
  x_values = freq_data.index[row_data['begin']:row_data['end']+1]
299
  y_values = freq_data[column][row_data['begin']:row_data['end']+1]
300
-
301
  #middle_y = sum(y_values) / len(y_values)
302
  y_post = min(freq_data[column]) + 1 if running_total == "Running total" else sum(y_values) / len(y_values)
303
  x_offset = 0.1
@@ -326,7 +412,19 @@ def linegraph(bursts, freq_data):
326
  textangle=270,
327
  row=row, col=col
328
  )
329
-
 
 
 
 
 
 
 
 
 
 
 
 
330
  col += 1
331
  if col > 2:
332
  col = 1
@@ -349,36 +447,41 @@ def download_result(freq_data, bursts):
349
  csv2 = convert_df(bursts)
350
  return csv1, csv2
351
 
352
- uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
353
 
354
  if uploaded_file is not None:
355
  try:
356
- c1, c2, c3, c4 = st.columns([2,2,3,3])
357
  top_n = c1.number_input("Number of top words to analyze", min_value=5, value=10, step=1, on_change=reset_all)
358
  viz_selected = c2.selectbox("Option for visualization",
359
- ("Line graph", "Scatter plot"), on_change=reset_all)
360
  running_total = c3.selectbox("Calculation method",
361
  ("Running total", "By occurrences each year"), on_change=reset_all)
362
- count_method = c4.selectbox("Count by",
363
  ("Term Frequency", "Document Frequency"), on_change=reset_all)
364
 
365
- d1, d2 = st.columns([2,8])
366
  df, coldf, MIN, MAX, GAP = load_data(uploaded_file)
367
- col_name = d1.selectbox("Select column to analyze",
368
  (coldf), on_change=reset_all)
369
- excluded_words_input = d2.text_input("Words to exclude (comma-separated)", on_change=reset_all)
 
 
 
 
 
 
370
 
371
  if (GAP != 0):
372
  YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX), on_change=reset_all)
373
  else:
374
- e1.write('You only have data in ', (MAX))
375
  sys.exit(1)
376
 
377
  yearly_term_frequency, top_words = clean_data(df)
378
 
379
  bursts, freq_data, num_unique_labels, num_rows = apply_burst_detection(top_words, yearly_term_frequency)
380
 
381
- tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
382
 
383
  with tab1:
384
  if bursts.empty:
@@ -394,7 +497,7 @@ if uploaded_file is not None:
394
  linegraph(bursts, freq_data)
395
 
396
  elif viz_selected =="Scatter plot":
397
- scattervis(bursts, freq_data)
398
 
399
  csv1, csv2 = download_result(freq_data, bursts)
400
  e1, e2, e3 = st.columns(3)
@@ -424,7 +527,23 @@ if uploaded_file is not None:
424
  st.markdown('**Li, M., Zheng, Z., & Yi, Q. (2024). The landscape of hot topics and research frontiers in Kawasaki disease: scientometric analysis. Heliyon, 10(8), e29680–e29680.** https://doi.org/10.1016/j.heliyon.2024.e29680')
425
  st.markdown('**Domicián Máté, Ni Made Estiyanti and Novotny, A. (2024) ‘How to support innovative small firms? Bibliometric analysis and visualization of start-up incubation’, Journal of Innovation and Entrepreneurship, 13(1).** https://doi.org/10.1186/s13731-024-00361-z')
426
  st.markdown('**Lamba, M., Madhusudhan, M. (2022). Burst Detection. In: Text Mining for Information Professionals. Springer, Cham.** https://doi.org/10.1007/978-3-030-85085-2_6')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
427
 
428
- except:
429
- st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
430
- st.stop()
 
15
  from plotly.subplots import make_subplots
16
  import plotly.io as pio
17
  import sys
18
+ import json
19
+ from tools import sourceformat as sf
20
+
21
 
22
  #===config===
23
  st.set_page_config(
 
45
  st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
46
  st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
47
  st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
48
+ st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
49
+ st.page_link("pages/8 Shifterator.py", label="Shifterator", icon="8️⃣")
50
 
51
  st.header("Burst Detection", anchor=False)
52
  st.subheader('Put your file here...', anchor=False)
 
56
  st.cache_data.clear()
57
 
58
  # Initialize NLP model
59
+ nlp = spacy.load("en_core_web_sm")
60
 
61
  @st.cache_data(ttl=3600)
62
  def upload(extype):
 
65
  if 'Publication Year' in df.columns:
66
  df.rename(columns={'Publication Year': 'Year', 'Citing Works Count': 'Cited by',
67
  'Publication Type': 'Document Type', 'Source Title': 'Source title'}, inplace=True)
68
+ if "About the data" in df.columns[0]:
69
+ df = sf.dim(df)
70
+ col_dict = {'MeSH terms': 'Keywords',
71
+ 'PubYear': 'Year',
72
+ 'Times cited': 'Cited by',
73
+ 'Publication Type': 'Document Type'
74
+ }
75
+ df.rename(columns=col_dict, inplace=True)
76
+
77
  return df
78
 
79
  @st.cache_data(ttl=3600)
 
90
 
91
  @st.cache_data(ttl=3600)
92
  def conv_txt(extype):
93
+ if("PMID" in (uploaded_file.read()).decode()):
94
+ uploaded_file.seek(0)
95
+ papers = sf.medline(uploaded_file)
96
+ print(papers)
97
+ return papers
98
  col_dict = {'TI': 'Title',
99
  'SO': 'Source title',
100
+ 'DE': 'Author Keywords',
101
  'DT': 'Document Type',
102
  'AB': 'Abstract',
103
+ 'TC': 'Cited by',
104
+ 'PY': 'Year',
105
+ 'ID': 'Keywords Plus',
106
+ 'rights_date_used': 'Year'}
107
+ uploaded_file.seek(0)
108
+ papers = pd.read_csv(uploaded_file, sep='\t')
109
+ if("htid" in papers.columns):
110
+ papers = sf.htrc(papers)
111
+ papers.rename(columns=col_dict, inplace=True)
112
+ print(papers)
113
+ return papers
114
+
115
+ def conv_json(extype):
116
+ col_dict={'title': 'title',
117
+ 'rights_date_used': 'Year',
118
+ }
119
+
120
+ data = json.load(uploaded_file)
121
+ hathifile = data['gathers']
122
+ keywords = pd.DataFrame.from_records(hathifile)
123
+
124
+ keywords = sf.htrc(keywords)
125
+ keywords.rename(columns=col_dict,inplace=True)
126
+ return keywords
127
+
128
+ def conv_pub(extype):
129
+ if (get_ext(extype)).endswith('.tar.gz'):
130
+ bytedata = extype.read()
131
+ keywords = sf.readPub(bytedata)
132
+ elif (get_ext(extype)).endswith('.xml'):
133
+ bytedata = extype.read()
134
+ keywords = sf.readxml(bytedata)
135
+ return keywords
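The PMID check in conv_txt above consumes the upload stream before deciding which parser to use, which is why each branch rewinds with seek(0). A minimal sketch of that sniff-and-rewind pattern with an in-memory file (the MEDLINE lines below are toy data, and the sf.medline call is only indicated):

from io import BytesIO

buf = BytesIO(b"PMID- 12345678\nTI  - An example title\n")  # toy MEDLINE-style export
if "PMID" in buf.read().decode():
    buf.seek(0)                  # rewind so the parser sees the file from the start
    # papers = sf.medline(buf)   # as in conv_txt above
else:
    buf.seek(0)
    # papers = pd.read_csv(buf, sep='\t')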
136
 
137
  # Helper Functions
138
  @st.cache_data(ttl=3600)
 
156
  df = upload(extype)
157
  elif extype.endswith('.txt'):
158
  df = conv_txt(extype)
159
+ elif extype.endswith('.json'):
160
+ df = conv_json(extype)
161
+ elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
162
+ df = conv_pub(uploaded_file)
163
 
164
  df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
165
  df = df.dropna(subset=['Year'])
 
186
 
187
  # Preprocess text
188
  df['processed'] = df.apply(lambda row: preprocess_text(f"{row.get(col_name, '')}"), axis=1)
189
+
190
+ ngram_range = (1, xgram)
191
 
192
  # Vectorize processed text
193
  if count_method == "Document Frequency":
194
+ vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split(), binary=True, ngram_range=ngram_range)
195
  else:
196
+ vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split(), ngram_range=ngram_range)
197
  X = vectorizer.fit_transform(df['processed'].tolist())
198
 
199
  # Create DataFrame from the Document-Term Matrix (DTM)
200
  dtm = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=df['Year'].values)
201
  yearly_term_frequency = dtm.groupby(dtm.index).sum()
202
 
203
+ # excluded & included words
204
+ if exc_inc == "Words to exclude":
205
+ excluded_words = [word.strip() for word in words_input.split(',')]
206
+ filtered_words = [word for word in yearly_term_frequency.columns if word not in excluded_words]
207
 
208
+ elif exc_inc == "Focus on these words":
209
+ included_words = [word.strip() for word in words_input.split(',')]
210
+ filtered_words = [word for word in yearly_term_frequency.columns if word in included_words]
211
+
212
  top_words = yearly_term_frequency[filtered_words].sum().nlargest(top_n).index.tolist()
213
 
214
  return yearly_term_frequency, top_words
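With binary=True the vectorizer records at most one hit per document, so summing by year yields document frequencies, while the default counts every occurrence (term frequency). A minimal sketch with toy data (the abstracts and years below are illustrative only):

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

docs = ["burst detection network network", "topic modeling network"]  # toy preprocessed texts
years = [2020, 2020]

tf = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split())                   # term frequency
df_bin = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split(), binary=True)  # document frequency

tf_dtm = pd.DataFrame(tf.fit_transform(docs).toarray(), columns=tf.get_feature_names_out(), index=years)
bin_dtm = pd.DataFrame(df_bin.fit_transform(docs).toarray(), columns=df_bin.get_feature_names_out(), index=years)

print(tf_dtm.groupby(level=0).sum()["network"])   # 2020 -> 3 (every occurrence counted)
print(bin_dtm.groupby(level=0).sum()["network"])  # 2020 -> 2 (one per document)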
 
264
  return df.to_csv().encode("utf-8")
265
 
266
  @st.cache_data(ttl=3600)
267
+ def scattervis(bursts, freq_data, top_n):
268
+ freq_data = freq_data.reset_index()
269
  freq_data.rename(columns={"index": "Year"}, inplace=True)
270
+
271
  freq_data_melted = freq_data.melt(id_vars=["Year"], var_name="Category", value_name="Value")
272
  freq_data_melted = freq_data_melted[freq_data_melted["Value"] > 0]
 
273
 
274
+ wordlist = freq_data_melted["Category"].unique()
275
  years = freq_data["Year"].tolist()
276
+
277
  bursts["begin"] = bursts["begin"].apply(lambda x: years[min(x, len(years) - 1)] if x < len(years) else None)
278
  bursts["end"] = bursts["end"].apply(lambda x: years[min(x, len(years) - 1)] if x < len(years) else None)
279
+
280
  burst_points = []
 
281
  for _, row in bursts.iterrows():
282
  for year in range(row["begin"], row["end"] + 1):
283
  burst_points.append((year, row["label"], row["weight"]))
 
284
  burst_points_df = pd.DataFrame(burst_points, columns=["Year", "Category", "Weight"])
285
+
286
+ min_year = min(years)
287
+ max_year = max(years)
288
+ n_years = max_year - min_year + 1
289
+ n_labels = len(wordlist)
290
+
291
+ label_spacing = 50
292
+ year_spacing = 60
293
+
294
+ plot_height = n_labels * label_spacing + 100
295
+ plot_width = n_years * year_spacing + 150
296
+
297
  fig = go.Figure()
298
+
299
  # scatter trace for burst points
300
  fig.add_trace(go.Scatter(
301
  x=burst_points_df["Year"],
 
303
  mode='markers',
304
  marker=dict(
305
  symbol='square',
306
+ size=40,
307
  color='red',
308
+ opacity=0.5
309
+ ),
310
  hoverinfo='text',
311
  text=burst_points_df["Weight"],
312
  showlegend=False
313
  ))
314
+
315
  # scatter trace for freq_data
316
  fig.add_trace(go.Scatter(
317
  x=freq_data_melted["Year"],
 
322
  size=30,
323
  color=freq_data_melted["Value"],
324
  colorscale='Blues',
325
+ showscale=False
326
+ ),
327
  text=freq_data_melted["Value"],
328
  textposition="middle center",
329
  textfont=dict(
330
  size=16,
331
+ color=['white' if value > freq_data_melted["Value"].max()/2 else 'black'
332
+ for value in freq_data_melted["Value"]]
333
+ )
334
  ))
335
+
336
+ # Layout
 
 
337
  fig.update_layout(
338
+ xaxis=dict(
339
+ tickmode='linear',
340
+ dtick=1,
341
+ range=[min_year - 1, max_year + 1],
342
+ tickfont=dict(size=16),
343
+ automargin=True,
344
+ showgrid=False,
345
+ zeroline=False
346
+ ),
347
+ yaxis=dict(
348
+ tickvals=wordlist,
349
+ ticktext=wordlist,
350
+ tickmode='array',
351
+ tickfont=dict(size=16),
352
+ automargin=True,
353
+ showgrid=False,
354
+ zeroline=False
355
+ ),
356
+ plot_bgcolor='white',
357
+ paper_bgcolor='white',
358
  showlegend=False,
359
+ margin=dict(l=20, r=20, t=20, b=20),
360
+ height=plot_height,
361
+ width=plot_width,
362
  autosize=False
363
  )
364
 
 
377
  line_shape='linear',
378
  hoverinfo='text',
379
  hovertext=[f"Year: {index}<br>Frequency: {freq}" for index, freq in zip(freq_data.index, freq_data[column])],
380
+ #text=freq_data[column],
381
  textposition='top center'
382
  ), row=row, col=col)
 
383
  # Add area charts
384
  for _, row_data in bursts[bursts['label'] == column].iterrows():
385
  x_values = freq_data.index[row_data['begin']:row_data['end']+1]
386
  y_values = freq_data[column][row_data['begin']:row_data['end']+1]
 
387
  #middle_y = sum(y_values) / len(y_values)
388
  y_post = min(freq_data[column]) + 1 if running_total == "Running total" else sum(y_values) / len(y_values)
389
  x_offset = 0.1
 
412
  textangle=270,
413
  row=row, col=col
414
  )
415
+
416
+ # Add labels for values only in bursts
417
+ fig.add_trace(go.Scatter(
418
+ x=x_values, y=y_values, mode='lines+markers+text', name=column,
419
+ line_shape='linear',
420
+ hoverinfo='text',
421
+ hovertext=[f"Year: {index}<br>Frequency: {freq}" for index, freq in zip(freq_data.index, freq_data[column])],
422
+ text=y_values,
423
+ textposition='top center'
424
+ ), row=row, col=col)
425
+ print(freq_data[column])
426
+
427
+
428
  col += 1
429
  if col > 2:
430
  col = 1
 
447
  csv2 = convert_df(bursts)
448
  return csv1, csv2
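The two byte strings returned here are intended for Streamlit download buttons; a minimal sketch of that wiring with toy data (button label and file name below are illustrative):

import pandas as pd
import streamlit as st

toy_freq = pd.DataFrame({"network": [3, 5]}, index=[2019, 2020])  # toy yearly frequencies
csv_bytes = toy_freq.to_csv().encode("utf-8")                     # same idea as convert_df above
st.download_button("Press to download top words 👈", csv_bytes, "freq_data.csv", "text/csv")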
449
 
450
+ uploaded_file = st.file_uploader('', type=['csv', 'txt','json','tar.gz','xml'], on_change=reset_all)
451
 
452
  if uploaded_file is not None:
453
  try:
454
+ c1, c2, c3 = st.columns([3,3,4])
455
  top_n = c1.number_input("Number of top words to analyze", min_value=5, value=10, step=1, on_change=reset_all)
456
  viz_selected = c2.selectbox("Option for visualization",
457
+ ("Line graph", "Heatmap"), on_change=reset_all)
458
  running_total = c3.selectbox("Calculation method",
459
  ("Running total", "By occurrences each year"), on_change=reset_all)
460
+ count_method = c1.selectbox("Count by",
461
  ("Term Frequency", "Document Frequency"), on_change=reset_all)
462
 
 
463
  df, coldf, MIN, MAX, GAP = load_data(uploaded_file)
464
+ col_name = c2.selectbox("Select column to analyze",
465
  (coldf), on_change=reset_all)
466
+ xgram = c3.selectbox("N-grams", ("1", "2", "3"), on_change=reset_all)
467
+ xgram = int(xgram)
468
+
469
+ st.divider()
470
+ d1, d2 = st.columns([3,7])
471
+ exc_inc = d1.radio("Select to exclude or focus on specific words", ["Words to exclude","Focus on these words"], horizontal=True, on_change=reset_all)
472
+ words_input = d2.text_input("Words to exclude or focus on (comma-separated)", on_change=reset_all)
473
 
474
  if (GAP != 0):
475
  YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX), on_change=reset_all)
476
  else:
477
+ c1.write('You only have data in', MAX)
478
  sys.exit(1)
479
 
480
  yearly_term_frequency, top_words = clean_data(df)
481
 
482
  bursts, freq_data, num_unique_labels, num_rows = apply_burst_detection(top_words, yearly_term_frequency)
483
 
484
+ tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
485
 
486
  with tab1:
487
  if bursts.empty:
 
497
  linegraph(bursts, freq_data)
498
 
499
  elif viz_selected == "Heatmap":
500
+ scattervis(bursts, freq_data, top_n)
501
 
502
  csv1, csv2 = download_result(freq_data, bursts)
503
  e1, e2, e3 = st.columns(3)
 
527
  st.markdown('**Li, M., Zheng, Z., & Yi, Q. (2024). The landscape of hot topics and research frontiers in Kawasaki disease: scientometric analysis. Heliyon, 10(8), e29680–e29680.** https://doi.org/10.1016/j.heliyon.2024.e29680')
528
  st.markdown('**Domicián Máté, Ni Made Estiyanti and Novotny, A. (2024) ‘How to support innovative small firms? Bibliometric analysis and visualization of start-up incubation’, Journal of Innovation and Entrepreneurship, 13(1).** https://doi.org/10.1186/s13731-024-00361-z')
529
  st.markdown('**Lamba, M., Madhusudhan, M. (2022). Burst Detection. In: Text Mining for Information Professionals. Springer, Cham.** https://doi.org/10.1007/978-3-030-85085-2_6')
530
+ st.markdown('**Santosa, F. A. (2025). Artificial Intelligence in Library Studies: A Textual Analysis. JLIS.It, 16(1).** https://doi.org/10.36253/jlis.it-626')
531
+
532
+ with tab4:
533
+ st.subheader(':blue[Burst Detection]', anchor=False)
534
+ st.button('📊 Download high resolution image', on_click=None)
535
+ st.text("Click download button.")
536
+
537
+ st.divider()
538
+ st.subheader(':blue[Top words]', anchor=False)
539
+ st.button('👉 Press to download list of top words', on_click=None)
540
+ st.text("Click download button.")
541
+
542
+ st.divider()
543
+ st.subheader(':blue[Burst]', anchor=False)
544
+ st.button('👉 Press to download the list of detected bursts', on_click=None)
545
+ st.text("Click download button.")
546
 
547
+ except Exception as e:
548
+ st.error("Please ensure that your file or settings are correct. If you think there is a mistake, feel free to reach out to us!", icon="🚨")
549
+ st.stop()
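One detail worth noting in scattervis above: the burst detector reports begin and end as positions into the sorted year index, and the two apply calls translate those positions into calendar years before plotting. A small standalone sketch of that mapping (names and numbers are toy values):

import pandas as pd

years = [2018, 2019, 2020, 2021]   # index of yearly_term_frequency
bursts = pd.DataFrame({"label": ["network"], "begin": [1], "end": [3], "weight": [2.5]})

# In-range positions become calendar years; out-of-range positions become None.
to_year = lambda pos: years[min(pos, len(years) - 1)] if pos < len(years) else None
bursts["begin"] = bursts["begin"].apply(to_year)   # 1 -> 2019
bursts["end"] = bursts["end"].apply(to_year)       # 3 -> 2021
print(bursts)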
pages/6 Keywords Stem.py CHANGED
@@ -1,238 +1,298 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import numpy as np
4
- import re
5
- import nltk
6
- nltk.download('wordnet')
7
- from nltk.stem import WordNetLemmatizer
8
- nltk.download('stopwords')
9
- from nltk.corpus import stopwords
10
- from pprint import pprint
11
- import pickle
12
- import streamlit.components.v1 as components
13
- from io import StringIO
14
- from nltk.stem.snowball import SnowballStemmer
15
- import csv
16
- import sys
17
-
18
- #===config===
19
- st.set_page_config(
20
- page_title="Coconut",
21
- page_icon="🥥",
22
- layout="wide",
23
- initial_sidebar_state="collapsed"
24
- )
25
-
26
- hide_streamlit_style = """
27
- <style>
28
- #MainMenu
29
- {visibility: hidden;}
30
- footer {visibility: hidden;}
31
- [data-testid="collapsedControl"] {display: none}
32
- </style>
33
- """
34
- st.markdown(hide_streamlit_style, unsafe_allow_html=True)
35
-
36
- with st.popover("🔗 Menu"):
37
- st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
38
- st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
39
- st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
40
- st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
41
- st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
42
- st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
43
- st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
44
-
45
-
46
- st.header("Keywords Stem", anchor=False)
47
- st.subheader('Put your file here...', anchor=False)
48
-
49
- def reset_data():
50
- st.cache_data.clear()
51
-
52
- #===check filetype===
53
- @st.cache_data(ttl=3600)
54
- def get_ext(extype):
55
- extype = uploaded_file.name
56
- return extype
57
-
58
- #===upload===
59
- @st.cache_data(ttl=3600)
60
- def upload(extype):
61
- keywords = pd.read_csv(uploaded_file)
62
- return keywords
63
-
64
- @st.cache_data(ttl=3600)
65
- def conv_txt(extype):
66
- col_dict = {'TI': 'Title',
67
- 'SO': 'Source title',
68
- 'DE': 'Author Keywords',
69
- 'ID': 'Keywords Plus'}
70
- keywords = pd.read_csv(uploaded_file, sep='\t', lineterminator='\r')
71
- keywords.rename(columns=col_dict, inplace=True)
72
- return keywords
73
-
74
- @st.cache_data(ttl=3600)
75
- def rev_conv_txt(extype):
76
- col_dict_rev = {'Title': 'TI',
77
- 'Source title': 'SO',
78
- 'Author Keywords': 'DE',
79
- 'Keywords Plus': 'ID'}
80
- keywords.rename(columns=col_dict_rev, inplace=True)
81
- return keywords
82
-
83
- @st.cache_data(ttl=3600)
84
- def get_data(extype):
85
- list_of_column_key = list(keywords.columns)
86
- list_of_column_key = [k for k in list_of_column_key if 'Keyword' in k]
87
- return list_of_column_key
88
-
89
- uploaded_file = st.file_uploader('', type=['csv','txt'], on_change=reset_data)
90
-
91
- if uploaded_file is not None:
92
- try:
93
- extype = get_ext(uploaded_file)
94
- if extype.endswith('.csv'):
95
- keywords = upload(extype)
96
-
97
- elif extype.endswith('.txt'):
98
- keywords = conv_txt(extype)
99
-
100
- list_of_column_key = get_data(extype)
101
-
102
- col1, col2 = st.columns(2)
103
- with col1:
104
- method = st.selectbox(
105
- 'Choose method',
106
- ('Lemmatization', 'Stemming'), on_change=reset_data)
107
- with col2:
108
- keyword = st.selectbox(
109
- 'Choose column',
110
- (list_of_column_key), on_change=reset_data)
111
-
112
- @st.cache_data(ttl=3600)
113
- def clean_keyword(extype):
114
- global keyword, keywords
115
- try:
116
- key = keywords[keyword]
117
- except KeyError:
118
- st.error('Error: Please check your Author/Index Keywords column.')
119
- sys.exit(1)
120
- keywords = keywords.replace(np.nan, '', regex=True)
121
- keywords[keyword] = keywords[keyword].astype(str)
122
- keywords[keyword] = keywords[keyword].map(lambda x: re.sub('-', ' ', x))
123
- keywords[keyword] = keywords[keyword].map(lambda x: re.sub('; ', ' ; ', x))
124
- keywords[keyword] = keywords[keyword].map(lambda x: x.lower())
125
-
126
- #===Keywords list===
127
- key = key.dropna()
128
- key = pd.concat([key.str.split('; ', expand=True)], axis=1)
129
- key = pd.Series(np.ravel(key)).dropna().drop_duplicates().sort_values().reset_index()
130
- key[0] = key[0].map(lambda x: re.sub('-', ' ', x))
131
- key['new']=key[0].map(lambda x: x.lower())
132
-
133
- return keywords, key
134
-
135
- #===stem/lem===
136
- @st.cache_data(ttl=3600)
137
- def Lemmatization(extype):
138
- lemmatizer = WordNetLemmatizer()
139
- def lemmatize_words(text):
140
- words = text.split()
141
- words = [lemmatizer.lemmatize(word) for word in words]
142
- return ' '.join(words)
143
- keywords[keyword] = keywords[keyword].apply(lemmatize_words)
144
- key['new'] = key['new'].apply(lemmatize_words)
145
- keywords[keyword] = keywords[keyword].map(lambda x: re.sub(' ; ', '; ', x))
146
- return keywords, key
147
-
148
- @st.cache_data(ttl=3600)
149
- def Stemming(extype):
150
- stemmer = SnowballStemmer("english")
151
- def stem_words(text):
152
- words = text.split()
153
- words = [stemmer.stem(word) for word in words]
154
- return ' '.join(words)
155
- keywords[keyword] = keywords[keyword].apply(stem_words)
156
- key['new'] = key['new'].apply(stem_words)
157
- keywords[keyword] = keywords[keyword].map(lambda x: re.sub(' ; ', '; ', x))
158
- return keywords, key
159
-
160
- keywords, key = clean_keyword(extype)
161
-
162
- if method is 'Lemmatization':
163
- keywords, key = Lemmatization(extype)
164
- else:
165
- keywords, key = Stemming(extype)
166
-
167
- st.write('Congratulations! 🤩 You choose',keyword ,'with',method,'method. Now, you can easily download the result by clicking the button below')
168
- st.divider()
169
-
170
- #===show & download csv===
171
- tab1, tab2, tab3, tab4 = st.tabs(["📥 Result", "📥 List of Keywords", "📃 Reference", "📃 Recommended Reading"])
172
-
173
- with tab1:
174
- st.dataframe(keywords, use_container_width=True, hide_index=True)
175
- @st.cache_data(ttl=3600)
176
- def convert_df(extype):
177
- return keywords.to_csv(index=False).encode('utf-8')
178
-
179
- @st.cache_data(ttl=3600)
180
- def convert_txt(extype):
181
- return keywords.to_csv(index=False, sep='\t', lineterminator='\r').encode('utf-8')
182
-
183
- if extype.endswith('.csv'):
184
- csv = convert_df(extype)
185
- st.download_button(
186
- "Press to download result 👈",
187
- csv,
188
- "result.csv",
189
- "text/csv")
190
-
191
- elif extype.endswith('.txt'):
192
- keywords = rev_conv_txt(extype)
193
- txt = convert_txt(extype)
194
- st.download_button(
195
- "Press to download result 👈",
196
- txt,
197
- "result.txt",
198
- "text/csv")
199
-
200
- with tab2:
201
- @st.cache_data(ttl=3600)
202
- def table_keyword(extype):
203
- keytab = key.drop(['index'], axis=1).rename(columns={0: 'label'})
204
- return keytab
205
-
206
- #===coloring the same keywords===
207
- @st.cache_data(ttl=3600)
208
- def highlight_cells(value):
209
- if keytab['new'].duplicated(keep=False).any() and keytab['new'].duplicated(keep=False)[keytab['new'] == value].any():
210
- return 'background-color: yellow'
211
- return ''
212
- keytab = table_keyword(extype)
213
- st.dataframe(keytab.style.applymap(highlight_cells, subset=['new']), use_container_width=True, hide_index=True)
214
-
215
- @st.cache_data(ttl=3600)
216
- def convert_dfs(extype):
217
- return key.to_csv(index=False).encode('utf-8')
218
-
219
- csv = convert_dfs(extype)
220
-
221
- st.download_button(
222
- "Press to download keywords 👈",
223
- csv,
224
- "keywords.csv",
225
- "text/csv")
226
-
227
- with tab3:
228
- st.markdown('**Santosa, F. A. (2023). Prior steps into knowledge mapping: Text mining application and comparison. Issues in Science and Technology Librarianship, 102.** https://doi.org/10.29173/istl2736')
229
-
230
- with tab4:
231
- st.markdown('**Beri, A. (2021, January 27). Stemming vs Lemmatization. Medium.** https://towardsdatascience.com/stemming-vs-lemmatization-2daddabcb221')
232
- st.markdown('**Khyani, D., Siddhartha B S, Niveditha N M, &amp; Divya B M. (2020). An Interpretation of Lemmatization and Stemming in Natural Language Processing. Journal of University of Shanghai for Science and Technology , 22(10), 350–357.** https://jusst.org/an-interpretation-of-lemmatization-and-stemming-in-natural-language-processing/')
233
- st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Text Pre-Processing. Text Mining for Information Professionals, 79–103.** https://doi.org/10.1007/978-3-030-85085-2_3')
234
-
235
-
236
- except:
237
- st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
238
- st.stop()
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import re
5
+ import nltk
6
+ nltk.download('wordnet')
7
+ from nltk.stem import WordNetLemmatizer
8
+ nltk.download('stopwords')
9
+ from nltk.corpus import stopwords
10
+ from pprint import pprint
11
+ import pickle
12
+ import streamlit.components.v1 as components
13
+ from io import StringIO
14
+ from nltk.stem.snowball import SnowballStemmer
15
+ import csv
16
+ import sys
17
+ import json
18
+ from tools import sourceformat as sf
19
+
20
+
21
+ #===config===
22
+ st.set_page_config(
23
+ page_title="Coconut",
24
+ page_icon="🥥",
25
+ layout="wide",
26
+ initial_sidebar_state="collapsed"
27
+ )
28
+
29
+ hide_streamlit_style = """
30
+ <style>
31
+ #MainMenu
32
+ {visibility: hidden;}
33
+ footer {visibility: hidden;}
34
+ [data-testid="collapsedControl"] {display: none}
35
+ </style>
36
+ """
37
+ st.markdown(hide_streamlit_style, unsafe_allow_html=True)
38
+
39
+ with st.popover("🔗 Menu"):
40
+ st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
41
+ st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
42
+ st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
43
+ st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
44
+ st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
45
+ st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
46
+ st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
47
+ st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
48
+
49
+
50
+ st.header("Keywords Stem", anchor=False)
51
+ st.subheader('Put your file here...', anchor=False)
52
+
53
+ def reset_data():
54
+ st.cache_data.clear()
55
+
56
+ #===check filetype===
57
+ @st.cache_data(ttl=3600)
58
+ def get_ext(extype):
59
+ extype = uploaded_file.name
60
+ return extype
61
+
62
+ #===upload===
63
+ @st.cache_data(ttl=3600)
64
+ def upload(extype):
65
+ keywords = pd.read_csv(uploaded_file)
66
+
67
+ if "About the data" in keywords.columns[0]:
68
+ keywords = sf.dim(keywords)
69
+ col_dict = {'MeSH terms': 'Keywords',
70
+ 'PubYear': 'Year',
71
+ 'Times cited': 'Cited by',
72
+ 'Publication Type': 'Document Type'
73
+ }
74
+ keywords.rename(columns=col_dict, inplace=True)
75
+
76
+ return keywords
77
+ @st.cache_data(ttl=3600)
78
+ def conv_txt(extype):
79
+ if("PMID" in (uploaded_file.read()).decode()):
80
+ uploaded_file.seek(0)
81
+ papers = sf.medline(uploaded_file)
82
+ print(papers)
83
+ return papers
84
+ col_dict = {'TI': 'Title',
85
+ 'SO': 'Source title',
86
+ 'DE': 'Author Keywords',
87
+ 'DT': 'Document Type',
88
+ 'AB': 'Abstract',
89
+ 'TC': 'Cited by',
90
+ 'PY': 'Year',
91
+ 'ID': 'Keywords Plus',
92
+ 'rights_date_used': 'Year'}
93
+ uploaded_file.seek(0)
94
+ papers = pd.read_csv(uploaded_file, sep='\t')
95
+ if("htid" in papers.columns):
96
+ papers = sf.htrc(papers)
97
+ papers.rename(columns=col_dict, inplace=True)
98
+ print(papers)
99
+ return papers
100
+
101
+ @st.cache_data(ttl=3600)
102
+ def rev_conv_txt(extype):
103
+ col_dict_rev = {'Title': 'TI',
104
+ 'Source title': 'SO',
105
+ 'Author Keywords': 'DE',
106
+ 'Keywords Plus': 'ID'}
107
+ keywords.rename(columns=col_dict_rev, inplace=True)
108
+ return keywords
109
+
110
+ @st.cache_data(ttl=3600)
111
+ def conv_json(extype):
112
+ col_dict={'title': 'title',
113
+ 'rights_date_used': 'Year',
114
+ }
115
+
116
+ data = json.load(uploaded_file)
117
+ hathifile = data['gathers']
118
+ keywords = pd.DataFrame.from_records(hathifile)
119
+
120
+ keywords = sf.htrc(keywords)
121
+ keywords.rename(columns=col_dict,inplace=True)
122
+ return keywords
123
+
124
+ def conv_pub(extype):
125
+ if (get_ext(extype)).endswith('.tar.gz'):
126
+ bytedata = extype.read()
127
+ keywords = sf.readPub(bytedata)
128
+ elif (get_ext(extype)).endswith('.xml'):
129
+ bytedata = extype.read()
130
+ keywords = sf.readxml(bytedata)
131
+ return keywords
132
+
133
+ @st.cache_data(ttl=3600)
134
+ def get_data(extype):
135
+ list_of_column_key = list(keywords.columns)
136
+ list_of_column_key = [k for k in list_of_column_key if 'Keyword' in k]
137
+ return list_of_column_key
138
+
139
+ uploaded_file = st.file_uploader('', type=['csv','txt','json','tar.gz','xml'], on_change=reset_data)
140
+
141
+ if uploaded_file is not None:
142
+ try:
143
+ extype = get_ext(uploaded_file)
144
+ if extype.endswith('.csv'):
145
+ keywords = upload(extype)
146
+
147
+ elif extype.endswith('.txt'):
148
+ keywords = conv_txt(extype)
149
+
150
+ elif extype.endswith('.json'):
151
+ keywords = conv_json(extype)
152
+ elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
153
+ keywords = conv_pub(uploaded_file)
154
+
155
+ list_of_column_key = get_data(extype)
156
+
157
+ col1, col2 = st.columns(2)
158
+ with col1:
159
+ method = st.selectbox(
160
+ 'Choose method',
161
+ ('Lemmatization', 'Stemming'), on_change=reset_data)
162
+ with col2:
163
+ keyword = st.selectbox(
164
+ 'Choose column',
165
+ (list_of_column_key), on_change=reset_data)
166
+
167
+ @st.cache_data(ttl=3600)
168
+ def clean_keyword(extype):
169
+ global keyword, keywords
170
+ try:
171
+ key = keywords[keyword]
172
+ except KeyError:
173
+ st.error('Error: Please check your Author/Index Keywords column.')
174
+ sys.exit(1)
175
+ keywords = keywords.replace(np.nan, '', regex=True)
176
+ keywords[keyword] = keywords[keyword].astype(str)
177
+ keywords[keyword] = keywords[keyword].map(lambda x: re.sub('-', ' ', x))
178
+ keywords[keyword] = keywords[keyword].map(lambda x: re.sub('; ', ' ; ', x))
179
+ keywords[keyword] = keywords[keyword].map(lambda x: x.lower())
180
+
181
+ #===Keywords list===
182
+ key = key.dropna()
183
+ key = pd.concat([key.str.split('; ', expand=True)], axis=1)
184
+ key = pd.Series(np.ravel(key)).dropna().drop_duplicates().sort_values().reset_index()
185
+ key[0] = key[0].map(lambda x: re.sub('-', ' ', x))
186
+ key['new']=key[0].map(lambda x: x.lower())
187
+
188
+ return keywords, key
189
+
190
+ #===stem/lem===
191
+ @st.cache_data(ttl=3600)
192
+ def Lemmatization(extype):
193
+ lemmatizer = WordNetLemmatizer()
194
+ def lemmatize_words(text):
195
+ words = text.split()
196
+ words = [lemmatizer.lemmatize(word) for word in words]
197
+ return ' '.join(words)
198
+ keywords[keyword] = keywords[keyword].apply(lemmatize_words)
199
+ key['new'] = key['new'].apply(lemmatize_words)
200
+ keywords[keyword] = keywords[keyword].map(lambda x: re.sub(' ; ', '; ', x))
201
+ return keywords, key
202
+
203
+ @st.cache_data(ttl=3600)
204
+ def Stemming(extype):
205
+ stemmer = SnowballStemmer("english")
206
+ def stem_words(text):
207
+ words = text.split()
208
+ words = [stemmer.stem(word) for word in words]
209
+ return ' '.join(words)
210
+ keywords[keyword] = keywords[keyword].apply(stem_words)
211
+ key['new'] = key['new'].apply(stem_words)
212
+ keywords[keyword] = keywords[keyword].map(lambda x: re.sub(' ; ', '; ', x))
213
+ return keywords, key
214
+
215
+ keywords, key = clean_keyword(extype)
216
+
217
+ if method == 'Lemmatization':
218
+ keywords, key = Lemmatization(extype)
219
+ else:
220
+ keywords, key = Stemming(extype)
221
+
222
+ st.write('Congratulations! 🤩 You chose', keyword, 'with the', method, 'method. Now you can easily download the result by clicking the button below.')
223
+ st.divider()
224
+
225
+ #===show & download csv===
226
+ tab1, tab2, tab3, tab4, tab5 = st.tabs(["📥 Result", "📥 List of Keywords", "📃 Reference", "📃 Recommended Reading", "⬇️ Download Help"])
227
+
228
+ with tab1:
229
+ st.dataframe(keywords, use_container_width=True, hide_index=True)
230
+ @st.cache_data(ttl=3600)
231
+ def convert_df(extype):
232
+ return keywords.to_csv(index=False).encode('utf-8')
233
+
234
+ @st.cache_data(ttl=3600)
235
+ def convert_txt(extype):
236
+ return keywords.to_csv(index=False, sep='\t', lineterminator='\r').encode('utf-8')
237
+
238
+ if extype.endswith('.csv'):
239
+ csv = convert_df(extype)
240
+ st.download_button(
241
+ "Press to download result 👈",
242
+ csv,
243
+ "result.csv",
244
+ "text/csv")
245
+
246
+ elif extype.endswith('.txt'):
247
+ keywords = rev_conv_txt(extype)
248
+ txt = convert_txt(extype)
249
+ st.download_button(
250
+ "Press to download result 👈",
251
+ txt,
252
+ "result.txt",
253
+ "text/csv")
254
+
255
+ with tab2:
256
+ @st.cache_data(ttl=3600)
257
+ def table_keyword(extype):
258
+ keytab = key.drop(['index'], axis=1).rename(columns={0: 'label'})
259
+ return keytab
260
+
261
+ #===coloring the same keywords===
262
+ @st.cache_data(ttl=3600)
263
+ def highlight_cells(value):
264
+ if keytab['new'].duplicated(keep=False).any() and keytab['new'].duplicated(keep=False)[keytab['new'] == value].any():
265
+ return 'background-color: yellow'
266
+ return ''
267
+ keytab = table_keyword(extype)
268
+ st.dataframe(keytab.style.applymap(highlight_cells, subset=['new']), use_container_width=True, hide_index=True)
269
+
270
+ @st.cache_data(ttl=3600)
271
+ def convert_dfs(extype):
272
+ return key.to_csv(index=False).encode('utf-8')
273
+
274
+ csv = convert_dfs(extype)
275
+
276
+ st.download_button(
277
+ "Press to download keywords 👈",
278
+ csv,
279
+ "keywords.csv",
280
+ "text/csv")
281
+
282
+ with tab3:
283
+ st.markdown('**Santosa, F. A. (2023). Prior steps into knowledge mapping: Text mining application and comparison. Issues in Science and Technology Librarianship, 102.** https://doi.org/10.29173/istl2736')
284
+
285
+ with tab4:
286
+ st.markdown('**Beri, A. (2021, January 27). Stemming vs Lemmatization. Medium.** https://towardsdatascience.com/stemming-vs-lemmatization-2daddabcb221')
287
+ st.markdown('**Khyani, D., Siddhartha B S, Niveditha N M, &amp; Divya B M. (2020). An Interpretation of Lemmatization and Stemming in Natural Language Processing. Journal of University of Shanghai for Science and Technology , 22(10), 350–357.** https://jusst.org/an-interpretation-of-lemmatization-and-stemming-in-natural-language-processing/')
288
+ st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Text Pre-Processing. Text Mining for Information Professionals, 79–103.** https://doi.org/10.1007/978-3-030-85085-2_3')
289
+
290
+ with tab5:
291
+ st.text("Download keywords at bottom of table")
292
+ st.divider()
293
+ st.text("Download table")
294
+ st.markdown("![Downloading visualization](https://raw.githubusercontent.com/faizhalas/library-tools/mainimages/downloadtable.png")
295
+ except Exception as e:
296
+ st.write(e)
297
+ st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
298
+ st.stop()
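To make the difference between the two methods on this page concrete, here is a short, self-contained comparison using the same NLTK classes as above (the sample keywords are illustrative):

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english")

for word in ["libraries", "studies", "mining"]:
    print(word, "->", lemmatizer.lemmatize(word), "|", stemmer.stem(word))
# typically: libraries -> library | librari, studies -> study | studi, mining -> mining | mine

Lemmatization keeps dictionary forms, while stemming truncates them, which is why stemmed keywords often collapse onto the same string and show up highlighted in the duplicate check on the List of Keywords tab.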
pages/7 Sentiment Analysis.py ADDED
@@ -0,0 +1,357 @@
 
1
+ #import module
2
+ import streamlit as st
3
+ import streamlit.components.v1 as components
4
+ import pandas as pd
5
+ import re
6
+ import nltk
7
+ import pandas as pd
8
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
9
+ nltk.download('stopwords')
10
+ from nltk.corpus import stopwords
11
+ from nltk.tokenize import word_tokenize
12
+ from nltk.stem import WordNetLemmatizer
13
+ nltk.download('punkt_tab')
14
+ nltk.download('vader_lexicon')
15
+ from textblob import TextBlob
16
+ import os
17
+ import numpy as np
18
+ import plotly.express as px
19
+ import json
20
+ from tools import sourceformat as sf
21
+
22
+ #===config===
23
+ st.set_page_config(
24
+ page_title="Coconut",
25
+ page_icon="🥥",
26
+ layout="wide",
27
+ initial_sidebar_state="collapsed"
28
+ )
29
+
30
+ hide_streamlit_style = """
31
+ <style>
32
+ #MainMenu
33
+ {visibility: hidden;}
34
+ footer {visibility: hidden;}
35
+ [data-testid="collapsedControl"] {display: none}
36
+ </style>
37
+ """
38
+ st.markdown(hide_streamlit_style, unsafe_allow_html=True)
39
+
40
+ with st.popover("🔗 Menu"):
41
+ st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
42
+ st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
43
+ st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
44
+ st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
45
+ st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
46
+ st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
47
+ st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
48
+ st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
49
+
50
+ st.header("Sentiment Analysis", anchor=False)
51
+ st.subheader('Put your file here...', anchor=False)
52
+
53
+ #========unique id========
54
+ @st.cache_resource(ttl=3600)
55
+ def create_list():
56
+ l = [1, 2, 3]
57
+ return l
58
+
59
+ l = create_list()
60
+ first_list_value = l[0]
61
+ l[0] = first_list_value + 1
62
+ uID = str(l[0])
63
+
64
+ @st.cache_data(ttl=3600)
65
+ def get_ext(uploaded_file):
66
+ extype = uID+uploaded_file.name
67
+ return extype
68
+
69
+ #===clear cache===
70
+
71
+
72
+ def reset_all():
73
+ st.cache_data.clear()
74
+
75
+ #===avoiding deadlock===
76
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
77
+
78
+ #===upload file===
79
+ @st.cache_data(ttl=3600)
80
+ def upload(file):
81
+ papers = pd.read_csv(uploaded_file)
82
+ if "About the data" in papers.columns[0]:
83
+ papers = sf.dim(papers)
84
+ col_dict = {'MeSH terms': 'Keywords',
85
+ 'PubYear': 'Year',
86
+ 'Times cited': 'Cited by',
87
+ 'Publication Type': 'Document Type'
88
+ }
89
+ papers.rename(columns=col_dict, inplace=True)
90
+ return papers
91
+
92
+ @st.cache_data(ttl=3600)
93
+ def conv_txt(extype):
94
+ if("PMID" in (uploaded_file.read()).decode()):
95
+ uploaded_file.seek(0)
96
+ papers = sf.medline(uploaded_file)
97
+ print(papers)
98
+ return papers
99
+ col_dict = {'TI': 'Title',
100
+ 'SO': 'Source title',
101
+ 'DE': 'Author Keywords',
102
+ 'DT': 'Document Type',
103
+ 'AB': 'Abstract',
104
+ 'TC': 'Cited by',
105
+ 'PY': 'Year',
106
+ 'ID': 'Keywords Plus',
107
+ 'rights_date_used': 'Year'}
108
+ uploaded_file.seek(0)
109
+ papers = pd.read_csv(uploaded_file, sep='\t')
110
+ if("htid" in papers.columns):
111
+ papers = sf.htrc(papers)
112
+ papers.rename(columns=col_dict, inplace=True)
113
+ print(papers)
114
+ return papers
115
+
116
+
117
+ @st.cache_data(ttl=3600)
118
+ def conv_json(extype):
119
+ col_dict={'title': 'title',
120
+ 'rights_date_used': 'Year',
121
+ }
122
+
123
+ data = json.load(uploaded_file)
124
+ hathifile = data['gathers']
125
+ keywords = pd.DataFrame.from_records(hathifile)
126
+
127
+ keywords = sf.htrc(keywords)
128
+ keywords.rename(columns=col_dict,inplace=True)
129
+ return keywords
130
+
131
+ @st.cache_resource(ttl=3600)
132
+ def conv_pub(extype):
133
+ if (get_ext(extype)).endswith('.tar.gz'):
134
+ bytedata = extype.read()
135
+ keywords = sf.readPub(bytedata)
136
+ elif (get_ext(extype)).endswith('.xml'):
137
+ bytedata = extype.read()
138
+ keywords = sf.readxml(bytedata)
139
+ return keywords
140
+
141
+ #===Read data===
142
+ uploaded_file = st.file_uploader('', type=['csv', 'txt','json','tar.gz', 'xml'], on_change=reset_all)
143
+
144
+ if uploaded_file is not None:
145
+ try:
146
+ extype = get_ext(uploaded_file)
147
+
148
+ if extype.endswith('.csv'):
149
+ papers = upload(extype)
150
+ elif extype.endswith('.txt'):
151
+ papers = conv_txt(extype)
152
+
153
+ elif extype.endswith('.json'):
154
+ papers = conv_json(extype)
155
+ elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
156
+ papers = conv_pub(uploaded_file)
157
+
158
+ coldf = sorted(papers.select_dtypes(include=['object']).columns.tolist())
159
+
160
+ c1, c2 = st.columns(2)
161
+ ColCho = c1.selectbox(
162
+ 'Choose column',
163
+ (coldf), on_change=reset_all)
164
+ method = c2.selectbox(
165
+ 'Choose method',[
166
+ 'TextBlob','NLTKvader']
167
+ )
168
+ words_to_remove = c1.text_input("Remove specific words. Separate words by semicolons (;)")
169
+ wordcount = c2.number_input(label = "Words displayed", min_value = 1, step = 1, value=5)-1
170
+ rem_copyright = c1.toggle('Remove copyright statement', value=True, on_change=reset_all)
171
+ rem_punc = c2.toggle('Remove punctuation', value=True, on_change=reset_all)
172
+
173
+ #===clean csv===
174
+ @st.cache_data(ttl=3600, show_spinner=False)
175
+ def clean_csv(extype):
176
+ paper = papers.dropna(subset=[ColCho])
177
+
178
+ #===mapping===
179
+ paper['Abstract_pre'] = paper[ColCho].map(lambda x: x.lower())
180
+ if rem_punc:
181
+ paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: re.sub('[,:;\.!-?•=]', ' ', x))
182
+ paper['Abstract_pre'] = paper['Abstract_pre'].str.replace('\u201c|\u201d', '', regex=True)
183
+ if rem_copyright:
184
+ paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: re.sub('©.*', '', x))
185
+
186
+ #===stopword removal===
187
+ stop = stopwords.words('english')
188
+ paper[ColCho] = paper['Abstract_pre'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
189
+
190
+ words_rmv = [word.strip() for word in words_to_remove.split(";")]
191
+ remove_dict = {word: None for word in words_rmv}
192
+
193
+ @st.cache_resource(ttl=3600)
194
+ def remove_words(text):
195
+ words = text.split()
196
+ cleaned_words = [word for word in words if word not in remove_dict]
197
+ return ' '.join(cleaned_words)
198
+
199
+ paper['Sentences__'] = paper['Abstract_pre'].map(remove_words)
200
+
201
+ return paper
202
+ paper=clean_csv(extype)
203
+
204
+ if method == 'NLTKvader':
205
+ analyzer = SentimentIntensityAnalyzer()
206
+
207
+ @st.cache_resource(ttl=3600)
208
+ def get_sentiment(text):
209
+ score = analyzer.polarity_scores(text)
210
+ return score
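polarity_scores returns a dict with neg, neu, and pos proportions plus a normalized compound score between -1 and 1; those four keys are what the Scores column is unpacked into below. A quick standalone check (the sentence is illustrative):

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
print(analyzer.polarity_scores("open access is a great step for libraries"))
# roughly {'neg': 0.0, 'neu': ..., 'pos': ..., 'compound': > 0}, i.e. positive overall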
211
+
212
+ tab1, tab2, tab3, tab4 = st.tabs(["📈 Result", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
213
+ with tab1:
214
+
215
+ paper['Scores'] = paper['Sentences__'].apply(get_sentiment)
216
+
217
+ scoreframe = pd.DataFrame()
218
+
219
+ scoreframe['Phrase'] = pd.Series(paper['Sentences__'])
220
+
221
+ scoreframe[['Negativity','Neutrality','Positivity','Compound']] = pd.DataFrame.from_records(paper['Scores'])
222
+
223
+ scoreframe = scoreframe.groupby(scoreframe.columns.tolist(),as_index=False).size()
224
+
225
+ scoreframe = scoreframe.truncate(after = wordcount)
226
+
227
+ with st.expander("Sentence and Results"):
228
+ finalframe = pd.DataFrame()
229
+ finalframe['Sentence'] = scoreframe['Phrase']
230
+ finalframe[['Negativity','Neutrality','Positivity','Compound']] = scoreframe[['Negativity','Neutrality','Positivity','Compound']]
231
+ finalframe[['Count']] = scoreframe[['size']]
232
+
233
+ st.dataframe(finalframe, use_container_width=True)
234
+
235
+ with tab2:
236
+ st.markdown('**Hutto, C. and Gilbert, E. (2014) ‘VADER: A Parsimonious Rule-Based Model for Sentiment Analysis of Social Media Text’, Proceedings of the International AAAI Conference on Web and Social Media, 8(1), pp. 216–225.** https://doi.org/10.1609/icwsm.v8i1.14550')
237
+
238
+ with tab3:
239
+ st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Sentiment Analysis. Text Mining for Information Professionals, 191–211.** https://doi.org/10.1007/978-3-030-85085-2_7')
240
+
241
+ with tab4:
242
+ st.subheader(':blue[CSV Results]', anchor=False)
243
+ st.text("Click Download button")
244
+ st.markdown("![Downloading results](https://raw.githubusercontent.com/faizhalas/library-tools/main/images/sentitable.png)")
245
+
246
+ elif(method == 'TextBlob'):
247
+
248
+ @st.cache_resource(ttl=3600)
249
+ def get_sentimentb(text):
250
+ line = TextBlob(text)
251
+ return line.sentiment
252
+
253
+ @st.cache_resource(ttl=3600)
254
+ def get_assessments(frame):
255
+ text = TextBlob(str(frame))
256
+
257
+ polar, subject, assessment = text.sentiment_assessments
258
+
259
+ try:
260
+ phrase, phrasepolar, phrasesubject, unknown = assessment[0]
261
+ except: #this only happens if assessment is empty
262
+ phrase, phrasepolar, phrasesubject = "empty", 0, 0
263
+
264
+ return phrase, phrasepolar, phrasesubject
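sentiment_assessments carries the overall polarity and subjectivity plus a list of (words, polarity, subjectivity, label) tuples for the phrases TextBlob recognized, which is why the code above takes the first assessment and falls back to a neutral placeholder when that list is empty. A short standalone illustration (the sentence is illustrative):

from textblob import TextBlob

blob = TextBlob("this is a great library tool")
print(blob.sentiment)                           # roughly Sentiment(polarity=0.8, subjectivity=0.75)
print(blob.sentiment_assessments.assessments)   # roughly [(['great'], 0.8, 0.75, None)]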
265
+
266
+ @st.cache_resource(ttl=3600)
267
+ def mergelist(data):
268
+ return ' '.join(data)
269
+
270
+ @st.cache_resource(ttl=3600)
271
+ def assignscore(data):
272
+ if data>0:
273
+ return "Positive"
274
+ elif data<0:
275
+ return "Negative"
276
+ else:
277
+ return "Neutral"
278
+
279
+ phrases = paper['Sentences__'].apply(get_assessments)
280
+
281
+ phraselist = phrases.to_list()
282
+
283
+ phraseframe = pd.DataFrame(phraselist, columns =["Phrase","Polarity","Subjectivity"])
284
+
285
+ phraseframe["Phrase"] = phraseframe["Phrase"].apply(mergelist)
286
+
287
+ phraseframe = phraseframe.groupby(phraseframe.columns.tolist(),as_index=False).size()
288
+
289
+ phraseframe["Score"] = phraseframe["Polarity"].apply(assignscore)
290
+
291
+ neut = phraseframe.loc[phraseframe['Score']=="Neutral"]
292
+ neut.reset_index(inplace = True)
293
+
294
+ pos = phraseframe.loc[phraseframe['Score']=="Positive"]
295
+ pos.reset_index(inplace = True)
296
+
297
+ neg = phraseframe.loc[phraseframe['Score']=="Negative"]
298
+ neg.reset_index(inplace = True)
299
+
300
+ paper['Sentiment'] = paper['Sentences__'].apply(get_sentimentb)
301
+
302
+ pos.sort_values(by=["size"], inplace = True, ascending = False, ignore_index = True)
303
+ pos = pos.truncate(after = wordcount)
304
+
305
+ neg.sort_values(by=["size"], inplace = True, ascending = False, ignore_index = True)
306
+ neg = neg.truncate(after = wordcount)
307
+
308
+ neut.sort_values(by=["size"], inplace = True, ascending = False, ignore_index = True)
309
+ neut = neut.truncate(after = wordcount)
310
+
311
+ tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
312
+ with tab1:
313
+ #display tables and graphs
314
+
315
+ with st.expander("Positive Sentiment"):
316
+ st.dataframe(pos, use_container_width=True)
317
+ figpos = px.bar(pos, x="Phrase", y="size", labels={"size": "Count", "Phrase": "Word"})
318
+ st.plotly_chart(figpos, use_container_width=True)
319
+
320
+ with st.expander("Negative Sentiment"):
321
+ st.dataframe(neg, use_container_width=True)
322
+ figneg = px.bar(neg, x="Phrase", y="size", labels={"size": "Count", "Phrase": "Word"}, color_discrete_sequence=["#e57d7d"])
323
+ st.plotly_chart(figneg, use_container_width=True)
324
+
325
+ with st.expander("Neutral Sentiment"):
326
+ st.dataframe(neut, use_container_width=True)
327
+ figneut = px.bar(neut, x="Phrase", y="size", labels={"size": "Count", "Phrase": "Word"}, color_discrete_sequence=["#737a72"])
328
+ st.plotly_chart(figneut, use_container_width=True)
329
+
330
+
331
+ with st.expander("Sentence and Results"):
332
+ finalframe = pd.DataFrame()
333
+ finalframe['Sentence'] = paper['Sentences__']
334
+ finalframe[['Polarity','Subjectivity']] = pd.DataFrame(paper['Sentiment'].tolist(), index = paper.index)
335
+
336
+ st.dataframe(finalframe, use_container_width=True)
337
+
338
+ with tab2:
339
+ st.markdown('**Steven, L. et al. (2018) TextBlob: Simplified Text Processing — TextBlob 0.15.2 documentation, Readthedocs.io.** https://textblob.readthedocs.io/en/dev/')
340
+
341
+ with tab3:
342
+ st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Sentiment Analysis. Text Mining for Information Professionals, 191–211.** https://doi.org/10.1007/978-3-030-85085-2_7')
343
+
344
+ with tab4:
345
+ st.subheader(':blue[Sentiment Analysis]', anchor=False)
346
+ st.write("Click the three dots at the top right then select the desired format")
347
+ st.markdown("![Downloading visualization](https://raw.githubusercontent.com/faizhalas/library-tools/main/images/download_sentiment.png)")
348
+ st.divider()
349
+ st.subheader(':blue[CSV Results]', anchor=False)
350
+ st.text("Click Download button")
351
+ st.markdown("![Downloading results](https://raw.githubusercontent.com/faizhalas/library-tools/main/images/sentitable.png)")
352
+
353
+
354
+ except Exception as e:
355
+ st.write(e)
356
+ st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
357
+ st.stop()
pages/8 Shifterator.py ADDED
@@ -0,0 +1,524 @@
 
1
+ import streamlit as st
2
+ import streamlit.components.v1 as components
3
+ import shifterator as sh
4
+ from shifterator import ProportionShift
5
+ import pandas as pd
6
+ import re
7
+ import nltk
8
+ nltk.download('wordnet')
9
+ from nltk.stem import WordNetLemmatizer
10
+ nltk.download('stopwords')
11
+ from nltk.corpus import stopwords
12
+ import time
13
+ import sys
14
+ import json
15
+ from tools import sourceformat as sf
16
+ from collections import Counter
17
+ import io
18
+
19
+ #===config===
20
+ st.set_page_config(
21
+ page_title="Coconut",
22
+ page_icon="🥥",
23
+ layout="wide",
24
+ initial_sidebar_state="collapsed"
25
+ )
26
+
27
+ hide_streamlit_style = """
28
+ <style>
29
+ #MainMenu
30
+ {visibility: hidden;}
31
+ footer {visibility: hidden;}
32
+ [data-testid="collapsedControl"] {display: none}
33
+ </style>
34
+ """
35
+ st.markdown(hide_streamlit_style, unsafe_allow_html=True)
36
+
37
+ with st.popover("🔗 Menu"):
38
+ st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
39
+ st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
40
+ st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
41
+ st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
42
+ st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
43
+ st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
44
+ st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
45
+ st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
46
+ st.page_link("pages/8 Shifterator.py", label="Shifterator", icon="8️⃣")
47
+
48
+ st.header("Shifterator", anchor=False)
49
+ st.subheader('Put your file here...', anchor=False)
50
+
51
+ def reset_all():
52
+ st.cache_data.clear()
53
+
54
+ @st.cache_data(ttl=3600)
55
+ def get_ext(extype):
56
+ extype = uploaded_file.name
57
+ return extype
58
+
59
+ #===upload file===
60
+ @st.cache_data(ttl=3600)
61
+ def upload(extype):
62
+ papers = pd.read_csv(uploaded_file)
63
+ #lens.org
64
+ if 'Publication Year' in papers.columns:
65
+ papers.rename(columns={'Publication Year': 'Year', 'Citing Works Count': 'Cited by',
66
+ 'Publication Type': 'Document Type', 'Source Title': 'Source title'}, inplace=True)
67
+
68
+ if "dimensions" in uploaded_file.name.lower():
69
+ papers = sf.dim(papers)
70
+ col_dict = {'MeSH terms': 'Keywords',
71
+ 'PubYear': 'Year',
72
+ 'Times cited': 'Cited by',
73
+ 'Publication Type': 'Document Type'
74
+ }
75
+ papers.rename(columns=col_dict, inplace=True)
76
+
77
+ return papers
78
+
79
+ @st.cache_data(ttl=3600)
80
+ def conv_txt(extype):
81
+ if("pmc" in uploaded_file.name.lower() or "pubmed" in uploaded_file.name.lower()):
82
+ file = uploaded_file
83
+ papers = sf.medline(file)
84
+
85
+ elif("hathi" in uploaded_file.name.lower()):
86
+ papers = pd.read_csv(uploaded_file,sep = '\t')
87
+ papers = sf.htrc(papers)
88
+ col_dict={'title': 'title',
89
+ 'rights_date_used': 'Year',
90
+ }
91
+ papers.rename(columns=col_dict, inplace=True)
92
+
93
+ else:
94
+ col_dict = {'TI': 'Title',
95
+ 'SO': 'Source title',
96
+ 'DE': 'Author Keywords',
97
+ 'DT': 'Document Type',
98
+ 'AB': 'Abstract',
99
+ 'TC': 'Cited by',
100
+ 'PY': 'Year',
101
+ 'ID': 'Keywords Plus'}
102
+ papers = pd.read_csv(uploaded_file, sep='\t', lineterminator='\r')
103
+ papers.rename(columns=col_dict, inplace=True)
104
+ print(papers)
105
+ return papers
106
+
107
+ @st.cache_data(ttl=3600)
108
+ def conv_json(extype):
109
+ col_dict={'title': 'title',
110
+ 'rights_date_used': 'Year',
111
+ }
112
+
113
+ data = json.load(uploaded_file)
114
+ hathifile = data['gathers']
115
+ keywords = pd.DataFrame.from_records(hathifile)
116
+
117
+ keywords = sf.htrc(keywords)
118
+ keywords.rename(columns=col_dict,inplace=True)
119
+ return keywords
120
+
121
+ @st.cache_data(ttl=3600)
122
+ def conv_pub(extype):
123
+ if (get_ext(extype)).endswith('.tar.gz'):
124
+ bytedata = extype.read()
125
+ keywords = sf.readPub(bytedata)
126
+ elif (get_ext(extype)).endswith('.xml'):
127
+ bytedata = extype.read()
128
+ keywords = sf.readxml(bytedata)
129
+ return keywords
130
+
131
+ @st.cache_data(ttl=3600)
132
+ def get_data(extype):
133
+ df_col = sorted(papers.select_dtypes(include=['object']).columns.tolist())
134
+ list_title = [col for col in df_col if col.lower() == "title"]
135
+ abstract_pattern = re.compile(r'abstract', re.IGNORECASE)
136
+ list_abstract = [col for col in df_col if abstract_pattern.search(col)]
137
+
138
+ if all(col in df_col for col in list_title) and all(col in df_col for col in list_abstract):
139
+ selected_cols = list_abstract + list_title
140
+ elif all(col in df_col for col in list_title):
141
+ selected_cols = list_title
142
+ elif all(col in df_col for col in list_abstract):
143
+ selected_cols = list_abstract
144
+ else:
145
+ selected_cols = df_col
146
+
147
+ if not selected_cols:
148
+ selected_cols = df_col
149
+
150
+ return df_col, selected_cols
151
+
152
+ @st.cache_data(ttl=3600)
153
+ def check_comparison(extype):
154
+ comparison = ['Word-to-word', 'Manual label']
155
+
156
+ if any('year' in col.lower() for col in papers.columns):
157
+ comparison.append('Years')
158
+ if any('source title' in col.lower() for col in papers.columns):
159
+ comparison.append('Sources')
160
+
161
+ comparison.sort(reverse=False)
162
+ return comparison
163
+
164
+ #===clean csv===
165
+ @st.cache_data(ttl=3600, show_spinner=False)
166
+ def clean_csv(extype):
167
+ paper = papers.dropna(subset=[ColCho])
168
+
169
+ #===mapping===
170
+ paper[ColCho] = paper[ColCho].map(lambda x: x.lower())
171
+ if rem_punc:
172
+ paper[ColCho] = paper[ColCho].map(lambda x: re.sub('[,:;\.!-?•=]', ' ', x))
173
+ paper[ColCho] = paper[ColCho].str.replace('\u201c|\u201d', '', regex=True)
174
+ if rem_copyright:
175
+ paper[ColCho] = paper[ColCho].map(lambda x: re.sub('©.*', '', x))
176
+
177
+ #===stopword removal===
178
+ stop = stopwords.words('english')
179
+ paper[ColCho] = paper[ColCho].apply(lambda x: ' '.join([word for word in x.split() if word not in stop]))
180
+
181
+ #===lemmatize===
182
+ lemmatizer = WordNetLemmatizer()
183
+
184
+ @st.cache_data(ttl=3600)
185
+ def lemmatize_words(text):
186
+ words = text.split()
187
+ words = [lemmatizer.lemmatize(word) for word in words]
188
+ return ' '.join(words)
189
+
190
+ paper[ColCho] = paper[ColCho].apply(lemmatize_words)
191
+
192
+ words_rmv = [word.strip() for word in words_to_remove.split(";")]
193
+ remove_set = set(words_rmv)
194
+
195
+ @st.cache_data(ttl=3600)
196
+ def remove_words(text):
197
+ words = text.split()
198
+ cleaned_words = [word for word in words if word not in remove_set]
199
+ return ' '.join(cleaned_words)
200
+
201
+ paper[ColCho] = paper[ColCho].apply(remove_words)
202
+
203
+ return paper
204
+
205
+ @st.cache_data(ttl=3600)
206
+ def get_minmax(extype):
207
+ MIN = int(papers['Year'].min())
208
+ MAX = int(papers['Year'].max())
209
+ GAP = MAX - MIN
210
+ MID = round((MIN + MAX) / 2)
211
+ return MIN, MAX, GAP, MID
212
+
213
+ @st.cache_data(ttl=3600)
214
+ def running_shifterator(dict1, dict2):
215
+ try:
216
+ if method_shifts == 'Proportion Shifts':
217
+ proportion_shift = sh.ProportionShift(type2freq_1=dict1, type2freq_2=dict2)
218
+ ax = proportion_shift.get_shift_graph(system_names = ['Topic 1', 'Topic 2'], title='Proportion Shifts')
219
+
220
+ elif method_shifts == 'Shannon Entropy Shifts':
221
+ entropy_shift = sh.EntropyShift(type2freq_1=dict1,
222
+ type2freq_2=dict2,
223
+ base=2)
224
+ ax = entropy_shift.get_shift_graph(system_names = ['Topic 1', 'Topic 2'], title='Shannon Entropy Shifts')
225
+
226
+ elif method_shifts == 'Tsallis Entropy Shifts':
227
+ entropy_shift = sh.EntropyShift(type2freq_1=dict1,
228
+ type2freq_2=dict2,
229
+ base=2,
230
+ alpha=0.8)
231
+ ax = entropy_shift.get_shift_graph(system_names = ['Topic 1', 'Topic 2'], title='Tsallis Entropy Shifts')
232
+
233
+ elif method_shifts == 'Kullback-Leibler Divergence Shifts':
234
+ kld_shift = sh.KLDivergenceShift(type2freq_1=dict1,
235
+ type2freq_2=dict2,
236
+ base=2)
237
+ ax = kld_shift.get_shift_graph(system_names = ['Topic 1', 'Topic 2'], title='Kullback-Leibler Divergence Shifts')
238
+
239
+ elif method_shifts == 'Jensen-Shannon Divergence Shifts':
240
+ jsd_shift = sh.JSDivergenceShift(type2freq_1=dict1,
241
+ type2freq_2=dict2,
242
+ weight_1=0.5,
243
+ weight_2=0.5,
244
+ base=2,
245
+ alpha=1)
246
+ ax = jsd_shift.get_shift_graph(system_names = ['Topic 1', 'Topic 2'], title='Jensen-Shannon Divergence Shifts')
247
+
248
+ fig = ax.get_figure()
249
+
250
+ buf = io.BytesIO()
251
+ fig.savefig(buf, format="png", bbox_inches='tight')
252
+ buf.seek(0)
253
+
254
+ return fig, buf
255
+
256
+ except ValueError:
257
+ st.warning('Please check your data.', icon="⚠️")
258
+ sys.exit()
259
+
260
+ @st.cache_data(ttl=3600)
261
+ def df2dict(df_1, df_2):
262
+ text1 = ' '.join(df_1.dropna().astype(str))
263
+ text2 = ' '.join(df_2.dropna().astype(str))
264
+
265
+ text1_clean = re.sub(r'\d+', '', text1)
266
+ text2_clean = re.sub(r'\d+', '', text2)
267
+
268
+ tokens1 = re.findall(r'\b\w+\b', text1_clean.lower())
269
+ tokens2 = re.findall(r'\b\w+\b', text2_clean.lower())
270
+
271
+ type2freq_1 = {k: int(v) for k, v in Counter(tokens1).items()}
272
+ type2freq_2 = {k: int(v) for k, v in Counter(tokens2).items()}
273
+
274
+ return type2freq_1, type2freq_2
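The word-frequency dicts built here feed shifterator directly. Mirroring the ProportionShift branch of running_shifterator above, a minimal end-to-end sketch looks like this (the two toy texts are illustrative only):

from collections import Counter
import shifterator as sh

tokens_1 = "library open access library data".split()
tokens_2 = "library closed stacks archive data".split()

shift = sh.ProportionShift(type2freq_1=dict(Counter(tokens_1)),
                           type2freq_2=dict(Counter(tokens_2)))
ax = shift.get_shift_graph(system_names=['Topic 1', 'Topic 2'], title='Proportion Shifts')
ax.get_figure().savefig("proportion_shift.png", bbox_inches='tight')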
275
+
276
+ @st.cache_data(ttl=3600)
277
+ def dict_w2w(search_terms1, search_terms2):
278
+ selected_col = [ColCho]
279
+ dfs1 = pd.DataFrame()
280
+ for term in search_terms1:
281
+ dfs1 = pd.concat([dfs1, paper[paper[selected_col[0]].str.contains(r'\b' + term + r'\b', case=False, na=False)]], ignore_index=True)
282
+ dfs1['Topic'] = 'First Term'
283
+ dfs1 = dfs1.drop_duplicates()
284
+
285
+ dfs2 = pd.DataFrame()
286
+ for term in search_terms2:
287
+ dfs2 = pd.concat([dfs2, paper[paper[selected_col[0]].str.contains(r'\b' + term + r'\b', case=False, na=False)]], ignore_index=True)
288
+ dfs2['Topic'] = 'Second Term'
289
+ dfs2 = dfs2.drop_duplicates()
290
+
291
+ type2freq_1, type2freq_2 = df2dict(dfs1[selected_col[0]], dfs2[selected_col[0]])
292
+
293
+ return type2freq_1, type2freq_2
294
+
295
+ @st.cache_data(ttl=3600)
296
+ def dict_sources(stitle1, stitle2):
297
+ selected_col = [ColCho]
298
+ dfs1 = paper[paper['Source title'].str.contains(stitle1, case=False, na=False)]
299
+ dfs1['Topic'] = stitle1
300
+ dfs2 = paper[paper['Source title'].str.contains(stitle2, case=False, na=False)]
301
+ dfs2['Topic'] = stitle2
302
+
303
+ type2freq_1, type2freq_2 = df2dict(dfs1[selected_col[0]], dfs2[selected_col[0]])
304
+
305
+ return type2freq_1, type2freq_2
306
+
307
+ @st.cache_data(ttl=3600)
308
+ def dict_years(first_range, second_range):
309
+ selected_col = [ColCho]
310
+ first_filter_df = paper[(paper['Year'] >= first_range[0]) & (paper['Year'] <= first_range[1])].copy()
311
+ first_filter_df['Topic Range'] = 'First range'
312
+
313
+ second_filter_df = paper[(paper['Year'] >= second_range[0]) & (paper['Year'] <= second_range[1])].copy()
314
+ second_filter_df['Topic Range'] = 'Second range'
315
+
316
+ type2freq_1, type2freq_2 = df2dict(first_filter_df[selected_col[0]], second_filter_df[selected_col[0]])
317
+
318
+ return type2freq_1, type2freq_2
319
+
320
+
321
+ #===Read data===
322
+ uploaded_file = st.file_uploader('', type=['csv', 'txt', 'json', 'tar.gz','xml'], on_change=reset_all)
323
+
324
+ if uploaded_file is not None:
325
+ try:
326
+ extype = get_ext(uploaded_file)
327
+
328
+ if extype.endswith('.csv'):
329
+ papers = upload(extype)
330
+ elif extype.endswith('.txt'):
331
+ papers = conv_txt(extype)
332
+ elif extype.endswith('.json'):
333
+ papers = conv_json(extype)
334
+ elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
335
+ papers = conv_pub(uploaded_file)
336
+
337
+ df_col, selected_cols = get_data(extype)
338
+ comparison = check_comparison(extype)
339
+
340
+ #Menu
341
+ c1, c2, c3 = st.columns([4,0.1,4])
342
+ ColCho = c1.selectbox(
343
+ 'Choose column to analyze',
344
+ (selected_cols), on_change=reset_all)
345
+
346
+ c2.write('')
347
+
348
+ compare = c3.selectbox(
349
+ 'Type of comparison',
350
+ (comparison), on_change=reset_all)
351
+
352
+ with st.expander("🧮 Show advanced settings"):
353
+ y1, y2, y3 = st.columns([4,0.1,4])
354
+ t1, t2 = st.columns([3,3])
355
+ words_to_remove = y1.text_input('Input your text', on_change=reset_all, placeholder='Remove specific words. Separate words by semicolons (;)')
356
+ method_shifts = y3.selectbox("Choose preferred method",('Proportion Shifts','Shannon Entropy Shifts', 'Tsallis Entropy Shifts','Kullback-Leibler Divergence Shifts',
357
+ 'Jensen-Shannon Divergence Shifts'), on_change=reset_all)
358
+ rem_copyright = t1.toggle('Remove copyright statement', value=True, on_change=reset_all)
359
+ rem_punc = t2.toggle('Remove punctuation', value=False, on_change=reset_all)
360
+
361
+ if method_shifts == 'Kullback-Leibler Divergence Shifts':
362
+ st.info('The Kullback-Leibler Divergence is only well-defined if every single word in the comparison text is also in the reference text.', icon="ℹ️")
363
+
364
+ paper = clean_csv(extype)
365
+
366
+ tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
367
+
368
+ with tab1:
369
+ #===visualization===
370
+ if compare == 'Word-to-word':
371
+ col1, col2, col3 = st.columns([4,0.1,4])
372
+ text1 = col1.text_input('First Term', on_change=reset_all, placeholder='separate terms with a comma if you have more than one')
373
+ search_terms1 = [term.strip() for term in text1.split(",") if term.strip()]
374
+ col2.write('')
375
+ text2 = col3.text_input('Second Term', on_change=reset_all, placeholder='separate terms with a comma if you have more than one')
376
+ search_terms2 = [term.strip() for term in text2.split(",") if term.strip()]
377
+
378
+ type2freq_1, type2freq_2 = dict_w2w(search_terms1, search_terms2)
379
+
380
+ if not type2freq_1 and not type2freq_2:
381
+ st.warning('We cannot find anything in your document.', icon="⚠️")
382
+ elif not type2freq_1:
383
+ st.warning(f'We cannot find {text1} in your document.', icon="⚠️")
384
+ elif not type2freq_2:
385
+ st.warning(f'We cannot find {text2} in your document.', icon="⚠️")
386
+ else:
387
+ with st.spinner('Processing. Please wait until the visualization comes up'):
388
+ fig, buf = running_shifterator(type2freq_1, type2freq_2)
389
+ st.pyplot(fig)
390
+
391
+ elif compare == 'Manual label':
392
+ col1, col2, col3 = st.columns(3)
393
+
394
+ df_col_sel = sorted([col for col in paper.columns.tolist()])
395
+
396
+ column_selected = col1.selectbox(
397
+ 'Choose column',
398
+ (df_col_sel), on_change=reset_all)
399
+
400
+ list_words = paper[column_selected].values.tolist()
401
+ list_unique = sorted(list(set(list_words)))
402
+
403
+ if column_selected is not None:
404
+ label1 = col2.selectbox(
405
+ 'Choose first label',
406
+ (list_unique), on_change=reset_all)
407
+
408
+ default_index = 0 if len(list_unique) == 1 else 1
409
+ label2 = col3.selectbox(
410
+ 'Choose second label',
411
+ (list_unique), on_change=reset_all, index=default_index)
412
+
413
+ filtered_df = paper[paper[column_selected].isin([label1, label2])].reset_index(drop=True)
414
+
415
+ dfs1 = filtered_df[filtered_df[column_selected] == label1].reset_index(drop=True)
416
+ dfs2 = filtered_df[filtered_df[column_selected] == label2].reset_index(drop=True)
417
+
418
+ type2freq_1, type2freq_2 = df2dict(dfs1[ColCho], dfs2[ColCho])
419
+
420
+ with st.spinner('Processing. Please wait until the visualization comes up'):
421
+ fig, buf = running_shifterator(type2freq_1, type2freq_2)
422
+ st.pyplot(fig)
423
+
424
+ elif compare == 'Sources':
425
+ col1, col2, col3 = st.columns([4,0.1,4])
426
+
427
+ unique_stitle = set()
428
+ unique_stitle.update(paper['Source title'].dropna())
429
+ list_stitle = sorted(list(unique_stitle))
430
+
431
+ stitle1 = col1.selectbox(
432
+ 'Choose first label',
433
+ (list_stitle), on_change=reset_all)
434
+ col2.write('')
435
+ default_index = 0 if len(list_stitle) == 1 else 1
436
+ stitle2 = col3.selectbox(
437
+ 'Choose second label',
438
+ (list_stitle), on_change=reset_all, index=default_index)
439
+
440
+ type2freq_1, type2freq_2 = dict_sources(stitle1, stitle2)
441
+
442
+ with st.spinner('Processing. Please wait until the visualization comes up'):
443
+ fig, buf = running_shifterator(type2freq_1, type2freq_2)
444
+ st.pyplot(fig)
445
+
446
+ elif compare == 'Years':
447
+ col1, col2, col3 = st.columns([4,0.1,4])
448
+
449
+ MIN, MAX, GAP, MID = get_minmax(extype)
450
+ if (GAP != 0):
451
+ first_range = col1.slider('First Range', min_value=MIN, max_value=MAX, value=(MIN, MID), on_change=reset_all)
452
+ col2.write('')
453
+ second_range = col3.slider('Second Range', min_value=MIN, max_value=MAX, value=(MID, MAX), on_change=reset_all)
454
+
455
+ type2freq_1, type2freq_2 = dict_years(first_range, second_range)
456
+
457
+ with st.spinner('Processing. Please wait until the visualization comes up'):
458
+ fig, buf = running_shifterator(type2freq_1, type2freq_2)
459
+ st.pyplot(fig)
460
+
461
+ else:
462
+ st.write('You only have data for ', MAX)
463
+
464
+ d1, d2 = st.columns(2)
465
+
466
+ d1.download_button(
467
+ label="📥 Download Graph",
468
+ data=buf,
469
+ file_name="shifterator.png",
470
+ mime="image/png"
471
+ )
472
+
473
+ @st.cache_data(ttl=3600)
474
+ def shifts_dfs(type2freq_1, type2freq_2):
475
+ proportion_shift = ProportionShift(type2freq_1=type2freq_1, type2freq_2=type2freq_2)
476
+
477
+ words = list(proportion_shift.types)
478
+ shift_scores = proportion_shift.get_shift_scores()
479
+ freq1 = proportion_shift.type2freq_1
480
+ freq2 = proportion_shift.type2freq_2
481
+
482
+ data = []
483
+ for word, score in shift_scores.items():
484
+ data.append({
485
+ 'word': word,
486
+ 'freq_text1': proportion_shift.type2freq_1.get(word, 0),
487
+ 'freq_text2': proportion_shift.type2freq_2.get(word, 0),
488
+ 'shift_score': score
489
+ })
490
+
491
+ df_shift = pd.DataFrame(data)
492
+ df_shift = df_shift.sort_values('shift_score')
493
+
494
+ return df_shift.to_csv(index=False).encode('utf-8')
495
+
496
+ csv = shifts_dfs(type2freq_1, type2freq_2)
497
+
498
+ d2.download_button(
499
+ "📥 Click to download result",
500
+ csv,
501
+ "shiftertor_dataframe.csv",
502
+ "text/csv")
503
+
504
+ with tab2:
505
+ st.markdown('**Gallagher, R.J., Frank, M.R., Mitchell, L. et al. (2021). Generalized Word Shift Graphs: A Method for Visualizing and Explaining Pairwise Comparisons Between Texts. EPJ Data Science, 10(4).** https://doi.org/10.1140/epjds/s13688-021-00260-3')
506
+
507
+ with tab3:
508
+ st.markdown('**Sánchez-Franco, M. J., & Rey-Tienda, S. (2023). The role of user-generated content in tourism decision-making: an exemplary study of Andalusia, Spain. Management Decision, 62(7).** https://doi.org/10.1108/md-06-2023-0966')
509
+ st.markdown('**Ipek Baris Schlicht, Fernandez, E., Chulvi, B., & Rosso, P. (2023). Automatic detection of health misinformation: a systematic review. Journal of Ambient Intelligence and Humanized Computing, 15.** https://doi.org/10.1007/s12652-023-04619-4')
510
+ st.markdown('**Torricelli, M., Falkenberg, M., Galeazzi, A., Zollo, F., Quattrociocchi, W., & Baronchelli, A. (2023). Hurricanes Increase Climate Change Conversations on Twitter. PLOS Climate, 2(11)** https://doi.org/10.1371/journal.pclm.0000277')
511
+
512
+ with tab4:
513
+ st.subheader(':blue[Result]', anchor=False)
514
+ st.button('📥 Download Graph')
515
+ st.text("Click Download Graph button.")
516
+
517
+ st.divider()
518
+ st.subheader(':blue[Shifterator Dataframe]', anchor=False)
519
+ st.button('📥 Click to download result')
520
+ st.text("Click the Download button to get the CSV result.")
521
+
522
+ except Exception as e:
523
+ st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
524
+ st.stop()
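
The word-shift comparison above reduces to turning two text collections into word-frequency dictionaries and handing them to shifterator's ProportionShift. Below is a minimal standalone sketch of that step, assuming the shifterator package is installed; the two sample strings are hypothetical stand-ins for the dataframe column selected in the app.

import re
from collections import Counter
from shifterator import ProportionShift

def text_to_freq(text):
    # Strip digits, lowercase, and count word tokens (mirrors df2dict above).
    cleaned = re.sub(r'\d+', '', text).lower()
    return dict(Counter(re.findall(r'\b\w+\b', cleaned)))

type2freq_1 = text_to_freq("machine learning improves library search and discovery")
type2freq_2 = text_to_freq("deep learning improves image search quality")

# Rank words by the magnitude of their contribution to the proportion shift.
shift = ProportionShift(type2freq_1=type2freq_1, type2freq_2=type2freq_2)
scores = shift.get_shift_scores()
for word, score in sorted(scores.items(), key=lambda kv: abs(kv[1]), reverse=True)[:10]:
    print(f"{word}: {score:+.4f}")
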
pages/9 Summarization.py ADDED
@@ -0,0 +1,304 @@
1
+ import streamlit as st
2
+ import streamlit.components.v1 as components
3
+ import nltk
4
+ import spacy
5
+ import pytextrank
6
+ import pandas as pd
7
+ from rouge_score import rouge_scorer
8
+ from nltk.translate.bleu_score import sentence_bleu
9
+ from transformers import pipeline, PegasusForConditionalGeneration, PegasusTokenizer, T5ForConditionalGeneration, T5Tokenizer
10
+ nltk.download('punkt')
11
+
12
+ #===config===
13
+ st.set_page_config(
14
+ page_title="Coconut",
15
+ page_icon="🥥",
16
+ layout="wide",
17
+ initial_sidebar_state="collapsed"
18
+ )
19
+
20
+ hide_streamlit_style = """
21
+ <style>
22
+ #MainMenu
23
+ {visibility: hidden;}
24
+ footer {visibility: hidden;}
25
+ [data-testid="collapsedControl"] {display: none}
26
+ </style>
27
+ """
28
+ st.markdown(hide_streamlit_style, unsafe_allow_html=True)
29
+
30
+ with st.popover("🔗 Menu"):
31
+ st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
32
+ st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
33
+ st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
34
+ st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
35
+ st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
36
+ st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
37
+ st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
38
+ st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
39
+ st.page_link("pages/8 Shifterator.py", label="Shifterator", icon="8️⃣")
40
+
41
+ st.header("Summarization test", anchor=False)
42
+ st.subheader('Put your file here...', anchor=False)
43
+
44
+ #========unique id========
45
+ @st.cache_resource(ttl=3600)
46
+ def create_list():
47
+ l = [1, 2, 3]
48
+ return l
49
+
50
+ l = create_list()
51
+ first_list_value = l[0]
52
+ l[0] = first_list_value + 1
53
+ uID = str(l[0])
54
+
55
+ @st.cache_data(ttl=3600)
56
+ def get_ext(uploaded_file):
57
+ extype = uID+uploaded_file.name
58
+ return extype
59
+
60
+ #===clear cache===
61
+ def reset_all():
62
+ st.cache_data.clear()
63
+
64
+ #===text reading===
65
+ def read_txt(intext):
66
+ return (intext.read()).decode()
67
+
68
+ #===csv reading===
69
+ def read_csv(uploaded_file):
70
+ fulltexts = pd.read_csv(uploaded_file)
71
+ fulltexts.rename(columns={fulltexts.columns[0]: "texts"}, inplace = True)
72
+ return fulltexts
73
+
74
+
75
+ #===Read data===
76
+ uploaded_file = st.file_uploader('', type=['txt','csv'], on_change=reset_all)
77
+
78
+
79
+ if uploaded_file is not None:
80
+ try:
81
+ extype = get_ext(uploaded_file)
82
+
83
+ if extype.endswith(".txt"):
84
+ fulltext = read_txt(uploaded_file)
85
+ elif extype.endswith(".csv"):
86
+ texts = read_csv(uploaded_file)
87
+
88
+ #Menu
89
+
90
+ method = st.selectbox("Method",("Extractive","Abstractive"))
91
+ if method == "Abstractive":
92
+ ab_method = st.selectbox("Abstractive method", ("Pegasus x-sum","FalconsAI t5"))
93
+ min_length = st.number_input("Minimum length", min_value = 0)
94
+ max_length = st.number_input("Maximum length", min_value = 1)
95
+
96
+ if method == "Extractive":
97
+ ex_method = st.selectbox("Extractive method", ("t5","Spacy PyTextRank"))
98
+ if ex_method == "Spacy PyTextRank":
99
+ phrase_limit = st.number_input("Phrase length limit", min_value = 0)
100
+ sentence_limit = st.number_input("Sentence limit", min_value = 0)
101
+ elif ex_method == "t5" or ex_method == "FalconsAI t5":
102
+ min_length = st.number_input("Minimum length", min_value = 0)
103
+ max_length = st.number_input("Maximum length", min_value = 1)
104
+
105
+
106
+
107
+ if st.button("Submit", on_click=reset_all):
108
+
109
+ tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "⬇️ Download Help"])
110
+
111
+ with tab1:
112
+
113
+ def SpacyRank(text):
114
+ nlp = spacy.load("en_core_web_lg")
115
+ nlp.add_pipe("textrank")
116
+ doc = nlp(text)
117
+ summary = ""
118
+ for sent in doc._.textrank.summary(limit_phrases = phrase_limit, limit_sentences = sentence_limit):
119
+ summary+=str(sent) + '\n'
120
+ return summary
121
+
122
+ def t5summ(text):
123
+ model = T5ForConditionalGeneration.from_pretrained('t5-small')
124
+ tokenizer = T5Tokenizer.from_pretrained('t5-small')
125
+
126
+ input_text = "summarize: " + text
127
+ input_ids = tokenizer.encode(input_text,return_tensors='pt')
128
+
129
+ summed = model.generate(input_ids, max_length = max_length, min_length = min_length)
130
+
131
+ summary = tokenizer.decode(summed[0],skip_special_tokens=True)
132
+ return summary
133
+
134
+ def xsum(text):
135
+ model_name = "google/pegasus-xsum"
136
+
137
+ pegasus_tokenizer = PegasusTokenizer.from_pretrained(model_name)
138
+
139
+ summarizer = pipeline("summarization",
140
+ model=model_name,
141
+ tokenizer=pegasus_tokenizer,
142
+ framework="pt")
143
+
144
+ summed = summarizer(text, min_length = min_length, max_length = max_length)
145
+ summary = summed[0]["summary_text"]
146
+
147
+ return summary
148
+
149
+ def falcsum(text):
150
+ summarizer = pipeline("summarization",model = "Falconsai/text_summarization")
151
+ summed = summarizer(text, max_length = max_length, min_length = min_length, do_sample = False)
152
+ summary = summed[0]["summary_text"]
153
+ return summary
154
+
155
+ def bulkScore(combined):
156
+
157
+ scorelist = []
158
+
159
+ for column in range(len(combined)):
160
+ ref = combined[column][0]
161
+ cand = combined[column][1]
162
+
163
+ BLEuscore = nltk.translate.bleu_score.sentence_bleu([ref], cand)
164
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
165
+ rougescores = scorer.score(ref, cand)
166
+
167
+ Bscore = f"{BLEuscore:.2f}"
168
+ Rscore = f"{rougescores['rouge1'].fmeasure:.2f}"
169
+
170
+ scoreTuplet = Bscore, Rscore
171
+
172
+ scorelist.append(scoreTuplet)
173
+
174
+ return scorelist
175
+
176
+
177
+ with st.spinner('Performing computations. Please wait ...'):
178
+
179
+ c1, c2 = st.columns([0.5,0.5], border=True)
180
+
181
+ if(extype.endswith(".txt")):
182
+
183
+ with c1:
184
+ if(extype.endswith(".txt")):
185
+ st.header("Original text")
186
+ with st.container(border=True):
187
+ st.write(fulltext)
188
+
189
+ if method == "Extractive":
190
+ if(ex_method == "Spacy PyTextRank"):
191
+ summary = SpacyRank(fulltext)
192
+ elif(ex_method == "t5"):
193
+ summary = t5summ(fulltext)
194
+
195
+ elif method == "Abstractive":
196
+ if ab_method == "Pegasus x-sum":
197
+ summary = xsum(fulltext)
198
+
199
+ elif ab_method == "FalconsAI t5":
200
+ summary = falcsum(fulltext)
201
+ with c2:
202
+
203
+ st.header("Summarized")
204
+ with st.container(border = True):
205
+ st.write(summary)
206
+ st.header("Performance scores")
207
+ with st.container(border = True):
208
+
209
+ #performance metrics
210
+ reference = fulltext
211
+ candidate = summary
212
+
213
+ BLEuscore = nltk.translate.bleu_score.sentence_bleu([reference], candidate)
214
+
215
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
216
+ rougescores = scorer.score(reference, candidate)
217
+
218
+ st.write(f"BLEU Score (NLTK): {BLEuscore:.2f}")
219
+ st.write(f"ROUGE-1 F1 Score: {rougescores['rouge1'].fmeasure:.2f}")
220
+
221
+ text_file = summary
222
+ st.download_button(
223
+ label = "Download Results",
224
+ data=text_file,
225
+ file_name="Summary.txt",
226
+ mime="text\csv",
227
+ on_click="ignore",)
228
+
229
+ elif(extype.endswith(".csv")):
230
+ if method == "Extractive":
231
+ if(ex_method == "Spacy PyTextRank"):
232
+ summaries = texts['texts'].apply(SpacyRank)
233
+ fullnsums = summaries.to_frame()
234
+ fullnsums['full'] = texts['texts']
235
+ fullnsums['combined'] = fullnsums.values.tolist()
236
+
237
+
238
+ elif(ex_method == "t5"):
239
+ summaries = texts['texts'].apply(t5summ)
240
+ fullnsums = summaries.to_frame()
241
+ fullnsums['full'] = texts['texts']
242
+ fullnsums['combined'] = fullnsums.values.tolist()
243
+
244
+
245
+ elif method == "Abstractive":
246
+ if ab_method == "Pegasus x-sum":
247
+ summaries = texts['texts'].apply(xsum)
248
+ fullnsums = summaries.to_frame()
249
+ fullnsums['full'] = texts['texts']
250
+ fullnsums['combined'] = fullnsums.values.tolist()
251
+
252
+ elif ab_method == "FalconsAI t5":
253
+ summaries = texts['texts'].apply(falcsum)
254
+ fullnsums = summaries.to_frame()
255
+ fullnsums['full'] = texts['texts']
256
+ fullnsums['combined'] = fullnsums.values.tolist()
257
+
258
+ with c1:
259
+ st.header("Download bulk summarization results")
260
+
261
+ result = summaries.to_csv()
262
+ st.download_button(
263
+ label = "Download Results",
264
+ data = result,
265
+ file_name = "Summaries.csv",
266
+ mime="text\csv",
267
+ on_click = "ignore"
268
+ )
269
+
270
+ with c2:
271
+ st.header("Scores and summaries results")
272
+ scores = pd.DataFrame.from_records(bulkScore(fullnsums.combined.to_list()),columns = ["BLEU","Rouge"])
273
+
274
+ summariesscores = fullnsums.join(scores)
275
+
276
+ summariesscores.drop("combined", axis = 1, inplace = True)
277
+ summariesscores.rename(columns = {"texts":"summarized"}, inplace = True)
278
+
279
+ result2 = summariesscores.to_csv()
280
+
281
+ st.download_button(
282
+ label = "Download scores and results",
283
+ data = result2,
284
+ file_name = "ScoredSummaries.csv",
285
+ mime = "test\csv",
286
+ on_click = "ignore"
287
+ )
288
+
289
+
290
+
291
+
292
+
293
+ #do this
294
+ with tab2:
295
+ st.write("")
296
+
297
+ with tab3:
298
+ st.header("Summarization result (.txt)")
299
+ st.write("Click the download button (example) to get the text file result")
300
+ st.button(label = "Download Results")
301
+
302
+
303
+ except Exception as e:
304
+ st.write(e)
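
The t5, Pegasus, and FalconsAI options above all run through transformers: load a model, generate with the chosen length bounds, and decode the result. Below is a minimal standalone sketch of the t5-small path (t5summ above), assuming transformers with sentencepiece and a PyTorch backend are installed; the sample text and length bounds are hypothetical.

from transformers import T5ForConditionalGeneration, T5Tokenizer

model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

text = ("Streamlit lets you build data apps in pure Python. "
        "This page wraps several summarization models behind a single upload form.")

# T5 expects a task prefix; min_length/max_length mirror the number inputs above.
input_ids = tokenizer.encode("summarize: " + text, return_tensors='pt')
summary_ids = model.generate(input_ids, min_length=5, max_length=30)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
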
tools/__pycache__/sourceformat.cpython-310.pyc ADDED
Binary file (5.74 kB). View file
 
tools/sourceformat.py ADDED
@@ -0,0 +1,328 @@
1
+ from io import StringIO, BytesIO
2
+ import pymarc
3
+ import requests
4
+ import string
5
+ import pandas as pd
6
+ import tarfile
7
+ try:
8
+ from lxml import etree as ET
9
+ except ImportError:
10
+ import xml.etree.ElementTree as ET
11
+
12
+ #metadata for htrc worksets
13
+ def htrc(self):
14
+
15
+ #variables/arrays and stuff
16
+
17
+ #string of keywords per volume/htid
18
+ keywords = ""
19
+
20
+ #array of all the keywords per each volume/htid, to add to the file
21
+ keylist = []
22
+
23
+ #get htids of the volumes
24
+ htids = self['htid'].values.tolist()
25
+ #iterate through list of htids
26
+ for id in range(len(htids)):
27
+ htid = htids[id]
+ #reset the keyword string for this volume
+ keywords = ""
28
+
29
+ #api call for the extra metadata using htid
30
+ extradata = requests.get("https://catalog.hathitrust.org/api/volumes/full/htid/"+htid+".json")
31
+
32
+ #turn the request into a json file
33
+ extradata = extradata.json()
34
+
35
+ #get record id and use it to get the xml/marc file with the actual metadata
36
+ recid = extradata['items'][0]['fromRecord']
37
+ xmlmarc = extradata['records'][recid]['marc-xml']
38
+
39
+ #turn the formatted xml into an actual pymarc
40
+ xml = StringIO(xmlmarc)
41
+ marc = pymarc.parse_xml_to_array(xml)[0]
42
+ xml.close()
43
+
44
+ for term in marc.get_fields('650'):
45
+ if "http" in (term.value()).lower():
46
+ keywords+= ""
47
+ elif "ocolc" in (term.value()).lower():
48
+ keywords+=""
49
+ else:
50
+ keywords+=term.value().translate(str.maketrans('','', string.punctuation))+"; "
51
+ keylist.append(keywords)
52
+ self['Keywords'] = keylist
53
+ return self
54
+
55
+ def htrcxtra(self):
56
+
57
+ #variables/arrays and stuff
58
+
59
+ #string of keywords per volume/htid
60
+ pages = ""
61
+
62
+ #array of all the keywords per each volume/htid, to add to the file
63
+ pagecount = []
64
+
65
+ #get htids of the volumes
66
+ htids = self['htid'].values.tolist()
67
+ #iterate through list of htids
68
+ for id in range(len(htids)):
69
+ htid = htids[id]
+ #reset the page string for this volume
+ pages = ""
70
+
71
+ #api call for the extra metadata using htid
72
+ extradata = requests.get("https://catalog.hathitrust.org/api/volumes/full/htid/"+htid+".json")
73
+
74
+ #turn the request into a json file
75
+ extradata = extradata.json()
76
+
77
+ #get record id and use it to get the xml/marc file with the actual metadata
78
+ recid = extradata['items'][0]['fromRecord']
79
+ xmlmarc = extradata['records'][recid]['marc-xml']
80
+
81
+ #turn the formatted xml into an actual pymarc
82
+ xml = StringIO(xmlmarc)
83
+ marc = pymarc.parse_xml_to_array(xml)[0]
84
+ xml.close()
85
+
86
+ for term in marc.get_fields('350'):
87
+ pages+=term.value()
88
+ pagecount.append(pages)
89
+ self['pages'] = pagecount
90
+ return self
91
+
92
+
93
+ #format files from dimensions
94
+ def dim(file):
95
+ formatted = file.drop(file.columns[[0]],axis=1)
96
+
97
+ done = pd.read_csv(StringIO((formatted.to_csv(header=False,index=False))))
98
+
99
+ return done
100
+
101
+
102
+
103
+ def readPub(tar):
104
+
105
+ #list to put xmls from tarfile in
106
+ xmllist = []
107
+
108
+ readfile = BytesIO(tar)
109
+
110
+ #get the files from the tarfile into the list
111
+ files = tarfile.open(fileobj=readfile, mode = 'r:gz', )
112
+ for member in files.getmembers():
113
+ singlefile = files.extractfile(member)
114
+ if singlefile is not None:
115
+ article = singlefile.read()
116
+ article = article.decode("utf-8")
117
+ article = StringIO(article)
118
+ xmllist.append(article)
119
+
120
+ #lists for each data point
121
+ titles = []
122
+ years = []
123
+ keys = []
124
+ authors = []
125
+ publishers = []
126
+ journaltitles = []
127
+
128
+ #go through each xml file in the list
129
+ for art in range(len(xmllist)):
130
+
131
+ #make a parseable element tree out of the xml file
132
+ tree = ET.parse(xmllist[art])
133
+ root = tree.getroot()
134
+
135
+ #remove parts of the main branch that do not have metadata that we care about
136
+ for child in list(root):
137
+ if(child.tag!="front"):
138
+ root.remove(child)
139
+
140
+ #names to concatnate for each article
141
+ firstname = []
142
+ lastname = []
143
+
144
+ #individual strings for multiple keywords/titles
145
+ key = ""
146
+ title = ""
147
+
148
+
149
+ for target in root.iter('article-title'):
150
+ if target.text is not None:
151
+ title += target.text + ", "
152
+ else:
153
+ title += " "
154
+ for target in root.iter('kwd'):
155
+ if target.text is not None:
156
+ key+=target.text+ "; "
157
+ else:
158
+ key += " "
159
+ for target in root.iter('year'):
160
+ year=int(target.text)
161
+ years.append(year)
162
+ for names in root.iter('given-names'):
163
+ firstname.append(names.text)
164
+ for names in root.iter('surname'):
165
+ lastname.append(names.text)
166
+ for target in root.iter('journal-title'):
167
+ jtitle = target.text
168
+ journaltitles.append(jtitle)
169
+ for target in root.iter('publisher-name'):
170
+ publisher = target.text
171
+ publishers.append(publisher)
172
+
173
+ titles.append(title)
174
+ keys.append(key)
175
+
176
+ fullnames = [first + ' ' + last for first, last in zip(firstname,lastname)]
177
+
178
+ #join the names into a single string with authors
179
+ author = str.join(', ', fullnames)
180
+
181
+ authors.append(author)
182
+
183
+ data = pd.DataFrame()
184
+
185
+ data["Title"] = pd.Series(titles)
186
+ data["Keywords"] = pd.Series(keys)
187
+ data["Authors"] = pd.Series(authors)
188
+ data["Year"] = pd.Series(years)
189
+ data["Document Type"] = pd.Series(publisher)
190
+ data["Source title"] = pd.Series(journaltitles)
191
+
192
+ data.fillna(value = "empty", inplace = True)
193
+
194
+ return data
195
+
196
+
197
+ def readxml(file):
198
+ root = ET.fromstring(file)
199
+
200
+
201
+
202
+ #remove stuff from the xml that we do not need
203
+ for child in list(root):
204
+ for lchild in list(child):
205
+ if(lchild.tag!="front"):
206
+ child.remove(lchild)
207
+
208
+ #collect metadata fields from each article
209
+
210
+ keys = []
211
+ titles = []
212
+ authors = []
213
+ jtitle = []
214
+ publishers = []
215
+ years = []
216
+
217
+ for child in list(root):
218
+ for article in list(child):
219
+ key = ""
220
+ firstname = []
221
+ lastname = []
222
+ for target in article.iter('article-title'):
223
+
224
+ if target.text is not None:
225
+ titles.append(target.text)
226
+ else:
227
+ titles.append("empty")
228
+ for target in article.iter('kwd'):
229
+ if target.text is not None:
230
+ key+= target.text + "; "
231
+ else:
232
+ key += ""
233
+ keys.append(key)
234
+ for target in article.iter('given-names'):
235
+ firstname.append(target.text)
236
+ for target in article.iter('surname'):
237
+ lastname.append(target.text)
238
+
239
+ fullnames = [first + ' ' + last for first, last in zip(firstname,lastname)]
240
+ author = str.join(', ', fullnames)
241
+ authors.append(author)
242
+
243
+ for target in article.iter('journal-title'):
244
+ jtitle.append(target.text)
245
+ for target in article.iter('publisher-name'):
246
+ publishers.append(target.text)
247
+
248
+ for target in article.iter('year'):
249
+ years.append(int(target.text))
250
+
251
+ frame = pd.DataFrame()
252
+
253
+ frame["Title"] = pd.Series(titles)
254
+ frame["Keywords"] = pd.Series(keys)
255
+ frame["Authors"] = pd.Series(authors)
256
+ frame["Year"] = pd.Series(years)
257
+ frame["Document Type"] = pd.Series(jtitle)
258
+ frame["Source title"] = pd.Series(publishers)
259
+
260
+ frame.fillna(value = "empty", inplace = True)
261
+
262
+ return frame
263
+
264
+ def medline(file):
265
+
266
+ textfile = file.read()
267
+
268
+
269
+ text = textfile.decode()
270
+
271
+
272
+
273
+
274
+
275
+ authors = []
276
+ titles = []
277
+ year = []
278
+ meshkeys = []
279
+ otherkeys = []
280
+
281
+ #articles are separated by blank lines, so split on them
282
+ articles = text.split('\n\n')
283
+
284
+ for paper in articles:
285
+ names = ""
286
+ meshk = ""
287
+ otherk = ""
288
+ largetext = paper.splitlines()
289
+ for line in largetext:
290
+ #title
291
+ if "TI - " in line:
292
+ #checking if the title goes over another line, and to add it if it does
293
+ startpos = line.index("-") + 2
294
+ if "- " not in(largetext[largetext.index(line)+1]):
295
+ titles.append(line[startpos:] + " " + largetext[largetext.index(line)+1].strip())
296
+ else:
297
+ titles.append(line[startpos:])
298
+ #author
299
+ if "FAU - " in line:
300
+ startpos = line.index("-") + 2
301
+ names+= line[startpos:] + "; "
302
+ #year
303
+ if "DP - " in line:
304
+ startpos = line.index("-") + 2
305
+ year.append(int(line[startpos:startpos+4]))
306
+ #key terms
307
+ if "MH - " in line:
308
+ startpos = line.index("-") + 2
309
+ meshk += line[startpos:] + "; "
310
+ if"OT - " in line:
311
+ startpos = line.index("-") + 2
312
+ otherk += line[startpos:] + "; "
313
+
314
+ authors.append(names)
315
+ meshkeys.append(meshk)
316
+ otherkeys.append(otherk)
317
+
318
+ frame = pd.DataFrame()
319
+
320
+ frame['Title'] = pd.Series(titles)
321
+ frame['Authors'] = pd.Series(authors)
322
+ frame['Year'] = pd.Series(year)
323
+ frame['MeSH Keywords'] = pd.Series(meshkeys)
324
+ frame['Other Keywords'] = pd.Series(otherkeys)
325
+
326
+ frame.fillna(value = "empty", inplace = True)
327
+
328
+ return frame
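
The medline() parser above expects the tag-per-line layout of a PubMed/MEDLINE export, with records separated by a blank line and values introduced by a hyphen after the tag. Below is a minimal usage sketch, assuming the module is importable as tools.sourceformat; the sample record is hypothetical and limited to the tags the parser reads (TI, FAU, DP, MH, OT), spaced so the parser's "TAG - " checks match.

from io import BytesIO
from tools import sourceformat as sf

# Hypothetical single-record export.
sample = (
    b"PMID - 12345678\n"
    b"TI - A hypothetical study of text mining in digital libraries\n"
    b"FAU - Doe, Jane\n"
    b"DP - 2021 Jan\n"
    b"MH - Data Mining\n"
    b"OT - text analysis\n"
)

frame = sf.medline(BytesIO(sample))
print(frame[['Title', 'Authors', 'Year', 'MeSH Keywords']])
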