Spaces:
Running
Running
T Le
commited on
Commit
·
c837e02
1
Parent(s):
d4ac1c4
Upload updated files
Browse files- images/bidirected.png +0 -0
- images/burst.png +0 -0
- images/coconut-web.jpg +0 -0
- images/download_bertopic.jpg +0 -0
- images/download_bidirected.jpg +0 -0
- images/download_biterm.jpg +0 -0
- images/download_sentiment.png +0 -0
- images/downloadtable.png +0 -0
- images/lemma.png +0 -0
- images/scattertext.png +0 -0
- images/sentiment.png +0 -0
- images/sentitable.png +0 -0
- images/sunburst.png +0 -0
- images/tablenetwork.png +0 -0
- images/topicmodeling.png +0 -0
- pages/.DS_Store +0 -0
- pages/0 FileChecker.py +60 -6
- pages/1 Scattertext.py +106 -18
- pages/2 Topic Modeling.py +659 -445
- pages/3 Bidirected Network.py +370 -276
- pages/4 Sunburst.py +83 -16
- pages/5 Burst Detection.py +171 -52
- pages/6 Keywords Stem.py +298 -238
- pages/7 Sentiment Analysis.py +357 -0
- pages/8 Shifterator.py +524 -0
- pages/9 Summarization.py +304 -0
- tools/__pycache__/sourceformat.cpython-310.pyc +0 -0
- tools/sourceformat.py +328 -0
images/bidirected.png
ADDED
![]() |
images/burst.png
ADDED
![]() |
images/coconut-web.jpg
ADDED
![]() |
images/download_bertopic.jpg
ADDED
![]() |
images/download_bidirected.jpg
ADDED
![]() |
images/download_biterm.jpg
ADDED
![]() |
images/download_sentiment.png
ADDED
![]() |
images/downloadtable.png
ADDED
![]() |
images/lemma.png
ADDED
![]() |
images/scattertext.png
ADDED
![]() |
images/sentiment.png
ADDED
![]() |
images/sentitable.png
ADDED
![]() |
images/sunburst.png
ADDED
![]() |
images/tablenetwork.png
ADDED
![]() |
images/topicmodeling.png
ADDED
![]() |
pages/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
pages/0 FileChecker.py
CHANGED
@@ -1,5 +1,7 @@
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
|
|
|
|
3 |
|
4 |
#===config===
|
5 |
st.set_page_config(
|
@@ -34,10 +36,23 @@ def get_ext(extype):
|
|
34 |
@st.cache_data(ttl=3600)
|
35 |
def upload(extype):
|
36 |
keywords = pd.read_csv(uploaded_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
return keywords
|
38 |
|
39 |
@st.cache_data(ttl=3600)
|
40 |
def conv_txt(extype):
|
|
|
|
|
|
|
|
|
|
|
41 |
col_dict = {'TI': 'Title',
|
42 |
'SO': 'Source title',
|
43 |
'DE': 'Author Keywords',
|
@@ -45,16 +60,48 @@ def conv_txt(extype):
|
|
45 |
'AB': 'Abstract',
|
46 |
'TC': 'Cited by',
|
47 |
'PY': 'Year',
|
48 |
-
'ID': 'Keywords Plus'
|
49 |
-
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
return keywords
|
52 |
|
53 |
st.header('File Checker', anchor=False)
|
54 |
st.subheader('Put your file here...', anchor=False)
|
55 |
|
56 |
#===read data===
|
57 |
-
uploaded_file = st.file_uploader('', type=['csv','txt'], on_change=reset_data)
|
58 |
|
59 |
if uploaded_file is not None:
|
60 |
extype = get_ext(uploaded_file)
|
@@ -64,8 +111,15 @@ if uploaded_file is not None:
|
|
64 |
elif extype.endswith('.txt'):
|
65 |
data = conv_txt(extype)
|
66 |
|
67 |
-
|
|
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
with col1:
|
70 |
#===check keywords===
|
71 |
keycheck = list(data.columns)
|
@@ -134,4 +188,4 @@ if uploaded_file is not None:
|
|
134 |
container6.write("Unfortunately, you don't have a column containing object in your data. Please check again.")
|
135 |
else:
|
136 |
container6.subheader('✔️ Scattertext', divider='blue', anchor=False)
|
137 |
-
container6.write('Congratulations! You can use Scattertext')
|
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
+
import json
|
4 |
+
from tools import sourceformat as sf
|
5 |
|
6 |
#===config===
|
7 |
st.set_page_config(
|
|
|
36 |
@st.cache_data(ttl=3600)
|
37 |
def upload(extype):
|
38 |
keywords = pd.read_csv(uploaded_file)
|
39 |
+
if "dimensions" in uploaded_file.name.lower():
|
40 |
+
keywords = sf.dim(keywords)
|
41 |
+
col_dict = {'MeSH terms': 'Keywords',
|
42 |
+
'PubYear': 'Year',
|
43 |
+
'Times cited': 'Cited by',
|
44 |
+
'Publication Type': 'Document Type'
|
45 |
+
}
|
46 |
+
keywords.rename(columns=col_dict, inplace=True)
|
47 |
return keywords
|
48 |
|
49 |
@st.cache_data(ttl=3600)
|
50 |
def conv_txt(extype):
|
51 |
+
if("PMID" in (uploaded_file.read()).decode()):
|
52 |
+
uploaded_file.seek(0)
|
53 |
+
papers = sf.medline(uploaded_file)
|
54 |
+
print(papers)
|
55 |
+
return papers
|
56 |
col_dict = {'TI': 'Title',
|
57 |
'SO': 'Source title',
|
58 |
'DE': 'Author Keywords',
|
|
|
60 |
'AB': 'Abstract',
|
61 |
'TC': 'Cited by',
|
62 |
'PY': 'Year',
|
63 |
+
'ID': 'Keywords Plus',
|
64 |
+
'rights_date_used': 'Year'}
|
65 |
+
uploaded_file.seek(0)
|
66 |
+
papers = pd.read_csv(uploaded_file, sep='\t')
|
67 |
+
if("htid" in papers.columns):
|
68 |
+
papers = sf.htrc(papers)
|
69 |
+
papers.rename(columns=col_dict, inplace=True)
|
70 |
+
print(papers)
|
71 |
+
return papers
|
72 |
+
|
73 |
+
|
74 |
+
@st.cache_data(ttl=3600)
|
75 |
+
def conv_json(extype):
|
76 |
+
col_dict={'title': 'title',
|
77 |
+
'rights_date_used': 'Year',
|
78 |
+
'content_provider_code':'Source title'
|
79 |
+
}
|
80 |
+
|
81 |
+
data = json.load(uploaded_file)
|
82 |
+
hathifile = data['gathers']
|
83 |
+
keywords = pd.DataFrame.from_records(hathifile)
|
84 |
+
|
85 |
+
keywords = sf.htrc(keywords)
|
86 |
+
keywords['Cited by'] = keywords.groupby(['Keywords'])['Keywords'].transform('size')
|
87 |
+
keywords.rename(columns=col_dict,inplace=True)
|
88 |
+
return keywords
|
89 |
+
|
90 |
+
@st.cache_data(ttl=3600)
|
91 |
+
def conv_pub(extype):
|
92 |
+
if (get_ext(extype)).endswith('.tar.gz'):
|
93 |
+
bytedata = extype.read()
|
94 |
+
keywords = sf.readPub(bytedata)
|
95 |
+
elif (get_ext(extype)).endswith('.xml'):
|
96 |
+
bytedata = extype.read()
|
97 |
+
keywords = sf.readxml(bytedata)
|
98 |
return keywords
|
99 |
|
100 |
st.header('File Checker', anchor=False)
|
101 |
st.subheader('Put your file here...', anchor=False)
|
102 |
|
103 |
#===read data===
|
104 |
+
uploaded_file = st.file_uploader('', type=['csv','txt','json', 'tar.gz', 'xml'], on_change=reset_data)
|
105 |
|
106 |
if uploaded_file is not None:
|
107 |
extype = get_ext(uploaded_file)
|
|
|
111 |
elif extype.endswith('.txt'):
|
112 |
data = conv_txt(extype)
|
113 |
|
114 |
+
elif extype.endswith('.json'):
|
115 |
+
data = conv_json(extype)
|
116 |
|
117 |
+
elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
|
118 |
+
data = conv_pub(uploaded_file)
|
119 |
+
|
120 |
+
|
121 |
+
col1, col2, col3 = st.columns(3)
|
122 |
+
|
123 |
with col1:
|
124 |
#===check keywords===
|
125 |
keycheck = list(data.columns)
|
|
|
188 |
container6.write("Unfortunately, you don't have a column containing object in your data. Please check again.")
|
189 |
else:
|
190 |
container6.subheader('✔️ Scattertext', divider='blue', anchor=False)
|
191 |
+
container6.write('Congratulations! You can use Scattertext')
|
pages/1 Scattertext.py
CHANGED
@@ -10,6 +10,8 @@ nltk.download('stopwords')
|
|
10 |
from nltk.corpus import stopwords
|
11 |
import time
|
12 |
import sys
|
|
|
|
|
13 |
|
14 |
#===config===
|
15 |
st.set_page_config(
|
@@ -37,6 +39,9 @@ with st.popover("🔗 Menu"):
|
|
37 |
st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
|
38 |
st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
|
39 |
st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
|
|
|
|
|
|
|
40 |
|
41 |
st.header("Scattertext", anchor=False)
|
42 |
st.subheader('Put your file here...', anchor=False)
|
@@ -57,19 +62,66 @@ def upload(extype):
|
|
57 |
if 'Publication Year' in papers.columns:
|
58 |
papers.rename(columns={'Publication Year': 'Year', 'Citing Works Count': 'Cited by',
|
59 |
'Publication Type': 'Document Type', 'Source Title': 'Source title'}, inplace=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
return papers
|
61 |
|
62 |
@st.cache_data(ttl=3600)
|
63 |
def conv_txt(extype):
|
|
|
|
|
|
|
|
|
|
|
64 |
col_dict = {'TI': 'Title',
|
65 |
'SO': 'Source title',
|
|
|
66 |
'DT': 'Document Type',
|
67 |
'AB': 'Abstract',
|
68 |
-
'
|
69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
papers.rename(columns=col_dict, inplace=True)
|
|
|
71 |
return papers
|
72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
@st.cache_data(ttl=3600)
|
74 |
def get_data(extype):
|
75 |
df_col = sorted(papers.select_dtypes(include=['object']).columns.tolist())
|
@@ -122,18 +174,24 @@ def clean_csv(extype):
|
|
122 |
|
123 |
#===lemmatize===
|
124 |
lemmatizer = WordNetLemmatizer()
|
|
|
|
|
125 |
def lemmatize_words(text):
|
126 |
words = text.split()
|
127 |
words = [lemmatizer.lemmatize(word) for word in words]
|
128 |
return ' '.join(words)
|
|
|
129 |
paper[ColCho] = paper[ColCho].apply(lemmatize_words)
|
130 |
|
131 |
words_rmv = [word.strip() for word in words_to_remove.split(";")]
|
132 |
remove_set = set(words_rmv)
|
|
|
|
|
133 |
def remove_words(text):
|
134 |
words = text.split()
|
135 |
cleaned_words = [word for word in words if word not in remove_set]
|
136 |
return ' '.join(cleaned_words)
|
|
|
137 |
paper[ColCho] = paper[ColCho].apply(remove_words)
|
138 |
|
139 |
return paper
|
@@ -155,7 +213,11 @@ def running_scattertext(cat_col, catname, noncatname):
|
|
155 |
nlp = stx.whitespace_nlp_with_sentences,
|
156 |
).build().get_unigram_corpus().remove_infrequent_words(minimum_term_count = min_term)
|
157 |
|
158 |
-
|
|
|
|
|
|
|
|
|
159 |
|
160 |
try:
|
161 |
html = stx.produce_scattertext_explorer(corpus,
|
@@ -175,11 +237,8 @@ def running_scattertext(cat_col, catname, noncatname):
|
|
175 |
width_in_pixels = 900,
|
176 |
minimum_term_frequency = 0,
|
177 |
save_svg_button=True)
|
178 |
-
|
179 |
-
|
180 |
-
time.sleep(1)
|
181 |
-
st.toast('Visualizing', icon='⏳')
|
182 |
-
components.html(html, height = 1200, scrolling = True)
|
183 |
|
184 |
except ValueError:
|
185 |
st.warning('Please decrease the Minimum term count in the advanced settings.', icon="⚠️")
|
@@ -226,7 +285,7 @@ def df_years(first_range, second_range):
|
|
226 |
return filtered_df
|
227 |
|
228 |
#===Read data===
|
229 |
-
uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
|
230 |
|
231 |
if uploaded_file is not None:
|
232 |
try:
|
@@ -236,6 +295,10 @@ if uploaded_file is not None:
|
|
236 |
papers = upload(extype)
|
237 |
elif extype.endswith('.txt'):
|
238 |
papers = conv_txt(extype)
|
|
|
|
|
|
|
|
|
239 |
|
240 |
df_col, selected_cols = get_data(extype)
|
241 |
comparison = check_comparison(extype)
|
@@ -264,7 +327,7 @@ if uploaded_file is not None:
|
|
264 |
|
265 |
paper = clean_csv(extype)
|
266 |
|
267 |
-
tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
|
268 |
|
269 |
with tab1:
|
270 |
#===visualization===
|
@@ -286,7 +349,7 @@ if uploaded_file is not None:
|
|
286 |
st.warning(f'We cannot find {text2} in your document.', icon="⚠️")
|
287 |
else:
|
288 |
with st.spinner('Processing. Please wait until the visualization comes up'):
|
289 |
-
running_scattertext('Topic', 'First Term', 'Second Term')
|
290 |
|
291 |
elif compare == 'Manual label':
|
292 |
col1, col2, col3 = st.columns(3)
|
@@ -313,7 +376,7 @@ if uploaded_file is not None:
|
|
313 |
filtered_df = paper[paper[column_selected].isin([label1, label2])].reset_index(drop=True)
|
314 |
|
315 |
with st.spinner('Processing. Please wait until the visualization comes up'):
|
316 |
-
running_scattertext(column_selected, label1, label2)
|
317 |
|
318 |
elif compare == 'Sources':
|
319 |
col1, col2, col3 = st.columns([4,0.1,4])
|
@@ -334,7 +397,7 @@ if uploaded_file is not None:
|
|
334 |
filtered_df = df_sources(stitle1, stitle2)
|
335 |
|
336 |
with st.spinner('Processing. Please wait until the visualization comes up'):
|
337 |
-
running_scattertext('Source title', stitle1, stitle2)
|
338 |
|
339 |
elif compare == 'Years':
|
340 |
col1, col2, col3 = st.columns([4,0.1,4])
|
@@ -348,19 +411,44 @@ if uploaded_file is not None:
|
|
348 |
filtered_df = df_years(first_range, second_range)
|
349 |
|
350 |
with st.spinner('Processing. Please wait until the visualization comes up'):
|
351 |
-
running_scattertext('Topic Range', 'First range', 'Second range')
|
352 |
-
|
353 |
else:
|
354 |
st.write('You only have data in ', (MAX))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
355 |
|
356 |
with tab2:
|
357 |
st.markdown('**Jason Kessler. 2017. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. In Proceedings of ACL 2017, System Demonstrations, pages 85–90, Vancouver, Canada. Association for Computational Linguistics.** https://doi.org/10.48550/arXiv.1703.00565')
|
358 |
|
359 |
with tab3:
|
|
|
360 |
st.markdown('**Marrone, M., & Linnenluecke, M.K. (2020). Interdisciplinary Research Maps: A new technique for visualizing research topics. PLoS ONE, 15.** https://doi.org/10.1371/journal.pone.0242283')
|
361 |
st.markdown('**Moreno, A., & Iglesias, C.A. (2021). Understanding Customers’ Transport Services with Topic Clustering and Sentiment Analysis. Applied Sciences.** https://doi.org/10.3390/app112110169')
|
362 |
-
st.markdown('**
|
363 |
|
364 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
365 |
st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
|
366 |
-
st.stop()
|
|
|
10 |
from nltk.corpus import stopwords
|
11 |
import time
|
12 |
import sys
|
13 |
+
import json
|
14 |
+
from tools import sourceformat as sf
|
15 |
|
16 |
#===config===
|
17 |
st.set_page_config(
|
|
|
39 |
st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
|
40 |
st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
|
41 |
st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
|
42 |
+
st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
|
43 |
+
st.page_link("pages/8 Shifterator.py", label="Shifterator", icon="8️⃣")
|
44 |
+
st.page_link("pages/9 Summarization.py", label = "Summarization",icon ="9️⃣")
|
45 |
|
46 |
st.header("Scattertext", anchor=False)
|
47 |
st.subheader('Put your file here...', anchor=False)
|
|
|
62 |
if 'Publication Year' in papers.columns:
|
63 |
papers.rename(columns={'Publication Year': 'Year', 'Citing Works Count': 'Cited by',
|
64 |
'Publication Type': 'Document Type', 'Source Title': 'Source title'}, inplace=True)
|
65 |
+
|
66 |
+
elif "About the data" in papers.columns[0]:
|
67 |
+
papers = sf.dim(papers)
|
68 |
+
col_dict = {'MeSH terms': 'Keywords',
|
69 |
+
'PubYear': 'Year',
|
70 |
+
'Times cited': 'Cited by',
|
71 |
+
'Publication Type': 'Document Type'
|
72 |
+
}
|
73 |
+
papers.rename(columns=col_dict, inplace=True)
|
74 |
+
|
75 |
return papers
|
76 |
|
77 |
@st.cache_data(ttl=3600)
|
78 |
def conv_txt(extype):
|
79 |
+
if("PMID" in (uploaded_file.read()).decode()):
|
80 |
+
uploaded_file.seek(0)
|
81 |
+
papers = sf.medline(uploaded_file)
|
82 |
+
print(papers)
|
83 |
+
return papers
|
84 |
col_dict = {'TI': 'Title',
|
85 |
'SO': 'Source title',
|
86 |
+
'DE': 'Author Keywords',
|
87 |
'DT': 'Document Type',
|
88 |
'AB': 'Abstract',
|
89 |
+
'TC': 'Cited by',
|
90 |
+
'PY': 'Year',
|
91 |
+
'ID': 'Keywords Plus',
|
92 |
+
'rights_date_used': 'Year'}
|
93 |
+
uploaded_file.seek(0)
|
94 |
+
papers = pd.read_csv(uploaded_file, sep='\t')
|
95 |
+
if("htid" in papers.columns):
|
96 |
+
papers = sf.htrc(papers)
|
97 |
papers.rename(columns=col_dict, inplace=True)
|
98 |
+
print(papers)
|
99 |
return papers
|
100 |
|
101 |
+
@st.cache_data(ttl=3600)
|
102 |
+
def conv_json(extype):
|
103 |
+
col_dict={'title': 'title',
|
104 |
+
'rights_date_used': 'Year',
|
105 |
+
}
|
106 |
+
|
107 |
+
data = json.load(uploaded_file)
|
108 |
+
hathifile = data['gathers']
|
109 |
+
keywords = pd.DataFrame.from_records(hathifile)
|
110 |
+
|
111 |
+
keywords = sf.htrc(keywords)
|
112 |
+
keywords.rename(columns=col_dict,inplace=True)
|
113 |
+
return keywords
|
114 |
+
|
115 |
+
@st.cache_data(ttl=3600)
|
116 |
+
def conv_pub(extype):
|
117 |
+
if (get_ext(extype)).endswith('.tar.gz'):
|
118 |
+
bytedata = extype.read()
|
119 |
+
keywords = sf.readPub(bytedata)
|
120 |
+
elif (get_ext(extype)).endswith('.xml'):
|
121 |
+
bytedata = extype.read()
|
122 |
+
keywords = sf.readxml(bytedata)
|
123 |
+
return keywords
|
124 |
+
|
125 |
@st.cache_data(ttl=3600)
|
126 |
def get_data(extype):
|
127 |
df_col = sorted(papers.select_dtypes(include=['object']).columns.tolist())
|
|
|
174 |
|
175 |
#===lemmatize===
|
176 |
lemmatizer = WordNetLemmatizer()
|
177 |
+
|
178 |
+
@st.cache_data(ttl=3600)
|
179 |
def lemmatize_words(text):
|
180 |
words = text.split()
|
181 |
words = [lemmatizer.lemmatize(word) for word in words]
|
182 |
return ' '.join(words)
|
183 |
+
|
184 |
paper[ColCho] = paper[ColCho].apply(lemmatize_words)
|
185 |
|
186 |
words_rmv = [word.strip() for word in words_to_remove.split(";")]
|
187 |
remove_set = set(words_rmv)
|
188 |
+
|
189 |
+
@st.cache_data(ttl=3600)
|
190 |
def remove_words(text):
|
191 |
words = text.split()
|
192 |
cleaned_words = [word for word in words if word not in remove_set]
|
193 |
return ' '.join(cleaned_words)
|
194 |
+
|
195 |
paper[ColCho] = paper[ColCho].apply(remove_words)
|
196 |
|
197 |
return paper
|
|
|
213 |
nlp = stx.whitespace_nlp_with_sentences,
|
214 |
).build().get_unigram_corpus().remove_infrequent_words(minimum_term_count = min_term)
|
215 |
|
216 |
+
#table results
|
217 |
+
disp = stx.Dispersion(corpus)
|
218 |
+
disp_df = disp.get_df()
|
219 |
+
|
220 |
+
disp_csv = disp_df.to_csv(index=False).encode('utf-8')
|
221 |
|
222 |
try:
|
223 |
html = stx.produce_scattertext_explorer(corpus,
|
|
|
237 |
width_in_pixels = 900,
|
238 |
minimum_term_frequency = 0,
|
239 |
save_svg_button=True)
|
240 |
+
|
241 |
+
return disp_csv, html
|
|
|
|
|
|
|
242 |
|
243 |
except ValueError:
|
244 |
st.warning('Please decrease the Minimum term count in the advanced settings.', icon="⚠️")
|
|
|
285 |
return filtered_df
|
286 |
|
287 |
#===Read data===
|
288 |
+
uploaded_file = st.file_uploader('', type=['csv', 'txt', 'json', 'tar.gz','xml'], on_change=reset_all)
|
289 |
|
290 |
if uploaded_file is not None:
|
291 |
try:
|
|
|
295 |
papers = upload(extype)
|
296 |
elif extype.endswith('.txt'):
|
297 |
papers = conv_txt(extype)
|
298 |
+
elif extype.endswith('.json'):
|
299 |
+
papers = conv_json(extype)
|
300 |
+
elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
|
301 |
+
papers = conv_pub(uploaded_file)
|
302 |
|
303 |
df_col, selected_cols = get_data(extype)
|
304 |
comparison = check_comparison(extype)
|
|
|
327 |
|
328 |
paper = clean_csv(extype)
|
329 |
|
330 |
+
tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
|
331 |
|
332 |
with tab1:
|
333 |
#===visualization===
|
|
|
349 |
st.warning(f'We cannot find {text2} in your document.', icon="⚠️")
|
350 |
else:
|
351 |
with st.spinner('Processing. Please wait until the visualization comes up'):
|
352 |
+
disp_df, html = running_scattertext('Topic', 'First Term', 'Second Term')
|
353 |
|
354 |
elif compare == 'Manual label':
|
355 |
col1, col2, col3 = st.columns(3)
|
|
|
376 |
filtered_df = paper[paper[column_selected].isin([label1, label2])].reset_index(drop=True)
|
377 |
|
378 |
with st.spinner('Processing. Please wait until the visualization comes up'):
|
379 |
+
disp_df, html = running_scattertext(column_selected, label1, label2)
|
380 |
|
381 |
elif compare == 'Sources':
|
382 |
col1, col2, col3 = st.columns([4,0.1,4])
|
|
|
397 |
filtered_df = df_sources(stitle1, stitle2)
|
398 |
|
399 |
with st.spinner('Processing. Please wait until the visualization comes up'):
|
400 |
+
disp_df, html = running_scattertext('Source title', stitle1, stitle2)
|
401 |
|
402 |
elif compare == 'Years':
|
403 |
col1, col2, col3 = st.columns([4,0.1,4])
|
|
|
411 |
filtered_df = df_years(first_range, second_range)
|
412 |
|
413 |
with st.spinner('Processing. Please wait until the visualization comes up'):
|
414 |
+
disp_df, html = running_scattertext('Topic Range', 'First range', 'Second range')
|
415 |
+
|
416 |
else:
|
417 |
st.write('You only have data in ', (MAX))
|
418 |
+
|
419 |
+
if html:
|
420 |
+
st.toast('Process completed', icon='🎉')
|
421 |
+
time.sleep(1)
|
422 |
+
st.toast('Visualizing', icon='⏳')
|
423 |
+
components.html(html, height = 1200, scrolling = True)
|
424 |
+
|
425 |
+
st.download_button(
|
426 |
+
"📥 Click to download result",
|
427 |
+
disp_df,
|
428 |
+
"scattertext_dataframe.csv",
|
429 |
+
"text/csv",
|
430 |
+
on_click="ignore")
|
431 |
|
432 |
with tab2:
|
433 |
st.markdown('**Jason Kessler. 2017. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. In Proceedings of ACL 2017, System Demonstrations, pages 85–90, Vancouver, Canada. Association for Computational Linguistics.** https://doi.org/10.48550/arXiv.1703.00565')
|
434 |
|
435 |
with tab3:
|
436 |
+
st.markdown('**Sánchez-Franco, M. J., & Rey-Tienda, S. (2023). The role of user-generated content in tourism decision-making: an exemplary study of Andalusia, Spain. Management Decision, 62(7).** https://doi.org/10.1108/md-06-2023-0966')
|
437 |
st.markdown('**Marrone, M., & Linnenluecke, M.K. (2020). Interdisciplinary Research Maps: A new technique for visualizing research topics. PLoS ONE, 15.** https://doi.org/10.1371/journal.pone.0242283')
|
438 |
st.markdown('**Moreno, A., & Iglesias, C.A. (2021). Understanding Customers’ Transport Services with Topic Clustering and Sentiment Analysis. Applied Sciences.** https://doi.org/10.3390/app112110169')
|
439 |
+
st.markdown('**Santosa, F. A. (2025). Artificial Intelligence in Library Studies: A Textual Analysis. JLIS.It, 16(1).** https://doi.org/10.36253/jlis.it-626')
|
440 |
|
441 |
+
with tab4:
|
442 |
+
st.subheader(':blue[Image]', anchor=False)
|
443 |
+
st.write("Click the :blue[Download SVG] on the right side.")
|
444 |
+
st.divider()
|
445 |
+
st.subheader(':blue[Scattertext Dataframe]', anchor=False)
|
446 |
+
st.button('📥 Click to download result')
|
447 |
+
st.text("Click the Download button to get the CSV result.")
|
448 |
+
|
449 |
+
except NameError:
|
450 |
+
pass
|
451 |
+
|
452 |
+
except Exception as e:
|
453 |
st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
|
454 |
+
st.stop()
|
pages/2 Topic Modeling.py
CHANGED
@@ -1,445 +1,659 @@
|
|
1 |
-
#import module
|
2 |
-
import streamlit as st
|
3 |
-
import streamlit.components.v1 as components
|
4 |
-
import pandas as pd
|
5 |
-
import numpy as np
|
6 |
-
import re
|
7 |
-
import
|
8 |
-
nltk
|
9 |
-
|
10 |
-
nltk.
|
11 |
-
|
12 |
-
|
13 |
-
import gensim
|
14 |
-
import gensim.corpora as corpora
|
15 |
-
from gensim.corpora import Dictionary
|
16 |
-
from gensim.models.coherencemodel import CoherenceModel
|
17 |
-
from gensim.models.ldamodel import LdaModel
|
18 |
-
from
|
19 |
-
import
|
20 |
-
import
|
21 |
-
import
|
22 |
-
|
23 |
-
|
24 |
-
from
|
25 |
-
from
|
26 |
-
|
27 |
-
from
|
28 |
-
import
|
29 |
-
import
|
30 |
-
import
|
31 |
-
import
|
32 |
-
import
|
33 |
-
import
|
34 |
-
import
|
35 |
-
|
36 |
-
|
37 |
-
import
|
38 |
-
import
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
st.
|
69 |
-
st.
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
@st.
|
83 |
-
def
|
84 |
-
|
85 |
-
return
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
papers = pd.read_csv(uploaded_file
|
116 |
-
papers.
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
|
425 |
-
|
426 |
-
|
427 |
-
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
445 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#import module
|
2 |
+
import streamlit as st
|
3 |
+
import streamlit.components.v1 as components
|
4 |
+
import pandas as pd
|
5 |
+
import numpy as np
|
6 |
+
import re
|
7 |
+
import string
|
8 |
+
import nltk
|
9 |
+
nltk.download('wordnet')
|
10 |
+
from nltk.stem import WordNetLemmatizer
|
11 |
+
nltk.download('stopwords')
|
12 |
+
from nltk.corpus import stopwords
|
13 |
+
import gensim
|
14 |
+
import gensim.corpora as corpora
|
15 |
+
from gensim.corpora import Dictionary
|
16 |
+
from gensim.models.coherencemodel import CoherenceModel
|
17 |
+
from gensim.models.ldamodel import LdaModel
|
18 |
+
from gensim.models import Phrases
|
19 |
+
from gensim.models.phrases import Phraser
|
20 |
+
from pprint import pprint
|
21 |
+
import pickle
|
22 |
+
import pyLDAvis
|
23 |
+
import pyLDAvis.gensim_models as gensimvis
|
24 |
+
from io import StringIO
|
25 |
+
from ipywidgets.embed import embed_minimal_html
|
26 |
+
from nltk.stem.snowball import SnowballStemmer
|
27 |
+
from bertopic import BERTopic
|
28 |
+
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, TextGeneration
|
29 |
+
import plotly.express as px
|
30 |
+
from sklearn.cluster import KMeans
|
31 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
32 |
+
import bitermplus as btm
|
33 |
+
import tmplot as tmp
|
34 |
+
import tomotopy
|
35 |
+
import sys
|
36 |
+
import spacy
|
37 |
+
import en_core_web_sm
|
38 |
+
import pipeline
|
39 |
+
from html2image import Html2Image
|
40 |
+
from umap import UMAP
|
41 |
+
import os
|
42 |
+
import time
|
43 |
+
import json
|
44 |
+
from tools import sourceformat as sf
|
45 |
+
import datamapplot
|
46 |
+
from sentence_transformers import SentenceTransformer
|
47 |
+
import openai
|
48 |
+
from transformers import pipeline
|
49 |
+
|
50 |
+
#===config===
|
51 |
+
st.set_page_config(
|
52 |
+
page_title="Coconut",
|
53 |
+
page_icon="🥥",
|
54 |
+
layout="wide",
|
55 |
+
initial_sidebar_state="collapsed"
|
56 |
+
)
|
57 |
+
|
58 |
+
hide_streamlit_style = """
|
59 |
+
<style>
|
60 |
+
#MainMenu
|
61 |
+
{visibility: hidden;}
|
62 |
+
footer {visibility: hidden;}
|
63 |
+
[data-testid="collapsedControl"] {display: none}
|
64 |
+
</style>
|
65 |
+
"""
|
66 |
+
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
|
67 |
+
|
68 |
+
with st.popover("🔗 Menu"):
|
69 |
+
st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
|
70 |
+
st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
|
71 |
+
st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
|
72 |
+
st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
|
73 |
+
st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
|
74 |
+
st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
|
75 |
+
st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
|
76 |
+
st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
|
77 |
+
|
78 |
+
st.header("Topic Modeling", anchor=False)
|
79 |
+
st.subheader('Put your file here...', anchor=False)
|
80 |
+
|
81 |
+
#========unique id========
|
82 |
+
@st.cache_resource(ttl=3600)
|
83 |
+
def create_list():
|
84 |
+
l = [1, 2, 3]
|
85 |
+
return l
|
86 |
+
|
87 |
+
l = create_list()
|
88 |
+
first_list_value = l[0]
|
89 |
+
l[0] = first_list_value + 1
|
90 |
+
uID = str(l[0])
|
91 |
+
|
92 |
+
@st.cache_data(ttl=3600)
|
93 |
+
def get_ext(uploaded_file):
|
94 |
+
extype = uID+uploaded_file.name
|
95 |
+
return extype
|
96 |
+
|
97 |
+
#===clear cache===
|
98 |
+
|
99 |
+
def reset_biterm():
|
100 |
+
try:
|
101 |
+
biterm_map.clear()
|
102 |
+
biterm_bar.clear()
|
103 |
+
except NameError:
|
104 |
+
biterm_topic.clear()
|
105 |
+
|
106 |
+
def reset_all():
|
107 |
+
st.cache_data.clear()
|
108 |
+
|
109 |
+
#===avoiding deadlock===
|
110 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
111 |
+
|
112 |
+
#===upload file===
|
113 |
+
@st.cache_data(ttl=3600)
|
114 |
+
def upload(file):
|
115 |
+
papers = pd.read_csv(uploaded_file)
|
116 |
+
if "About the data" in papers.columns[0]:
|
117 |
+
papers = sf.dim(papers)
|
118 |
+
col_dict = {'MeSH terms': 'Keywords',
|
119 |
+
'PubYear': 'Year',
|
120 |
+
'Times cited': 'Cited by',
|
121 |
+
'Publication Type': 'Document Type'
|
122 |
+
}
|
123 |
+
papers.rename(columns=col_dict, inplace=True)
|
124 |
+
|
125 |
+
return papers
|
126 |
+
|
127 |
+
@st.cache_data(ttl=3600)
|
128 |
+
def conv_txt(extype):
|
129 |
+
if("PMID" in (uploaded_file.read()).decode()):
|
130 |
+
uploaded_file.seek(0)
|
131 |
+
papers = sf.medline(uploaded_file)
|
132 |
+
print(papers)
|
133 |
+
return papers
|
134 |
+
col_dict = {'TI': 'Title',
|
135 |
+
'SO': 'Source title',
|
136 |
+
'DE': 'Author Keywords',
|
137 |
+
'DT': 'Document Type',
|
138 |
+
'AB': 'Abstract',
|
139 |
+
'TC': 'Cited by',
|
140 |
+
'PY': 'Year',
|
141 |
+
'ID': 'Keywords Plus',
|
142 |
+
'rights_date_used': 'Year'}
|
143 |
+
uploaded_file.seek(0)
|
144 |
+
papers = pd.read_csv(uploaded_file, sep='\t')
|
145 |
+
if("htid" in papers.columns):
|
146 |
+
papers = sf.htrc(papers)
|
147 |
+
papers.rename(columns=col_dict, inplace=True)
|
148 |
+
print(papers)
|
149 |
+
return papers
|
150 |
+
|
151 |
+
|
152 |
+
@st.cache_data(ttl=3600)
|
153 |
+
def conv_json(extype):
|
154 |
+
col_dict={'title': 'title',
|
155 |
+
'rights_date_used': 'Year',
|
156 |
+
}
|
157 |
+
|
158 |
+
data = json.load(uploaded_file)
|
159 |
+
hathifile = data['gathers']
|
160 |
+
keywords = pd.DataFrame.from_records(hathifile)
|
161 |
+
|
162 |
+
keywords = sf.htrc(keywords)
|
163 |
+
keywords.rename(columns=col_dict,inplace=True)
|
164 |
+
return keywords
|
165 |
+
|
166 |
+
@st.cache_resource(ttl=3600)
|
167 |
+
def conv_pub(extype):
|
168 |
+
if (get_ext(extype)).endswith('.tar.gz'):
|
169 |
+
bytedata = extype.read()
|
170 |
+
keywords = sf.readPub(bytedata)
|
171 |
+
elif (get_ext(extype)).endswith('.xml'):
|
172 |
+
bytedata = extype.read()
|
173 |
+
keywords = sf.readxml(bytedata)
|
174 |
+
return keywords
|
175 |
+
|
176 |
+
#===Read data===
|
177 |
+
uploaded_file = st.file_uploader('', type=['csv', 'txt','json','tar.gz','xml'], on_change=reset_all)
|
178 |
+
|
179 |
+
if uploaded_file is not None:
|
180 |
+
try:
|
181 |
+
extype = get_ext(uploaded_file)
|
182 |
+
|
183 |
+
if extype.endswith('.csv'):
|
184 |
+
papers = upload(extype)
|
185 |
+
elif extype.endswith('.txt'):
|
186 |
+
papers = conv_txt(extype)
|
187 |
+
|
188 |
+
elif extype.endswith('.json'):
|
189 |
+
papers = conv_json(extype)
|
190 |
+
elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
|
191 |
+
papers = conv_pub(uploaded_file)
|
192 |
+
|
193 |
+
coldf = sorted(papers.select_dtypes(include=['object']).columns.tolist())
|
194 |
+
|
195 |
+
c1, c2, c3 = st.columns([3,3,4])
|
196 |
+
method = c1.selectbox(
|
197 |
+
'Choose method',
|
198 |
+
('Choose...', 'pyLDA', 'Biterm', 'BERTopic'))
|
199 |
+
ColCho = c2.selectbox('Choose column', (coldf))
|
200 |
+
num_cho = c3.number_input('Choose number of topics', min_value=2, max_value=30, value=5)
|
201 |
+
|
202 |
+
d1, d2 = st.columns([3,7])
|
203 |
+
xgram = d1.selectbox("N-grams", ("1", "2", "3"))
|
204 |
+
xgram = int(xgram)
|
205 |
+
words_to_remove = d2.text_input("Remove specific words. Separate words by semicolons (;)")
|
206 |
+
|
207 |
+
rem_copyright = d1.toggle('Remove copyright statement', value=True)
|
208 |
+
rem_punc = d2.toggle('Remove punctuation', value=True)
|
209 |
+
|
210 |
+
#===advance settings===
|
211 |
+
with st.expander("🧮 Show advance settings"):
|
212 |
+
t1, t2, t3 = st.columns([3,3,4])
|
213 |
+
if method == 'pyLDA':
|
214 |
+
py_random_state = t1.number_input('Random state', min_value=0, max_value=None, step=1)
|
215 |
+
py_chunksize = t2.number_input('Chunk size', value=100 , min_value=10, max_value=None, step=1)
|
216 |
+
opt_threshold = t3.number_input('Threshold', value=100 , min_value=1, max_value=None, step=1)
|
217 |
+
|
218 |
+
elif method == 'Biterm':
|
219 |
+
btm_seed = t1.number_input('Random state seed', value=100 , min_value=1, max_value=None, step=1)
|
220 |
+
btm_iterations = t2.number_input('Iterations number', value=20 , min_value=2, max_value=None, step=1)
|
221 |
+
opt_threshold = t3.number_input('Threshold', value=100 , min_value=1, max_value=None, step=1)
|
222 |
+
|
223 |
+
elif method == 'BERTopic':
|
224 |
+
u1, u2 = st.columns([5,5])
|
225 |
+
|
226 |
+
bert_top_n_words = u1.number_input('top_n_words', value=5 , min_value=5, max_value=25, step=1)
|
227 |
+
bert_random_state = u2.number_input('random_state', value=42 , min_value=1, max_value=None, step=1)
|
228 |
+
bert_n_components = u1.number_input('n_components', value=5 , min_value=1, max_value=None, step=1)
|
229 |
+
bert_n_neighbors = u2.number_input('n_neighbors', value=15 , min_value=1, max_value=None, step=1)
|
230 |
+
bert_embedding_model = st.radio(
|
231 |
+
"embedding_model",
|
232 |
+
["all-MiniLM-L6-v2", "paraphrase-multilingual-MiniLM-L12-v2", "en_core_web_sm"], index=0, horizontal=True)
|
233 |
+
|
234 |
+
fine_tuning = st.toggle("Use Fine-tuning")
|
235 |
+
if fine_tuning:
|
236 |
+
topic_labelling = st.toggle("Automatic topic labelling")
|
237 |
+
if topic_labelling:
|
238 |
+
llm_provider = st.selectbox("Provider",["OpenAI","HuggingFace"])
|
239 |
+
if llm_provider == "OpenAI":
|
240 |
+
api_key = st.text_input("API Key")
|
241 |
+
|
242 |
+
else:
|
243 |
+
st.write('Please choose your preferred method')
|
244 |
+
|
245 |
+
#===clean csv===
|
246 |
+
@st.cache_data(ttl=3600, show_spinner=False)
|
247 |
+
def clean_csv(extype):
|
248 |
+
paper = papers.dropna(subset=[ColCho])
|
249 |
+
|
250 |
+
#===mapping===
|
251 |
+
paper['Abstract_pre'] = paper[ColCho].map(lambda x: x.lower())
|
252 |
+
if rem_punc:
|
253 |
+
paper['Abstract_pre'] = paper['Abstract_pre'].map(
|
254 |
+
lambda x: re.sub(f"[{re.escape(string.punctuation)}]", " ", x)
|
255 |
+
).map(lambda x: re.sub(r"\s+", " ", x).strip())
|
256 |
+
paper['Abstract_pre'] = paper['Abstract_pre'].str.replace('[\u2018\u2019\u201c\u201d]', '', regex=True)
|
257 |
+
if rem_copyright:
|
258 |
+
paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: re.sub('©.*', '', x))
|
259 |
+
|
260 |
+
#===stopword removal===
|
261 |
+
stop = stopwords.words('english')
|
262 |
+
paper['Abstract_stop'] = paper['Abstract_pre'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
|
263 |
+
|
264 |
+
#===lemmatize===
|
265 |
+
lemmatizer = WordNetLemmatizer()
|
266 |
+
|
267 |
+
@st.cache_resource(ttl=3600)
|
268 |
+
def lemmatize_words(text):
|
269 |
+
words = text.split()
|
270 |
+
words = [lemmatizer.lemmatize(word) for word in words]
|
271 |
+
return ' '.join(words)
|
272 |
+
paper['Abstract_lem'] = paper['Abstract_stop'].apply(lemmatize_words)
|
273 |
+
|
274 |
+
words_rmv = [word.strip() for word in words_to_remove.split(";")]
|
275 |
+
remove_dict = {word: None for word in words_rmv}
|
276 |
+
|
277 |
+
@st.cache_resource(ttl=3600)
|
278 |
+
def remove_words(text):
|
279 |
+
words = text.split()
|
280 |
+
cleaned_words = [word for word in words if word not in remove_dict]
|
281 |
+
return ' '.join(cleaned_words)
|
282 |
+
paper['Abstract_lem'] = paper['Abstract_lem'].map(remove_words)
|
283 |
+
|
284 |
+
topic_abs = paper.Abstract_lem.values.tolist()
|
285 |
+
return topic_abs, paper
|
286 |
+
|
287 |
+
topic_abs, paper=clean_csv(extype)
|
288 |
+
|
289 |
+
if st.button("Submit", on_click=reset_all):
|
290 |
+
num_topic = num_cho
|
291 |
+
|
292 |
+
if method == 'BERTopic':
|
293 |
+
st.info('BERTopic is an expensive process when dealing with a large volume of text with our existing resources. Please kindly wait until the visualization appears.', icon="ℹ️")
|
294 |
+
|
295 |
+
#===topic===
|
296 |
+
if method == 'Choose...':
|
297 |
+
st.write('')
|
298 |
+
|
299 |
+
elif method == 'pyLDA':
|
300 |
+
tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
|
301 |
+
|
302 |
+
with tab1:
|
303 |
+
#===visualization===
|
304 |
+
@st.cache_data(ttl=3600, show_spinner=False)
|
305 |
+
def pylda(extype):
|
306 |
+
topic_abs_LDA = [t.split(' ') for t in topic_abs]
|
307 |
+
|
308 |
+
bigram = Phrases(topic_abs_LDA, min_count=xgram, threshold=opt_threshold)
|
309 |
+
trigram = Phrases(bigram[topic_abs_LDA], threshold=opt_threshold)
|
310 |
+
bigram_mod = Phraser(bigram)
|
311 |
+
trigram_mod = Phraser(trigram)
|
312 |
+
|
313 |
+
topic_abs_LDA = [trigram_mod[bigram_mod[doc]] for doc in topic_abs_LDA]
|
314 |
+
|
315 |
+
id2word = Dictionary(topic_abs_LDA)
|
316 |
+
corpus = [id2word.doc2bow(text) for text in topic_abs_LDA]
|
317 |
+
#===LDA===
|
318 |
+
lda_model = LdaModel(corpus=corpus,
|
319 |
+
id2word=id2word,
|
320 |
+
num_topics=num_topic,
|
321 |
+
random_state=py_random_state,
|
322 |
+
chunksize=py_chunksize,
|
323 |
+
alpha='auto',
|
324 |
+
per_word_topics=False)
|
325 |
+
pprint(lda_model.print_topics())
|
326 |
+
doc_lda = lda_model[corpus]
|
327 |
+
topics = lda_model.show_topics(num_words = 30,formatted=False)
|
328 |
+
|
329 |
+
#===visualization===
|
330 |
+
coherence_model_lda = CoherenceModel(model=lda_model, texts=topic_abs_LDA, dictionary=id2word, coherence='c_v')
|
331 |
+
coherence_lda = coherence_model_lda.get_coherence()
|
332 |
+
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
|
333 |
+
py_lda_vis_html = pyLDAvis.prepared_data_to_html(vis)
|
334 |
+
return py_lda_vis_html, coherence_lda, vis, topics
|
335 |
+
|
336 |
+
with st.spinner('Performing computations. Please wait ...'):
|
337 |
+
try:
|
338 |
+
py_lda_vis_html, coherence_lda, vis, topics = pylda(extype)
|
339 |
+
st.write('Coherence score: ', coherence_lda)
|
340 |
+
components.html(py_lda_vis_html, width=1500, height=800)
|
341 |
+
st.markdown('Copyright (c) 2015, Ben Mabey. https://github.com/bmabey/pyLDAvis')
|
342 |
+
|
343 |
+
@st.cache_data(ttl=3600, show_spinner=False)
|
344 |
+
def img_lda(vis):
|
345 |
+
pyLDAvis.save_html(vis, 'output.html')
|
346 |
+
hti = Html2Image()
|
347 |
+
hti.browser.flags = ['--default-background-color=ffffff', '--hide-scrollbars']
|
348 |
+
hti.browser.use_new_headless = None
|
349 |
+
css = "body {background: white;}"
|
350 |
+
hti.screenshot(
|
351 |
+
other_file='output.html', css_str=css, size=(1500, 800),
|
352 |
+
save_as='ldavis_img.png'
|
353 |
+
)
|
354 |
+
|
355 |
+
img_lda(vis)
|
356 |
+
|
357 |
+
d1, d2 = st.columns(2)
|
358 |
+
with open("ldavis_img.png", "rb") as file:
|
359 |
+
btn = d1.download_button(
|
360 |
+
label="Download image",
|
361 |
+
data=file,
|
362 |
+
file_name="ldavis_img.png",
|
363 |
+
mime="image/png"
|
364 |
+
)
|
365 |
+
|
366 |
+
#===download results===#
|
367 |
+
resultf = pd.DataFrame(topics)
|
368 |
+
#formatting
|
369 |
+
resultf = resultf.transpose()
|
370 |
+
resultf = resultf.drop([0])
|
371 |
+
resultf = resultf.explode(list(range(len(resultf.columns))), ignore_index=False)
|
372 |
+
|
373 |
+
resultcsv = resultf.to_csv().encode("utf-8")
|
374 |
+
d2.download_button(
|
375 |
+
label = "Download Results",
|
376 |
+
data=resultcsv,
|
377 |
+
file_name="results.csv",
|
378 |
+
mime="text\csv",
|
379 |
+
on_click="ignore")
|
380 |
+
|
381 |
+
except NameError as f:
|
382 |
+
st.warning('🖱️ Please click Submit')
|
383 |
+
|
384 |
+
with tab2:
|
385 |
+
st.markdown('**Sievert, C., & Shirley, K. (2014). LDAvis: A method for visualizing and interpreting topics. Proceedings of the Workshop on Interactive Language Learning, Visualization, and Interfaces.** https://doi.org/10.3115/v1/w14-3110')
|
386 |
+
|
387 |
+
with tab3:
|
388 |
+
st.markdown('**Chen, X., & Wang, H. (2019, January). Automated chat transcript analysis using topic modeling for library reference services. Proceedings of the Association for Information Science and Technology, 56(1), 368–371.** https://doi.org/10.1002/pra2.31')
|
389 |
+
st.markdown('**Joo, S., Ingram, E., & Cahill, M. (2021, December 15). Exploring Topics and Genres in Storytime Books: A Text Mining Approach. Evidence Based Library and Information Practice, 16(4), 41–62.** https://doi.org/10.18438/eblip29963')
|
390 |
+
st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Topic Modeling. Text Mining for Information Professionals, 105–137.** https://doi.org/10.1007/978-3-030-85085-2_4')
|
391 |
+
st.markdown('**Lamba, M., & Madhusudhan, M. (2019, June 7). Mapping of topics in DESIDOC Journal of Library and Information Technology, India: a study. Scientometrics, 120(2), 477–505.** https://doi.org/10.1007/s11192-019-03137-5')
|
392 |
+
|
393 |
+
with tab4:
|
394 |
+
st.subheader(':blue[pyLDA]', anchor=False)
|
395 |
+
st.button('Download image')
|
396 |
+
st.text("Click Download Image button.")
|
397 |
+
st.divider()
|
398 |
+
st.subheader(':blue[Downloading CSV Results]', anchor=False)
|
399 |
+
st.button("Download Results")
|
400 |
+
st.text("Click Download results button at bottom of page")
|
401 |
+
|
402 |
+
#===Biterm===
|
403 |
+
elif method == 'Biterm':
|
404 |
+
|
405 |
+
#===optimize Biterm===
|
406 |
+
@st.cache_data(ttl=3600, show_spinner=False)
|
407 |
+
def biterm_topic(extype):
|
408 |
+
tokenized_abs = [t.split(' ') for t in topic_abs]
|
409 |
+
|
410 |
+
bigram = Phrases(tokenized_abs, min_count=xgram, threshold=opt_threshold)
|
411 |
+
trigram = Phrases(bigram[tokenized_abs], threshold=opt_threshold)
|
412 |
+
bigram_mod = Phraser(bigram)
|
413 |
+
trigram_mod = Phraser(trigram)
|
414 |
+
|
415 |
+
topic_abs_ngram = [trigram_mod[bigram_mod[doc]] for doc in tokenized_abs]
|
416 |
+
|
417 |
+
topic_abs_str = [' '.join(doc) for doc in topic_abs_ngram]
|
418 |
+
|
419 |
+
|
420 |
+
X, vocabulary, vocab_dict = btm.get_words_freqs(topic_abs_str)
|
421 |
+
tf = np.array(X.sum(axis=0)).ravel()
|
422 |
+
docs_vec = btm.get_vectorized_docs(topic_abs, vocabulary)
|
423 |
+
docs_lens = list(map(len, docs_vec))
|
424 |
+
biterms = btm.get_biterms(docs_vec)
|
425 |
+
|
426 |
+
model = btm.BTM(X, vocabulary, seed=btm_seed, T=num_topic, M=20, alpha=50/8, beta=0.01)
|
427 |
+
model.fit(biterms, iterations=btm_iterations)
|
428 |
+
|
429 |
+
p_zd = model.transform(docs_vec)
|
430 |
+
coherence = model.coherence_
|
431 |
+
phi = tmp.get_phi(model)
|
432 |
+
topics_coords = tmp.prepare_coords(model)
|
433 |
+
totaltop = topics_coords.label.values.tolist()
|
434 |
+
perplexity = model.perplexity_
|
435 |
+
top_topics = model.df_words_topics_
|
436 |
+
|
437 |
+
return topics_coords, phi, totaltop, perplexity, top_topics
|
438 |
+
|
439 |
+
tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
|
440 |
+
with tab1:
|
441 |
+
try:
|
442 |
+
with st.spinner('Performing computations. Please wait ...'):
|
443 |
+
topics_coords, phi, totaltop, perplexity, top_topics = biterm_topic(extype)
|
444 |
+
col1, col2 = st.columns([4,6])
|
445 |
+
|
446 |
+
@st.cache_data(ttl=3600)
|
447 |
+
def biterm_map(extype):
|
448 |
+
btmvis_coords = tmp.plot_scatter_topics(topics_coords, size_col='size', label_col='label', topic=numvis)
|
449 |
+
return btmvis_coords
|
450 |
+
|
451 |
+
@st.cache_data(ttl=3600)
|
452 |
+
def biterm_bar(extype):
|
453 |
+
terms_probs = tmp.calc_terms_probs_ratio(phi, topic=numvis, lambda_=1)
|
454 |
+
btmvis_probs = tmp.plot_terms(terms_probs, font_size=12)
|
455 |
+
return btmvis_probs
|
456 |
+
|
457 |
+
with col1:
|
458 |
+
st.write('Perplexity score: ', perplexity)
|
459 |
+
st.write('')
|
460 |
+
numvis = st.selectbox(
|
461 |
+
'Choose topic',
|
462 |
+
(totaltop), on_change=reset_biterm)
|
463 |
+
btmvis_coords = biterm_map(extype)
|
464 |
+
st.altair_chart(btmvis_coords)
|
465 |
+
with col2:
|
466 |
+
btmvis_probs = biterm_bar(extype)
|
467 |
+
st.altair_chart(btmvis_probs, use_container_width=True)
|
468 |
+
|
469 |
+
#===download results===#
|
470 |
+
resultcsv = top_topics.to_csv().encode("utf-8")
|
471 |
+
st.download_button(label = "Download Results", data=resultcsv, file_name="results.csv", mime="text\csv", on_click="ignore")
|
472 |
+
|
473 |
+
except ValueError as g:
|
474 |
+
st.error('🙇♂️ Please raise the number of topics and click submit')
|
475 |
+
|
476 |
+
except NameError as f:
|
477 |
+
st.warning('🖱️ Please click Submit')
|
478 |
+
|
479 |
+
with tab2:
|
480 |
+
st.markdown('**Yan, X., Guo, J., Lan, Y., & Cheng, X. (2013, May 13). A biterm topic model for short texts. Proceedings of the 22nd International Conference on World Wide Web.** https://doi.org/10.1145/2488388.2488514')
|
481 |
+
with tab3:
|
482 |
+
st.markdown('**Cai, M., Shah, N., Li, J., Chen, W. H., Cuomo, R. E., Obradovich, N., & Mackey, T. K. (2020, August 26). Identification and characterization of tweets related to the 2015 Indiana HIV outbreak: A retrospective infoveillance study. PLOS ONE, 15(8), e0235150.** https://doi.org/10.1371/journal.pone.0235150')
|
483 |
+
st.markdown('**Chen, Y., Dong, T., Ban, Q., & Li, Y. (2021). What Concerns Consumers about Hypertension? A Comparison between the Online Health Community and the Q&A Forum. International Journal of Computational Intelligence Systems, 14(1), 734.** https://doi.org/10.2991/ijcis.d.210203.002')
|
484 |
+
st.markdown('**George, Crissandra J., "AMBIGUOUS APPALACHIANNESS: A LINGUISTIC AND PERCEPTUAL INVESTIGATION INTO ARC-LABELED PENNSYLVANIA COUNTIES" (2022). Theses and Dissertations-- Linguistics. 48.** https://doi.org/10.13023/etd.2022.217')
|
485 |
+
st.markdown('**Li, J., Chen, W. H., Xu, Q., Shah, N., Kohler, J. C., & Mackey, T. K. (2020). Detection of self-reported experiences with corruption on twitter using unsupervised machine learning. Social Sciences & Humanities Open, 2(1), 100060.** https://doi.org/10.1016/j.ssaho.2020.100060')
|
486 |
+
with tab4:
|
487 |
+
st.subheader(':blue[Biterm]', anchor=False)
|
488 |
+
st.text("Click the three dots at the top right then select the desired format.")
|
489 |
+
st.markdown("")
|
490 |
+
st.divider()
|
491 |
+
st.subheader(':blue[Downloading CSV Results]', anchor=False)
|
492 |
+
st.button("Download Results")
|
493 |
+
st.text("Click Download results button at bottom of page")
|
494 |
+
|
495 |
+
|
496 |
+
#===BERTopic===
|
497 |
+
elif method == 'BERTopic':
|
498 |
+
@st.cache_resource(ttl = 3600, show_spinner=False)
|
499 |
+
#@st.cache_data(ttl=3600, show_spinner=False)
|
500 |
+
def bertopic_vis(extype):
|
501 |
+
umap_model = UMAP(n_neighbors=bert_n_neighbors, n_components=bert_n_components,
|
502 |
+
min_dist=0.0, metric='cosine', random_state=bert_random_state)
|
503 |
+
cluster_model = KMeans(n_clusters=num_topic)
|
504 |
+
if bert_embedding_model == 'all-MiniLM-L6-v2':
|
505 |
+
model = SentenceTransformer('all-MiniLM-L6-v2')
|
506 |
+
lang = 'en'
|
507 |
+
embeddings = model.encode(topic_abs, show_progress_bar=True)
|
508 |
+
|
509 |
+
elif bert_embedding_model == 'en_core_web_sm':
|
510 |
+
nlp = en_core_web_sm.load(exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
|
511 |
+
model = nlp
|
512 |
+
lang = 'en'
|
513 |
+
embeddings = np.array([nlp(text).vector for text in topic_abs])
|
514 |
+
|
515 |
+
elif bert_embedding_model == 'paraphrase-multilingual-MiniLM-L12-v2':
|
516 |
+
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
|
517 |
+
lang = 'multilingual'
|
518 |
+
embeddings = model.encode(topic_abs, show_progress_bar=True)
|
519 |
+
|
520 |
+
representation_model = ""
|
521 |
+
|
522 |
+
if fine_tuning:
|
523 |
+
keybert = KeyBERTInspired()
|
524 |
+
mmr = MaximalMarginalRelevance(diversity=0.3)
|
525 |
+
representation_model = {
|
526 |
+
"KeyBERT": keybert,
|
527 |
+
"MMR": mmr,
|
528 |
+
}
|
529 |
+
if topic_labelling:
|
530 |
+
if llm_provider == "OpenAI":
|
531 |
+
client = openai.OpenAI(api_key=api_key)
|
532 |
+
representation_model = {
|
533 |
+
"KeyBERT": keybert,
|
534 |
+
"MMR": mmr,
|
535 |
+
"test": OpenAI(client, model = "gpt-4o-mini", delay_in_seconds=10)
|
536 |
+
}
|
537 |
+
elif llm_provider == "HuggingFace":
|
538 |
+
gennie = pipeline("text2text-generation", model = "google/flan-t5-base")
|
539 |
+
clientmod = TextGeneration(gennie)
|
540 |
+
representation_model = {
|
541 |
+
"KeyBERT": keybert,
|
542 |
+
"MMR": mmr,
|
543 |
+
"test": clientmod
|
544 |
+
}
|
545 |
+
|
546 |
+
vectorizer_model = CountVectorizer(ngram_range=(1, xgram), stop_words='english')
|
547 |
+
topic_model = BERTopic(representation_model = representation_model, embedding_model=model, hdbscan_model=cluster_model, language=lang, umap_model=umap_model, vectorizer_model=vectorizer_model, top_n_words=bert_top_n_words)
|
548 |
+
topics, probs = topic_model.fit_transform(topic_abs, embeddings=embeddings)
|
549 |
+
|
550 |
+
if(fine_tuning and topic_labelling):
|
551 |
+
generated_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["test"].values()]
|
552 |
+
topic_model.set_topic_labels(generated_labels)
|
553 |
+
|
554 |
+
return topic_model, topics, probs, embeddings
|
555 |
+
|
556 |
+
@st.cache_resource(ttl = 3600, show_spinner=False)
|
557 |
+
def Vis_Topics(extype):
|
558 |
+
fig1 = topic_model.visualize_topics()
|
559 |
+
return fig1
|
560 |
+
@st.cache_resource(ttl = 3600, show_spinner=False)
|
561 |
+
def Vis_Documents(extype):
|
562 |
+
+        fig2 = topic_model.visualize_document_datamap(topic_abs, embeddings=embeddings, custom_labels = True)
+        return fig2
+    @st.cache_resource(ttl = 3600, show_spinner=False)
+    def Vis_Hierarchy(extype):
+        fig3 = topic_model.visualize_hierarchy(top_n_topics=num_topic, custom_labels = True)
+        return fig3
+    @st.cache_resource(ttl = 3600, show_spinner=False)
+    def Vis_Heatmap(extype):
+        global topic_model
+        fig4 = topic_model.visualize_heatmap(n_clusters=num_topic-1, width=1000, height=1000, custom_labels = True)
+        return fig4
+    @st.cache_resource(ttl = 3600, show_spinner=False)
+    def Vis_Barchart(extype):
+        fig5 = topic_model.visualize_barchart(top_n_topics=num_topic, custom_labels = True)
+        return fig5
+
+    tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
+    with tab1:
+        try:
+            with st.spinner('Performing computations. Please wait ...'):
+
+                topic_model, topics, probs, embeddings = bertopic_vis(extype)
+                time.sleep(.5)
+                st.toast('Visualize Topics', icon='🏃')
+                fig1 = Vis_Topics(extype)
+
+                time.sleep(.5)
+                st.toast('Visualize Document', icon='🏃')
+                fig2 = Vis_Documents(extype)
+
+                time.sleep(.5)
+                st.toast('Visualize Document Hierarchy', icon='🏃')
+                fig3 = Vis_Hierarchy(extype)
+
+                time.sleep(.5)
+                st.toast('Visualize Topic Similarity', icon='🏃')
+                fig4 = Vis_Heatmap(extype)
+
+                time.sleep(.5)
+                st.toast('Visualize Terms', icon='🏃')
+                fig5 = Vis_Barchart(extype)
+
+                bertab1, bertab2, bertab3, bertab4, bertab5 = st.tabs(["Visualize Topics", "Visualize Terms", "Visualize Documents",
+                    "Visualize Document Hierarchy", "Visualize Topic Similarity"])
+
+                with bertab1:
+                    st.plotly_chart(fig1, use_container_width=True)
+                with bertab2:
+                    st.plotly_chart(fig5, use_container_width=True)
+                with bertab3:
+                    st.plotly_chart(fig2, use_container_width=True)
+                with bertab4:
+                    st.plotly_chart(fig3, use_container_width=True)
+                with bertab5:
+                    st.plotly_chart(fig4, use_container_width=True)
+
+            #===download results===#
+            results = topic_model.get_topic_info()
+            resultf = pd.DataFrame(results)
+            resultcsv = resultf.to_csv().encode("utf-8")
+            st.download_button(
+                label="Download Results",
+                data=resultcsv,
+                file_name="results.csv",
+                mime="text/csv",
+                on_click="ignore",
+            )
+
+        except ValueError as e:
+            st.write(e)
+            st.error('🙇♂️ Please raise the number of topics and click submit')
+
+        except NameError as e:
+            st.warning('🖱️ Please click Submit')
+            st.write(e)
+
+    with tab2:
+        st.markdown('**Grootendorst, M. (2022). BERTopic: Neural topic modeling with a class-based TF-IDF procedure. arXiv preprint arXiv:2203.05794.** https://doi.org/10.48550/arXiv.2203.05794')
+
+    with tab3:
+        st.markdown('**Jeet Rawat, A., Ghildiyal, S., & Dixit, A. K. (2022, December 1). Topic modelling of legal documents using NLP and bidirectional encoder representations from transformers. Indonesian Journal of Electrical Engineering and Computer Science, 28(3), 1749.** https://doi.org/10.11591/ijeecs.v28.i3.pp1749-1755')
+        st.markdown('**Yao, L. F., Ferawati, K., Liew, K., Wakamiya, S., & Aramaki, E. (2023, April 20). Disruptions in the Cystic Fibrosis Community’s Experiences and Concerns During the COVID-19 Pandemic: Topic Modeling and Time Series Analysis of Reddit Comments. Journal of Medical Internet Research, 25, e45249.** https://doi.org/10.2196/45249')
+
+    with tab4:
+        st.divider()
+        st.subheader(':blue[BERTopic]', anchor=False)
+        st.text("Click the camera icon on the top right menu")
+        st.markdown("")
+        st.divider()
+        st.subheader(':blue[Downloading CSV Results]', anchor=False)
+        st.button("Download Results")
+        st.text("Click Download results button at bottom of page")
+
+except Exception as e:
+    st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
+    st.write(e)
+    st.stop()
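For reference, a minimal BERTopic sketch outside the app, mirroring the visualize_barchart and get_topic_info calls added above. This is not the page's pipeline: the corpus (scikit-learn's 20 Newsgroups sample) and the output file name are assumptions for illustration only.

from sklearn.datasets import fetch_20newsgroups
from bertopic import BERTopic

# Toy corpus standing in for the uploaded abstracts (assumption)
docs = fetch_20newsgroups(subset="test", remove=("headers", "footers", "quotes")).data[:1000]

topic_model = BERTopic(verbose=False)
topics, probs = topic_model.fit_transform(docs)            # fit and assign one topic per document
fig = topic_model.visualize_barchart(top_n_topics=8)       # same call the page uses for "Visualize Terms"
topic_model.get_topic_info().to_csv("results.csv", index=False)  # same table the page offers for download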
pages/3 Bidirected Network.py
CHANGED
@@ -1,276 +1,370 @@
- (all 276 lines of the previous version of pages/3 Bidirected Network.py are removed by this commit; the removed lines are truncated in this view, so only the rewritten 370-line file is shown below)
+#import module
+import streamlit as st
+import pandas as pd
+import re
+import nltk
+nltk.download('punkt')
+from nltk.tokenize import word_tokenize
+from mlxtend.preprocessing import TransactionEncoder
+te = TransactionEncoder()
+from mlxtend.frequent_patterns import fpgrowth
+from mlxtend.frequent_patterns import association_rules
+from streamlit_agraph import agraph, Node, Edge, Config
+import nltk
+nltk.download('wordnet')
+from nltk.stem import WordNetLemmatizer
+nltk.download('stopwords')
+from nltk.corpus import stopwords
+from nltk.stem.snowball import SnowballStemmer
+import sys
+import time
+import json
+from tools import sourceformat as sf
+
+import networkx as nx
+import matplotlib.pyplot as plt
+import plotly.graph_objects as go
+
+import altair as alt
+import altair_nx as anx
+
+#===config===
+st.set_page_config(
+    page_title="Coconut",
+    page_icon="🥥",
+    layout="wide",
+    initial_sidebar_state="collapsed"
+)
+
+hide_streamlit_style = """
+<style>
+#MainMenu
+{visibility: hidden;}
+footer {visibility: hidden;}
+[data-testid="collapsedControl"] {display: none}
+</style>
+"""
+st.markdown(hide_streamlit_style, unsafe_allow_html=True)
+
+with st.popover("🔗 Menu"):
+    st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
+    st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
+    st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
+    st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
+    st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
+    st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
+    st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
+    st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
+    st.page_link("pages/8 Shifterator.py", label="Shifterator", icon="8️⃣")
+
+st.header("Bidirected Network", anchor=False)
+st.subheader('Put your file here...', anchor=False)
+
+#===clear cache===
+def reset_all():
+    st.cache_data.clear()
+
+#===check type===
+@st.cache_data(ttl=3600)
+def get_ext(extype):
+    extype = uploaded_file.name
+    return extype
+
+@st.cache_data(ttl=3600)
+def upload(extype):
+    papers = pd.read_csv(uploaded_file)
+
+    if "About the data" in papers.columns[0]:
+        papers = sf.dim(papers)
+        col_dict = {'MeSH terms': 'Keywords',
+                    'PubYear': 'Year',
+                    'Times cited': 'Cited by',
+                    'Publication Type': 'Document Type'
+                    }
+        papers.rename(columns=col_dict, inplace=True)
+
+        return papers
+
+    return papers
+
+@st.cache_data(ttl=3600)
+def conv_txt(extype):
+    if("PMID" in (uploaded_file.read()).decode()):
+        uploaded_file.seek(0)
+        papers = sf.medline(uploaded_file)
+        print(papers)
+        return papers
+    col_dict = {'TI': 'Title',
+                'SO': 'Source title',
+                'DE': 'Author Keywords',
+                'DT': 'Document Type',
+                'AB': 'Abstract',
+                'TC': 'Cited by',
+                'PY': 'Year',
+                'ID': 'Keywords Plus',
+                'rights_date_used': 'Year'}
+    uploaded_file.seek(0)
+    papers = pd.read_csv(uploaded_file, sep='\t')
+    if("htid" in papers.columns):
+        papers = sf.htrc(papers)
+    papers.rename(columns=col_dict, inplace=True)
+    print(papers)
+    return papers
+
+
+@st.cache_data(ttl=3600)
+def conv_json(extype):
+    col_dict={'title': 'title',
+              'rights_date_used': 'Year',
+              }
+
+    data = json.load(uploaded_file)
+    hathifile = data['gathers']
+    keywords = pd.DataFrame.from_records(hathifile)
+
+    keywords = sf.htrc(keywords)
+    keywords.rename(columns=col_dict,inplace=True)
+    return keywords
+
+@st.cache_data(ttl=3600)
+def conv_pub(extype):
+    if (get_ext(extype)).endswith('.tar.gz'):
+        bytedata = extype.read()
+        keywords = sf.readPub(bytedata)
+    elif (get_ext(extype)).endswith('.xml'):
+        bytedata = extype.read()
+        keywords = sf.readxml(bytedata)
+    return keywords
+
+#===Read data===
+uploaded_file = st.file_uploader('', type=['csv', 'txt','json','tar.gz', 'xml'], on_change=reset_all)
+
+if uploaded_file is not None:
+    try:
+        extype = get_ext(uploaded_file)
+        if extype.endswith('.csv'):
+            papers = upload(extype)
+        elif extype.endswith('.txt'):
+            papers = conv_txt(extype)
+        elif extype.endswith('.json'):
+            papers = conv_json(extype)
+        elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
+            papers = conv_pub(uploaded_file)
+
+        @st.cache_data(ttl=3600)
+        def get_data_arul(extype):
+            list_of_column_key = list(papers.columns)
+            list_of_column_key = [k for k in list_of_column_key if 'Keyword' in k]
+            return papers, list_of_column_key
+
+        papers, list_of_column_key = get_data_arul(extype)
+
+        col1, col2 = st.columns(2)
+        with col1:
+            method = st.selectbox(
+                'Choose method',
+                ('Lemmatization', 'Stemming'), on_change=reset_all)
+            layout = st.selectbox(
+                'Choose graph layout',
+                ['Circular','Kamada Kawai','Random','Spring','Shell']
+            )
+        with col2:
+            keyword = st.selectbox(
+                'Choose column',
+                (list_of_column_key), on_change=reset_all)
+
+        #===body===
+        @st.cache_data(ttl=3600)
+        def clean_arul(extype):
+            global keyword, papers
+            try:
+                arul = papers.dropna(subset=[keyword])
+            except KeyError:
+                st.error('Error: Please check your Author/Index Keywords column.')
+                sys.exit(1)
+            arul[keyword] = arul[keyword].map(lambda x: re.sub('-—–', ' ', x))
+            arul[keyword] = arul[keyword].map(lambda x: re.sub('; ', ' ; ', x))
+            arul[keyword] = arul[keyword].map(lambda x: x.lower())
+            arul[keyword] = arul[keyword].dropna()
+            return arul
+
+        arul = clean_arul(extype)
+
+        #===stem/lem===
+        @st.cache_data(ttl=3600)
+        def lemma_arul(extype):
+            lemmatizer = WordNetLemmatizer()
+            def lemmatize_words(text):
+                words = text.split()
+                words = [lemmatizer.lemmatize(word) for word in words]
+                return ' '.join(words)
+            arul[keyword] = arul[keyword].apply(lemmatize_words)
+            return arul
+
+        @st.cache_data(ttl=3600)
+        def stem_arul(extype):
+            stemmer = SnowballStemmer("english")
+            def stem_words(text):
+                words = text.split()
+                words = [stemmer.stem(word) for word in words]
+                return ' '.join(words)
+            arul[keyword] = arul[keyword].apply(stem_words)
+            return arul
+
+        if method == 'Lemmatization':
+            arul = lemma_arul(extype)
+        else:
+            arul = stem_arul(extype)
+
+        @st.cache_data(ttl=3600)
+        def arm(extype):
+            arule = arul[keyword].str.split(' ; ')
+            arule_list = arule.values.tolist()
+            te_ary = te.fit(arule_list).transform(arule_list)
+            df = pd.DataFrame(te_ary, columns=te.columns_)
+            return df
+        df = arm(extype)
+
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            supp = st.slider(
+                'Select value of Support',
+                0.001, 1.000, (0.010), on_change=reset_all)
+        with col2:
+            conf = st.slider(
+                'Select value of Confidence',
+                0.001, 1.000, (0.050), on_change=reset_all)
+        with col3:
+            maxlen = st.slider(
+                'Maximum length of the itemsets generated',
+                2, 8, (2), on_change=reset_all)
+
+        tab1, tab2, tab3, tab4 = st.tabs(["📈 Result & Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
+
+        with tab1:
+            #===Association rules===
+            @st.cache_data(ttl=3600)
+            def freqitem(extype):
+                freq_item = fpgrowth(df, min_support=supp, use_colnames=True, max_len=maxlen)
+                return freq_item
+
+            freq_item = freqitem(extype)
+            col1, col2 = st.columns(2)
+            with col1:
+                st.write('🚨 The more data you have, the longer you will have to wait.')
+            with col2:
+                showall = st.checkbox('Show all nodes', value=True, on_change=reset_all)
+
+            @st.cache_data(ttl=3600)
+            def arm_table(extype):
+                restab = association_rules(freq_item, metric='confidence', min_threshold=conf)
+                restab = restab[['antecedents', 'consequents', 'antecedent support', 'consequent support', 'support', 'confidence', 'lift', 'conviction']]
+                restab['antecedents'] = restab['antecedents'].apply(lambda x: ', '.join(list(x))).astype('unicode')
+                restab['consequents'] = restab['consequents'].apply(lambda x: ', '.join(list(x))).astype('unicode')
+                if showall:
+                    restab['Show'] = True
+                else:
+                    restab['Show'] = False
+                return restab
+
+            if freq_item.empty:
+                st.error('Please lower your value.', icon="🚨")
+            else:
+                restab = arm_table(extype)
+                restab = st.data_editor(restab, use_container_width=True)
+                res = restab[restab['Show'] == True]
+
+                #===visualize===
+
+                if st.button('📈 Generate network visualization', on_click=reset_all):
+                    with st.spinner('Visualizing, please wait ....'):
+                        @st.cache_data(ttl=3600)
+                        def map_node(extype):
+                            res['to'] = res['antecedents'] + ' → ' + res['consequents'] + '\n Support = ' + res['support'].astype(str) + '\n Confidence = ' + res['confidence'].astype(str) + '\n Conviction = ' + res['conviction'].astype(str)
+                            res_ant = res[['antecedents','antecedent support']].rename(columns={'antecedents': 'node', 'antecedent support': 'size'})
+                            res_con = res[['consequents','consequent support']].rename(columns={'consequents': 'node', 'consequent support': 'size'})
+                            res_node = pd.concat([res_ant, res_con]).drop_duplicates(keep='first')
+                            return res_node, res
+
+                        res_node, res = map_node(extype)
+                        ___='''
+                        @st.cache_data(ttl=3600)
+                        def arul_net(extype):
+                            nodes = []
+                            edges = []
+
+                            for w,x in zip(res_node['size'], res_node['node']):
+                                nodes.append(x)
+                            for y,z,a,b in zip(res['antecedents'],res['consequents'],res['confidence'],res['to']):
+                                edge = (y,z)
+
+                                edges.append(edge)
+
+                            return nodes, edges
+
+                        #nodes, edges = arul_net(res)
+                        '''
+
+                        @st.cache_data(ttl=3600)
+                        def graphmaker(__netgraph):
+
+                            #add nodes, w is weight, x is node label
+                            for w,x in zip(res_node['size'], res_node['node']):
+                                __netgraph.add_node(x, size = w)
+                            #add edges, y is startpoint, z is endpoint, a is edge weight, b is title
+                            for y,z,a,b in zip(res['antecedents'],res['consequents'],res['confidence'],res['to']):
+                                __netgraph.add_edge(y,z, weight = int(a*100))
+
+
+                        #Make graph with NetworkX
+
+                        G=nx.DiGraph()
+
+                        graphmaker(G)
+
+                        #G.add_edges_from(edges) ##### remove this later
+
+                        #Graph layout
+                        if(layout=="Spring"):
+                            pos=nx.spring_layout(G)
+                        elif(layout == "Kamada Kawai"):
+                            pos=nx.kamada_kawai_layout(G)
+                        elif(layout == "Circular"):
+                            pos = nx.circular_layout(G)
+                        elif(layout=="Random"):
+                            pos = nx.random_layout(G)
+                        elif(layout=="Shell"):
+                            pos=nx.shell_layout(G)
+
+                        graph = anx.draw_networkx(G,pos, node_label = 'node',
+                            edge_width = 'weight',
+                            node_size = 'size',
+                            curved_edges = True,
+                            node_font_size=12,
+                            chart_width=1920,
+                            chart_height=1080).interactive()
+
+                        st.altair_chart(graph)
+
+
+        with tab2:
+            st.markdown('**Santosa, F. A. (2023). Adding Perspective to the Bibliometric Mapping Using Bidirected Graph. Open Information Science, 7(1), 20220152.** https://doi.org/10.1515/opis-2022-0152')
+
+        with tab3:
+            st.markdown('**Agrawal, R., Imieliński, T., & Swami, A. (1993). Mining association rules between sets of items in large databases. In ACM SIGMOD Record (Vol. 22, Issue 2, pp. 207–216). Association for Computing Machinery (ACM).** https://doi.org/10.1145/170036.170072')
+            st.markdown('**Brin, S., Motwani, R., Ullman, J. D., & Tsur, S. (1997). Dynamic itemset counting and implication rules for market basket data. ACM SIGMOD Record, 26(2), 255–264.** https://doi.org/10.1145/253262.253325')
+            st.markdown('**Edmonds, J., & Johnson, E. L. (2003). Matching: A Well-Solved Class of Integer Linear Programs. Combinatorial Optimization — Eureka, You Shrink!, 27–30.** https://doi.org/10.1007/3-540-36478-1_3')
+            st.markdown('**Li, M. (2016, August 23). An exploration to visualise the emerging trends of technology foresight based on an improved technique of co-word analysis and relevant literature data of WOS. Technology Analysis & Strategic Management, 29(6), 655–671.** https://doi.org/10.1080/09537325.2016.1220518')
+        with tab4:
+            st.subheader("Download visualization")
+            st.text("Zoom in, zoom out, or shift the nodes as desired, then right-click and select Save image as ...")
+            st.markdown("")
+            st.subheader("Download table as CSV")
+            st.text("Hover cursor over table, and click download arrow")
+            st.image("images/tablenetwork.png")
+
+    except Exception as e:
+        st.write(e)
+        st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
+        st.stop()
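A minimal sketch of the association-rule step this page is built on, using the same mlxtend calls (TransactionEncoder, fpgrowth, association_rules). The toy keyword lists and thresholds are assumptions for illustration only.

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules

# Each record is the keyword list of one paper (made-up data)
records = [["data mining", "libraries"],
           ["data mining", "text mining"],
           ["libraries", "text mining", "data mining"]]

te = TransactionEncoder()
onehot = pd.DataFrame(te.fit(records).transform(records), columns=te.columns_)  # one-hot transactions
freq = fpgrowth(onehot, min_support=0.5, use_colnames=True, max_len=2)          # frequent itemsets
rules = association_rules(freq, metric="confidence", min_threshold=0.5)         # same call the page uses
print(rules[["antecedents", "consequents", "support", "confidence"]])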
pages/4 Sunburst.py
CHANGED
@@ -4,6 +4,9 @@ import pandas as pd
 import plotly.express as px
 import numpy as np
 import sys
+import json
+from tools import sourceformat as sf
+
 
 #===config===
 st.set_page_config(
@@ -31,6 +34,7 @@ with st.popover("🔗 Menu"):
     st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
     st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
     st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
+    st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
 
 st.header("Sunburst Visualization", anchor=False)
 st.subheader('Put your file here...', anchor=False)
@@ -52,24 +56,71 @@ def upload(extype):
     if 'Publication Year' in papers.columns:
         papers.rename(columns={'Publication Year': 'Year', 'Citing Works Count': 'Cited by',
             'Publication Type': 'Document Type', 'Source Title': 'Source title'}, inplace=True)
+    if "About the data" in papers.columns[0]:
+        papers = sf.dim(papers)
+        col_dict = {'MeSH terms': 'Keywords',
+                    'PubYear': 'Year',
+                    'Times cited': 'Cited by',
+                    'Publication Type': 'Document Type'
+                    }
+        papers.rename(columns=col_dict, inplace=True)
+
     return papers
 
 @st.cache_data(ttl=3600)
 def conv_txt(extype):
+    if("PMID" in (uploaded_file.read()).decode()):
+        uploaded_file.seek(0)
+        papers = sf.medline(uploaded_file)
+        print(papers)
+        return papers
     col_dict = {'TI': 'Title',
                 'SO': 'Source title',
-                'DT': 'Document Type',
                 'DE': 'Author Keywords',
+                'DT': 'Document Type',
                 'AB': 'Abstract',
                 'TC': 'Cited by',
-                'PY': 'Year',
+                'PY': 'Year',
+                'ID': 'Keywords Plus',
+                'rights_date_used': 'Year'}
+    uploaded_file.seek(0)
+    papers = pd.read_csv(uploaded_file, sep='\t')
+    if("htid" in papers.columns):
+        papers = sf.htrc(papers)
     papers.rename(columns=col_dict, inplace=True)
+    print(papers)
     return papers
 
+@st.cache_data(ttl=3600)
+def conv_json(extype):
+    col_dict={'title': 'title',
+              'rights_date_used': 'Year',
+              'content_provider_code': 'Document Type',
+              'Keywords':'Source title'
+              }
+
+    data = json.load(uploaded_file)
+    hathifile = data['gathers']
+    keywords = pd.DataFrame.from_records(hathifile)
+
+    keywords = sf.htrc(keywords)
+    keywords['Cited by'] = keywords.groupby(['Keywords'])['Keywords'].transform('size')
+    keywords.rename(columns=col_dict,inplace=True)
+    return keywords
+
+def conv_pub(extype):
+    if (get_ext(extype)).endswith('.tar.gz'):
+        bytedata = extype.read()
+        keywords = sf.readPub(bytedata)
+    elif (get_ext(extype)).endswith('.xml'):
+        bytedata = extype.read()
+        keywords = sf.readxml(bytedata)
+        keywords['Cited by'] = keywords.groupby(['Keywords'])['Keywords'].transform('size')
+    st.write(keywords)
+    return keywords
+
 #===Read data===
-uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
+uploaded_file = st.file_uploader('', type=['csv', 'txt','json','tar.gz', 'xml'], on_change=reset_all)
 
 if uploaded_file is not None:
     try:
@@ -79,36 +130,45 @@ if uploaded_file is not None:
 
         elif extype.endswith('.txt'):
             papers = conv_txt(extype)
+        elif extype.endswith('.json'):
+            papers = conv_json(extype)
+        elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
+            papers = conv_pub(uploaded_file)
+
         @st.cache_data(ttl=3600)
         def get_minmax(extype):
            extype = extype
            MIN = int(papers['Year'].min())
            MAX = int(papers['Year'].max())
+           MIN1 = int(papers['Cited by'].min())
+           MAX1 = int(papers['Cited by'].max())
            GAP = MAX - MIN
-           return papers, MIN, MAX, GAP
-
-        tab1, tab2 = st.tabs(["📈 Generate visualization", "📓 Recommended Reading"])
+           return papers, MIN, MAX, GAP, MIN1, MAX1
+
+        tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📓 Recommended Reading", "⬇️ Download Help"])
 
        with tab1:
            #===sunburst===
            try:
-               papers, MIN, MAX, GAP = get_minmax(extype)
+               papers, MIN, MAX, GAP, MIN1, MAX1 = get_minmax(extype)
            except KeyError:
                st.error('Error: Please check again your columns.')
                sys.exit(1)
 
            if (GAP != 0):
                YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX), on_change=reset_all)
+               KEYLIM = st.slider('Cited By Count',min_value = MIN1, max_value = MAX1, value = (MIN1,MAX1), on_change=reset_all)
            else:
                st.write('You only have data in ', (MAX))
                YEAR = (MIN, MAX)
+               KEYLIM = (MIN1,MAX1)
            @st.cache_data(ttl=3600)
            def listyear(extype):
                global papers
                years = list(range(YEAR[0],YEAR[1]+1))
+               cited = list(range(KEYLIM[0],KEYLIM[1]+1))
                papers = papers.loc[papers['Year'].isin(years)]
+               papers = papers.loc[papers['Cited by'].isin(cited)]
                return years, papers
 
            @st.cache_data(ttl=3600)
@@ -118,19 +178,23 @@ if uploaded_file is not None:
                vis[['doctype','source','citby','year']] = papers[['Document Type','Source title','Cited by','Year']]
                viz=vis.groupby(['doctype', 'source', 'year'])['citby'].agg(['sum','count']).reset_index()
                viz.rename(columns={'sum': 'cited by', 'count': 'total docs'}, inplace=True)
+
+
+
                fig = px.sunburst(viz, path=['doctype', 'source', 'year'], values='total docs',
                    color='cited by',
                    color_continuous_scale='RdBu',
                    color_continuous_midpoint=np.average(viz['cited by'], weights=viz['total docs']))
                fig.update_layout(height=800, width=1200)
-               return fig
+               return fig, viz
 
            years, papers = listyear(extype)
 
            if {'Document Type','Source title','Cited by','Year'}.issubset(papers.columns):
-               fig = vis_sunbrust(extype)
+               fig, viz = vis_sunbrust(extype)
                st.plotly_chart(fig, height=800, width=1200) #use_container_width=True)
+
+               st.dataframe(viz)
 
            else:
                st.error('We require these columns: Document Type, Source title, Cited by, Year', icon="🚨")
@@ -138,7 +202,10 @@ if uploaded_file is not None:
        with tab2:
            st.markdown('**numpy.average — NumPy v1.24 Manual. (n.d.). Numpy.Average — NumPy v1.24 Manual.** https://numpy.org/doc/stable/reference/generated/numpy.average.html')
            st.markdown('**Sunburst. (n.d.). Sunburst Charts in Python.** https://plotly.com/python/sunburst-charts/')
+
+       with tab3:
+           st.text("Click the camera icon on the top right menu (you may need to hover your cursor within the visualization)")
+           st.markdown("")
    except:
        st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
+       st.stop()
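A minimal px.sunburst sketch with the same path/values/color pattern used in vis_sunbrust above. The tiny DataFrame is made up for illustration only.

import pandas as pd
import numpy as np
import plotly.express as px

viz = pd.DataFrame({
    "doctype": ["Article", "Article", "Review"],
    "source": ["Journal A", "Journal B", "Journal A"],
    "year": [2021, 2022, 2022],
    "cited by": [10, 4, 7],
    "total docs": [3, 2, 1],
})
fig = px.sunburst(viz, path=["doctype", "source", "year"], values="total docs",
                  color="cited by", color_continuous_scale="RdBu",
                  color_continuous_midpoint=np.average(viz["cited by"], weights=viz["total docs"]))
fig.show()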
pages/5 Burst Detection.py
CHANGED
@@ -15,6 +15,9 @@ import plotly.graph_objects as go
 from plotly.subplots import make_subplots
 import plotly.io as pio
 import sys
+import json
+from tools import sourceformat as sf
+
 
 #===config===
 st.set_page_config(
@@ -42,6 +45,8 @@ with st.popover("🔗 Menu"):
     st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
     st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
     st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
+    st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
+    st.page_link("pages/8 Shifterator.py", label="Shifterator", icon="8️⃣")
 
 st.header("Burst Detection", anchor=False)
 st.subheader('Put your file here...', anchor=False)
@@ -51,7 +56,7 @@ def reset_all():
     st.cache_data.clear()
 
 # Initialize NLP model
+nlp = spacy.load("en_core_web_sm")
 
 @st.cache_data(ttl=3600)
 def upload(extype):
@@ -60,6 +65,15 @@ def upload(extype):
     if 'Publication Year' in df.columns:
         df.rename(columns={'Publication Year': 'Year', 'Citing Works Count': 'Cited by',
             'Publication Type': 'Document Type', 'Source Title': 'Source title'}, inplace=True)
+    if "About the data" in df.columns[0]:
+        df = sf.dim(df)
+        col_dict = {'MeSH terms': 'Keywords',
+                    'PubYear': 'Year',
+                    'Times cited': 'Cited by',
+                    'Publication Type': 'Document Type'
+                    }
+        df.rename(columns=col_dict, inplace=True)
+
     return df
 
 @st.cache_data(ttl=3600)
@@ -76,14 +90,49 @@ def get_minmax(df):
 
 @st.cache_data(ttl=3600)
 def conv_txt(extype):
+    if("PMID" in (uploaded_file.read()).decode()):
+        uploaded_file.seek(0)
+        papers = sf.medline(uploaded_file)
+        print(papers)
+        return papers
     col_dict = {'TI': 'Title',
                 'SO': 'Source title',
+                'DE': 'Author Keywords',
                 'DT': 'Document Type',
                 'AB': 'Abstract',
-                'TC': 'Cited by',
-                'PY': 'Year',
+                'TC': 'Cited by',
+                'PY': 'Year',
+                'ID': 'Keywords Plus',
+                'rights_date_used': 'Year'}
+    uploaded_file.seek(0)
+    papers = pd.read_csv(uploaded_file, sep='\t')
+    if("htid" in papers.columns):
+        papers = sf.htrc(papers)
+    papers.rename(columns=col_dict, inplace=True)
+    print(papers)
+    return papers
+
+def conv_json(extype):
+    col_dict={'title': 'title',
+              'rights_date_used': 'Year',
+              }
+
+    data = json.load(uploaded_file)
+    hathifile = data['gathers']
+    keywords = pd.DataFrame.from_records(hathifile)
+
+    keywords = sf.htrc(keywords)
+    keywords.rename(columns=col_dict,inplace=True)
+    return keywords
+
+def conv_pub(extype):
+    if (get_ext(extype)).endswith('.tar.gz'):
+        bytedata = extype.read()
+        keywords = sf.readPub(bytedata)
+    elif (get_ext(extype)).endswith('.xml'):
+        bytedata = extype.read()
+        keywords = sf.readxml(bytedata)
+    return keywords
 
 # Helper Functions
 @st.cache_data(ttl=3600)
@@ -107,6 +156,10 @@ def load_data(uploaded_file):
         df = upload(extype)
     elif extype.endswith('.txt'):
         df = conv_txt(extype)
+    elif extype.endswith('.json'):
+        df = conv_json(extype)
+    elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
+        df = conv_pub(uploaded_file)
 
     df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
     df = df.dropna(subset=['Year'])
@@ -133,23 +186,29 @@ def clean_data(df):
 
     # Preprocess text
     df['processed'] = df.apply(lambda row: preprocess_text(f"{row.get(col_name, '')}"), axis=1)
+
+    ngram_range = (1, xgram)
 
     # Vectorize processed text
     if count_method == "Document Frequency":
-        vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split(), binary=True)
+        vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split(), binary=True, ngram_range=ngram_range)
     else:
-        vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split())
+        vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split(), ngram_range=ngram_range)
     X = vectorizer.fit_transform(df['processed'].tolist())
 
     # Create DataFrame from the Document-Term Matrix (DTM)
     dtm = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=df['Year'].values)
    yearly_term_frequency = dtm.groupby(dtm.index).sum()
 
+    # excluded & included words
+    if exc_inc == "Words to exclude":
+        excluded_words = [word.strip() for word in words_input.split(',')]
+        filtered_words = [word for word in yearly_term_frequency.columns if word not in excluded_words]
 
+    elif exc_inc == "Focus on these words":
+        included_words = [word.strip() for word in words_input.split(',')]
+        filtered_words = [word for word in yearly_term_frequency.columns if word in included_words]
+
    top_words = yearly_term_frequency[filtered_words].sum().nlargest(top_n).index.tolist()
 
    return yearly_term_frequency, top_words
@@ -205,27 +264,38 @@ def convert_df(df):
    return df.to_csv().encode("utf-8")
 
 @st.cache_data(ttl=3600)
-def scattervis(bursts, freq_data):
+def scattervis(bursts, freq_data, top_n):
+    freq_data = freq_data.reset_index()
    freq_data.rename(columns={"index": "Year"}, inplace=True)
+
    freq_data_melted = freq_data.melt(id_vars=["Year"], var_name="Category", value_name="Value")
    freq_data_melted = freq_data_melted[freq_data_melted["Value"] > 0]
 
+    wordlist = freq_data_melted["Category"].unique()
    years = freq_data["Year"].tolist()
+
    bursts["begin"] = bursts["begin"].apply(lambda x: years[min(x, len(years) - 1)] if x < len(years) else None)
    bursts["end"] = bursts["end"].apply(lambda x: years[min(x, len(years) - 1)] if x < len(years) else None)
+
    burst_points = []
    for _, row in bursts.iterrows():
        for year in range(row["begin"], row["end"] + 1):
            burst_points.append((year, row["label"], row["weight"]))
    burst_points_df = pd.DataFrame(burst_points, columns=["Year", "Category", "Weight"])
+
+    min_year = min(years)
+    max_year = max(years)
+    n_years = max_year - min_year + 1
+    n_labels = len(wordlist)
+
+    label_spacing = 50
+    year_spacing = 60
+
+    plot_height = n_labels * label_spacing + 100
+    plot_width = n_years * year_spacing + 150
+
    fig = go.Figure()
+
    # scatter trace for burst points
    fig.add_trace(go.Scatter(
        x=burst_points_df["Year"],
@@ -233,14 +303,15 @@ def scattervis(bursts, freq_data):
        mode='markers',
        marker=dict(
            symbol='square',
+            size=40,
            color='red',
+            opacity=0.5
+        ),
        hoverinfo='text',
        text=burst_points_df["Weight"],
        showlegend=False
    ))
+
    # scatter trace for freq_data
    fig.add_trace(go.Scatter(
        x=freq_data_melted["Year"],
@@ -251,26 +322,43 @@ def scattervis(bursts, freq_data):
            size=30,
            color=freq_data_melted["Value"],
            colorscale='Blues',
+            showscale=False
+        ),
        text=freq_data_melted["Value"],
        textposition="middle center",
        textfont=dict(
            size=16,
+            color=['white' if value > freq_data_melted["Value"].max()/2 else 'black'
+                   for value in freq_data_melted["Value"]]
+        )
    ))
+
+    # Layout
-    max_year = max(years)
    fig.update_layout(
+        xaxis=dict(
+            tickmode='linear',
+            dtick=1,
+            range=[min_year - 1, max_year + 1],
+            tickfont=dict(size=16),
+            automargin=True,
+            showgrid=False,
+            zeroline=False
+        ),
+        yaxis=dict(
+            tickvals=wordlist,
+            ticktext=wordlist,
+            tickmode='array',
+            tickfont=dict(size=16),
+            automargin=True,
+            showgrid=False,
+            zeroline=False
+        ),
+        plot_bgcolor='white',
+        paper_bgcolor='white',
        showlegend=False,
+        margin=dict(l=20, r=20, t=20, b=20),
+        height=plot_height,
+        width=plot_width,
        autosize=False
    )
@@ -289,15 +377,13 @@ def linegraph(bursts, freq_data):
        line_shape='linear',
        hoverinfo='text',
        hovertext=[f"Year: {index}<br>Frequency: {freq}" for index, freq in zip(freq_data.index, freq_data[column])],
-        text=freq_data[column],
+        #text=freq_data[column],
        textposition='top center'
    ), row=row, col=col)
    # Add area charts
    for _, row_data in bursts[bursts['label'] == column].iterrows():
        x_values = freq_data.index[row_data['begin']:row_data['end']+1]
        y_values = freq_data[column][row_data['begin']:row_data['end']+1]
        #middle_y = sum(y_values) / len(y_values)
        y_post = min(freq_data[column]) + 1 if running_total == "Running total" else sum(y_values) / len(y_values)
        x_offset = 0.1
@@ -326,7 +412,19 @@ def linegraph(bursts, freq_data):
            textangle=270,
            row=row, col=col
        )
+
+        # Add labels for values only in bursts
+        fig.add_trace(go.Scatter(
+            x=x_values, y=y_values, mode='lines+markers+text', name=column,
+            line_shape='linear',
+            hoverinfo='text',
+            hovertext=[f"Year: {index}<br>Frequency: {freq}" for index, freq in zip(freq_data.index, freq_data[column])],
+            text=y_values,
+            textposition='top center'
+        ), row=row, col=col)
+        print(freq_data[column])
+
+
        col += 1
        if col > 2:
            col = 1
@@ -349,36 +447,41 @@ def download_result(freq_data, bursts):
    csv2 = convert_df(bursts)
    return csv1, csv2
 
-uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
+uploaded_file = st.file_uploader('', type=['csv', 'txt','json','tar.gz','xml'], on_change=reset_all)
 
 if uploaded_file is not None:
    try:
+        c1, c2, c3 = st.columns([3,3,4])
        top_n = c1.number_input("Number of top words to analyze", min_value=5, value=10, step=1, on_change=reset_all)
        viz_selected = c2.selectbox("Option for visualization",
+            ("Line graph", "Heatmap"), on_change=reset_all)
        running_total = c3.selectbox("Calculation method",
            ("Running total", "By occurrences each year"), on_change=reset_all)
+        count_method = c1.selectbox("Count by",
            ("Term Frequency", "Document Frequency"), on_change=reset_all)
 
-        d1, d2 = st.columns([2,8])
        df, coldf, MIN, MAX, GAP = load_data(uploaded_file)
+        col_name = c2.selectbox("Select column to analyze",
            (coldf), on_change=reset_all)
+        xgram = c3.selectbox("N-grams", ("1", "2", "3"), on_change=reset_all)
+        xgram = int(xgram)
+
+        st.divider()
+        d1, d2 = st.columns([3,7])
+        exc_inc = d1.radio("Select to exclude or focus on specific words", ["Words to exclude","Focus on these words"], horizontal=True, on_change=reset_all)
+        words_input = d2.text_input("Words to exclude or focus on (comma-separated)", on_change=reset_all)
 
        if (GAP != 0):
            YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX), on_change=reset_all)
        else:
+            c1.write('You only have data in ', (MAX))
            sys.exit(1)
 
        yearly_term_frequency, top_words = clean_data(df)
 
        bursts, freq_data, num_unique_labels, num_rows = apply_burst_detection(top_words, yearly_term_frequency)
 
-        tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
+        tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
 
        with tab1:
            if bursts.empty:
@@ -394,7 +497,7 @@ if uploaded_file is not None:
                linegraph(bursts, freq_data)
 
            elif viz_selected =="Scatter plot":
-                scattervis(bursts, freq_data)
+                scattervis(bursts, freq_data, top_n)
 
            csv1, csv2 = download_result(freq_data, bursts)
            e1, e2, e3 = st.columns(3)
@@ -424,7 +527,23 @@ if uploaded_file is not None:
            st.markdown('**Li, M., Zheng, Z., & Yi, Q. (2024). The landscape of hot topics and research frontiers in Kawasaki disease: scientometric analysis. Heliyon, 10(8), e29680–e29680.** https://doi.org/10.1016/j.heliyon.2024.e29680')
            st.markdown('**Domicián Máté, Ni Made Estiyanti and Novotny, A. (2024) ‘How to support innovative small firms? Bibliometric analysis and visualization of start-up incubation’, Journal of Innovation and Entrepreneurship, 13(1).** https://doi.org/10.1186/s13731-024-00361-z')
            st.markdown('**Lamba, M., Madhusudhan, M. (2022). Burst Detection. In: Text Mining for Information Professionals. Springer, Cham.** https://doi.org/10.1007/978-3-030-85085-2_6')
+            st.markdown('**Santosa, F. A. (2025). Artificial Intelligence in Library Studies: A Textual Analysis. JLIS.It, 16(1).** https://doi.org/10.36253/jlis.it-626')
+
+        with tab4:
+            st.subheader(':blue[Burst Detection]', anchor=False)
+            st.button('📊 Download high resolution image', on_click=None)
+            st.text("Click download button.")
+
+            st.divider()
+            st.subheader(':blue[Top words]', anchor=False)
+            st.button('👉 Press to download list of top words', on_click=None)
+            st.text("Click download button.")
+
+            st.divider()
+            st.subheader(':blue[Burst]', anchor=False)
+            st.button('👉 Press to download the list of detected bursts', on_click=None)
+            st.text("Click download button.")
 
-    except:
-        st.error("Please ensure that your file
-        st.stop()
+    except Exception as e:
+        st.error("Please ensure that your file or settings are correct. If you think there is a mistake, feel free to reach out to us!", icon="🚨")
+        st.stop()
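A minimal sketch of the yearly term-frequency matrix that clean_data builds before burst detection, using the same CountVectorizer pattern (whitespace tokenizer, n-gram range). The toy abstracts and years are assumptions for illustration only.

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

docs = ["library automation systems", "automation of library workflows", "machine learning for libraries"]
years = [2020, 2020, 2021]   # one publication year per document (made-up)

vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split(), ngram_range=(1, 2))
X = vectorizer.fit_transform(docs)
dtm = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=years)
yearly_term_frequency = dtm.groupby(dtm.index).sum()   # term counts aggregated per year
print(yearly_term_frequency)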
pages/6 Keywords Stem.py
CHANGED
@@ -1,238 +1,298 @@
- (all 238 lines of the previous version of pages/6 Keywords Stem.py are removed by this commit; the removed lines are truncated in this view, so only the rewritten file is shown below)
import streamlit as st
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
from nltk.corpus import stopwords
from pprint import pprint
import pickle
import streamlit.components.v1 as components
from io import StringIO
from nltk.stem.snowball import SnowballStemmer
import csv
import sys
import json
from tools import sourceformat as sf


#===config===
st.set_page_config(
    page_title="Coconut",
    page_icon="🥥",
    layout="wide",
    initial_sidebar_state="collapsed"
)

hide_streamlit_style = """
<style>
#MainMenu
{visibility: hidden;}
footer {visibility: hidden;}
[data-testid="collapsedControl"] {display: none}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

with st.popover("🔗 Menu"):
    st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
    st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
    st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
    st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
    st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
    st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
    st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
    st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")


st.header("Keywords Stem", anchor=False)
st.subheader('Put your file here...', anchor=False)

def reset_data():
    st.cache_data.clear()

#===check filetype===
@st.cache_data(ttl=3600)
def get_ext(extype):
    extype = uploaded_file.name
    return extype

#===upload===
@st.cache_data(ttl=3600)
def upload(extype):
    keywords = pd.read_csv(uploaded_file)

    if "About the data" in keywords.columns[0]:
        keywords = sf.dim(keywords)
        col_dict = {'MeSH terms': 'Keywords',
                    'PubYear': 'Year',
                    'Times cited': 'Cited by',
                    'Publication Type': 'Document Type'
                    }
        keywords.rename(columns=col_dict, inplace=True)

    return keywords

@st.cache_data(ttl=3600)
def conv_txt(extype):
    if("PMID" in (uploaded_file.read()).decode()):
        uploaded_file.seek(0)
        papers = sf.medline(uploaded_file)
        print(papers)
        return papers
    col_dict = {'TI': 'Title',
                'SO': 'Source title',
                'DE': 'Author Keywords',
                'DT': 'Document Type',
                'AB': 'Abstract',
                'TC': 'Cited by',
                'PY': 'Year',
                'ID': 'Keywords Plus',
                'rights_date_used': 'Year'}
    uploaded_file.seek(0)
    papers = pd.read_csv(uploaded_file, sep='\t')
    if("htid" in papers.columns):
        papers = sf.htrc(papers)
    papers.rename(columns=col_dict, inplace=True)
    print(papers)
    return papers

@st.cache_data(ttl=3600)
def rev_conv_txt(extype):
    col_dict_rev = {'Title': 'TI',
                    'Source title': 'SO',
                    'Author Keywords': 'DE',
                    'Keywords Plus': 'ID'}
    keywords.rename(columns=col_dict_rev, inplace=True)
    return keywords

@st.cache_data(ttl=3600)
def conv_json(extype):
    col_dict={'title': 'title',
              'rights_date_used': 'Year',
              }

    data = json.load(uploaded_file)
    hathifile = data['gathers']
    keywords = pd.DataFrame.from_records(hathifile)

    keywords = sf.htrc(keywords)
    keywords.rename(columns=col_dict,inplace=True)
    return keywords

def conv_pub(extype):
    if (get_ext(extype)).endswith('.tar.gz'):
        bytedata = extype.read()
        keywords = sf.readPub(bytedata)
    elif (get_ext(extype)).endswith('.xml'):
        bytedata = extype.read()
        keywords = sf.readxml(bytedata)
    return keywords

@st.cache_data(ttl=3600)
def get_data(extype):
    list_of_column_key = list(keywords.columns)
    list_of_column_key = [k for k in list_of_column_key if 'Keyword' in k]
    return list_of_column_key

uploaded_file = st.file_uploader('', type=['csv','txt','json','tar.gz','xml'], on_change=reset_data)

if uploaded_file is not None:
    try:
        extype = get_ext(uploaded_file)
        if extype.endswith('.csv'):
            keywords = upload(extype)

        elif extype.endswith('.txt'):
            keywords = conv_txt(extype)

        elif extype.endswith('.json'):
            keywords = conv_json(extype)
        elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
            keywords = conv_pub(uploaded_file)

        list_of_column_key = get_data(extype)

        col1, col2 = st.columns(2)
        with col1:
            method = st.selectbox(
                'Choose method',
                ('Lemmatization', 'Stemming'), on_change=reset_data)
        with col2:
            keyword = st.selectbox(
                'Choose column',
                (list_of_column_key), on_change=reset_data)

        @st.cache_data(ttl=3600)
        def clean_keyword(extype):
            global keyword, keywords
            try:
                key = keywords[keyword]
            except KeyError:
                st.error('Error: Please check your Author/Index Keywords column.')
                sys.exit(1)
            keywords = keywords.replace(np.nan, '', regex=True)
            keywords[keyword] = keywords[keyword].astype(str)
            keywords[keyword] = keywords[keyword].map(lambda x: re.sub('-', ' ', x))
            keywords[keyword] = keywords[keyword].map(lambda x: re.sub('; ', ' ; ', x))
            keywords[keyword] = keywords[keyword].map(lambda x: x.lower())

            #===Keywords list===
            key = key.dropna()
            key = pd.concat([key.str.split('; ', expand=True)], axis=1)
            key = pd.Series(np.ravel(key)).dropna().drop_duplicates().sort_values().reset_index()
            key[0] = key[0].map(lambda x: re.sub('-', ' ', x))
            key['new']=key[0].map(lambda x: x.lower())

            return keywords, key

        #===stem/lem===
        @st.cache_data(ttl=3600)
        def Lemmatization(extype):
            lemmatizer = WordNetLemmatizer()
            def lemmatize_words(text):
                words = text.split()
                words = [lemmatizer.lemmatize(word) for word in words]
                return ' '.join(words)
            keywords[keyword] = keywords[keyword].apply(lemmatize_words)
            key['new'] = key['new'].apply(lemmatize_words)
            keywords[keyword] = keywords[keyword].map(lambda x: re.sub(' ; ', '; ', x))
            return keywords, key

        @st.cache_data(ttl=3600)
        def Stemming(extype):
            stemmer = SnowballStemmer("english")
            def stem_words(text):
                words = text.split()
                words = [stemmer.stem(word) for word in words]
                return ' '.join(words)
            keywords[keyword] = keywords[keyword].apply(stem_words)
            key['new'] = key['new'].apply(stem_words)
            keywords[keyword] = keywords[keyword].map(lambda x: re.sub(' ; ', '; ', x))
            return keywords, key

        keywords, key = clean_keyword(extype)

        if method == 'Lemmatization':
            keywords, key = Lemmatization(extype)
        else:
            keywords, key = Stemming(extype)

        st.write('Congratulations! 🤩 You choose',keyword ,'with',method,'method. Now, you can easily download the result by clicking the button below')
        st.divider()

        #===show & download csv===
        tab1, tab2, tab3, tab4, tab5 = st.tabs(["📥 Result", "📥 List of Keywords", "📃 Reference", "📃 Recommended Reading", "⬇️ Download Help"])

        with tab1:
            st.dataframe(keywords, use_container_width=True, hide_index=True)
            @st.cache_data(ttl=3600)
            def convert_df(extype):
                return keywords.to_csv(index=False).encode('utf-8')

            @st.cache_data(ttl=3600)
            def convert_txt(extype):
                return keywords.to_csv(index=False, sep='\t', lineterminator='\r').encode('utf-8')

            if extype.endswith('.csv'):
                csv = convert_df(extype)
                st.download_button(
                    "Press to download result 👈",
                    csv,
                    "result.csv",
                    "text/csv")

            elif extype.endswith('.txt'):
                keywords = rev_conv_txt(extype)
                txt = convert_txt(extype)
                st.download_button(
                    "Press to download result 👈",
                    txt,
                    "result.txt",
                    "text/csv")

        with tab2:
            @st.cache_data(ttl=3600)
            def table_keyword(extype):
                keytab = key.drop(['index'], axis=1).rename(columns={0: 'label'})
                return keytab

            #===coloring the same keywords===
            @st.cache_data(ttl=3600)
            def highlight_cells(value):
                if keytab['new'].duplicated(keep=False).any() and keytab['new'].duplicated(keep=False)[keytab['new'] == value].any():
                    return 'background-color: yellow'
                return ''
            keytab = table_keyword(extype)
            st.dataframe(keytab.style.applymap(highlight_cells, subset=['new']), use_container_width=True, hide_index=True)

            @st.cache_data(ttl=3600)
            def convert_dfs(extype):
                return key.to_csv(index=False).encode('utf-8')

            csv = convert_dfs(extype)

            st.download_button(
                "Press to download keywords 👈",
                csv,
                "keywords.csv",
                "text/csv")

        with tab3:
            st.markdown('**Santosa, F. A. (2023). Prior steps into knowledge mapping: Text mining application and comparison. Issues in Science and Technology Librarianship, 102.** https://doi.org/10.29173/istl2736')

        with tab4:
            st.markdown('**Beri, A. (2021, January 27). Stemming vs Lemmatization. Medium.** https://towardsdatascience.com/stemming-vs-lemmatization-2daddabcb221')
            st.markdown('**Khyani, D., Siddhartha B S, Niveditha N M, & Divya B M. (2020). An Interpretation of Lemmatization and Stemming in Natural Language Processing. Journal of University of Shanghai for Science and Technology , 22(10), 350–357.** https://jusst.org/an-interpretation-of-lemmatization-and-stemming-in-natural-language-processing/')
            st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Text Pre-Processing. Text Mining for Information Professionals, 79–103.** https://doi.org/10.1007/978-3-030-85085-2_3')

        with tab5:
            st.text("Download keywords at bottom of table")
            st.divider()
            st.text("Download table")
            st.markdown("
    except Exception as e:
        st.write(e)
        st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
        st.stop()
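The page's two normalization choices behave quite differently on the same keyword string; a minimal standalone sketch of that difference (illustrative only, not part of the committed file — the sample string is made up):

# Illustrative sketch, not part of the commit: WordNet lemmatization keeps
# dictionary forms, while Snowball stemming truncates words to crude stems.
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

sample = "digital libraries ; text mining ; bibliometric studies"
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english")

print(' '.join(lemmatizer.lemmatize(w) for w in sample.split()))  # e.g. "... library ... study"
print(' '.join(stemmer.stem(w) for w in sample.split()))          # e.g. "... librari ... studi"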
pages/7 Sentiment Analysis.py
ADDED
@@ -0,0 +1,357 @@
#import module
import streamlit as st
import streamlit.components.v1 as components
import pandas as pd
import re
import nltk
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('punkt_tab')
nltk.download('vader_lexicon')
from textblob import TextBlob
import os
import numpy as np
import plotly.express as px
import json
from tools import sourceformat as sf

#===config===
st.set_page_config(
    page_title="Coconut",
    page_icon="🥥",
    layout="wide",
    initial_sidebar_state="collapsed"
)

hide_streamlit_style = """
<style>
#MainMenu
{visibility: hidden;}
footer {visibility: hidden;}
[data-testid="collapsedControl"] {display: none}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

with st.popover("🔗 Menu"):
    st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
    st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
    st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
    st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
    st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
    st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
    st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
    st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")

st.header("Sentiment Analysis", anchor=False)
st.subheader('Put your file here...', anchor=False)

#========unique id========
@st.cache_resource(ttl=3600)
def create_list():
    l = [1, 2, 3]
    return l

l = create_list()
first_list_value = l[0]
l[0] = first_list_value + 1
uID = str(l[0])

@st.cache_data(ttl=3600)
def get_ext(uploaded_file):
    extype = uID+uploaded_file.name
    return extype

#===clear cache===

def reset_all():
    st.cache_data.clear()

#===avoiding deadlock===
os.environ["TOKENIZERS_PARALLELISM"] = "false"

#===upload file===
@st.cache_data(ttl=3600)
def upload(file):
    papers = pd.read_csv(uploaded_file)
    if "About the data" in papers.columns[0]:
        papers = sf.dim(papers)
        col_dict = {'MeSH terms': 'Keywords',
                    'PubYear': 'Year',
                    'Times cited': 'Cited by',
                    'Publication Type': 'Document Type'
                    }
        papers.rename(columns=col_dict, inplace=True)
    return papers

@st.cache_data(ttl=3600)
def conv_txt(extype):
    if("PMID" in (uploaded_file.read()).decode()):
        uploaded_file.seek(0)
        papers = sf.medline(uploaded_file)
        print(papers)
        return papers
    col_dict = {'TI': 'Title',
                'SO': 'Source title',
                'DE': 'Author Keywords',
                'DT': 'Document Type',
                'AB': 'Abstract',
                'TC': 'Cited by',
                'PY': 'Year',
                'ID': 'Keywords Plus',
                'rights_date_used': 'Year'}
    uploaded_file.seek(0)
    papers = pd.read_csv(uploaded_file, sep='\t')
    if("htid" in papers.columns):
        papers = sf.htrc(papers)
    papers.rename(columns=col_dict, inplace=True)
    print(papers)
    return papers


@st.cache_data(ttl=3600)
def conv_json(extype):
    col_dict={'title': 'title',
              'rights_date_used': 'Year',
              }

    data = json.load(uploaded_file)
    hathifile = data['gathers']
    keywords = pd.DataFrame.from_records(hathifile)

    keywords = sf.htrc(keywords)
    keywords.rename(columns=col_dict,inplace=True)
    return keywords

@st.cache_resource(ttl=3600)
def conv_pub(extype):
    if (get_ext(extype)).endswith('.tar.gz'):
        bytedata = extype.read()
        keywords = sf.readPub(bytedata)
    elif (get_ext(extype)).endswith('.xml'):
        bytedata = extype.read()
        keywords = sf.readxml(bytedata)
    return keywords

#===Read data===
uploaded_file = st.file_uploader('', type=['csv', 'txt','json','tar.gz', 'xml'], on_change=reset_all)

if uploaded_file is not None:
    try:
        extype = get_ext(uploaded_file)

        if extype.endswith('.csv'):
            papers = upload(extype)
        elif extype.endswith('.txt'):
            papers = conv_txt(extype)

        elif extype.endswith('.json'):
            papers = conv_json(extype)
        elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
            papers = conv_pub(uploaded_file)

        coldf = sorted(papers.select_dtypes(include=['object']).columns.tolist())

        c1, c2 = st.columns(2)
        ColCho = c1.selectbox(
            'Choose column',
            (coldf), on_change=reset_all)
        method = c2.selectbox(
            'Choose method',[
            'TextBlob','NLTKvader']
        )
        words_to_remove = c1.text_input("Remove specific words. Separate words by semicolons (;)")
        wordcount = c2.number_input(label = "Words displayed", min_value = 1, step = 1, value=5)-1
        rem_copyright = c1.toggle('Remove copyright statement', value=True, on_change=reset_all)
        rem_punc = c2.toggle('Remove punctuation', value=True, on_change=reset_all)

        #===clean csv===
        @st.cache_data(ttl=3600, show_spinner=False)
        def clean_csv(extype):
            paper = papers.dropna(subset=[ColCho])

            #===mapping===
            paper['Abstract_pre'] = paper[ColCho].map(lambda x: x.lower())
            if rem_punc:
                paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: re.sub('[,:;\.!-?•=]', ' ', x))
                paper['Abstract_pre'] = paper['Abstract_pre'].str.replace('\u201c|\u201d', '', regex=True)
            if rem_copyright:
                paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: re.sub('©.*', '', x))

            #===stopword removal===
            stop = stopwords.words('english')
            paper[ColCho] = paper['Abstract_pre'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

            words_rmv = [word.strip() for word in words_to_remove.split(";")]
            remove_dict = {word: None for word in words_rmv}

            @st.cache_resource(ttl=3600)
            def remove_words(text):
                words = text.split()
                cleaned_words = [word for word in words if word not in remove_dict]
                return ' '.join(cleaned_words)

            paper['Sentences__'] = paper['Abstract_pre'].map(remove_words)

            return paper
        paper=clean_csv(extype)

        if method == 'NLTKvader':
            analyzer = SentimentIntensityAnalyzer()

            @st.cache_resource(ttl=3600)
            def get_sentiment(text):
                score = analyzer.polarity_scores(text)
                return score

            tab1, tab2, tab3, tab4 = st.tabs(["📈 Result", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
            with tab1:

                paper['Scores'] = paper['Sentences__'].apply(get_sentiment)

                scoreframe = pd.DataFrame()

                scoreframe['Phrase'] = pd.Series(paper['Sentences__'])

                scoreframe[['Negativity','Neutrality','Positivity','Compound']] = pd.DataFrame.from_records(paper['Scores'])

                scoreframe = scoreframe.groupby(scoreframe.columns.tolist(),as_index=False).size()

                scoreframe = scoreframe.truncate(after = wordcount)

                with st.expander("Sentence and Results"):
                    finalframe = pd.DataFrame()
                    finalframe['Sentence'] = scoreframe['Phrase']
                    finalframe[['Negativity','Neutrality','Positivity','Compound']] = scoreframe[['Negativity','Neutrality','Positivity','Compound']]
                    finalframe[['Count']] = scoreframe[['size']]

                    st.dataframe(finalframe, use_container_width=True)

            with tab2:
                st.markdown('**Hutto, C. and Gilbert, E. (2014) ‘VADER: A Parsimonious Rule-Based Model for Sentiment Analysis of Social Media Text’, Proceedings of the International AAAI Conference on Web and Social Media, 8(1), pp. 216–225.** https://doi.org/10.1609/icwsm.v8i1.14550')

            with tab3:
                st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Sentiment Analysis. Text Mining for Information Professionals, 191–211.** https://doi.org/10.1007/978-3-030-85085-2_7')

            with tab4:
                st.subheader(':blue[CSV Results]', anchor=False)
                st.text("Click Download button")
                st.markdown("

        elif(method == 'TextBlob'):

            @st.cache_resource(ttl=3600)
            def get_sentimentb(text):
                line = TextBlob(text)
                return line.sentiment

            @st.cache_resource(ttl=3600)
            def get_assessments(frame):
                text = TextBlob(str(frame))

                polar, subject, assessment = text.sentiment_assessments

                try:
                    phrase, phrasepolar, phrasesubject, unknown = assessment[0]
                except: #this only happens if assessment is empty
                    phrase, phrasepolar, phrasesubject = "empty", 0, 0

                return phrase, phrasepolar, phrasesubject

            @st.cache_resource(ttl=3600)
            def mergelist(data):
                return ' '.join(data)

            @st.cache_resource(ttl=3600)
            def assignscore(data):
                if data>0:
                    return "Positive"
                elif data<0:
                    return "Negative"
                else:
                    return "Neutral"

            phrases = paper['Sentences__'].apply(get_assessments)

            phraselist = phrases.to_list()

            phraseframe = pd.DataFrame(phraselist, columns =["Phrase","Polarity","Subjectivity"])

            phraseframe["Phrase"] = phraseframe["Phrase"].apply(mergelist)

            phraseframe = phraseframe.groupby(phraseframe.columns.tolist(),as_index=False).size()

            phraseframe["Score"] = phraseframe["Polarity"].apply(assignscore)

            neut = phraseframe.loc[phraseframe['Score']=="Neutral"]
            neut.reset_index(inplace = True)

            pos = phraseframe.loc[phraseframe['Score']=="Positive"]
            pos.reset_index(inplace = True)

            neg = phraseframe.loc[phraseframe['Score']=="Negative"]
            neg.reset_index(inplace = True)

            paper['Sentiment'] = paper['Sentences__'].apply(get_sentimentb)

            pos.sort_values(by=["size"], inplace = True, ascending = False, ignore_index = True)
            pos = pos.truncate(after = wordcount)

            neg.sort_values(by=["size"], inplace = True, ascending = False, ignore_index = True)
            neg = neg.truncate(after = wordcount)

            neut.sort_values(by=["size"], inplace = True, ascending = False, ignore_index = True)
            neut = neut.truncate(after = wordcount)

            tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
            with tab1:
                #display tables and graphs

                with st.expander("Positive Sentiment"):
                    st.dataframe(pos, use_container_width=True)
                    figpos = px.bar(pos, x="Phrase", y="size", labels={"size": "Count", "Phrase": "Word"})
                    st.plotly_chart(figpos, use_container_width=True)

                with st.expander("Negative Sentiment"):
                    st.dataframe(neg, use_container_width=True)
                    figneg = px.bar(neg, x="Phrase", y="size", labels={"size": "Count", "Phrase": "Word"}, color_discrete_sequence=["#e57d7d"])
                    st.plotly_chart(figneg, use_container_width=True)

                with st.expander("Neutral Sentiment"):
                    st.dataframe(neut, use_container_width=True)
                    figneut = px.bar(neut, x="Phrase", y="size", labels={"size": "Count", "Phrase": "Word"}, color_discrete_sequence=["#737a72"])
                    st.plotly_chart(figneut, use_container_width=True)


                with st.expander("Sentence and Results"):
                    finalframe = pd.DataFrame()
                    finalframe['Sentence'] = paper['Sentences__']
                    finalframe[['Polarity','Subjectivity']] = pd.DataFrame(paper['Sentiment'].tolist(), index = paper.index)

                    st.dataframe(finalframe, use_container_width=True)

            with tab2:
                st.markdown('**Steven, L. et al. (2018) TextBlob: Simplified Text Processing — TextBlob 0.15.2 documentation, Readthedocs.io.** https://textblob.readthedocs.io/en/dev/')

            with tab3:
                st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Sentiment Analysis. Text Mining for Information Professionals, 191–211.** https://doi.org/10.1007/978-3-030-85085-2_7')

            with tab4:
                st.subheader(':blue[Sentiment Analysis]', anchor=False)
                st.write("Click the three dots at the top right then select the desired format")
                st.markdown("
                st.divider()
                st.subheader(':blue[CSV Results]', anchor=False)
                st.text("Click Download button")
                st.markdown("


    except Exception as e:
        st.write(e)
        st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
        st.stop()
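For readers unfamiliar with the two scorers this page unpacks into its Negativity/Neutrality/Positivity/Compound and Polarity/Subjectivity columns, this is the shape of what they return (a minimal standalone sketch, not part of the committed file; the sample sentence is made up):

# Minimal sketch, not part of the commit.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

text = "the results were surprisingly good"

# VADER: dict with 'neg', 'neu', 'pos' proportions and a normalized 'compound' score
print(SentimentIntensityAnalyzer().polarity_scores(text))

# TextBlob: Sentiment(polarity, subjectivity), polarity in [-1, 1], subjectivity in [0, 1]
print(TextBlob(text).sentiment)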
pages/8 Shifterator.py
ADDED
@@ -0,0 +1,524 @@
import streamlit as st
import streamlit.components.v1 as components
import shifterator as sh
from shifterator import ProportionShift
import pandas as pd
import re
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
from nltk.corpus import stopwords
import time
import sys
import json
from tools import sourceformat as sf
from collections import Counter
import io

#===config===
st.set_page_config(
    page_title="Coconut",
    page_icon="🥥",
    layout="wide",
    initial_sidebar_state="collapsed"
)

hide_streamlit_style = """
<style>
#MainMenu
{visibility: hidden;}
footer {visibility: hidden;}
[data-testid="collapsedControl"] {display: none}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

with st.popover("🔗 Menu"):
    st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
    st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
    st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
    st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
    st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
    st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
    st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
    st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
    st.page_link("pages/8 Shifterator.py", label="Shifterator", icon="8️⃣")

st.header("Shifterator", anchor=False)
st.subheader('Put your file here...', anchor=False)

def reset_all():
    st.cache_data.clear()

@st.cache_data(ttl=3600)
def get_ext(extype):
    extype = uploaded_file.name
    return extype

#===upload file===
@st.cache_data(ttl=3600)
def upload(extype):
    papers = pd.read_csv(uploaded_file)
    #lens.org
    if 'Publication Year' in papers.columns:
        papers.rename(columns={'Publication Year': 'Year', 'Citing Works Count': 'Cited by',
                               'Publication Type': 'Document Type', 'Source Title': 'Source title'}, inplace=True)

    if "dimensions" in uploaded_file.name.lower():
        papers = sf.dim(papers)
        col_dict = {'MeSH terms': 'Keywords',
                    'PubYear': 'Year',
                    'Times cited': 'Cited by',
                    'Publication Type': 'Document Type'
                    }
        papers.rename(columns=col_dict, inplace=True)

    return papers

@st.cache_data(ttl=3600)
def conv_txt(extype):
    if("pmc" in uploaded_file.name.lower() or "pubmed" in uploaded_file.name.lower()):
        file = uploaded_file
        papers = sf.medline(file)

    elif("hathi" in uploaded_file.name.lower()):
        papers = pd.read_csv(uploaded_file,sep = '\t')
        papers = sf.htrc(papers)
        col_dict={'title': 'title',
                  'rights_date_used': 'Year',
                  }
        papers.rename(columns=col_dict, inplace=True)

    else:
        col_dict = {'TI': 'Title',
                    'SO': 'Source title',
                    'DE': 'Author Keywords',
                    'DT': 'Document Type',
                    'AB': 'Abstract',
                    'TC': 'Cited by',
                    'PY': 'Year',
                    'ID': 'Keywords Plus'}
        papers = pd.read_csv(uploaded_file, sep='\t', lineterminator='\r')
        papers.rename(columns=col_dict, inplace=True)
    print(papers)
    return papers

@st.cache_data(ttl=3600)
def conv_json(extype):
    col_dict={'title': 'title',
              'rights_date_used': 'Year',
              }

    data = json.load(uploaded_file)
    hathifile = data['gathers']
    keywords = pd.DataFrame.from_records(hathifile)

    keywords = sf.htrc(keywords)
    keywords.rename(columns=col_dict,inplace=True)
    return keywords

@st.cache_data(ttl=3600)
def conv_pub(extype):
    if (get_ext(extype)).endswith('.tar.gz'):
        bytedata = extype.read()
        keywords = sf.readPub(bytedata)
    elif (get_ext(extype)).endswith('.xml'):
        bytedata = extype.read()
        keywords = sf.readxml(bytedata)
    return keywords

@st.cache_data(ttl=3600)
def get_data(extype):
    df_col = sorted(papers.select_dtypes(include=['object']).columns.tolist())
    list_title = [col for col in df_col if col.lower() == "title"]
    abstract_pattern = re.compile(r'abstract', re.IGNORECASE)
    list_abstract = [col for col in df_col if abstract_pattern.search(col)]

    if all(col in df_col for col in list_title) and all(col in df_col for col in list_abstract):
        selected_cols = list_abstract + list_title
    elif all(col in df_col for col in list_title):
        selected_cols = list_title
    elif all(col in df_col for col in list_abstract):
        selected_cols = list_abstract
    else:
        selected_cols = df_col

    if not selected_cols:
        selected_cols = df_col

    return df_col, selected_cols

@st.cache_data(ttl=3600)
def check_comparison(extype):
    comparison = ['Word-to-word', 'Manual label']

    if any('year' in col.lower() for col in papers.columns):
        comparison.append('Years')
    if any('source title' in col.lower() for col in papers.columns):
        comparison.append('Sources')

    comparison.sort(reverse=False)
    return comparison

#===clean csv===
@st.cache_data(ttl=3600, show_spinner=False)
def clean_csv(extype):
    paper = papers.dropna(subset=[ColCho])

    #===mapping===
    paper[ColCho] = paper[ColCho].map(lambda x: x.lower())
    if rem_punc:
        paper[ColCho] = paper[ColCho].map(lambda x: re.sub('[,:;\.!-?•=]', ' ', x))
        paper[ColCho] = paper[ColCho].str.replace('\u201c|\u201d', '', regex=True)
    if rem_copyright:
        paper[ColCho] = paper[ColCho].map(lambda x: re.sub('©.*', '', x))

    #===stopword removal===
    stop = stopwords.words('english')
    paper[ColCho] = paper[ColCho].apply(lambda x: ' '.join([word for word in x.split() if word not in stop]))

    #===lemmatize===
    lemmatizer = WordNetLemmatizer()

    @st.cache_data(ttl=3600)
    def lemmatize_words(text):
        words = text.split()
        words = [lemmatizer.lemmatize(word) for word in words]
        return ' '.join(words)

    paper[ColCho] = paper[ColCho].apply(lemmatize_words)

    words_rmv = [word.strip() for word in words_to_remove.split(";")]
    remove_set = set(words_rmv)

    @st.cache_data(ttl=3600)
    def remove_words(text):
        words = text.split()
        cleaned_words = [word for word in words if word not in remove_set]
        return ' '.join(cleaned_words)

    paper[ColCho] = paper[ColCho].apply(remove_words)

    return paper

@st.cache_data(ttl=3600)
def get_minmax(extype):
    MIN = int(papers['Year'].min())
    MAX = int(papers['Year'].max())
    GAP = MAX - MIN
    MID = round((MIN + MAX) / 2)
    return MIN, MAX, GAP, MID

@st.cache_data(ttl=3600)
def running_shifterator(dict1, dict2):
    try:
        if method_shifts == 'Proportion Shifts':
            proportion_shift = sh.ProportionShift(type2freq_1=dict1, type2freq_2=dict2)
            ax = proportion_shift.get_shift_graph(system_names = ['Topic 1', 'Topic 2'], title='Proportion Shifts')

        elif method_shifts == 'Shannon Entropy Shifts':
            entropy_shift = sh.EntropyShift(type2freq_1=dict1,
                                            type2freq_2=dict2,
                                            base=2)
            ax = entropy_shift.get_shift_graph(system_names = ['Topic 1', 'Topic 2'], title='Shannon Entropy Shifts')

        elif method_shifts == 'Tsallis Entropy Shifts':
            entropy_shift = sh.EntropyShift(type2freq_1=dict1,
                                            type2freq_2=dict2,
                                            base=2,
                                            alpha=0.8)
            ax = entropy_shift.get_shift_graph(system_names = ['Topic 1', 'Topic 2'], title='Tsallis Entropy Shifts')

        elif method_shifts == 'Kullback-Leibler Divergence Shifts':
            kld_shift = sh.KLDivergenceShift(type2freq_1=dict1,
                                             type2freq_2=dict2,
                                             base=2)
            ax = kld_shift.get_shift_graph(system_names = ['Topic 1', 'Topic 2'], title='Kullback-Leibler Divergence Shifts')

        elif method_shifts == 'Jensen-Shannon Divergence Shifts':
            jsd_shift = sh.JSDivergenceShift(type2freq_1=dict1,
                                             type2freq_2=dict2,
                                             weight_1=0.5,
                                             weight_2=0.5,
                                             base=2,
                                             alpha=1)
            ax = jsd_shift.get_shift_graph(system_names = ['Topic 1', 'Topic 2'], title='Jensen-Shannon Divergence Shifts')

        fig = ax.get_figure()

        buf = io.BytesIO()
        fig.savefig(buf, format="png", bbox_inches='tight')
        buf.seek(0)

        return fig, buf

    except ValueError:
        st.warning('Please check your data.', icon="⚠️")
        sys.exit()

@st.cache_data(ttl=3600)
def df2dict(df_1, df_2):
    text1 = ' '.join(df_1.dropna().astype(str))
    text2 = ' '.join(df_2.dropna().astype(str))

    text1_clean = re.sub(r'\d+', '', text1)
    text2_clean = re.sub(r'\d+', '', text2)

    tokens1 = re.findall(r'\b\w+\b', text1_clean.lower())
    tokens2 = re.findall(r'\b\w+\b', text2_clean.lower())

    type2freq_1 = {k: int(v) for k, v in Counter(tokens1).items()}
    type2freq_2 = {k: int(v) for k, v in Counter(tokens2).items()}

    return type2freq_1, type2freq_2

@st.cache_data(ttl=3600)
def dict_w2w(search_terms1, search_terms2):
    selected_col = [ColCho]
    dfs1 = pd.DataFrame()
    for term in search_terms1:
        dfs1 = pd.concat([dfs1, paper[paper[selected_col[0]].str.contains(r'\b' + term + r'\b', case=False, na=False)]], ignore_index=True)
    dfs1['Topic'] = 'First Term'
    dfs1 = dfs1.drop_duplicates()

    dfs2 = pd.DataFrame()
    for term in search_terms2:
        dfs2 = pd.concat([dfs2, paper[paper[selected_col[0]].str.contains(r'\b' + term + r'\b', case=False, na=False)]], ignore_index=True)
    dfs2['Topic'] = 'Second Term'
    dfs2 = dfs2.drop_duplicates()

    type2freq_1, type2freq_2 = df2dict(dfs1[selected_col[0]], dfs2[selected_col[0]])

    return type2freq_1, type2freq_2

@st.cache_data(ttl=3600)
def dict_sources(stitle1, stitle2):
    selected_col = [ColCho]
    dfs1 = paper[paper['Source title'].str.contains(stitle1, case=False, na=False)]
    dfs1['Topic'] = stitle1
    dfs2 = paper[paper['Source title'].str.contains(stitle2, case=False, na=False)]
    dfs2['Topic'] = stitle2

    type2freq_1, type2freq_2 = df2dict(dfs1[selected_col[0]], dfs2[selected_col[0]])

    return type2freq_1, type2freq_2

@st.cache_data(ttl=3600)
def dict_years(first_range, second_range):
    selected_col = [ColCho]
    first_filter_df = paper[(paper['Year'] >= first_range[0]) & (paper['Year'] <= first_range[1])].copy()
    first_filter_df['Topic Range'] = 'First range'

    second_filter_df = paper[(paper['Year'] >= second_range[0]) & (paper['Year'] <= second_range[1])].copy()
    second_filter_df['Topic Range'] = 'Second range'

    type2freq_1, type2freq_2 = df2dict(first_filter_df[selected_col[0]], second_filter_df[selected_col[0]])

    return type2freq_1, type2freq_2


#===Read data===
uploaded_file = st.file_uploader('', type=['csv', 'txt', 'json', 'tar.gz','xml'], on_change=reset_all)

if uploaded_file is not None:
    try:
        extype = get_ext(uploaded_file)

        if extype.endswith('.csv'):
            papers = upload(extype)
        elif extype.endswith('.txt'):
            papers = conv_txt(extype)
        elif extype.endswith('.json'):
            papers = conv_json(extype)
        elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
            papers = conv_pub(uploaded_file)

        df_col, selected_cols = get_data(extype)
        comparison = check_comparison(extype)

        #Menu
        c1, c2, c3 = st.columns([4,0.1,4])
        ColCho = c1.selectbox(
            'Choose column to analyze',
            (selected_cols), on_change=reset_all)

        c2.write('')

        compare = c3.selectbox(
            'Type of comparison',
            (comparison), on_change=reset_all)

        with st.expander("🧮 Show advance settings"):
            y1, y2, y3 = st.columns([4,0.1,4])
            t1, t2 = st.columns([3,3])
            words_to_remove = y1.text_input('Input your text', on_change=reset_all, placeholder='Remove specific words. Separate words by semicolons (;)')
            method_shifts = y3.selectbox("Choose preferred method",('Proportion Shifts','Shannon Entropy Shifts', 'Tsallis Entropy Shifts','Kullback-Leibler Divergence Shifts',
                                         'Jensen-Shannon Divergence Shifts'), on_change=reset_all)
            rem_copyright = t1.toggle('Remove copyright statement', value=True, on_change=reset_all)
            rem_punc = t2.toggle('Remove punctuation', value=False, on_change=reset_all)

        if method_shifts == 'Kullback-Leibler Divergence Shifts':
            st.info('The Kullback-Leibler Divergence is only well-defined if every single word in the comparison text is also in the reference text.', icon="ℹ️")

        paper = clean_csv(extype)

        tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])

        with tab1:
            #===visualization===
            if compare == 'Word-to-word':
                col1, col2, col3 = st.columns([4,0.1,4])
                text1 = col1.text_input('First Term', on_change=reset_all, placeholder='put comma if you have more than one')
                search_terms1 = [term.strip() for term in text1.split(",") if term.strip()]
                col2.write('')
                text2 = col3.text_input('Second Term', on_change=reset_all, placeholder='put comma if you have more than one')
                search_terms2 = [term.strip() for term in text2.split(",") if term.strip()]

                type2freq_1, type2freq_2 = dict_w2w(search_terms1, search_terms2)

                if not type2freq_1 and not type2freq_2:
                    st.warning('We cannot find anything in your document.', icon="⚠️")
                elif not type2freq_1:
                    st.warning(f'We cannot find {text1} in your document.', icon="⚠️")
                elif not type2freq_2:
                    st.warning(f'We cannot find {text2} in your document.', icon="⚠️")
                else:
                    with st.spinner('Processing. Please wait until the visualization comes up'):
                        fig, buf = running_shifterator(type2freq_1, type2freq_2)
                        st.pyplot(fig)

            elif compare == 'Manual label':
                col1, col2, col3 = st.columns(3)

                df_col_sel = sorted([col for col in paper.columns.tolist()])

                column_selected = col1.selectbox(
                    'Choose column',
                    (df_col_sel), on_change=reset_all)

                list_words = paper[column_selected].values.tolist()
                list_unique = sorted(list(set(list_words)))

                if column_selected is not None:
                    label1 = col2.selectbox(
                        'Choose first label',
                        (list_unique), on_change=reset_all)

                    default_index = 0 if len(list_unique) == 1 else 1
                    label2 = col3.selectbox(
                        'Choose second label',
                        (list_unique), on_change=reset_all, index=default_index)

                    filtered_df = paper[paper[column_selected].isin([label1, label2])].reset_index(drop=True)

                    dfs1 = filtered_df[filtered_df[column_selected] == label1].reset_index(drop=True)
                    dfs2 = filtered_df[filtered_df[column_selected] == label2].reset_index(drop=True)

                    type2freq_1, type2freq_2 = df2dict(dfs1[ColCho], dfs2[ColCho])

                    with st.spinner('Processing. Please wait until the visualization comes up'):
                        fig, buf = running_shifterator(type2freq_1, type2freq_2)
                        st.pyplot(fig)

            elif compare == 'Sources':
                col1, col2, col3 = st.columns([4,0.1,4])

                unique_stitle = set()
                unique_stitle.update(paper['Source title'].dropna())
                list_stitle = sorted(list(unique_stitle))

                stitle1 = col1.selectbox(
                    'Choose first label',
                    (list_stitle), on_change=reset_all)
                col2.write('')
                default_index = 0 if len(list_stitle) == 1 else 1
                stitle2 = col3.selectbox(
                    'Choose second label',
                    (list_stitle), on_change=reset_all, index=default_index)

                type2freq_1, type2freq_2 = dict_sources(stitle1, stitle2)

                with st.spinner('Processing. Please wait until the visualization comes up'):
                    fig, buf = running_shifterator(type2freq_1, type2freq_2)
                    st.pyplot(fig)

            elif compare == 'Years':
                col1, col2, col3 = st.columns([4,0.1,4])

                MIN, MAX, GAP, MID = get_minmax(extype)
                if (GAP != 0):
                    first_range = col1.slider('First Range', min_value=MIN, max_value=MAX, value=(MIN, MID), on_change=reset_all)
                    col2.write('')
                    second_range = col3.slider('Second Range', min_value=MIN, max_value=MAX, value=(MID, MAX), on_change=reset_all)

                    type2freq_1, type2freq_2 = dict_years(first_range, second_range)

                    with st.spinner('Processing. Please wait until the visualization comes up'):
                        fig, buf = running_shifterator(type2freq_1, type2freq_2)
                        st.pyplot(fig)

                else:
                    st.write('You only have data in ', (MAX))

            d1, d2 = st.columns(2)

            d1.download_button(
                label="📥 Download Graph",
                data=buf,
                file_name="shifterator.png",
                mime="image/png"
            )

            @st.cache_data(ttl=3600)
            def shifts_dfs(type2freq_1, type2freq_2):
                proportion_shift = ProportionShift(type2freq_1=type2freq_1, type2freq_2=type2freq_2)

                words = list(proportion_shift.types)
                shift_scores = proportion_shift.get_shift_scores()
                freq1 = proportion_shift.type2freq_1
                freq2 = proportion_shift.type2freq_2

                data = []
                for word, score in shift_scores.items():
                    data.append({
                        'word': word,
                        'freq_text1': proportion_shift.type2freq_1.get(word, 0),
                        'freq_text2': proportion_shift.type2freq_2.get(word, 0),
                        'shift_score': score
                    })

                df_shift = pd.DataFrame(data)
                df_shift = df_shift.sort_values('shift_score')

                return df_shift.to_csv(index=False).encode('utf-8')

            csv = shifts_dfs(type2freq_1, type2freq_2)

            d2.download_button(
                "📥 Click to download result",
                csv,
                "shiftertor_dataframe.csv",
                "text/csv")

        with tab2:
            st.markdown('**Gallagher, R.J., Frank, M.R., Mitchell, L. et al. (2021). Generalized Word Shift Graphs: A Method for Visualizing and Explaining Pairwise Comparisons Between Texts. EPJ Data Science, 10(4).** https://doi.org/10.1140/epjds/s13688-021-00260-3')

        with tab3:
            st.markdown('**Sánchez-Franco, M. J., & Rey-Tienda, S. (2023). The role of user-generated content in tourism decision-making: an exemplary study of Andalusia, Spain. Management Decision, 62(7).** https://doi.org/10.1108/md-06-2023-0966')
            st.markdown('**Ipek Baris Schlicht, Fernandez, E., Chulvi, B., & Rosso, P. (2023). Automatic detection of health misinformation: a systematic review. Journal of Ambient Intelligence and Humanized Computing, 15.** https://doi.org/10.1007/s12652-023-04619-4')
            st.markdown('**Torricelli, M., Falkenberg, M., Galeazzi, A., Zollo, F., Quattrociocchi, W., & Baronchelli, A. (2023). Hurricanes Increase Climate Change Conversations on Twitter. PLOS Climate, 2(11)** https://doi.org/10.1371/journal.pclm.0000277')

        with tab4:
            st.subheader(':blue[Result]', anchor=False)
            st.button('📥 Download Graph')
            st.text("Click Download Graph button.")

            st.divider()
            st.subheader(':blue[Shifterator Dataframe]', anchor=False)
            st.button('📥 Click to download result')
            st.text("Click the Download button to get the CSV result.")

    except Exception as e:
        st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
        st.stop()
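The helper functions above all reduce to the same pattern: two word-frequency dictionaries compared by a shifterator shift object. A minimal standalone sketch of that pattern, mirroring df2dict plus the Proportion Shifts branch of running_shifterator (illustrative only, not part of the committed file; the two corpora are made up):

# Minimal sketch, not part of the commit.
from collections import Counter
import shifterator as sh

corpus_a = "open access journals support open science".split()
corpus_b = "subscription journals restrict access".split()

type2freq_1 = dict(Counter(corpus_a))
type2freq_2 = dict(Counter(corpus_b))

shift = sh.ProportionShift(type2freq_1=type2freq_1, type2freq_2=type2freq_2)
shift.get_shift_graph(system_names=['Corpus A', 'Corpus B'])  # draws the word shift graph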
pages/9 Summarization.py
ADDED
@@ -0,0 +1,304 @@
import streamlit as st
import streamlit.components.v1 as components
import nltk
import spacy
import pytextrank
import pandas as pd
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from transformers import pipeline, PegasusForConditionalGeneration, PegasusTokenizer, T5ForConditionalGeneration, T5Tokenizer
nltk.download('punkt')

#===config===
st.set_page_config(
    page_title="Coconut",
    page_icon="🥥",
    layout="wide",
    initial_sidebar_state="collapsed"
)

hide_streamlit_style = """
<style>
#MainMenu
{visibility: hidden;}
footer {visibility: hidden;}
[data-testid="collapsedControl"] {display: none}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

with st.popover("🔗 Menu"):
    st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
    st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
    st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
    st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
    st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
    st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
    st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
    st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
    st.page_link("pages/8 Shifterator.py", label="Shifterator", icon="8️⃣")

st.header("Summarization test", anchor=False)
st.subheader('Put your file here...', anchor=False)

#========unique id========
@st.cache_resource(ttl=3600)
def create_list():
    l = [1, 2, 3]
    return l

l = create_list()
first_list_value = l[0]
l[0] = first_list_value + 1
uID = str(l[0])

@st.cache_data(ttl=3600)
def get_ext(uploaded_file):
    extype = uID+uploaded_file.name
    return extype

#===clear cache===
def reset_all():
    st.cache_data.clear()

#===text reading===
def read_txt(intext):
    return (intext.read()).decode()

#===csv reading===
def read_csv(uploaded_file):
    fulltexts = pd.read_csv(uploaded_file)
    fulltexts.rename(columns={fulltexts.columns[0]: "texts"}, inplace = True)
    return fulltexts


#===Read data===
uploaded_file = st.file_uploader('', type=['txt','csv'], on_change=reset_all)


if uploaded_file is not None:
    try:
        extype = get_ext(uploaded_file)

        if extype.endswith(".txt"):
            fulltext = read_txt(uploaded_file)
        elif extype.endswith(".csv"):
            texts = read_csv(uploaded_file)

        #Menu

        method = st.selectbox("Method",("Extractive","Abstractive"))
        if method == "Abstractive":
            ab_method = st.selectbox("Abstractive method", ("Pegasus x-sum","FalconsAI t5"))
            min_length = st.number_input("Minimum length", min_value = 0)
            max_length = st.number_input("Maximum length", min_value = 1)

        if method == "Extractive":
            ex_method = st.selectbox("Extractive method", ("t5","Spacy PyTextRank"))
            if ex_method == "Spacy PyTextRank":
                phrase_limit = st.number_input("Phrase length limit", min_value = 0)
                sentence_limit = st.number_input("Sentence limit", min_value = 0)
            elif ex_method == "t5" or ex_method == "FalconsAI t5":
                min_length = st.number_input("Minimum length", min_value = 0)
                max_length = st.number_input("Maximum length", min_value = 1)



        if st.button("Submit", on_click=reset_all):

            tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "⬇️ Download Help"])

            with tab1:

                def SpacyRank(text):
                    nlp = spacy.load("en_core_web_lg")
                    nlp.add_pipe("textrank")
                    doc = nlp(text)
                    summary = ""
                    for sent in doc._.textrank.summary(limit_phrases = phrase_limit, limit_sentences = sentence_limit):
                        summary+=str(sent) + '\n'
                    return summary

                def t5summ(text):
                    model = T5ForConditionalGeneration.from_pretrained('t5-small')
                    tokenizer = T5Tokenizer.from_pretrained('t5-small')

                    input_text = "summarize: " + text
                    input_ids = tokenizer.encode(input_text,return_tensors='pt')

                    summed = model.generate(input_ids, max_length = max_length, min_length = min_length)

                    summary = tokenizer.decode(summed[0],skip_special_tokens=True)
                    return summary

                def xsum(text):
                    model_name = "google/pegasus-xsum"

                    pegasus_tokenizer = PegasusTokenizer.from_pretrained(model_name)

                    summarizer = pipeline("summarization",
                                          model=model_name,
                                          tokenizer=pegasus_tokenizer,
                                          framework="pt")

                    summed = summarizer(text, min_length = min_length, max_length = max_length)
                    summary = summed[0]["summary_text"]

                    return summary

                def falcsum(text):
                    summarizer = pipeline("summarization",model = "Falconsai/text_summarization")
                    summed = summarizer(text, max_length = max_length, min_length = min_length, do_sample = False)
                    summary = summed[0]["summary_text"]
                    return summary

                def bulkScore(combined):

                    scorelist = []

                    for column in range(len(combined)):
                        ref = combined[column][0]
                        cand = combined[column][1]

                        BLEuscore = nltk.translate.bleu_score.sentence_bleu([ref], cand)
                        scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
                        rougescores = scorer.score(ref, cand)

                        Bscore = f"{BLEuscore:.2f}"
                        Rscore = f"{rougescores['rouge1'].fmeasure:.2f}"

                        scoreTuplet = Bscore, Rscore

                        scorelist.append(scoreTuplet)

                    return scorelist


                with st.spinner('Performing computations. Please wait ...'):

                    c1, c2 = st.columns([0.5,0.5], border=True)

                    if(extype.endswith(".txt")):

                        with c1:
                            if(extype.endswith(".txt")):
                                st.header("Original text")
                                with st.container(border=True):
                                    st.write(fulltext)

                            if method == "Extractive":
                                if(ex_method == "Spacy PyTextRank"):
                                    summary = SpacyRank(fulltext)
                                elif(ex_method == "t5"):
                                    summary = t5summ(fulltext)

                            elif method == "Abstractive":
                                if ab_method == "Pegasus x-sum":
                                    summary = xsum(fulltext)

                                elif ab_method == "FalconsAI t5":
                                    summary = t5summ(fulltext)
                        with c2:

                            st.header("Summarized")
                            with st.container(border = True):
                                st.write(summary)
                            st.header("Performance scores")
                            with st.container(border = True):
+
|
209 |
+
#performance metrics
|
210 |
+
reference = fulltext
|
211 |
+
candidate = summary
|
212 |
+
|
213 |
+
BLEuscore = nltk.translate.bleu_score.sentence_bleu([reference], candidate)
|
214 |
+
|
215 |
+
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
|
216 |
+
rougescores = scorer.score(reference, candidate)
|
217 |
+
|
218 |
+
st.write(f"BLEU Score (NLTK): {BLEuscore:.2f}")
|
219 |
+
st.write(f"ROUGE-1 F1 Score: {rougescores['rouge1'].fmeasure:.2f}")
|
220 |
+
|
221 |
+
text_file = summary
|
222 |
+
st.download_button(
|
223 |
+
label = "Download Results",
|
224 |
+
data=text_file,
|
225 |
+
file_name="Summary.txt",
|
226 |
+
mime="text\csv",
|
227 |
+
on_click="ignore",)
|
228 |
+
|
229 |
+
elif(extype.endswith(".csv")):
|
230 |
+
if method == "Extractive":
|
231 |
+
if(ex_method == "Spacy PyTextRank"):
|
232 |
+
summaries = texts['texts'].apply(SpacyRank)
|
233 |
+
fullnsums = summaries.to_frame()
|
234 |
+
fullnsums['full'] = texts['texts']
|
235 |
+
fullnsums['combined'] = fullnsums.values.tolist()
|
236 |
+
|
237 |
+
|
238 |
+
elif(ex_method == "t5"):
|
239 |
+
summaries = texts['texts'].apply(t5summ)
|
240 |
+
fullnsums = summaries.to_frame()
|
241 |
+
fullnsums['full'] = texts['texts']
|
242 |
+
fullnsums['combined'] = fullnsums.values.tolist()
|
243 |
+
|
244 |
+
|
245 |
+
elif method == "Abstractive":
|
246 |
+
if ab_method == "Pegasus x-sum":
|
247 |
+
summaries = texts['texts'].apply(xsum)
|
248 |
+
fullnsums = summaries.to_frame()
|
249 |
+
fullnsums['full'] = texts['texts']
|
250 |
+
fullnsums['combined'] = fullnsums.values.tolist()
|
251 |
+
|
252 |
+
elif ab_method == "FalconsAI t5":
|
253 |
+
summaries = texts['texts'].apply(falcsum)
|
254 |
+
fullnsums = summaries.to_frame()
|
255 |
+
fullnsums['full'] = texts['texts']
|
256 |
+
fullnsums['combined'] = fullnsums.values.tolist()
|
257 |
+
|
258 |
+
with c1:
|
259 |
+
st.header("Download bulk summarization results")
|
260 |
+
|
261 |
+
result = summaries.to_csv()
|
262 |
+
st.download_button(
|
263 |
+
label = "Download Results",
|
264 |
+
data = result,
|
265 |
+
file_name = "Summaries.csv",
|
266 |
+
mime="text\csv",
|
267 |
+
on_click = "ignore"
|
268 |
+
)
|
269 |
+
|
270 |
+
with c2:
|
271 |
+
st.header("Scores and summaries results")
|
272 |
+
scores = pd.DataFrame.from_records(bulkScore(fullnsums.combined.to_list()),columns = ["BLEU","Rouge"])
|
273 |
+
|
274 |
+
summariesscores = fullnsums.join(scores)
|
275 |
+
|
276 |
+
summariesscores.drop("combined", axis = 1, inplace = True)
|
277 |
+
summariesscores.rename(columns = {"texts":"summarized"}, inplace = True)
|
278 |
+
|
279 |
+
result2 = summariesscores.to_csv()
|
280 |
+
|
281 |
+
st.download_button(
|
282 |
+
label = "Download scores and results",
|
283 |
+
data = result2,
|
284 |
+
file_name = "ScoredSummaries.csv",
|
285 |
+
mime = "test\csv",
|
286 |
+
on_click = "ignore"
|
287 |
+
)
|
288 |
+
|
289 |
+
|
290 |
+
|
291 |
+
|
292 |
+
|
293 |
+
#do this
|
294 |
+
with tab2:
|
295 |
+
st.write("")
|
296 |
+
|
297 |
+
with tab3:
|
298 |
+
st.header("Summarization result (.txt)")
|
299 |
+
st.write("Click the download button (example) to get the text file result")
|
300 |
+
st.button(label = "Download Results")
|
301 |
+
|
302 |
+
|
303 |
+
except Exception as e:
|
304 |
+
st.write(e)
|
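For reference, the BLEU/ROUGE scoring step used in this page can be exercised on its own. A minimal sketch, assuming the nltk and rouge_score packages are installed; the example sentences are illustrative:

from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

reference = "The cat sat on the mat and watched the birds outside."  # illustrative text
candidate = "The cat sat on the mat."                                # illustrative summary

# sentence_bleu expects token lists: a list of reference token lists plus the candidate tokens
bleu = sentence_bleu([reference.split()], candidate.split())

# RougeScorer works directly on raw strings
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
rouge = scorer.score(reference, candidate)

print(f"BLEU: {bleu:.2f}")
print(f"ROUGE-1 F1: {rouge['rouge1'].fmeasure:.2f}")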
tools/__pycache__/sourceformat.cpython-310.pyc
ADDED
Binary file (5.74 kB)
tools/sourceformat.py
ADDED
@@ -0,0 +1,328 @@
from io import StringIO, BytesIO
import pymarc
import requests
import string
import pandas as pd
import tarfile
try:
    from lxml import etree as ET
except ImportError:
    import xml.etree.ElementTree as ET

#metadata for htrc worksets
def htrc(self):

    #variables/arrays and stuff

    #array of all the keywords per each volume/htid, to add to the file
    keylist = []

    #get htids of the volumes
    htids = self['htid'].values.tolist()
    #iterate through list of htids
    for id in range(len(htids)):
        htid = htids[id]

        #string of keywords for this volume/htid (reset for each volume)
        keywords = ""

        #api call for the extra metadata using htid
        extradata = requests.get("https://catalog.hathitrust.org/api/volumes/full/htid/" + htid + ".json")

        #turn the request into a json file
        extradata = extradata.json()

        #get record id and use it to get the xml/marc file with the actual metadata
        recid = extradata['items'][0]['fromRecord']
        xmlmarc = extradata['records'][recid]['marc-xml']

        #turn the formatted xml into an actual pymarc record
        xml = StringIO(xmlmarc)
        marc = pymarc.parse_xml_to_array(xml)[0]
        xml.close()

        #collect subject terms from the MARC 650 fields, skipping URLs and OCLC identifiers
        for term in marc.get_fields('650'):
            if "http" in (term.value()).lower():
                keywords += ""
            elif "ocolc" in (term.value()).lower():
                keywords += ""
            else:
                keywords += term.value().translate(str.maketrans('', '', string.punctuation)) + "; "
        keylist.append(keywords)
    self['Keywords'] = keylist
    return self

def htrcxtra(self):

    #variables/arrays and stuff

    #array of the page statements per each volume/htid, to add to the file
    pagecount = []

    #get htids of the volumes
    htids = self['htid'].values.tolist()
    #iterate through list of htids
    for id in range(len(htids)):
        htid = htids[id]

        #page statement string for this volume/htid (reset for each volume)
        pages = ""

        #api call for the extra metadata using htid
        extradata = requests.get("https://catalog.hathitrust.org/api/volumes/full/htid/" + htid + ".json")

        #turn the request into a json file
        extradata = extradata.json()

        #get record id and use it to get the xml/marc file with the actual metadata
        recid = extradata['items'][0]['fromRecord']
        xmlmarc = extradata['records'][recid]['marc-xml']

        #turn the formatted xml into an actual pymarc record
        xml = StringIO(xmlmarc)
        marc = pymarc.parse_xml_to_array(xml)[0]
        xml.close()

        for term in marc.get_fields('350'):
            pages += term.value()
        pagecount.append(pages)
    self['pages'] = pagecount
    return self

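#usage sketch (illustrative): given a workset frame with an 'htid' column,
#the two helpers above can be chained:
#    frame = htrc(frame)       # adds a 'Keywords' column from the MARC 650 subject fields
#    frame = htrcxtra(frame)   # adds a 'pages' column from the MARC 350 fields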
#format files from dimensions
def dim(file):
    formatted = file.drop(file.columns[[0]], axis=1)

    done = pd.read_csv(StringIO(formatted.to_csv(header=False, index=False)))

    return done


#read a tar.gz archive of article XML files (e.g. a PubMed Central bulk download)
def readPub(tar):

    #list to put xmls from tarfile in
    xmllist = []

    readfile = BytesIO(tar)

    #get the files from the tarfile into the list
    files = tarfile.open(fileobj=readfile, mode='r:gz')
    for member in files.getmembers():
        singlefile = files.extractfile(member)
        if singlefile is not None:
            article = singlefile.read()
            article = article.decode("utf-8")
            article = StringIO(article)
            xmllist.append(article)

    #lists for each data point
    titles = []
    years = []
    keys = []
    authors = []
    publishers = []
    journaltitles = []

    #go through each xml file in the list
    for art in range(len(xmllist)):

        #make a parseable element tree out of the xml file
        tree = ET.parse(xmllist[art])
        root = tree.getroot()

        #remove parts of the main branch that do not have metadata that we care about
        for child in list(root):
            if child.tag != "front":
                root.remove(child)

        #names to concatenate for each article
        firstname = []
        lastname = []

        #individual strings for multiple keywords/titles
        key = ""
        title = ""

        for target in root.iter('article-title'):
            if target.text is not None:
                title += target.text + ", "
            else:
                title += " "
        for target in root.iter('kwd'):
            if target.text is not None:
                key += target.text + "; "
            else:
                key += " "
        for target in root.iter('year'):
            year = int(target.text)
            years.append(year)
        for names in root.iter('given-names'):
            firstname.append(names.text)
        for names in root.iter('surname'):
            lastname.append(names.text)
        for target in root.iter('journal-title'):
            jtitle = target.text
            journaltitles.append(jtitle)
        for target in root.iter('publisher-name'):
            publisher = target.text
            publishers.append(publisher)

        titles.append(title)
        keys.append(key)

        fullnames = [first + ' ' + last for first, last in zip(firstname, lastname)]

        #join the names into a single string with authors
        author = str.join(', ', fullnames)

        authors.append(author)

    data = pd.DataFrame()

    data["Title"] = pd.Series(titles)
    data["Keywords"] = pd.Series(keys)
    data["Authors"] = pd.Series(authors)
    data["Year"] = pd.Series(years)
    data["Document Type"] = pd.Series(publishers)
    data["Source title"] = pd.Series(journaltitles)

    data.fillna(value="empty", inplace=True)

    return data


#read a single XML file containing one or more articles
def readxml(file):
    root = ET.fromstring(file)

    #remove stuff from the xml that we do not need
    for child in list(root):
        for lchild in list(child):
            if lchild.tag != "front":
                child.remove(lchild)

    #lists for each data point
    keys = []
    titles = []
    authors = []
    jtitle = []
    publishers = []
    years = []

    for child in list(root):
        for article in list(child):
            key = ""
            firstname = []
            lastname = []
            for target in article.iter('article-title'):
                if target.text is not None:
                    titles.append(target.text)
                else:
                    titles.append("empty")
            for target in article.iter('kwd'):
                if target.text is not None:
                    key += target.text + "; "
                else:
                    key += ""
            keys.append(key)
            for target in article.iter('given-names'):
                firstname.append(target.text)
            for target in article.iter('surname'):
                lastname.append(target.text)

            fullnames = [first + ' ' + last for first, last in zip(firstname, lastname)]
            author = str.join(', ', fullnames)
            authors.append(author)

            for target in article.iter('journal-title'):
                jtitle.append(target.text)
            for target in article.iter('publisher-name'):
                publishers.append(target.text)

            for target in article.iter('year'):
                years.append(int(target.text))

    frame = pd.DataFrame()

    frame["Title"] = pd.Series(titles)
    frame["Keywords"] = pd.Series(keys)
    frame["Authors"] = pd.Series(authors)
    frame["Year"] = pd.Series(years)
    frame["Document Type"] = pd.Series(jtitle)
    frame["Source title"] = pd.Series(publishers)

    frame.fillna(value="empty", inplace=True)

    return frame

#parse a PubMed export in the MEDLINE text format
def medline(file):

    textfile = file.read()

    text = textfile.decode()

    authors = []
    titles = []
    year = []
    meshkeys = []
    otherkeys = []

    #articles are separated by blank lines, so split them apart
    articles = text.split('\n\n')

    for paper in articles:
        names = ""
        meshk = ""
        otherk = ""
        largetext = paper.splitlines()
        for line in largetext:
            #title
            if "TI  - " in line:
                #check whether the title runs onto another line, and add it if it does
                startpos = line.index("-") + 2
                if "- " not in (largetext[largetext.index(line) + 1]):
                    titles.append(line[startpos:] + " " + largetext[largetext.index(line) + 1].strip())
                else:
                    titles.append(line[startpos:])
            #author
            if "FAU - " in line:
                startpos = line.index("-") + 2
                names += line[startpos:] + "; "
            #year
            if "DP  - " in line:
                startpos = line.index("-") + 2
                year.append(int(line[startpos:startpos + 4]))
            #key terms
            if "MH  - " in line:
                startpos = line.index("-") + 2
                meshk += line[startpos:] + "; "
            if "OT  - " in line:
                startpos = line.index("-") + 2
                otherk += line[startpos:] + "; "

        authors.append(names)
        meshkeys.append(meshk)
        otherkeys.append(otherk)

    frame = pd.DataFrame()

    frame['Title'] = pd.Series(titles)
    frame['Authors'] = pd.Series(authors)
    frame['Year'] = pd.Series(year)
    frame['MeSH Keywords'] = pd.Series(meshkeys)
    frame['Other Keywords'] = pd.Series(otherkeys)

    frame.fillna(value="empty", inplace=True)

    return frame
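Each of the formatter functions above returns a pandas DataFrame with bibliographic columns. A minimal sketch of driving the MEDLINE parser from a standalone script, assuming the module is importable as tools.sourceformat; the file path is illustrative:

from tools import sourceformat as sf

# medline() expects a file-like object whose read() returns bytes
with open("pubmed_export.txt", "rb") as f:   # illustrative path
    records = sf.medline(f)

print(records[["Title", "Year", "MeSH Keywords"]].head())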