#===import module===
import streamlit as st
import pandas as pd
import plotly.express as px
import numpy as np
import sys
import json
from tools import sourceformat as sf

#===config===
st.set_page_config(
    page_title="Coconut",
    page_icon="🥥",
    layout="wide",
    initial_sidebar_state="collapsed"
)

# Placeholder CSS override (currently empty); kept so the markdown call stays valid.
hide_streamlit_style = """
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

with st.popover("🔗 Menu"):
    st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
    st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
    st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
    st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
    st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
    st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
    st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
    st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
    st.page_link("pages/8 Shifterator.py", label="Shifterator", icon="8️⃣")
    st.page_link("pages/9 Summarization.py", label="Summarization", icon="9️⃣")
    st.page_link("pages/10 WordCloud.py", label="WordCloud", icon="🔟")

st.header("Sunburst Visualization", anchor=False)
st.subheader('Put your file here...', anchor=False)


#===clear cache===
def reset_all():
    """Clear every st.cache_data entry; wired to widgets' on_change callbacks."""
    st.cache_data.clear()


#===check type===
@st.cache_data(ttl=3600)
def get_ext(extype):
    """Return the uploaded file's name.

    NOTE(review): the parameter is ignored and serves only as the cache key;
    the name is always read from the module-level ``uploaded_file``.
    """
    return uploaded_file.name


@st.cache_data(ttl=3600)
def upload(extype):
    """Read a CSV export and normalize its headers to the canonical columns.

    Handles lens.org exports (detected by a 'Publication Year' column) and
    Dimensions exports (first header cell contains "About the data").
    Reads the module-level ``uploaded_file``; ``extype`` is the cache key.
    """
    papers = pd.read_csv(uploaded_file)
    # lens.org export: map its headers onto the canonical column names.
    if 'Publication Year' in papers.columns:
        papers.rename(columns={
            'Publication Year': 'Year',
            'Citing Works Count': 'Cited by',
            'Publication Type': 'Document Type',
            'Source Title': 'Source title',
        }, inplace=True)
    # Dimensions export: a preamble row makes "About the data" the first header.
    if "About the data" in papers.columns[0]:
        papers = sf.dim(papers)
        papers.rename(columns={
            'MeSH terms': 'Keywords',
            'PubYear': 'Year',
            'Times cited': 'Cited by',
            'Publication Type': 'Document Type',
        }, inplace=True)
    return papers


@st.cache_data(ttl=3600)
def conv_txt(extype):
    """Read a .txt upload: MEDLINE (PMID fields) or tab-separated WoS/HathiTrust.

    Reads the module-level ``uploaded_file``; ``extype`` is the cache key.
    """
    # MEDLINE exports carry PMID fields; hand those to the dedicated parser.
    if "PMID" in uploaded_file.read().decode():
        uploaded_file.seek(0)
        return sf.medline(uploaded_file)
    col_dict = {'TI': 'Title',
                'SO': 'Source title',
                'DE': 'Author Keywords',
                'DT': 'Document Type',
                'AB': 'Abstract',
                'TC': 'Cited by',
                'PY': 'Year',
                'ID': 'Keywords Plus',
                'rights_date_used': 'Year'}
    uploaded_file.seek(0)  # rewind after the PMID sniff above
    papers = pd.read_csv(uploaded_file, sep='\t')
    # HathiTrust exports are identified by their "htid" column.
    if "htid" in papers.columns:
        papers = sf.htrc(papers)
    papers.rename(columns=col_dict, inplace=True)
    return papers


@st.cache_data(ttl=3600)
def conv_json(extype):
    """Read a HathiTrust JSON export (records under the 'gathers' key).

    The export has no citation counts, so each keyword's group size is used
    as a stand-in 'Cited by' value. ``extype`` is only the cache key.
    """
    col_dict = {'title': 'title',
                'rights_date_used': 'Year',
                'content_provider_code': 'Document Type',
                'Keywords': 'Source title'}
    data = json.load(uploaded_file)
    hathifile = data['gathers']
    keywords = pd.DataFrame.from_records(hathifile)
    keywords = sf.htrc(keywords)
    # No citation count in the export: use each keyword's frequency instead.
    keywords['Cited by'] = keywords.groupby(['Keywords'])['Keywords'].transform('size')
    keywords.rename(columns=col_dict, inplace=True)
    return keywords


def conv_pub(extype):
    """Read a PubMed archive (.tar.gz) or XML export.

    Unlike the other readers, ``extype`` here IS the uploaded file object
    (the caller passes ``uploaded_file``).
    """
    if get_ext(extype).endswith('.tar.gz'):
        keywords = sf.readPub(extype.read())
    elif get_ext(extype).endswith('.xml'):
        keywords = sf.readxml(extype.read())
    # As with JSON exports, derive 'Cited by' from keyword frequency.
    keywords['Cited by'] = keywords.groupby(['Keywords'])['Keywords'].transform('size')
    st.write(keywords)
    return keywords


#===Read data===
uploaded_file = st.file_uploader('', type=['csv', 'txt', 'json', 'tar.gz', 'xml'],
                                 on_change=reset_all)

if uploaded_file is not None:
    try:
        extype = get_ext(uploaded_file)
        if extype.endswith('.csv'):
            papers = upload(extype)
        elif extype.endswith('.txt'):
            papers = conv_txt(extype)
        elif extype.endswith('.json'):
            papers = conv_json(extype)
        elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
            papers = conv_pub(uploaded_file)

        @st.cache_data(ttl=3600)
        def get_minmax(extype):
            """Return papers plus the Year / Cited-by ranges and the year gap.

            ``extype`` is only the cache key. Raises KeyError when the
            required columns are missing (handled by the caller).
            """
            MIN = int(papers['Year'].min())
            MAX = int(papers['Year'].max())
            MIN1 = int(papers['Cited by'].min())
            MAX1 = int(papers['Cited by'].max())
            GAP = MAX - MIN
            return papers, MIN, MAX, GAP, MIN1, MAX1

        tab1, tab2, tab3 = st.tabs(["📈 Generate visualization",
                                    "📓 Recommended Reading",
                                    "⬇️ Download Help"])

        with tab1:
            #===sunburst===
            try:
                papers, MIN, MAX, GAP, MIN1, MAX1 = get_minmax(extype)
            except KeyError:
                st.error('Error: Please check again your columns.')
                sys.exit(1)

            if GAP != 0:
                YEAR = st.slider('Year', min_value=MIN, max_value=MAX,
                                 value=(MIN, MAX), on_change=reset_all)
                KEYLIM = st.slider('Cited By Count', min_value=MIN1, max_value=MAX1,
                                   value=(MIN1, MAX1), on_change=reset_all)
                with st.expander("Filtering settings"):
                    invert_keys = st.toggle("Invert keys", on_change=reset_all)
                    filtered_keys = st.text_input(
                        "Filter words in source, separate with semicolon (;)",
                        value="", on_change=reset_all)
                    keylist = filtered_keys.split(";")
                    select_col = st.selectbox("Column to filter from", (list(papers)))
            else:
                # Single-year dataset: nothing to slide over.
                st.write('You only have data in ', (MAX))
                YEAR = (MIN, MAX)
                KEYLIM = (MIN1, MAX1)
                # Fix: these were previously undefined on this path, so the
                # visualization raised NameError for single-year data. These
                # defaults reproduce "no filtering".
                invert_keys = False
                keylist = "".split(";")
                select_col = list(papers)[0]

            @st.cache_data(ttl=3600)
            def listyear(extype):
                """Restrict the global ``papers`` frame to the selected ranges."""
                global papers
                years = list(range(YEAR[0], YEAR[1] + 1))
                cited = list(range(KEYLIM[0], KEYLIM[1] + 1))
                papers = papers.loc[papers['Year'].isin(years)]
                papers = papers.loc[papers['Cited by'].isin(cited)]
                return years, papers

            @st.cache_data(ttl=3600)
            def vis_sunburst(extype):
                """Build the doctype → source → year sunburst and its summary table."""
                data = papers.copy()
                data['Cited by'] = data['Cited by'].fillna(0)
                # Filtering: drop rows matching the user's words, or keep only
                # those rows when "Invert keys" is on.
                if invert_keys:
                    data = data[data[select_col].isin(keylist)]
                else:
                    data = data[~data[select_col].isin(keylist)]
                vis = pd.DataFrame()
                vis[['doctype', 'source', 'citby', 'year']] = data[
                    ['Document Type', 'Source title', 'Cited by', 'Year']]
                viz = vis.groupby(['doctype', 'source', 'year'])['citby'].agg(
                    ['sum', 'count']).reset_index()
                viz.rename(columns={'sum': 'cited by', 'count': 'total docs'},
                           inplace=True)
                # Color midpoint: citation average weighted by document count.
                fig = px.sunburst(
                    viz, path=['doctype', 'source', 'year'], values='total docs',
                    color='cited by', color_continuous_scale='RdBu',
                    color_continuous_midpoint=np.average(viz['cited by'],
                                                         weights=viz['total docs']))
                fig.update_layout(height=800, width=1200)
                return fig, viz

            years, papers = listyear(extype)

            if {'Document Type', 'Source title', 'Cited by', 'Year'}.issubset(papers.columns):
                if st.button("Submit"):
                    fig, viz = vis_sunburst(extype)
                    st.plotly_chart(fig, height=800, width=1200)  # use_container_width=True)
                    st.dataframe(viz)
            else:
                st.error('We require these columns: Document Type, Source title, Cited by, Year', icon="🚨")

        with tab2:
            st.markdown('**numpy.average — NumPy v1.24 Manual. (n.d.). Numpy.Average — NumPy v1.24 Manual.** https://numpy.org/doc/stable/reference/generated/numpy.average.html')
            st.markdown('**Sunburst. (n.d.). Sunburst Charts in Python.** https://plotly.com/python/sunburst-charts/')

        with tab3:
            st.text("Click the camera icon on the top right menu (you may need to hover your cursor within the visualization)")
            st.markdown("![Downloading visualization](https://raw.githubusercontent.com/faizhalas/library-tools/main/images/download_bertopic.jpg)")

    # Fix: was a bare `except:` that also swallowed the SystemExit raised by
    # sys.exit(1) above, masking the column-specific error with this generic one.
    except Exception:
        st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
        st.stop()