# LibTesting / pages / 4 Sunburst.py
# Last change: "Update to latest version" by T Le (commit 13d6d96, 9.22 kB)
#===import module===
import streamlit as st
import pandas as pd
import plotly.express as px
import numpy as np
import sys
import json
from tools import sourceformat as sf
#===config===
st.set_page_config(
    page_title="Coconut",
    page_icon="🥥",
    layout="wide",
    initial_sidebar_state="collapsed"
)

# Hide Streamlit chrome (hamburger menu, footer, collapsed-sidebar toggle).
hide_streamlit_style = """
<style>
#MainMenu
{visibility: hidden;}
footer {visibility: hidden;}
[data-testid="collapsedControl"] {display: none}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

# Cross-page navigation menu (sidebar is collapsed, so offer a popover).
with st.popover("🔗 Menu"):
    st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
    st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
    st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
    st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
    st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
    st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
    st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
    st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
    st.page_link("pages/8 Shifterator.py", label="Shifterator", icon="8️⃣")
    st.page_link("pages/9 Summarization.py", label="Summarization", icon="9️⃣")
    st.page_link("pages/10 WordCloud.py", label="WordCloud", icon="🔟")

st.header("Sunburst Visualization", anchor=False)
st.subheader('Put your file here...', anchor=False)
#===clear cache===
def reset_all():
    """Clear every st.cache_data entry; wired to widgets' on_change."""
    st.cache_data.clear()
#===check type===
@st.cache_data(ttl=3600)
def get_ext(extype):
    """Return the uploaded file's name (used for extension checks).

    NOTE(review): the ``extype`` argument is ignored and only acts as the
    cache key; the function reads the module-level ``uploaded_file``.
    """
    return uploaded_file.name
@st.cache_data(ttl=3600)
def upload(extype):
    """Read the uploaded CSV and normalize Lens.org / Dimensions column
    names to the canonical ones used by this page (Year, Cited by, ...).

    ``extype`` only serves as the cache key; data comes from the
    module-level ``uploaded_file``.
    """
    papers = pd.read_csv(uploaded_file)
    # Lens.org export: rename its headers to the canonical names.
    if 'Publication Year' in papers.columns:
        papers.rename(columns={'Publication Year': 'Year', 'Citing Works Count': 'Cited by',
                               'Publication Type': 'Document Type', 'Source Title': 'Source title'}, inplace=True)
    # Dimensions export: recognized by its "About the data" preamble in
    # the first column header, then reshaped by sourceformat.dim().
    if "About the data" in papers.columns[0]:
        papers = sf.dim(papers)
        col_dict = {'MeSH terms': 'Keywords',
                    'PubYear': 'Year',
                    'Times cited': 'Cited by',
                    'Publication Type': 'Document Type'
                    }
        papers.rename(columns=col_dict, inplace=True)
    return papers
@st.cache_data(ttl=3600)
def conv_txt(extype):
    """Convert an uploaded .txt export into a normalized DataFrame.

    Handles two formats: MEDLINE/PubMed (detected by the "PMID" field
    tag) and tab-separated exports (Web of Science short tags, or
    HathiTrust when an "htid" column is present).

    ``extype`` only serves as the cache key; data comes from the
    module-level ``uploaded_file``.
    """
    # MEDLINE exports carry "PMID" field tags; hand those to the
    # dedicated parser.  (Fix: removed leftover debug print() calls.)
    if "PMID" in uploaded_file.read().decode():
        uploaded_file.seek(0)
        return sf.medline(uploaded_file)
    # Otherwise assume a tab-separated export and map its short column
    # tags onto the canonical names used by the rest of the page.
    col_dict = {'TI': 'Title',
                'SO': 'Source title',
                'DE': 'Author Keywords',
                'DT': 'Document Type',
                'AB': 'Abstract',
                'TC': 'Cited by',
                'PY': 'Year',
                'ID': 'Keywords Plus',
                'rights_date_used': 'Year'}
    uploaded_file.seek(0)  # rewind: read() above consumed the stream
    papers = pd.read_csv(uploaded_file, sep='\t')
    if "htid" in papers.columns:
        papers = sf.htrc(papers)  # HathiTrust export
    papers.rename(columns=col_dict, inplace=True)
    return papers
@st.cache_data(ttl=3600)
def conv_json(extype):
    """Convert an uploaded HathiTrust JSON collection ('gathers' records)
    into a DataFrame, using keyword frequency as the 'Cited by' count.

    ``extype`` only serves as the cache key; data comes from the
    module-level ``uploaded_file``.
    """
    col_dict = {'title': 'title',
                'rights_date_used': 'Year',
                'content_provider_code': 'Document Type',
                'Keywords': 'Source title'
                }
    data = json.load(uploaded_file)
    keywords = pd.DataFrame.from_records(data['gathers'])
    keywords = sf.htrc(keywords)
    # No citation data in this format: use each keyword's frequency as a
    # stand-in 'Cited by' value.
    keywords['Cited by'] = keywords.groupby(['Keywords'])['Keywords'].transform('size')
    keywords.rename(columns=col_dict, inplace=True)
    return keywords
def conv_pub(extype):
    """Convert an uploaded .tar.gz or .xml publication archive.

    NOTE(review): unlike the other converters, ``extype`` here is the
    uploaded file object itself (see the call site), not the file name.
    """
    ext = get_ext(extype)
    if ext.endswith('.tar.gz'):
        keywords = sf.readPub(extype.read())
    elif ext.endswith('.xml'):
        keywords = sf.readxml(extype.read())
    else:
        # Fix: previously fell through with `keywords` unbound, raising a
        # confusing NameError for any other extension.
        raise ValueError("Unsupported file type for conv_pub: " + ext)
    # Use keyword frequency as a stand-in 'Cited by' value.
    keywords['Cited by'] = keywords.groupby(['Keywords'])['Keywords'].transform('size')
    st.write(keywords)
    return keywords
#===Read data===
uploaded_file = st.file_uploader('', type=['csv', 'txt','json','tar.gz', 'xml'], on_change=reset_all)

if uploaded_file is not None:
    try:
        extype = get_ext(uploaded_file)
        # Dispatch to the matching converter by file extension.
        if extype.endswith('.csv'):
            papers = upload(extype)
        elif extype.endswith('.txt'):
            papers = conv_txt(extype)
        elif extype.endswith('.json'):
            papers = conv_json(extype)
        elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
            papers = conv_pub(uploaded_file)

        @st.cache_data(ttl=3600)
        def get_minmax(extype):
            """Return the data plus the Year and 'Cited by' ranges.

            ``extype`` only serves as the cache key; the function reads
            the enclosing ``papers`` DataFrame.
            """
            MIN = int(papers['Year'].min())
            MAX = int(papers['Year'].max())
            MIN1 = int(papers['Cited by'].min())
            MAX1 = int(papers['Cited by'].max())
            GAP = MAX - MIN
            return papers, MIN, MAX, GAP, MIN1, MAX1

        tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📓 Recommended Reading", "⬇️ Download Help"])

        with tab1:
            #===sunburst===
            try:
                papers, MIN, MAX, GAP, MIN1, MAX1 = get_minmax(extype)
            except KeyError:
                st.error('Error: Please check again your columns.')
                # SystemExit is intentionally caught by the outer bare
                # except below, which shows the generic error and stops.
                sys.exit(1)

            if (GAP != 0):
                YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX), on_change=reset_all)
                KEYLIM = st.slider('Cited By Count',min_value = MIN1, max_value = MAX1, value = (MIN1,MAX1), on_change=reset_all)
                with st.expander("Filtering setings"):
                    invert_keys = st.toggle("Invert keys", on_change=reset_all)
                    filtered_keys = st.text_input("Filter words in source, seperate with semicolon (;)", value = "", on_change = reset_all)
                    keylist = filtered_keys.split(";")
                    select_col = st.selectbox("Column to filter from", (list(papers)))
            else:
                # Single-year dataset: nothing to slide over.
                st.write('You only have data in ', (MAX))
                YEAR = (MIN, MAX)
                KEYLIM = (MIN1,MAX1)
                # Fix: give the filter settings safe no-op defaults so
                # vis_sunbrust() no longer raises NameError in this branch.
                invert_keys = False
                keylist = []
                select_col = list(papers)[0]

            @st.cache_data(ttl=3600)
            def listyear(extype):
                """Restrict ``papers`` to the selected year / citation ranges."""
                global papers
                years = list(range(YEAR[0], YEAR[1]+1))
                cited = list(range(KEYLIM[0], KEYLIM[1]+1))
                papers = papers.loc[papers['Year'].isin(years)]
                papers = papers.loc[papers['Cited by'].isin(cited)]
                return years, papers

            @st.cache_data(ttl=3600)
            def vis_sunbrust(extype):
                """Build the sunburst figure (doc type → source → year),
                sized by document count and colored by summed citations."""
                data = papers.copy()
                data['Cited by'] = data['Cited by'].fillna(0)
                # Keep only (or drop) rows whose select_col value matches
                # the user's filter words, depending on the invert toggle.
                if invert_keys:
                    data = data[data[select_col].isin(keylist)]
                else:
                    data = data[~data[select_col].isin(keylist)]
                vis = pd.DataFrame()
                vis[['doctype','source','citby','year']] = data[['Document Type','Source title','Cited by','Year']]
                viz = vis.groupby(['doctype', 'source', 'year'])['citby'].agg(['sum','count']).reset_index()
                viz.rename(columns={'sum': 'cited by', 'count': 'total docs'}, inplace=True)
                fig = px.sunburst(viz, path=['doctype', 'source', 'year'], values='total docs',
                                  color='cited by',
                                  color_continuous_scale='RdBu',
                                  color_continuous_midpoint=np.average(viz['cited by'], weights=viz['total docs']))
                fig.update_layout(height=800, width=1200)
                return fig, viz

            years, papers = listyear(extype)

            if {'Document Type','Source title','Cited by','Year'}.issubset(papers.columns):
                if st.button("Submit"):
                    fig, viz = vis_sunbrust(extype)
                    st.plotly_chart(fig, height=800, width=1200) #use_container_width=True)
                    st.dataframe(viz)
            else:
                st.error('We require these columns: Document Type, Source title, Cited by, Year', icon="🚨")

        with tab2:
            st.markdown('**numpy.average — NumPy v1.24 Manual. (n.d.). Numpy.Average — NumPy v1.24 Manual.** https://numpy.org/doc/stable/reference/generated/numpy.average.html')
            st.markdown('**Sunburst. (n.d.). Sunburst Charts in Python.** https://plotly.com/python/sunburst-charts/')

        with tab3:
            st.text("Click the camera icon on the top right menu (you may need to hover your cursor within the visualization)")
            st.markdown("![Downloading visualization](https://raw.githubusercontent.com/faizhalas/library-tools/main/images/download_bertopic.jpg)")

    except:
        # Deliberately bare: also swallows the SystemExit raised by
        # sys.exit(1) above, so any malformed upload lands here.
        st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
        st.stop()