T Le committed
Commit f742eb8 · 2 Parent(s): 13d6d96 e52d4a3

Fixed merge conflicts

Files changed (3)
  1. README.md +2 -2
  2. pages/2 Topic Modeling.py +677 -0
  3. requirements.txt +4 -0
README.md CHANGED
@@ -1,5 +1,5 @@
  ---
- title: Coconut
+ title: Coconut Libtool Test
  emoji: 🥥
  colorFrom: red
  colorTo: blue
@@ -8,5 +8,5 @@ sdk_version: 1.35.0
  app_file: Home.py
  pinned: false
  license: mit
- #python_version: 3.9.13
+ short_description: t
  ---
pages/2 Topic Modeling.py CHANGED
@@ -1,3 +1,4 @@
+ <<<<<<< HEAD
  #import module
  import streamlit as st
  import streamlit.components.v1 as components
@@ -671,3 +672,679 @@ if uploaded_file is not None:
      except:
          st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
          st.stop()
+ =======
+ #import module
+ import streamlit as st
+ import streamlit.components.v1 as components
+ import pandas as pd
+ import numpy as np
+ import re
+ import string
+ import nltk
+ nltk.download('wordnet')
+ from nltk.stem import WordNetLemmatizer
+ nltk.download('stopwords')
+ from nltk.corpus import stopwords
+ import gensim
+ import gensim.corpora as corpora
+ from gensim.corpora import Dictionary
+ from gensim.models.coherencemodel import CoherenceModel
+ from gensim.models.ldamodel import LdaModel
+ from gensim.models import Phrases
+ from gensim.models.phrases import Phraser
+ from pprint import pprint
+ import pickle
+ import pyLDAvis
+ import pyLDAvis.gensim_models as gensimvis
+ from io import StringIO
+ from ipywidgets.embed import embed_minimal_html
+ from nltk.stem.snowball import SnowballStemmer
+ from bertopic import BERTopic
+ from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, TextGeneration
+ import plotly.express as px
+ from sklearn.cluster import KMeans
+ from sklearn.feature_extraction.text import CountVectorizer
+ import bitermplus as btm
+ import tmplot as tmp
+ import tomotopy
+ import sys
+ import spacy
+ import en_core_web_sm
+ import pipeline
+ from html2image import Html2Image
+ from umap import UMAP
+ import os
+ import time
+ import json
+ from tools import sourceformat as sf
+ import datamapplot
+ from sentence_transformers import SentenceTransformer
+ import openai
+ from transformers import pipeline
+
+ #===config===
+ st.set_page_config(
+     page_title="Coconut",
+     page_icon="🥥",
+     layout="wide",
+     initial_sidebar_state="collapsed"
+ )
+
+ hide_streamlit_style = """
+ <style>
+ #MainMenu {visibility: hidden;}
+ footer {visibility: hidden;}
+ [data-testid="collapsedControl"] {display: none}
+ </style>
+ """
+ st.markdown(hide_streamlit_style, unsafe_allow_html=True)
+
+ with st.popover("🔗 Menu"):
+     st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
+     st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
+     st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
+     st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
+     st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
+     st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
+     st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
+     st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
+
+ st.header("Topic Modeling", anchor=False)
+ st.subheader('Put your file here...', anchor=False)
+
+ #========unique id========
+ @st.cache_resource(ttl=3600)
+ def create_list():
+     l = [1, 2, 3]
+     return l
+
+ l = create_list()
+ first_list_value = l[0]
+ l[0] = first_list_value + 1
+ uID = str(l[0])
+
+ @st.cache_data(ttl=3600)
+ def get_ext(uploaded_file):
+     extype = uID + uploaded_file.name
+     return extype
+
+ #===clear cache===
+
+ def reset_biterm():
+     try:
+         biterm_map.clear()
+         biterm_bar.clear()
+     except NameError:
+         biterm_topic.clear()
+
+ def reset_all():
+     st.cache_data.clear()
+
+ #===avoiding deadlock===
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ #===upload file===
+ @st.cache_data(ttl=3600)
+ def upload(file):
+     papers = pd.read_csv(uploaded_file)
+     if "About the data" in papers.columns[0]:
+         papers = sf.dim(papers)
+         col_dict = {'MeSH terms': 'Keywords',
+                     'PubYear': 'Year',
+                     'Times cited': 'Cited by',
+                     'Publication Type': 'Document Type'
+                     }
+         papers.rename(columns=col_dict, inplace=True)
+
+     return papers
+
+ @st.cache_data(ttl=3600)
+ def conv_txt(extype):
+     if "PMID" in uploaded_file.read().decode():
+         uploaded_file.seek(0)
+         papers = sf.medline(uploaded_file)
+         print(papers)
+         return papers
+     col_dict = {'TI': 'Title',
+                 'SO': 'Source title',
+                 'DE': 'Author Keywords',
+                 'DT': 'Document Type',
+                 'AB': 'Abstract',
+                 'TC': 'Cited by',
+                 'PY': 'Year',
+                 'ID': 'Keywords Plus',
+                 'rights_date_used': 'Year'}
+     uploaded_file.seek(0)
+     papers = pd.read_csv(uploaded_file, sep='\t')
+     if "htid" in papers.columns:
+         papers = sf.htrc(papers)
+     papers.rename(columns=col_dict, inplace=True)
+     print(papers)
+     return papers
+
+
+ @st.cache_data(ttl=3600)
+ def conv_json(extype):
+     col_dict = {'title': 'title',
+                 'rights_date_used': 'Year',
+                 }
+
+     data = json.load(uploaded_file)
+     hathifile = data['gathers']
+     keywords = pd.DataFrame.from_records(hathifile)
+
+     keywords = sf.htrc(keywords)
+     keywords.rename(columns=col_dict, inplace=True)
+     return keywords
+
+ @st.cache_resource(ttl=3600)
+ def conv_pub(extype):
+     if (get_ext(extype)).endswith('.tar.gz'):
+         bytedata = extype.read()
+         keywords = sf.readPub(bytedata)
+     elif (get_ext(extype)).endswith('.xml'):
+         bytedata = extype.read()
+         keywords = sf.readxml(bytedata)
+     return keywords
+
+ #===Read data===
+ uploaded_file = st.file_uploader('', type=['csv', 'txt', 'json', 'tar.gz', 'xml'], on_change=reset_all)
+
+ if uploaded_file is not None:
+     try:
+         extype = get_ext(uploaded_file)
+
+         if extype.endswith('.csv'):
+             papers = upload(extype)
+         elif extype.endswith('.txt'):
+             papers = conv_txt(extype)
+         elif extype.endswith('.json'):
+             papers = conv_json(extype)
+         elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
+             papers = conv_pub(uploaded_file)
+
+         coldf = sorted(papers.select_dtypes(include=['object']).columns.tolist())
+
+         c1, c2, c3 = st.columns([3, 3, 4])
+         method = c1.selectbox(
+             'Choose method',
+             ('Choose...', 'pyLDA', 'Biterm', 'BERTopic'))
+         ColCho = c2.selectbox('Choose column', (["Title", "Abstract"]))
+         num_cho = c3.number_input('Choose number of topics', min_value=2, max_value=30, value=5)
+
+         d1, d2 = st.columns([3, 7])
+         xgram = d1.selectbox("N-grams", ("1", "2", "3"))
+         xgram = int(xgram)
+         words_to_remove = d2.text_input("Remove specific words. Separate words by semicolons (;)")
+
+         rem_copyright = d1.toggle('Remove copyright statement', value=True)
+         rem_punc = d2.toggle('Remove punctuation', value=True)
+
+         #===advanced settings===
+         with st.expander("🧮 Show advanced settings"):
+             t1, t2, t3 = st.columns([3, 3, 4])
+             if method == 'pyLDA':
+                 py_random_state = t1.number_input('Random state', min_value=0, max_value=None, step=1)
+                 py_chunksize = t2.number_input('Chunk size', value=100, min_value=10, max_value=None, step=1)
+                 opt_threshold = t3.number_input('Threshold', value=100, min_value=1, max_value=None, step=1)
+
+             elif method == 'Biterm':
+                 btm_seed = t1.number_input('Random state seed', value=100, min_value=1, max_value=None, step=1)
+                 btm_iterations = t2.number_input('Number of iterations', value=20, min_value=2, max_value=None, step=1)
+                 opt_threshold = t3.number_input('Threshold', value=100, min_value=1, max_value=None, step=1)
+
+             elif method == 'BERTopic':
+                 u1, u2 = st.columns([5, 5])
+
+                 bert_top_n_words = u1.number_input('top_n_words', value=5, min_value=5, max_value=25, step=1)
+                 bert_random_state = u2.number_input('random_state', value=42, min_value=1, max_value=None, step=1)
+                 bert_n_components = u1.number_input('n_components', value=5, min_value=1, max_value=None, step=1)
+                 bert_n_neighbors = u2.number_input('n_neighbors', value=15, min_value=1, max_value=None, step=1)
+                 bert_embedding_model = st.radio(
+                     "embedding_model",
+                     ["all-MiniLM-L6-v2", "paraphrase-multilingual-MiniLM-L12-v2", "en_core_web_sm"], index=0, horizontal=True)
+
+                 fine_tuning = st.toggle("Use Fine-tuning")
+                 if fine_tuning:
+                     topic_labelling = st.toggle("Automatic topic labelling")
+                     if topic_labelling:
+                         llm_model = st.selectbox("Model", ["OpenAI/gpt-4o", "Google/Flan-t5", "OpenAI/gpt-oss"])
+                         if llm_model == "OpenAI/gpt-4o":
+                             api_key = st.text_input("API Key")
+
+             else:
+                 st.write('Please choose your preferred method')
+
+         #===clean csv===
+         @st.cache_data(ttl=3600, show_spinner=False)
+         def clean_csv(extype):
+             paper = papers.dropna(subset=[ColCho])
+
+             #===mapping===
+             paper['Abstract_pre'] = paper[ColCho].map(lambda x: x.lower())
+             if rem_punc:
+                 paper['Abstract_pre'] = paper['Abstract_pre'].map(
+                     lambda x: re.sub(f"[{re.escape(string.punctuation)}]", " ", x)
+                 ).map(lambda x: re.sub(r"\s+", " ", x).strip())
+                 paper['Abstract_pre'] = paper['Abstract_pre'].str.replace('[\u2018\u2019\u201c\u201d]', '', regex=True)
+             if rem_copyright:
+                 paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: re.sub('©.*', '', x))
+
+             #===stopword removal===
+             stop = stopwords.words('english')
+             paper['Abstract_stop'] = paper['Abstract_pre'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop]))
+
+             #===lemmatize===
+             lemmatizer = WordNetLemmatizer()
+
+             @st.cache_resource(ttl=3600)
+             def lemmatize_words(text):
+                 words = text.split()
+                 words = [lemmatizer.lemmatize(word) for word in words]
+                 return ' '.join(words)
+             paper['Abstract_lem'] = paper['Abstract_stop'].apply(lemmatize_words)
+
+             words_rmv = [word.strip() for word in words_to_remove.split(";")]
+             remove_dict = {word: None for word in words_rmv}
+
+             @st.cache_resource(ttl=3600)
+             def remove_words(text):
+                 words = text.split()
+                 cleaned_words = [word for word in words if word not in remove_dict]
+                 return ' '.join(cleaned_words)
+             paper['Abstract_lem'] = paper['Abstract_lem'].map(remove_words)
+
+             topic_abs = paper.Abstract_lem.values.tolist()
+             return topic_abs, paper
+
+         topic_abs, paper = clean_csv(extype)
+
+         if st.button("Submit", on_click=reset_all):
+             num_topic = num_cho
+
+         if method == 'BERTopic':
+             st.info('BERTopic is an expensive process when dealing with a large volume of text with our existing resources. Please kindly wait until the visualization appears.', icon="ℹ️")
+
+         #===topic===
+         if method == 'Choose...':
+             st.write('')
+
+         elif method == 'pyLDA':
+             tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
+
+             with tab1:
+                 #===visualization===
+                 @st.cache_data(ttl=3600, show_spinner=False)
+                 def pylda(extype):
+                     topic_abs_LDA = [t.split(' ') for t in topic_abs]
+
+                     bigram = Phrases(topic_abs_LDA, min_count=xgram, threshold=opt_threshold)
+                     trigram = Phrases(bigram[topic_abs_LDA], threshold=opt_threshold)
+                     bigram_mod = Phraser(bigram)
+                     trigram_mod = Phraser(trigram)
+
+                     topic_abs_LDA = [trigram_mod[bigram_mod[doc]] for doc in topic_abs_LDA]
+
+                     id2word = Dictionary(topic_abs_LDA)
+                     corpus = [id2word.doc2bow(text) for text in topic_abs_LDA]
+                     #===LDA===
+                     lda_model = LdaModel(corpus=corpus,
+                                          id2word=id2word,
+                                          num_topics=num_topic,
+                                          random_state=py_random_state,
+                                          chunksize=py_chunksize,
+                                          alpha='auto',
+                                          per_word_topics=False)
+                     pprint(lda_model.print_topics())
+                     doc_lda = lda_model[corpus]
+                     topics = lda_model.show_topics(num_words=30, formatted=False)
+
+                     #===visualization===
+                     coherence_model_lda = CoherenceModel(model=lda_model, texts=topic_abs_LDA, dictionary=id2word, coherence='c_v')
+                     coherence_lda = coherence_model_lda.get_coherence()
+                     vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
+                     py_lda_vis_html = pyLDAvis.prepared_data_to_html(vis)
+                     return py_lda_vis_html, coherence_lda, vis, topics
+
+                 with st.spinner('Performing computations. Please wait ...'):
+                     try:
+                         py_lda_vis_html, coherence_lda, vis, topics = pylda(extype)
+                         st.write('Coherence score: ', coherence_lda)
+                         components.html(py_lda_vis_html, width=1500, height=800)
+                         st.markdown('Copyright (c) 2015, Ben Mabey. https://github.com/bmabey/pyLDAvis')
+
+                         @st.cache_data(ttl=3600, show_spinner=False)
+                         def img_lda(vis):
+                             pyLDAvis.save_html(vis, 'output.html')
+                             hti = Html2Image()
+                             hti.browser.flags = ['--default-background-color=ffffff', '--hide-scrollbars']
+                             hti.browser.use_new_headless = None
+                             css = "body {background: white;}"
+                             hti.screenshot(
+                                 other_file='output.html', css_str=css, size=(1500, 800),
+                                 save_as='ldavis_img.png'
+                             )
+
+                         img_lda(vis)
+
+                         d1, d2 = st.columns(2)
+                         with open("ldavis_img.png", "rb") as file:
+                             btn = d1.download_button(
+                                 label="Download image",
+                                 data=file,
+                                 file_name="ldavis_img.png",
+                                 mime="image/png"
+                             )
+
+                         #===download results===
+                         resultf = pd.DataFrame(topics)
+                         # formatting
+                         resultf = resultf.transpose()
+                         resultf = resultf.drop([0])
+                         resultf = resultf.explode(list(range(len(resultf.columns))), ignore_index=False)
+
+                         resultcsv = resultf.to_csv().encode("utf-8")
+                         d2.download_button(
+                             label="Download Results",
+                             data=resultcsv,
+                             file_name="results.csv",
+                             mime="text/csv",
+                             on_click="ignore")
+
+                     except NameError as f:
+                         st.warning('🖱️ Please click Submit')
+
+             with tab2:
+                 st.markdown('**Sievert, C., & Shirley, K. (2014). LDAvis: A method for visualizing and interpreting topics. Proceedings of the Workshop on Interactive Language Learning, Visualization, and Interfaces.** https://doi.org/10.3115/v1/w14-3110')
+
+             with tab3:
+                 st.markdown('**Chen, X., & Wang, H. (2019, January). Automated chat transcript analysis using topic modeling for library reference services. Proceedings of the Association for Information Science and Technology, 56(1), 368–371.** https://doi.org/10.1002/pra2.31')
+                 st.markdown('**Joo, S., Ingram, E., & Cahill, M. (2021, December 15). Exploring Topics and Genres in Storytime Books: A Text Mining Approach. Evidence Based Library and Information Practice, 16(4), 41–62.** https://doi.org/10.18438/eblip29963')
+                 st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Topic Modeling. Text Mining for Information Professionals, 105–137.** https://doi.org/10.1007/978-3-030-85085-2_4')
+                 st.markdown('**Lamba, M., & Madhusudhan, M. (2019, June 7). Mapping of topics in DESIDOC Journal of Library and Information Technology, India: a study. Scientometrics, 120(2), 477–505.** https://doi.org/10.1007/s11192-019-03137-5')
+
+             with tab4:
+                 st.subheader(':blue[pyLDA]', anchor=False)
+                 st.button('Download image')
+                 st.text("Click the Download image button.")
+                 st.divider()
+                 st.subheader(':blue[Downloading CSV Results]', anchor=False)
+                 st.button("Download Results")
+                 st.text("Click the Download Results button at the bottom of the page.")
+
+         #===Biterm===
+         elif method == 'Biterm':
+
+             #===optimize Biterm===
+             @st.cache_data(ttl=3600, show_spinner=False)
+             def biterm_topic(extype):
+                 tokenized_abs = [t.split(' ') for t in topic_abs]
+
+                 bigram = Phrases(tokenized_abs, min_count=xgram, threshold=opt_threshold)
+                 trigram = Phrases(bigram[tokenized_abs], threshold=opt_threshold)
+                 bigram_mod = Phraser(bigram)
+                 trigram_mod = Phraser(trigram)
+
+                 topic_abs_ngram = [trigram_mod[bigram_mod[doc]] for doc in tokenized_abs]
+                 topic_abs_str = [' '.join(doc) for doc in topic_abs_ngram]
+
+                 X, vocabulary, vocab_dict = btm.get_words_freqs(topic_abs_str)
+                 tf = np.array(X.sum(axis=0)).ravel()
+                 docs_vec = btm.get_vectorized_docs(topic_abs, vocabulary)
+                 docs_lens = list(map(len, docs_vec))
+                 biterms = btm.get_biterms(docs_vec)
+
+                 model = btm.BTM(X, vocabulary, seed=btm_seed, T=num_topic, M=20, alpha=50/8, beta=0.01)
+                 model.fit(biterms, iterations=btm_iterations)
+
+                 p_zd = model.transform(docs_vec)
+                 coherence = model.coherence_
+                 phi = tmp.get_phi(model)
+                 topics_coords = tmp.prepare_coords(model)
+                 totaltop = topics_coords.label.values.tolist()
+                 perplexity = model.perplexity_
+                 top_topics = model.df_words_topics_
+
+                 return topics_coords, phi, totaltop, perplexity, top_topics
+
+             tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
+             with tab1:
+                 try:
+                     with st.spinner('Performing computations. Please wait ...'):
+                         topics_coords, phi, totaltop, perplexity, top_topics = biterm_topic(extype)
+                         col1, col2 = st.columns([4, 6])
+
+                         @st.cache_data(ttl=3600)
+                         def biterm_map(extype):
+                             btmvis_coords = tmp.plot_scatter_topics(topics_coords, size_col='size', label_col='label', topic=numvis)
+                             return btmvis_coords
+
+                         @st.cache_data(ttl=3600)
+                         def biterm_bar(extype):
+                             terms_probs = tmp.calc_terms_probs_ratio(phi, topic=numvis, lambda_=1)
+                             btmvis_probs = tmp.plot_terms(terms_probs, font_size=12)
+                             return btmvis_probs
+
+                         with col1:
+                             st.write('Perplexity score: ', perplexity)
+                             st.write('')
+                             numvis = st.selectbox(
+                                 'Choose topic',
+                                 (totaltop), on_change=reset_biterm)
+                             btmvis_coords = biterm_map(extype)
+                             st.altair_chart(btmvis_coords)
+                         with col2:
+                             btmvis_probs = biterm_bar(extype)
+                             st.altair_chart(btmvis_probs, use_container_width=True)
+
+                         #===download results===
+                         resultcsv = top_topics.to_csv().encode("utf-8")
+                         st.download_button(label="Download Results", data=resultcsv, file_name="results.csv", mime="text/csv", on_click="ignore")
+
+                 except ValueError as g:
+                     st.error('🙇‍♂️ Please raise the number of topics and click Submit')
+
+                 except NameError as f:
+                     st.warning('🖱️ Please click Submit')
+
+             with tab2:
+                 st.markdown('**Yan, X., Guo, J., Lan, Y., & Cheng, X. (2013, May 13). A biterm topic model for short texts. Proceedings of the 22nd International Conference on World Wide Web.** https://doi.org/10.1145/2488388.2488514')
+             with tab3:
+                 st.markdown('**Cai, M., Shah, N., Li, J., Chen, W. H., Cuomo, R. E., Obradovich, N., & Mackey, T. K. (2020, August 26). Identification and characterization of tweets related to the 2015 Indiana HIV outbreak: A retrospective infoveillance study. PLOS ONE, 15(8), e0235150.** https://doi.org/10.1371/journal.pone.0235150')
+                 st.markdown('**Chen, Y., Dong, T., Ban, Q., & Li, Y. (2021). What Concerns Consumers about Hypertension? A Comparison between the Online Health Community and the Q&A Forum. International Journal of Computational Intelligence Systems, 14(1), 734.** https://doi.org/10.2991/ijcis.d.210203.002')
+                 st.markdown('**George, Crissandra J. (2022). Ambiguous Appalachianness: A Linguistic and Perceptual Investigation into ARC-Labeled Pennsylvania Counties. Theses and Dissertations--Linguistics. 48.** https://doi.org/10.13023/etd.2022.217')
+                 st.markdown('**Li, J., Chen, W. H., Xu, Q., Shah, N., Kohler, J. C., & Mackey, T. K. (2020). Detection of self-reported experiences with corruption on Twitter using unsupervised machine learning. Social Sciences & Humanities Open, 2(1), 100060.** https://doi.org/10.1016/j.ssaho.2020.100060')
+             with tab4:
+                 st.subheader(':blue[Biterm]', anchor=False)
+                 st.text("Click the three dots at the top right, then select the desired format.")
+                 st.markdown("![Downloading visualization](https://raw.githubusercontent.com/faizhalas/library-tools/main/images/download_biterm.jpg)")
+                 st.divider()
+                 st.subheader(':blue[Downloading CSV Results]', anchor=False)
+                 st.button("Download Results")
+                 st.text("Click the Download Results button at the bottom of the page.")
+
+
+         #===BERTopic===
+         elif method == 'BERTopic':
+             @st.cache_resource(ttl=3600, show_spinner=False)
+             #@st.cache_data(ttl=3600, show_spinner=False)
+             def bertopic_vis(extype):
+                 umap_model = UMAP(n_neighbors=bert_n_neighbors, n_components=bert_n_components,
+                                   min_dist=0.0, metric='cosine', random_state=bert_random_state)
+                 cluster_model = KMeans(n_clusters=num_topic)
+                 if bert_embedding_model == 'all-MiniLM-L6-v2':
+                     model = SentenceTransformer('all-MiniLM-L6-v2')
+                     lang = 'en'
+                     embeddings = model.encode(topic_abs, show_progress_bar=True)
+
+                 elif bert_embedding_model == 'en_core_web_sm':
+                     nlp = en_core_web_sm.load(exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
+                     model = nlp
+                     lang = 'en'
+                     embeddings = np.array([nlp(text).vector for text in topic_abs])
+
+                 elif bert_embedding_model == 'paraphrase-multilingual-MiniLM-L12-v2':
+                     model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
+                     lang = 'multilingual'
+                     embeddings = model.encode(topic_abs, show_progress_bar=True)
+
+                 representation_model = ""
+
+                 if fine_tuning:
+                     keybert = KeyBERTInspired()
+                     mmr = MaximalMarginalRelevance(diversity=0.3)
+                     representation_model = {
+                         "KeyBERT": keybert,
+                         "MMR": mmr,
+                     }
+                     if topic_labelling:
+                         if llm_model == "OpenAI/gpt-4o":
+                             client = openai.OpenAI(api_key=api_key)
+                             representation_model = {
+                                 "KeyBERT": keybert,
+                                 "MMR": mmr,
+                                 "test": OpenAI(client, model="gpt-4o-mini", delay_in_seconds=10)
+                             }
+                         elif llm_model == "Google/Flan-t5":
+                             gen = pipeline("text2text-generation", model="google/flan-t5-base")
+                             clientmod = TextGeneration(gen)
+                             representation_model = {
+                                 "KeyBERT": keybert,
+                                 "MMR": mmr,
+                                 "test": clientmod
+                             }
+                         elif llm_model == "OpenAI/gpt-oss":
+                             gen = pipeline("text-generation",
+                                            model="unsloth/gpt-oss-20b-BF16",
+                                            torch_dtype="auto",
+                                            device_map="auto",
+                                            )
+                             clientmod = TextGeneration(gen)
+                             representation_model = {
+                                 "KeyBERT": keybert,
+                                 "MMR": mmr,
+                                 "test": clientmod
+                             }
+
+                 vectorizer_model = CountVectorizer(ngram_range=(1, xgram), stop_words='english')
+                 topic_model = BERTopic(representation_model=representation_model, embedding_model=model, hdbscan_model=cluster_model, language=lang, umap_model=umap_model, vectorizer_model=vectorizer_model, top_n_words=bert_top_n_words)
+                 topics, probs = topic_model.fit_transform(topic_abs, embeddings=embeddings)
+
+                 if fine_tuning and topic_labelling:
+                     generated_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["test"].values()]
+                     topic_model.set_topic_labels(generated_labels)
+
+                 return topic_model, topics, probs, embeddings
+
+             @st.cache_resource(ttl=3600, show_spinner=False)
+             def Vis_Topics(extype):
+                 fig1 = topic_model.visualize_topics()
+                 return fig1
+
+             @st.cache_resource(ttl=3600, show_spinner=False)
+             def Vis_Documents(extype):
+                 fig2 = topic_model.visualize_document_datamap(topic_abs, embeddings=embeddings, custom_labels=True)
+                 return fig2
+
+             @st.cache_resource(ttl=3600, show_spinner=False)
+             def Vis_Hierarchy(extype):
+                 fig3 = topic_model.visualize_hierarchy(top_n_topics=num_topic, custom_labels=True)
+                 return fig3
+
+             @st.cache_resource(ttl=3600, show_spinner=False)
+             def Vis_Heatmap(extype):
+                 global topic_model
+                 fig4 = topic_model.visualize_heatmap(n_clusters=num_topic-1, width=1000, height=1000, custom_labels=True)
+                 return fig4
+
+             @st.cache_resource(ttl=3600, show_spinner=False)
+             def Vis_Barchart(extype):
+                 fig5 = topic_model.visualize_barchart(top_n_topics=num_topic, custom_labels=True)
+                 return fig5
+
+             tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
+             with tab1:
+                 try:
+                     with st.spinner('Performing computations. Please wait ...'):
+
+                         topic_model, topics, probs, embeddings = bertopic_vis(extype)
+                         time.sleep(.5)
+                         st.toast('Visualize Topics', icon='🏃')
+                         fig1 = Vis_Topics(extype)
+
+                         time.sleep(.5)
+                         st.toast('Visualize Documents', icon='🏃')
+                         fig2 = Vis_Documents(extype)
+
+                         time.sleep(.5)
+                         st.toast('Visualize Document Hierarchy', icon='🏃')
+                         fig3 = Vis_Hierarchy(extype)
+
+                         time.sleep(.5)
+                         st.toast('Visualize Topic Similarity', icon='🏃')
+                         fig4 = Vis_Heatmap(extype)
+
+                         time.sleep(.5)
+                         st.toast('Visualize Terms', icon='🏃')
+                         fig5 = Vis_Barchart(extype)
+
+                         bertab1, bertab2, bertab3, bertab4, bertab5 = st.tabs(["Visualize Topics", "Visualize Terms", "Visualize Documents",
+                                                                                "Visualize Document Hierarchy", "Visualize Topic Similarity"])
+
+                         with bertab1:
+                             st.plotly_chart(fig1, use_container_width=True)
+                         with bertab2:
+                             st.plotly_chart(fig5, use_container_width=True)
+                         with bertab3:
+                             st.plotly_chart(fig2, use_container_width=True)
+                         with bertab4:
+                             st.plotly_chart(fig3, use_container_width=True)
+                         with bertab5:
+                             st.plotly_chart(fig4, use_container_width=True)
+
+                         #===download results===
+                         results = topic_model.get_topic_info()
+                         resultf = pd.DataFrame(results)
+                         resultcsv = resultf.to_csv().encode("utf-8")
+                         st.download_button(
+                             label="Download Results",
+                             data=resultcsv,
+                             file_name="results.csv",
+                             mime="text/csv",
+                             on_click="ignore",
+                         )
+
+                 except ValueError as e:
+                     st.write(e)
+                     st.error('🙇‍♂️ Please raise the number of topics and click Submit')
+
+                 except NameError as e:
+                     st.warning('🖱️ Please click Submit')
+                     st.write(e)
+
+             with tab2:
+                 st.markdown('**Grootendorst, M. (2022). BERTopic: Neural topic modeling with a class-based TF-IDF procedure. arXiv preprint arXiv:2203.05794.** https://doi.org/10.48550/arXiv.2203.05794')
+
+             with tab3:
+                 st.markdown('**Jeet Rawat, A., Ghildiyal, S., & Dixit, A. K. (2022, December 1). Topic modelling of legal documents using NLP and bidirectional encoder representations from transformers. Indonesian Journal of Electrical Engineering and Computer Science, 28(3), 1749.** https://doi.org/10.11591/ijeecs.v28.i3.pp1749-1755')
+                 st.markdown('**Yao, L. F., Ferawati, K., Liew, K., Wakamiya, S., & Aramaki, E. (2023, April 20). Disruptions in the Cystic Fibrosis Community’s Experiences and Concerns During the COVID-19 Pandemic: Topic Modeling and Time Series Analysis of Reddit Comments. Journal of Medical Internet Research, 25, e45249.** https://doi.org/10.2196/45249')
+
+             with tab4:
+                 st.divider()
+                 st.subheader(':blue[BERTopic]', anchor=False)
+                 st.text("Click the camera icon on the top right menu.")
+                 st.markdown("![Downloading visualization](https://raw.githubusercontent.com/faizhalas/library-tools/main/images/download_bertopic.jpg)")
+                 st.divider()
+                 st.subheader(':blue[Downloading CSV Results]', anchor=False)
+                 st.button("Download Results")
+                 st.text("Click the Download Results button at the bottom of the page.")
+
+     except Exception as e:
+         st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
+         st.write(e)
+         st.stop()
+ >>>>>>> e52d4a30c18f770eb968980667fa8e5a7b287580
requirements.txt CHANGED
@@ -38,3 +38,7 @@ git+https://github.com/faizhalas/shifterator
  datamapplot==0.4.2
  altair-nx
  rouge_score
+ pytextrank
+ openai
+ transformers
+ accelerate