awacke1 committed
Commit 33a7c66 · 1 Parent(s): 06e9888

Update app.py

Files changed (1):
  1. app.py +1 -41

app.py CHANGED
@@ -9,29 +9,14 @@ from hdbscan import HDBSCAN
 from sklearn.feature_extraction.text import CountVectorizer
 
 st.set_page_config(page_title="HF-BERTopic")
-#st.title("HF-BERTopic A front end for BERTopic")
-#st.caption("By Allen Roush")
-#st.caption("github: https://github.com/Hellisotherpeople")
-#st.caption("Linkedin: https://www.linkedin.com/in/allen-roush-27721011b/")
-#st.image("https://raw.githubusercontent.com/MaartenGr/BERTopic/master/images/logo.png", width = 380)
-#st.caption("By Maarten Grootendorst")
-#st.caption("github: https://github.com/MaartenGr/BERTopic")
-#st.caption("Linkedin: https://www.linkedin.com/in/mgrootendorst/")
-#st.image("https://maartengr.github.io/BERTopic/img/algorithm.png")
-
-
 form = st.sidebar.form("Main Settings")
-
 form.header("Main Settings")
-#form.image("https://maartengr.github.io/BERTopic/img/algorithm.png", width = 270)
-
-
 dataset_name = form.text_area("Enter the name of the huggingface dataset to do analysis of:", value = "Hellisotherpeople/DebateSum")
 dataset_name_2 = form.text_area("Enter the name of the config for the dataset if it has one", value = "")
 
 split_name = form.text_area("Enter the name of the split of the dataset that you want to use", value = "train")
 
-number_of_records = form.number_input("Enter the number of documents that you want to analyze from the dataset", value = 200)
+number_of_records = form.number_input("Enter the number of documents that you want to analyze from the dataset", value = 10)
 
 column_name = form.text_area("Enter the name of the column that we are doing analysis on (the X value)", value = "Full-Document")
 
@@ -50,7 +35,6 @@ form.caption("Leave this if you want HDBScan to choose the number of topics (clu
 number_of_topics = form.number_input("Enter the number of topics to use if doing Manual topic reduction", value = 4)
 use_random_seed = form.checkbox("Do you want to make the results reproducible? This significantly slows down BERTopic", value = False)
 
-
 form.header("CounterVectorizer Settings")
 cv_lowercase = form.checkbox("Shall we automatically lowercase the text?", value = True)
 cv_ngram_min = form.number_input("What's the lower boundary of the range of n-values for different word n-grams or char n-grams to be extracted", value = 1)
@@ -64,8 +48,6 @@ cv_min_df = form.number_input("Ignore terms that have a document frequency stric
 form.caption("This parameter represents a proportion of documents if a float is given")
 cv_max_features = form.number_input("Enter the maximum number of n-grams to be featurized", value = 100000)
 
-
-
 form.header("HDBScan Settings")
 hdbscan_min_cluster_size = form.number_input("Enter the number of points necessary to form a new cluster", value = 3)
 form.caption("Set it to the smallest size grouping that you wish to consider a cluster. This is the most impactful setting for HDBscan")
@@ -73,8 +55,6 @@ hdbscan_min_samples = form.number_input("Enter the minimum number of points to b
 form.caption("The larger the value of min_samples you provide, the more conservative the clustering – more points will be declared as noise, and clusters will be restricted to progressively more dense areas.")
 hdbscan_metric = form.text_area("Enter the name of the metric used for computing distances for HDBscan. Common metrics for NLP are euclidean and cosine. Cosine is not supported by HDBscan", value = "euclidean")
 
-
-
 form.header("UMAP Settings")
 umap_n_neighbors = form.number_input("Enter the number of neighbors used by UMAP for generating its manifold", value = 15)
 form.caption("This parameter controls how UMAP balances local versus global structure in the data. It does this by constraining the size of the local neighborhood UMAP will look at when attempting to learn the manifold structure of the data. This means that low values of n_neighbors will force UMAP to concentrate on very local structure (potentially to the detriment of the big picture), while large values will push UMAP to look at larger neighborhoods of each point when estimating the manifold structure of the data, losing fine detail structure for the sake of getting the broader of the data.")
@@ -86,18 +66,8 @@ form.caption("UMAP is used in BERTopic primarily to allow the highly effective c
 umap_metric = form.text_area("Enter the name of the metric used for computing distances. Common metrics for NLP are euclidean and cosine", value = "cosine")
 form.caption("A complete list of all available metrics supported by UMAP can be found here: https://umap-learn.readthedocs.io/en/latest/parameters.html#metric")
 
-
-
-
 form.form_submit_button("Submit")
 
-
-
-
-
-
-
-
 @st.cache
 def load_and_process_data(path, name, streaming, split_name, number_of_records):
     dataset = load_dataset(path = path, name = name, streaming=streaming)
@@ -106,8 +76,6 @@ def load_and_process_data(path, name, streaming, split_name, number_of_records):
     df = pd.DataFrame.from_dict(dataset_head)
     return df
 
-
-
 hdbscan_model = HDBSCAN(min_cluster_size=hdbscan_min_cluster_size, min_samples = hdbscan_min_samples, metric=hdbscan_metric, prediction_data=True)
 if use_random_seed:
     umap_model = UMAP(n_neighbors=umap_n_neighbors, n_components=umap_n_components, min_dist=umap_min_dist, metric=umap_metric, random_state = 42)
@@ -115,8 +83,6 @@ else:
     umap_model = UMAP(n_neighbors=umap_n_neighbors, n_components=umap_n_components, min_dist=umap_min_dist, metric=umap_metric)
 vectorizer_model = CountVectorizer(lowercase = cv_lowercase, ngram_range=(cv_ngram_min, cv_ngram_max), analyzer=cv_analyzer, max_df=cv_max_df, min_df=cv_min_df, stop_words="english")
 
-
-
 @st.cache(allow_output_mutation=True)
 def load_model(model_name, hdbscan_model=hdbscan_model, umap_model=umap_model, vectorizer_model=vectorizer_model, use_topic_reduction = use_topic_reduction, number_of_topics = number_of_topics):
     sentence_model = SentenceTransformer(model_name)
@@ -133,17 +99,12 @@ def fit_transform(model, docs):
     topics, probs = model.fit_transform(docs)
     return topics, probs
 
-
 model = load_model(model_name=model_name)
-
 df = load_and_process_data(dataset_name, dataset_name_2, True, split_name, number_of_records)
-
 X = df[column_name]
-
 st.header("Original Dataset")
 st.write(df)
 
-
 topics, probs = fit_transform(model, X)
 
 st.header("Topic assignment for each example")
@@ -170,7 +131,6 @@ if labels:
     st.header("Topics per class")
     topics_per_class = model.topics_per_class(X, classes=y)
     st.plotly_chart(model.visualize_topics_per_class(topics_per_class))
-#TODO:Each of these need there own options!
 
 st.header("Visualizations")
 
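The one behavioral change in this commit is the number_of_records default, lowered from 200 to 10; it caps how many rows load_and_process_data pulls from the streamed dataset. The diff elides the body between the load_dataset call and the pd.DataFrame conversion, so the sketch below is an assumption about that pattern: the helper name and the IterableDataset.take call are illustrative, not the app's exact code.

import pandas as pd
from datasets import load_dataset

def head_of_streamed_dataset(path, split_name, number_of_records):
    # streaming=True yields records lazily instead of downloading the whole dataset
    dataset = load_dataset(path=path, streaming=True)
    # take() stops the stream after the first number_of_records examples
    head = dataset[split_name].take(number_of_records)
    # materialize just those rows into a DataFrame for display and analysis
    return pd.DataFrame(list(head))

df = head_of_streamed_dataset("Hellisotherpeople/DebateSum", "train", 10)
print(len(df))  # at most 10 rows under the new default

Because the dataset is streamed, only those first examples ever leave the Hub, which is why a smaller default makes the Space start noticeably faster.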
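More broadly, the components touched in this file fit together as a standard BERTopic pipeline: a SentenceTransformer embeds the documents, UMAP reduces the embeddings, HDBSCAN clusters them, and a CountVectorizer extracts the topic words. A minimal, self-contained sketch follows; the exact BERTopic(...) call, the n_components/min_dist/min_samples defaults, and the embedding model name are not visible in this diff, so those values are assumptions.

from bertopic import BERTopic
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

# Sidebar defaults from the diff where visible; n_components, min_dist,
# and min_samples are assumed values, not shown above.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=3, min_samples=1, metric="euclidean", prediction_data=True)
vectorizer_model = CountVectorizer(lowercase=True, ngram_range=(1, 1), stop_words="english")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # hypothetical model choice

topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
)

# Any list of strings works; 20 newsgroups stands in for the app's df[column_name].
docs = fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes")).data[:500]
topics, probs = topic_model.fit_transform(docs)

Fixing random_state in UMAP is what the app's "reproducible results" checkbox toggles; it disables UMAP's parallelism in exchange for deterministic output, which is why the form warns that it significantly slows BERTopic down.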