Spaces:
Runtime error
Update app.py
app.py CHANGED
@@ -9,29 +9,14 @@ from hdbscan import HDBSCAN
 from sklearn.feature_extraction.text import CountVectorizer
 
 st.set_page_config(page_title="HF-BERTopic")
-#st.title("HF-BERTopic A front end for BERTopic")
-#st.caption("By Allen Roush")
-#st.caption("github: https://github.com/Hellisotherpeople")
-#st.caption("Linkedin: https://www.linkedin.com/in/allen-roush-27721011b/")
-#st.image("https://raw.githubusercontent.com/MaartenGr/BERTopic/master/images/logo.png", width = 380)
-#st.caption("By Maarten Grootendorst")
-#st.caption("github: https://github.com/MaartenGr/BERTopic")
-#st.caption("Linkedin: https://www.linkedin.com/in/mgrootendorst/")
-#st.image("https://maartengr.github.io/BERTopic/img/algorithm.png")
-
-
 form = st.sidebar.form("Main Settings")
-
 form.header("Main Settings")
-#form.image("https://maartengr.github.io/BERTopic/img/algorithm.png", width = 270)
-
-
 dataset_name = form.text_area("Enter the name of the huggingface dataset to do analysis of:", value = "Hellisotherpeople/DebateSum")
 dataset_name_2 = form.text_area("Enter the name of the config for the dataset if it has one", value = "")
 
 split_name = form.text_area("Enter the name of the split of the dataset that you want to use", value = "train")
 
-number_of_records = form.number_input("Enter the number of documents that you want to analyze from the dataset", value =
+number_of_records = form.number_input("Enter the number of documents that you want to analyze from the dataset", value = 10)
 
 column_name = form.text_area("Enter the name of the column that we are doing analysis on (the X value)", value = "Full-Document")
 
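This hunk contains the actual fix: the old line 34 stopped mid-call at "value =" with no argument or closing parenthesis, a syntax error of the kind that produces the "Runtime error" status shown above, since the Space cannot even import the file. The new line completes the call with a default of 10. A minimal sketch of a pre-push check, assuming app.py is the Space's entry point:

# Minimal sketch: verify the file parses before pushing to the Space.
# py_compile is in the Python standard library.
import py_compile

py_compile.compile("app.py", doraise=True)  # raises py_compile.PyCompileError on a syntax error
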
@@ -50,7 +35,6 @@ form.caption("Leave this if you want HDBScan to choose the number of topics (clu
 number_of_topics = form.number_input("Enter the number of topics to use if doing Manual topic reduction", value = 4)
 use_random_seed = form.checkbox("Do you want to make the results reproducible? This significantly slows down BERTopic", value = False)
 
-
 form.header("CounterVectorizer Settings")
 cv_lowercase = form.checkbox("Shall we automatically lowercase the text?", value = True)
 cv_ngram_min = form.number_input("What's the lower boundary of the range of n-values for different word n-grams or char n-grams to be extracted", value = 1)
@@ -64,8 +48,6 @@ cv_min_df = form.number_input("Ignore terms that have a document frequency stric
 form.caption("This parameter represents a proportion of documents if a float is given")
 cv_max_features = form.number_input("Enter the maximum number of n-grams to be featurized", value = 100000)
 
-
-
 form.header("HDBScan Settings")
 hdbscan_min_cluster_size = form.number_input("Enter the number of points necessary to form a new cluster", value = 3)
 form.caption("Set it to the smallest size grouping that you wish to consider a cluster. This is the most impactful setting for HDBscan")
@@ -73,8 +55,6 @@ hdbscan_min_samples = form.number_input("Enter the minimum number of points to b
 form.caption("The larger the value of min_samples you provide, the more conservative the clustering – more points will be declared as noise, and clusters will be restricted to progressively more dense areas.")
 hdbscan_metric = form.text_area("Enter the name of the metric used for computing distances for HDBscan. Common metrics for NLP are euclidean and cosine. Cosine is not supported by HDBscan", value = "euclidean")
 
-
-
 form.header("UMAP Settings")
 umap_n_neighbors = form.number_input("Enter the number of neighbors used by UMAP for generating its manifold", value = 15)
 form.caption("This parameter controls how UMAP balances local versus global structure in the data. It does this by constraining the size of the local neighborhood UMAP will look at when attempting to learn the manifold structure of the data. This means that low values of n_neighbors will force UMAP to concentrate on very local structure (potentially to the detriment of the big picture), while large values will push UMAP to look at larger neighborhoods of each point when estimating the manifold structure of the data, losing fine detail structure for the sake of getting the broader of the data.")
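On the "Cosine is not supported by HDBscan" caption: a common workaround, not implemented in this app, is to L2-normalize the vectors and cluster with euclidean, since for unit vectors squared euclidean distance equals 2 - 2 times the cosine similarity, so the two metrics order pairs identically. A sketch:

# Sketch of the normalize-then-euclidean workaround (not part of this commit).
import numpy as np

vecs = np.random.rand(100, 5)                             # stand-in for reduced embeddings
unit = vecs / np.linalg.norm(vecs, axis=1, keepdims=True)
# For unit vectors u and v: ||u - v||^2 = 2 - 2 * (u @ v),
# so euclidean HDBSCAN on `unit` ranks pairs exactly as cosine would.
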
@@ -86,18 +66,8 @@ form.caption("UMAP is used in BERTopic primarily to allow the highly effective c
 umap_metric = form.text_area("Enter the name of the metric used for computing distances. Common metrics for NLP are euclidean and cosine", value = "cosine")
 form.caption("A complete list of all available metrics supported by UMAP can be found here: https://umap-learn.readthedocs.io/en/latest/parameters.html#metric")
 
-
-
-
 form.form_submit_button("Submit")
 
-
-
-
-
-
-
-
 @st.cache
 def load_and_process_data(path, name, streaming, split_name, number_of_records):
     dataset = load_dataset(path = path, name = name, streaming=streaming)
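The middle of load_and_process_data falls outside this hunk. Judging from the visible dataset_head name and the streaming=True call site further down, the hidden lines presumably read the first number_of_records examples off the streamed split. A sketch of that pattern, where the .take() call and the list() materialization are my assumptions, not the file's confirmed code:

# Sketch of the streaming head-read this function appears to perform.
import pandas as pd
from datasets import load_dataset

dataset = load_dataset(path="Hellisotherpeople/DebateSum", streaming=True)
dataset_head = dataset["train"].take(10)   # lazily yields the first 10 records
df = pd.DataFrame(list(dataset_head))      # materialize into a DataFrame
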
@@ -106,8 +76,6 @@ def load_and_process_data(path, name, streaming, split_name, number_of_records):
     df = pd.DataFrame.from_dict(dataset_head)
     return df
 
-
-
 hdbscan_model = HDBSCAN(min_cluster_size=hdbscan_min_cluster_size, min_samples = hdbscan_min_samples, metric=hdbscan_metric, prediction_data=True)
 if use_random_seed:
     umap_model = UMAP(n_neighbors=umap_n_neighbors, n_components=umap_n_components, min_dist=umap_min_dist, metric=umap_metric, random_state = 42)
@@ -115,8 +83,6 @@ else:
     umap_model = UMAP(n_neighbors=umap_n_neighbors, n_components=umap_n_components, min_dist=umap_min_dist, metric=umap_metric)
 vectorizer_model = CountVectorizer(lowercase = cv_lowercase, ngram_range=(cv_ngram_min, cv_ngram_max), analyzer=cv_analyzer, max_df=cv_max_df, min_df=cv_min_df, stop_words="english")
 
-
-
 @st.cache(allow_output_mutation=True)
 def load_model(model_name, hdbscan_model=hdbscan_model, umap_model=umap_model, vectorizer_model=vectorizer_model, use_topic_reduction = use_topic_reduction, number_of_topics = number_of_topics):
     sentence_model = SentenceTransformer(model_name)
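The body of load_model beyond the SentenceTransformer line is outside this hunk. Given the parameters it receives, a plausible assembly is the one below; this is a sketch of the usual BERTopic wiring, not the file's actual code, and the nr_topics handling is a guess at the hidden topic-reduction logic:

# Sketch, assuming load_model wires the sidebar-configured components into
# BERTopic in the standard way.
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

def load_model(model_name, hdbscan_model, umap_model, vectorizer_model,
               use_topic_reduction, number_of_topics):
    sentence_model = SentenceTransformer(model_name)
    return BERTopic(
        embedding_model=sentence_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        nr_topics=number_of_topics,  # assumption: only applied for manual reduction
    )
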
@@ -133,17 +99,12 @@ def fit_transform(model, docs):
     topics, probs = model.fit_transform(docs)
     return topics, probs
 
-
 model = load_model(model_name=model_name)
-
 df = load_and_process_data(dataset_name, dataset_name_2, True, split_name, number_of_records)
-
 X = df[column_name]
-
 st.header("Original Dataset")
 st.write(df)
 
-
 topics, probs = fit_transform(model, X)
 
 st.header("Topic assignment for each example")
@@ -170,7 +131,6 @@ if labels:
     st.header("Topics per class")
     topics_per_class = model.topics_per_class(X, classes=y)
     st.plotly_chart(model.visualize_topics_per_class(topics_per_class))
-#TODO:Each of these need there own options!
 
 st.header("Visualizations")
 
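A maintenance note beyond the scope of this commit: both cached functions use st.cache, which newer Streamlit releases deprecate in favor of st.cache_data (for serializable values such as the DataFrame) and st.cache_resource (for live objects like the BERTopic model, replacing allow_output_mutation=True). A sketch of how the decorators would map if the Space's Streamlit version is bumped; the function bodies here are stand-ins, not the app's code:

# Sketch of the decorator migration.
import streamlit as st
import pandas as pd

@st.cache_data        # was: @st.cache
def load_data_stub(n: int) -> pd.DataFrame:
    return pd.DataFrame({"x": range(n)})

@st.cache_resource    # was: @st.cache(allow_output_mutation=True)
def load_model_stub():
    return object()   # stand-in for the BERTopic model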