crossroderick committed
Commit 9d99321 · 1 Parent(s): 724dcb9

Major clustering update

app.py CHANGED
@@ -125,13 +125,13 @@ def process_file(file: Any) -> Tuple[List[Tuple[str, int]], Any, Any]:
     vector_db.add(embeddings, metadata)
 
     # Topic modelling
-    topics, fig, topic_labels, umap_fig = topic_modeller.fit(translits, embeddings)
+    topics, fig, topic_labels, umap_fig = topic_modeller.fit(dedup_translits, embeddings)
 
     # Get a list of rows for topic labels
     overview_table = [[k, v] for k, v in topic_labels.items()]
 
     # Zip back transliterated text with topic IDs
-    annotated = list(zip(translits, topics))
+    annotated = list(zip(dedup_translits, topics))
 
     # Log success
     log_submission(file.name, len(chunks), start, status = "success")
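
Note on the hunk above: `topic_modeller.fit()` returns one topic ID per input document, positionally aligned with the list it was given, so the list zipped with `topics` must be the same list passed to `fit()`. A minimal sketch with hypothetical data (the string values below are illustrative, not from the repository) of the alignment invariant this change restores:

# Hypothetical data; one topic ID per document, aligned by position.
dedup_translits = ["qazaq tili", "til bilimi", "latyn alipbii"]  # illustrative
topics = [0, 1, 0]  # as fit() would return for these three documents

annotated = list(zip(dedup_translits, topics))
assert len(annotated) == len(dedup_translits)
# Zipping the pre-deduplication `translits` instead would silently truncate
# or misalign (text, topic) pairs once the two lists differ in length.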
src/modelling/__pycache__/topic_model.cpython-312.pyc CHANGED
Binary files a/src/modelling/__pycache__/topic_model.cpython-312.pyc and b/src/modelling/__pycache__/topic_model.cpython-312.pyc differ
 
src/modelling/topic_model.py CHANGED
@@ -1,5 +1,7 @@
 import re
 import plotly
+from umap import UMAP
+from hdbscan import HDBSCAN
 from bertopic import BERTopic
 from collections import Counter
 from src.utils.data_utils import tokeniser
@@ -27,7 +29,7 @@ class TopicModeller:
         token_counter = Counter()
 
         for text in texts:
-            token_ids = tokeniser.encode(text, add_special_tokens=False)
+            token_ids = tokeniser.encode(text, add_special_tokens = False)
             token_counter.update(token_ids)
 
         most_common = token_counter.most_common(top_k)
@@ -57,6 +59,14 @@
         """
         clean_texts = self._preprocess_texts(texts)
 
+        # Compute a safe number of neighbours and clusters
+        n_samples = len(embeddings)
+        min_cluster_size = max(2, len(embeddings) // 2)
+        safe_n_neighbours = min(15, max(2, n_samples - 1))
+
+        # Create a UMAP model
+        umap_model = UMAP(n_neighbors = safe_n_neighbours, min_dist = 0.1, metric = "cosine", random_state = 42)
+
         # Leverage DalaT5's tokeniser for stopword acquisition
         stopwords = self._extract_dalat5_stopwords(clean_texts, top_k = 75)
 
@@ -68,7 +78,9 @@
         self.model = BERTopic(
             language = "multilingual",
             vectorizer_model = self.vectoriser_model,
-            embedding_model = DalaEmbedder().get_model()
+            embedding_model = DalaEmbedder().get_model(),
+            umap_model = umap_model,
+            hdbscan_model = HDBSCAN(min_cluster_size = min_cluster_size, min_samples = 1, cluster_selection_epsilon = 0.1)
         )
 
         topics, _ = self.model.fit_transform(clean_texts, embeddings)
@@ -83,7 +95,23 @@
 
                 continue
 
-            words = [word for word, _ in self.model.get_topic(topic_id)[:4]]
+            topic_words = self.model.get_topic(topic_id)
+
+            if not isinstance(topic_words, list) or len(topic_words) == 0:
+                print(f"[WARN] Skipping label generation for topic_id={topic_id} - invalid topic")
+                continue
+
+            words = []
+
+            for pair in topic_words[:4]:
+                if isinstance(pair, (list, tuple)) and len(pair) >= 1:
+                    words.append(pair[0])
+
+            if not words:
+                print(f"[WARN] No valid words found for topic_id = {topic_id}")
+
+                continue
+
             label = "_".join(words)
             topic_labels[topic_id] = f"{topic_id}_{label}"
 
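Note on the guards above: UMAP requires `n_neighbors` to be smaller than the number of samples, and HDBSCAN requires `min_cluster_size >= 2`, so unclamped defaults crash on very small corpora. A standalone check of the clamping arithmetic (my summary of the commit's guards, using a hypothetical helper name, not code from the repository):

# Hypothetical helper reproducing the guard arithmetic from fit().
def safe_params(n_samples: int) -> tuple:
    min_cluster_size = max(2, n_samples // 2)           # HDBSCAN needs >= 2
    safe_n_neighbours = min(15, max(2, n_samples - 1))  # UMAP needs < n_samples
    return safe_n_neighbours, min_cluster_size

for n in (3, 5, 16, 200):
    print(n, safe_params(n))
# 3   -> (2, 2)
# 5   -> (4, 2)
# 16  -> (15, 8)
# 200 -> (15, 100)

Worth noting: `min_cluster_size = n_samples // 2` caps the model at roughly two clusters, which looks deliberate for tiny uploads but will force very coarse topics on large corpora.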
src/utils/__pycache__/plotting.cpython-312.pyc CHANGED
Binary files a/src/utils/__pycache__/plotting.cpython-312.pyc and b/src/utils/__pycache__/plotting.cpython-312.pyc differ
 
src/utils/plotting.py CHANGED
@@ -17,37 +17,56 @@ def custom_topic_barchart(model: BERTopic, topic_labels: Dict[int, str], top_n_t
         if topic_id == -1:
             continue
 
-        for word, score in model.get_topic(topic_id)[:n_words]:
+        topic = model.get_topic(topic_id)
+        if not isinstance(topic, list) or len(topic) == 0:
+            continue
+
+        for pair in topic[:n_words]:
+            if not isinstance(pair, (list, tuple)) or len(pair) != 2:
+                continue
+            word, score = pair
             data.append({"Topic": label, "Word": word, "Score": score})
 
+    # ✅ Construct only if data exists
+    if not data:
+        print("[WARN] No topic-word-score data to visualize.")
+        return plotly.graph_objs.Figure()
+
     df = pd.DataFrame(data)
 
+    required_cols = {"Topic", "Word", "Score"}
+    if not required_cols.issubset(df.columns):
+        print("[ERROR] Required columns missing in DataFrame.")
+        return plotly.graph_objs.Figure()
+
     fig = px.bar(
         df,
-        x = "Score",
-        y = "Word",
-        color = "Topic",
-        orientation = 'h',
-        barmode = "group",
-        #height = 500,
+        x="Score",
+        y="Word",
+        color="Topic",
+        orientation='h',
+        barmode="group",
     )
 
     fig.update_layout(
-        margin = dict(l = 40, r = 20, t = 40, b = 20),
-        yaxis = dict(title = ""),
-        xaxis = dict(title = "Relevance"),
-        legend_title_text = "Topic",
+        margin=dict(l=40, r=20, t=40, b=20),
+        yaxis=dict(title=""),
+        xaxis=dict(title="Relevance"),
+        legend_title_text="Topic",
     )
 
     return fig
 
 
-
 def custom_umap_plot(embeddings: List[List[float]], topics: List[int], topic_labels: Dict[int, str]) -> plotly.graph_objs.Figure:
     """
     Custom UMAP plotting to work better with the Gradio layout.
     """
-    reducer = UMAP(n_neighbors = 15, min_dist = 0.1, metric = "cosine", random_state = 42)
+    # Compute a safe number of neighbours
+    n_samples = len(embeddings)
+    safe_n_neighbours = min(15, max(2, n_samples - 1))
+
+    reducer = UMAP(n_neighbors = safe_n_neighbours, min_dist = 0.1, metric = "cosine", random_state = 42)
     umap_coords = reducer.fit_transform(embeddings)
 
     df = pd.DataFrame(umap_coords, columns=["x", "y"])
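
Note on the `isinstance` checks above: as far as I know, `BERTopic.get_topic()` returns `False` rather than raising when a topic ID is unknown, so a bare slice like `model.get_topic(topic_id)[:n_words]` can fail with a `TypeError`. A self-contained sketch of the same guard pattern (the helper name and sample values are hypothetical):

from typing import Any, List, Tuple

def safe_topic_pairs(topic: Any, n_words: int) -> List[Tuple[str, float]]:
    """Keep only well-formed (word, score) pairs; tolerate False/None/junk."""
    if not isinstance(topic, list) or len(topic) == 0:
        return []
    return [tuple(pair) for pair in topic[:n_words]
            if isinstance(pair, (list, tuple)) and len(pair) == 2]

print(safe_topic_pairs([("til", 0.42), ("qazaq", 0.31)], 4))  # both pairs kept
print(safe_topic_pairs(False, 4))                             # [] for an unknown topic ID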
vector_store/faiss_index.index CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:25004d0d5df0be08b29e41af806fefc2215d37f215c08fdd5b8ce16484ee83fc
-size 175149
+oid sha256:ce19ebb4f9f8a57800a85bffbc97c637134adf2775420b4b09889dec95943cf6
+size 6189
vector_store/faiss_index.json CHANGED
The diff for this file is too large to render. See raw diff