crossroderick committed on
Commit 0eb636f · 1 Parent(s): e6f8cc6

Added all files

.gitattributes CHANGED
@@ -32,4 +32,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.xz filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
+ vector_store/**/* filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__/**
+ src/__pycache__/**
README.md CHANGED
@@ -1,5 +1,5 @@
  ---
- title: Semanticdala
+ title: SemanticDala
  emoji: 💻
  colorFrom: pink
  colorTo: green
app.py ADDED
@@ -0,0 +1,143 @@
+ import gradio as gr
+ from src.modelling.embed import DalaEmbedder
+ from src.db.vector_store import VectorStore
+ from src.modelling.topic_model import TopicModeller
+ from src.modelling.transliterate import DalaTransliterator
+ from src.utils.data_utils import (
+     extract_text_with_pdfplumber,
+     extract_text_with_ocr,
+     chunk_text,
+     deduplicate_chunks,
+     repair_extracted_text
+ )
+
+ from typing import Any, List, Tuple
+
+
+ # Instantiate components
+ translit = DalaTransliterator()
+ embedder = DalaEmbedder()
+ vector_db = VectorStore()
+ topic_modeller = TopicModeller()
+
+
+ def extract_text(file: Any) -> str:
+     """
+     Try multiple PDF extraction strategies, with a fallback to OCR if necessary.
+     """
+     if file.name.endswith(".pdf"):
+         text = extract_text_with_pdfplumber(file)
+
+         if len(text.strip()) > 100:
+             return repair_extracted_text(text)
+
+         print("[INFO] Falling back to OCR...")
+
+         return extract_text_with_ocr(file)
+
+     elif file.name.endswith(".txt"):
+         return repair_extracted_text(file.read().decode("utf-8", errors = "ignore"))
+
+     return ""
+
+
+ def process_file(file: Any) -> Tuple[List[Tuple[str, str]], Any, List[List[Any]], Any]:
+     """
+     Main file processing function, which chunks, transliterates and clusters
+     the file contents, and plots the resulting clusters.
+     """
+     raw_text = extract_text(file)
+     chunks = chunk_text(raw_text)
+
+     # Transliterate, deduplicate and embed the chunks
+     translits = translit.batch_transliterate(chunks)
+     dedup_translits = deduplicate_chunks(translits, embedder)
+     embeddings = embedder.embed_batch(dedup_translits)
+
+     # Clear previous entries before adding
+     vector_db.index.reset()
+     vector_db.metadata = []
+
+     metadata = [{"id": f"{file.name}_chunk{i}", "text": t} for i, t in enumerate(dedup_translits)]
+
+     vector_db.add(embeddings, metadata)
+
+     # Topic modelling on the deduplicated chunks, so the texts stay aligned with the embeddings
+     topics, fig, topic_labels, umap_fig = topic_modeller.fit(dedup_translits, embeddings)
+
+     # Get a list of rows for topic labels
+     overview_table = [[k, v] for k, v in topic_labels.items()]
+
+     # Zip the transliterated text back together with its topic IDs
+     annotated = list(zip(dedup_translits, topics))
+
+     return annotated, fig, overview_table, umap_fig
+
+
+ def search_text(query: str) -> str:
+     """
+     Search for a given query in the vector DB.
+     """
+     query_emb = embedder.embed_text(query)
+     results = vector_db.search(query_emb, top_k = 5)
+
+     return "\n\n".join(f"[{r['id']}]: {r['text']}" for r in results)
+
+
+ # Custom CSS
+ page_css = """
+ p {
+     font-size: 18px;
+ }
+
+ .lang_btn {
+     width: 5%;
+ }
+ """
+
+ # Gradio UI
+ with gr.Blocks(css = page_css) as demo:
+     title_html = gr.HTML("<center><h1>🇰🇿 SemanticDala</h1><h2>Қазақтың семантикалық платформасы</h2><h3>Kazakh Semantic Platform</h3></center>")
+
+     with gr.Tab("📁 Жүктеп салу және өңдеу / Upload and Process"):
+         with gr.Row():
+             file_input = gr.File(label = "PDF немесе TXT жүктеңіз / Upload PDF or TXT", file_types = [".pdf", ".txt"])
+             process_btn = gr.Button("Процесс файлы / Process File", scale = 1)
+
+         translit_output = gr.Dataframe(
+             headers = ["Мәтін / Text", "Тақырып идентификаторы / Topic ID"],
+             label = "Транслитерацияланған үзінділер + Тақырыптар / Transliterated Chunks + Topics"
+         )
+
+         topic_label_table = gr.Dataframe(
+             headers = ["Тақырып идентификаторы / Topic ID", "Белгі / Label"],
+             label = "Тақырып белгілері / Topic Labels"
+         )
+
+         with gr.Row(equal_height = True):
+             with gr.Column(scale = 1):
+                 plot_output = gr.Plot(label = "Негізгі тақырыптар / Top Topics")
+
+             with gr.Column(scale = 1):
+                 umap_output = gr.Plot(label = "UMAP проекциясы / UMAP Topic Projection")
+
+     with gr.Tab("🔍 Семантикалық іздеу / Semantic Search"):
+         with gr.Row():
+             search_box = gr.Textbox(label = "Сұрау / Query", placeholder = "мысалы / e.g., Qazaqstan tarihy", lines = 1, scale = 5)
+             search_btn = gr.Button("Іздеу / Search", scale = 1)
+
+         search_results = gr.Textbox(label = "Нәтижелер / Top Results", lines = 6, interactive = False)
+
+     # Bind callbacks
+     process_btn.click(
+         fn = process_file,
+         inputs = file_input,
+         outputs = [translit_output, plot_output, topic_label_table, umap_output]
+     )
+
+     search_btn.click(fn = search_text, inputs = search_box, outputs = search_results)
+
+
+ # Launch
+ if __name__ == "__main__":
+     demo.launch()
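
For quick testing outside the Gradio UI, a minimal sketch of the same flow; it assumes the model checkpoints download, and "sample.pdf" plus the tiny LocalFile shim are illustrative stand-ins (process_file only touches the file's .name attribute on the PDF path):

    from app import process_file, search_text

    class LocalFile:
        # Stand-in for the Gradio file object; only .name is used for PDFs
        def __init__(self, path: str):
            self.name = path

    annotated, fig, overview, umap_fig = process_file(LocalFile("sample.pdf"))
    print(overview[:3])                       # first few [topic ID, label] rows
    print(search_text("Qazaqstan tarihy"))    # searches the freshly built index
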
src/__init__.py ADDED
@@ -0,0 +1,8 @@
+ from src.utils.config import *
+ from src.utils.ingest import *
+ from src.utils.plotting import *
+ from src.utils.data_utils import *
+ from src.modelling.embed import *
+ from src.modelling.topic_model import *
+ from src.modelling.transliterate import *
+ from src.db.vector_store import *
src/db/__pycache__/vector_store.cpython-312.pyc ADDED
Binary file (4.31 kB).
src/db/search.py ADDED
@@ -0,0 +1,23 @@
+ from src.db.vector_store import VectorStore
+ from src.modelling.embed import DalaEmbedder
+
+ from typing import List
+
+
+ class SemanticSearcher:
+     """
+     Perform semantic search over embedded Kazakh text.
+     """
+     def __init__(self):
+         self.embedder = DalaEmbedder()
+         self.vector_store = VectorStore()
+
+
+     def search(self, query: str, top_k: int = 5) -> List[dict]:
+         """
+         Embed the query and retrieve the most relevant chunks.
+         """
+         query_embedding = self.embedder.embed_text(query)
+         results = self.vector_store.search(query_embedding, top_k = top_k)
+
+         return results
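
A usage sketch, assuming the FAISS index under vector_store/faiss_index has already been populated (for example by src/utils/ingest.py); the query string is only an illustration:

    from src.db.search import SemanticSearcher

    searcher = SemanticSearcher()

    # Each hit is a metadata dict with "id" and "text" keys
    for hit in searcher.search("Qazaqstan tarihy", top_k = 3):
        print(hit["id"], "->", hit["text"][:80])
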
src/db/vector_store.py ADDED
@@ -0,0 +1,72 @@
+ import faiss
+ import json
+ import numpy as np
+ from pathlib import Path
+ from src.utils.config import VECTOR_DB_PATH, EMBEDDING_DIM
+
+ from typing import List
+
+
+ class VectorStore:
+     """
+     Wrapper for FAISS vector storage, with ID-to-text mapping.
+     """
+     def __init__(self, index_path: Path = VECTOR_DB_PATH):
+         self.index_path = index_path.with_suffix(".index")
+         self.meta_path = index_path.with_suffix(".json")
+
+         self.index = faiss.IndexFlatL2(EMBEDDING_DIM)
+         self.metadata = []    # list of dicts: {"id": str, "text": str}
+
+         # Try loading if the files exist
+         if self.index_path.exists() and self.meta_path.exists():
+             try:
+                 self.load()
+
+             except Exception as e:
+                 print(f"[WARN] Failed to load vector store: {e}")
+
+                 # Reinitialize clean if corrupted
+                 self.index = faiss.IndexFlatL2(EMBEDDING_DIM)
+                 self.metadata = []
+
+
+     def add(self, embeddings: list[list[float]], metadata: List[dict]) -> None:
+         """
+         Add new embeddings and their metadata (e.g., {"id": "doc1_chunk0", "text": "..."}).
+         """
+         self.index.add(np.array(embeddings).astype("float32"))
+         self.metadata.extend(metadata)
+         self.save()
+
+
+     def search(self, query_embedding: list[float], top_k: int = 5) -> List[dict]:
+         """
+         Perform vector search and return the metadata of the top_k results.
+         """
+         D, I = self.index.search(np.array([query_embedding]).astype("float32"), top_k)
+
+         # FAISS pads missing results with -1, so filter those out
+         return [self.metadata[i] for i in I[0] if i != -1]
+
+
+     def save(self) -> None:
+         """
+         Save data to an external file.
+         """
+         self.index_path.parent.mkdir(parents = True, exist_ok = True)
+
+         faiss.write_index(self.index, str(self.index_path))
+
+         with open(self.meta_path, 'w', encoding = "utf-8") as f:
+             json.dump(self.metadata, f, ensure_ascii = False, indent = 2)
+
+
+     def load(self) -> None:
+         """
+         Load data from an external file.
+         """
+         self.index = faiss.read_index(str(self.index_path))
+
+         with open(self.meta_path, 'r', encoding = "utf-8") as f:
+             self.metadata = json.load(f)
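
A minimal standalone sketch of the store, assuming a throwaway index path and random 384-dimensional vectors in place of real embeddings:

    import numpy as np
    from pathlib import Path
    from src.db.vector_store import VectorStore

    store = VectorStore(index_path = Path("/tmp/demo_index"))   # scratch location

    # Two fake embeddings matching EMBEDDING_DIM, plus their metadata
    fake = np.random.rand(2, 384).astype("float32")
    store.add(fake, [{"id": "demo_0", "text": "first chunk"},
                     {"id": "demo_1", "text": "second chunk"}])

    # Querying with the first vector should return its own metadata
    print(store.search(fake[0].tolist(), top_k = 1))
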
src/modelling/__pycache__/embed.cpython-312.pyc ADDED
Binary file (1.82 kB).
src/modelling/__pycache__/topic_model.cpython-312.pyc ADDED
Binary file (4.55 kB).
src/modelling/__pycache__/transliterate.cpython-312.pyc ADDED
Binary file (1.83 kB).
src/modelling/embed.py ADDED
@@ -0,0 +1,31 @@
+ import numpy as np
+ from src.utils.config import MINIDALALM_MODEL
+ from sentence_transformers import SentenceTransformer
+
+
+ class DalaEmbedder:
+     """
+     Simple wrapper for the MiniDalaLM embedding model.
+     """
+     def __init__(self, model_path: str = MINIDALALM_MODEL):
+         self.model = SentenceTransformer(model_path)
+
+
+     def embed_text(self, text: str) -> list[float]:
+         """
+         Embed a single string of text.
+         """
+         return self.model.encode(text, convert_to_numpy = True).tolist()
+
+
+     def embed_batch(self, texts: list[str]) -> np.ndarray:
+         """
+         Embed a batch of text strings, returning an array of shape (n_texts, embedding_dim).
+         """
+         return self.model.encode(texts, convert_to_numpy = True)
+
+
+     def get_model(self) -> SentenceTransformer:
+         """
+         Getter to enable access to the underlying MiniDalaLM model.
+         """
+         return self.model
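
A small usage sketch, assuming the crossroderick/minidalalm checkpoint is reachable and produces 384-dimensional vectors (the value EMBEDDING_DIM expects):

    from src.modelling.embed import DalaEmbedder

    embedder = DalaEmbedder()

    vec = embedder.embed_text("Qazaq tili")          # plain list of floats
    batch = embedder.embed_batch(["bir", "eki"])     # numpy array, shape (2, dim)

    print(len(vec), batch.shape)
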
src/modelling/topic_model.py ADDED
@@ -0,0 +1,94 @@
+ import re
+ import plotly
+ from bertopic import BERTopic
+ from collections import Counter
+ from src.utils.data_utils import tokeniser
+ from src.modelling.embed import DalaEmbedder
+ from sklearn.feature_extraction.text import CountVectorizer
+ from src.utils.plotting import custom_topic_barchart, custom_umap_plot
+
+ from typing import Dict, List, Tuple
+
+
+ class TopicModeller:
+     """
+     Wrapper for topic modelling with BERTopic.
+     """
+     def __init__(self):
+         # Vectoriser (with stopword filtering) and model are built lazily in fit()
+         self.vectoriser_model = None
+         self.model = None
+
+
+     def _extract_dalat5_stopwords(self, texts: List[str], top_k: int = 75) -> List[str]:
+         """
+         Identify frequent tokens using DalaT5's tokeniser as proxy stopwords.
+         """
+         token_counter = Counter()
+
+         for text in texts:
+             token_ids = tokeniser.encode(text, add_special_tokens = False)
+             token_counter.update(token_ids)
+
+         most_common = token_counter.most_common(top_k)
+         stop_tokens = [tokeniser.decode([tok_id]).strip() for tok_id, _ in most_common]
+
+         return stop_tokens
+
+
+     def _preprocess_texts(self, texts: List[str]) -> List[str]:
+         """
+         Lowercase texts, strip digits and collapse whitespace.
+         """
+         return [
+             re.sub(r"\d+|\s+", " ", t.lower()).strip()
+             for t in texts
+         ]
+
+
+     def fit(
+         self,
+         texts: List[str],
+         embeddings: List[List[float]]
+     ) -> Tuple[List[str], plotly.graph_objs.Figure, Dict[int, str], plotly.graph_objs.Figure]:
+         """
+         Fit BERTopic on preprocessed texts and the given embeddings.
+         Returns labelled topics, a topic bar chart, the topic label mapping and a UMAP plot.
+         """
+         clean_texts = self._preprocess_texts(texts)
+
+         # Leverage DalaT5's tokeniser for stopword acquisition
+         stopwords = self._extract_dalat5_stopwords(clean_texts, top_k = 75)
+
+         # Define vectoriser and model
+         self.vectoriser_model = CountVectorizer(
+             stop_words = stopwords,
+             token_pattern = r"\b[a-zA-Z]+(?:-[a-zA-Z]+)?\b"
+         )
+         self.model = BERTopic(
+             language = "multilingual",
+             vectorizer_model = self.vectoriser_model,
+             embedding_model = DalaEmbedder().get_model()
+         )
+
+         topics, _ = self.model.fit_transform(clean_texts, embeddings)
+
+         # Generate labels
+         topic_info = self.model.get_topic_info()
+         topic_labels = {}
+
+         for topic_id in topic_info.Topic.values:
+             if topic_id == -1:
+                 topic_labels[topic_id] = '-'
+
+                 continue
+
+             words = [word for word, _ in self.model.get_topic(topic_id)[:4]]
+             label = "_".join(words)
+             topic_labels[topic_id] = f"{topic_id}_{label}"
+
+         fig = custom_topic_barchart(self.model, topic_labels)
+         umap_fig = custom_umap_plot(embeddings, topics, topic_labels)
+         labeled_topics = [topic_labels[t] for t in topics]
+
+         return labeled_topics, fig, topic_labels, umap_fig
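
A usage sketch under the assumption that a few dozen transliterated chunks are already available (BERTopic's clustering step rarely forms topics from only a handful of texts); load_transliterated_chunks is a hypothetical helper, not part of this commit:

    from src.modelling.embed import DalaEmbedder
    from src.modelling.topic_model import TopicModeller

    chunks = load_transliterated_chunks()            # hypothetical helper
    embeddings = DalaEmbedder().embed_batch(chunks)

    modeller = TopicModeller()
    labeled_topics, bar_fig, topic_labels, umap_fig = modeller.fit(chunks, embeddings)

    print(topic_labels)    # e.g. {-1: '-', 0: '0_word1_word2_word3_word4', ...}
    bar_fig.show()
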
src/modelling/transliterate.py ADDED
@@ -0,0 +1,29 @@
+ from transformers import pipeline
+ from src.utils.config import DALAT5_MODEL
+
+ from typing import List
+
+
+ class DalaTransliterator:
+     """
+     Simple wrapper for the DalaT5 transliterator model.
+     """
+     def __init__(self, model_name: str = DALAT5_MODEL):
+         self.pipe = pipeline("text2text-generation", model = model_name)
+
+
+     def transliterate(self, text: str, max_length: int = 128) -> str:
+         """
+         Transliterate a given text using DalaT5.
+         """
+         input_text = f"Cyrillic2Latin: {text.strip()}"
+         result = self.pipe(input_text, max_length = max_length)
+
+         return result[0]["generated_text"]
+
+
+     def batch_transliterate(self, texts: list[str], max_length: int = 128) -> List[str]:
+         """
+         Perform batch transliteration using DalaT5.
+         """
+         return [self.transliterate(t, max_length) for t in texts]
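
A quick usage sketch, assuming the crossroderick/dalat5 checkpoint downloads; the exact Latin output depends on the model, so none is shown here:

    from src.modelling.transliterate import DalaTransliterator

    translit = DalaTransliterator()

    print(translit.transliterate("Қазақстан тарихы"))
    print(translit.batch_transliterate(["бір", "екі", "үш"]))
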
src/utils/__pycache__/config.cpython-312.pyc ADDED
Binary file (698 Bytes).
src/utils/__pycache__/data_utils.cpython-312.pyc ADDED
Binary file (6.81 kB).
src/utils/__pycache__/ingest.cpython-312.pyc ADDED
Binary file (2.98 kB).
src/utils/__pycache__/plotting.cpython-312.pyc ADDED
Binary file (2.91 kB).
src/utils/config.py ADDED
@@ -0,0 +1,19 @@
+ from pathlib import Path
+
+
+ # Model paths
+ DALAT5_MODEL = "crossroderick/dalat5"
+ MINIDALALM_MODEL = "crossroderick/minidalalm"
+
+ # Vector DB config
+ VECTOR_DB_PATH = Path("vector_store/faiss_index")
+ EMBEDDING_DIM = 384    # for MiniLM-based models
+
+ # Chunking
+ CHUNK_SIZE = 256
+ CHUNK_OVERLAP = 64
+
+ # File input/output
+ DOC_INPUT_DIR = Path("data/uploads")
+ DOC_OUTPUT_DIR = Path("data/processed")
+ TRANS_OUTPUT_DIR = Path("data/transliterated")
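
One note on EMBEDDING_DIM: it has to match whatever MiniDalaLM actually emits, since VectorStore builds its IndexFlatL2 from this constant. A quick sanity-check sketch, assuming the checkpoint downloads:

    from src.utils.config import EMBEDDING_DIM
    from src.modelling.embed import DalaEmbedder

    # embed_text returns a plain list of floats, so its length is the model dimension
    dim = len(DalaEmbedder().embed_text("sanity check"))
    assert dim == EMBEDDING_DIM, f"EMBEDDING_DIM is {EMBEDDING_DIM}, model emits {dim}"
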
src/utils/data_utils.py ADDED
@@ -0,0 +1,164 @@
+ import re
+ import pdfplumber
+ import numpy as np
+ import pytesseract
+ from transformers import AutoTokenizer
+ from pdf2image import convert_from_path
+ from src.utils.config import DALAT5_MODEL, CHUNK_SIZE, CHUNK_OVERLAP
+
+ from typing import Any, List
+
+
+ # Load DalaT5's tokeniser
+ tokeniser = AutoTokenizer.from_pretrained(DALAT5_MODEL)
+
+
+ def extract_text_with_pdfplumber(file: Any) -> str:
+     """
+     Extract text by leveraging PDFPlumber, which is particularly useful for PDF files
+     with tabular data.
+     """
+     if file.name.endswith(".pdf"):
+         try:
+             with pdfplumber.open(file.name) as pdf:
+                 texts = [page.extract_text() or "" for page in pdf.pages]
+
+             return "\n".join(texts).strip()
+
+         except Exception as e:
+             print(f"[ERROR] PDFPlumber failed: {e}")
+
+             return ""
+
+     return ""
+
+
+ def extract_text_with_ocr(file: Any) -> str:
+     """
+     Extract text data by leveraging Tesseract.
+     """
+     if file.name.endswith(".pdf"):
+         try:
+             images = convert_from_path(file.name, dpi = 300)
+             page_texts = []
+
+             for img in images:
+                 raw = pytesseract.image_to_string(img, lang = "kaz+eng")
+
+                 # Clean page-by-page
+                 cleaned = repair_extracted_text(raw)
+
+                 page_texts.append(cleaned)
+
+             return "\n".join(page_texts).strip()
+
+         except Exception as e:
+             print(f"[ERROR] OCR failed: {e}")
+
+             return ""
+
+     return ""
+
+
+ def clean_text(text: str) -> str:
+     """
+     Pre-clean text before chunking.
+     """
+     # Collapse multiple newlines into a space
+     text = re.sub(r"\n+", " ", text)
+
+     # Remove runs of two or more punctuation/symbol characters
+     text = re.sub(r"[^\w\s]{2,}", "", text)
+
+     # Remove bullet and dash symbols
+     text = re.sub(r"[•●–—―]+", " ", text)
+
+     # Normalise extra spacing
+     text = re.sub(r"\s{2,}", " ", text)
+
+     return text.strip()
+
+
+ def is_valid_chunk(chunk: str) -> bool:
+     """
+     Heuristic to filter out low-quality chunks.
+     """
+     if len(chunk) < 20:
+         return False
+
+     symbols = sum(1 for c in chunk if not c.isalnum() and c != ' ')
+
+     if symbols / len(chunk) > 0.4:
+         return False
+
+     return True
+
+
+ def deduplicate_chunks(chunks: List[str], embedder: Any, threshold: float = 0.95) -> List[str]:
+     """
+     Deduplicate chunks based on cosine similarity.
+     Only retains semantically distinct segments.
+     """
+     unique_chunks = []
+     seen_embeddings = []
+
+     for chunk in chunks:
+         emb = embedder.embed_text(chunk)
+
+         if all(np.dot(emb, e) / (np.linalg.norm(emb) * np.linalg.norm(e)) < threshold for e in seen_embeddings):
+             unique_chunks.append(chunk)
+             seen_embeddings.append(emb)
+
+     return unique_chunks
+
+
+ def chunk_text(text: str) -> List[str]:
+     """
+     Chunk text into overlapping token-based segments using DalaT5's tokeniser.
+     """
+     # Clean text before doing anything
+     cleaned_text = clean_text(text)
+
+     # Encode with the tokeniser
+     tokens = tokeniser.encode(cleaned_text, add_special_tokens = False)
+     total_tokens = len(tokens)
+
+     if total_tokens <= CHUNK_SIZE:
+         single_chunk = tokeniser.decode(tokens, skip_special_tokens = True).strip()
+
+         return [single_chunk] if is_valid_chunk(single_chunk) else []
+
+     chunks = []
+     start = 0
+
+     while start < total_tokens:
+         end = min(start + CHUNK_SIZE, total_tokens)
+         chunk_tokens = tokens[start:end]
+         chunk = tokeniser.decode(chunk_tokens, skip_special_tokens = True).strip()
+
+         if is_valid_chunk(chunk):
+             chunks.append(chunk)
+
+         start += CHUNK_SIZE - CHUNK_OVERLAP
+
+     return chunks
+
+
+ def repair_extracted_text(text: str) -> str:
+     """
+     Additional logic to repair broken line splits, hyphenation, and common repetition artifacts.
+     """
+     # Collapse immediately repeated words (4+ characters)
+     text = re.sub(r'\b(\w{4,})\s+\1\b', r'\1', text)
+
+     # Fix hyphenation across line breaks
+     text = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', text)
+
+     # Remove long runs of one- or two-letter tokens (usually OCR noise)
+     text = re.sub(r'(\b\w{1,2}\b\s+){5,}', '', text)
+
+     # Remove some previously observed junk
+     text = re.sub(r'\b(Googsoft|Hoogsoft|biometriialyq|avtorometriia)\b', '', text)
+
+     # Collapse multiple spaces
+     text = re.sub(r'\s{2,}', ' ', text)
+
+     return text.strip()
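
A toy sketch of the chunking and deduplication path (a repetitive string stands in for real document text, so most overlapping windows collapse into a single chunk):

    from src.modelling.embed import DalaEmbedder
    from src.utils.data_utils import chunk_text, deduplicate_chunks

    raw = "Qazaqstan tarihy. " * 400               # long enough for several 256-token windows
    chunks = chunk_text(raw)

    embedder = DalaEmbedder()
    unique = deduplicate_chunks(chunks, embedder, threshold = 0.95)

    print(len(chunks), "->", len(unique))
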
src/utils/ingest.py ADDED
@@ -0,0 +1,77 @@
+ import uuid
+ from pathlib import Path
+ from src.utils.config import DOC_INPUT_DIR, TRANS_OUTPUT_DIR
+
+ from src.utils.data_utils import chunk_text
+ from src.db.vector_store import VectorStore
+ from src.modelling.embed import DalaEmbedder
+ from src.modelling.transliterate import DalaTransliterator
+
+
+ def load_documents(input_dir: Path) -> list[tuple[str, str]]:
+     """
+     Load all .txt documents from input_dir. Returns a list of
+     (filename, content) tuples.
+     """
+     docs = []
+
+     for file in input_dir.glob("*.txt"):
+         with open(file, 'r', encoding = "utf-8") as f:
+             text = f.read()
+
+         docs.append((file.stem, text))
+
+     return docs
+
+
+ def process_documents() -> None:
+     """
+     Main processing procedure.
+     """
+     # Components
+     transliterator = DalaTransliterator()
+     embedder = DalaEmbedder()
+     vector_store = VectorStore()
+
+     # Make sure the transliteration output directory exists
+     TRANS_OUTPUT_DIR.mkdir(parents = True, exist_ok = True)
+
+     docs = load_documents(DOC_INPUT_DIR)
+     all_chunks = []
+     all_transliterated = []
+     all_metadata = []
+
+     for doc_id, text in docs:
+         # Chunk the data
+         chunks = chunk_text(text)
+
+         all_chunks.extend(chunks)
+
+         # Transliterate chunks
+         translit_chunks = transliterator.batch_transliterate(chunks)
+
+         all_transliterated.extend(translit_chunks)
+
+         # Save transliterated version
+         output_path = TRANS_OUTPUT_DIR / f"{doc_id}_transliterated.txt"
+
+         with open(output_path, 'w', encoding = "utf-8") as f:
+             f.write("\n\n".join(translit_chunks))
+
+         # Create metadata entries
+         for i, chunk in enumerate(translit_chunks):
+             meta = {
+                 "id": f"{doc_id}_{i}_{uuid.uuid4().hex[:6]}",
+                 "text": chunk
+             }
+
+             all_metadata.append(meta)
+
+     # Embed all chunks
+     embeddings = embedder.embed_batch(all_transliterated)
+
+     # Add to vector DB
+     vector_store.add(embeddings, all_metadata)
+
+     print(f"[INFO] Successfully ingested {len(all_chunks)} chunks.")
+
+
+ if __name__ == "__main__":
+     process_documents()
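
To run the batch ingestion, a sketch assuming the working directory is the repository root (so the src imports and the relative paths in src/utils/config.py resolve) and that data/uploads already contains .txt files:

    from src.utils.ingest import process_documents

    # Reads data/uploads/*.txt, writes transliterations to data/transliterated/
    # and persists embeddings plus metadata under vector_store/
    process_documents()
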
src/utils/plotting.py ADDED
@@ -0,0 +1,71 @@
+ import plotly
+ import pandas as pd
+ from umap import UMAP
+ import plotly.express as px
+ from bertopic import BERTopic
+
+ from typing import Dict, List
+
+
+ def custom_topic_barchart(model: BERTopic, topic_labels: Dict[int, str], top_n_topics: int = 10, n_words: int = 10) -> plotly.graph_objs.Figure:
+     """
+     Create a custom horizontal bar chart of top topics using plotly.express.
+     """
+     data = []
+
+     for topic_id, label in topic_labels.items():
+         if topic_id == -1:
+             continue
+
+         for word, score in model.get_topic(topic_id)[:n_words]:
+             data.append({"Topic": label, "Word": word, "Score": score})
+
+     df = pd.DataFrame(data)
+
+     fig = px.bar(
+         df,
+         x = "Score",
+         y = "Word",
+         color = "Topic",
+         orientation = 'h',
+         barmode = "group",
+         #height = 500,
+     )
+
+     fig.update_layout(
+         margin = dict(l = 40, r = 20, t = 40, b = 20),
+         yaxis = dict(title = ""),
+         xaxis = dict(title = "Relevance"),
+         legend_title_text = "Topic",
+     )
+
+     return fig
+
+
+ def custom_umap_plot(embeddings: List[List[float]], topics: List[int], topic_labels: Dict[int, str]) -> plotly.graph_objs.Figure:
+     """
+     Custom UMAP plotting to work better with the Gradio layout.
+     """
+     reducer = UMAP(n_neighbors = 15, min_dist = 0.1, metric = "cosine", random_state = 42)
+     umap_coords = reducer.fit_transform(embeddings)
+
+     df = pd.DataFrame(umap_coords, columns = ["x", "y"])
+     df["topic"] = topics
+     df["label"] = [topic_labels[t] for t in topics]
+
+     # Filter out topic -1 (noise)
+     df = df[df["topic"] != -1]
+
+     fig = px.scatter(
+         df,
+         x = 'x',
+         y = 'y',
+         color = "label",
+         labels = {"label": "Topic"},
+         #height = 500
+     )
+
+     fig.update_layout(margin = dict(l = 20, r = 20, t = 40, b = 20))
+
+     return fig
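
Both helpers return plain Plotly figures, so they can be previewed without Gradio; a sketch with fake embeddings and two invented topics:

    import numpy as np
    from src.utils.plotting import custom_umap_plot

    fake_embeddings = np.random.rand(30, 384)        # 30 fake 384-dim vectors
    topics = [0] * 15 + [1] * 15
    labels = {0: "0_demo_topic_a", 1: "1_demo_topic_b"}

    fig = custom_umap_plot(fake_embeddings, topics, labels)
    fig.write_html("umap_demo.html")                 # or fig.show() in a notebook
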
vector_store/faiss_index.index ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:25004d0d5df0be08b29e41af806fefc2215d37f215c08fdd5b8ce16484ee83fc
+ size 175149
vector_store/faiss_index.json ADDED
The diff for this file is too large to render.