Commit 0eb636f
Parent(s): e6f8cc6
Added all files
- .gitattributes +1 -0
- .gitignore +2 -0
- README.md +1 -1
- app.py +143 -0
- src/__init__.py +8 -0
- src/db/__pycache__/vector_store.cpython-312.pyc +0 -0
- src/db/search.py +23 -0
- src/db/vector_store.py +72 -0
- src/modelling/__pycache__/embed.cpython-312.pyc +0 -0
- src/modelling/__pycache__/topic_model.cpython-312.pyc +0 -0
- src/modelling/__pycache__/transliterate.cpython-312.pyc +0 -0
- src/modelling/embed.py +31 -0
- src/modelling/topic_model.py +94 -0
- src/modelling/transliterate.py +29 -0
- src/utils/__pycache__/config.cpython-312.pyc +0 -0
- src/utils/__pycache__/data_utils.cpython-312.pyc +0 -0
- src/utils/__pycache__/ingest.cpython-312.pyc +0 -0
- src/utils/__pycache__/plotting.cpython-312.pyc +0 -0
- src/utils/config.py +19 -0
- src/utils/data_utils.py +164 -0
- src/utils/ingest.py +77 -0
- src/utils/plotting.py +71 -0
- vector_store/faiss_index.index +3 -0
- vector_store/faiss_index.json +0 -0
.gitattributes
CHANGED
@@ -32,4 +32,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
+vector_store/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,2 @@
__pycache__/**
src/__pycache__/**
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title:
+title: SemanticDala
 emoji: 💻
 colorFrom: pink
 colorTo: green
app.py
ADDED
@@ -0,0 +1,143 @@
import gradio as gr
from src.modelling.embed import DalaEmbedder
from src.db.vector_store import VectorStore
from src.modelling.topic_model import TopicModeller
from src.modelling.transliterate import DalaTransliterator
from src.utils.data_utils import (
    extract_text_with_pdfplumber,
    extract_text_with_ocr,
    chunk_text,
    deduplicate_chunks,
    repair_extracted_text
)

from typing import Any, List, Tuple


# Instantiate components
translit = DalaTransliterator()
embedder = DalaEmbedder()
vector_db = VectorStore()
topic_modeller = TopicModeller()


def extract_text(file: Any) -> str:
    """
    Try multiple PDF extraction strategies, with fallback to OCR if necessary.
    """
    if file.name.endswith(".pdf"):
        text = extract_text_with_pdfplumber(file)

        if len(text.strip()) > 100:
            return repair_extracted_text(text)

        print("[INFO] Falling back to OCR...")

        return extract_text_with_ocr(file)

    elif file.name.endswith(".txt"):
        return repair_extracted_text(file.read().decode("utf-8", errors = "ignore"))

    return ""


def process_file(file: Any) -> Tuple[List[Tuple[str, int]], Any, Any, Any]:
    """
    Main file processing function, which will also chunk, transliterate and cluster
    the file contents, as well as plot the clusters.
    """
    raw_text = extract_text(file)
    chunks = chunk_text(raw_text)

    # Transliterate, deduplicate and embed
    translits = translit.batch_transliterate(chunks)
    dedup_translits = deduplicate_chunks(translits, embedder)
    embeddings = embedder.embed_batch(dedup_translits)

    # Clear previous entries before adding
    vector_db.index.reset()
    vector_db.metadata = []

    metadata = [{"id": f"{file.name}_chunk{i}", "text": t} for i, t in enumerate(dedup_translits)]

    vector_db.add(embeddings, metadata)

    # Topic modelling
    topics, fig, topic_labels, umap_fig = topic_modeller.fit(translits, embeddings)

    # Get a list of rows for topic labels
    overview_table = [[k, v] for k, v in topic_labels.items()]

    # Zip back transliterated text with topic IDs
    annotated = list(zip(translits, topics))

    return annotated, fig, overview_table, umap_fig


def search_text(query: str) -> str:
    """
    Search for a given query in the vector DB.
    """
    query_emb = embedder.embed_text(query)
    results = vector_db.search(query_emb, top_k = 5)

    return "\n\n".join(f"[{r['id']}]: {r['text']}" for r in results)


# Custom CSS
page_css = """
p {
    font-size: 18px;
}

.lang_btn {
    width: 5%;
}
"""

# Gradio UI
with gr.Blocks(css = page_css) as demo:
    title_html = gr.HTML("<center><h1>🇰🇿 SemanticDala</h1><h2>Қазақтың семантикалық платформасы</h2><h3>Kazakh Semantic Platform</h3></center>")

    with gr.Tab("📁 Жүктеп салу және өңдеу / Upload and Process"):
        with gr.Row():
            file_input = gr.File(label = "PDF немесе TXT жүктеңіз / Upload PDF or TXT", file_types = [".pdf", ".txt"])
            process_btn = gr.Button("Процесс файлы / Process File", scale = 1)

        translit_output = gr.Dataframe(
            headers = ["Мәтін / Text", "Тақырып идентификаторы / Topic ID"],
            label = "Транслитерацияланған үзінділер + Тақырыптар / Transliterated Chunks + Topics"
        )

        topic_label_table = gr.Dataframe(
            headers = ["Тақырып идентификаторы / Topic ID", "Белгі / Label"],
            label = "Тақырып белгілері / Topic Labels"
        )

        with gr.Row(equal_height = True):
            with gr.Column(scale = 1):
                plot_output = gr.Plot(label = "Негізгі тақырыптар / Top Topics")

            with gr.Column(scale = 1):
                umap_output = gr.Plot(label = "UMAP проекциясы / UMAP Topic Projection")

    with gr.Tab("🔍 Семантикалық іздеу / Semantic Search"):
        with gr.Row():
            search_box = gr.Textbox(label = "Сұрау / Query", placeholder = "мысалы / e.g., Qazaqstan tarihy", lines = 1, scale = 5)
            search_btn = gr.Button("Іздеу / Search", scale = 1)

        search_results = gr.Textbox(label = "Нәтижелер / Top Results", lines = 6, interactive = False)

    # Bind callbacks
    process_btn.click(
        fn = process_file,
        inputs = file_input,
        outputs = [translit_output, plot_output, topic_label_table, umap_output]
    )

    search_btn.click(fn = search_text, inputs = search_box, outputs = search_results)


# Launch
if __name__ == "__main__":
    demo.launch()
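The callbacks above can also be driven outside the Gradio UI for a quick smoke test. A minimal sketch (not part of the commit), assuming a reasonably sized UTF-8 text file at the hypothetical path sample.txt and enough resulting chunks for BERTopic to form clusters:

# Hypothetical smoke test that drives the callbacks without launching the UI.
from app import process_file, search_text

class LocalFile:
    """Stand-in for the file object Gradio passes to the callbacks."""
    def __init__(self, name: str):
        self.name = name

    def read(self) -> bytes:
        with open(self.name, "rb") as fh:
            return fh.read()

annotated, fig, overview_table, umap_fig = process_file(LocalFile("sample.txt"))
print(overview_table)                    # [topic_id, label] rows
print(search_text("Qazaqstan tarihy"))   # top-5 hits from the freshly built index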
src/__init__.py
ADDED
@@ -0,0 +1,8 @@
from src.utils.config import *
from src.utils.ingest import *
from src.utils.plotting import *
from src.utils.data_utils import *
from src.modelling.embed import *
from src.modelling.topic_model import *
from src.modelling.transliterate import *
from src.db.vector_store import *
src/db/__pycache__/vector_store.cpython-312.pyc
ADDED
Binary file (4.31 kB)
src/db/search.py
ADDED
@@ -0,0 +1,23 @@
from src.db.vector_store import VectorStore
from src.modelling.embed import DalaEmbedder

from typing import List


class SemanticSearcher:
    """
    Perform semantic search over embedded Kazakh text.
    """
    def __init__(self):
        self.embedder = DalaEmbedder()
        self.vector_store = VectorStore()


    def search(self, query: str, top_k: int = 5) -> List[dict]:
        """
        Embed the query and retrieve the most relevant chunks.
        """
        query_embedding = self.embedder.embed_text(query)
        results = self.vector_store.search(query_embedding, top_k = top_k)

        return results
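A minimal usage sketch for SemanticSearcher (illustrative, not part of the commit), assuming the FAISS index under vector_store/ has already been populated:

# Query the persisted vector store through the high-level searcher.
from src.db.search import SemanticSearcher

searcher = SemanticSearcher()

for hit in searcher.search("Qazaq tili", top_k = 3):
    print(hit["id"], "->", hit["text"][:80])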
src/db/vector_store.py
ADDED
@@ -0,0 +1,72 @@
import faiss
import json
import numpy as np
from pathlib import Path
from src.utils.config import VECTOR_DB_PATH, EMBEDDING_DIM

from typing import List


class VectorStore:
    """
    Wrapper for FAISS vector storage, with ID-to-text mapping.
    """
    def __init__(self, index_path: Path = VECTOR_DB_PATH):
        self.index_path = index_path.with_suffix(".index")
        self.meta_path = index_path.with_suffix(".json")

        self.index = faiss.IndexFlatL2(EMBEDDING_DIM)
        self.metadata = []  # list of dicts: {"id": str, "text": str}

        # Try loading if exists
        if self.index_path.exists() and self.meta_path.exists():
            try:
                self.load()

            except Exception as e:
                print(f"[WARN] Failed to load vector store: {e}")

                # Reinitialize clean if corrupted
                self.index = faiss.IndexFlatL2(EMBEDDING_DIM)
                self.metadata = []


    def add(self, embeddings: list[list[float]], metadata: List[dict]):
        """
        Add new embeddings and their metadata (e.g., {"id": "doc1_chunk0", "text": "..."}).
        """
        self.index.add(np.array(embeddings).astype("float32"))
        self.metadata.extend(metadata)
        self.save()


    def search(self, query_embedding: list[float], top_k: int = 5) -> List[dict]:
        """
        Perform vector search and return metadata of top_k results.
        """
        D, I = self.index.search(np.array([query_embedding]).astype("float32"), top_k)

        return [self.metadata[i] for i in I[0]]


    def save(self) -> None:
        """
        Save data to an external file.
        """
        self.index_path.parent.mkdir(parents = True, exist_ok = True)

        faiss.write_index(self.index, str(self.index_path))

        with open(self.meta_path, 'w', encoding = "utf-8") as f:
            json.dump(self.metadata, f, ensure_ascii = False, indent = 2)


    def load(self) -> None:
        """
        Load data from an external file.
        """
        self.index = faiss.read_index(str(self.index_path))

        with open(self.meta_path, 'r', encoding = "utf-8") as f:
            self.metadata = json.load(f)
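A minimal sketch of the VectorStore API (illustrative only): the random vectors are placeholders, and note that add() persists the index and metadata under the configured vector_store/ path:

# Store a few placeholder vectors and query them back (EMBEDDING_DIM = 384).
import numpy as np
from src.db.vector_store import VectorStore
from src.utils.config import EMBEDDING_DIM

store = VectorStore()
vectors = np.random.rand(3, EMBEDDING_DIM).astype("float32").tolist()
metadata = [{"id": f"demo_chunk{i}", "text": f"demo text {i}"} for i in range(3)]

store.add(vectors, metadata)                 # writes faiss_index.index and faiss_index.json
print(store.search(vectors[0], top_k = 2))   # nearest neighbours of the first vector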
src/modelling/__pycache__/embed.cpython-312.pyc
ADDED
Binary file (1.82 kB)
src/modelling/__pycache__/topic_model.cpython-312.pyc
ADDED
Binary file (4.55 kB)
src/modelling/__pycache__/transliterate.cpython-312.pyc
ADDED
Binary file (1.83 kB)
src/modelling/embed.py
ADDED
@@ -0,0 +1,31 @@
from src.utils.config import MINIDALALM_MODEL
from sentence_transformers import SentenceTransformer


class DalaEmbedder:
    """
    Simple wrapper for the MiniDalaLM embedding model.
    """
    def __init__(self, model_path: str = MINIDALALM_MODEL):
        self.model = SentenceTransformer(model_path)


    def embed_text(self, text: str) -> list[float]:
        """
        Embed a single string of text.
        """
        return self.model.encode(text, convert_to_numpy = True).tolist()


    def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """
        Embed a batch of text strings.
        """
        return self.model.encode(texts, convert_to_numpy = True)


    def get_model(self) -> SentenceTransformer:
        """
        Get function to enable access to the MiniDalaLM model.
        """
        return self.model
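A minimal usage sketch for DalaEmbedder (illustrative; downloads the crossroderick/minidalalm weights on first use):

# Embed a single string and a small batch.
from src.modelling.embed import DalaEmbedder

embedder = DalaEmbedder()

vec = embedder.embed_text("Qazaqstan tarihy")
batch = embedder.embed_batch(["Birinshi mysal", "Ekinshi mysal"])

print(len(vec))       # embedding dimensionality (384 for this MiniLM-sized model)
print(batch.shape)    # embed_batch returns a NumPy array of shape (n_texts, dim)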
src/modelling/topic_model.py
ADDED
@@ -0,0 +1,94 @@
import re
import plotly
from bertopic import BERTopic
from collections import Counter
from src.utils.data_utils import tokeniser
from src.modelling.embed import DalaEmbedder
from sklearn.feature_extraction.text import CountVectorizer
from src.utils.plotting import custom_topic_barchart, custom_umap_plot

from typing import Dict, List, Tuple


class TopicModeller:
    """
    Wrapper for topic modelling with BERTopic.
    """
    def __init__(self):
        # Custom vectoriser with stopword filtering (built lazily in fit)
        self.vectoriser_model = None
        self.model = None


    def _extract_dalat5_stopwords(self, texts: List[str], top_k: int = 75) -> List[str]:
        """
        Identify frequent tokens using DalaT5's tokeniser as proxy stopwords.
        """
        token_counter = Counter()

        for text in texts:
            token_ids = tokeniser.encode(text, add_special_tokens = False)
            token_counter.update(token_ids)

        most_common = token_counter.most_common(top_k)
        stop_tokens = [tokeniser.decode([tok_id]).strip() for tok_id, _ in most_common]

        return stop_tokens


    def _preprocess_texts(self, texts: List[str]) -> List[str]:
        """
        Lowercase texts, strip digits and collapse whitespace.
        """
        return [
            re.sub(r"\d+|\s+", " ", t.lower()).strip()
            for t in texts
        ]


    def fit(
        self,
        texts: List[str],
        embeddings: List[List[float]]
    ) -> Tuple[List[str], plotly.graph_objs.Figure, Dict[int, str], plotly.graph_objs.Figure]:
        """
        Fit BERTopic on preprocessed texts and given embeddings.
        Returns labelled topics, a topic bar chart, the topic label mapping, and a UMAP plot.
        """
        clean_texts = self._preprocess_texts(texts)

        # Leverage DalaT5's tokeniser for stopword acquisition
        stopwords = self._extract_dalat5_stopwords(clean_texts, top_k = 75)

        # Define vectoriser and model
        self.vectoriser_model = CountVectorizer(
            stop_words = stopwords,
            token_pattern = r"\b[a-zA-Z]+(?:-[a-zA-Z]+)?\b"
        )
        self.model = BERTopic(
            language = "multilingual",
            vectorizer_model = self.vectoriser_model,
            embedding_model = DalaEmbedder().get_model()
        )

        topics, _ = self.model.fit_transform(clean_texts, embeddings)

        # Generate labels
        topic_info = self.model.get_topic_info()
        topic_labels = {}

        for topic_id in topic_info.Topic.values:
            if topic_id == -1:
                topic_labels[topic_id] = '-'

                continue

            words = [word for word, _ in self.model.get_topic(topic_id)[:4]]
            label = "_".join(words)
            topic_labels[topic_id] = f"{topic_id}_{label}"

        fig = custom_topic_barchart(self.model, topic_labels)
        umap_fig = custom_umap_plot(embeddings, topics, topic_labels)
        labeled_topics = [topic_labels[t] for t in topics]

        return labeled_topics, fig, topic_labels, umap_fig
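A minimal sketch of fitting TopicModeller (illustrative): chunks is a hypothetical list of a few dozen transliterated strings, since BERTopic generally needs that many documents to form clusters:

# `chunks` is a hypothetical list of transliterated Kazakh-Latin text chunks.
from src.modelling.embed import DalaEmbedder
from src.modelling.topic_model import TopicModeller

embedder = DalaEmbedder()
embeddings = embedder.embed_batch(chunks)

modeller = TopicModeller()
labeled_topics, fig, topic_labels, umap_fig = modeller.fit(chunks, embeddings)

print(topic_labels)   # e.g. {-1: '-', 0: '0_word1_word2_word3_word4', ...}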
src/modelling/transliterate.py
ADDED
@@ -0,0 +1,29 @@
from transformers import pipeline
from src.utils.config import DALAT5_MODEL

from typing import List


class DalaTransliterator:
    """
    Simple wrapper for the DalaT5 transliterator model.
    """
    def __init__(self, model_name: str = DALAT5_MODEL):
        self.pipe = pipeline("text2text-generation", model = model_name)


    def transliterate(self, text: str, max_length: int = 128) -> str:
        """
        Transliterate a given text using DalaT5.
        """
        input_text = f"Cyrillic2Latin: {text.strip()}"
        result = self.pipe(input_text, max_length = max_length)

        return result[0]["generated_text"]


    def batch_transliterate(self, texts: list[str], max_length: int = 128) -> List[str]:
        """
        Perform batch transliteration using DalaT5.
        """
        return [self.transliterate(t, max_length) for t in texts]
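A minimal usage sketch for DalaTransliterator (illustrative; downloads the crossroderick/dalat5 weights on first use):

# Cyrillic-to-Latin transliteration for a single string and a small batch.
from src.modelling.transliterate import DalaTransliterator

translit = DalaTransliterator()

print(translit.transliterate("Қазақстан тарихы"))
print(translit.batch_transliterate(["Бірінші мысал", "Екінші мысал"]))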
src/utils/__pycache__/config.cpython-312.pyc
ADDED
Binary file (698 Bytes)
src/utils/__pycache__/data_utils.cpython-312.pyc
ADDED
Binary file (6.81 kB)
src/utils/__pycache__/ingest.cpython-312.pyc
ADDED
Binary file (2.98 kB)
src/utils/__pycache__/plotting.cpython-312.pyc
ADDED
Binary file (2.91 kB)
src/utils/config.py
ADDED
@@ -0,0 +1,19 @@
from pathlib import Path


# Model paths
DALAT5_MODEL = "crossroderick/dalat5"
MINIDALALM_MODEL = "crossroderick/minidalalm"

# Vector DB config
VECTOR_DB_PATH = Path("vector_store/faiss_index")
EMBEDDING_DIM = 384  # for MiniLM-based models

# Chunking
CHUNK_SIZE = 256
CHUNK_OVERLAP = 64

# File input/output
DOC_INPUT_DIR = Path("data/uploads")
DOC_OUTPUT_DIR = Path("data/processed")
TRANS_OUTPUT_DIR = Path("data/transliterated")
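With these defaults, consecutive chunks advance by CHUNK_SIZE - CHUNK_OVERLAP tokens; a small sketch of the effective stride:

# Effective stride between consecutive chunks under the defaults above.
from src.utils.config import CHUNK_SIZE, CHUNK_OVERLAP

stride = CHUNK_SIZE - CHUNK_OVERLAP
print(stride)   # 256 - 64 = 192 tokens per step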
src/utils/data_utils.py
ADDED
@@ -0,0 +1,164 @@
import re
import pdfplumber
import numpy as np
import pytesseract
from transformers import AutoTokenizer
from pdf2image import convert_from_path
from src.utils.config import DALAT5_MODEL, CHUNK_SIZE, CHUNK_OVERLAP

from typing import Any, List


# Load DalaT5's tokeniser
tokeniser = AutoTokenizer.from_pretrained(DALAT5_MODEL)


def extract_text_with_pdfplumber(file: Any) -> str:
    """
    Extract text by leveraging PDFPlumber, which is particularly useful for PDF files
    with tabular data.
    """
    if file.name.endswith(".pdf"):
        try:
            with pdfplumber.open(file.name) as pdf:
                texts = [page.extract_text() or "" for page in pdf.pages]

            return "\n".join(texts).strip()

        except Exception as e:
            print(f"[ERROR] PDFPlumber failed: {e}")

            return ""

    return ""


def extract_text_with_ocr(file: Any) -> str:
    """
    Extract text data by leveraging Tesseract.
    """
    if file.name.endswith(".pdf"):
        try:
            images = convert_from_path(file.name, dpi = 300)
            page_texts = []

            for img in images:
                raw = pytesseract.image_to_string(img, lang = "kaz+eng")

                # Clean page-by-page
                cleaned = repair_extracted_text(raw)

                page_texts.append(cleaned)

            return "\n".join(page_texts).strip()

        except Exception as e:
            print(f"[ERROR] OCR failed: {e}")

    return ""


def clean_text(text: str) -> str:
    """
    Pre-clean text before chunking.
    """
    # Collapse multiple newlines into a space
    text = re.sub(r"\n+", " ", text)

    # Normalize excessive punctuation
    text = re.sub(r"[^\w\s]{2,}", "", text)

    # Remove repeated punctuation or symbols
    text = re.sub(r"[•●–—―]+", " ", text)

    # Normalize extra spacing
    text = re.sub(r"\s{2,}", " ", text)

    return text.strip()


def is_valid_chunk(chunk: str) -> bool:
    """
    Heuristic to filter out low-quality chunks.
    """
    if len(chunk) < 20:
        return False

    symbols = sum(1 for c in chunk if not c.isalnum() and c != ' ')

    if symbols / len(chunk) > 0.4:
        return False

    return True


def deduplicate_chunks(chunks: List[str], embedder: Any, threshold: float = 0.95) -> List[str]:
    """
    Deduplicate chunks based on cosine similarity.
    Only retains semantically distinct segments.
    """
    unique_chunks = []
    seen_embeddings = []

    for chunk in chunks:
        emb = embedder.embed_text(chunk)

        if all(np.dot(emb, e) / (np.linalg.norm(emb) * np.linalg.norm(e)) < threshold for e in seen_embeddings):
            unique_chunks.append(chunk)
            seen_embeddings.append(emb)

    return unique_chunks


def chunk_text(text: str) -> List[str]:
    """
    Chunk text into overlapping token-based segments using DalaT5's tokeniser.
    """
    # Clean text before doing anything
    cleaned_text = clean_text(text)

    # Encode with the tokeniser
    tokens = tokeniser.encode(cleaned_text, add_special_tokens = False)
    total_tokens = len(tokens)

    if total_tokens <= CHUNK_SIZE:
        single_chunk = tokeniser.decode(tokens, skip_special_tokens = True).strip()

        return [single_chunk] if is_valid_chunk(single_chunk) else []

    chunks = []
    start = 0

    while start < total_tokens:
        end = min(start + CHUNK_SIZE, total_tokens)
        chunk_tokens = tokens[start:end]
        chunk = tokeniser.decode(chunk_tokens, skip_special_tokens = True).strip()

        if is_valid_chunk(chunk):
            chunks.append(chunk)

        start += CHUNK_SIZE - CHUNK_OVERLAP

    return chunks


def repair_extracted_text(text: str) -> str:
    """
    Additional logic to repair broken line splits, hyphenations, and common repetition artifacts.
    """
    # Remove immediately repeated words
    text = re.sub(r'\b(\w{4,})\s+\1\b', r'\1', text)

    # Fix hyphenation across line breaks
    text = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', text)

    # Remove long runs of one- and two-letter tokens
    text = re.sub(r'(\b\w{1,2}\b\s+){5,}', '', text)

    # Remove some previously observed junk
    text = re.sub(r'\b(Googsoft|Hoogsoft|biometriialyq|avtorometriia)\b', '', text)

    # Collapse multiple spaces
    text = re.sub(r'\s{2,}', ' ', text)

    return text.strip()
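A minimal sketch of the cleaning, chunking and deduplication helpers (illustrative; the raw string is a stand-in for extracted PDF/OCR text):

# Clean, chunk and deduplicate a raw text string.
from src.modelling.embed import DalaEmbedder
from src.utils.data_utils import chunk_text, deduplicate_chunks, repair_extracted_text

raw = "Qazaqstan tarihy turaly mátin. " * 10   # placeholder for extracted text
repaired = repair_extracted_text(raw)

chunks = chunk_text(repaired)
unique_chunks = deduplicate_chunks(chunks, DalaEmbedder(), threshold = 0.95)

print(len(chunks), len(unique_chunks))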
src/utils/ingest.py
ADDED
@@ -0,0 +1,77 @@
import uuid
from pathlib import Path
from src.utils.config import DOC_INPUT_DIR, TRANS_OUTPUT_DIR

from src.utils.data_utils import chunk_text
from src.db.vector_store import VectorStore
from src.modelling.embed import DalaEmbedder
from src.modelling.transliterate import DalaTransliterator


def load_documents(input_dir: Path) -> list[tuple[str, str]]:
    """
    Loads all .txt documents from input_dir. Returns a list of
    tuples: (filename, content)
    """
    docs = []

    for file in input_dir.glob("*.txt"):
        with open(file, 'r', encoding = "utf-8") as f:
            text = f.read()

        docs.append((file.stem, text))

    return docs


def process_documents() -> None:
    """
    Main processing procedure.
    """
    # Components
    transliterator = DalaTransliterator()
    embedder = DalaEmbedder()
    vector_store = VectorStore()

    docs = load_documents(DOC_INPUT_DIR)
    all_chunks = []
    all_transliterated = []
    all_metadata = []

    for doc_id, text in docs:
        # Chunk the data
        chunks = chunk_text(text)

        all_chunks.extend(chunks)

        # Transliterate chunks
        translit_chunks = transliterator.batch_transliterate(chunks)

        all_transliterated.extend(translit_chunks)

        # Save transliterated version
        output_path = TRANS_OUTPUT_DIR / f"{doc_id}_transliterated.txt"

        with open(output_path, 'w', encoding = 'utf-8') as f:
            f.write("\n\n".join(translit_chunks))

        # Create metadata entries
        for i, chunk in enumerate(translit_chunks):
            meta = {
                "id": f"{doc_id}_{i}_{uuid.uuid4().hex[:6]}",
                "text": chunk
            }

            all_metadata.append(meta)

    # Embed all chunks
    embeddings = embedder.embed_batch(all_transliterated)

    # Add to vector DB
    vector_store.add(embeddings, all_metadata)

    print(f"[INFO] Successfully ingested {len(all_chunks)} chunks.")


if __name__ == "__main__":
    process_documents()
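A minimal sketch of running the batch ingestion (illustrative), assuming data/uploads/ contains .txt files and the data/transliterated/ directory exists:

# Batch-ingest local .txt documents into the persisted vector store.
from src.utils.ingest import process_documents

process_documents()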
src/utils/plotting.py
ADDED
@@ -0,0 +1,71 @@
import plotly
import pandas as pd
from umap import UMAP
import plotly.express as px
from bertopic import BERTopic

from typing import Dict, List


def custom_topic_barchart(model: BERTopic, topic_labels: Dict[int, str], top_n_topics: int = 10, n_words: int = 10) -> plotly.graph_objs.Figure:
    """
    Create a custom horizontal bar chart of top topics using plotly.express.
    """
    data = []

    for topic_id, label in topic_labels.items():
        if topic_id == -1:
            continue

        for word, score in model.get_topic(topic_id)[:n_words]:
            data.append({"Topic": label, "Word": word, "Score": score})

    df = pd.DataFrame(data)

    fig = px.bar(
        df,
        x = "Score",
        y = "Word",
        color = "Topic",
        orientation = 'h',
        barmode = "group",
        #height = 500,
    )

    fig.update_layout(
        margin = dict(l = 40, r = 20, t = 40, b = 20),
        yaxis = dict(title = ""),
        xaxis = dict(title = "Relevance"),
        legend_title_text = "Topic",
    )

    return fig


def custom_umap_plot(embeddings: List[List[float]], topics: List[int], topic_labels: Dict[int, str]) -> plotly.graph_objs.Figure:
    """
    Custom UMAP plotting to work better with the Gradio layout.
    """
    reducer = UMAP(n_neighbors = 15, min_dist = 0.1, metric = "cosine", random_state = 42)
    umap_coords = reducer.fit_transform(embeddings)

    df = pd.DataFrame(umap_coords, columns = ["x", "y"])
    df["topic"] = topics
    df["label"] = [topic_labels[t] for t in topics]

    # Filter out topic -1 (noise)
    df = df[df["topic"] != -1]

    fig = px.scatter(
        df,
        x = 'x',
        y = 'y',
        color = "label",
        labels = {"label": "Topic"},
        #height = 500
    )

    fig.update_layout(margin = dict(l = 20, r = 20, t = 40, b = 20))

    return fig
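A minimal sketch of custom_umap_plot (illustrative; the random embeddings and topic labels are placeholders, so the resulting layout is meaningless and only shows the wiring):

# Render the custom UMAP scatter for a few synthetic embeddings.
import numpy as np
from src.utils.plotting import custom_umap_plot

embeddings = np.random.rand(30, 384).tolist()        # placeholder vectors
topics = [i % 3 for i in range(30)]                   # placeholder topic assignments
topic_labels = {0: "0_tarih", 1: "1_ekonomika", 2: "2_madeniet"}

fig = custom_umap_plot(embeddings, topics, topic_labels)
fig.show()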
vector_store/faiss_index.index
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:25004d0d5df0be08b29e41af806fefc2215d37f215c08fdd5b8ce16484ee83fc
size 175149
vector_store/faiss_index.json
ADDED
The diff for this file is too large to render.