crossroderick committed on
Commit 0eb636f · 1 Parent(s): e6f8cc6

Added all files

.gitattributes CHANGED
@@ -32,4 +32,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.xz filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
+ vector_store/**/* filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__/**
+ src/__pycache__/**
README.md CHANGED
@@ -1,5 +1,5 @@
  ---
- title: Semanticdala
+ title: SemanticDala
  emoji: 💻
  colorFrom: pink
  colorTo: green
app.py ADDED
@@ -0,0 +1,143 @@
+ import gradio as gr
+ from src.modelling.embed import DalaEmbedder
+ from src.db.vector_store import VectorStore
+ from src.modelling.topic_model import TopicModeller
+ from src.modelling.transliterate import DalaTransliterator
+ from src.utils.data_utils import (
+     extract_text_with_pdfplumber,
+     extract_text_with_ocr,
+     chunk_text,
+     deduplicate_chunks,
+     repair_extracted_text
+ )
+
+ from typing import Any, List, Tuple
+
+
+ # Instantiate components
+ translit = DalaTransliterator()
+ embedder = DalaEmbedder()
+ vector_db = VectorStore()
+ topic_modeller = TopicModeller()
+
+
+ def extract_text(file: Any) -> str:
+     """
+     Try multiple PDF extraction strategies, with a fallback to OCR if necessary.
+     """
+     if file.name.endswith(".pdf"):
+         text = extract_text_with_pdfplumber(file)
+
+         if len(text.strip()) > 100:
+             return repair_extracted_text(text)
+
+         print("[INFO] Falling back to OCR...")
+
+         return extract_text_with_ocr(file)
+
+     elif file.name.endswith(".txt"):
+         return repair_extracted_text(file.read().decode("utf-8", errors = "ignore"))
+
+     return ""
+
+
+ def process_file(file: Any) -> Tuple[List[Tuple[str, str]], Any, List[List[Any]], Any]:
+     """
+     Main file processing function, which chunks, transliterates and clusters
+     the file contents, and plots the resulting clusters.
+     """
+     raw_text = extract_text(file)
+     chunks = chunk_text(raw_text)
+
+     # Transliterate, deduplicate and embed the chunks
+     translits = translit.batch_transliterate(chunks)
+     dedup_translits = deduplicate_chunks(translits, embedder)
+     embeddings = embedder.embed_batch(dedup_translits)
+
+     # Clear previous entries before adding
+     vector_db.index.reset()
+     vector_db.metadata = []
+
+     metadata = [{"id": f"{file.name}_chunk{i}", "text": t} for i, t in enumerate(dedup_translits)]
+
+     vector_db.add(embeddings, metadata)
+
+     # Topic modelling on the deduplicated chunks, so the texts stay aligned with the embeddings
+     topics, fig, topic_labels, umap_fig = topic_modeller.fit(dedup_translits, embeddings)
+
+     # Get a list of rows for topic labels
+     overview_table = [[k, v] for k, v in topic_labels.items()]
+
+     # Zip the transliterated text back together with its topic IDs
+     annotated = list(zip(dedup_translits, topics))
+
+     return annotated, fig, overview_table, umap_fig
+
+
+ def search_text(query: str) -> str:
+     """
+     Search for a given query in the vector DB.
+     """
+     query_emb = embedder.embed_text(query)
+     results = vector_db.search(query_emb, top_k = 5)
+
+     return "\n\n".join(f"[{r['id']}]: {r['text']}" for r in results)
+
+
+ # Custom CSS
+ page_css = """
+ p {
+     font-size: 18px;
+ }
+
+ .lang_btn {
+     width: 5%;
+ }
+ """
+
+ # Gradio UI
+ with gr.Blocks(css = page_css) as demo:
+     title_html = gr.HTML("<center><h1>🇰🇿 SemanticDala</h1><h2>Қазақтың семантикалық платформасы</h2><h3>Kazakh Semantic Platform</h3></center>")
+
+     with gr.Tab("📁 Жүктеп салу және өңдеу / Upload and Process"):
+         with gr.Row():
+             file_input = gr.File(label = "PDF немесе TXT жүктеңіз / Upload PDF or TXT", file_types = [".pdf", ".txt"])
+             process_btn = gr.Button("Процесс файлы / Process File", scale = 1)
+
+         translit_output = gr.Dataframe(
+             headers = ["Мәтін / Text", "Тақырып идентификаторы / Topic ID"],
+             label = "Транслитерацияланған үзінділер + Тақырыптар / Transliterated Chunks + Topics"
+         )
+
+         topic_label_table = gr.Dataframe(
+             headers = ["Тақырып идентификаторы / Topic ID", "Белгі / Label"],
+             label = "Тақырып белгілері / Topic Labels"
+         )
+
+         with gr.Row(equal_height = True):
+             with gr.Column(scale = 1):
+                 plot_output = gr.Plot(label = "Негізгі тақырыптар / Top Topics")
+
+             with gr.Column(scale = 1):
+                 umap_output = gr.Plot(label = "UMAP проекциясы / UMAP Topic Projection")
+
+     with gr.Tab("🔍 Семантикалық іздеу / Semantic Search"):
+         with gr.Row():
+             search_box = gr.Textbox(label = "Сұрау / Query", placeholder = "мысалы / e.g., Qazaqstan tarihy", lines = 1, scale = 5)
+             search_btn = gr.Button("Іздеу / Search", scale = 1)
+
+         search_results = gr.Textbox(label = "Нәтижелер / Top Results", lines = 6, interactive = False)
+
+     # Bind callbacks
+     process_btn.click(
+         fn = process_file,
+         inputs = file_input,
+         outputs = [translit_output, plot_output, topic_label_table, umap_output]
+     )
+
+     search_btn.click(fn = search_text, inputs = search_box, outputs = search_results)
+
+
+ # Launch
+ if __name__ == "__main__":
+     demo.launch()
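
For quick testing outside the Gradio UI, a minimal sketch of the same flow; it assumes the model checkpoints download, and "sample.pdf" plus the tiny LocalFile shim are illustrative stand-ins (process_file only touches the file's .name attribute on the PDF path):

    from app import process_file, search_text

    class LocalFile:
        # Stand-in for the Gradio file object; only .name is used for PDFs
        def __init__(self, path: str):
            self.name = path

    annotated, fig, overview, umap_fig = process_file(LocalFile("sample.pdf"))
    print(overview[:3])                       # first few [topic ID, label] rows
    print(search_text("Qazaqstan tarihy"))    # searches the freshly built index
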
src/__init__.py ADDED
@@ -0,0 +1,8 @@
+ from src.utils.config import *
+ from src.utils.ingest import *
+ from src.utils.plotting import *
+ from src.utils.data_utils import *
+ from src.modelling.embed import *
+ from src.modelling.topic_model import *
+ from src.modelling.transliterate import *
+ from src.db.vector_store import *
src/db/__pycache__/vector_store.cpython-312.pyc ADDED
Binary file (4.31 kB).
src/db/search.py ADDED
@@ -0,0 +1,23 @@
+ from src.db.vector_store import VectorStore
+ from src.modelling.embed import DalaEmbedder
+
+ from typing import List
+
+
+ class SemanticSearcher:
+     """
+     Perform semantic search over embedded Kazakh text.
+     """
+     def __init__(self):
+         self.embedder = DalaEmbedder()
+         self.vector_store = VectorStore()
+
+
+     def search(self, query: str, top_k: int = 5) -> List[dict]:
+         """
+         Embed the query and retrieve the most relevant chunks.
+         """
+         query_embedding = self.embedder.embed_text(query)
+         results = self.vector_store.search(query_embedding, top_k = top_k)
+
+         return results
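
A usage sketch, assuming the FAISS index under vector_store/faiss_index has already been populated (for example by src/utils/ingest.py); the query string is only an illustration:

    from src.db.search import SemanticSearcher

    searcher = SemanticSearcher()

    # Each hit is a metadata dict with "id" and "text" keys
    for hit in searcher.search("Qazaqstan tarihy", top_k = 3):
        print(hit["id"], "->", hit["text"][:80])
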
src/db/vector_store.py ADDED
@@ -0,0 +1,72 @@
+ import faiss
+ import json
+ import numpy as np
+ from pathlib import Path
+ from src.utils.config import VECTOR_DB_PATH, EMBEDDING_DIM
+
+ from typing import List
+
+
+ class VectorStore:
+     """
+     Wrapper for FAISS vector storage, with ID-to-text mapping.
+     """
+     def __init__(self, index_path: Path = VECTOR_DB_PATH):
+         self.index_path = index_path.with_suffix(".index")
+         self.meta_path = index_path.with_suffix(".json")
+
+         self.index = faiss.IndexFlatL2(EMBEDDING_DIM)
+         self.metadata = []    # list of dicts: {"id": str, "text": str}
+
+         # Try loading if the files exist
+         if self.index_path.exists() and self.meta_path.exists():
+             try:
+                 self.load()
+
+             except Exception as e:
+                 print(f"[WARN] Failed to load vector store: {e}")
+
+                 # Reinitialize clean if corrupted
+                 self.index = faiss.IndexFlatL2(EMBEDDING_DIM)
+                 self.metadata = []
+
+
+     def add(self, embeddings: list[list[float]], metadata: List[dict]) -> None:
+         """
+         Add new embeddings and their metadata (e.g., {"id": "doc1_chunk0", "text": "..."}).
+         """
+         self.index.add(np.array(embeddings).astype("float32"))
+         self.metadata.extend(metadata)
+         self.save()
+
+
+     def search(self, query_embedding: list[float], top_k: int = 5) -> List[dict]:
+         """
+         Perform vector search and return the metadata of the top_k results.
+         """
+         D, I = self.index.search(np.array([query_embedding]).astype("float32"), top_k)
+
+         # FAISS pads missing results with -1, so filter those out
+         return [self.metadata[i] for i in I[0] if i != -1]
+
+
+     def save(self) -> None:
+         """
+         Save data to an external file.
+         """
+         self.index_path.parent.mkdir(parents = True, exist_ok = True)
+
+         faiss.write_index(self.index, str(self.index_path))
+
+         with open(self.meta_path, 'w', encoding = "utf-8") as f:
+             json.dump(self.metadata, f, ensure_ascii = False, indent = 2)
+
+
+     def load(self) -> None:
+         """
+         Load data from an external file.
+         """
+         self.index = faiss.read_index(str(self.index_path))
+
+         with open(self.meta_path, 'r', encoding = "utf-8") as f:
+             self.metadata = json.load(f)
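
A minimal standalone sketch of the store, assuming a throwaway index path and random 384-dimensional vectors in place of real embeddings:

    import numpy as np
    from pathlib import Path
    from src.db.vector_store import VectorStore

    store = VectorStore(index_path = Path("/tmp/demo_index"))   # scratch location

    # Two fake embeddings matching EMBEDDING_DIM, plus their metadata
    fake = np.random.rand(2, 384).astype("float32")
    store.add(fake, [{"id": "demo_0", "text": "first chunk"},
                     {"id": "demo_1", "text": "second chunk"}])

    # Querying with the first vector should return its own metadata
    print(store.search(fake[0].tolist(), top_k = 1))
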
src/modelling/__pycache__/embed.cpython-312.pyc ADDED
Binary file (1.82 kB).
src/modelling/__pycache__/topic_model.cpython-312.pyc ADDED
Binary file (4.55 kB).
src/modelling/__pycache__/transliterate.cpython-312.pyc ADDED
Binary file (1.83 kB).
src/modelling/embed.py ADDED
@@ -0,0 +1,31 @@
+ import numpy as np
+ from src.utils.config import MINIDALALM_MODEL
+ from sentence_transformers import SentenceTransformer
+
+
+ class DalaEmbedder:
+     """
+     Simple wrapper for the MiniDalaLM embedding model.
+     """
+     def __init__(self, model_path: str = MINIDALALM_MODEL):
+         self.model = SentenceTransformer(model_path)
+
+
+     def embed_text(self, text: str) -> list[float]:
+         """
+         Embed a single string of text.
+         """
+         return self.model.encode(text, convert_to_numpy = True).tolist()
+
+
+     def embed_batch(self, texts: list[str]) -> np.ndarray:
+         """
+         Embed a batch of text strings, returning an array of shape (n_texts, embedding_dim).
+         """
+         return self.model.encode(texts, convert_to_numpy = True)
+
+
+     def get_model(self) -> SentenceTransformer:
+         """
+         Getter to enable access to the underlying MiniDalaLM model.
+         """
+         return self.model
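
A small usage sketch, assuming the crossroderick/minidalalm checkpoint is reachable and produces 384-dimensional vectors (the value EMBEDDING_DIM expects):

    from src.modelling.embed import DalaEmbedder

    embedder = DalaEmbedder()

    vec = embedder.embed_text("Qazaq tili")          # plain list of floats
    batch = embedder.embed_batch(["bir", "eki"])     # numpy array, shape (2, dim)

    print(len(vec), batch.shape)
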
src/modelling/topic_model.py ADDED
@@ -0,0 +1,94 @@
+ import re
+ import plotly
+ from bertopic import BERTopic
+ from collections import Counter
+ from src.utils.data_utils import tokeniser
+ from src.modelling.embed import DalaEmbedder
+ from sklearn.feature_extraction.text import CountVectorizer
+ from src.utils.plotting import custom_topic_barchart, custom_umap_plot
+
+ from typing import Dict, List, Tuple
+
+
+ class TopicModeller:
+     """
+     Wrapper for topic modelling with BERTopic.
+     """
+     def __init__(self):
+         # Vectoriser (with stopword filtering) and model are built lazily in fit()
+         self.vectoriser_model = None
+         self.model = None
+
+
+     def _extract_dalat5_stopwords(self, texts: List[str], top_k: int = 75) -> List[str]:
+         """
+         Identify frequent tokens using DalaT5's tokeniser as proxy stopwords.
+         """
+         token_counter = Counter()
+
+         for text in texts:
+             token_ids = tokeniser.encode(text, add_special_tokens = False)
+             token_counter.update(token_ids)
+
+         most_common = token_counter.most_common(top_k)
+         stop_tokens = [tokeniser.decode([tok_id]).strip() for tok_id, _ in most_common]
+
+         return stop_tokens
+
+
+     def _preprocess_texts(self, texts: List[str]) -> List[str]:
+         """
+         Lowercase texts, strip digits and collapse whitespace.
+         """
+         return [
+             re.sub(r"\d+|\s+", " ", t.lower()).strip()
+             for t in texts
+         ]
+
+
+     def fit(
+         self,
+         texts: List[str],
+         embeddings: List[List[float]]
+     ) -> Tuple[List[str], plotly.graph_objs.Figure, Dict[int, str], plotly.graph_objs.Figure]:
+         """
+         Fit BERTopic on preprocessed texts and the given embeddings.
+         Returns labelled topics, a topic bar chart, the topic label mapping and a UMAP plot.
+         """
+         clean_texts = self._preprocess_texts(texts)
+
+         # Leverage DalaT5's tokeniser for stopword acquisition
+         stopwords = self._extract_dalat5_stopwords(clean_texts, top_k = 75)
+
+         # Define vectoriser and model
+         self.vectoriser_model = CountVectorizer(
+             stop_words = stopwords,
+             token_pattern = r"\b[a-zA-Z]+(?:-[a-zA-Z]+)?\b"
+         )
+         self.model = BERTopic(
+             language = "multilingual",
+             vectorizer_model = self.vectoriser_model,
+             embedding_model = DalaEmbedder().get_model()
+         )
+
+         topics, _ = self.model.fit_transform(clean_texts, embeddings)
+
+         # Generate labels
+         topic_info = self.model.get_topic_info()
+         topic_labels = {}
+
+         for topic_id in topic_info.Topic.values:
+             if topic_id == -1:
+                 topic_labels[topic_id] = '-'
+
+                 continue
+
+             words = [word for word, _ in self.model.get_topic(topic_id)[:4]]
+             label = "_".join(words)
+             topic_labels[topic_id] = f"{topic_id}_{label}"
+
+         fig = custom_topic_barchart(self.model, topic_labels)
+         umap_fig = custom_umap_plot(embeddings, topics, topic_labels)
+         labeled_topics = [topic_labels[t] for t in topics]
+
+         return labeled_topics, fig, topic_labels, umap_fig
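
A usage sketch under the assumption that a few dozen transliterated chunks are already available (BERTopic's clustering step rarely forms topics from only a handful of texts); load_transliterated_chunks is a hypothetical helper, not part of this commit:

    from src.modelling.embed import DalaEmbedder
    from src.modelling.topic_model import TopicModeller

    chunks = load_transliterated_chunks()            # hypothetical helper
    embeddings = DalaEmbedder().embed_batch(chunks)

    modeller = TopicModeller()
    labeled_topics, bar_fig, topic_labels, umap_fig = modeller.fit(chunks, embeddings)

    print(topic_labels)    # e.g. {-1: '-', 0: '0_word1_word2_word3_word4', ...}
    bar_fig.show()
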
src/modelling/transliterate.py ADDED
@@ -0,0 +1,29 @@
+ from transformers import pipeline
+ from src.utils.config import DALAT5_MODEL
+
+ from typing import List
+
+
+ class DalaTransliterator:
+     """
+     Simple wrapper for the DalaT5 transliterator model.
+     """
+     def __init__(self, model_name: str = DALAT5_MODEL):
+         self.pipe = pipeline("text2text-generation", model = model_name)
+
+
+     def transliterate(self, text: str, max_length: int = 128) -> str:
+         """
+         Transliterate a given text using DalaT5.
+         """
+         input_text = f"Cyrillic2Latin: {text.strip()}"
+         result = self.pipe(input_text, max_length = max_length)
+
+         return result[0]["generated_text"]
+
+
+     def batch_transliterate(self, texts: list[str], max_length: int = 128) -> List[str]:
+         """
+         Perform batch transliteration using DalaT5.
+         """
+         return [self.transliterate(t, max_length) for t in texts]
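
A quick usage sketch, assuming the crossroderick/dalat5 checkpoint downloads; the exact Latin output depends on the model, so none is shown here:

    from src.modelling.transliterate import DalaTransliterator

    translit = DalaTransliterator()

    print(translit.transliterate("Қазақстан тарихы"))
    print(translit.batch_transliterate(["бір", "екі", "үш"]))
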
src/utils/__pycache__/config.cpython-312.pyc ADDED
Binary file (698 Bytes).
src/utils/__pycache__/data_utils.cpython-312.pyc ADDED
Binary file (6.81 kB).
src/utils/__pycache__/ingest.cpython-312.pyc ADDED
Binary file (2.98 kB).
src/utils/__pycache__/plotting.cpython-312.pyc ADDED
Binary file (2.91 kB).
src/utils/config.py ADDED
@@ -0,0 +1,19 @@
+ from pathlib import Path
+
+
+ # Model paths
+ DALAT5_MODEL = "crossroderick/dalat5"
+ MINIDALALM_MODEL = "crossroderick/minidalalm"
+
+ # Vector DB config
+ VECTOR_DB_PATH = Path("vector_store/faiss_index")
+ EMBEDDING_DIM = 384    # for MiniLM-based models
+
+ # Chunking
+ CHUNK_SIZE = 256
+ CHUNK_OVERLAP = 64
+
+ # File input/output
+ DOC_INPUT_DIR = Path("data/uploads")
+ DOC_OUTPUT_DIR = Path("data/processed")
+ TRANS_OUTPUT_DIR = Path("data/transliterated")
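
One note on EMBEDDING_DIM: it has to match whatever MiniDalaLM actually emits, since VectorStore builds its IndexFlatL2 from this constant. A quick sanity-check sketch, assuming the checkpoint downloads:

    from src.utils.config import EMBEDDING_DIM
    from src.modelling.embed import DalaEmbedder

    # embed_text returns a plain list of floats, so its length is the model dimension
    dim = len(DalaEmbedder().embed_text("sanity check"))
    assert dim == EMBEDDING_DIM, f"EMBEDDING_DIM is {EMBEDDING_DIM}, model emits {dim}"
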
src/utils/data_utils.py ADDED
@@ -0,0 +1,164 @@
+ import re
+ import pdfplumber
+ import numpy as np
+ import pytesseract
+ from transformers import AutoTokenizer
+ from pdf2image import convert_from_path
+ from src.utils.config import DALAT5_MODEL, CHUNK_SIZE, CHUNK_OVERLAP
+
+ from typing import Any, List
+
+
+ # Load DalaT5's tokeniser
+ tokeniser = AutoTokenizer.from_pretrained(DALAT5_MODEL)
+
+
+ def extract_text_with_pdfplumber(file: Any) -> str:
+     """
+     Extract text by leveraging PDFPlumber, which is particularly useful for PDF files
+     with tabular data.
+     """
+     if file.name.endswith(".pdf"):
+         try:
+             with pdfplumber.open(file.name) as pdf:
+                 texts = [page.extract_text() or "" for page in pdf.pages]
+
+             return "\n".join(texts).strip()
+
+         except Exception as e:
+             print(f"[ERROR] PDFPlumber failed: {e}")
+
+             return ""
+
+     return ""
+
+
+ def extract_text_with_ocr(file: Any) -> str:
+     """
+     Extract text data by leveraging Tesseract.
+     """
+     if file.name.endswith(".pdf"):
+         try:
+             images = convert_from_path(file.name, dpi = 300)
+             page_texts = []
+
+             for img in images:
+                 raw = pytesseract.image_to_string(img, lang = "kaz+eng")
+
+                 # Clean page-by-page
+                 cleaned = repair_extracted_text(raw)
+
+                 page_texts.append(cleaned)
+
+             return "\n".join(page_texts).strip()
+
+         except Exception as e:
+             print(f"[ERROR] OCR failed: {e}")
+
+             return ""
+
+     return ""
+
+
+ def clean_text(text: str) -> str:
+     """
+     Pre-clean text before chunking.
+     """
+     # Collapse multiple newlines into a space
+     text = re.sub(r"\n+", " ", text)
+
+     # Remove runs of two or more punctuation/symbol characters
+     text = re.sub(r"[^\w\s]{2,}", "", text)
+
+     # Remove bullet and dash symbols
+     text = re.sub(r"[•●–—―]+", " ", text)
+
+     # Normalise extra spacing
+     text = re.sub(r"\s{2,}", " ", text)
+
+     return text.strip()
+
+
+ def is_valid_chunk(chunk: str) -> bool:
+     """
+     Heuristic to filter out low-quality chunks.
+     """
+     if len(chunk) < 20:
+         return False
+
+     symbols = sum(1 for c in chunk if not c.isalnum() and c != ' ')
+
+     if symbols / len(chunk) > 0.4:
+         return False
+
+     return True
+
+
+ def deduplicate_chunks(chunks: List[str], embedder: Any, threshold: float = 0.95) -> List[str]:
+     """
+     Deduplicate chunks based on cosine similarity.
+     Only retains semantically distinct segments.
+     """
+     unique_chunks = []
+     seen_embeddings = []
+
+     for chunk in chunks:
+         emb = embedder.embed_text(chunk)
+
+         if all(np.dot(emb, e) / (np.linalg.norm(emb) * np.linalg.norm(e)) < threshold for e in seen_embeddings):
+             unique_chunks.append(chunk)
+             seen_embeddings.append(emb)
+
+     return unique_chunks
+
+
+ def chunk_text(text: str) -> List[str]:
+     """
+     Chunk text into overlapping token-based segments using DalaT5's tokeniser.
+     """
+     # Clean text before doing anything
+     cleaned_text = clean_text(text)
+
+     # Encode with the tokeniser
+     tokens = tokeniser.encode(cleaned_text, add_special_tokens = False)
+     total_tokens = len(tokens)
+
+     if total_tokens <= CHUNK_SIZE:
+         single_chunk = tokeniser.decode(tokens, skip_special_tokens = True).strip()
+
+         return [single_chunk] if is_valid_chunk(single_chunk) else []
+
+     chunks = []
+     start = 0
+
+     while start < total_tokens:
+         end = min(start + CHUNK_SIZE, total_tokens)
+         chunk_tokens = tokens[start:end]
+         chunk = tokeniser.decode(chunk_tokens, skip_special_tokens = True).strip()
+
+         if is_valid_chunk(chunk):
+             chunks.append(chunk)
+
+         start += CHUNK_SIZE - CHUNK_OVERLAP
+
+     return chunks
+
+
+ def repair_extracted_text(text: str) -> str:
+     """
+     Additional logic to repair broken line splits, hyphenation, and common repetition artifacts.
+     """
+     # Collapse immediately repeated words (4+ characters)
+     text = re.sub(r'\b(\w{4,})\s+\1\b', r'\1', text)
+
+     # Fix hyphenation across line breaks
+     text = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', text)
+
+     # Remove long runs of one- or two-letter tokens (usually OCR noise)
+     text = re.sub(r'(\b\w{1,2}\b\s+){5,}', '', text)
+
+     # Remove some previously observed junk
+     text = re.sub(r'\b(Googsoft|Hoogsoft|biometriialyq|avtorometriia)\b', '', text)
+
+     # Collapse multiple spaces
+     text = re.sub(r'\s{2,}', ' ', text)
+
+     return text.strip()
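
A toy sketch of the chunking and deduplication path (a repetitive string stands in for real document text, so most overlapping windows collapse into a single chunk):

    from src.modelling.embed import DalaEmbedder
    from src.utils.data_utils import chunk_text, deduplicate_chunks

    raw = "Qazaqstan tarihy. " * 400               # long enough for several 256-token windows
    chunks = chunk_text(raw)

    embedder = DalaEmbedder()
    unique = deduplicate_chunks(chunks, embedder, threshold = 0.95)

    print(len(chunks), "->", len(unique))
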
src/utils/ingest.py ADDED
@@ -0,0 +1,77 @@
+ import uuid
+ from pathlib import Path
+ from src.utils.config import DOC_INPUT_DIR, TRANS_OUTPUT_DIR
+
+ from src.utils.data_utils import chunk_text
+ from src.db.vector_store import VectorStore
+ from src.modelling.embed import DalaEmbedder
+ from src.modelling.transliterate import DalaTransliterator
+
+
+ def load_documents(input_dir: Path) -> list[tuple[str, str]]:
+     """
+     Load all .txt documents from input_dir. Returns a list of
+     (filename, content) tuples.
+     """
+     docs = []
+
+     for file in input_dir.glob("*.txt"):
+         with open(file, 'r', encoding = "utf-8") as f:
+             text = f.read()
+
+         docs.append((file.stem, text))
+
+     return docs
+
+
+ def process_documents() -> None:
+     """
+     Main processing procedure.
+     """
+     # Components
+     transliterator = DalaTransliterator()
+     embedder = DalaEmbedder()
+     vector_store = VectorStore()
+
+     # Make sure the transliteration output directory exists
+     TRANS_OUTPUT_DIR.mkdir(parents = True, exist_ok = True)
+
+     docs = load_documents(DOC_INPUT_DIR)
+     all_chunks = []
+     all_transliterated = []
+     all_metadata = []
+
+     for doc_id, text in docs:
+         # Chunk the data
+         chunks = chunk_text(text)
+
+         all_chunks.extend(chunks)
+
+         # Transliterate chunks
+         translit_chunks = transliterator.batch_transliterate(chunks)
+
+         all_transliterated.extend(translit_chunks)
+
+         # Save transliterated version
+         output_path = TRANS_OUTPUT_DIR / f"{doc_id}_transliterated.txt"
+
+         with open(output_path, 'w', encoding = "utf-8") as f:
+             f.write("\n\n".join(translit_chunks))
+
+         # Create metadata entries
+         for i, chunk in enumerate(translit_chunks):
+             meta = {
+                 "id": f"{doc_id}_{i}_{uuid.uuid4().hex[:6]}",
+                 "text": chunk
+             }
+
+             all_metadata.append(meta)
+
+     # Embed all chunks
+     embeddings = embedder.embed_batch(all_transliterated)
+
+     # Add to vector DB
+     vector_store.add(embeddings, all_metadata)
+
+     print(f"[INFO] Successfully ingested {len(all_chunks)} chunks.")
+
+
+ if __name__ == "__main__":
+     process_documents()
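
To run the batch ingestion, a sketch assuming the working directory is the repository root (so the src imports and the relative paths in src/utils/config.py resolve) and that data/uploads already contains .txt files:

    from src.utils.ingest import process_documents

    # Reads data/uploads/*.txt, writes transliterations to data/transliterated/
    # and persists embeddings plus metadata under vector_store/
    process_documents()
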
src/utils/plotting.py ADDED
@@ -0,0 +1,71 @@
+ import plotly
+ import pandas as pd
+ from umap import UMAP
+ import plotly.express as px
+ from bertopic import BERTopic
+
+ from typing import Dict, List
+
+
+ def custom_topic_barchart(model: BERTopic, topic_labels: Dict[int, str], top_n_topics: int = 10, n_words: int = 10) -> plotly.graph_objs.Figure:
+     """
+     Create a custom horizontal bar chart of top topics using plotly.express.
+     """
+     data = []
+
+     for topic_id, label in topic_labels.items():
+         if topic_id == -1:
+             continue
+
+         for word, score in model.get_topic(topic_id)[:n_words]:
+             data.append({"Topic": label, "Word": word, "Score": score})
+
+     df = pd.DataFrame(data)
+
+     fig = px.bar(
+         df,
+         x = "Score",
+         y = "Word",
+         color = "Topic",
+         orientation = 'h',
+         barmode = "group",
+         #height = 500,
+     )
+
+     fig.update_layout(
+         margin = dict(l = 40, r = 20, t = 40, b = 20),
+         yaxis = dict(title = ""),
+         xaxis = dict(title = "Relevance"),
+         legend_title_text = "Topic",
+     )
+
+     return fig
+
+
+ def custom_umap_plot(embeddings: List[List[float]], topics: List[int], topic_labels: Dict[int, str]) -> plotly.graph_objs.Figure:
+     """
+     Custom UMAP plotting to work better with the Gradio layout.
+     """
+     reducer = UMAP(n_neighbors = 15, min_dist = 0.1, metric = "cosine", random_state = 42)
+     umap_coords = reducer.fit_transform(embeddings)
+
+     df = pd.DataFrame(umap_coords, columns = ["x", "y"])
+     df["topic"] = topics
+     df["label"] = [topic_labels[t] for t in topics]
+
+     # Filter out topic -1 (noise)
+     df = df[df["topic"] != -1]
+
+     fig = px.scatter(
+         df,
+         x = 'x',
+         y = 'y',
+         color = "label",
+         labels = {"label": "Topic"},
+         #height = 500
+     )
+
+     fig.update_layout(margin = dict(l = 20, r = 20, t = 40, b = 20))
+
+     return fig
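
Both helpers return plain Plotly figures, so they can be previewed without Gradio; a sketch with fake embeddings and two invented topics:

    import numpy as np
    from src.utils.plotting import custom_umap_plot

    fake_embeddings = np.random.rand(30, 384)        # 30 fake 384-dim vectors
    topics = [0] * 15 + [1] * 15
    labels = {0: "0_demo_topic_a", 1: "1_demo_topic_b"}

    fig = custom_umap_plot(fake_embeddings, topics, labels)
    fig.write_html("umap_demo.html")                 # or fig.show() in a notebook
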
vector_store/faiss_index.index ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:25004d0d5df0be08b29e41af806fefc2215d37f215c08fdd5b8ce16484ee83fc
+ size 175149
vector_store/faiss_index.json ADDED
The diff for this file is too large to render.