from typing import Any, List, Tuple

import gradio as gr

from src.db.vector_store import VectorStore
from src.modelling.embed import DalaEmbedder
from src.modelling.topic_model import TopicModeller
from src.modelling.transliterate import DalaTransliterator
from src.utils.data_utils import (
    extract_text_with_pdfplumber,
    extract_text_with_ocr,
    chunk_text,
    deduplicate_chunks,
    repair_extracted_text
)
# Instantiate components
translit = DalaTransliterator()
embedder = DalaEmbedder()
vector_db = VectorStore()
topic_modeller = TopicModeller()
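# Note: these instances are module-level and shared across all Gradio callbacks,
# so each processed file replaces the previous vector store contents (see process_file)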
def extract_text(file: Any) -> str:
"""
Try multiple PDF extraction strategies, with fallback to OCR if necessary.
"""
if file.name.endswith(".pdf"):
text = extract_text_with_pdfplumber(file)
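        # If pdfplumber recovered a non-trivial amount of text (the > 100 character
        # threshold below), trust it; otherwise the PDF is likely scanned, so use OCR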
if len(text.strip()) > 100:
return repair_extracted_text(text)
print("[INFO] Falling back to OCR...")
return extract_text_with_ocr(file)
elif file.name.endswith(".txt"):
return repair_extracted_text(file.read().decode("utf-8", errors = "ignore"))
return ""
def process_file(file: Any) -> Tuple[List[Tuple[str, int]], Any, Any]:
"""
Main file processing function, which will also chunk, transliterate and cluster
the file contents, as well as plot the clusters.
"""
raw_text = extract_text(file)
chunks = chunk_text(raw_text)
    # Transliterate the chunks, drop near-duplicate passages, then embed what remains
translits = translit.batch_transliterate(chunks)
dedup_translits = deduplicate_chunks(translits, embedder)
embeddings = embedder.embed_batch(dedup_translits)
    # Clear previous entries so search results reflect only the current file
vector_db.index.reset()
vector_db.metadata = []
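    # Tag each chunk with a traceable ID of the form <filename>_chunk<index>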
metadata = [{"id": f"{file.name}_chunk{i}", "text": t} for i, t in enumerate(dedup_translits)]
vector_db.add(embeddings, metadata)
    # Topic modelling: fit() returns per-chunk topic IDs, a top-topics chart,
    # an ID -> label mapping, and a UMAP projection figure
topics, fig, topic_labels, umap_fig = topic_modeller.fit(dedup_translits, embeddings)
# Get a list of rows for topic labels
overview_table = [[k, v] for k, v in topic_labels.items()]
# Zip back transliterated text with topic IDs
annotated = list(zip(dedup_translits, topics))
return annotated, fig, overview_table, umap_fig
def search_text(query: str) -> str:
"""
Search for a given query in the vector DB.
"""
query_emb = embedder.embed_text(query)
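    # Retrieve the 5 nearest chunks by embedding similarity and format them for display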
results = vector_db.search(query_emb, top_k = 5)
return "\n\n".join(f"[{r['id']}]: {r['text']}" for r in results)
# Gradio UI
with gr.Blocks() as demo:
title_html = gr.HTML("<center><h1>🇰🇿 SemanticDala</h1><h2>Қазақтың семантикалық платформасы</h2><h3>Kazakh Semantic Platform</h3></center>")
with gr.Tab("📁 Жүктеп салу және өңдеу / Upload and Process"):
with gr.Row():
file_input = gr.File(label = "PDF немесе TXT жүктеңіз / Upload PDF or TXT", file_types = [".pdf", ".txt"])
            process_btn = gr.Button("Файлды өңдеу / Process File", scale = 1)
translit_output = gr.Dataframe(
headers = ["Мәтін / Text", "Тақырып идентификаторы / Topic ID"],
label = "Транслитерацияланған үзінділер + Тақырыптар / Transliterated Chunks + Topics"
)
topic_label_table = gr.Dataframe(
headers = ["Тақырып идентификаторы / Topic ID", "Белгі / Label"],
label = "Тақырып белгілері / Topic Labels"
)
with gr.Row(equal_height = True):
with gr.Column(scale = 1):
plot_output = gr.Plot(label = "Негізгі тақырыптар / Top Topics")
with gr.Column(scale = 1):
umap_output = gr.Plot(label = "UMAP проекциясы / UMAP Topic Projection")
with gr.Tab("🔍 Семантикалық іздеу / Semantic Search"):
with gr.Row():
search_box = gr.Textbox(label = "Сұрау / Query", placeholder = "мысалы / e.g., Qazaqstan tarihy", lines = 1, scale = 5)
search_btn = gr.Button("Іздеу / Search", scale = 1)
search_results = gr.Textbox(label = "Нәтижелер / Top Results", lines = 6, interactive = False)
# Bind callbacks
process_btn.click(
fn = process_file,
inputs = file_input,
outputs = [translit_output, plot_output, topic_label_table, umap_output]
)
search_btn.click(fn = search_text, inputs = search_box, outputs = search_results)
# Launch
if __name__ == "__main__":
demo.launch()