import os import csv import time import hashlib import datetime import gradio as gr from src.db.vector_store import VectorStore from src.modelling.embed import DalaEmbedder from src.modelling.topic_model import TopicModeller from src.modelling.transliterate import DalaTransliterator from src.utils.data_utils import ( extract_text_with_pdfplumber, extract_text_with_ocr, chunk_text, deduplicate_chunks, repair_extracted_text ) from typing import Any, List, Tuple # Instantiate components translit = DalaTransliterator() embedder = DalaEmbedder() vector_db = VectorStore() topic_modeller = TopicModeller() def extract_text(file: Any) -> str: """ Try multiple PDF extraction strategies, with fallback to OCR if necessary. """ if file.name.endswith(".pdf"): text = extract_text_with_pdfplumber(file) if len(text.strip()) > 100: return repair_extracted_text(text) print("[INFO] Falling back to OCR...") return extract_text_with_ocr(file) elif file.name.endswith(".txt"): return repair_extracted_text(file.read().decode("utf-8", errors = "ignore")) return "" def process_file(file: Any) -> Tuple[List[Tuple[str, int]], Any, Any]: """ Main file processing function, which will also chunk, transliterate and cluster the file contents, as well as plot the clusters. """ raw_text = extract_text(file) chunks = chunk_text(raw_text) # Deduplicate and embed embedding translits = translit.batch_transliterate(chunks) dedup_translits = deduplicate_chunks(translits, embedder) embeddings = embedder.embed_batch(dedup_translits) # Clear previous entries before adding vector_db.index.reset() vector_db.metadata = [] metadata = [{"id": f"{file.name}_chunk{i}", "text": t} for i, t in enumerate(dedup_translits)] vector_db.add(embeddings, metadata) # Topic modelling topics, fig, topic_labels, umap_fig = topic_modeller.fit(dedup_translits, embeddings) # Get a list of rows for topic labels overview_table = [[k, v] for k, v in topic_labels.items()] # Zip back transliterated text with topic IDs annotated = list(zip(dedup_translits, topics)) return annotated, fig, overview_table, umap_fig def search_text(query: str): """ Search for a given query in the vector DB. """ query_emb = embedder.embed_text(query) results = vector_db.search(query_emb, top_k = 5) return "\n\n".join(f"[{r['id']}]: {r['text']}" for r in results) # Gradio UI with gr.Blocks() as demo: title_html = gr.HTML("