Spaces:

crossroderick
/

semanticdala

Sleeping

App Files Files Community

semanticdala / app.py

crossroderick

Removed the logging code

2341eb2 about 1 month ago

raw

history blame contribute delete

4.71 kB

	import os
	import csv
	import time
	import hashlib
	import datetime
	import gradio as gr
	from src.db.vector_store import VectorStore
	from src.modelling.embed import DalaEmbedder
	from src.modelling.topic_model import TopicModeller
	from src.modelling.transliterate import DalaTransliterator
	from src.utils.data_utils import (
	extract_text_with_pdfplumber,
	extract_text_with_ocr,
	chunk_text,
	deduplicate_chunks,
	repair_extracted_text
	)

	from typing import Any, List, Tuple


	# Instantiate components
	# Module-level singletons, created once at import time and shared by the
	# handlers defined below (process_file and search_text).
	translit = DalaTransliterator()  # used to batch-transliterate text chunks
	embedder = DalaEmbedder()  # used to embed chunk batches and search queries
	vector_db = VectorStore()  # reset and refilled on each processed file; queried by search
	topic_modeller = TopicModeller()  # fitted on the deduplicated chunks and their embeddings



	def extract_text(file: Any) -> str:
	    """
	    Extract plain text from an uploaded PDF or TXT file.

	    PDFs are first read with ``extract_text_with_pdfplumber``; if that yields
	    more than 100 non-whitespace-trimmed characters the text is passed through
	    ``repair_extracted_text`` and returned, otherwise the file is handed to
	    ``extract_text_with_ocr``. TXT files are read, decoded as UTF-8 (undecodable
	    bytes are dropped) and passed through ``repair_extracted_text``.

	    Args:
	        file: Uploaded file object exposing ``.name``; ``None`` when nothing
	            was uploaded.

	    Returns:
	        The extracted text, or an empty string when there is no file or the
	        extension is neither ``.pdf`` nor ``.txt``.
	    """
	    # Nothing uploaded: there is nothing to extract
	    if file is None:
	        return ""

	    # Compare extensions case-insensitively so "REPORT.PDF" is not silently ignored
	    lowered_name = file.name.lower()

	    if lowered_name.endswith(".pdf"):
	        text = extract_text_with_pdfplumber(file)

	        if len(text.strip()) > 100:
	            return repair_extracted_text(text)

	        # Too little embedded text (e.g. a scanned document): use OCR instead
	        print("[INFO] Falling back to OCR...")

	        return extract_text_with_ocr(file)

	    if lowered_name.endswith(".txt"):
	        # Prefer the object's own read(); otherwise open the path held in
	        # `file.name` (assumes it is a readable filesystem path - TODO confirm
	        # against the Gradio version in use)
	        if hasattr(file, "read"):
	            payload = file.read()
	        else:
	            with open(file.name, "rb") as handle:
	                payload = handle.read()

	        # A text-mode handle already returns str; only bytes need decoding
	        if isinstance(payload, bytes):
	            payload = payload.decode("utf-8", errors = "ignore")

	        return repair_extracted_text(payload)

	    return ""


	def process_file(file: Any) -> Tuple[List[Tuple[str, int]], Any, List[List[Any]], Any]:
	    """
	    Main file processing function: extracts the text, chunks and transliterates
	    it, deduplicates and embeds the chunks, refills the vector DB, and fits the
	    topic model.

	    Args:
	        file: Uploaded file object exposing ``.name``; ``None`` when the handler
	            is invoked without an upload.

	    Returns:
	        A 4-tuple ``(annotated, fig, overview_table, umap_fig)``:
	        ``annotated`` pairs each deduplicated transliterated chunk with its
	        topic ID, ``fig`` and ``umap_fig`` are the figures returned by
	        ``topic_modeller.fit``, and ``overview_table`` holds one
	        ``[topic_id, label]`` row per topic label. When there is no file or no
	        chunks, ``([], None, [], None)`` is returned and the vector DB is left
	        untouched.
	    """
	    # No upload: bail out before touching the file object
	    if file is None:
	        return [], None, [], None

	    raw_text = extract_text(file)
	    chunks = chunk_text(raw_text)

	    # Nothing to embed or cluster (empty or unsupported document)
	    if not chunks:
	        return [], None, [], None

	    # Transliterate, deduplicate and embed the chunks
	    translits = translit.batch_transliterate(chunks)
	    dedup_translits = deduplicate_chunks(translits, embedder)
	    embeddings = embedder.embed_batch(dedup_translits)

	    # Clear previous entries before adding
	    vector_db.index.reset()
	    vector_db.metadata = []

	    metadata = [{"id": f"{file.name}_chunk{i}", "text": t} for i, t in enumerate(dedup_translits)]

	    vector_db.add(embeddings, metadata)

	    # Topic modelling
	    topics, fig, topic_labels, umap_fig = topic_modeller.fit(dedup_translits, embeddings)

	    # Get a list of rows for topic labels
	    overview_table = [[topic_id, label] for topic_id, label in topic_labels.items()]

	    # Zip back transliterated text with topic IDs
	    annotated = list(zip(dedup_translits, topics))

	    return annotated, fig, overview_table, umap_fig


	def search_text(query: str) -> str:
	    """
	    Search for a given query in the vector DB.

	    Args:
	        query: Free-text query; ``None`` or a blank string yields no results.

	    Returns:
	        The top 5 hits formatted as ``[id]: text`` entries separated by blank
	        lines, or an empty string when the query is blank.
	    """
	    # A blank query carries no meaning to embed; skip the lookup entirely
	    if query is None or not query.strip():
	        return ""

	    query_emb = embedder.embed_text(query)
	    results = vector_db.search(query_emb, top_k = 5)

	    return "\n\n".join(f"[{r['id']}]: {r['text']}" for r in results)


	# Gradio UI: two tabs (upload/processing and semantic search) plus their callbacks
	with gr.Blocks() as demo:
	    title_html = gr.HTML(
	        "<center><h1>🇰🇿 SemanticDala</h1><h2>Қазақтың семантикалық платформасы</h2><h3>Kazakh Semantic Platform</h3></center>"
	    )

	    # Tab 1: upload a document and inspect chunks, topic labels and plots
	    with gr.Tab("📁 Жүктеп салу және өңдеу / Upload and Process"):
	        with gr.Row():
	            file_input = gr.File(
	                label="PDF немесе TXT жүктеңіз / Upload PDF or TXT",
	                file_types=[".pdf", ".txt"],
	            )
	            process_btn = gr.Button("Процесс файлы / Process File", scale=1)

	        translit_output = gr.Dataframe(
	            headers=["Мәтін / Text", "Тақырып идентификаторы / Topic ID"],
	            label="Транслитерацияланған үзінділер + Тақырыптар / Transliterated Chunks + Topics",
	        )

	        topic_label_table = gr.Dataframe(
	            headers=["Тақырып идентификаторы / Topic ID", "Белгі / Label"],
	            label="Тақырып белгілері / Topic Labels",
	        )

	        # Two equally sized columns, one plot each
	        with gr.Row(equal_height=True):
	            with gr.Column(scale=1):
	                plot_output = gr.Plot(label="Негізгі тақырыптар / Top Topics")

	            with gr.Column(scale=1):
	                umap_output = gr.Plot(label="UMAP проекциясы / UMAP Topic Projection")

	    # Tab 2: query the chunks stored by the last processed file
	    with gr.Tab("🔍 Семантикалық іздеу / Semantic Search"):
	        with gr.Row():
	            search_box = gr.Textbox(
	                label="Сұрау / Query",
	                placeholder="мысалы / e.g., Qazaqstan tarihy",
	                lines=1,
	                scale=5,
	            )
	            search_btn = gr.Button("Іздеу / Search", scale=1)

	        search_results = gr.Textbox(
	            label="Нәтижелер / Top Results",
	            lines=6,
	            interactive=False,
	        )

	    # Wire the buttons to their handlers; the outputs list follows the order of
	    # process_file's return value (annotated, fig, overview_table, umap_fig)
	    process_btn.click(
	        fn=process_file,
	        inputs=file_input,
	        outputs=[translit_output, plot_output, topic_label_table, umap_output],
	    )

	    search_btn.click(
	        fn=search_text,
	        inputs=search_box,
	        outputs=search_results,
	    )


	# Start the app only when this file is executed as a script
	if __name__ == "__main__":
	    demo.launch()