Spaces:

crossroderick
/

semanticdala

Sleeping

App Files Files Community

crossroderick committed on May 6

Commit

cee942f

1 Parent(s): 9d320f1

Basic anonymised logging utility to keep track of app usage

Browse files

Files changed (1) hide show

app.py +72 -31

app.py CHANGED Viewed

@@ -1,6 +1,11 @@
 import gradio as gr
-from src.modelling.embed import DalaEmbedder
 from src.db.vector_store import VectorStore
 from src.modelling.topic_model import TopicModeller
 from src.modelling.transliterate import DalaTransliterator
 from src.utils.data_utils import (
@@ -21,6 +26,42 @@ vector_db = VectorStore()
 topic_modeller = TopicModeller()
 def extract_text(file: Any) -> str:
     """
     Try multiple PDF extraction strategies, with fallback to OCR if necessary.
@@ -46,32 +87,43 @@ def process_file(file: Any) -> Tuple[List[Tuple[str, int]], Any, Any]:
     Main file processing function, which will also chunk, transliterate and cluster
     the file contents, as well as plot the clusters.
     """
-    raw_text = extract_text(file)
-    chunks = chunk_text(raw_text)
-    # Deduplicate and embed embedding
-    translits = translit.batch_transliterate(chunks)
-    dedup_translits = deduplicate_chunks(translits, embedder)
-    embeddings = embedder.embed_batch(dedup_translits)
-    # Clear previous entries before adding
-    vector_db.index.reset()
-    vector_db.metadata = []
-    metadata = [{"id": f"{file.name}_chunk{i}", "text": t} for i, t in enumerate(dedup_translits)]
-    vector_db.add(embeddings, metadata)
-    # Topic modelling
-    topics, fig, topic_labels, umap_fig = topic_modeller.fit(translits, embeddings)
-    # Get a list of rows for topic labels
-    overview_table = [[k, v] for k, v in topic_labels.items()]
-    # Zip back transliterated text with topic IDs
-    annotated = list(zip(translits, topics))
-    return annotated, fig, overview_table, umap_fig
 def search_text(query: str):
@@ -84,19 +136,8 @@ def search_text(query: str):
     return "\n\n".join(f"[{r['id']}]: {r['text']}" for r in results)
-# Custom CSS
-page_css = """
-p {
-    font-size: 18px;
-}
-.lang_btn {
-    width: 5%;
-}
-"""
 # Gradio UI
-with gr.Blocks(css = page_css) as demo:
     title_html = gr.HTML("<center><h1>🇰🇿 SemanticDala</h1><h2>Қазақтың семантикалық платформасы</h2><h3>Kazakh Semantic Platform</h3></center>")
     with gr.Tab("📁 Жүктеп салу және өңдеу / Upload and Process"):

+import os
+import csv
+import time
+import hashlib
 import gradio as gr
+from datetime import datetime
 from src.db.vector_store import VectorStore
+from src.modelling.embed import DalaEmbedder
 from src.modelling.topic_model import TopicModeller
 from src.modelling.transliterate import DalaTransliterator
 from src.utils.data_utils import (
 topic_modeller = TopicModeller()
def log_submission(filename: str, num_chunks: int, start_time: float, status: str, session_id: str = "anonymous") -> None:
    """
    Append one anonymised usage record to the CSV usage log.

    The record is written to "semanticdala_log.csv" in the current working
    directory. The file is created on first use, and a header row is written
    whenever the log is empty.

    Args:
        filename: Path of the processed file. Only the first 10 hex characters
            of its SHA-256 digest are stored, never the name itself.
        num_chunks: Number of chunks reported for this submission.
        start_time: Start of processing as a `time.time()` value; the elapsed
            time is measured from it to the moment of this call.
        status: Outcome label stored verbatim (e.g. "success" or an error text).
        session_id: Session label stored verbatim; defaults to "anonymous".

    Raises:
        OSError: If the log file cannot be opened or written.
    """
    # Local import: only `datetime` itself is imported at module level
    from datetime import timezone

    log_file = "semanticdala_log.csv"
    duration = round(time.time() - start_time, 2)

    # Anonymise filename for privacy
    anonymized_name = hashlib.sha256(filename.encode()).hexdigest()[:10]

    # Get file size in bytes; a missing or unreadable file is logged as 0
    # (asking directly avoids the exists-then-stat race)
    try:
        file_size = os.path.getsize(filename)
    except (OSError, ValueError):
        file_size = 0
    file_size_mb = round(file_size / (1024 * 1024), 2)

    # `datetime.utcnow()` is deprecated; dropping tzinfo keeps the exact same
    # naive-UTC ISO format as rows already present in the log
    timestamp = datetime.now(timezone.utc).replace(tzinfo = None).isoformat()

    log_entry = {
        "timestamp": timestamp,
        "filename_hash": anonymized_name,
        "file_size_mb": file_size_mb,
        "num_chunks": num_chunks,
        "processing_time_sec": duration,
        "status": status,
        "session_id": session_id
    }

    # Explicit UTF-8: `status` may carry non-ASCII error text, which would fail
    # to encode under a non-UTF-8 locale default
    with open(log_file, mode = 'a', newline = "", encoding = "utf-8") as f:
        writer = csv.DictWriter(f, fieldnames = list(log_entry))

        # Append mode starts at the end of the file, so position 0 means the
        # log is new or empty and still needs its header
        if f.tell() == 0:
            writer.writeheader()

        writer.writerow(log_entry)
 def extract_text(file: Any) -> str:
     """
     Try multiple PDF extraction strategies, with fallback to OCR if necessary.
     Main file processing function, which will also chunk, transliterate and cluster
     the file contents, as well as plot the clusters.
     """
+    start = time.time()
+    try:
+        raw_text = extract_text(file)
+        chunks = chunk_text(raw_text)
+        # Deduplicate and embed embedding
+        translits = translit.batch_transliterate(chunks)
+        dedup_translits = deduplicate_chunks(translits, embedder)
+        embeddings = embedder.embed_batch(dedup_translits)
+        # Clear previous entries before adding
+        vector_db.index.reset()
+        vector_db.metadata = []
+        metadata = [{"id": f"{file.name}_chunk{i}", "text": t} for i, t in enumerate(dedup_translits)]
+        vector_db.add(embeddings, metadata)
+        # Topic modelling
+        topics, fig, topic_labels, umap_fig = topic_modeller.fit(translits, embeddings)
+        # Get a list of rows for topic labels
+        overview_table = [[k, v] for k, v in topic_labels.items()]
+        # Zip back transliterated text with topic IDs
+        annotated = list(zip(translits, topics))
+        # Log success
+        log_submission(file.name, len(chunks), start, status = "success")
+        return annotated, fig, overview_table, umap_fig
+    except Exception as e:
+        log_submission(file.name, 0, start, status = f"error: {str(e)}")
+        raise e
 def search_text(query: str):
     return "\n\n".join(f"[{r['id']}]: {r['text']}" for r in results)
 # Gradio UI
+with gr.Blocks() as demo:
     title_html = gr.HTML("<center><h1>🇰🇿 SemanticDala</h1><h2>Қазақтың семантикалық платформасы</h2><h3>Kazakh Semantic Platform</h3></center>")
     with gr.Tab("📁 Жүктеп салу және өңдеу / Upload and Process"):