crossroderick committed
Commit cee942f · 1 Parent(s): 9d320f1

Basic anonymised logging utility to keep track of app usage
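
For context, a minimal sketch of how the new log_submission helper could be exercised on its own; the filename, chunk count and call below are illustrative, not part of the commit:

import time

# Assumes log_submission is in scope (it is defined at module level in app.py)
start = time.time()
log_submission("example.pdf", num_chunks = 12, start_time = start, status = "success")

# Appends one row to semanticdala_log.csv, writing the header on first use:
# timestamp,filename_hash,file_size_mb,num_chunks,processing_time_sec,status,session_id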

Files changed (1): app.py +72 -31
app.py CHANGED
@@ -1,6 +1,11 @@
+import os
+import csv
+import time
+import hashlib
 import gradio as gr
-from src.modelling.embed import DalaEmbedder
+from datetime import datetime
 from src.db.vector_store import VectorStore
+from src.modelling.embed import DalaEmbedder
 from src.modelling.topic_model import TopicModeller
 from src.modelling.transliterate import DalaTransliterator
 from src.utils.data_utils import (
@@ -21,6 +26,42 @@ vector_db = VectorStore()
 topic_modeller = TopicModeller()
 
 
+def log_submission(filename: str, num_chunks: int, start_time: float, status: str, session_id: str = "anonymous") -> None:
+    """
+    Basic logging utility to keep track of app usage.
+    """
+    log_file = "semanticdala_log.csv"
+    end_time = time.time()
+    duration = round(end_time - start_time, 2)
+
+    # Anonymise filename for privacy
+    anonymized_name = hashlib.sha256(filename.encode()).hexdigest()[:10]
+
+    # Get file size in bytes
+    file_size = os.path.getsize(filename) if os.path.exists(filename) else 0
+    file_size_mb = round(file_size / (1024 * 1024), 2)
+
+    log_entry = {
+        "timestamp": datetime.utcnow().isoformat(),
+        "filename_hash": anonymized_name,
+        "file_size_mb": file_size_mb,
+        "num_chunks": num_chunks,
+        "processing_time_sec": duration,
+        "status": status,
+        "session_id": session_id
+    }
+
+    file_exists = os.path.isfile(log_file)
+
+    with open(log_file, mode = 'a', newline = "") as f:
+        writer = csv.DictWriter(f, fieldnames = log_entry.keys())
+
+        if not file_exists:
+            writer.writeheader()
+
+        writer.writerow(log_entry)
+
+
 def extract_text(file: Any) -> str:
     """
     Try multiple PDF extraction strategies, with fallback to OCR if necessary.
@@ -46,32 +87,43 @@ def process_file(file: Any) -> Tuple[List[Tuple[str, int]], Any, Any]:
     Main file processing function, which will also chunk, transliterate and cluster
     the file contents, as well as plot the clusters.
     """
-    raw_text = extract_text(file)
-    chunks = chunk_text(raw_text)
+    start = time.time()
+
+    try:
+        raw_text = extract_text(file)
+        chunks = chunk_text(raw_text)
 
-    # Deduplicate and embed embedding
-    translits = translit.batch_transliterate(chunks)
-    dedup_translits = deduplicate_chunks(translits, embedder)
-    embeddings = embedder.embed_batch(dedup_translits)
+        # Deduplicate and embed embedding
+        translits = translit.batch_transliterate(chunks)
+        dedup_translits = deduplicate_chunks(translits, embedder)
+        embeddings = embedder.embed_batch(dedup_translits)
 
-    # Clear previous entries before adding
-    vector_db.index.reset()
-    vector_db.metadata = []
+        # Clear previous entries before adding
+        vector_db.index.reset()
+        vector_db.metadata = []
 
-    metadata = [{"id": f"{file.name}_chunk{i}", "text": t} for i, t in enumerate(dedup_translits)]
+        metadata = [{"id": f"{file.name}_chunk{i}", "text": t} for i, t in enumerate(dedup_translits)]
 
-    vector_db.add(embeddings, metadata)
+        vector_db.add(embeddings, metadata)
 
-    # Topic modelling
-    topics, fig, topic_labels, umap_fig = topic_modeller.fit(translits, embeddings)
+        # Topic modelling
+        topics, fig, topic_labels, umap_fig = topic_modeller.fit(translits, embeddings)
+
+        # Get a list of rows for topic labels
+        overview_table = [[k, v] for k, v in topic_labels.items()]
+
+        # Zip back transliterated text with topic IDs
+        annotated = list(zip(translits, topics))
 
-    # Get a list of rows for topic labels
-    overview_table = [[k, v] for k, v in topic_labels.items()]
+        # Log success
+        log_submission(file.name, len(chunks), start, status = "success")
+
+        return annotated, fig, overview_table, umap_fig
 
-    # Zip back transliterated text with topic IDs
-    annotated = list(zip(translits, topics))
+    except Exception as e:
+        log_submission(file.name, 0, start, status = f"error: {str(e)}")
 
-    return annotated, fig, overview_table, umap_fig
+        raise e
 
 
 def search_text(query: str):
@@ -84,19 +136,8 @@ def search_text(query: str):
     return "\n\n".join(f"[{r['id']}]: {r['text']}" for r in results)
 
 
-# Custom CSS
-page_css = """
-p {
-    font-size: 18px;
-}
-
-.lang_btn {
-    width: 5%;
-}
-"""
-
 # Gradio UI
-with gr.Blocks(css = page_css) as demo:
+with gr.Blocks() as demo:
     title_html = gr.HTML("<center><h1>🇰🇿 SemanticDala</h1><h2>Қазақтың семантикалық платформасы</h2><h3>Kazakh Semantic Platform</h3></center>")
 
     with gr.Tab("📁 Жүктеп салу және өңдеу / Upload and Process"):
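
Note on the anonymisation above: only the first 10 hex characters of the SHA-256 digest of the filename are kept, so repeat uploads of the same file map to the same short tag while the name itself is never stored. A quick illustrative check (not part of the commit):

import hashlib

tag = hashlib.sha256("example.pdf".encode()).hexdigest()[:10]
print(tag)  # stable 10-character hex tag for this filename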