crossroderick committed
Commit 2341eb2 · 1 Parent(s): 9d99321

Removed the logging code

Files changed (1):
  1. app.py (+18 -84)
app.py CHANGED
@@ -26,59 +26,6 @@ vector_db = VectorStore()
 topic_modeller = TopicModeller()
 
 
-def print_recent_logs(n: int = 5):
-    """
-    Print the last N log lines to the container logs for developer monitoring.
-    """
-    log_file = "semanticdala_log.csv"
-
-    if os.path.exists(log_file):
-        print(f"\n[SEMANTICDALA USAGE LOG - Last {n} Entries]")
-
-        with open(log_file, "r") as f:
-            lines = f.readlines()
-
-        for line in lines[-n:]:
-            print(line.strip())
-
-        print("[END LOG SNAPSHOT]\n")
-
-
-def log_submission(filename: str, num_chunks: int, start_time: float, status: str, session_id: str = "anonymous") -> None:
-    """
-    Basic logging utility to keep track of app usage.
-    """
-    log_file = "semanticdala_log.csv"
-    end_time = time.time()
-    duration = round(end_time - start_time, 2)
-
-    # Anonymise filename for privacy
-    anonymized_name = hashlib.sha256(filename.encode()).hexdigest()[:10]
-
-    # Get file size in bytes
-    file_size = os.path.getsize(filename) if os.path.exists(filename) else 0
-    file_size_mb = round(file_size / (1024 * 1024), 2)
-
-    log_entry = {
-        "timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
-        "filename_hash": anonymized_name,
-        "file_size_mb": file_size_mb,
-        "num_chunks": num_chunks,
-        "processing_time_sec": duration,
-        "status": status,
-        "session_id": session_id
-    }
-
-    file_exists = os.path.isfile(log_file)
-
-    with open(log_file, mode = 'a', newline = "") as f:
-        writer = csv.DictWriter(f, fieldnames = log_entry.keys())
-
-        if not file_exists:
-            writer.writeheader()
-
-        writer.writerow(log_entry)
-
 
 def extract_text(file: Any) -> str:
     """
@@ -105,45 +52,32 @@ def process_file(file: Any) -> Tuple[List[Tuple[str, int]], Any, Any]:
     Main file processing function, which will also chunk, transliterate and cluster
     the file contents, as well as plot the clusters.
     """
-    start = time.time()
-
-    try:
-        raw_text = extract_text(file)
-        chunks = chunk_text(raw_text)
-
-        # Deduplicate and embed
-        translits = translit.batch_transliterate(chunks)
-        dedup_translits = deduplicate_chunks(translits, embedder)
-        embeddings = embedder.embed_batch(dedup_translits)
-
-        # Clear previous entries before adding
-        vector_db.index.reset()
-        vector_db.metadata = []
-
-        metadata = [{"id": f"{file.name}_chunk{i}", "text": t} for i, t in enumerate(dedup_translits)]
-
-        vector_db.add(embeddings, metadata)
-
-        # Topic modelling
-        topics, fig, topic_labels, umap_fig = topic_modeller.fit(dedup_translits, embeddings)
-
-        # Get a list of rows for topic labels
-        overview_table = [[k, v] for k, v in topic_labels.items()]
-
-        # Zip back transliterated text with topic IDs
-        annotated = list(zip(dedup_translits, topics))
-
-        # Log success
-        log_submission(file.name, len(chunks), start, status = "success")
-        print_recent_logs()
-
-        return annotated, fig, overview_table, umap_fig
-
-    except Exception as e:
-        log_submission(file.name, 0, start, status = f"error: {str(e)}")
-        print_recent_logs()
-
-        raise e
+    raw_text = extract_text(file)
+    chunks = chunk_text(raw_text)
+
+    # Deduplicate and embed
+    translits = translit.batch_transliterate(chunks)
+    dedup_translits = deduplicate_chunks(translits, embedder)
+    embeddings = embedder.embed_batch(dedup_translits)
+
+    # Clear previous entries before adding
+    vector_db.index.reset()
+    vector_db.metadata = []
+
+    metadata = [{"id": f"{file.name}_chunk{i}", "text": t} for i, t in enumerate(dedup_translits)]
+
+    vector_db.add(embeddings, metadata)
+
+    # Topic modelling
+    topics, fig, topic_labels, umap_fig = topic_modeller.fit(dedup_translits, embeddings)
+
+    # Get a list of rows for topic labels
+    overview_table = [[k, v] for k, v in topic_labels.items()]
+
+    # Zip back transliterated text with topic IDs
+    annotated = list(zip(dedup_translits, topics))
+
+    return annotated, fig, overview_table, umap_fig
 
 
 def search_text(query: str):
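
Should the usage logging ever be wanted back, a minimal sketch of the same behaviour as an opt-in wrapper is below, so process_file stays clean. The CSV path, field names, and filename hashing are taken from the deleted log_submission(); the log_usage decorator, the _write_row helper, and the SEMANTICDALA_USAGE_LOG environment flag are assumptions for illustration, not part of this app.

import csv
import datetime
import functools
import hashlib
import os
import time

LOG_FILE = "semanticdala_log.csv"   # same path the removed code used


def log_usage(func):
    """
    Hypothetical opt-in replacement for the removed CSV usage logging.
    Active only when the SEMANTICDALA_USAGE_LOG env var is set to "1".
    """
    @functools.wraps(func)
    def wrapper(file, *args, **kwargs):
        if os.environ.get("SEMANTICDALA_USAGE_LOG") != "1":
            return func(file, *args, **kwargs)

        start = time.time()

        try:
            result = func(file, *args, **kwargs)
            _write_row(file.name, start, status = "success")

            return result
        except Exception as e:
            _write_row(file.name, start, status = f"error: {e}")

            raise

    return wrapper


def _write_row(filename: str, start_time: float, status: str) -> None:
    # Same schema as the deleted log_submission(), minus num_chunks and
    # session_id, which a wrapper cannot observe from outside the function
    row = {
        "timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
        "filename_hash": hashlib.sha256(filename.encode()).hexdigest()[:10],
        "file_size_mb": round(os.path.getsize(filename) / (1024 * 1024), 2) if os.path.exists(filename) else 0,
        "processing_time_sec": round(time.time() - start_time, 2),
        "status": status
    }

    file_exists = os.path.isfile(LOG_FILE)

    with open(LOG_FILE, mode = "a", newline = "") as f:
        writer = csv.DictWriter(f, fieldnames = row.keys())

        if not file_exists:
            writer.writeheader()

        writer.writerow(row)

With this in place, process_file could be re-instrumented as process_file = log_usage(process_file), without reintroducing any logging calls into its body.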