Commit 2341eb2
1 Parent(s): 9d99321
Removed the logging code
app.py CHANGED
```diff
@@ -26,59 +26,6 @@ vector_db = VectorStore()
 topic_modeller = TopicModeller()
 
 
-def print_recent_logs(n: int = 5):
-    """
-    Print the last N log lines to the container logs for developer monitoring.
-    """
-    log_file = "semanticdala_log.csv"
-
-    if os.path.exists(log_file):
-        print(f"\n[SEMANTICDALA USAGE LOG - Last {n} Entries]")
-
-        with open(log_file, "r") as f:
-            lines = f.readlines()
-
-        for line in lines[-n:]:
-            print(line.strip())
-
-        print("[END LOG SNAPSHOT]\n")
-
-
-def log_submission(filename: str, num_chunks: int, start_time: float, status: str, session_id: str = "anonymous") -> None:
-    """
-    Basic logging utility to keep track of app usage.
-    """
-    log_file = "semanticdala_log.csv"
-    end_time = time.time()
-    duration = round(end_time - start_time, 2)
-
-    # Anonymise filename for privacy
-    anonymized_name = hashlib.sha256(filename.encode()).hexdigest()[:10]
-
-    # Get file size in bytes
-    file_size = os.path.getsize(filename) if os.path.exists(filename) else 0
-    file_size_mb = round(file_size / (1024 * 1024), 2)
-
-    log_entry = {
-        "timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
-        "filename_hash": anonymized_name,
-        "file_size_mb": file_size_mb,
-        "num_chunks": num_chunks,
-        "processing_time_sec": duration,
-        "status": status,
-        "session_id": session_id
-    }
-
-    file_exists = os.path.isfile(log_file)
-
-    with open(log_file, mode = 'a', newline = "") as f:
-        writer = csv.DictWriter(f, fieldnames = log_entry.keys())
-
-        if not file_exists:
-            writer.writeheader()
-
-        writer.writerow(log_entry)
-
 
 
 def extract_text(file: Any) -> str:
     """
```
```diff
@@ -105,45 +52,32 @@ def process_file(file: Any) -> Tuple[List[Tuple[str, int]], Any, Any]:
     Main file processing function, which will also chunk, transliterate and cluster
     the file contents, as well as plot the clusters.
     """
-
+    raw_text = extract_text(file)
+    chunks = chunk_text(raw_text)
 
-
-
-
+    # Deduplicate and embed
+    translits = translit.batch_transliterate(chunks)
+    dedup_translits = deduplicate_chunks(translits, embedder)
+    embeddings = embedder.embed_batch(dedup_translits)
 
-
-
-
-    embeddings = embedder.embed_batch(dedup_translits)
+    # Clear previous entries before adding
+    vector_db.index.reset()
+    vector_db.metadata = []
 
-
-    vector_db.index.reset()
-    vector_db.metadata = []
+    metadata = [{"id": f"{file.name}_chunk{i}", "text": t} for i, t in enumerate(dedup_translits)]
 
-
-
-    vector_db.add(embeddings, metadata)
-
-    # Topic modelling
-    topics, fig, topic_labels, umap_fig = topic_modeller.fit(dedup_translits, embeddings)
-
-    # Get a list of rows for topic labels
-    overview_table = [[k, v] for k, v in topic_labels.items()]
-
-    # Zip back transliterated text with topic IDs
-    annotated = list(zip(dedup_translits, topics))
+    vector_db.add(embeddings, metadata)
 
-
-
-    print_recent_logs()
+    # Topic modelling
+    topics, fig, topic_labels, umap_fig = topic_modeller.fit(dedup_translits, embeddings)
 
-
+    # Get a list of rows for topic labels
+    overview_table = [[k, v] for k, v in topic_labels.items()]
 
-
-
-    print_recent_logs()
+    # Zip back transliterated text with topic IDs
+    annotated = list(zip(dedup_translits, topics))
 
-
+    return annotated, fig, overview_table, umap_fig
 
 
 def search_text(query: str):
```
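The rewritten process_file() relies on deduplicate_chunks(translits, embedder), a helper whose implementation is not part of this diff. Below is a minimal sketch of one plausible version, assuming embed_batch() returns one vector per input string (consistent with its use in app.py) and treating chunks whose cosine similarity exceeds a threshold as duplicates; the greedy loop and the 0.95 cut-off are illustrative assumptions:

```python
import numpy as np

def deduplicate_chunks(chunks: list[str], embedder, threshold: float = 0.95) -> list[str]:
    """Hypothetical greedy near-duplicate removal: keep a chunk only if it
    is not too similar (cosine) to any chunk already kept."""
    if not chunks:
        return []

    vectors = np.asarray(embedder.embed_batch(chunks), dtype=np.float32)
    # Unit-normalise so a plain dot product equals cosine similarity
    vectors = vectors / (np.linalg.norm(vectors, axis=1, keepdims=True) + 1e-12)

    kept: list[str] = []
    kept_vectors: list[np.ndarray] = []
    for chunk, vector in zip(chunks, vectors):
        if all(float(vector @ kept_vec) < threshold for kept_vec in kept_vectors):
            kept.append(chunk)
            kept_vectors.append(vector)
    return kept
```

With this shape, embeddings are computed once inside the helper and again by the embed_batch() call that follows it in process_file(); returning the kept vectors alongside the kept chunks would avoid the duplicate work.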
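process_file() clears state by reaching into vector_db.index.reset() and reassigning vector_db.metadata before calling vector_db.add(embeddings, metadata). A sketch of a store shape consistent with those call sites, assuming a FAISS flat index; the class body, the dimension, and the inner-product metric are assumptions (only the attribute and method names come from app.py):

```python
import faiss
import numpy as np

class VectorStore:
    """Minimal FAISS-backed store matching the attributes used in app.py:
    an index with reset(), a metadata list, and an add() method."""
    def __init__(self, dim: int = 384):
        self.index = faiss.IndexFlatIP(dim)  # dimension and metric are assumptions
        self.metadata: list[dict] = []

    def add(self, embeddings, metadata: list[dict]) -> None:
        vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
        self.index.add(vectors)
        self.metadata.extend(metadata)

    def clear(self) -> None:
        # Equivalent to the inline two-step reset in process_file()
        self.index.reset()
        self.metadata = []
```

Wrapping the two-step reset in a single clear() method would keep the invariant (index size == len(metadata)) in one place instead of at every call site.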