Molbap committed
Commit 061a198 · 1 parent: 4fa1ace

make things persist

Files changed (2):
  1. app.py (+10 −1)
  2. modular_graph_and_candidates.py (+55 −19)
app.py CHANGED

@@ -52,7 +52,16 @@ def _escape_srcdoc(text: str) -> str:
 
 
 def run(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
-    repo_path = clone_or_cache(repo_url)
+    # Check if we can use cached embeddings for embedding similarity
+    embeddings_cache = Path("embeddings_cache.npz")
+
+    if sim_method == "embedding" and embeddings_cache.exists():
+        print("🚀 Using cached embeddings - skipping repo download")
+        # Use a dummy path since we won't need the actual repo
+        repo_path = Path("/tmp/dummy")
+    else:
+        print("📥 Downloading/updating repository")
+        repo_path = clone_or_cache(repo_url)
 
     graph = build_graph_json(
         transformers_dir=repo_path,
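
The hunk above relies on two things: `Path` from `pathlib` being importable in app.py, and the placeholder `/tmp/dummy` never being dereferenced downstream once the cache is hit. A minimal sketch of the same gate as a standalone helper, with an optional staleness check the commit does not include (the helper name and `max_age_hours` parameter are hypothetical):

# Sketch only: the commit's cache gate factored into a helper, plus a
# hypothetical staleness check. The cache filename matches the commit.
import time
from pathlib import Path

def can_skip_clone(sim_method: str,
                   cache: Path = Path("embeddings_cache.npz"),
                   max_age_hours: float | None = None) -> bool:
    """True when cached embeddings make the repo checkout unnecessary."""
    if sim_method != "embedding" or not cache.exists():
        return False
    if max_age_hours is not None:
        age = (time.time() - cache.stat().st_mtime) / 3600.0
        if age > max_age_hours:
            return False  # cache too old: re-clone and re-encode instead
    return True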
modular_graph_and_candidates.py CHANGED
@@ -130,22 +130,22 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
     print(f"Encoding embeddings for {len(names)} models...")
     batch_size = 4  # keep your default
 
-    # ── checkpoint / resume ────────────────────────────────────────────────────
-    ckpt_path = models_root / "__emb_ckpt.npz"
+    # ── persistent embeddings storage ────────────────────────────────────────────
+    embeddings_path = Path("embeddings_cache.npz")
     start_idx = 0
     emb_dim = getattr(model, "get_sentence_embedding_dimension", lambda: 768)()
 
-    if ckpt_path.exists():
+    if embeddings_path.exists():
         try:
-            ckpt = np.load(ckpt_path, allow_pickle=True)
-            ckpt_names = list(ckpt["names"])
-            if names[:len(ckpt_names)] == ckpt_names:
-                loaded = ckpt["embeddings"].astype(np.float32)
+            cached = np.load(embeddings_path, allow_pickle=True)
+            cached_names = list(cached["names"])
+            if names[:len(cached_names)] == cached_names:
+                loaded = cached["embeddings"].astype(np.float32)
                 all_embeddings.append(loaded)
-                start_idx = len(ckpt_names)
-                print(f"Resuming from checkpoint at {start_idx}/{len(names)}")
+                start_idx = len(cached_names)
+                print(f"📦 Using cached embeddings for {start_idx}/{len(names)} models")
         except Exception as e:
-            print(f"⚠️ Failed to load checkpoint: {type(e).__name__}: {e}")
+            print(f"⚠️ Failed to load cached embeddings: {type(e).__name__}: {e}")
     # ───────────────────────────────────────────────────────────────────────────
 
     for i in tqdm(range(start_idx, len(names), batch_size), desc="Batches", leave=False):
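
The resume logic in this hunk only trusts the cache when the stored name list is an exact prefix of the current one, so inserting, removing, or reordering a model invalidates everything from that point on. A small sketch for inspecting the cache offline, assuming the same `.npz` layout the hunk writes (the `inspect_cache` helper is illustrative, not part of the repo):

# Sketch: offline inspection of the embeddings cache written above.
# Assumes the commit's .npz layout: "embeddings" (N, dim) floats and
# "names" (N,) object array of model names.
import numpy as np

def inspect_cache(path="embeddings_cache.npz", current_names=None):
    data = np.load(path, allow_pickle=True)
    emb, names = data["embeddings"], list(data["names"])
    print(f"{len(names)} cached embeddings, dim={emb.shape[1]}")
    if current_names is not None:
        # The resume path only fires on an exact prefix match.
        print("prefix match:", current_names[:len(names)] == names)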
@@ -161,16 +161,16 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
 
         all_embeddings.append(emb)
 
-        # save checkpoint after each batch
+        # save to persistent cache after each batch
         try:
            cur = np.vstack(all_embeddings).astype(np.float32)
            np.savez(
-                ckpt_path,
+                embeddings_path,
                embeddings=cur,
                names=np.array(names[:i+len(batch_names)], dtype=object),
            )
        except Exception as e:
-            print(f"⚠️ Failed to write checkpoint: {type(e).__name__}: {e}")
+            print(f"⚠️ Failed to write embeddings cache: {type(e).__name__}: {e}")
 
        if (i - start_idx) % (3 * batch_size) == 0 and torch.cuda.is_available():
            torch.cuda.empty_cache()
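
One cost hidden in this hunk: `np.savez` rewrites the whole archive on every batch, so checkpoint I/O grows with the number of embeddings already encoded, roughly quadratic over a full run. Saving only every k-th batch bounds that cost while risking at most k−1 batches of re-encoding after a crash; a sketch of that trade-off (the `save_every` knob is hypothetical, not in the commit):

# Sketch: throttled checkpointing instead of saving after every batch.
import numpy as np

def maybe_save(path, all_embeddings, names_done, batch_idx, save_every=10):
    # Persist every `save_every`-th batch; callers should still save once
    # unconditionally after the loop completes so nothing is lost.
    if batch_idx % save_every != 0:
        return
    cur = np.vstack(all_embeddings).astype(np.float32)
    np.savez(path, embeddings=cur, names=np.array(names_done, dtype=object))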
@@ -193,14 +193,42 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
             if s >= thr:
                 out[(processed_names[i], processed_names[j])] = s
 
-    # best-effort cleanup
-    try:
-        ckpt_path.unlink()
-    except Exception:
-        pass
-
+    print(f"💾 Embeddings saved to {embeddings_path}")
     return out
 
+def compute_similarities_from_cache(threshold: float) -> Dict[Tuple[str, str], float]:
+    """Compute similarities from cached embeddings without reprocessing."""
+    embeddings_path = Path("embeddings_cache.npz")
+
+    if not embeddings_path.exists():
+        return {}
+
+    try:
+        cached = np.load(embeddings_path, allow_pickle=True)
+        embeddings = cached["embeddings"].astype(np.float32)
+        names = list(cached["names"])
+
+        # Normalize embeddings
+        norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
+        embeddings = embeddings / norms
+
+        # Compute similarities
+        sims_mat = embeddings @ embeddings.T
+
+        out = {}
+        for i in range(len(names)):
+            for j in range(i + 1, len(names)):
+                s = float(sims_mat[i, j])
+                if s >= threshold:
+                    out[(names[i], names[j])] = s
+
+        print(f"⚡ Computed {len(out)} similarities from cache (threshold: {threshold})")
+        return out
+
+    except Exception as e:
+        print(f"⚠️ Failed to compute from cache: {e}")
+        return {}
+
 
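The pair loop in `compute_similarities_from_cache` is O(n²) in pure Python; fine for a few hundred models, but the same pairs fall out of one vectorized pass. An equivalent NumPy formulation, for reference (the function name is mine, not the repo's):

# Sketch: vectorized equivalent of the pair loop in the new function.
import numpy as np

def pairs_above_threshold(embeddings, names, threshold):
    emb = embeddings / (np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12)
    sims = emb @ emb.T
    iu, ju = np.triu_indices(len(names), k=1)  # every i < j pair
    keep = sims[iu, ju] >= threshold
    return {(names[i], names[j]): float(sims[i, j])
            for i, j in zip(iu[keep], ju[keep])}
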
@@ -269,6 +297,14 @@ def compute_similarities(models_root: Path, missing: List[str], bags: Dict[str,
     if sim_method == "jaccard":
         return similarity_clusters({m: bags[m] for m in missing}, threshold)
     else:
+        # Try to use cached embeddings first
+        embeddings_path = Path("embeddings_cache.npz")
+        if embeddings_path.exists():
+            cached_sims = compute_similarities_from_cache(threshold)
+            if cached_sims:  # Cache exists and worked
+                return cached_sims
+
+        # Fallback to full computation
         return embedding_similarity_clusters(models_root, missing, threshold)
 
 def build_graph_json(
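
One behavioral caveat in this hunk: `compute_similarities_from_cache` takes only the threshold, so the cached path ignores `missing` and `models_root` and can return pairs for models outside the requested set. If exact parity with the uncached path matters, the result could be filtered at the call site, sketched below under the assumption that `missing` is in scope:

# Sketch: restrict cached pairs to the requested model set (hypothetical
# guard, not part of the commit).
wanted = set(missing)
cached_sims = {pair: s
               for pair, s in compute_similarities_from_cache(threshold).items()
               if pair[0] in wanted and pair[1] in wanted}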
 