Molbap (HF Staff) committed on
Commit 0b63d8e · 1 Parent(s): 214d223
Files changed (1):
  1. modular_graph_and_candidates.py (+69 -16)
modular_graph_and_candidates.py CHANGED
@@ -130,22 +130,35 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
     print(f"Encoding embeddings for {len(names)} models...")
     batch_size = 4  # keep your default
 
-    # ── persistent embeddings storage ────────────────────────────────────────────
-    embeddings_path = Path("embeddings_cache.npz")
+    # ── two-stage caching: temp (for resume) + permanent (for reuse) ─────────────
+    temp_cache_path = Path("temp_embeddings.npz")    # For resuming computation
+    final_cache_path = Path("embeddings_cache.npz")  # For permanent storage
     start_idx = 0
     emb_dim = getattr(model, "get_sentence_embedding_dimension", lambda: 768)()
 
-    if embeddings_path.exists():
+    # Try to load from permanent cache first
+    if final_cache_path.exists():
+        try:
+            cached = np.load(final_cache_path, allow_pickle=True)
+            cached_names = list(cached["names"])
+            if names == cached_names:  # Exact match - use final cache
+                print(f"✅ Using final embeddings cache ({len(cached_names)} models)")
+                return compute_similarities_from_cache(thr)
+        except Exception as e:
+            print(f"⚠️ Failed to load final cache: {e}")
+
+    # Try to resume from temp cache
+    if temp_cache_path.exists():
         try:
-            cached = np.load(embeddings_path, allow_pickle=True)
+            cached = np.load(temp_cache_path, allow_pickle=True)
             cached_names = list(cached["names"])
             if names[:len(cached_names)] == cached_names:
                 loaded = cached["embeddings"].astype(np.float32)
                 all_embeddings.append(loaded)
                 start_idx = len(cached_names)
-                print(f"📦 Using cached embeddings for {start_idx}/{len(names)} models")
+                print(f"🔄 Resuming from temp cache: {start_idx}/{len(names)} models")
         except Exception as e:
-            print(f"⚠️ Failed to load cached embeddings: {type(e).__name__}: {e}")
+            print(f"⚠️ Failed to load temp cache: {e}")
     # ───────────────────────────────────────────────────────────────────────────
 
     for i in tqdm(range(start_idx, len(names), batch_size), desc="Batches", leave=False):
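Note on the resume check above: the temp cache is reused only when the cached name list is an exact prefix of the current one, so a reordered or changed model list falls through to a full recompute. A minimal standalone sketch of that probe, with a hypothetical helper name (load_partial_embeddings is not part of this file):

from pathlib import Path
from typing import List, Tuple

import numpy as np

def load_partial_embeddings(cache_path: Path, names: List[str]) -> Tuple[List[np.ndarray], int]:
    """Return (embedding_chunks, start_idx): chunks already computed and where to resume."""
    chunks: List[np.ndarray] = []
    start_idx = 0
    if cache_path.exists():
        try:
            cached = np.load(cache_path, allow_pickle=True)
            cached_names = list(cached["names"])
            # Reuse only if the cache is an exact prefix of the current model list;
            # otherwise the model set changed underneath us and the rows don't line up.
            if names[:len(cached_names)] == cached_names:
                chunks.append(cached["embeddings"].astype(np.float32))
                start_idx = len(cached_names)
        except Exception:
            pass  # treat an unreadable cache as absent and recompute from scratch
    return chunks, start_idx

The prefix match is what makes per-batch checkpoints safe to resume: names[start_idx:] is exactly the work remaining.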
@@ -161,16 +174,16 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
 
         all_embeddings.append(emb)
 
-        # save to persistent cache after each batch
+        # save to temp cache after each batch (for resume)
         try:
             cur = np.vstack(all_embeddings).astype(np.float32)
             np.savez(
-                embeddings_path,
+                temp_cache_path,
                 embeddings=cur,
                 names=np.array(names[:i+len(batch_names)], dtype=object),
             )
         except Exception as e:
-            print(f"⚠️ Failed to write embeddings cache: {type(e).__name__}: {e}")
+            print(f"⚠️ Failed to write temp cache: {e}")
 
         if (i - start_idx) % (3 * batch_size) == 0 and torch.cuda.is_available():
            torch.cuda.empty_cache()
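One caveat worth flagging on this per-batch checkpoint: np.vstack plus a full np.savez rewrite runs on every batch, so total checkpoint I/O grows quadratically over the run, and an interrupt in the middle of the write can leave a truncated .npz behind. A common hardening (an alternative sketch, not what this commit does) is to write to a side file and atomically rename it over the checkpoint:

import os
from pathlib import Path

import numpy as np

def save_checkpoint_atomic(path: Path, embeddings: np.ndarray, names: np.ndarray) -> None:
    # Write to a sibling temp file first; passing a file object to np.savez
    # keeps the exact filename instead of appending ".npz".
    tmp = Path(str(path) + ".tmp")
    with open(tmp, "wb") as fh:
        np.savez(fh, embeddings=embeddings, names=names)
    os.replace(tmp, path)  # atomic rename: readers see the old or new file, never a partial one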
@@ -193,7 +206,17 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
             if s >= thr:
                 out[(processed_names[i], processed_names[j])] = s
 
-    print(f"💾 Embeddings saved to {embeddings_path}")
+    # Save to final cache when complete
+    try:
+        np.savez(final_cache_path, embeddings=embeddings, names=np.array(names, dtype=object))
+        print(f"💾 Final embeddings saved to {final_cache_path}")
+        # Clean up temp cache
+        if temp_cache_path.exists():
+            temp_cache_path.unlink()
+            print(f"🧹 Cleaned up temp cache")
+    except Exception as e:
+        print(f"⚠️ Failed to save final cache: {e}")
+
     return out
 
 def compute_similarities_from_cache(threshold: float) -> Dict[Tuple[str, str], float]:
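compute_similarities_from_cache is called from the fast path above but its body is not shown in this diff. For orientation only, here is a plausible sketch consistent with the thresholding in this hunk (an assumed implementation, not the file's actual code): load the final cache, take pairwise cosine similarities, and keep pairs at or above the threshold.

from pathlib import Path
from typing import Dict, Tuple

import numpy as np

def compute_similarities_from_cache_sketch(threshold: float) -> Dict[Tuple[str, str], float]:
    cached = np.load(Path("embeddings_cache.npz"), allow_pickle=True)
    names = list(cached["names"])
    emb = cached["embeddings"].astype(np.float32)
    emb /= np.linalg.norm(emb, axis=1, keepdims=True) + 1e-12  # L2-normalise rows
    sims = emb @ emb.T  # cosine similarity matrix
    out: Dict[Tuple[str, str], float] = {}
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            s = float(sims[i, j])
            if s >= threshold:
                out[(names[i], names[j])] = s
    return out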
@@ -326,20 +349,50 @@ def build_graph_json(
         print(f"🔍 Got {len(cached_sims)} cached similarities")
 
         if cached_sims:
-            # Create minimal graph with cached data
+            # Create graph with cached similarities + modular dependencies
             cached_data = np.load(embeddings_cache, allow_pickle=True)
             missing = list(cached_data["names"])
 
-            nodes = []
-            for name in missing:
-                nodes.append({"id": name, "cls": "cand", "sz": 1})
+            # Still need to get modular dependencies from repo
+            models_root = transformers_dir / "src/transformers/models"
+            mod_files = modular_files(models_root)
+            deps = dependency_graph(mod_files, models_root)
 
+            # Build full graph structure
+            nodes = set(missing)  # Start with cached models
             links = []
+
+            # Add dependency links
+            for drv, lst in deps.items():
+                for d in lst:
+                    links.append({
+                        "source": d["source"],
+                        "target": drv,
+                        "label": f"{sum(1 for x in lst if x['source'] == d['source'])} imports",
+                        "cand": False
+                    })
+                    nodes.update({d["source"], drv})
+
+            # Add similarity links
             for (a, b), s in cached_sims.items():
                 links.append({"source": a, "target": b, "label": f"{s*100:.1f}%", "cand": True})
 
-            print(f"⚡ Built graph from cache: {len(nodes)} nodes, {len(links)} links")
-            return {"nodes": nodes, "links": links}
+            # Create node list with proper classification
+            targets = {lk["target"] for lk in links if not lk["cand"]}
+            sources = {lk["source"] for lk in links if not lk["cand"]}
+
+            nodelist = []
+            for n in sorted(nodes):
+                if n in missing and n not in sources and n not in targets:
+                    cls = "cand"
+                elif n in sources and n not in targets:
+                    cls = "base"
+                else:
+                    cls = "derived"
+                nodelist.append({"id": n, "cls": cls, "sz": 1})
+
+            print(f"⚡ Built graph from cache: {len(nodelist)} nodes, {len(links)} links")
+            return {"nodes": nodelist, "links": links}
     except Exception as e:
         print(f"⚠️ Cache-only build failed: {e}, falling back to full build")
 
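The cand/base/derived split added in the last hunk classifies each node by its role in the modular-dependency edges only; similarity edges do not affect the class. A tiny self-contained example with made-up data showing how the three classes fall out:

# Standalone demo of the classification rule above (made-up link data).
links = [
    {"source": "llama", "target": "mistral", "cand": False},  # modular dependency edge
    {"source": "bert", "target": "roberta", "cand": True},    # similarity candidate edge
]
missing = ["bert", "roberta"]  # models that came from the embeddings cache

targets = {lk["target"] for lk in links if not lk["cand"]}
sources = {lk["source"] for lk in links if not lk["cand"]}

for n in sorted({"llama", "mistral", "bert", "roberta"}):
    if n in missing and n not in sources and n not in targets:
        cls = "cand"     # cached model with no modular edges
    elif n in sources and n not in targets:
        cls = "base"     # only ever a dependency source
    else:
        cls = "derived"  # appears as a dependency target
    print(n, "->", cls)
# bert -> cand, llama -> base, mistral -> derived, roberta -> cand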