Spaces · Running on Zero

update
Browse files

modular_graph_and_candidates.py  CHANGED  +69 -16
@@ -130,22 +130,35 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
     print(f"Encoding embeddings for {len(names)} models...")
     batch_size = 4  # keep your default
 
-    # ── …
-    …
+    # ── two-stage caching: temp (for resume) + permanent (for reuse) ─────────────
+    temp_cache_path = Path("temp_embeddings.npz")    # For resuming computation
+    final_cache_path = Path("embeddings_cache.npz")  # For permanent storage
     start_idx = 0
     emb_dim = getattr(model, "get_sentence_embedding_dimension", lambda: 768)()
 
-    …
+    # Try to load from permanent cache first
+    if final_cache_path.exists():
+        try:
+            cached = np.load(final_cache_path, allow_pickle=True)
+            cached_names = list(cached["names"])
+            if names == cached_names:  # Exact match - use final cache
+                print(f"✅ Using final embeddings cache ({len(cached_names)} models)")
+                return compute_similarities_from_cache(thr)
+        except Exception as e:
+            print(f"⚠️ Failed to load final cache: {e}")
+
+    # Try to resume from temp cache
+    if temp_cache_path.exists():
         try:
-            cached = np.load(…
+            cached = np.load(temp_cache_path, allow_pickle=True)
             cached_names = list(cached["names"])
             if names[:len(cached_names)] == cached_names:
                 loaded = cached["embeddings"].astype(np.float32)
                 all_embeddings.append(loaded)
                 start_idx = len(cached_names)
-                print(f"…
+                print(f"🔄 Resuming from temp cache: {start_idx}/{len(names)} models")
         except Exception as e:
-            print(f"⚠️ Failed to load …
+            print(f"⚠️ Failed to load temp cache: {e}")
     # ───────────────────────────────────────────────────────────────────────────
 
     for i in tqdm(range(start_idx, len(names), batch_size), desc="Batches", leave=False):
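The resume path only reuses the temp cache when the cached names form an exact prefix of the current name list, so the model ordering must not change between runs. A minimal sketch of that invariant with toy data (the file name demo_temp.npz is hypothetical; the real script writes temp_embeddings.npz per batch):

    import numpy as np

    # Toy illustration of the prefix-resume check above: a partial cache is
    # only reusable if its "names" are a prefix of the current, identically
    # ordered name list.
    names = ["bert", "gpt2", "llama", "mistral"]

    # Simulate an interrupted run that embedded the first two models.
    np.savez("demo_temp.npz",
             embeddings=np.random.rand(2, 768).astype(np.float32),
             names=np.array(names[:2], dtype=object))

    cached = np.load("demo_temp.npz", allow_pickle=True)
    cached_names = list(cached["names"])

    if names[:len(cached_names)] == cached_names:
        start_idx = len(cached_names)   # resume at index 2
        partial = cached["embeddings"]  # shape (2, 768)
        print(f"resuming at {start_idx} with {partial.shape[0]} embeddings")
    else:
        start_idx = 0                   # order changed: recompute from scratch

Inserting or reordering models ahead of the cached prefix silently invalidates the cache and forces a full recompute, which is the safe behaviour here.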
@@ -161,16 +174,16 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
 
         all_embeddings.append(emb)
 
-        # save to …
+        # save to temp cache after each batch (for resume)
         try:
             cur = np.vstack(all_embeddings).astype(np.float32)
             np.savez(
-                …
+                temp_cache_path,
                 embeddings=cur,
                 names=np.array(names[:i+len(batch_names)], dtype=object),
             )
         except Exception as e:
-            print(f"⚠️ Failed to write …
+            print(f"⚠️ Failed to write temp cache: {e}")
 
         if (i - start_idx) % (3 * batch_size) == 0 and torch.cuda.is_available():
             torch.cuda.empty_cache()
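Rewriting the whole temp .npz after every batch keeps the resume point fresh, at the cost of re-serialising everything computed so far; a crash in the middle of np.savez can also leave a truncated file that the loader then rejects. If that matters, a common hardening (not part of this commit) is a write-to-side-file-then-rename checkpoint, sketched here against the same cache layout:

    import os
    from pathlib import Path
    import numpy as np

    def save_checkpoint_atomic(path: Path, embeddings: np.ndarray, names) -> None:
        # np.savez appends ".npz" unless the target already ends with it,
        # so keep the side file's suffix as ".npz" as well.
        tmp = path.with_suffix(".part.npz")
        np.savez(tmp, embeddings=embeddings, names=np.array(list(names), dtype=object))
        # Atomic on POSIX: a crashed or concurrent reader sees either the
        # old checkpoint or the new one, never a half-written file.
        os.replace(tmp, path)

Dropped in for the bare np.savez(temp_cache_path, ...) call, this would make the temp cache crash-safe.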
@@ -193,7 +206,17 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
             if s >= thr:
                 out[(processed_names[i], processed_names[j])] = s
 
-    …
+    # Save to final cache when complete
+    try:
+        np.savez(final_cache_path, embeddings=embeddings, names=np.array(names, dtype=object))
+        print(f"💾 Final embeddings saved to {final_cache_path}")
+        # Clean up temp cache
+        if temp_cache_path.exists():
+            temp_cache_path.unlink()
+            print(f"🧹 Cleaned up temp cache")
+    except Exception as e:
+        print(f"⚠️ Failed to save final cache: {e}")
+
     return out
 
 def compute_similarities_from_cache(threshold: float) -> Dict[Tuple[str, str], float]:
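The exact-match fast path at the top of the function returns through compute_similarities_from_cache(thr), whose body is outside this diff. Given the cache layout written above (an embeddings matrix plus a parallel names array), a plausible sketch of what such a helper does — the body here is a hypothetical reconstruction, only the signature is from the file:

    import numpy as np
    from pathlib import Path
    from typing import Dict, Tuple

    def compute_similarities_from_cache(threshold: float) -> Dict[Tuple[str, str], float]:
        # Hypothetical sketch: load the cached matrix, compute pairwise
        # cosine similarity, and keep pairs at or above the threshold.
        cached = np.load(Path("embeddings_cache.npz"), allow_pickle=True)
        emb = cached["embeddings"].astype(np.float32)
        names = list(cached["names"])

        emb /= np.linalg.norm(emb, axis=1, keepdims=True) + 1e-12  # L2-normalise rows
        sims = emb @ emb.T                                         # cosine similarities

        out: Dict[Tuple[str, str], float] = {}
        for i in range(len(names)):
            for j in range(i + 1, len(names)):
                s = float(sims[i, j])
                if s >= threshold:
                    out[(names[i], names[j])] = s
        return out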
@@ -326,20 +349,50 @@ def build_graph_json(
         print(f"📊 Got {len(cached_sims)} cached similarities")
 
         if cached_sims:
-            # Create …
+            # Create graph with cached similarities + modular dependencies
             cached_data = np.load(embeddings_cache, allow_pickle=True)
             missing = list(cached_data["names"])
 
-            …
-            …
-            …
+            # Still need to get modular dependencies from repo
+            models_root = transformers_dir / "src/transformers/models"
+            mod_files = modular_files(models_root)
+            deps = dependency_graph(mod_files, models_root)
 
+            # Build full graph structure
+            nodes = set(missing)  # Start with cached models
             links = []
+
+            # Add dependency links
+            for drv, lst in deps.items():
+                for d in lst:
+                    links.append({
+                        "source": d["source"],
+                        "target": drv,
+                        "label": f"{sum(1 for x in lst if x['source'] == d['source'])} imports",
+                        "cand": False
+                    })
+                    nodes.update({d["source"], drv})
+
+            # Add similarity links
             for (a, b), s in cached_sims.items():
                 links.append({"source": a, "target": b, "label": f"{s*100:.1f}%", "cand": True})
 
-            …
-            …
+            # Create node list with proper classification
+            targets = {lk["target"] for lk in links if not lk["cand"]}
+            sources = {lk["source"] for lk in links if not lk["cand"]}
+
+            nodelist = []
+            for n in sorted(nodes):
+                if n in missing and n not in sources and n not in targets:
+                    cls = "cand"
+                elif n in sources and n not in targets:
+                    cls = "base"
+                else:
+                    cls = "derived"
+                nodelist.append({"id": n, "cls": cls, "sz": 1})
+
+            print(f"⚡ Built graph from cache: {len(nodelist)} nodes, {len(links)} links")
+            return {"nodes": nodelist, "links": links}
     except Exception as e:
         print(f"⚠️ Cache-only build failed: {e}, falling back to full build")
 
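The cache-only path reimplements the node classification of the full build: a node that only ever appears as the source of an import link is "base", any target of an import link is "derived", and a cached model untouched by modular links stays a candidate. A toy run of the same rules (hypothetical model names):

    # Toy illustration of the classification rules in the cache-only path.
    links = [
        {"source": "llama", "target": "mistral", "label": "3 imports", "cand": False},
        {"source": "bert",  "target": "roberta", "label": "5 imports", "cand": False},
        {"source": "gptx",  "target": "llama",   "label": "91.2%",     "cand": True},
    ]
    missing = ["gptx"]  # cached models with no modular file of their own

    targets = {lk["target"] for lk in links if not lk["cand"]}  # {"mistral", "roberta"}
    sources = {lk["source"] for lk in links if not lk["cand"]}  # {"llama", "bert"}

    for n in sorted({"bert", "gptx", "llama", "mistral", "roberta"}):
        if n in missing and n not in sources and n not in targets:
            cls = "cand"     # similarity-only candidate
        elif n in sources and n not in targets:
            cls = "base"     # only imported from, never generated
        else:
            cls = "derived"  # target of at least one modular import
        print(n, "->", cls)
    # bert -> base, gptx -> cand, llama -> base, mistral -> derived, roberta -> derived

One caveat worth noting for review: the dependency loop emits one link per entry in lst, so if the same source appears several times for a derived model, the same edge is appended repeatedly, each copy carrying the full "N imports" count.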