Molbap (HF Staff) committed on
Commit 0b63d8e · 1 Parent(s): 214d223
Files changed (1):
  1. modular_graph_and_candidates.py (+69 -16)
modular_graph_and_candidates.py CHANGED
@@ -130,22 +130,35 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
     print(f"Encoding embeddings for {len(names)} models...")
     batch_size = 4  # keep your default
 
-    # ── persistent embeddings storage ────────────────────────────────────────────
-    embeddings_path = Path("embeddings_cache.npz")
+    # ── two-stage caching: temp (for resume) + permanent (for reuse) ─────────────
+    temp_cache_path = Path("temp_embeddings.npz")    # For resuming computation
+    final_cache_path = Path("embeddings_cache.npz")  # For permanent storage
     start_idx = 0
     emb_dim = getattr(model, "get_sentence_embedding_dimension", lambda: 768)()
 
-    if embeddings_path.exists():
+    # Try to load from permanent cache first
+    if final_cache_path.exists():
+        try:
+            cached = np.load(final_cache_path, allow_pickle=True)
+            cached_names = list(cached["names"])
+            if names == cached_names:  # Exact match - use final cache
+                print(f"✅ Using final embeddings cache ({len(cached_names)} models)")
+                return compute_similarities_from_cache(thr)
+        except Exception as e:
+            print(f"⚠️ Failed to load final cache: {e}")
+
+    # Try to resume from temp cache
+    if temp_cache_path.exists():
         try:
-            cached = np.load(embeddings_path, allow_pickle=True)
+            cached = np.load(temp_cache_path, allow_pickle=True)
             cached_names = list(cached["names"])
             if names[:len(cached_names)] == cached_names:
                 loaded = cached["embeddings"].astype(np.float32)
                 all_embeddings.append(loaded)
                 start_idx = len(cached_names)
-                print(f"📦 Using cached embeddings for {start_idx}/{len(names)} models")
+                print(f"🔄 Resuming from temp cache: {start_idx}/{len(names)} models")
         except Exception as e:
-            print(f"⚠️ Failed to load cached embeddings: {type(e).__name__}: {e}")
+            print(f"⚠️ Failed to load temp cache: {e}")
     # ───────────────────────────────────────────────────────────────────────────
 
     for i in tqdm(range(start_idx, len(names), batch_size), desc="Batches", leave=False):
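Note on the resume check above: the temp cache is reused only when the cached name list is an exact prefix of the current one, so a reordered or changed model list falls through to a full recompute. A minimal standalone sketch of that probe, with a hypothetical helper name (load_partial_embeddings is not part of this file):

from pathlib import Path
from typing import List, Tuple

import numpy as np

def load_partial_embeddings(cache_path: Path, names: List[str]) -> Tuple[List[np.ndarray], int]:
    """Return (embedding_chunks, start_idx): chunks already computed and where to resume."""
    chunks: List[np.ndarray] = []
    start_idx = 0
    if cache_path.exists():
        try:
            cached = np.load(cache_path, allow_pickle=True)
            cached_names = list(cached["names"])
            # Reuse only if the cache is an exact prefix of the current model list;
            # otherwise the model set changed underneath us and the rows don't line up.
            if names[:len(cached_names)] == cached_names:
                chunks.append(cached["embeddings"].astype(np.float32))
                start_idx = len(cached_names)
        except Exception:
            pass  # treat an unreadable cache as absent and recompute from scratch
    return chunks, start_idx

The prefix match is what makes per-batch checkpoints safe to resume: names[start_idx:] is exactly the work remaining.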
@@ -161,16 +174,16 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
 
         all_embeddings.append(emb)
 
-        # save to persistent cache after each batch
+        # save to temp cache after each batch (for resume)
         try:
             cur = np.vstack(all_embeddings).astype(np.float32)
             np.savez(
-                embeddings_path,
+                temp_cache_path,
                 embeddings=cur,
                 names=np.array(names[:i+len(batch_names)], dtype=object),
             )
         except Exception as e:
-            print(f"⚠️ Failed to write embeddings cache: {type(e).__name__}: {e}")
+            print(f"⚠️ Failed to write temp cache: {e}")
 
         if (i - start_idx) % (3 * batch_size) == 0 and torch.cuda.is_available():
            torch.cuda.empty_cache()
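One caveat worth flagging on this per-batch checkpoint: np.vstack plus a full np.savez rewrite runs on every batch, so total checkpoint I/O grows quadratically over the run, and an interrupt in the middle of the write can leave a truncated .npz behind. A common hardening (an alternative sketch, not what this commit does) is to write to a side file and atomically rename it over the checkpoint:

import os
from pathlib import Path

import numpy as np

def save_checkpoint_atomic(path: Path, embeddings: np.ndarray, names: np.ndarray) -> None:
    # Write to a sibling temp file first; passing a file object to np.savez
    # keeps the exact filename instead of appending ".npz".
    tmp = Path(str(path) + ".tmp")
    with open(tmp, "wb") as fh:
        np.savez(fh, embeddings=embeddings, names=names)
    os.replace(tmp, path)  # atomic rename: readers see the old or new file, never a partial one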
@@ -193,7 +206,17 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
             if s >= thr:
                 out[(processed_names[i], processed_names[j])] = s
 
-    print(f"💾 Embeddings saved to {embeddings_path}")
+    # Save to final cache when complete
+    try:
+        np.savez(final_cache_path, embeddings=embeddings, names=np.array(names, dtype=object))
+        print(f"💾 Final embeddings saved to {final_cache_path}")
+        # Clean up temp cache
+        if temp_cache_path.exists():
+            temp_cache_path.unlink()
+            print(f"🧹 Cleaned up temp cache")
+    except Exception as e:
+        print(f"⚠️ Failed to save final cache: {e}")
+
     return out
 
 def compute_similarities_from_cache(threshold: float) -> Dict[Tuple[str, str], float]:
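compute_similarities_from_cache is called from the fast path above but its body is not shown in this diff. For orientation only, here is a plausible sketch consistent with the thresholding in this hunk (an assumed implementation, not the file's actual code): load the final cache, take pairwise cosine similarities, and keep pairs at or above the threshold.

from pathlib import Path
from typing import Dict, Tuple

import numpy as np

def compute_similarities_from_cache_sketch(threshold: float) -> Dict[Tuple[str, str], float]:
    cached = np.load(Path("embeddings_cache.npz"), allow_pickle=True)
    names = list(cached["names"])
    emb = cached["embeddings"].astype(np.float32)
    emb /= np.linalg.norm(emb, axis=1, keepdims=True) + 1e-12  # L2-normalise rows
    sims = emb @ emb.T  # cosine similarity matrix
    out: Dict[Tuple[str, str], float] = {}
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            s = float(sims[i, j])
            if s >= threshold:
                out[(names[i], names[j])] = s
    return out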
@@ -326,20 +349,50 @@ def build_graph_json(
         print(f"🔍 Got {len(cached_sims)} cached similarities")
 
         if cached_sims:
-            # Create minimal graph with cached data
+            # Create graph with cached similarities + modular dependencies
             cached_data = np.load(embeddings_cache, allow_pickle=True)
             missing = list(cached_data["names"])
 
-            nodes = []
-            for name in missing:
-                nodes.append({"id": name, "cls": "cand", "sz": 1})
+            # Still need to get modular dependencies from repo
+            models_root = transformers_dir / "src/transformers/models"
+            mod_files = modular_files(models_root)
+            deps = dependency_graph(mod_files, models_root)
 
+            # Build full graph structure
+            nodes = set(missing)  # Start with cached models
             links = []
+
+            # Add dependency links
+            for drv, lst in deps.items():
+                for d in lst:
+                    links.append({
+                        "source": d["source"],
+                        "target": drv,
+                        "label": f"{sum(1 for x in lst if x['source'] == d['source'])} imports",
+                        "cand": False
+                    })
+                    nodes.update({d["source"], drv})
+
+            # Add similarity links
             for (a, b), s in cached_sims.items():
                 links.append({"source": a, "target": b, "label": f"{s*100:.1f}%", "cand": True})
 
-            print(f"⚡ Built graph from cache: {len(nodes)} nodes, {len(links)} links")
-            return {"nodes": nodes, "links": links}
+            # Create node list with proper classification
+            targets = {lk["target"] for lk in links if not lk["cand"]}
+            sources = {lk["source"] for lk in links if not lk["cand"]}
+
+            nodelist = []
+            for n in sorted(nodes):
+                if n in missing and n not in sources and n not in targets:
+                    cls = "cand"
+                elif n in sources and n not in targets:
+                    cls = "base"
+                else:
+                    cls = "derived"
+                nodelist.append({"id": n, "cls": cls, "sz": 1})
+
+            print(f"⚡ Built graph from cache: {len(nodelist)} nodes, {len(links)} links")
+            return {"nodes": nodelist, "links": links}
     except Exception as e:
         print(f"⚠️ Cache-only build failed: {e}, falling back to full build")
 
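The cand/base/derived split added in the last hunk classifies each node by its role in the modular-dependency edges only; similarity edges do not affect the class. A tiny self-contained example with made-up data showing how the three classes fall out:

# Standalone demo of the classification rule above (made-up link data).
links = [
    {"source": "llama", "target": "mistral", "cand": False},  # modular dependency edge
    {"source": "bert", "target": "roberta", "cand": True},    # similarity candidate edge
]
missing = ["bert", "roberta"]  # models that came from the embeddings cache

targets = {lk["target"] for lk in links if not lk["cand"]}
sources = {lk["source"] for lk in links if not lk["cand"]}

for n in sorted({"llama", "mistral", "bert", "roberta"}):
    if n in missing and n not in sources and n not in targets:
        cls = "cand"     # cached model with no modular edges
    elif n in sources and n not in targets:
        cls = "base"     # only ever a dependency source
    else:
        cls = "derived"  # appears as a dependency target
    print(n, "->", cls)
# bert -> cand, llama -> base, mistral -> derived, roberta -> cand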