Molbap (HF Staff) committed
Commit b90f4fc · 2 Parent(s): c862054 6804082

Merge branch 'main' of https://huggingface.co/spaces/Molbap/transformers-modular-refactor

Files changed (1)
1. modular_graph_and_candidates.py +6 -46
modular_graph_and_candidates.py CHANGED
@@ -94,59 +94,21 @@ def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tup
                 out[(m1, m2)] = s
     return out
 
-#@spaces.GPU
-def old_embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
-    model = SentenceTransformer("codesage/codesage-large-v2", device="cpu", trust_remote_code=True)
-    model.max_seq_length = 8192 # truncate overly long modeling files
-    texts = {}
-
-    for name in tqdm(missing, desc="Reading modeling files"):
-        code = ""
-        for py in (models_root / name).rglob("modeling_*.py"):
-            try:
-                code += _strip_source(py.read_text(encoding="utf-8")) + "\n"
-            except Exception:
-                continue
-        texts[name] = code.strip() or " "
-
-    names = list(texts)
-    all_embeddings = []
-
-    print("Encoding embeddings...")
-    batch_size = 2
-    for i in tqdm(range(0, len(names), batch_size), desc="Batches", leave=False):
-        batch = [texts[n] for n in names[i:i+batch_size]]
-        emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
-        all_embeddings.append(emb)
-
-    embeddings = np.vstack(all_embeddings) # [N, D]
-
-    print("Computing pairwise similarities...")
-    sims = embeddings @ embeddings.T
-
-    out = {}
-    for i in range(len(names)):
-        for j in range(i + 1, len(names)):
-            s = sims[i, j]
-            if s >= thr:
-                out[(names[i], names[j])] = float(s)
-    return out
 
-#@spaces.GPU
+@spaces.GPU
 def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
     model = SentenceTransformer("codesage/codesage-large-v2", device="cpu", trust_remote_code=True)
 
-    # Hard-cap by backend max positions (prevents IndexError in self.wpe)
     try:
         cfg = model[0].auto_model.config
         pos_limit = int(getattr(cfg, "n_positions", getattr(cfg, "max_position_embeddings")))
     except Exception:
-        pos_limit = 1024 # conservative fallback if config is odd
+        pos_limit = 1024
 
-    seq_len = min(pos_limit, 2048) # optional extra ceiling if pos_limit is huge
-    model.max_seq_length = seq_len # SentenceTransformer wrapper
-    model[0].max_seq_length = seq_len # its Transformer submodule actually used for tokenize()
-    model[0].tokenizer.model_max_length = seq_len # ensure tokenizer truncates
+    seq_len = min(pos_limit, 2048)
+    model.max_seq_length = seq_len
+    model[0].max_seq_length = seq_len
+    model[0].tokenizer.model_max_length = seq_len
 
     texts = {}
     for name in tqdm(missing, desc="Reading modeling files"):
@@ -168,8 +130,6 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
         emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
         all_embeddings.append(emb)
 
-    # Cosine similarity requires normalized vectors; SentenceTransformers doesn't always return them normalized
-    import numpy as np
     embeddings = np.vstack(all_embeddings).astype(np.float32)
     norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
     embeddings = embeddings / norms
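
Note: the sequence-length handling that survives the merge can be read as a small standalone helper. This is a sketch only: the function name cap_sequence_length and its ceiling/fallback parameters are illustrative and not part of the repository; it assumes a loaded SentenceTransformer whose first module exposes auto_model, tokenizer, and max_seq_length, as the codesage/codesage-large-v2 checkpoint used above does.

def cap_sequence_length(model, ceiling=2048, fallback=1024):
    """Clamp the effective sequence length to the backend's position-embedding limit.

    Mirrors the capping logic kept in embedding_similarity_clusters; helper name and
    parameters are illustrative.
    """
    try:
        cfg = model[0].auto_model.config
        # GPT-2-style configs expose n_positions; most others use max_position_embeddings.
        pos_limit = int(getattr(cfg, "n_positions", getattr(cfg, "max_position_embeddings")))
    except Exception:
        pos_limit = fallback  # conservative fallback if the config lacks either attribute

    seq_len = min(pos_limit, ceiling)
    model.max_seq_length = seq_len                   # SentenceTransformer wrapper
    model[0].max_seq_length = seq_len                # Transformer submodule used by tokenize()
    model[0].tokenizer.model_max_length = seq_len    # make the tokenizer truncate as well
    return seq_len

Calling cap_sequence_length(model) right after constructing the model would reproduce the seq_len = min(pos_limit, 2048) behaviour shown in the diff.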
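
The normalization kept at the end of the diff exists because the pairwise product embeddings @ embeddings.T only yields cosine similarities when the rows are unit length. A minimal numpy-only sketch of that thresholding step, with random vectors standing in for the real modeling-file embeddings and an illustrative cosine_pairs_above_threshold helper:

import numpy as np


def cosine_pairs_above_threshold(names, embeddings, thr):
    # L2-normalize so the dot product of any two rows equals their cosine similarity.
    embeddings = embeddings.astype(np.float32)
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
    embeddings = embeddings / norms

    # Full pairwise similarity matrix; keep each unordered pair once.
    sims = embeddings @ embeddings.T
    out = {}
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            s = float(sims[i, j])
            if s >= thr:
                out[(names[i], names[j])] = s
    return out


# Toy usage: three fake "models" with random 8-dimensional embeddings.
rng = np.random.default_rng(0)
names = ["model_a", "model_b", "model_c"]
vectors = rng.normal(size=(len(names), 8))
print(cosine_pairs_above_threshold(names, vectors, thr=0.5))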