Spaces: Running on Zero

Commit: Add longer seqlens

modular_graph_and_candidates.py (CHANGED)
@@ -96,10 +96,21 @@ def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tup
 
 @spaces.GPU
 def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
-    model = SentenceTransformer("codesage/codesage-large-v2", trust_remote_code=True)
-
-
+    model = SentenceTransformer("codesage/codesage-large-v2", device="cpu", trust_remote_code=True)
+
+    # Hard-cap by backend max positions (prevents IndexError in self.wpe)
+    try:
+        cfg = model[0].auto_model.config
+        pos_limit = int(getattr(cfg, "n_positions", getattr(cfg, "max_position_embeddings")))
+    except Exception:
+        pos_limit = 1024  # conservative fallback if config is odd
 
+    seq_len = min(pos_limit, 2048)  # optional extra ceiling if pos_limit is huge
+    model.max_seq_length = seq_len                 # SentenceTransformer wrapper
+    model[0].max_seq_length = seq_len              # its Transformer submodule actually used for tokenize()
+    model[0].tokenizer.model_max_length = seq_len  # ensure tokenizer truncates
+
+    texts = {}
     for name in tqdm(missing, desc="Reading modeling files"):
         code = ""
         for py in (models_root / name).rglob("modeling_*.py"):
@@ -113,23 +124,27 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
     all_embeddings = []
 
     print("Encoding embeddings...")
-    batch_size = 8
+    batch_size = 8
     for i in tqdm(range(0, len(names), batch_size), desc="Batches", leave=False):
         batch = [texts[n] for n in names[i:i+batch_size]]
         emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
         all_embeddings.append(emb)
 
-
+    # Cosine similarity requires normalized vectors; SentenceTransformers doesn't always return them normalized
+    import numpy as np
+    embeddings = np.vstack(all_embeddings).astype(np.float32)
+    norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
+    embeddings = embeddings / norms
 
     print("Computing pairwise similarities...")
-
+    sims_mat = embeddings @ embeddings.T
 
     out = {}
     for i in range(len(names)):
         for j in range(i + 1, len(names)):
-            s =
+            s = float(sims_mat[i, j])
             if s >= thr:
-                out[(names[i], names[j])] =
+                out[(names[i], names[j])] = s
     return out
 
 
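The first half of the change clamps the encoder's effective sequence length to whatever the backbone's position-embedding table supports, so long modeling files no longer index past self.wpe. Below is a minimal standalone sketch of that capping step, assuming the sentence-transformers package and the same codesage/codesage-large-v2 checkpoint; the helper name capped_sentence_transformer is illustrative, not part of the Space.

# Sketch only: cap a SentenceTransformer's effective sequence length to the
# backbone's maximum positions, mirroring the change in this commit.
from sentence_transformers import SentenceTransformer

def capped_sentence_transformer(model_name: str, ceiling: int = 2048) -> SentenceTransformer:
    model = SentenceTransformer(model_name, trust_remote_code=True)
    try:
        cfg = model[0].auto_model.config
        # GPT-2-style configs expose n_positions; most others use max_position_embeddings.
        pos_limit = int(getattr(cfg, "n_positions", getattr(cfg, "max_position_embeddings")))
    except Exception:
        pos_limit = 1024  # conservative fallback when the config exposes neither field

    seq_len = min(pos_limit, ceiling)
    model.max_seq_length = seq_len                 # wrapper-level limit
    model[0].max_seq_length = seq_len              # Transformer module used by tokenize()
    model[0].tokenizer.model_max_length = seq_len  # make the tokenizer truncate too
    return model

# Hypothetical usage:
# model = capped_sentence_transformer("codesage/codesage-large-v2")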
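The second half of the diff replaces the per-pair similarity lookup with one normalized matrix product: after L2-normalizing the embeddings, embeddings @ embeddings.T is exactly the cosine-similarity matrix, and pairs above the threshold are read off its upper triangle. A small self-contained sketch of that pairing step follows; the function name pairs_above_threshold and the toy inputs are illustrative.

# Sketch only: L2-normalize embeddings, take one matrix product, and keep
# name pairs whose cosine similarity clears the threshold.
from typing import Dict, List, Tuple
import numpy as np

def pairs_above_threshold(names: List[str], embeddings: np.ndarray, thr: float) -> Dict[Tuple[str, str], float]:
    emb = embeddings.astype(np.float32)
    emb /= np.linalg.norm(emb, axis=1, keepdims=True) + 1e-12  # avoid division by zero
    sims = emb @ emb.T  # cosine similarity, since every row is unit-length

    out: Dict[Tuple[str, str], float] = {}
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            s = float(sims[i, j])
            if s >= thr:
                out[(names[i], names[j])] = s
    return out

# Toy usage: two near-duplicate vectors and one unrelated vector.
vecs = np.array([[1.0, 0.0], [0.99, 0.05], [0.0, 1.0]])
print(pairs_above_threshold(["a", "b", "c"], vecs, thr=0.9))  # only ('a', 'b') clears 0.9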