Merge branch 'main' of https://huggingface.co/spaces/Molbap/transformers-modular-refactor
modular_graph_and_candidates.py
CHANGED
@@ -94,59 +94,21 @@ def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tup
                 out[(m1, m2)] = s
     return out

-#@spaces.GPU
-def old_embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
-    model = SentenceTransformer("codesage/codesage-large-v2", device="cpu", trust_remote_code=True)
-    model.max_seq_length = 8192  # truncate overly long modeling files
-    texts = {}
-
-    for name in tqdm(missing, desc="Reading modeling files"):
-        code = ""
-        for py in (models_root / name).rglob("modeling_*.py"):
-            try:
-                code += _strip_source(py.read_text(encoding="utf-8")) + "\n"
-            except Exception:
-                continue
-        texts[name] = code.strip() or " "
-
-    names = list(texts)
-    all_embeddings = []
-
-    print("Encoding embeddings...")
-    batch_size = 2
-    for i in tqdm(range(0, len(names), batch_size), desc="Batches", leave=False):
-        batch = [texts[n] for n in names[i:i+batch_size]]
-        emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
-        all_embeddings.append(emb)
-
-    embeddings = np.vstack(all_embeddings)  # [N, D]
-
-    print("Computing pairwise similarities...")
-    sims = embeddings @ embeddings.T
-
-    out = {}
-    for i in range(len(names)):
-        for j in range(i + 1, len(names)):
-            s = sims[i, j]
-            if s >= thr:
-                out[(names[i], names[j])] = float(s)
-    return out

-
+@spaces.GPU
 def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
     model = SentenceTransformer("codesage/codesage-large-v2", device="cpu", trust_remote_code=True)

-    # Hard-cap by backend max positions (prevents IndexError in self.wpe)
     try:
         cfg = model[0].auto_model.config
         pos_limit = int(getattr(cfg, "n_positions", getattr(cfg, "max_position_embeddings")))
     except Exception:
-        pos_limit = 1024
+        pos_limit = 1024

-    seq_len = min(pos_limit, 2048)
-    model.max_seq_length = seq_len
-    model[0].max_seq_length = seq_len
-    model[0].tokenizer.model_max_length = seq_len
+    seq_len = min(pos_limit, 2048)
+    model.max_seq_length = seq_len
+    model[0].max_seq_length = seq_len
+    model[0].tokenizer.model_max_length = seq_len

     texts = {}
     for name in tqdm(missing, desc="Reading modeling files"):
@@ -168,8 +130,6 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
         emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
         all_embeddings.append(emb)

-    # Cosine similarity requires normalized vectors; SentenceTransformers doesn't always return them normalized
-    import numpy as np
     embeddings = np.vstack(all_embeddings).astype(np.float32)
     norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
     embeddings = embeddings / norms
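Note on the second hunk: it drops the inline comment and the function-local `import numpy as np` (numpy is presumably already imported at module level, since the surrounding code keeps using `np.`) but keeps the L2 normalization. That step matters because `embeddings @ embeddings.T` only yields cosine similarities when each row has unit norm. A minimal sketch of the similarity step under that assumption; the random embeddings, model names, and threshold below are toy placeholders, not values from the Space:

import numpy as np

# Toy embeddings standing in for the SentenceTransformer output.
rng = np.random.default_rng(0)
names = ["llama", "mistral", "gemma"]  # hypothetical model folder names
embeddings = rng.normal(size=(len(names), 8)).astype(np.float32)

# L2-normalize rows so that the Gram matrix E @ E.T holds cosine similarities.
norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
embeddings = embeddings / norms
sims = embeddings @ embeddings.T  # sims[i, j] in [-1, 1]

# Keep only pairs above the threshold, mirroring what embedding_similarity_clusters returns.
thr = 0.5
out = {
    (names[i], names[j]): float(sims[i, j])
    for i in range(len(names))
    for j in range(i + 1, len(names))
    if sims[i, j] >= thr
}
print(out)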
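The other functional change is the `@spaces.GPU` decorator on `embedding_similarity_clusters` (previously it existed only as a commented-out line on the deleted function), which asks ZeroGPU to attach a GPU while the call runs. A rough sketch of the usual pattern, not the Space's actual code; the `duration` argument and the `encode_on_gpu` helper are illustrative assumptions:

import spaces
from sentence_transformers import SentenceTransformer

# Placeholder wiring for a ZeroGPU Space; the helper below is illustrative.
model = SentenceTransformer("codesage/codesage-large-v2", device="cpu", trust_remote_code=True)

@spaces.GPU(duration=120)  # assumed optional knob: roughly how long one call may hold the GPU
def encode_on_gpu(texts):
    # While this call runs on ZeroGPU hardware, a CUDA device is attached,
    # so the heavy encode step can be pushed to it.
    return model.encode(texts, convert_to_numpy=True, device="cuda", show_progress_bar=False)

Outside a ZeroGPU Space the decorator is meant to be a no-op, so the same script should still run on CPU.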