update
modular_graph_and_candidates.py CHANGED (+34 −18)
@@ -61,10 +61,10 @@ def _strip_source(code: str) -> str:
                 if not re.match(r"\s*(from|import)\s+", ln))
 
 def _tokenise(code: str) -> Set[str]:
+    """Extract identifiers using regex - more robust than tokenizer for malformed code."""
     toks: Set[str] = set()
-    for tok in tokenize.generate_tokens(io.StringIO(code).readline):
-        if tok.type == tokenize.NAME:
-            toks.add(tok.string)
+    for match in re.finditer(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', code):
+        toks.add(match.group())
     return toks
 
 def build_token_bags(models_root: Path) -> Tuple[Dict[str, List[Set[str]]], Dict[str, int]]:
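The point of this hunk is robustness: Python's tokenize module raises on malformed source, while the regex scans anything. A minimal sketch, restating the new _tokenise from the hunk above so the demo is self-contained (the broken snippet and its output are illustrative, not from the Space):

```python
import re
from typing import Set

def _tokenise(code: str) -> Set[str]:
    """Regex-based identifier extraction, as in the new version above."""
    toks: Set[str] = set()
    for match in re.finditer(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', code):
        toks.add(match.group())
    return toks

# A syntactically broken snippet: tokenize.generate_tokens() would raise
# TokenError on the unclosed paren, but the regex still recovers identifiers.
snippet = "def broken(:\n    return some_var +"
print(sorted(_tokenise(snippet)))  # ['broken', 'def', 'return', 'some_var']
```

Note that the regex also sweeps keywords such as def and return into the bag; since that noise is shared by every model's source, it shifts all Jaccard scores roughly equally.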
@@ -124,9 +124,9 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float
     all_embeddings = []
 
     print("Encoding embeddings...")
-    batch_size = …
-    for i in tqdm(range(0, len(names), batch_size), desc="…
-        batch = [texts[…
+    batch_size = 1
+    for i in tqdm(range(0, len(names), batch_size), desc="Models", leave=False):
+        batch = [texts[names[i]]]
         emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
         all_embeddings.append(emb)
 
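With batch_size = 1, range(0, len(names), 1) is effectively range(len(names)), so the loop encodes exactly one model's text per iteration. A self-contained sketch of the loop's shape, with a stub in place of the SentenceTransformer (the stub class and the 384 embedding dimension are assumptions):

```python
import numpy as np

class StubEncoder:
    """Stand-in for the sentence-transformers model; encode() returns an
    (n_texts, dim) array, as SentenceTransformer.encode does."""
    def encode(self, batch, convert_to_numpy=True, show_progress_bar=False):
        return np.zeros((len(batch), 384))  # 384 is an assumed embedding dim

names = ["bert", "llama", "clip"]
texts = {n: f"flattened source of {n}" for n in names}
model = StubEncoder()

all_embeddings = []
batch_size = 1
for i in range(0, len(names), batch_size):  # mirrors the loop in the diff
    batch = [texts[names[i]]]               # exactly one text per step
    emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
    all_embeddings.append(emb)

embeddings = np.vstack(all_embeddings)
print(embeddings.shape)                     # (3, 384)
```

Per-item batches keep peak memory flat and give tqdm one tick per model, at the cost of encoder throughput. Raising batch_size again would also require slicing, e.g. [texts[n] for n in names[i:i + batch_size]], since the loop body as written only reads names[i].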
@@ -194,6 +194,26 @@ def dependency_graph(modular_files: List[Path], models_root: Path) -> Dict[str,
 
 # modular_graph_and_candidates.py (top-level)
 
+def get_missing_models(models_root: Path, multimodal: bool = False) -> Tuple[List[str], Dict[str, List[Set[str]]], Dict[str, int]]:
+    """Get list of models missing modular implementations."""
+    bags, pix_hits = build_token_bags(models_root)
+    mod_files = modular_files(models_root)
+    models_with_modular = {p.parent.name for p in mod_files}
+    missing = [m for m in bags if m not in models_with_modular]
+
+    if multimodal:
+        missing = [m for m in missing if pix_hits[m] >= PIXEL_MIN_HITS]
+
+    return missing, bags, pix_hits
+
+def compute_similarities(models_root: Path, missing: List[str], bags: Dict[str, List[Set[str]]],
+                         threshold: float, sim_method: str) -> Dict[Tuple[str, str], float]:
+    """Compute similarities between missing models using specified method."""
+    if sim_method == "jaccard":
+        return similarity_clusters({m: bags[m] for m in missing}, threshold)
+    else:
+        return embedding_similarity_clusters(models_root, missing, threshold)
+
 def build_graph_json(
     transformers_dir: Path,
     threshold: float = SIM_DEFAULT,
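The refactor makes the candidate-selection pipeline callable outside build_graph_json. A hypothetical call site (the checkout path, threshold, and flags below are placeholders):

```python
from pathlib import Path

models_root = Path("transformers/src/transformers/models")  # assumed checkout

# Models that have no modular_*.py yet, restricted to multimodal ones:
missing, bags, pix_hits = get_missing_models(models_root, multimodal=True)

# Pairwise similarity over the missing models, using the cheap token route:
sims = compute_similarities(models_root, missing, bags,
                            threshold=0.5, sim_method="jaccard")
# sims: {(model_a, model_b): score, ...} for pairs above the threshold
```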
@@ -202,20 +202,16 @@ def build_graph_json(
 ) -> dict:
     """Return the {nodes, links} dict that D3 needs."""
     models_root = transformers_dir / "src/transformers/models"
-    bags, pix_hits = build_token_bags(models_root)
-
+
+    # Get missing models and their data
+    missing, bags, pix_hits = get_missing_models(models_root, multimodal)
+
+    # Build dependency graph
     mod_files = modular_files(models_root)
     deps = dependency_graph(mod_files, models_root)
-
-    models_with_modular = {p.parent.name for p in mod_files}
-    missing = [m for m in bags if m not in models_with_modular]
-    if multimodal:
-        missing = [m for m in missing if pix_hits[m] >= PIXEL_MIN_HITS]
-
-    if sim_method == "jaccard":
-        sims = similarity_clusters({m: bags[m] for m in missing}, threshold)
-    else:
-        sims = embedding_similarity_clusters(models_root, missing, threshold)
+
+    # Compute similarities
+    sims = compute_similarities(models_root, missing, bags, threshold, sim_method)
 
     # ---- assemble nodes & links ----
     nodes: Set[str] = set()
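For orientation, the docstring above promises the {nodes, links} dict that D3 consumes. Only nodes: Set[str] is visible in this hunk, so the per-link keys in this sketch are an assumption, not the Space's actual schema:

```python
# Assumed output shape -- node ids are model folder names; the link fields
# (source, target, similarity value) are illustrative.
graph = {
    "nodes": [{"id": "bert"}, {"id": "roberta"}],
    "links": [{"source": "bert", "target": "roberta", "value": 0.91}],
}
```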
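Put together, a minimal driver for the refactored entry point could look like the following; it assumes the parameters elided from the signature (between threshold and the closing parenthesis) include sim_method and multimodal, since the body references both:

```python
import json
from pathlib import Path

graph = build_graph_json(
    Path("./transformers"),   # hypothetical local checkout of huggingface/transformers
    threshold=SIM_DEFAULT,
    sim_method="embedding",   # or "jaccard" for the token-bag route
    multimodal=False,
)
Path("graph.json").write_text(json.dumps(graph, indent=2))
```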