Molbap HF Staff committed on
Commit
09c1921
·
1 Parent(s): 06274f5
Files changed (1) hide show
  1. modular_graph_and_candidates.py +34 -18
modular_graph_and_candidates.py CHANGED
@@ -61,10 +61,10 @@ def _strip_source(code: str) -> str:
61
  if not re.match(r"\s*(from|import)\s+", ln))
62
 
63
  def _tokenise(code: str) -> Set[str]:
 
64
  toks: Set[str] = set()
65
- for tok in tokenize.generate_tokens(iter(code.splitlines(keepends=True)).__next__):
66
- if tok.type == tokenize.NAME:
67
- toks.add(tok.string)
68
  return toks
69
 
70
  def build_token_bags(models_root: Path) -> Tuple[Dict[str, List[Set[str]]], Dict[str, int]]:
@@ -124,9 +124,9 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
124
  all_embeddings = []
125
 
126
  print("Encoding embeddings...")
127
- batch_size = 2
128
- for i in tqdm(range(0, len(names), batch_size), desc="Batches", leave=False):
129
- batch = [texts[n] for n in names[i:i+batch_size]]
130
  emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
131
  all_embeddings.append(emb)
132
 
@@ -194,6 +194,26 @@ def dependency_graph(modular_files: List[Path], models_root: Path) -> Dict[str,
194
 
195
  # modular_graph_and_candidates.py (top-level)
196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  def build_graph_json(
198
  transformers_dir: Path,
199
  threshold: float = SIM_DEFAULT,
@@ -202,20 +222,16 @@ def build_graph_json(
202
  ) -> dict:
203
  """Return the {nodes, links} dict that D3 needs."""
204
  models_root = transformers_dir / "src/transformers/models"
205
- bags, pix_hits = build_token_bags(models_root)
206
-
 
 
 
207
  mod_files = modular_files(models_root)
208
  deps = dependency_graph(mod_files, models_root)
209
-
210
- models_with_modular = {p.parent.name for p in mod_files}
211
- missing = [m for m in bags if m not in models_with_modular]
212
- if multimodal:
213
- missing = [m for m in missing if pix_hits[m] >= PIXEL_MIN_HITS]
214
-
215
- if sim_method == "jaccard":
216
- sims = similarity_clusters({m: bags[m] for m in missing}, threshold)
217
- else:
218
- sims = embedding_similarity_clusters(models_root, missing, threshold)
219
 
220
  # ---- assemble nodes & links ----
221
  nodes: Set[str] = set()
 
61
  if not re.match(r"\s*(from|import)\s+", ln))
62
 
63
  def _tokenise(code: str) -> Set[str]:
64
+ """Extract identifiers using regex - more robust than tokenizer for malformed code."""
65
  toks: Set[str] = set()
66
+ for match in re.finditer(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', code):
67
+ toks.add(match.group())
 
68
  return toks
69
 
70
  def build_token_bags(models_root: Path) -> Tuple[Dict[str, List[Set[str]]], Dict[str, int]]:
 
124
  all_embeddings = []
125
 
126
  print("Encoding embeddings...")
127
+ batch_size = 1
128
+ for i in tqdm(range(0, len(names), batch_size), desc="Models", leave=False):
129
+ batch = [texts[names[i]]]
130
  emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
131
  all_embeddings.append(emb)
132
 
 
194
 
195
  # modular_graph_and_candidates.py (top-level)
196
 
197
+ def get_missing_models(models_root: Path, multimodal: bool = False) -> Tuple[List[str], Dict[str, List[Set[str]]], Dict[str, int]]:
198
+ """Get list of models missing modular implementations."""
199
+ bags, pix_hits = build_token_bags(models_root)
200
+ mod_files = modular_files(models_root)
201
+ models_with_modular = {p.parent.name for p in mod_files}
202
+ missing = [m for m in bags if m not in models_with_modular]
203
+
204
+ if multimodal:
205
+ missing = [m for m in missing if pix_hits[m] >= PIXEL_MIN_HITS]
206
+
207
+ return missing, bags, pix_hits
208
+
209
+ def compute_similarities(models_root: Path, missing: List[str], bags: Dict[str, List[Set[str]]],
210
+ threshold: float, sim_method: str) -> Dict[Tuple[str, str], float]:
211
+ """Compute similarities between missing models using specified method."""
212
+ if sim_method == "jaccard":
213
+ return similarity_clusters({m: bags[m] for m in missing}, threshold)
214
+ else:
215
+ return embedding_similarity_clusters(models_root, missing, threshold)
216
+
217
  def build_graph_json(
218
  transformers_dir: Path,
219
  threshold: float = SIM_DEFAULT,
 
222
  ) -> dict:
223
  """Return the {nodes, links} dict that D3 needs."""
224
  models_root = transformers_dir / "src/transformers/models"
225
+
226
+ # Get missing models and their data
227
+ missing, bags, pix_hits = get_missing_models(models_root, multimodal)
228
+
229
+ # Build dependency graph
230
  mod_files = modular_files(models_root)
231
  deps = dependency_graph(mod_files, models_root)
232
+
233
+ # Compute similarities
234
+ sims = compute_similarities(models_root, missing, bags, threshold, sim_method)
 
 
 
 
 
 
 
235
 
236
  # ---- assemble nodes & links ----
237
  nodes: Set[str] = set()