Molbap HF Staff committed on
Commit
c862054
·
1 Parent(s): dc04102
Files changed (2) hide show
  1. hf-logo.svg +8 -0
  2. modular_graph_and_candidates.py +126 -31
hf-logo.svg ADDED
modular_graph_and_candidates.py CHANGED
@@ -94,10 +94,10 @@ def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tup
94
  out[(m1, m2)] = s
95
  return out
96
 
97
- @spaces.GPU
98
- def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
99
- model = SentenceTransformer("codesage/codesage-large-v2", trust_remote_code=True)
100
- model.max_seq_length = 4096 # truncate overly long modeling files
101
  texts = {}
102
 
103
  for name in tqdm(missing, desc="Reading modeling files"):
@@ -113,7 +113,7 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
113
  all_embeddings = []
114
 
115
  print("Encoding embeddings...")
116
- batch_size = 8 # or 2 if memory is tight
117
  for i in tqdm(range(0, len(names), batch_size), desc="Batches", leave=False):
118
  batch = [texts[n] for n in names[i:i+batch_size]]
119
  emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
@@ -122,7 +122,7 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
122
  embeddings = np.vstack(all_embeddings) # [N, D]
123
 
124
  print("Computing pairwise similarities...")
125
- sims = embeddings @ embeddings.T # cosine since already normalized
126
 
127
  out = {}
128
  for i in range(len(names)):
@@ -132,6 +132,59 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
132
  out[(names[i], names[j])] = float(s)
133
  return out
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
 
137
 
@@ -261,34 +314,72 @@ def generate_html(graph: dict) -> str:
261
  # ────────────────────────────────────────────────────────────────────────────────
262
  CSS = """
263
  @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600&display=swap');
264
- :root { --base: 60px; }
265
- body { margin:0; font-family:'Inter',Arial,sans-serif; background:transparent; overflow:hidden; }
266
- svg { width:100vw; height:100vh; }
267
- .link { stroke:#999; stroke-opacity:.6; }
268
- .link.cand { stroke:#e63946; stroke-width:2.5; }
269
- .node-label { fill:#333; pointer-events:none; text-anchor:middle; font-weight:600; }
270
- .link-label { fill:#555; font-size:10px; pointer-events:none; text-anchor:middle; }
271
- .node.base path { fill:#ffbe0b; }
272
- .node.derived circle { fill:#1f77b4; }
273
- .node.cand circle, .node.cand path { fill:#e63946; }
274
- #legend { position:fixed; top:18px; left:18px; background:rgba(255,255,255,.92); padding:18px 28px;
275
- border-radius:10px; border:1.5px solid #bbb; font-size:18px; box-shadow:0 2px 8px rgba(0,0,0,.08); }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
  """
277
 
278
  JS = """
279
-
280
  function updateVisibility() {
281
  const show = document.getElementById('toggleRed').checked;
282
  svg.selectAll('.link.cand').style('display', show ? null : 'none');
283
  svg.selectAll('.node.cand').style('display', show ? null : 'none');
284
- svg.selectAll('.link-label')
285
- .filter(d => d.cand)
286
- .style('display', show ? null : 'none');
287
  }
288
-
289
  document.getElementById('toggleRed').addEventListener('change', updateVisibility);
290
 
291
-
292
  const graph = __GRAPH_DATA__;
293
  const W = innerWidth, H = innerHeight;
294
  const svg = d3.select('#dependency').call(d3.zoom().on('zoom', e => g.attr('transform', e.transform)));
@@ -311,17 +402,21 @@ const node = g.selectAll('g.node')
311
  .attr('class', d => `node ${d.cls}`)
312
  .call(d3.drag().on('start', dragStart).on('drag', dragged).on('end', dragEnd));
313
 
314
- node.filter(d => d.cls==='base').append('image')
315
- .attr('xlink:href', 'hf-logo.svg').attr('x', -30).attr('y', -30).attr('width', 60).attr('height', 60);
316
- node.filter(d => d.cls!=='base').append('circle').attr('r', d => 20*d.sz);
 
 
 
 
 
317
  node.append('text').attr('class','node-label').attr('dy','-2.4em').text(d => d.id);
318
 
319
  const sim = d3.forceSimulation(graph.nodes)
320
- .force('link', d3.forceLink(graph.links).id(d => d.id).distance(520)) // tighter links
321
- .force('charge', d3.forceManyBody().strength(-600)) // weaker repulsion
322
  .force('center', d3.forceCenter(W / 2, H / 2))
323
- .force('collide', d3.forceCollide(d => d.cls === 'base' ? 50 : 50)); // smaller bubble spacing
324
-
325
 
326
  sim.on('tick', () => {
327
  link.attr('x1', d=>d.source.x).attr('y1', d=>d.source.y)
 
94
  out[(m1, m2)] = s
95
  return out
96
 
97
+ #@spaces.GPU
98
+ def old_embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
99
+ model = SentenceTransformer("codesage/codesage-large-v2", device="cpu", trust_remote_code=True)
100
+ model.max_seq_length = 8192 # truncate overly long modeling files
101
  texts = {}
102
 
103
  for name in tqdm(missing, desc="Reading modeling files"):
 
113
  all_embeddings = []
114
 
115
  print("Encoding embeddings...")
116
+ batch_size = 2
117
  for i in tqdm(range(0, len(names), batch_size), desc="Batches", leave=False):
118
  batch = [texts[n] for n in names[i:i+batch_size]]
119
  emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
 
122
  embeddings = np.vstack(all_embeddings) # [N, D]
123
 
124
  print("Computing pairwise similarities...")
125
+ sims = embeddings @ embeddings.T
126
 
127
  out = {}
128
  for i in range(len(names)):
 
132
  out[(names[i], names[j])] = float(s)
133
  return out
134
 
135
+ #@spaces.GPU
136
def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
    """Score pairs of models by cosine similarity of their embedded modeling code.

    For every name in ``missing``, all ``modeling_*.py`` files found under
    ``models_root / name`` are passed through ``_strip_source``, concatenated
    and embedded with the ``codesage/codesage-large-v2`` SentenceTransformer
    on CPU. Embeddings are L2-normalised so the dot product is the cosine.

    Args:
        models_root: Directory holding one sub-directory per model name.
        missing: Model names to embed and compare with each other.
        thr: Minimum cosine similarity for a pair to be reported.

    Returns:
        A dict mapping ``(name_a, name_b)`` to the similarity, for each pair
        scoring ``>= thr``. ``name_a`` precedes ``name_b`` in the order the
        names first appear in ``missing``. Empty when fewer than two distinct
        names are given.
    """
    from itertools import combinations

    import numpy as np

    # With fewer than two distinct names there is no pair to score; return
    # before loading the model (and before np.vstack fails on an empty list).
    if len(set(missing)) < 2:
        return {}

    model = SentenceTransformer("codesage/codesage-large-v2", device="cpu", trust_remote_code=True)

    # Hard-cap by backend max positions (prevents IndexError in self.wpe)
    try:
        cfg = model[0].auto_model.config
        # Look the two attribute names up one after the other: a nested
        # getattr default is evaluated eagerly and would raise when only
        # ``n_positions`` is defined.
        pos_limit = getattr(cfg, "n_positions", None)
        if pos_limit is None:
            pos_limit = cfg.max_position_embeddings
        pos_limit = int(pos_limit)
    except Exception:
        pos_limit = 1024  # conservative fallback if config is odd

    seq_len = min(pos_limit, 2048)  # optional extra ceiling if pos_limit is huge
    model.max_seq_length = seq_len  # SentenceTransformer wrapper
    model[0].max_seq_length = seq_len  # its Transformer submodule actually used for tokenize()
    model[0].tokenizer.model_max_length = seq_len  # ensure tokenizer truncates

    texts = {}
    for name in tqdm(missing, desc="Reading modeling files"):
        parts = []
        for py in (models_root / name).rglob("modeling_*.py"):
            try:
                parts.append(_strip_source(py.read_text(encoding="utf-8")))
            except Exception:
                # Deliberate best-effort: a file that cannot be read or
                # stripped is skipped instead of aborting the whole run.
                continue
        # A single space stands in for models with no usable source so the
        # encoder never receives an empty string.
        texts[name] = "\n".join(parts).strip() or " "

    names = list(texts)
    all_embeddings = []

    print("Encoding embeddings...")
    batch_size = 2
    for start in tqdm(range(0, len(names), batch_size), desc="Batches", leave=False):
        batch = [texts[n] for n in names[start:start + batch_size]]
        emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
        all_embeddings.append(emb)

    # Cosine similarity requires normalized vectors; SentenceTransformers doesn't always return them normalized
    embeddings = np.vstack(all_embeddings).astype(np.float32)
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12  # epsilon avoids division by zero
    embeddings = embeddings / norms

    print("Computing pairwise similarities...")
    sims_mat = embeddings @ embeddings.T

    out = {}
    # combinations() yields each unordered pair once with i < j, in the same
    # order as a nested index loop would.
    for i, j in combinations(range(len(names)), 2):
        s = float(sims_mat[i, j])
        if s >= thr:
            out[(names[i], names[j])] = s
    return out
187
+
188
 
189
 
190
 
 
314
  # ────────────────────────────────────────────────────────────────────────────────
315
  CSS = """
316
  @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600&display=swap');
317
+
318
+ :root{
319
+ --bg:#ffffff;
320
+ --text:#222222;
321
+ --muted:#555555;
322
+ --outline:#ffffff;
323
+ }
324
+ @media (prefers-color-scheme: dark){
325
+ :root{
326
+ --bg:#0b0d10;
327
+ --text:#e8e8e8;
328
+ --muted:#c8c8c8;
329
+ --outline:#000000;
330
+ }
331
+ }
332
+
333
+ body{ margin:0; font-family:'Inter',Arial,sans-serif; background:var(--bg); overflow:hidden; }
334
+ svg{ width:100vw; height:100vh; }
335
+
336
+ .link{ stroke:#999; stroke-opacity:.6; }
337
+ .link.cand{ stroke:#e63946; stroke-width:2.5; }
338
+
339
+ .node-label{
340
+ fill:var(--text);
341
+ pointer-events:none;
342
+ text-anchor:middle;
343
+ font-weight:600;
344
+ paint-order:stroke fill;
345
+ stroke:var(--outline);
346
+ stroke-width:3px;
347
+ }
348
+ .link-label{
349
+ fill:var(--muted);
350
+ pointer-events:none;
351
+ text-anchor:middle;
352
+ font-size:10px;
353
+ paint-order:stroke fill;
354
+ stroke:var(--bg);
355
+ stroke-width:2px;
356
+ }
357
+
358
+ .node.base image{ width:60px; height:60px; transform:translate(-30px,-30px); }
359
+ .node.derived circle{ fill:#1f77b4; }
360
+ .node.cand circle, .node.cand path{ fill:#e63946; }
361
+
362
+ #legend{
363
+ position:fixed; top:18px; left:18px;
364
+ background:rgba(255,255,255,.92);
365
+ padding:18px 28px; border-radius:10px; border:1.5px solid #bbb;
366
+ font-size:18px; box-shadow:0 2px 8px rgba(0,0,0,.08);
367
+ }
368
+ @media (prefers-color-scheme: dark){
369
+ #legend{ background:rgba(20,22,25,.92); color:#e8e8e8; border-color:#444; }
370
+ }
371
  """
372
 
373
  JS = """
 
374
  function updateVisibility() {
375
  const show = document.getElementById('toggleRed').checked;
376
  svg.selectAll('.link.cand').style('display', show ? null : 'none');
377
  svg.selectAll('.node.cand').style('display', show ? null : 'none');
378
+ svg.selectAll('.link-label').filter(d => d.cand).style('display', show ? null : 'none');
 
 
379
  }
 
380
  document.getElementById('toggleRed').addEventListener('change', updateVisibility);
381
 
382
+ const HF_LOGO_URI = "__HF_LOGO_DATA_URI__";
383
  const graph = __GRAPH_DATA__;
384
  const W = innerWidth, H = innerHeight;
385
  const svg = d3.select('#dependency').call(d3.zoom().on('zoom', e => g.attr('transform', e.transform)));
 
402
  .attr('class', d => `node ${d.cls}`)
403
  .call(d3.drag().on('start', dragStart).on('drag', dragged).on('end', dragEnd));
404
 
405
+ const baseSel = node.filter(d => d.cls === 'base');
406
+ if (HF_LOGO_URI){
407
+ baseSel.append('image').attr('href', HF_LOGO_URI);
408
+ }else{
409
+ baseSel.append('circle').attr('r', d => 22*d.sz).attr('fill', '#ffbe0b');
410
+ }
411
+ node.filter(d => d.cls !== 'base').append('circle').attr('r', d => 20*d.sz);
412
+
413
  node.append('text').attr('class','node-label').attr('dy','-2.4em').text(d => d.id);
414
 
415
  const sim = d3.forceSimulation(graph.nodes)
416
+ .force('link', d3.forceLink(graph.links).id(d => d.id).distance(520))
417
+ .force('charge', d3.forceManyBody().strength(-600))
418
  .force('center', d3.forceCenter(W / 2, H / 2))
419
+ .force('collide', d3.forceCollide(d => 50));
 
420
 
421
  sim.on('tick', () => {
422
  link.attr('x1', d=>d.source.x).attr('y1', d=>d.source.y)