Spaces:
Running
on
Zero
Running
on
Zero
update
Browse files
- hf-logo.svg +8 -0
- modular_graph_and_candidates.py +126 -31
hf-logo.svg
ADDED
|
modular_graph_and_candidates.py
CHANGED
@@ -94,10 +94,10 @@ def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tup
|
|
94 |
out[(m1, m2)] = s
|
95 |
return out
|
96 |
|
97 |
-
|
98 |
-
def
|
99 |
-
model = SentenceTransformer("codesage/codesage-large-v2", trust_remote_code=True)
|
100 |
-
model.max_seq_length =
|
101 |
texts = {}
|
102 |
|
103 |
for name in tqdm(missing, desc="Reading modeling files"):
|
@@ -113,7 +113,7 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
|
|
113 |
all_embeddings = []
|
114 |
|
115 |
print("Encoding embeddings...")
|
116 |
-
batch_size =
|
117 |
for i in tqdm(range(0, len(names), batch_size), desc="Batches", leave=False):
|
118 |
batch = [texts[n] for n in names[i:i+batch_size]]
|
119 |
emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
|
@@ -122,7 +122,7 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
|
|
122 |
embeddings = np.vstack(all_embeddings) # [N, D]
|
123 |
|
124 |
print("Computing pairwise similarities...")
|
125 |
-
sims = embeddings @ embeddings.T
|
126 |
|
127 |
out = {}
|
128 |
for i in range(len(names)):
|
@@ -132,6 +132,59 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
|
|
132 |
out[(names[i], names[j])] = float(s)
|
133 |
return out
|
134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
|
136 |
|
137 |
|
@@ -261,34 +314,72 @@ def generate_html(graph: dict) -> str:
|
|
261 |
# ────────────────────────────────────────────────────────────────────────────────
|
262 |
CSS = """
|
263 |
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600&display=swap');
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
276 |
"""
|
277 |
|
278 |
JS = """
|
279 |
-
|
280 |
function updateVisibility() {
|
281 |
const show = document.getElementById('toggleRed').checked;
|
282 |
svg.selectAll('.link.cand').style('display', show ? null : 'none');
|
283 |
svg.selectAll('.node.cand').style('display', show ? null : 'none');
|
284 |
-
svg.selectAll('.link-label')
|
285 |
-
.filter(d => d.cand)
|
286 |
-
.style('display', show ? null : 'none');
|
287 |
}
|
288 |
-
|
289 |
document.getElementById('toggleRed').addEventListener('change', updateVisibility);
|
290 |
|
291 |
-
|
292 |
const graph = __GRAPH_DATA__;
|
293 |
const W = innerWidth, H = innerHeight;
|
294 |
const svg = d3.select('#dependency').call(d3.zoom().on('zoom', e => g.attr('transform', e.transform)));
|
@@ -311,17 +402,21 @@ const node = g.selectAll('g.node')
|
|
311 |
.attr('class', d => `node ${d.cls}`)
|
312 |
.call(d3.drag().on('start', dragStart).on('drag', dragged).on('end', dragEnd));
|
313 |
|
314 |
-
node.filter(d => d.cls==='base')
|
315 |
-
|
316 |
-
|
|
|
|
|
|
|
|
|
|
|
317 |
node.append('text').attr('class','node-label').attr('dy','-2.4em').text(d => d.id);
|
318 |
|
319 |
const sim = d3.forceSimulation(graph.nodes)
|
320 |
-
.force('link', d3.forceLink(graph.links).id(d => d.id).distance(520))
|
321 |
-
.force('charge', d3.forceManyBody().strength(-600))
|
322 |
.force('center', d3.forceCenter(W / 2, H / 2))
|
323 |
-
.force('collide', d3.forceCollide(d =>
|
324 |
-
|
325 |
|
326 |
sim.on('tick', () => {
|
327 |
link.attr('x1', d=>d.source.x).attr('y1', d=>d.source.y)
|
|
|
94 |
out[(m1, m2)] = s
|
95 |
return out
|
96 |
|
97 |
+
#@spaces.GPU
|
98 |
+
def old_embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
|
99 |
+
model = SentenceTransformer("codesage/codesage-large-v2", device="cpu", trust_remote_code=True)
|
100 |
+
model.max_seq_length = 8192 # truncate overly long modeling files
|
101 |
texts = {}
|
102 |
|
103 |
for name in tqdm(missing, desc="Reading modeling files"):
|
|
|
113 |
all_embeddings = []
|
114 |
|
115 |
print("Encoding embeddings...")
|
116 |
+
batch_size = 2
|
117 |
for i in tqdm(range(0, len(names), batch_size), desc="Batches", leave=False):
|
118 |
batch = [texts[n] for n in names[i:i+batch_size]]
|
119 |
emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
|
|
|
122 |
embeddings = np.vstack(all_embeddings) # [N, D]
|
123 |
|
124 |
print("Computing pairwise similarities...")
|
125 |
+
sims = embeddings @ embeddings.T
|
126 |
|
127 |
out = {}
|
128 |
for i in range(len(names)):
|
|
|
132 |
out[(names[i], names[j])] = float(s)
|
133 |
return out
|
134 |
|
135 |
+
#@spaces.GPU
|
136 |
+
def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
    """Find pairs of models whose modeling code is similar in embedding space.

    For every name in ``missing`` the ``modeling_*.py`` files found under
    ``models_root / name`` are stripped with ``_strip_source``, concatenated
    and embedded with the ``codesage/codesage-large-v2`` SentenceTransformer
    on CPU. Embeddings are L2-normalised so that the dot product is the
    cosine similarity.

    Args:
        models_root: Directory containing one sub-directory per model name.
        missing: Model names to compare with each other.
        thr: Minimum cosine similarity for a pair to be reported.

    Returns:
        Mapping ``(name_a, name_b) -> similarity`` for every unordered pair
        whose similarity is ``>= thr``; ``name_a`` precedes ``name_b`` in the
        order of first appearance in ``missing``. Empty when fewer than two
        distinct names are given.
    """
    # A pair needs two distinct names. Returning early avoids loading the
    # model for nothing and avoids ``np.vstack([])`` raising ValueError.
    if len(set(missing)) < 2:
        return {}

    import numpy as np

    model = SentenceTransformer("codesage/codesage-large-v2", device="cpu", trust_remote_code=True)

    # Hard-cap by backend max positions (prevents IndexError in self.wpe).
    # Each attribute is probed separately: a nested ``getattr`` default is
    # evaluated eagerly and would raise when only ``n_positions`` exists.
    try:
        cfg = model[0].auto_model.config
        pos_limit = getattr(cfg, "n_positions", None)
        if pos_limit is None:
            pos_limit = getattr(cfg, "max_position_embeddings", None)
        pos_limit = int(pos_limit)  # TypeError when neither attribute is set
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        pos_limit = 1024  # conservative fallback if config is odd

    seq_len = min(pos_limit, 2048)  # optional extra ceiling if pos_limit is huge
    model.max_seq_length = seq_len  # SentenceTransformer wrapper
    model[0].max_seq_length = seq_len  # its Transformer submodule actually used for tokenize()
    model[0].tokenizer.model_max_length = seq_len  # ensure tokenizer truncates

    texts = {}
    for name in tqdm(missing, desc="Reading modeling files"):
        chunks = []
        for py in (models_root / name).rglob("modeling_*.py"):
            try:
                chunks.append(_strip_source(py.read_text(encoding="utf-8")) + "\n")
            except Exception:
                # Deliberate best-effort: an unreadable or unparsable file is skipped.
                continue
        # The encoder is never handed an empty string; use a single space instead.
        texts[name] = "".join(chunks).strip() or " "

    names = list(texts)
    all_embeddings = []

    print("Encoding embeddings...")
    batch_size = 2
    for start in tqdm(range(0, len(names), batch_size), desc="Batches", leave=False):
        batch = [texts[n] for n in names[start:start + batch_size]]
        emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
        all_embeddings.append(emb)

    # Cosine similarity requires normalized vectors; SentenceTransformers
    # doesn't always return them normalized. The epsilon guards against a
    # zero-norm row.
    embeddings = np.vstack(all_embeddings).astype(np.float32)
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
    embeddings = embeddings / norms

    print("Computing pairwise similarities...")
    sims_mat = embeddings @ embeddings.T

    out = {}
    for i, first in enumerate(names):
        for j in range(i + 1, len(names)):
            s = float(sims_mat[i, j])
            if s >= thr:
                out[(first, names[j])] = s
    return out
|
187 |
+
|
188 |
|
189 |
|
190 |
|
|
|
314 |
# ────────────────────────────────────────────────────────────────────────────────
|
315 |
CSS = """
|
316 |
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600&display=swap');
|
317 |
+
|
318 |
+
:root{
|
319 |
+
--bg:#ffffff;
|
320 |
+
--text:#222222;
|
321 |
+
--muted:#555555;
|
322 |
+
--outline:#ffffff;
|
323 |
+
}
|
324 |
+
@media (prefers-color-scheme: dark){
|
325 |
+
:root{
|
326 |
+
--bg:#0b0d10;
|
327 |
+
--text:#e8e8e8;
|
328 |
+
--muted:#c8c8c8;
|
329 |
+
--outline:#000000;
|
330 |
+
}
|
331 |
+
}
|
332 |
+
|
333 |
+
body{ margin:0; font-family:'Inter',Arial,sans-serif; background:var(--bg); overflow:hidden; }
|
334 |
+
svg{ width:100vw; height:100vh; }
|
335 |
+
|
336 |
+
.link{ stroke:#999; stroke-opacity:.6; }
|
337 |
+
.link.cand{ stroke:#e63946; stroke-width:2.5; }
|
338 |
+
|
339 |
+
.node-label{
|
340 |
+
fill:var(--text);
|
341 |
+
pointer-events:none;
|
342 |
+
text-anchor:middle;
|
343 |
+
font-weight:600;
|
344 |
+
paint-order:stroke fill;
|
345 |
+
stroke:var(--outline);
|
346 |
+
stroke-width:3px;
|
347 |
+
}
|
348 |
+
.link-label{
|
349 |
+
fill:var(--muted);
|
350 |
+
pointer-events:none;
|
351 |
+
text-anchor:middle;
|
352 |
+
font-size:10px;
|
353 |
+
paint-order:stroke fill;
|
354 |
+
stroke:var(--bg);
|
355 |
+
stroke-width:2px;
|
356 |
+
}
|
357 |
+
|
358 |
+
.node.base image{ width:60px; height:60px; transform:translate(-30px,-30px); }
|
359 |
+
.node.derived circle{ fill:#1f77b4; }
|
360 |
+
.node.cand circle, .node.cand path{ fill:#e63946; }
|
361 |
+
|
362 |
+
#legend{
|
363 |
+
position:fixed; top:18px; left:18px;
|
364 |
+
background:rgba(255,255,255,.92);
|
365 |
+
padding:18px 28px; border-radius:10px; border:1.5px solid #bbb;
|
366 |
+
font-size:18px; box-shadow:0 2px 8px rgba(0,0,0,.08);
|
367 |
+
}
|
368 |
+
@media (prefers-color-scheme: dark){
|
369 |
+
#legend{ background:rgba(20,22,25,.92); color:#e8e8e8; border-color:#444; }
|
370 |
+
}
|
371 |
"""
|
372 |
|
373 |
JS = """
|
|
|
374 |
function updateVisibility() {
|
375 |
const show = document.getElementById('toggleRed').checked;
|
376 |
svg.selectAll('.link.cand').style('display', show ? null : 'none');
|
377 |
svg.selectAll('.node.cand').style('display', show ? null : 'none');
|
378 |
+
svg.selectAll('.link-label').filter(d => d.cand).style('display', show ? null : 'none');
|
|
|
|
|
379 |
}
|
|
|
380 |
document.getElementById('toggleRed').addEventListener('change', updateVisibility);
|
381 |
|
382 |
+
const HF_LOGO_URI = "__HF_LOGO_DATA_URI__";
|
383 |
const graph = __GRAPH_DATA__;
|
384 |
const W = innerWidth, H = innerHeight;
|
385 |
const svg = d3.select('#dependency').call(d3.zoom().on('zoom', e => g.attr('transform', e.transform)));
|
|
|
402 |
.attr('class', d => `node ${d.cls}`)
|
403 |
.call(d3.drag().on('start', dragStart).on('drag', dragged).on('end', dragEnd));
|
404 |
|
405 |
+
const baseSel = node.filter(d => d.cls === 'base');
|
406 |
+
if (HF_LOGO_URI){
|
407 |
+
baseSel.append('image').attr('href', HF_LOGO_URI);
|
408 |
+
}else{
|
409 |
+
baseSel.append('circle').attr('r', d => 22*d.sz).attr('fill', '#ffbe0b');
|
410 |
+
}
|
411 |
+
node.filter(d => d.cls !== 'base').append('circle').attr('r', d => 20*d.sz);
|
412 |
+
|
413 |
node.append('text').attr('class','node-label').attr('dy','-2.4em').text(d => d.id);
|
414 |
|
415 |
const sim = d3.forceSimulation(graph.nodes)
|
416 |
+
.force('link', d3.forceLink(graph.links).id(d => d.id).distance(520))
|
417 |
+
.force('charge', d3.forceManyBody().strength(-600))
|
418 |
.force('center', d3.forceCenter(W / 2, H / 2))
|
419 |
+
.force('collide', d3.forceCollide(d => 50));
|
|
|
420 |
|
421 |
sim.on('tick', () => {
|
422 |
link.attr('x1', d=>d.source.x).attr('y1', d=>d.source.y)
|