make things persist
Persists computed model embeddings to an on-disk cache (embeddings_cache.npz) so subsequent runs can resume encoding where they left off, recompute similarities without re-encoding, and skip the repository download entirely.

Files changed:
- app.py (+10 -1)
- modular_graph_and_candidates.py (+55 -19)
app.py
CHANGED
@@ -52,7 +52,16 @@ def _escape_srcdoc(text: str) -> str:
 
 
 def run(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
-    repo_path = clone_or_cache(repo_url)
+    # Check if we can use cached embeddings for embedding similarity
+    embeddings_cache = Path("embeddings_cache.npz")
+
+    if sim_method == "embedding" and embeddings_cache.exists():
+        print("📦 Using cached embeddings - skipping repo download")
+        # Use a dummy path since we won't need the actual repo
+        repo_path = Path("/tmp/dummy")
+    else:
+        print("📥 Downloading/updating repository")
+        repo_path = clone_or_cache(repo_url)
 
     graph = build_graph_json(
         transformers_dir=repo_path,
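To see what this buys in practice, here is a minimal sketch of the intended two-run behavior. The import and the repo URL are assumptions for illustration; run() is the function patched above.

from pathlib import Path
from app import run  # assumed import of this Space's app module

repo = "https://github.com/huggingface/transformers"  # illustrative URL

# First run: no cache on disk, so the repository is cloned and embeddings
# are encoded (and written to embeddings_cache.npz batch by batch).
run(repo, 0.5, False, "embedding")
print(Path("embeddings_cache.npz").exists())  # expected: True

# Second run: the cache is found, so the clone/download is skipped.
run(repo, 0.5, False, "embedding")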
modular_graph_and_candidates.py
CHANGED
@@ -130,22 +130,22 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
     print(f"Encoding embeddings for {len(names)} models...")
     batch_size = 4  # keep your default
 
-    # ──
-
+    # ── persistent embeddings storage ──────────────────────────────────────────
+    embeddings_path = Path("embeddings_cache.npz")
     start_idx = 0
     emb_dim = getattr(model, "get_sentence_embedding_dimension", lambda: 768)()
 
-    if
+    if embeddings_path.exists():
         try:
-
-
-            if names[:len(
-                loaded =
+            cached = np.load(embeddings_path, allow_pickle=True)
+            cached_names = list(cached["names"])
+            if names[:len(cached_names)] == cached_names:
+                loaded = cached["embeddings"].astype(np.float32)
                 all_embeddings.append(loaded)
-                start_idx = len(
-                print(f"
+                start_idx = len(cached_names)
+                print(f"📦 Using cached embeddings for {start_idx}/{len(names)} models")
         except Exception as e:
-            print(f"⚠️ Failed to load
+            print(f"⚠️ Failed to load cached embeddings: {type(e).__name__}: {e}")
     # ───────────────────────────────────────────────────────────────────────────
 
     for i in tqdm(range(start_idx, len(names), batch_size), desc="Batches", leave=False):
@@ -161,16 +161,16 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
 
         all_embeddings.append(emb)
 
-        # save
+        # save to persistent cache after each batch
         try:
             cur = np.vstack(all_embeddings).astype(np.float32)
             np.savez(
-
+                embeddings_path,
                 embeddings=cur,
                 names=np.array(names[:i+len(batch_names)], dtype=object),
             )
         except Exception as e:
-            print(f"⚠️ Failed to write
+            print(f"⚠️ Failed to write embeddings cache: {type(e).__name__}: {e}")
 
         if (i - start_idx) % (3 * batch_size) == 0 and torch.cuda.is_available():
             torch.cuda.empty_cache()
@@ -193,14 +193,42 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
             if s >= thr:
                 out[(processed_names[i], processed_names[j])] = s
 
-
-    try:
-        ckpt_path.unlink()
-    except Exception:
-        pass
-
+    print(f"💾 Embeddings saved to {embeddings_path}")
     return out
 
+def compute_similarities_from_cache(threshold: float) -> Dict[Tuple[str, str], float]:
+    """Compute similarities from cached embeddings without reprocessing."""
+    embeddings_path = Path("embeddings_cache.npz")
+
+    if not embeddings_path.exists():
+        return {}
+
+    try:
+        cached = np.load(embeddings_path, allow_pickle=True)
+        embeddings = cached["embeddings"].astype(np.float32)
+        names = list(cached["names"])
+
+        # Normalize embeddings
+        norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
+        embeddings = embeddings / norms
+
+        # Compute similarities
+        sims_mat = embeddings @ embeddings.T
+
+        out = {}
+        for i in range(len(names)):
+            for j in range(i + 1, len(names)):
+                s = float(sims_mat[i, j])
+                if s >= threshold:
+                    out[(names[i], names[j])] = s
+
+        print(f"⚡ Computed {len(out)} similarities from cache (threshold: {threshold})")
+        return out
+
+    except Exception as e:
+        print(f"⚠️ Failed to compute from cache: {e}")
+        return {}
+
 
 
 
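Worth spelling out how the resume logic in the first two hunks fits together: np.savez rewrites the whole archive after every batch, and names records exactly which models the saved rows cover, so a restart can trust any prefix match. A self-contained sketch of that round trip (model names and the 768 dimension are illustrative):

import numpy as np
from pathlib import Path

cache = Path("embeddings_cache.npz")  # same filename the patch uses

# Simulate a run that finished two models before being interrupted.
np.savez(
    cache,
    embeddings=np.random.rand(2, 768).astype(np.float32),
    names=np.array(["bert", "gpt2"], dtype=object),
)

# On restart, reuse the cached rows only if they are a prefix of this
# run's model order, mirroring the loader in the first hunk above.
names = ["bert", "gpt2", "llama"]
cached = np.load(cache, allow_pickle=True)  # object arrays need allow_pickle
cached_names = list(cached["names"])
if names[:len(cached_names)] == cached_names:
    start_idx = len(cached_names)  # encoding resumes at model index 2
    print(f"resuming at index {start_idx}")

The prefix check also means any change in model order or membership invalidates the cache and falls back to a full re-encode, which is the safe default.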
@@ -269,6 +297,14 @@ def compute_similarities(models_root: Path, missing: List[str], bags: Dict[str,
     if sim_method == "jaccard":
         return similarity_clusters({m: bags[m] for m in missing}, threshold)
     else:
+        # Try to use cached embeddings first
+        embeddings_path = Path("embeddings_cache.npz")
+        if embeddings_path.exists():
+            cached_sims = compute_similarities_from_cache(threshold)
+            if cached_sims:  # Cache exists and worked
+                return cached_sims
+
+        # Fallback to full computation
         return embedding_similarity_clusters(models_root, missing, threshold)
 
 def build_graph_json(
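Since compute_similarities_from_cache() re-reads the archive and redoes only the O(n²) dot products, the similarity threshold can be swept cheaply once a cache exists. A sketch, assuming a populated embeddings_cache.npz in the working directory:

from modular_graph_and_candidates import compute_similarities_from_cache

# Re-scoring from cached vectors skips the expensive encoding step,
# so trying several thresholds costs only a matrix multiply each time.
for thr in (0.3, 0.5, 0.7):
    sims = compute_similarities_from_cache(thr)
    print(f"threshold {thr}: {len(sims)} pairs")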