Molbap (HF Staff) committed
Commit b90f4fc · 2 Parent(s): c862054 6804082

Merge branch 'main' of https://huggingface.co/spaces/Molbap/transformers-modular-refactor

Files changed (1)
1. modular_graph_and_candidates.py +6 -46
modular_graph_and_candidates.py CHANGED
@@ -94,59 +94,21 @@ def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tup
                 out[(m1, m2)] = s
     return out
 
-#@spaces.GPU
-def old_embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
-    model = SentenceTransformer("codesage/codesage-large-v2", device="cpu", trust_remote_code=True)
-    model.max_seq_length = 8192 # truncate overly long modeling files
-    texts = {}
-
-    for name in tqdm(missing, desc="Reading modeling files"):
-        code = ""
-        for py in (models_root / name).rglob("modeling_*.py"):
-            try:
-                code += _strip_source(py.read_text(encoding="utf-8")) + "\n"
-            except Exception:
-                continue
-        texts[name] = code.strip() or " "
-
-    names = list(texts)
-    all_embeddings = []
-
-    print("Encoding embeddings...")
-    batch_size = 2
-    for i in tqdm(range(0, len(names), batch_size), desc="Batches", leave=False):
-        batch = [texts[n] for n in names[i:i+batch_size]]
-        emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
-        all_embeddings.append(emb)
-
-    embeddings = np.vstack(all_embeddings) # [N, D]
-
-    print("Computing pairwise similarities...")
-    sims = embeddings @ embeddings.T
-
-    out = {}
-    for i in range(len(names)):
-        for j in range(i + 1, len(names)):
-            s = sims[i, j]
-            if s >= thr:
-                out[(names[i], names[j])] = float(s)
-    return out
 
-#@spaces.GPU
+@spaces.GPU
 def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
     model = SentenceTransformer("codesage/codesage-large-v2", device="cpu", trust_remote_code=True)
 
-    # Hard-cap by backend max positions (prevents IndexError in self.wpe)
     try:
         cfg = model[0].auto_model.config
         pos_limit = int(getattr(cfg, "n_positions", getattr(cfg, "max_position_embeddings")))
     except Exception:
-        pos_limit = 1024 # conservative fallback if config is odd
+        pos_limit = 1024
 
-    seq_len = min(pos_limit, 2048) # optional extra ceiling if pos_limit is huge
-    model.max_seq_length = seq_len # SentenceTransformer wrapper
-    model[0].max_seq_length = seq_len # its Transformer submodule actually used for tokenize()
-    model[0].tokenizer.model_max_length = seq_len # ensure tokenizer truncates
+    seq_len = min(pos_limit, 2048)
+    model.max_seq_length = seq_len
+    model[0].max_seq_length = seq_len
+    model[0].tokenizer.model_max_length = seq_len
 
     texts = {}
     for name in tqdm(missing, desc="Reading modeling files"):
@@ -168,8 +130,6 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
         emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
         all_embeddings.append(emb)
 
-    # Cosine similarity requires normalized vectors; SentenceTransformers doesn't always return them normalized
-    import numpy as np
     embeddings = np.vstack(all_embeddings).astype(np.float32)
     norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
     embeddings = embeddings / norms
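
Note: the sequence-length handling that survives the merge can be read as a small standalone helper. This is a sketch only: the function name cap_sequence_length and its ceiling/fallback parameters are illustrative and not part of the repository; it assumes a loaded SentenceTransformer whose first module exposes auto_model, tokenizer, and max_seq_length, as the codesage/codesage-large-v2 checkpoint used above does.

def cap_sequence_length(model, ceiling=2048, fallback=1024):
    """Clamp the effective sequence length to the backend's position-embedding limit.

    Mirrors the capping logic kept in embedding_similarity_clusters; helper name and
    parameters are illustrative.
    """
    try:
        cfg = model[0].auto_model.config
        # GPT-2-style configs expose n_positions; most others use max_position_embeddings.
        pos_limit = int(getattr(cfg, "n_positions", getattr(cfg, "max_position_embeddings")))
    except Exception:
        pos_limit = fallback  # conservative fallback if the config lacks either attribute

    seq_len = min(pos_limit, ceiling)
    model.max_seq_length = seq_len                   # SentenceTransformer wrapper
    model[0].max_seq_length = seq_len                # Transformer submodule used by tokenize()
    model[0].tokenizer.model_max_length = seq_len    # make the tokenizer truncate as well
    return seq_len

Calling cap_sequence_length(model) right after constructing the model would reproduce the seq_len = min(pos_limit, 2048) behaviour shown in the diff.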
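
The normalization kept at the end of the diff exists because the pairwise product embeddings @ embeddings.T only yields cosine similarities when the rows are unit length. A minimal numpy-only sketch of that thresholding step, with random vectors standing in for the real modeling-file embeddings and an illustrative cosine_pairs_above_threshold helper:

import numpy as np


def cosine_pairs_above_threshold(names, embeddings, thr):
    # L2-normalize so the dot product of any two rows equals their cosine similarity.
    embeddings = embeddings.astype(np.float32)
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
    embeddings = embeddings / norms

    # Full pairwise similarity matrix; keep each unordered pair once.
    sims = embeddings @ embeddings.T
    out = {}
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            s = float(sims[i, j])
            if s >= thr:
                out[(names[i], names[j])] = s
    return out


# Toy usage: three fake "models" with random 8-dimensional embeddings.
rng = np.random.default_rng(0)
names = ["model_a", "model_b", "model_c"]
vectors = rng.normal(size=(len(names), 8))
print(cosine_pairs_above_threshold(names, vectors, thr=0.5))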