Commit 7490843 (verified) · Author: Molbap (HF Staff) · Parent: dc04102

Add longer seqlens

Files changed (1): modular_graph_and_candidates.py (+23, -8)
modular_graph_and_candidates.py:

```diff
@@ -96,10 +96,21 @@ def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tup
 
 @spaces.GPU
 def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
-    model = SentenceTransformer("codesage/codesage-large-v2", trust_remote_code=True)
-    model.max_seq_length = 4096  # truncate overly long modeling files
-    texts = {}
+    model = SentenceTransformer("codesage/codesage-large-v2", device="cpu", trust_remote_code=True)
+
+    # Hard-cap by backend max positions (prevents IndexError in self.wpe)
+    try:
+        cfg = model[0].auto_model.config
+        pos_limit = int(getattr(cfg, "n_positions", getattr(cfg, "max_position_embeddings")))
+    except Exception:
+        pos_limit = 1024  # conservative fallback if config is odd
 
+    seq_len = min(pos_limit, 2048)  # optional extra ceiling if pos_limit is huge
+    model.max_seq_length = seq_len  # SentenceTransformer wrapper
+    model[0].max_seq_length = seq_len  # its Transformer submodule actually used for tokenize()
+    model[0].tokenizer.model_max_length = seq_len  # ensure tokenizer truncates
+
+    texts = {}
     for name in tqdm(missing, desc="Reading modeling files"):
         code = ""
         for py in (models_root / name).rglob("modeling_*.py"):
```
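The previous version hard-coded `model.max_seq_length = 4096`, but the backbone only has a fixed table of position embeddings, so longer inputs index out of range in `self.wpe` (the IndexError the new comment mentions). The first hunk therefore derives the cap from the model config. Below is a minimal sketch of the same clamping pattern as a standalone helper; the name `clamp_seq_length` and the 2048 ceiling default are illustrative, not part of the commit:

```python
from sentence_transformers import SentenceTransformer


def clamp_seq_length(model: SentenceTransformer, ceiling: int = 2048) -> int:
    """Hypothetical helper mirroring the commit's clamping logic."""
    cfg = model[0].auto_model.config  # config of the underlying HF backbone
    # Probe the two common attribute names one at a time. Note that a nested
    # getattr(cfg, "n_positions", getattr(cfg, "max_position_embeddings"))
    # evaluates its default eagerly, so it raises whenever the second name is
    # missing even if the first one exists; the loop below avoids that.
    limit = 1024  # conservative fallback, as in the commit
    for attr in ("n_positions", "max_position_embeddings"):
        if hasattr(cfg, attr):
            limit = int(getattr(cfg, attr))
            break

    seq_len = min(limit, ceiling)  # extra ceiling in case the limit is huge
    model.max_seq_length = seq_len                 # wrapper-level attribute
    model[0].max_seq_length = seq_len              # Transformer module used by tokenize()
    model[0].tokenizer.model_max_length = seq_len  # tokenizer-side truncation
    return seq_len
```

Setting all three attributes covers every place truncation can happen: the wrapper, the Transformer submodule that actually tokenizes, and the tokenizer itself. The second hunk handles encoding and similarity: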
 
```diff
@@ -113,23 +124,27 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
     all_embeddings = []
 
     print("Encoding embeddings...")
-    batch_size = 8  # or 2 if memory is tight
+    batch_size = 8
     for i in tqdm(range(0, len(names), batch_size), desc="Batches", leave=False):
         batch = [texts[n] for n in names[i:i+batch_size]]
         emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
         all_embeddings.append(emb)
 
-    embeddings = np.vstack(all_embeddings)  # [N, D]
+    # Cosine similarity requires normalized vectors; SentenceTransformers doesn't always return them normalized
+    import numpy as np
+    embeddings = np.vstack(all_embeddings).astype(np.float32)
+    norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
+    embeddings = embeddings / norms
 
     print("Computing pairwise similarities...")
-    sims = embeddings @ embeddings.T  # cosine since already normalized
+    sims_mat = embeddings @ embeddings.T
 
     out = {}
     for i in range(len(names)):
         for j in range(i + 1, len(names)):
-            s = sims[i, j]
+            s = float(sims_mat[i, j])
             if s >= thr:
-                out[(names[i], names[j])] = float(s)
+                out[(names[i], names[j])] = s
     return out
 
 
```
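After this hunk the dot product is a true cosine similarity, since the embeddings are explicitly L2-normalized first (the old code assumed they already were). An equivalent shortcut is sketched below, assuming the installed sentence-transformers release supports `encode(..., normalize_embeddings=True)` (recent versions do); the two code snippets and the 0.8 threshold are placeholders:

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    "codesage/codesage-large-v2", device="cpu", trust_remote_code=True
)

# Placeholder inputs standing in for the modeling_*.py file contents.
snippets = [
    "class LlamaAttention(nn.Module): ...",
    "class MistralAttention(nn.Module): ...",
]

# normalize_embeddings=True L2-normalizes each vector at encode time,
# so emb @ emb.T is already the cosine-similarity matrix.
emb = model.encode(snippets, convert_to_numpy=True, normalize_embeddings=True)
sims = emb @ emb.T  # shape [N, N]

thr = 0.8  # placeholder threshold
pairs = {
    (i, j): float(sims[i, j])
    for i in range(len(snippets))
    for j in range(i + 1, len(snippets))
    if sims[i, j] >= thr
}
print(pairs)
```

The commit's manual `np.linalg.norm` normalization is equally valid and has the advantage of not depending on the encode signature of whichever library version is installed.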