Spaces:
Running on Zero

Ruurd committed on
Commit
6c7f510
·
verified ·
1 Parent(s): d86917b

Fix confidence guided noising

Browse files
Files changed (1) hide show
  1. app.py +26 -30
app.py CHANGED
@@ -115,46 +115,42 @@ def confidence_guided_noising(input_ids, answer_start, confidences, noise_clippi
115
  answer_len = len(input_ids) - answer_start
116
  num_to_noise = int(threshold * answer_len * noise_start)
117
  if num_to_noise == 0:
118
- return noised
119
 
120
  all_indices = np.arange(answer_start, len(input_ids))
121
  eos_indices = [i for i in all_indices if input_ids[i] == eos_token_id]
122
  non_eos_indices = [i for i in all_indices if input_ids[i] != eos_token_id]
123
 
124
- num_non_eos_to_noise = int(num_to_noise * (len(non_eos_indices) / (len(non_eos_indices) + len(eos_indices) + 1e-5)))
 
125
  num_eos_to_noise = num_to_noise - num_non_eos_to_noise
126
 
127
- # === Non-EOS sampling ===
128
- raw_weights_non_eos = 1.0 - np.array([confidences[i - answer_start] for i in non_eos_indices])
129
- raw_weights_non_eos = np.clip(raw_weights_non_eos, a_min=noise_clipping, a_max=None)
130
- weights_non_eos = raw_weights_non_eos / raw_weights_non_eos.sum() if raw_weights_non_eos.sum() > 0 else None
131
-
132
- chosen_non_eos = rng.choice(
133
- non_eos_indices,
134
- size=min(num_non_eos_to_noise, len(non_eos_indices)),
135
- replace=False,
136
- p=weights_non_eos
137
- ) if weights_non_eos is not None else []
138
-
139
- # === EOS sampling ===
140
- if eos_indices:
141
- raw_weights_eos = 1.0 - np.array([confidences[i - answer_start] for i in eos_indices])
142
- raw_weights_eos = np.clip(raw_weights_eos, a_min=noise_clipping, a_max=None)
143
- weights_eos = raw_weights_eos / raw_weights_eos.sum() if raw_weights_eos.sum() > 0 else None
144
-
145
- chosen_eos = rng.choice(
146
- eos_indices,
147
- size=min(num_eos_to_noise, len(eos_indices)),
148
- replace=False,
149
- p=weights_eos
150
- ) if weights_eos is not None else []
151
- else:
152
- chosen_eos = []
153
 
154
- for idx in list(chosen_non_eos) + list(chosen_eos):
 
 
 
155
  noised[idx] = mask_token_id
156
 
157
- return noised
 
 
158
 
159
  @spaces.GPU
160
  def generate_diffusion_text(input_ids):
 
115
  answer_len = len(input_ids) - answer_start
116
  num_to_noise = int(threshold * answer_len * noise_start)
117
  if num_to_noise == 0:
118
+ return noised, []
119
 
120
  all_indices = np.arange(answer_start, len(input_ids))
121
  eos_indices = [i for i in all_indices if input_ids[i] == eos_token_id]
122
  non_eos_indices = [i for i in all_indices if input_ids[i] != eos_token_id]
123
 
124
+ # Proportionally split how many to noise
125
+ num_non_eos_to_noise = int(num_to_noise * len(non_eos_indices) / (len(non_eos_indices) + len(eos_indices) + 1e-5))
126
  num_eos_to_noise = num_to_noise - num_non_eos_to_noise
127
 
128
+ noised_indices = []
129
+
130
+ # --- Non-EOS ---
131
+ if non_eos_indices:
132
+ raw_weights = 1.0 - np.array([confidences[i - answer_start] for i in non_eos_indices])
133
+ raw_weights = np.clip(raw_weights, a_min=noise_clipping, a_max=None)
134
+ weights = raw_weights / raw_weights.sum()
135
+
136
+ chosen = rng.choice(non_eos_indices, size=min(num_non_eos_to_noise, len(non_eos_indices)), replace=False, p=weights)
137
+ noised_indices.extend(chosen.tolist())
138
+
139
+ # --- EOS ---
140
+ if eos_indices and num_eos_to_noise > 0:
141
+ raw_weights = 1.0 - np.array([confidences[i - answer_start] for i in eos_indices])
142
+ raw_weights = np.clip(raw_weights, a_min=noise_clipping, a_max=None)
143
+ weights = raw_weights / raw_weights.sum()
 
 
 
 
 
 
 
 
 
 
144
 
145
+ chosen = rng.choice(eos_indices, size=min(num_eos_to_noise, len(eos_indices)), replace=False, p=weights)
146
+ noised_indices.extend(chosen.tolist())
147
+
148
+ for idx in noised_indices:
149
  noised[idx] = mask_token_id
150
 
151
+ noised_indices = sorted(noised_indices)
152
+ return noised, noised_indices
153
+
154
 
155
  @spaces.GPU
156
  def generate_diffusion_text(input_ids):