Spaces:

Ruurd
/

lad

Running on Zero

App Files

Ruurd committed on May 15

Commit

6c7f510

verified ·

1 Parent(s): d86917b

Fix confidence guided noising

Browse files

Files changed (1) hide show

app.py +26 -30

app.py CHANGED Viewed

@@ -115,46 +115,42 @@ def confidence_guided_noising(input_ids, answer_start, confidences, noise_clippi
     answer_len = len(input_ids) - answer_start
     num_to_noise = int(threshold * answer_len * noise_start)
     if num_to_noise == 0:
-        return noised
     all_indices = np.arange(answer_start, len(input_ids))
     eos_indices = [i for i in all_indices if input_ids[i] == eos_token_id]
     non_eos_indices = [i for i in all_indices if input_ids[i] != eos_token_id]
-    num_non_eos_to_noise = int(num_to_noise * (len(non_eos_indices) / (len(non_eos_indices) + len(eos_indices) + 1e-5)))
     num_eos_to_noise = num_to_noise - num_non_eos_to_noise
-    # === Non-EOS sampling ===
-    raw_weights_non_eos = 1.0 - np.array([confidences[i - answer_start] for i in non_eos_indices])
-    raw_weights_non_eos = np.clip(raw_weights_non_eos, a_min=noise_clipping, a_max=None)
-    weights_non_eos = raw_weights_non_eos / raw_weights_non_eos.sum() if raw_weights_non_eos.sum() > 0 else None
-    chosen_non_eos = rng.choice(
-        non_eos_indices,
-        size=min(num_non_eos_to_noise, len(non_eos_indices)),
-        replace=False,
-        p=weights_non_eos
-    ) if weights_non_eos is not None else []
-    # === EOS sampling ===
-    if eos_indices:
-        raw_weights_eos = 1.0 - np.array([confidences[i - answer_start] for i in eos_indices])
-        raw_weights_eos = np.clip(raw_weights_eos, a_min=noise_clipping, a_max=None)
-        weights_eos = raw_weights_eos / raw_weights_eos.sum() if raw_weights_eos.sum() > 0 else None
-        chosen_eos = rng.choice(
-            eos_indices,
-            size=min(num_eos_to_noise, len(eos_indices)),
-            replace=False,
-            p=weights_eos
-        ) if weights_eos is not None else []
-    else:
-        chosen_eos = []
-    for idx in list(chosen_non_eos) + list(chosen_eos):
         noised[idx] = mask_token_id
-    return noised
 @spaces.GPU
 def generate_diffusion_text(input_ids):

     answer_len = len(input_ids) - answer_start
     num_to_noise = int(threshold * answer_len * noise_start)
     if num_to_noise == 0:
+        return noised, []
     all_indices = np.arange(answer_start, len(input_ids))
     eos_indices = [i for i in all_indices if input_ids[i] == eos_token_id]
     non_eos_indices = [i for i in all_indices if input_ids[i] != eos_token_id]
+    # Proportionally split how many to noise
+    num_non_eos_to_noise = int(num_to_noise * len(non_eos_indices) / (len(non_eos_indices) + len(eos_indices) + 1e-5))
     num_eos_to_noise = num_to_noise - num_non_eos_to_noise
+    noised_indices = []
+    # --- Non-EOS ---
+    if non_eos_indices:
+        raw_weights = 1.0 - np.array([confidences[i - answer_start] for i in non_eos_indices])
+        raw_weights = np.clip(raw_weights, a_min=noise_clipping, a_max=None)
+        weights = raw_weights / raw_weights.sum()
+        chosen = rng.choice(non_eos_indices, size=min(num_non_eos_to_noise, len(non_eos_indices)), replace=False, p=weights)
+        noised_indices.extend(chosen.tolist())
+    # --- EOS ---
+    if eos_indices and num_eos_to_noise > 0:
+        raw_weights = 1.0 - np.array([confidences[i - answer_start] for i in eos_indices])
+        raw_weights = np.clip(raw_weights, a_min=noise_clipping, a_max=None)
+        weights = raw_weights / raw_weights.sum()
+        chosen = rng.choice(eos_indices, size=min(num_eos_to_noise, len(eos_indices)), replace=False, p=weights)
+        noised_indices.extend(chosen.tolist())
+    for idx in noised_indices:
         noised[idx] = mask_token_id
+    noised_indices = sorted(noised_indices)
+    return noised, noised_indices
 @spaces.GPU
 def generate_diffusion_text(input_ids):