Lukas Helff committed on
Commit
f567a45
·
1 Parent(s): 62ba87c

fix validation program

Browse files
VerifiableRewardsForScalableLogicalReasoning.py CHANGED
@@ -99,19 +99,21 @@ Returns:
99
  detailed_results (`list` of `dict`): Per-example results including correctness, partial score, execution time, and any errors encountered.
100
  """
101
 
102
-
103
def validate_rule_no_hardcoded_cars(prediction):
    """Reject rules that hardcode a specific car identifier in ``has_car/2``.

    The second argument of every ``has_car(...)`` literal must be a variable.
    A rule is rejected when that argument is a constant: an unquoted atom
    (starts with a lowercase letter) or a single-quoted atom.

    Args:
        prediction: Text of the rule(s) to check.

    Returns:
        A ``(is_valid, message)`` tuple. ``message`` is ``"Rule is valid"`` on
        success, otherwise it names the first offending constant.
    """
    import re

    # Second argument is a constant when it is an atom or a quoted atom.
    # Atoms may contain uppercase letters after the first character, and
    # whitespace is allowed before the closing parenthesis; both previously
    # slipped past the check.
    hardcoded_pattern = (
        r"has_car\([^,]+,\s*"
        r"([a-z][A-Za-z0-9_]*|'[^']*')"
        r"\s*\)"
    )
    match = re.search(hardcoded_pattern, prediction)

    if match:
        return False, f"Cars must be variables: {match.group(1)}"

    return True, "Rule is valid"
 
 
115
 
116
 
117
  def _evaluate_with_prolog(prediction, validation_program, eval_config, timeout=5):
@@ -128,15 +130,6 @@ def _evaluate_with_prolog(prediction, validation_program, eval_config, timeout=5
128
  # extract predicate from rule_to_evaluate
129
  rule_to_evaluate = extract_ilp_from_text_v2(prediction, positive_pred, allow_multiple_rules)
130
 
131
- is_valid, validation_msg = validate_rule_no_hardcoded_cars(rule_to_evaluate)
132
- if not is_valid:
133
- return {
134
- "is_correct": False,
135
- "partial_score": 0.0,
136
- "syntax_valid": False,
137
- "error": f"Rule validation failed: {validation_msg}"
138
- }
139
-
140
  if positive_pred not in rule_to_evaluate:
141
  p = prediction.replace('\n', ' ')
142
  return {
@@ -175,8 +168,7 @@ check_count(Count) :-
175
  check_all :- forall((pos({vars});neg({vars})), check({vars})).
176
  """
177
  # Add the rule to evaluate
178
- validation_program = re.sub(rf'\b{positive_pred}\b', 'pos', validation_program)
179
- validation_program = re.sub(rf'\b{negative_pred}\b', 'neg', validation_program)
180
 
181
  pos_negs = validation_program.count("pos(") + validation_program.count("neg(")
182
  validation_program = '\n'.join(sorted(validation_program.splitlines()))
 
99
  detailed_results (`list` of `dict`): Per-example results including correctness, partial score, execution time, and any errors encountered.
100
  """
101
 
102
def fix_validation_program(validation_program: str, positive_pred: str = "eastbound", negative_pred: str = "westbound") -> str:
    """Anonymize head predicates and train/car constants in a validation program.

    - Every whole-word occurrence of ``positive_pred`` becomes ``pos`` and
      every whole-word occurrence of ``negative_pred`` becomes ``neg``.
    - Train and car constants used as predicate arguments are prefixed with
      ``my`` (``train0`` -> ``mytrain0``, ``car0_1`` -> ``mycar0_1``).

    Args:
        validation_program: Prolog source text of the validation program.
        positive_pred: Name of the positive head predicate.
        negative_pred: Name of the negative head predicate.

    Returns:
        The rewritten program text.
    """
    # Escape the names so regex metacharacters in a predicate name are
    # matched literally rather than interpreted as a pattern.
    validation_program = re.sub(rf'\b{re.escape(positive_pred)}\b', 'pos', validation_program)
    validation_program = re.sub(rf'\b{re.escape(negative_pred)}\b', 'neg', validation_program)
    # Trains follow train\d+ and cars follow car\d+_\d+. Only rename an
    # identifier that sits in argument position (right after "(" or ",",
    # optionally followed by blanks) and is followed by a digit. Unrelated
    # atoms such as "cargo1" stay untouched, and ",car0_1" without a space
    # is renamed as well.
    validation_program = re.sub(
        r'(?<=[(,])([ \t]*)(train|car)(?=\d)',
        r'\1my\2',
        validation_program,
    )
    return validation_program
117
 
118
 
119
  def _evaluate_with_prolog(prediction, validation_program, eval_config, timeout=5):
 
130
  # extract predicate from rule_to_evaluate
131
  rule_to_evaluate = extract_ilp_from_text_v2(prediction, positive_pred, allow_multiple_rules)
132
 
 
 
 
 
 
 
 
 
 
133
  if positive_pred not in rule_to_evaluate:
134
  p = prediction.replace('\n', ' ')
135
  return {
 
168
  check_all :- forall((pos({vars});neg({vars})), check({vars})).
169
  """
170
  # Add the rule to evaluate
171
+ validation_program = fix_validation_program(validation_program, positive_pred, negative_pred)
 
172
 
173
  pos_negs = validation_program.count("pos(") + validation_program.count("neg(")
174
  validation_program = '\n'.join(sorted(validation_program.splitlines()))