Lukas Helff
committed on
Commit
·
f567a45
1
Parent(s):
62ba87c
fix validation program
Browse files
VerifiableRewardsForScalableLogicalReasoning.py
CHANGED
@@ -99,19 +99,21 @@ Returns:
|
|
99 |
detailed_results (`list` of `dict`): Per-example results including correctness, partial score, execution time, and any errors encountered.
|
100 |
"""
|
101 |
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
|
|
|
|
115 |
|
116 |
|
117 |
def _evaluate_with_prolog(prediction, validation_program, eval_config, timeout=5):
|
@@ -128,15 +130,6 @@ def _evaluate_with_prolog(prediction, validation_program, eval_config, timeout=5
|
|
128 |
# extract predicate from rule_to_evaluate
|
129 |
rule_to_evaluate = extract_ilp_from_text_v2(prediction, positive_pred, allow_multiple_rules)
|
130 |
|
131 |
-
is_valid, validation_msg = validate_rule_no_hardcoded_cars(rule_to_evaluate)
|
132 |
-
if not is_valid:
|
133 |
-
return {
|
134 |
-
"is_correct": False,
|
135 |
-
"partial_score": 0.0,
|
136 |
-
"syntax_valid": False,
|
137 |
-
"error": f"Rule validation failed: {validation_msg}"
|
138 |
-
}
|
139 |
-
|
140 |
if positive_pred not in rule_to_evaluate:
|
141 |
p = prediction.replace('\n', ' ')
|
142 |
return {
|
@@ -175,8 +168,7 @@ check_count(Count) :-
|
|
175 |
check_all :- forall((pos({vars});neg({vars})), check({vars})).
|
176 |
"""
|
177 |
# Add the rule to evaluate
|
178 |
-
validation_program =
|
179 |
-
validation_program = re.sub(rf'\b{negative_pred}\b', 'neg', validation_program)
|
180 |
|
181 |
pos_negs = validation_program.count("pos(") + validation_program.count("neg(")
|
182 |
validation_program = '\n'.join(sorted(validation_program.splitlines()))
|
|
|
99 |
detailed_results (`list` of `dict`): Per-example results including correctness, partial score, execution time, and any errors encountered.
|
100 |
"""
|
101 |
|
102 |
+
def fix_validation_program(validation_program, positive_pred="eastbound", negative_pred="westbound"):
    """
    Anonymize a Prolog validation program.

    - Renames the positive head predicate to ``pos`` and the negative head
      predicate to ``neg`` (whole-word matches only).
    - Prefixes train and car constants with ``my`` by rewriting ``(train`` to
      ``(mytrain``, ``(car`` to ``(mycar`` and ``, car`` to ``, mycar``.

    Args:
        validation_program (`str`): Source text of the validation program.
        positive_pred (`str`): Name of the positive predicate, rewritten to ``pos``.
        negative_pred (`str`): Name of the negative predicate, rewritten to ``neg``.

    Returns:
        `str`: The rewritten validation program.
    """
    # Anonymize the head predicates. The names are escaped so that any regex
    # metacharacter in a predicate name is matched literally instead of being
    # interpreted as part of the pattern.
    validation_program = re.sub(rf'\b{re.escape(positive_pred)}\b', 'pos', validation_program)
    validation_program = re.sub(rf'\b{re.escape(negative_pred)}\b', 'neg', validation_program)
    # Replace train with mytrain and car with mycar.
    # Trains must follow the pattern train\d+ and cars the pattern car\d+_\d+;
    # the literal replacements below only touch constants that appear directly
    # after "(" (trains and cars) or after ", " (cars).
    # NOTE(review): presumably this keeps rules that hardcode the original
    # constants from matching the renamed facts -- confirm against the caller.
    validation_program = validation_program.replace('(train', '(mytrain')
    validation_program = validation_program.replace('(car', '(mycar').replace(', car', ', mycar')
    return validation_program
|
117 |
|
118 |
|
119 |
def _evaluate_with_prolog(prediction, validation_program, eval_config, timeout=5):
|
|
|
130 |
# extract predicate from rule_to_evaluate
|
131 |
rule_to_evaluate = extract_ilp_from_text_v2(prediction, positive_pred, allow_multiple_rules)
|
132 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
if positive_pred not in rule_to_evaluate:
|
134 |
p = prediction.replace('\n', ' ')
|
135 |
return {
|
|
|
168 |
check_all :- forall((pos({vars});neg({vars})), check({vars})).
|
169 |
"""
|
170 |
# Add the rule to evaluate
|
171 |
+
validation_program = fix_validation_program(validation_program, positive_pred, negative_pred)
|
|
|
172 |
|
173 |
pos_negs = validation_program.count("pos(") + validation_program.count("neg(")
|
174 |
validation_program = '\n'.join(sorted(validation_program.splitlines()))
|