import re
import traceback
from typing import Optional

import numpy as np

# Verdict tokens searched for in the normalized (lower-cased, space-free)
# judge section, and the one-letter code each token contributes to the hash.
_JUDGE_PATTERN = re.compile(r":yes|:inprogress|:no")
_JUDGE_CODES = {':yes': 'y', ':inprogress': 'i', ':no': 'n'}


def _normalize(text):
    """Lower-case *text* and drop every space character."""
    return text.lower().replace(' ', '')


def extract_judge_hash(response, *, start_tag='', end_tag='') -> Optional[str]:
    """Encode the per-checklist yes / in-progress / no verdicts as a string.

    The text of ``response['response']`` is lower-cased and stripped of
    spaces, the section between ``start_tag`` and ``end_tag`` is cut out, and
    every ``:yes`` / ``:inprogress`` / ``:no`` found in it is mapped to
    ``'y'`` / ``'i'`` / ``'n'`` in order of appearance.

    NOTE(review): the original code split on empty-string separators, which
    always raises ``ValueError`` and therefore always returned ``None``. The
    real delimiters look like they were lost from the literals; they cannot
    be recovered from this file, so they are exposed as parameters. The
    empty defaults keep the previous behaviour -- pass the real delimiters
    to get a usable result.

    Args:
        response: Mapping with a ``'response'`` entry holding the judge text.
        start_tag: Delimiter that opens the judge section. It is normalized
            the same way as the content before matching.
        end_tag: Delimiter that closes the judge section (normalized too).

    Returns:
        The concatenated verdict letters (possibly an empty string), or
        ``None`` when the judge section cannot be extracted.

    Raises:
        KeyError: If ``response`` has no ``'response'`` entry (the lookup
            happens outside the guarded section, as in the original).
    """
    content = response['response']
    try:
        normalized = _normalize(content)
        after_start = normalized.split(_normalize(start_tag))[1]
        judge_content = after_start.split(_normalize(end_tag))[0]
    except Exception:
        # Best effort: report the failure and signal "no hash" to the caller.
        # Unlike a bare ``except``, this lets KeyboardInterrupt / SystemExit
        # propagate.
        traceback.print_exc()
        return None

    return ''.join(
        _JUDGE_CODES[token] for token in _JUDGE_PATTERN.findall(judge_content)
    )


def average_logits(response):
    """Compute a reward from the averaged yes / in probabilities.

    Args:
        response: Mapping with a ``'judge_probs'`` entry, an iterable of
            mappings that each provide ``'yes'`` and ``'in'`` values.

    Returns:
        ``mean(yes) + 0.5 * mean(in)``. With an empty ``judge_probs`` the
        result is ``nan``, since ``np.mean`` of an empty list is ``nan``.
    """
    judge_probs = response['judge_probs']
    yes_ = np.mean([r['yes'] for r in judge_probs])
    in_ = np.mean([r['in'] for r in judge_probs])
    # An "in" verdict counts for half of a "yes".
    reward = yes_ + 0.5 * in_
    return reward


# Reward processor name -> callable that turns a response into a reward.
REWARD_PROCESSORS = {
    'avg_logits': average_logits,
}

# Reward processor name -> sample count.
# NOTE(review): presumably the number of judge samples drawn per response
# for that processor -- confirm against the caller.
REWARD_PROCESSOR_N_SAMPLES = {
    'avg_logits': 5,
}