from human_eval.data import stream_jsonl import glob from tqdm import tqdm import argparse import jsonlines import json def read_mbpp(path): mbpp_problems = {} with jsonlines.open(path, "r") as fin: for obj in fin: mbpp_problems[obj["task_id"]] = obj return mbpp_problems parser = argparse.ArgumentParser() # Inputs parser.add_argument( '--path', type=str, help="") parser.add_argument( '--out_path', type=str, help="") parser.add_argument( '--add_prompt', action='store_true', help='') parser.add_argument('--mbpp_path', type=str, help="") args = parser.parse_args() files = sorted(glob.glob(args.path + '/*.jsonl')) print("{} files in {}".format(len(files), args.path)) problems = read_mbpp(args.mbpp_path) output = [[] for _ in range(len(problems))] a = 0 for code_file in tqdm(files, total=len(files)): codes = [c for c in stream_jsonl(code_file)] if args.add_prompt: for code in codes: task_id = code['task_id'] completion = code['completion'] if '```python' in completion: def_line = completion.index('```python') completion = completion[def_line:].strip() completion = completion.replace('```python', '') try: next_line = completion.index('\n```') completion = completion[:next_line].strip() except: a += 1 if "__name__ == \"__main__\"" in completion: next_line = completion.index('if __name__ == "__main__":') completion = completion[:next_line].strip() if "# Example usage" in completion: next_line = completion.index('# Example usage') completion = completion[:next_line].strip() if "# Test examples" in completion: next_line = completion.index('# Test examples') completion = completion[:next_line].strip() output[task_id-11].append(completion) print("save to {}".format(args.out_path)) print(a) with open(args.out_path, "w", encoding="utf-8") as fout: json.dump(output, fout)