#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Pack a directory of raw prompt/response samples into one JSONL dataset.

Every sub-directory of ``--raw_dataset`` is treated as one sample and is
expected to contain ``system_prompt.txt``, ``user_prompt.txt`` and
``response.txt``.  One JSON object per sample (keys ``idx``, ``prompt``,
``response``) is written to ``--dataset``.
"""
import argparse
import json
import os
from pathlib import Path
import sys
import time

pwd = os.path.abspath(os.path.dirname(__file__))
# Make the repository root importable so that ``project_settings`` resolves.
sys.path.append(os.path.join(pwd, "../../"))

from project_settings import environment, project_path


def get_args():
    """Parse the command-line arguments.

    Returns:
        argparse.Namespace with two string attributes:
        ``raw_dataset`` (directory holding one sub-directory per sample) and
        ``dataset`` (path of the JSONL file to write).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--raw_dataset",
        default=(project_path / "data/raw_dataset/agent-lingoace-zh-400-choice").as_posix(),
        type=str
    )
    parser.add_argument(
        "--dataset",
        default=(project_path / "data/dataset/agent-lingoace-zh-400-choice.jsonl").as_posix(),
        type=str
    )
    args = parser.parse_args()
    return args


def _read_text(path):
    """Return the full UTF-8 decoded content of the file at *path*."""
    with open(path.as_posix(), "r", encoding="utf-8") as f:
        return f.read()


def main():
    """Convert the raw sample directories into a JSONL dataset file.

    For every sample directory the system prompt and user prompt are joined
    with a blank line (and stripped) to form ``prompt``; the response text is
    stored unchanged.  Each sample is also echoed to stdout for inspection.

    Raises:
        FileNotFoundError: if a sample directory lacks one of the three
            expected text files.
    """
    args = get_args()

    raw_dataset = Path(args.raw_dataset)
    dataset = Path(args.dataset)
    dataset.parent.mkdir(parents=True, exist_ok=True)

    with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
        # glob() gives no ordering guarantee; sort so that the generated
        # dataset is identical from run to run.
        for sample_dir in sorted(raw_dataset.glob("*")):
            # Stray files next to the sample directories (e.g. ".DS_Store")
            # are not samples; opening "<file>/system_prompt.txt" would fail.
            if not sample_dir.is_dir():
                continue

            idx = sample_dir.name

            system_prompt = _read_text(sample_dir / "system_prompt.txt")
            user_prompt = _read_text(sample_dir / "user_prompt.txt")
            response = _read_text(sample_dir / "response.txt")

            prompt = f"""{system_prompt}\n\n{user_prompt}""".strip()

            print(f"{prompt}\n\n{response}")
            print("-" * 150)

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
            }
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")


if __name__ == "__main__":
    main()