File size: 1,999 Bytes
4464055
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import json
import os
from pathlib import Path
import sys
import time

# Absolute path of the directory that contains this script.
pwd = os.path.abspath(os.path.dirname(__file__))
# Append the directory two levels above this script to the module search path.
# NOTE(review): presumably this is the project root that provides
# `project_settings` (imported right below) — confirm against the repo layout.
sys.path.append(os.path.join(pwd, "../../"))

from project_settings import environment, project_path


def get_args():
    """Parse the command-line options of this script.

    Options:
        --raw_dataset: directory to read samples from; defaults to
            ``data/raw_dataset/agent-lingoace-zh-400-choice`` under
            ``project_path``.
        --dataset: path of the JSONL file to write; defaults to
            ``data/dataset/agent-lingoace-zh-400-choice.jsonl`` under
            ``project_path``.

    Returns:
        The ``argparse.Namespace`` holding ``raw_dataset`` and ``dataset``
        as strings.
    """
    default_raw_dataset = project_path / "data/raw_dataset/agent-lingoace-zh-400-choice"
    default_dataset = project_path / "data/dataset/agent-lingoace-zh-400-choice.jsonl"

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--raw_dataset", type=str, default=default_raw_dataset.as_posix())
    arg_parser.add_argument("--dataset", type=str, default=default_dataset.as_posix())
    return arg_parser.parse_args()


def main():
    """Convert a directory of raw samples into a single JSONL dataset file.

    Every sub-directory of ``--raw_dataset`` is treated as one sample and must
    contain ``system_prompt.txt``, ``user_prompt.txt`` and ``response.txt``
    (UTF-8). For each sample one JSON object with the keys ``idx`` (the
    sub-directory name), ``prompt`` (system and user prompt joined by a blank
    line, stripped) and ``response`` is written as a line of ``--dataset``.
    The prompt/response pair is also echoed to stdout.

    Raises:
        NotADirectoryError: if ``--raw_dataset`` is not an existing directory.
        FileNotFoundError: if a sample directory lacks one of the three files.
    """
    args = get_args()

    raw_dataset = Path(args.raw_dataset)
    dataset = Path(args.dataset)

    # Fail before touching the output file: otherwise a mistyped input path
    # would silently replace an existing dataset with an empty one.
    if not raw_dataset.is_dir():
        raise NotADirectoryError(f"raw dataset directory not found: {raw_dataset.as_posix()}")

    # glob() yields entries in filesystem-dependent order and also returns
    # plain files (e.g. ".DS_Store"); keep only directories and sort them so
    # the generated file is reproducible across runs and machines.
    sample_dirs = sorted(path for path in raw_dataset.glob("*") if path.is_dir())

    dataset.parent.mkdir(parents=True, exist_ok=True)

    with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
        for sample_dir in sample_dirs:
            idx = sample_dir.name

            system_prompt = (sample_dir / "system_prompt.txt").read_text(encoding="utf-8")
            user_prompt = (sample_dir / "user_prompt.txt").read_text(encoding="utf-8")
            response = (sample_dir / "response.txt").read_text(encoding="utf-8")

            prompt = f"""{system_prompt}\n\n{user_prompt}""".strip()

            # Echo each sample so the conversion can be inspected by eye.
            print(f"{prompt}\n\n{response}")
            print("-" * 150)

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
            }
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            fout.write(f"{json.dumps(row_, ensure_ascii=False)}\n")


# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()