Spaces:
Sleeping
Sleeping
File size: 1,999 Bytes
4464055 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import json
import os
from pathlib import Path
import sys
import time
# Absolute path of the directory that contains this script.
pwd = os.path.abspath(os.path.dirname(__file__))
# Add the directory two levels above this script to the module search path.
# NOTE(review): presumably that is the project root providing
# `project_settings`, imported right after this line — confirm.
sys.path.append(os.path.join(pwd, "../../"))
from project_settings import environment, project_path
def get_args():
    """Parse the command-line options of this script.

    Two optional string flags are recognised:

    * ``--raw_dataset``: directory holding the raw per-sample folders.
    * ``--dataset``: path of the JSONL file to produce.

    Both default to locations underneath ``project_path``.

    Returns:
        argparse.Namespace: namespace with ``raw_dataset`` and ``dataset``
        attributes.
    """
    default_raw_dir = project_path / "data/raw_dataset/agent-lingoace-zh-400-choice"
    default_jsonl = project_path / "data/dataset/agent-lingoace-zh-400-choice.jsonl"

    cli = argparse.ArgumentParser()
    # Register the flags in the same order as before: raw input, then output.
    for flag, fallback in (
        ("--raw_dataset", default_raw_dir),
        ("--dataset", default_jsonl),
    ):
        cli.add_argument(flag, default=fallback.as_posix(), type=str)

    return cli.parse_args()
def main():
    """Convert a directory of raw samples into a single JSONL dataset file.

    Each sub-directory of ``--raw_dataset`` is treated as one sample and must
    contain ``system_prompt.txt``, ``user_prompt.txt`` and ``response.txt``
    (all read as UTF-8). For every sample one JSON object with the keys
    ``idx`` (the directory name), ``prompt`` (system prompt and user prompt
    joined by a blank line, then stripped) and ``response`` is written as a
    line of ``--dataset``. Each sample is also echoed to stdout, followed by
    a separator line.

    Raises:
        FileNotFoundError: if a sample directory lacks one of the three
            expected text files.
    """
    args = get_args()

    raw_dataset = Path(args.raw_dataset)
    dataset = Path(args.dataset)
    dataset.parent.mkdir(parents=True, exist_ok=True)

    # glob("*") also yields stray files (e.g. ".DS_Store"); only directories
    # are samples. Sorting makes the row order reproducible, since the
    # directory listing order depends on the filesystem.
    sample_dirs = sorted(
        entry for entry in raw_dataset.glob("*") if entry.is_dir()
    )

    with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
        for sample_dir in sample_dirs:
            idx = sample_dir.name

            system_prompt = (sample_dir / "system_prompt.txt").read_text(encoding="utf-8")
            user_prompt = (sample_dir / "user_prompt.txt").read_text(encoding="utf-8")
            response = (sample_dir / "response.txt").read_text(encoding="utf-8")

            prompt = f"{system_prompt}\n\n{user_prompt}".strip()

            # Echo the sample so the conversion can be inspected by eye.
            print(f"{prompt}\n\n{response}")
            print("-" * 150)

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
            }
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")

    return
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|