Spaces:
Sleeping
Sleeping
#!/usr/bin/python3 | |
# -*- coding: utf-8 -*- | |
import argparse | |
import json | |
import os | |
from pathlib import Path | |
import sys | |
import time | |
pwd = os.path.abspath(os.path.dirname(__file__)) | |
sys.path.append(os.path.join(pwd, "../../")) | |
from project_settings import environment, project_path | |
def get_args():
    """Parse the command-line options of this script.

    Options:
        --raw_dataset: directory holding one sub-directory per raw sample.
        --dataset: path of the JSONL file to write.

    Both defaults are resolved relative to ``project_path``.

    Returns:
        The ``argparse.Namespace`` with ``raw_dataset`` and ``dataset`` as strings.
    """
    default_raw_dataset = project_path / "data/raw_dataset/agent-lingoace-zh-400-choice"
    default_dataset = project_path / "data/dataset/agent-lingoace-zh-400-choice.jsonl"

    parser = argparse.ArgumentParser()
    parser.add_argument("--raw_dataset", type=str, default=default_raw_dataset.as_posix())
    parser.add_argument("--dataset", type=str, default=default_dataset.as_posix())
    return parser.parse_args()
def main():
    """Convert a directory of raw samples into a single JSONL dataset file.

    Every sub-directory of ``--raw_dataset`` is treated as one sample and is
    expected to contain ``system_prompt.txt``, ``user_prompt.txt`` and
    ``response.txt``. For each sample the two prompts are joined with a blank
    line and stripped, the sample is echoed to stdout, and one JSON object with
    the keys ``idx`` (the sub-directory name), ``prompt`` and ``response`` is
    appended as a line to ``--dataset``.

    Raises:
        FileNotFoundError: if a sample directory lacks one of the three files.
    """
    args = get_args()

    raw_dataset = Path(args.raw_dataset)
    dataset = Path(args.dataset)
    dataset.parent.mkdir(parents=True, exist_ok=True)

    with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
        # glob() yields entries in arbitrary, filesystem-dependent order;
        # sorting makes the row order of the output file reproducible.
        for sample_dir in sorted(raw_dataset.glob("*")):
            # Stray plain files (e.g. OS metadata files) are not samples;
            # trying to open "<file>/system_prompt.txt" would abort the run.
            if not sample_dir.is_dir():
                continue

            idx = sample_dir.name

            system_prompt = (sample_dir / "system_prompt.txt").read_text(encoding="utf-8")
            user_prompt = (sample_dir / "user_prompt.txt").read_text(encoding="utf-8")
            response = (sample_dir / "response.txt").read_text(encoding="utf-8")

            prompt = f"""{system_prompt}\n\n{user_prompt}""".strip()

            # Echo each sample so the conversion can be eyeballed while running.
            print(f"{prompt}\n\n{response}")
            print("-" * 150)

            row = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
            }
            # ensure_ascii=False keeps non-ASCII text readable in the output file.
            line = json.dumps(row, ensure_ascii=False)
            fout.write(f"{line}\n")
if __name__ == "__main__":
    # Run the conversion only when executed as a script, not when imported.
    main()