llm_eval_system / examples /make_dataset /make_lingoace_400_choice.py
HoneyTian's picture
first commit
4464055
raw
history blame
2 kB
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import json
import os
from pathlib import Path
import sys
import time
# Absolute path of the directory that contains this script.
pwd = os.path.abspath(os.path.dirname(__file__))
# Append the directory two levels above this script to sys.path before the
# import below; presumably that directory is the project root which holds
# `project_settings` -- TODO confirm against the repository layout.
sys.path.append(os.path.join(pwd, "../../"))
from project_settings import environment, project_path
def get_args():
    """
    Parse the command-line options of this script.

    Options:
        --raw_dataset: path that ``main`` scans for raw samples; defaults to
            ``data/raw_dataset/agent-lingoace-zh-400-choice`` under ``project_path``.
        --dataset: path of the JSONL file that ``main`` writes; defaults to
            ``data/dataset/agent-lingoace-zh-400-choice.jsonl`` under ``project_path``.

    :return: the ``argparse.Namespace`` returned by ``parse_args()``.
    """
    default_raw_dataset = project_path / "data/raw_dataset/agent-lingoace-zh-400-choice"
    default_dataset = project_path / "data/dataset/agent-lingoace-zh-400-choice.jsonl"

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--raw_dataset", type=str, default=default_raw_dataset.as_posix())
    arg_parser.add_argument("--dataset", type=str, default=default_dataset.as_posix())
    return arg_parser.parse_args()
def _read_text(path):
    """Return the whole content of the file at ``path`` decoded as UTF-8."""
    with open(path.as_posix(), "r", encoding="utf-8") as f:
        return f.read()


def main():
    """
    Convert the raw sample directories into a single JSONL dataset file.

    Every sub-directory of ``--raw_dataset`` is treated as one sample and must
    contain ``system_prompt.txt``, ``user_prompt.txt`` and ``response.txt``.
    For each sample one JSON object is written per line to ``--dataset`` with
    the keys:

        idx:      name of the sample directory
        prompt:   system prompt and user prompt joined by a blank line, stripped
        response: content of ``response.txt`` (written unchanged)

    The prompt/response pair is also echoed to stdout, followed by a line of
    150 dashes.

    :raises OSError: if one of the three files of a sample cannot be read
        (e.g. ``FileNotFoundError`` when it is missing).
    """
    args = get_args()

    raw_dataset = Path(args.raw_dataset)
    dataset = Path(args.dataset)
    dataset.parent.mkdir(parents=True, exist_ok=True)

    # glob() yields entries in a filesystem-dependent order; sorting makes the
    # generated dataset reproducible. Non-directory entries (stray files such
    # as .DS_Store) are not samples and would otherwise crash the file reads.
    sample_dirs = sorted(p for p in raw_dataset.glob("*") if p.is_dir())

    with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
        for sample_dir in sample_dirs:
            idx = sample_dir.name

            system_prompt = _read_text(sample_dir / "system_prompt.txt")
            user_prompt = _read_text(sample_dir / "user_prompt.txt")
            response = _read_text(sample_dir / "response.txt")

            prompt = f"""{system_prompt}\n\n{user_prompt}""".strip()

            print(f"{prompt}\n\n{response}")
            print("-" * 150)

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
            }
            # ensure_ascii=False keeps non-ASCII text readable in the output file.
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")

    return
# Run the conversion only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()