llm_eval_system / llm_eval_script / azure_openai_summary.py
#!/usr/bin/python3
# -*- coding: utf-8 -*-
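"""
Evaluate summary-tagging quality of an Azure OpenAI deployment.

Reads a JSONL eval dataset (system_prompt / user_prompt / reference response),
sends each prompt pair to the deployed chat model, compares the predicted
`tag_name_list` against the reference one, and appends per-sample recall,
precision, F1 and running totals to a JSONL file under `data/eval_data`.
"""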
import argparse
import re
from datetime import datetime
import json
import os
from pathlib import Path
import sys
import time
from zoneinfo import ZoneInfo  # bundled with Python 3.9+, no extra install needed
pwd = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(pwd, "../"))
import openai
from openai import AzureOpenAI
from project_settings import environment, project_path
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
# default="gpt-4o",
default="gpt-4o-mini",
type=str
)
parser.add_argument(
"--eval_dataset_name",
default="agent-bingoplus-ph-25-summary.jsonl",
type=str
)
parser.add_argument(
"--eval_dataset_dir",
default=(project_path / "data/dataset").as_posix(),
type=str
)
parser.add_argument(
"--eval_data_dir",
default=(project_path / "data/eval_data").as_posix(),
type=str
)
parser.add_argument(
"--client",
default="shenzhen_sase",
type=str
)
parser.add_argument(
"--service",
default="west_us_chatgpt_openai_azure_com",
type=str
)
parser.add_argument(
"--create_time_str",
default="null",
# default="20250806_114802",
type=str
)
parser.add_argument(
"--interval",
default=5,
type=int
)
args = parser.parse_args()
return args
def main():
args = get_args()
eval_dataset_dir = Path(args.eval_dataset_dir)
eval_dataset_dir.mkdir(parents=True, exist_ok=True)
eval_data_dir = Path(args.eval_data_dir)
eval_data_dir.mkdir(parents=True, exist_ok=True)
if args.create_time_str == "null":
tz = ZoneInfo("Asia/Shanghai")
now = datetime.now(tz)
create_time_str = now.strftime("%Y%m%d_%H%M%S")
# create_time_str = "20250729-interval-5"
else:
create_time_str = args.create_time_str
eval_dataset = eval_dataset_dir / args.eval_dataset_name
output_file = eval_data_dir / f"azure_openai/azure/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
output_file.parent.mkdir(parents=True, exist_ok=True)
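    # Azure credentials (api_key, api_version, azure_endpoint) come from the
    # environment config as a JSON blob keyed by --service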
service_params = environment.get(args.service, dtype=json.loads)
client = AzureOpenAI(
**service_params,
        # api_key="<redacted>",
# api_version="2025-01-01-preview",
# azure_endpoint="https://west-us-chatgpt.openai.azure.com"
)
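    # running accumulators: number of samples scored and summed F1
    # (the mean is written as "score" in every output row)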
total = 0
total_score = 0
    # resume: skip samples already scored in a previous run and restore the running totals
finished_idx_set = set()
if os.path.exists(output_file.as_posix()):
with open(output_file.as_posix(), "r", encoding="utf-8") as f:
for row in f:
row = json.loads(row)
idx = row["idx"]
total = row["total"]
total_score = row["total_score"]
finished_idx_set.add(idx)
print(f"finished count: {len(finished_idx_set)}")
with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
for row in fin:
row = json.loads(row)
idx = row["idx"]
system_prompt: str = row["system_prompt"]
user_prompt: str = row["user_prompt"]
response = row["response"]
if idx in finished_idx_set:
continue
finished_idx_set.add(idx)
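        # rebuild the chat request from the prompts stored in the eval dataset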
messages = [
{
"role": "system",
"content": system_prompt
},
{
"role": "user",
"content": user_prompt
},
]
try:
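            # pause between requests as simple client-side throttling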
time.sleep(args.interval)
print(f"sleep: {args.interval}")
time_begin = time.time()
llm_response = client.chat.completions.create(
model=args.model_name,
messages=messages,
stream=False,
# max_tokens=1,
top_p=0.95,
temperature=0.6,
# logit_bias={
# 32: 100,
# 33: 100,
# 34: 100,
# 35: 100,
# 36: 100,
# 37: 100,
# }
)
time_cost = time.time() - time_begin
print(f"time_cost: {time_cost}")
except openai.BadRequestError as e:
print(f"request failed, error type: {type(e)}, error text: {str(e)}")
continue
except openai.InternalServerError as e:
print(f"request failed, error type: {type(e)}, error text: {str(e)}")
continue
prediction = llm_response.choices[0].message.content
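        # reference tags: the dataset's "response" field is a JSON string with a tag_name_list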
response_ = json.loads(response)
response_tag_name_list = response_["tag_name_list"]
# print(response_tag_name_list)
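        # strip an optional markdown ```json fence, then parse the prediction;
        # unparseable output scores zero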
if prediction.startswith("```json") and prediction.endswith("```"):
prediction_ = prediction[7:-3]
else:
prediction_ = prediction
prediction_tag_name_list = list()
try:
prediction_ = json.loads(prediction_)
prediction_tag_name_list = prediction_["tag_name_list"]
except json.JSONDecodeError:
pass
# print(prediction_tag_name_list)
        # recall: fraction of reference tags recovered by the prediction
recall_count = 0
for tag in response_tag_name_list:
if tag in prediction_tag_name_list:
recall_count += 1
recall = recall_count / (len(response_tag_name_list) + 1e-7)
        # precision: fraction of predicted tags that are in the reference
precision_count = 0
for tag in prediction_tag_name_list:
if tag in response_tag_name_list:
precision_count += 1
precision = precision_count / (len(prediction_tag_name_list) + 1e-7)
        # f1: harmonic mean of recall and precision (small epsilon avoids division by zero)
f1 = 2 * (recall * precision) / (recall + precision + 1e-7)
total += 1
total_score += f1
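        # running mean F1 over all samples scored so far (including resumed runs)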
score = total_score / total
row_ = {
"idx": idx,
"system_prompt": system_prompt,
"user_prompt": user_prompt,
"response": response,
"prediction": prediction,
"recall": recall,
"precision": precision,
"f1": f1,
"total": total,
"total_score": total_score,
"score": score,
"time_cost": time_cost,
}
row_ = json.dumps(row_, ensure_ascii=False)
fout.write(f"{row_}\n")
fout.flush()
return
if __name__ == "__main__":
main()