#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Evaluate an Azure OpenAI chat model on a tag-extraction JSONL dataset.

Every dataset row provides ``idx``, ``system_prompt``, ``user_prompt`` and
``response``; ``response`` is a JSON string holding the reference
``tag_name_list``.  For each row the model is queried, the predicted
``tag_name_list`` is compared with the reference one, and a result row with
recall / precision / F1 plus running totals is appended to the output file.

The run is resumable: rows whose ``idx`` is already present in the output
file are skipped and the running totals are restored from it.
"""
import argparse
import re
from datetime import datetime
import json
import os
from pathlib import Path
import sys
import time
from zoneinfo import ZoneInfo  # bundled with Python 3.9+, no install needed

pwd = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(pwd, "../"))

import openai
from openai import AzureOpenAI

from project_settings import environment, project_path


# Smoothing term that keeps the metric divisions defined for empty tag lists.
_EPSILON = 1e-7

# Markdown fence that models frequently wrap around their JSON answer.
_FENCE_PREFIX = "```json"
_FENCE_SUFFIX = "```"


def get_args():
    """Parse and return the command-line arguments of the evaluation run."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        default="gpt-4o-mini",
        type=str,
        help="Model (deployment) name passed to the chat completion call, e.g. gpt-4o.",
    )
    parser.add_argument(
        "--eval_dataset_name",
        default="agent-bingoplus-ph-25-summary.jsonl",
        type=str,
        help="File name of the JSONL dataset inside --eval_dataset_dir.",
    )
    parser.add_argument(
        "--eval_dataset_dir",
        default=(project_path / "data/dataset").as_posix(),
        type=str,
        help="Directory that contains the evaluation dataset.",
    )
    parser.add_argument(
        "--eval_data_dir",
        default=(project_path / "data/eval_data").as_posix(),
        type=str,
        help="Root directory under which the result file is written.",
    )
    parser.add_argument(
        "--client",
        default="shenzhen_sase",
        type=str,
        help="Client label; only used as a component of the output path.",
    )
    parser.add_argument(
        "--service",
        default="west_us_chatgpt_openai_azure_com",
        type=str,
        help="Key of the project environment entry holding the AzureOpenAI kwargs as JSON.",
    )
    parser.add_argument(
        "--create_time_str",
        default="null",
        type=str,
        help='Run identifier used in the output path, e.g. 20250806_114802; '
             '"null" means: use the current Asia/Shanghai time.',
    )
    parser.add_argument(
        "--interval",
        default=5,
        type=int,
        help="Seconds to sleep before every request.",
    )
    args = parser.parse_args()
    return args


def _load_progress(output_file: Path):
    """Return ``(finished_idx_set, total, total_score)`` restored from *output_file*.

    The totals are taken from the last row of the file because every row
    stores the running values.  A missing file yields an empty set and zeros.
    """
    finished_idx_set = set()
    total = 0
    total_score = 0
    if not output_file.exists():
        return finished_idx_set, total, total_score

    with output_file.open("r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate stray blank lines instead of failing in json.loads.
                continue
            row = json.loads(line)
            finished_idx_set.add(row["idx"])
            total = row["total"]
            total_score = row["total_score"]
    return finished_idx_set, total, total_score


def _extract_tag_name_list(prediction):
    """Return the ``tag_name_list`` found in the raw model output.

    An empty list is returned when the output is missing, is not valid JSON,
    is not a JSON object, or has no list under ``tag_name_list``; such a
    sample is then scored as a miss instead of aborting the whole run.
    """
    if not prediction:
        # The message content may be None or empty.
        return []

    text = prediction.strip()
    # Strip after trimming so that trailing whitespace does not hide the fence.
    if text.startswith(_FENCE_PREFIX) and text.endswith(_FENCE_SUFFIX):
        text = text[len(_FENCE_PREFIX):-len(_FENCE_SUFFIX)]

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return []

    if not isinstance(parsed, dict):
        return []
    tag_name_list = parsed.get("tag_name_list")
    if not isinstance(tag_name_list, list):
        return []
    return tag_name_list


def _score_tags(expected, predicted):
    """Return ``(recall, precision, f1)`` of *predicted* against *expected*."""
    recall_count = sum(1 for tag in expected if tag in predicted)
    recall = recall_count / (len(expected) + _EPSILON)

    precision_count = sum(1 for tag in predicted if tag in expected)
    precision = precision_count / (len(predicted) + _EPSILON)

    f1 = 2 * (recall * precision) / (recall + precision + _EPSILON)
    return recall, precision, f1


def main():
    """Run the evaluation and append one scored row per dataset sample."""
    args = get_args()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    output_file = eval_data_dir / f"azure_openai/azure/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Credentials and endpoint come exclusively from the project environment;
    # never hard-code them in this file, not even inside a comment.
    service_params = environment.get(args.service, dtype=json.loads)
    client = AzureOpenAI(**service_params)

    finished_idx_set, total, total_score = _load_progress(output_file)
    print(f"finished count: {len(finished_idx_set)}")

    with eval_dataset.open("r", encoding="utf-8") as fin, \
            output_file.open("a", encoding="utf-8") as fout:
        for line in fin:
            if not line.strip():
                continue
            row = json.loads(line)
            idx = row["idx"]
            system_prompt: str = row["system_prompt"]
            user_prompt: str = row["user_prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ]

            # Throttle before every request.
            time.sleep(args.interval)
            print(f"sleep: {args.interval}")

            try:
                time_begin = time.time()
                llm_response = client.chat.completions.create(
                    model=args.model_name,
                    messages=messages,
                    stream=False,
                    top_p=0.95,
                    temperature=0.6,
                )
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")
            except (openai.BadRequestError, openai.InternalServerError) as e:
                # Nothing is written for a failed sample, so it is retried
                # the next time the same run is resumed.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            prediction = llm_response.choices[0].message.content

            response_tag_name_list = json.loads(response)["tag_name_list"]
            prediction_tag_name_list = _extract_tag_name_list(prediction)

            recall, precision, f1 = _score_tags(
                response_tag_name_list, prediction_tag_name_list
            )

            total += 1
            total_score += f1
            score = total_score / total

            row_ = {
                "idx": idx,
                "system_prompt": system_prompt,
                "user_prompt": user_prompt,
                "response": response,
                "prediction": prediction,
                "recall": recall,
                "precision": precision,
                "f1": f1,
                "total": total,
                "total_score": total_score,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            # Flush per row so an interrupted run loses at most one sample.
            fout.flush()

    return


if __name__ == "__main__":
    main()