Spaces:

qgyd2021
/

llm_eval_system

Sleeping

App Files Files Community

llm_eval_system / examples /test_metrics /cod_chat_metric.py

HoneyTian

update

dbd1ddd 6 days ago

raw

history blame contribute delete

14.7 kB

	#!/usr/bin/python3
	# -*- coding: utf-8 -*-
	import argparse
	import json
	import os
	import sys

	pwd = os.path.abspath(os.path.dirname(__file__))
	sys.path.append(os.path.join(pwd, "../"))

	import openai
	from openai import AzureOpenAI

	from project_settings import environment, project_path


	def get_args() -> argparse.Namespace:
	    r"""
	    Parse the command-line options of this scoring script.

	    Example::

	        python3 cod_chat_metric.py --model_name gpt-4o \
	            --eval_data_file path/to/agent-cod-zh-70-chat.jsonl.raw \
	            --output_file path/to/agent-cod-zh-70-chat.jsonl \
	            --service west_us_chatgpt_openai_azure_com

	    :return: namespace with ``model_name``, ``eval_data_file``,
	        ``output_file`` and ``service`` attributes (all strings).
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument(
	        "--model_name",
	        default="gpt-4o",
	        # default="gpt-4o-mini",
	        type=str,
	        help="model name passed to chat.completions.create for the judge request",
	    )
	    parser.add_argument(
	        "--eval_data_file",
	        default=(project_path / "data/eval_data/azure_openai/azure/gpt-4o-mini/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250806_114802/agent-cod-zh-70-chat.jsonl.raw").as_posix(),
	        # default=(project_path / "data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-cod-zh-70-chat.jsonl.raw").as_posix(),
	        # default=(project_path / "data/eval_data/gemini_google/google/gemini-2.5-flash/shenzhen_sase/google_potent_veld_462405_t3/20250808_160530/agent-cod-zh-70-chat.jsonl.raw").as_posix(),
	        type=str,
	        help="input JSONL file; each row provides idx, prompt, response, prediction and time_cost",
	    )
	    parser.add_argument(
	        "--output_file",
	        default=(project_path / "data/eval_data/azure_openai/azure/gpt-4o-mini/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250806_114802/agent-cod-zh-70-chat.jsonl").as_posix(),
	        # default=(project_path / "data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-cod-zh-70-chat.jsonl").as_posix(),
	        # default=(project_path / "data/eval_data/gemini_google/google/gemini-2.5-flash/shenzhen_sase/google_potent_veld_462405_t3/20250808_160530/agent-cod-zh-70-chat.jsonl").as_posix(),
	        type=str,
	        help="output JSONL file; scored rows are appended and existing rows are used to resume",
	    )
	    parser.add_argument(
	        "--service",
	        default="west_us_chatgpt_openai_azure_com",
	        type=str,
	        help="key looked up in project_settings.environment; its JSON value is expanded into AzureOpenAI(...)",
	    )
	    return parser.parse_args()


	# Fixed part of the judge prompt: scenario description, scoring rubric and
	# three scored examples. The rubric names must match the JSON keys used in the
	# examples ("Task Related", "Client Reply Related", "Success Related",
	# "Brevity") because main() reads exactly those keys from the judge's reply.
	# The four maxima (1.5 + 1.5 + 1.0 + 1.0) add up to 5 points.
	task_prompt = """
	<Identity>
	You are a telephone salesperson for a COD e-commerce company. You are good at communicating with users over the phone and conducting product marketing and promotion.

	<Job>
	Your task is to stimulate customer interest in products and eventually complete marketing conversion so that customers agree to buy. For customers who have no intention, you will collect reasons and conduct secondary persuasion to achieve the goal of successful conversion.

	<Background>
	(1) Customer background: The customer has browsed or added a 【专抹顽固斑点】祛斑精华油 to the shopping cart, but has not completed the final payment order;
	(2) Product information: 【专抹顽固斑点】升級版精华油，不止祛斑还能美白，改善肌肤泛红，令肌肤稳定光泽透亮，限时促销，原价799一瓶，限时499两瓶;
	(3) Discount information: 1. Order now and get a discount of 499两瓶 yuan; 2. Promotion valid for 3 days; 3. Lightning delivery; 4. 24-hour VIP service;

	<Constraint>
	- Communicate in a cordial, friendly, professional and witty manner;
	- Communicate as concisely and effectively as possible (about 20 words) to avoid clients having no patience for communication;
	- Imitate real-person communication in a telephone scenario, such as using informal verbal expressions, such as "Well...", "Then...";
	- For questions you don't know, answer with words such as "You can discuss this with us in detail later", and don't make assumptions or promises without authorization.
	- If the client does not have a positive or meaningful answer, just continue to persuade, and don't ask the client, "You say", "Please speak" and other words;
	- Clients generally do not actively communicate, you need to guide the communication;
	- Please use the telephone communication expressions that conform to the cultural habits in Taiwan.
	<Current Job>
	Your goal is to stimulate customer interest in products and eventually complete marketing conversion so that customers agree to buy.
	Step 1: You will inform customers of the purpose of the call and broadcast brief product information, and ask customers about their intentions;
	Step 2: You will answer customers' questions, including product information, discount information, etc.;
	Step 3: Finally, you need to ask users about their intentions;


	<Task>
	For the above task settings, please rate the last one Assistant Response and give reasons for your rating.
	(1) Task Related. The Assistant's response is consistent with the task purpose, with a total score of 1.5 points.
	(2) Client Reply Related. Whether the Assistant's response reflects the client's words well. Total score: 1.5 points.
	(3) Success Related. Whether the Assistant's Response push the ultimate success goal of the task. Score: 1.0 point.
	(4) Brevity. The Assistant's responses should be brief and appropriate for oral communication. Total score: 1.0 points.

	Note:
	(1) rating only focus on the `Assistant Response`.
	(2) (2) The dialogue in `Conversation` should be considered correct.

	<Example>

	## Conversation:
	client: 你好
	Assistant: 您好，您之前购买的【专抹顽固斑点】祛斑精华油全新升级了，新版特别添加【复配烟碱醯胺+熊果苷】双成分，祛斑率提升至百分之九十八点二，作为VIP客户，现在购买立享买一送一，499两瓶，要帮您同步升级到新版吗？
	client: 喂。
	client: 不不，不必要，不必要。
	Assistant:

	## Assistant Response:
	我明白了，不过这款精华油现在有限时促销，499两瓶，效果真的很好哦！您有没有考虑过改善肌肤问题呢？

	## Score:
	{
	"Task Related": 1.5,
	"Task Related - Explain": "The Assistant's response is consistent with the task purpose, which is to promote the product and inform the client about the ongoing promotion. The Assistant effectively mentions the limited-time offer and emphasizes the product's benefits.",
	"Client Reply Related": 0.9,
	"Client Reply Related - Explain": "The response does not fully reflect the client's words. The client expressed disinterest ("不不，不必要，不必要"), but the Assistant did not acknowledge the client's reluctance adequately before continuing with persuasion.",
	"Success Related": 0.9,
	"Success Related - Explain": "The Assistant's response attempts to push towards the success goal by mentioning the promotion and benefits, but it could be more effective if it engaged with the client's concerns or disinterest more directly.",
	"Brevity": 0.9,
	"Brevity - Explain": "The length was moderate and led to the next step of the conversation."
	}

	## Conversation:
	client: 你好
	Assistant: 您好，您之前购买的【专抹顽固斑点】祛斑精华油全新升级了，新版特别添加【复配烟碱醯胺+熊果苷】双成分，祛斑率提升至百分之九十八点二，作为VIP客户，现在购买立享买一送一，499两瓶，要帮您同步升级到新版吗？
	client: 喂。
	client: 不不，不必要，不必要。
	Assistant: 感谢您的反馈！为了给您提供更精准的服务，方便告诉我您对产品不感兴趣的具体原因吗？比如是效果、价格、还是其他方面的考虑呢？您的反馈对我们非常重要
	client: 不用升级。
	client: 哇。
	client: 啊。
	client: 我们。
	client: 我是我。
	Assistant:

	## Assistant Response:
	（轻声笑）看来您对祛斑精华油暂时没想法，没关系，我先简单说两句让您了解下。这升级款祛斑更准，还能美白提亮，之前老客户用了都反馈皮肤稳定了不少。现在限时买一送一，两瓶才499，算下来比单瓶买便宜一半呢！您要是担心效果，我可以给您发之前客户的真实反馈视频，或者您可以先拍一瓶试试？就当给皮肤一个变透亮的机会，您看呢？

	## Score:
	{
	"Task Related": 0.9,
	"Task Related - Explain": "The Assistant's Response is directly to the client, should not include the `（轻声笑）`.",
	"Client Reply Related": 1.0,
	"Client Reply Related - Explain": "The Assistant acknowledges the client's lack of interest and responds in a friendly, understanding manner. It gently guides the conversation without pushing too hard, reflecting the client's initial reluctance.",
	"Success Related": 1.0,
	"Success Related - Explain": "The Assistant effectively attempts to drive towards the success goal by reiterating the product's benefits, offering social proof, and suggesting a trial purchase, which could entice the client to reconsider.",
	"Brevity": 0.7,
	"Brevity - Explain": "The response is slightly longer than ideal but remains concise enough to maintain the client's attention and engagement."
	}

	## Conversation:
	client: 你好
	Assistant: 您好，您之前购买的【专抹顽固斑点】祛斑精华油全新升级了，新版特别添加【复配烟碱醯胺+熊果苷】双成分，祛斑率提升至百分之九十八点二，作为VIP客户，现在购买立享买一送一，499两瓶，要帮您同步升级到新版吗？
	client: 喂。
	client: 不不，不必要，不必要。
	Assistant:

	## Assistant Response:
	Well, I get it—maybe you're still a bit hesitant about the spots on your face? But let me tell you, the new upgraded version is really powerful! The 499 two-bottle deal is only valid for 3 days, and the lightning delivery means you can get it in two days. Plus, we have 24-hour VIP service to help with any skin care questions. Don't let this chance slip by, okay? Are you going to grab this deal today or wait?

	## Score:
	{
	"Task Related": 0.7,
	"Task Related - Explain": "The Assistant's Response is directly to the client, however, English is used instead of Chinese, which is consistent with the client.",
	"Client Reply Related": 1.3,
	"Client Reply Related - Explain": "The response acknowledges the client's hesitation by mentioning it directly, but it could have addressed the client's disinterest more sensitively. The Assistant should have shown more empathy or understanding of the client's initial reluctance.",
	"Success Related": 0.9,
	"Success Related - Explain": "The Assistant's response pushes towards the success goal by emphasizing the benefits, the urgency of the limited-time offer, and additional support services, encouraging the client to make a purchase decision.",
	"Brevity": 0.6,
	"Brevity - Explain": "The response is slightly lengthy for a phone conversation and could be more concise. However, it covers essential points and leads to a call to action."
	}
	""".strip()


	# Per-sample tail of the judge prompt. It carries two str.format placeholders,
	# {conversation} and {response}, and ends with an open "## Score:" heading so
	# that the judge continues with the score object.
	task_prompt_2 = "\n".join(
	    (
	        "<Finish This Task>",
	        "",
	        "## Conversation:",
	        "{conversation}",
	        "",
	        "## Assistant Response:",
	        "{response}",
	        "",
	        "## Score:",
	    )
	)


	def _parse_evaluation(content):
	    """
	    Decode the judge model's reply into a Python object.

	    Chat models sometimes wrap JSON in a Markdown code fence
	    (```json ... ```); the fence is removed before decoding so that such a
	    reply does not abort the run.

	    :param content: raw message text returned by the model; may be None.
	    :return: the decoded JSON value.
	    :raises json.JSONDecodeError: when the remaining text is not valid JSON
	        (including an empty or missing reply).
	    """
	    text = (content or "").strip()
	    if text.startswith("```"):
	        # Drop the opening fence line (``` or ```json) ...
	        text = text.split("\n", 1)[1] if "\n" in text else ""
	        # ... and everything from the closing fence onwards.
	        text = text.rsplit("```", 1)[0]
	    return json.loads(text)


	def main():
	    """
	    Score every prediction in ``--eval_data_file`` with an LLM judge.

	    For each input row the fixed rubric prompt (``task_prompt``) is combined
	    with the row's conversation and prediction, sent to the Azure OpenAI chat
	    completion endpoint, and the returned JSON scores are appended to
	    ``--output_file`` together with running totals.

	    The run is resumable: rows whose ``idx`` already appears in the output
	    file are skipped and the running totals continue from the last row
	    written there.

	    :raises json.JSONDecodeError: when the judge's reply is not valid JSON.
	    :raises KeyError: when an input row or the judge's reply lacks an
	        expected key.
	    """
	    args = get_args()

	    # NOTE(review): environment.get is project code; it presumably returns the
	    # JSON-decoded keyword arguments for AzureOpenAI (api key, api version,
	    # endpoint) - confirm. Credentials must come from that configuration and
	    # never be written into this file.
	    service_params = environment.get(args.service, dtype=json.loads)
	    client = AzureOpenAI(**service_params)

	    total = 0
	    total_score = 0

	    # Resume support: collect the ids that were already scored and pick up the
	    # running totals stored in the last row of the output file.
	    finished_idx_set = set()
	    if os.path.exists(args.output_file):
	        with open(args.output_file, "r", encoding="utf-8") as f:
	            for line in f:
	                if not line.strip():
	                    # tolerate blank lines (e.g. a trailing newline)
	                    continue
	                row = json.loads(line)
	                total = row["total"]
	                total_score = row["total_score"]
	                finished_idx_set.add(row["idx"])
	    print(f"finished count: {len(finished_idx_set)}")

	    with open(args.eval_data_file, "r", encoding="utf-8") as fin, \
	            open(args.output_file, "a+", encoding="utf-8") as fout:
	        for line in fin:
	            if not line.strip():
	                continue
	            row = json.loads(line)
	            idx = row["idx"]
	            prompt = row["prompt"]
	            response = row["response"]
	            prediction = row["prediction"]
	            time_cost = row["time_cost"]

	            if idx in finished_idx_set:
	                continue
	            # Marked before the request so that a duplicated idx in the input
	            # is evaluated at most once per run; a failed request writes no
	            # row, so a later run will try it again.
	            finished_idx_set.add(idx)

	            # The last blank-line-separated paragraph of the prompt is used as
	            # the conversation (assumes the dialogue is the final paragraph of
	            # the stored prompt - TODO confirm against the dataset builder).
	            conversation = prompt.split("\n\n")[-1].strip()

	            task_prompt_2_ = task_prompt_2.format(conversation=conversation, response=prediction)
	            task_prompt_ = task_prompt + "\n\n" + task_prompt_2_

	            try:
	                llm_response = client.chat.completions.create(
	                    model=args.model_name,
	                    messages=[{"role": "user", "content": task_prompt_}],
	                    stream=False,
	                    top_p=0.95,
	                    temperature=0.6,
	                )
	            except (openai.BadRequestError, openai.InternalServerError) as e:
	                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
	                continue

	            content = llm_response.choices[0].message.content
	            try:
	                evaluate = _parse_evaluation(content)
	            except json.decoder.JSONDecodeError:
	                # Show the offending reply, then stop: everything scored so
	                # far is already flushed to the output file.
	                print(content)
	                raise

	            score1 = float(evaluate["Task Related"])
	            score2 = float(evaluate["Client Reply Related"])
	            score3 = float(evaluate["Success Related"])
	            score4 = float(evaluate["Brevity"])

	            # The rubric maxima in task_prompt are 1.5 + 1.5 + 1.0 + 1.0 = 5,
	            # so dividing by 5 maps the sum onto the range 0..1.
	            score = (score1 + score2 + score3 + score4) / 5

	            total += 1
	            total_score += score
	            average_score = total_score / total
	            print(f"average_score: {average_score}")

	            row_ = {
	                "idx": idx,
	                "prompt": prompt,
	                "response": response,
	                "prediction": prediction,
	                "time_cost": time_cost,
	                "evaluate": evaluate,
	                "total": total,
	                "score": score,
	                "total_score": total_score,
	                "average_score": average_score,
	            }
	            fout.write(f"{json.dumps(row_, ensure_ascii=False)}\n")
	            # Flush per row so an interrupted run loses no finished work.
	            fout.flush()


	# Run the evaluation only when this file is executed as a script, not when
	# it is imported.
	if __name__ == "__main__":
	    main()