HoneyTian commited on
Commit
dbd1ddd
·
1 Parent(s): c95dc2f
Files changed (32) hide show
  1. .gitignore +3 -2
  2. data/dataset/agent-bingoplus-ph-25-summary.jsonl +3 -0
  3. data/dataset/agent-cod-zh-70-chat.jsonl +3 -0
  4. data/eval_data/azure_openai/azure/gpt-4o-mini/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250806_114802/agent-cod-zh-70-chat.jsonl +3 -0
  5. data/eval_data/azure_openai/azure/gpt-4o-mini/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250808_165545/agent-bingoplus-ph-25-summary.jsonl +3 -0
  6. data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-cod-zh-70-chat.jsonl +3 -0
  7. data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250808_171736/agent-bingoplus-ph-25-summary.jsonl +3 -0
  8. data/eval_data/gemini_google/google/gemini-2.5-flash/shenzhen_sase/google_potent_veld_462405_t3/20250808_160530/agent-bingoplus-ph-25-summary.jsonl +3 -0
  9. data/eval_data/gemini_google/google/gemini-2.5-flash/shenzhen_sase/google_potent_veld_462405_t3/20250808_160530/agent-cod-zh-70-chat.jsonl +3 -0
  10. data/eval_data/siliconflow/siliconflow/deepseek-ai#DeepSeek-V3/shenzhen_sase/siliconflow_api_key/20250808_171156/agent-bingoplus-ph-25-summary.jsonl +3 -0
  11. examples/api_test/aws/get_token.py +54 -0
  12. examples/get_logit_bias/byteplus/logit_bias.jsonl +3 -0
  13. examples/get_logit_bias/byteplus/step_1_search_token_id_map.py +140 -0
  14. examples/get_logit_bias/byteplus/step_2_make_vocab.py +6 -0
  15. examples/get_logit_bias/byteplus/step_3_check_token.py +123 -0
  16. examples/make_dataset/make_bingoplus_ph_25_summary.py +66 -0
  17. examples/make_dataset/make_cod_zh_70_chat.py +70 -0
  18. examples/make_raw_dataset/step_1_make_hk_dataset_by_log.py +6 -3
  19. examples/make_raw_dataset/step_3_filter_by_keywords.py +5 -5
  20. examples/test_metrics/cod_chat_metric.py +295 -0
  21. examples/test_metrics/cod_chat_metric_test.py +158 -0
  22. examples/tokenization/byteplus/step_1_get_by_api.py +94 -0
  23. llm_eval_script/azure_openai_chat.py +39 -4
  24. llm_eval_script/azure_openai_summary.py +228 -0
  25. llm_eval_script/byteplus.py +14 -3
  26. llm_eval_script/byteplus_chat.py +36 -4
  27. llm_eval_script/byteplus_summary.py +235 -0
  28. llm_eval_script/gemini_google.py +9 -5
  29. llm_eval_script/gemini_google_chat.py +43 -7
  30. llm_eval_script/gemini_google_summary.py +241 -0
  31. llm_eval_script/siliconflow_summary.py +273 -0
  32. main.py +4 -1
.gitignore CHANGED
@@ -5,9 +5,10 @@
5
  #/data/
6
  /data/comment
7
  #/data/eval_data
8
- data/llm-log-hk
9
- data/llm-log-usa
10
  /data/raw_dataset
 
11
  /dotenv/
12
  /logs/
13
  /trained_models
 
5
  #/data/
6
  /data/comment
7
  #/data/eval_data
8
+ /data/llm-log-hk
9
+ /data/llm-log-usa
10
  /data/raw_dataset
11
+ /data/vocab
12
  /dotenv/
13
  /logs/
14
  /trained_models
data/dataset/agent-bingoplus-ph-25-summary.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd1b5d8eb56bcf2b5e5dac2ce0facc16d0db3ac05dfe295767d079904bb05269
3
+ size 162326
data/dataset/agent-cod-zh-70-chat.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80bfbe82ccc318699fb05ee8bbdcdf6d732796b397941b8893016cdc7cbd3f1c
3
+ size 225948
data/eval_data/azure_openai/azure/gpt-4o-mini/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250806_114802/agent-cod-zh-70-chat.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d30e889c785e03b73e3e9297e7e9067adcf57d74361e3db4b7717c7de29405c
3
+ size 310079
data/eval_data/azure_openai/azure/gpt-4o-mini/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250808_165545/agent-bingoplus-ph-25-summary.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1011bdc2edbdf1795c3ef00862508bede4173b928fbccd048434f8032572c268
3
+ size 175784
data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-cod-zh-70-chat.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bf4881e718d3143a8d7ed4e763769c4b0efa9d671c89bda9a7e8b0b8f2e986c
3
+ size 318277
data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250808_171736/agent-bingoplus-ph-25-summary.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee1f21aa1a4a399fd2b2bda23f8e2324b41cf7c4c913805be8f5473c50dcd89a
3
+ size 177666
data/eval_data/gemini_google/google/gemini-2.5-flash/shenzhen_sase/google_potent_veld_462405_t3/20250808_160530/agent-bingoplus-ph-25-summary.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf285dac3a86a334678e0e99e12fb3602fc60f67d6ab5fe7864d063b3bb0bd1d
3
+ size 183490
data/eval_data/gemini_google/google/gemini-2.5-flash/shenzhen_sase/google_potent_veld_462405_t3/20250808_160530/agent-cod-zh-70-chat.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4413e43f8c64197e6edd5d481b14f5054d27edf5b33d13a5ad8bde77dc8fa312
3
+ size 316541
data/eval_data/siliconflow/siliconflow/deepseek-ai#DeepSeek-V3/shenzhen_sase/siliconflow_api_key/20250808_171156/agent-bingoplus-ph-25-summary.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:920dd916684f7939730745a673038714d2280e8891f14eafce5b0f349da08330
3
+ size 181529
examples/api_test/aws/get_token.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ from datetime import datetime
5
+ import json
6
+ import os
7
+ from pathlib import Path
8
+ import sys
9
+ import time
10
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
11
+
12
+ pwd = os.path.abspath(os.path.dirname(__file__))
13
+ sys.path.append(os.path.join(pwd, "../"))
14
+
15
+ import boto3
16
+
17
+ from project_settings import environment, project_path
18
+
19
+
20
+ def get_args():
21
+ parser = argparse.ArgumentParser()
22
+ parser.add_argument(
23
+ "--service",
24
+ default="aws_us_east",
25
+ type=str
26
+ )
27
+ args = parser.parse_args()
28
+ return args
29
+
30
+
31
+ def main():
32
+ args = get_args()
33
+
34
+ service = environment.get(key=args.service, dtype=json.loads)
35
+ print(service)
36
+ aws_access_key_id = service["AWS_ACCESS_KEY_ID"]
37
+ aws_secret_access_key = service["AWS_SECRET_ACCESS_KEY"]
38
+ aws_default_region = service["AWS_DEFAULT_REGION"]
39
+
40
+ os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
41
+ os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
42
+ os.environ["AWS_DEFAULT_REGION"] = aws_default_region
43
+
44
+ sts_client = boto3.client('sts')
45
+
46
+ response = sts_client.get_session_token(
47
+ DurationSeconds=3600
48
+ )
49
+
50
+ return
51
+
52
+
53
+ if __name__ == "__main__":
54
+ main()
examples/get_logit_bias/byteplus/logit_bias.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69e09dc0129041e4b530e7567b44f586d64ee37ea9a5aacca3150a601a9cb826
3
+ size 110372
examples/get_logit_bias/byteplus/step_1_search_token_id_map.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://docs.byteplus.com/en/docs/ModelArk/1099455
5
+
6
+ model list
7
+ https://docs.byteplus.com/en/docs/ModelArk/1330310
8
+
9
+ https://docs.byteplus.com/en/docs/ModelArk/Chat
10
+ """
11
+ import argparse
12
+ from datetime import datetime
13
+ import json
14
+ import os
15
+ from pathlib import Path
16
+ import sys
17
+ import string
18
+ import time
19
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
20
+
21
+ pwd = os.path.abspath(os.path.dirname(__file__))
22
+ sys.path.append(os.path.join(pwd, "../"))
23
+
24
+ from openai import OpenAI
25
+ from tqdm import tqdm
26
+
27
+ from project_settings import environment, project_path
28
+
29
+
30
+ def get_args():
31
+ """
32
+ model list:
33
+ https://docs.byteplus.com/en/docs/ModelArk/1330310
34
+
35
+ bytedance-seed-1.6
36
+ seed-1-6-250615
37
+
38
+ bytedance-seed-1.6-flash
39
+ seed-1-6-flash-250615
40
+
41
+ deepseek-v3
42
+ deepseek-v3-250324
43
+ """
44
+ parser = argparse.ArgumentParser()
45
+ parser.add_argument(
46
+ "--model_name",
47
+ # default="seed-1-6-250615",
48
+ default="seed-1-6-flash-250615",
49
+ # default="deepseek-v3-250324",
50
+ type=str
51
+ )
52
+ parser.add_argument(
53
+ "--client",
54
+ default="shenzhen_sase",
55
+ type=str
56
+ )
57
+ parser.add_argument(
58
+ "--service",
59
+ default="byteplus_api_key",
60
+ type=str
61
+ )
62
+ parser.add_argument(
63
+ "--output_file",
64
+ default="logit_bias.jsonl",
65
+ type=str
66
+ )
67
+ args = parser.parse_args()
68
+ return args
69
+
70
+
71
+ def main():
72
+ args = get_args()
73
+
74
+ candidate = list()
75
+
76
+ alphas = string.ascii_uppercase
77
+ alphas = list(alphas)
78
+ candidate += alphas
79
+
80
+ number_in_hundred = list(range(101))
81
+ number_in_hundred = [str(n) for n in number_in_hundred]
82
+ candidate += number_in_hundred
83
+
84
+ print(f"candidate: {candidate}")
85
+
86
+ api_key = environment.get(args.service, dtype=str)
87
+ client = OpenAI(
88
+ base_url="https://ark.ap-southeast.bytepluses.com/api/v3/",
89
+ # Read your Ark API Key from the environment variable.
90
+ api_key=api_key
91
+ )
92
+
93
+ output_file = Path(args.output_file)
94
+
95
+ last_token_id = -1
96
+ if output_file.exists():
97
+ with open(output_file.as_posix(), "r", encoding="utf-8") as f:
98
+ for row in f:
99
+ row = json.loads(row)
100
+ last_token_id = row["token_id"]
101
+
102
+ with open(args.output_file, "a+", encoding="utf-8") as f:
103
+ for token_id in tqdm(range(10000000)):
104
+ if token_id < last_token_id:
105
+ continue
106
+ completion = client.chat.completions.create(
107
+ model=args.model_name,
108
+ messages=[
109
+ {"role": "system", "content": "you are a helpful assistant"},
110
+ ],
111
+ stream=False,
112
+ max_tokens=1,
113
+ extra_body={
114
+ "thinking": {
115
+ "type": "disabled",
116
+ # "type": "enabled",
117
+ }
118
+ },
119
+ logit_bias={
120
+ token_id: 100,
121
+ }
122
+ )
123
+ prediction = completion.choices[0].message.content
124
+
125
+ row = {
126
+ "token_id": token_id,
127
+ "token": prediction,
128
+ }
129
+ row = json.dumps(row, ensure_ascii=False)
130
+ f.write(f"{row}\n")
131
+ f.flush()
132
+
133
+ if prediction in candidate:
134
+ print(f"token id: {token_id}, token: {prediction}")
135
+
136
+ return
137
+
138
+
139
+ if __name__ == "__main__":
140
+ main()
examples/get_logit_bias/byteplus/step_2_make_vocab.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ if __name__ == "__main__":
6
+ pass
examples/get_logit_bias/byteplus/step_3_check_token.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://docs.byteplus.com/en/docs/ModelArk/1099455
5
+
6
+ model list
7
+ https://docs.byteplus.com/en/docs/ModelArk/1330310
8
+
9
+ https://docs.byteplus.com/en/docs/ModelArk/Chat
10
+ """
11
+ import argparse
12
+ from datetime import datetime
13
+ import json
14
+ import os
15
+ from pathlib import Path
16
+ import sys
17
+ import string
18
+ import time
19
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
20
+
21
+ pwd = os.path.abspath(os.path.dirname(__file__))
22
+ sys.path.append(os.path.join(pwd, "../"))
23
+
24
+ from openai import OpenAI
25
+ from tqdm import tqdm
26
+
27
+ from project_settings import environment, project_path
28
+
29
+
30
+ def get_args():
31
+ """
32
+ model list:
33
+ https://docs.byteplus.com/en/docs/ModelArk/1330310
34
+
35
+ bytedance-seed-1.6
36
+ seed-1-6-250615
37
+
38
+ bytedance-seed-1.6-flash
39
+ seed-1-6-flash-250615
40
+
41
+ deepseek-v3
42
+ deepseek-v3-250324
43
+ """
44
+ parser = argparse.ArgumentParser()
45
+ parser.add_argument(
46
+ "--model_name",
47
+ # default="seed-1-6-250615",
48
+ default="seed-1-6-flash-250615",
49
+ # default="deepseek-v3-250324",
50
+ type=str
51
+ )
52
+ parser.add_argument(
53
+ "--client",
54
+ default="shenzhen_sase",
55
+ type=str
56
+ )
57
+ parser.add_argument(
58
+ "--service",
59
+ default="byteplus_api_key",
60
+ type=str
61
+ )
62
+ parser.add_argument(
63
+ "--output_file",
64
+ default="logit_bias.jsonl",
65
+ type=str
66
+ )
67
+ args = parser.parse_args()
68
+ return args
69
+
70
+
71
+ def main():
72
+ args = get_args()
73
+
74
+ candidate = list()
75
+
76
+ alphas = string.ascii_uppercase
77
+ alphas = list(alphas)
78
+ candidate += alphas
79
+
80
+ number_in_hundred = list(range(101))
81
+ number_in_hundred = [str(n) for n in number_in_hundred]
82
+ candidate += number_in_hundred
83
+
84
+ print(f"candidate: {candidate}")
85
+
86
+ api_key = environment.get(args.service, dtype=str)
87
+ client = OpenAI(
88
+ base_url="https://ark.ap-southeast.bytepluses.com/api/v3/",
89
+ # Read your Ark API Key from the environment variable.
90
+ api_key=api_key
91
+ )
92
+
93
+ output_file = Path(args.output_file)
94
+
95
+ with open(args.output_file, "a+", encoding="utf-8") as f:
96
+ for text in candidate:
97
+
98
+ completion = client.chat.completions.create(
99
+ model=args.model_name,
100
+ messages=[
101
+ {"role": "system", "content": "you are a helpful assistant"},
102
+ {"role": "user", "content": f"output text `{text}`."},
103
+
104
+ ],
105
+ stream=False,
106
+ # max_tokens=1,
107
+ extra_body={
108
+ "thinking": {
109
+ "type": "disabled",
110
+ # "type": "enabled",
111
+ }
112
+ },
113
+ )
114
+ # print(completion)
115
+ print(completion.usage)
116
+ # ChatCompletion(id='021754463990988d1193807ba8cfd6b4cea712d12b3282957b0eb',
117
+ # choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='F', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None))], created=1754463991, model='seed-1-6-flash-250615', object='chat.completion', service_tier='default', system_fingerprint=None, usage=CompletionUsage(completion_tokens=1, prompt_tokens=22, total_tokens=23, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=None, audio_tokens=None, reasoning_tokens=0, rejected_prediction_tokens=None), prompt_tokens_details=PromptTokensDetails(audio_tokens=None, cached_tokens=0)))
118
+
119
+ return
120
+
121
+
122
+ if __name__ == "__main__":
123
+ main()
examples/make_dataset/make_bingoplus_ph_25_summary.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+ import os
6
+ from pathlib import Path
7
+ import sys
8
+ import time
9
+
10
+ pwd = os.path.abspath(os.path.dirname(__file__))
11
+ sys.path.append(os.path.join(pwd, "../../"))
12
+
13
+ from project_settings import environment, project_path
14
+
15
+
16
+ def get_args():
17
+ parser = argparse.ArgumentParser()
18
+ parser.add_argument(
19
+ "--raw_dataset",
20
+ default=(project_path / "data/raw_dataset/agent-bingoplus-ph-25-summary").as_posix(),
21
+ type=str
22
+ )
23
+ parser.add_argument(
24
+ "--dataset",
25
+ default=(project_path / "data/dataset/agent-bingoplus-ph-25-summary.jsonl").as_posix(),
26
+ type=str
27
+ )
28
+ args = parser.parse_args()
29
+ return args
30
+
31
+
32
+ def main():
33
+ args = get_args()
34
+
35
+ raw_dataset = Path(args.raw_dataset)
36
+ dataset = Path(args.dataset)
37
+ dataset.parent.mkdir(parents=True, exist_ok=True)
38
+
39
+ with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
40
+ for sample_dir in raw_dataset.glob("*"):
41
+ idx = sample_dir.parts[-1]
42
+ system_prompt_file = sample_dir / "system_prompt.txt"
43
+ user_prompt_file = sample_dir / "user_prompt.txt"
44
+ response_file = sample_dir / "response.txt"
45
+
46
+ with open(system_prompt_file.as_posix(), "r", encoding="utf-8") as f:
47
+ system_prompt = f.read()
48
+ with open(user_prompt_file.as_posix(), "r", encoding="utf-8") as f:
49
+ user_prompt = f.read()
50
+ with open(response_file.as_posix(), "r", encoding="utf-8") as f:
51
+ response = f.read()
52
+
53
+ row_ = {
54
+ "idx": idx,
55
+ "system_prompt": system_prompt,
56
+ "user_prompt": user_prompt,
57
+ "response": response,
58
+ }
59
+ row_ = json.dumps(row_, ensure_ascii=False)
60
+ fout.write(f"{row_}\n")
61
+
62
+ return
63
+
64
+
65
+ if __name__ == "__main__":
66
+ main()
examples/make_dataset/make_cod_zh_70_chat.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+ import os
6
+ from pathlib import Path
7
+ import sys
8
+ import time
9
+
10
+ pwd = os.path.abspath(os.path.dirname(__file__))
11
+ sys.path.append(os.path.join(pwd, "../../"))
12
+
13
+ from project_settings import environment, project_path
14
+
15
+
16
+ def get_args():
17
+ parser = argparse.ArgumentParser()
18
+ parser.add_argument(
19
+ "--raw_dataset",
20
+ default=(project_path / "data/raw_dataset/agent-cod-zh-70-chat").as_posix(),
21
+ type=str
22
+ )
23
+ parser.add_argument(
24
+ "--dataset",
25
+ default=(project_path / "data/dataset/agent-cod-zh-70-chat.jsonl").as_posix(),
26
+ type=str
27
+ )
28
+ args = parser.parse_args()
29
+ return args
30
+
31
+
32
+ def main():
33
+ args = get_args()
34
+
35
+ raw_dataset = Path(args.raw_dataset)
36
+ dataset = Path(args.dataset)
37
+ dataset.parent.mkdir(parents=True, exist_ok=True)
38
+
39
+ with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
40
+ for sample_dir in raw_dataset.glob("*"):
41
+ idx = sample_dir.parts[-1]
42
+ system_prompt_file = sample_dir / "system_prompt.txt"
43
+ user_prompt_file = sample_dir / "user_prompt.txt"
44
+ response_file = sample_dir / "response.txt"
45
+
46
+ with open(system_prompt_file.as_posix(), "r", encoding="utf-8") as f:
47
+ system_prompt = f.read()
48
+ with open(user_prompt_file.as_posix(), "r", encoding="utf-8") as f:
49
+ user_prompt = f.read()
50
+ with open(response_file.as_posix(), "r", encoding="utf-8") as f:
51
+ response = f.read()
52
+
53
+ prompt = f"""{system_prompt}\n\n{user_prompt}""".strip()
54
+
55
+ print(f"{prompt}\n\n{response}")
56
+ print("-" * 150)
57
+
58
+ row_ = {
59
+ "idx": idx,
60
+ "prompt": prompt,
61
+ "response": response,
62
+ }
63
+ row_ = json.dumps(row_, ensure_ascii=False)
64
+ fout.write(f"{row_}\n")
65
+
66
+ return
67
+
68
+
69
+ if __name__ == "__main__":
70
+ main()
examples/make_raw_dataset/step_1_make_hk_dataset_by_log.py CHANGED
@@ -58,6 +58,9 @@ def unescape_string2(value: str) -> str:
58
  _unescape_map = [
59
  ("\n", r"\n"),
60
  ("\\n", "\n"),
 
 
 
61
  ]
62
  for k, v in _unescape_map:
63
  value = value.replace(k, v)
@@ -79,13 +82,12 @@ def extract(text: str):
79
  len_of_splits = len(splits)
80
 
81
  engine = splits[0].strip()
82
- call_id = splits[3]
83
- if len(call_id) == 0:
84
- call_id = splits[4]
85
  prompt = splits[5]
86
  prompt = json.loads(prompt)
87
 
88
  system_prompt = prompt["SystemPrompt"]
 
89
 
90
  conversation = prompt.get("Conversation")
91
  if conversation is not None:
@@ -148,6 +150,7 @@ def main():
148
  for item in conversation:
149
  role = item["role"]
150
  text = item["text"]
 
151
 
152
  if role == 0:
153
  role = "client"
 
58
  _unescape_map = [
59
  ("\n", r"\n"),
60
  ("\\n", "\n"),
61
+ ("​", ""),
62
+ ("", ""),
63
+ ("null\n<Identity>", "<Identity>"),
64
  ]
65
  for k, v in _unescape_map:
66
  value = value.replace(k, v)
 
82
  len_of_splits = len(splits)
83
 
84
  engine = splits[0].strip()
85
+ call_id = splits[2]
 
 
86
  prompt = splits[5]
87
  prompt = json.loads(prompt)
88
 
89
  system_prompt = prompt["SystemPrompt"]
90
+ system_prompt = unescape_string2(system_prompt)
91
 
92
  conversation = prompt.get("Conversation")
93
  if conversation is not None:
 
150
  for item in conversation:
151
  role = item["role"]
152
  text = item["text"]
153
+ text = unescape_string2(text)
154
 
155
  if role == 0:
156
  role = "client"
examples/make_raw_dataset/step_3_filter_by_keywords.py CHANGED
@@ -50,21 +50,21 @@ def main():
50
 
51
  for key_str in [
52
  # "BingoPlus",
53
- # " COD ",
54
  # "NXPay",
55
  # "NX Money",
56
  # "Exodus Telecom",
57
  # "Exodus Retail",
58
- "Exodus Automotive",
59
  # "kta kilat", "KTA KILAT",
60
  # "NXCloud",
61
  # "作为VIP客户",
62
- "FedEx",
63
  ]:
64
  if system_prompt.__contains__(key_str) or user_prompt.__contains__(key_str):
65
  print(f"process: {sample_dir.as_posix()}")
66
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-bingoplus"
67
- # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-cod"
68
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-nxpay"
69
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-nxmoney"
70
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-exodus-retail"
@@ -72,7 +72,7 @@ def main():
72
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-kta"
73
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-nxcloud"
74
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-vip"
75
- tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-fedex"
76
  tgt_dir.mkdir(parents=True, exist_ok=True)
77
  shutil.move(
78
  sample_dir.as_posix(),
 
50
 
51
  for key_str in [
52
  # "BingoPlus",
53
+ " COD ",
54
  # "NXPay",
55
  # "NX Money",
56
  # "Exodus Telecom",
57
  # "Exodus Retail",
58
+ # "Exodus Automotive",
59
  # "kta kilat", "KTA KILAT",
60
  # "NXCloud",
61
  # "作为VIP客户",
62
+ # "FedEx",
63
  ]:
64
  if system_prompt.__contains__(key_str) or user_prompt.__contains__(key_str):
65
  print(f"process: {sample_dir.as_posix()}")
66
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-bingoplus"
67
+ tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-cod"
68
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-nxpay"
69
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-nxmoney"
70
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-exodus-retail"
 
72
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-kta"
73
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-nxcloud"
74
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-vip"
75
+ # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-fedex"
76
  tgt_dir.mkdir(parents=True, exist_ok=True)
77
  shutil.move(
78
  sample_dir.as_posix(),
examples/test_metrics/cod_chat_metric.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+ import os
6
+ import sys
7
+
8
+ pwd = os.path.abspath(os.path.dirname(__file__))
9
+ sys.path.append(os.path.join(pwd, "../"))
10
+
11
+ import openai
12
+ from openai import AzureOpenAI
13
+
14
+ from project_settings import environment, project_path
15
+
16
+
17
+ def get_args():
18
+ """
19
+ python3 azure_openai.py --model_name gpt-4o-mini \
20
+ --eval_dataset_name agent-lingoace-zh-400-choice.jsonl \
21
+ --client "us_west(47.88.76.239)" \
22
+ --create_time_str 20250723_095001 \
23
+ --interval 10
24
+
25
+ python3 azure_openai.py --model_name gpt-4o-mini \
26
+ --eval_dataset_name arc-easy-1000-choice.jsonl \
27
+ --client "us_west(47.88.76.239)" \
28
+ --create_time_str 20250723_111000 \
29
+ --interval 10
30
+
31
+ """
32
+ parser = argparse.ArgumentParser()
33
+ parser.add_argument(
34
+ "--model_name",
35
+ default="gpt-4o",
36
+ # default="gpt-4o-mini",
37
+ type=str
38
+ )
39
+ parser.add_argument(
40
+ "--eval_data_file",
41
+ default=(project_path / "data/eval_data/azure_openai/azure/gpt-4o-mini/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250806_114802/agent-cod-zh-70-chat.jsonl.raw").as_posix(),
42
+ # default=(project_path / "data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-cod-zh-70-chat.jsonl.raw").as_posix(),
43
+ # default=(project_path / "data/eval_data/gemini_google/google/gemini-2.5-flash/shenzhen_sase/google_potent_veld_462405_t3/20250808_160530/agent-cod-zh-70-chat.jsonl.raw").as_posix(),
44
+ type=str
45
+ )
46
+ parser.add_argument(
47
+ "--output_file",
48
+ default=(project_path / "data/eval_data/azure_openai/azure/gpt-4o-mini/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250806_114802/agent-cod-zh-70-chat.jsonl").as_posix(),
49
+ # default=(project_path / "data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-cod-zh-70-chat.jsonl").as_posix(),
50
+ # default=(project_path / "data/eval_data/gemini_google/google/gemini-2.5-flash/shenzhen_sase/google_potent_veld_462405_t3/20250808_160530/agent-cod-zh-70-chat.jsonl").as_posix(),
51
+ type=str
52
+ )
53
+ parser.add_argument(
54
+ "--service",
55
+ default="west_us_chatgpt_openai_azure_com",
56
+ type=str
57
+ )
58
+ args = parser.parse_args()
59
+ return args
60
+
61
+
62
+ task_prompt = """
63
+ <Identity>
64
+ You are a telephone salesperson for a COD e-commerce company. You are good at communicating with users over the phone and conducting product marketing and promotion.
65
+
66
+ <Job>
67
+ Your task is to stimulate customer interest in products and eventually complete marketing conversion so that customers agree to buy. For customers who have no intention, you will collect reasons and conduct secondary persuasion to achieve the goal of successful conversion.
68
+
69
+ <Background>
70
+ (1) Customer background: The customer has browsed or added a 【专抹顽固斑点】祛斑精华油 to the shopping cart, but has not completed the final payment order;
71
+ (2) Product information: 【专抹顽固斑点】升級版精华油,不止祛斑还能美白,改善肌肤泛红,令肌肤稳定光泽透亮,限时促销,原价799一瓶,限时499两瓶;
72
+ (3) Discount information: 1. Order now and get a discount of 499两瓶 yuan; 2. Promotion valid for 3 days; 3. Lightning delivery; 4. 24-hour VIP service;
73
+
74
+ <Constraint>
75
+ - Communicate in a cordial, friendly, professional and witty manner;
76
+ - Communicate as concisely and effectively as possible (about 20 words) to avoid clients having no patience for communication;
77
+ - Imitate real-person communication in a telephone scenario, such as using informal verbal expressions, such as "Well...", "Then...";
78
+ - For questions you don't know, answer with words such as "You can discuss this with us in detail later", and don't make assumptions or promises without authorization.
79
+ - If the client does not have a positive or meaningful answer, just continue to persuade, and don't ask the client, "You say", "Please speak" and other words;
80
+ - Clients generally do not actively communicate, you need to guide the communication;
81
+ - Please use the telephone communication expressions that conform to the cultural habits in Taiwan.
82
+ <Current Job>
83
+ Your goal is to stimulate customer interest in products and eventually complete marketing conversion so that customers agree to buy.
84
+ Step 1: You will inform customers of the purpose of the call and broadcast brief product information, and ask customers about their intentions;
85
+ Step 2: You will answer customers' questions, including product information, discount information, etc.;
86
+ Step 3: Finally, you need to ask users about their intentions;
87
+
88
+
89
+ <Task>
90
+ For the above task settings, please rate the last one **Assistant Response** and give reasons for your rating.
91
+ (1) Task Related. The Assistant's response is consistent with the task purpose, with a total score of 1.5 points.
92
+ (2) Client Reply Related. Whether the Assistant's response reflects the client's words well. Total score: 1.5 points.
93
+ (3) Success Related. Whether the Assistant's Response push the ultimate success goal of the task. Score: 1.0 point.
94
+ (4) Brevity. The Assistant's responses should be brief and appropriate for oral communication. Total score: 1.0 points.
95
+
96
+ Note:
97
+ (1) rating only focus on the `Assistant Response`.
98
+ (2) The dialogue in `Conversation` should be considered correct.
99
+
100
+ <Example>
101
+
102
+ ## Conversation:
103
+ client: 你好
104
+ Assistant: 您好,您之前购买的【专抹顽固斑点】祛斑精华油全新升级了,新版特别添加【复配烟碱醯胺+熊果苷】双成分,祛斑率提升至百分之九十八点二,作为VIP客户,现在购买立享买一送一,499两瓶,要帮您同步升级到新版吗?
105
+ client: 喂。
106
+ client: 不不,不必要,不必要。
107
+ Assistant:
108
+
109
+ ## Assistant Response:
110
+ 我明白了,不过这款精华油现在有限时促销,499两瓶,效果真的很好哦!您有没有考虑过改善肌肤问题呢?
111
+
112
+ ## Score:
113
+ {
114
+ "Task Related": 1.5,
115
+ "Task Related - Explain": "The Assistant's response is consistent with the task purpose, which is to promote the product and inform the client about the ongoing promotion. The Assistant effectively mentions the limited-time offer and emphasizes the product's benefits.",
116
+ "Client Reply Related": 0.9,
117
+ "Client Reply Related - Explain": "The response does not fully reflect the client's words. The client expressed disinterest (\"不不,不必要,不必要\"), but the Assistant did not acknowledge the client's reluctance adequately before continuing with persuasion.",
118
+ "Success Related": 0.9,
119
+ "Success Related - Explain": "The Assistant's response attempts to push towards the success goal by mentioning the promotion and benefits, but it could be more effective if it engaged with the client's concerns or disinterest more directly.",
120
+ "Brevity": 0.9,
121
+ "Brevity - Explain": "The length was moderate and led to the next step of the conversation."
122
+ }
123
+
124
+ ## Conversation:
125
+ client: 你好
126
+ Assistant: 您好,您之前购买的【专抹顽固斑点】祛斑精华油全新升级了,新版特别添加【复配烟碱醯胺+熊果苷】双成分,祛斑率提升至百分之九十八点二,作为VIP客户,现在购买立享买一送一,499两瓶,要帮您同步升级到新版吗?
127
+ client: 喂。
128
+ client: 不不,不必要,不必要。
129
+ Assistant: 感谢您的反馈!为了给您提供更精准的服务,方便告诉我您对产品不感兴趣的具体原因吗?比如是效果、价格、还是其他方面的考虑呢?您的反馈对我们非常重要
130
+ client: 不用升级。
131
+ client: 哇。
132
+ client: 啊。
133
+ client: 我们。
134
+ client: 我是我。
135
+ Assistant:
136
+
137
+ ## Assistant Response:
138
+ (轻声笑)看来您对祛斑精华油暂时没想法,没关系,我先简单说两句让您了解下。这升级款祛斑更准,还能美白提亮,之前老客户用了都反馈皮肤稳定了不少。现在限时买一送一,两瓶才499,算下来比单瓶买便宜一半呢!您要是担心效果,我可以给您发之前客户的真实反馈视频,或者您可以先拍一瓶试试?就当给皮肤一个变透亮的机会,您看呢?
139
+
140
+ ## Score:
141
+ {
142
+ "Task Related": 0.9,
143
+ "Task Related - Explain": "The Assistant's Response is directly to the client, should not include the `(轻声笑)`.",
144
+ "Client Reply Related": 1.0,
145
+ "Client Reply Related - Explain": "The Assistant acknowledges the client's lack of interest and responds in a friendly, understanding manner. It gently guides the conversation without pushing too hard, reflecting the client's initial reluctance.",
146
+ "Success Related": 1.0,
147
+ "Success Related - Explain": "The Assistant effectively attempts to drive towards the success goal by reiterating the product's benefits, offering social proof, and suggesting a trial purchase, which could entice the client to reconsider.",
148
+ "Brevity": 0.7,
149
+ "Brevity - Explain": "The response is slightly longer than ideal but remains concise enough to maintain the client's attention and engagement."
150
+ }
151
+
152
+ ## Conversation:
153
+ client: 你好
154
+ Assistant: 您好,您之前购买的【专抹顽固斑点】祛斑精华油全新升级了,新版特别添加【复配烟碱醯胺+熊果苷】双成分,祛斑率提升至百分之九十八点二,作为VIP客户,现在购买立享买一送一,499两瓶,要帮您同步升级到新版吗?
155
+ client: 喂。
156
+ client: 不不,不必要,不必要。
157
+ Assistant:
158
+
159
+ ## Assistant Response:
160
+ Well, I get it—maybe you're still a bit hesitant about the spots on your face? But let me tell you, the new upgraded version is really powerful! The 499 two-bottle deal is only valid for 3 days, and the lightning delivery means you can get it in two days. Plus, we have 24-hour VIP service to help with any skin care questions. Don't let this chance slip by, okay? Are you going to grab this deal today or wait?
161
+
162
+ ## Score:
163
+ {
164
+ "Task Related": 0.7,
165
+ "Task Related - Explain": "The Assistant's Response is directly to the client, however, English is used instead of Chinese, which is inconsistent with the client's language.",
166
+ "Client Reply Related": 1.3,
167
+ "Client Reply Related - Explain": "The response acknowledges the client's hesitation by mentioning it directly, but it could have addressed the client's disinterest more sensitively. The Assistant should have shown more empathy or understanding of the client's initial reluctance.",
168
+ "Success Related": 0.9,
169
+ "Success Related - Explain": "The Assistant's response pushes towards the success goal by emphasizing the benefits, the urgency of the limited-time offer, and additional support services, encouraging the client to make a purchase decision.",
170
+ "Brevity": 0.6,
171
+ "Brevity - Explain": "The response is slightly lengthy for a phone conversation and could be more concise. However, it covers essential points and leads to a call to action."
172
+ }
173
+ """.strip()
174
+
175
+
176
+ task_prompt_2 = """
177
+ <Finish This Task>
178
+
179
+ ## Conversation:
180
+ {conversation}
181
+
182
+ ## Assistant Response:
183
+ {response}
184
+
185
+ ## Score:
186
+ """.strip()
187
+
188
+
189
+ def main():
190
+ args = get_args()
191
+
192
+ service_params = environment.get(args.service, dtype=json.loads)
193
+ client = AzureOpenAI(
194
+ **service_params,
195
+ # api_key="Dqt75blRABmhgrwhfcupd1rq44YqNuEgku8FcFFDrEljMq6gltf0JQQJ99BCACYeBjFXJ3w3AAABACOG2njW",
196
+ # api_version="2025-01-01-preview",
197
+ # azure_endpoint="https://west-us-chatgpt.openai.azure.com"
198
+ )
199
+
200
+ total = 0
201
+ total_score = 0
202
+
203
+ # finished
204
+ finished_idx_set = set()
205
+ if os.path.exists(args.output_file):
206
+ with open(args.output_file, "r", encoding="utf-8") as f:
207
+ for row in f:
208
+ row = json.loads(row)
209
+ idx = row["idx"]
210
+ total = row["total"]
211
+ total_score = row["total_score"]
212
+ finished_idx_set.add(idx)
213
+ print(f"finished count: {len(finished_idx_set)}")
214
+
215
+ with open(args.eval_data_file, "r", encoding="utf-8") as fin, open(args.output_file, "a+", encoding="utf-8") as fout:
216
+ for row in fin:
217
+ row = json.loads(row)
218
+ idx = row["idx"]
219
+ prompt = row["prompt"]
220
+ response = row["response"]
221
+ prediction = row["prediction"]
222
+ time_cost = row["time_cost"]
223
+
224
+ if idx in finished_idx_set:
225
+ continue
226
+ finished_idx_set.add(idx)
227
+
228
+ conversation = prompt.split("\n\n")[-1].strip()
229
+
230
+ task_prompt_2_ = task_prompt_2.format(conversation=conversation, response=prediction)
231
+ # print(task_prompt_2_)
232
+
233
+ task_prompt_ = task_prompt + "\n\n" + task_prompt_2_
234
+ # print(task_prompt_)
235
+
236
+ try:
237
+ llm_response = client.chat.completions.create(
238
+ model=args.model_name,
239
+ messages=[{"role": "user", "content": task_prompt_}],
240
+ stream=False,
241
+ top_p=0.95,
242
+ temperature=0.6,
243
+ )
244
+ except openai.BadRequestError as e:
245
+ print(f"request failed, error type: {type(e)}, error text: {str(e)}")
246
+ continue
247
+ except openai.InternalServerError as e:
248
+ print(f"request failed, error type: {type(e)}, error text: {str(e)}")
249
+ continue
250
+
251
+ content = llm_response.choices[0].message.content
252
+ try:
253
+ evaluate = json.loads(content)
254
+ except json.decoder.JSONDecodeError as e:
255
+ print(content)
256
+ raise e
257
+ score1 = evaluate["Task Related"]
258
+ score2 = evaluate["Client Reply Related"]
259
+ score3 = evaluate["Success Related"]
260
+ score4 = evaluate["Brevity"]
261
+
262
+ score1 = float(score1)
263
+ score2 = float(score2)
264
+ score3 = float(score3)
265
+ score4 = float(score4)
266
+
267
+ score = (score1 + score2 + score3 + score4) / 5
268
+
269
+ total += 1
270
+ total_score += score
271
+ average_score = total_score / total
272
+ print(f"average_score: {average_score}")
273
+
274
+ row_ = {
275
+ "idx": idx,
276
+ "prompt": prompt,
277
+ "response": response,
278
+ "prediction": prediction,
279
+ "time_cost": time_cost,
280
+ "evaluate": evaluate,
281
+ "total": total,
282
+ "score": score,
283
+ "total_score": total_score,
284
+ "average_score": average_score,
285
+
286
+ }
287
+ row_ = json.dumps(row_, ensure_ascii=False)
288
+ fout.write(f"{row_}\n")
289
+ fout.flush()
290
+
291
+ return
292
+
293
+
294
+ if __name__ == "__main__":
295
+ main()
examples/test_metrics/cod_chat_metric_test.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ from datetime import datetime
5
+ import json
6
+ import os
7
+ from pathlib import Path
8
+ import sys
9
+ import time
10
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
11
+
12
+ pwd = os.path.abspath(os.path.dirname(__file__))
13
+ sys.path.append(os.path.join(pwd, "../"))
14
+
15
+ import openai
16
+ from openai import AzureOpenAI
17
+
18
+ from project_settings import environment, project_path
19
+
20
+
21
+ def get_args():
22
+ """
23
+ python3 azure_openai.py --model_name gpt-4o-mini \
24
+ --eval_dataset_name agent-lingoace-zh-400-choice.jsonl \
25
+ --client "us_west(47.88.76.239)" \
26
+ --create_time_str 20250723_095001 \
27
+ --interval 10
28
+
29
+ python3 azure_openai.py --model_name gpt-4o-mini \
30
+ --eval_dataset_name arc-easy-1000-choice.jsonl \
31
+ --client "us_west(47.88.76.239)" \
32
+ --create_time_str 20250723_111000 \
33
+ --interval 10
34
+
35
+ """
36
+ parser = argparse.ArgumentParser()
37
+ parser.add_argument(
38
+ "--model_name",
39
+ default="gpt-4o",
40
+ # default="gpt-4o-mini",
41
+ type=str
42
+ )
43
+ parser.add_argument(
44
+ "--service",
45
+ default="west_us_chatgpt_openai_azure_com",
46
+ type=str
47
+ )
48
+ args = parser.parse_args()
49
+ return args
50
+
51
+
52
+ prompt = """
53
+ <Identity>
54
+ You are a telephone salesperson for a COD e-commerce company. You are good at communicating with users over the phone and conducting product marketing and promotion.
55
+
56
+ <Job>
57
+ Your task is to stimulate customer interest in products and eventually complete marketing conversion so that customers agree to buy. For customers who have no intention, you will collect reasons and conduct secondary persuasion to achieve the goal of successful conversion.
58
+
59
+ <Background>
60
+ (1) Customer background: The customer has browsed or added a 【专抹顽固斑点】祛斑精华油 to the shopping cart, but has not completed the final payment order;
61
+ (2) Product information: 【专抹顽固斑点】升級版精华油,不止祛斑还能美白,改善肌肤泛红,令肌肤稳定光泽透亮,限时促销,原价799一瓶,限时499两瓶;
62
+ (3) Discount information: 1. Order now and get a discount of 499两瓶 yuan; 2. Promotion valid for 3 days; 3. Lightning delivery; 4. 24-hour VIP service;
63
+
64
+ <Constraint>
65
+ - Communicate in a cordial, friendly, professional and witty manner;
66
+ - Communicate as concisely and effectively as possible (about 20 words) to avoid clients having no patience for communication;
67
+ - Imitate real-person communication in a telephone scenario, such as using informal verbal expressions, such as "Well...", "Then...";
68
+ - For questions you don't know, answer with words such as "You can discuss this with us in detail later", and don't make assumptions or promises without authorization.
69
+ - If the client does not have a positive or meaningful answer, just continue to persuade, and don't ask the client, "You say", "Please speak" and other words;
70
+ - Clients generally do not actively communicate, you need to guide the communication;
71
+ - Please use the telephone communication expressions that conform to the cultural habits in Taiwan.
72
+ <Current Job>
73
+ Your goal is to stimulate customer interest in products and eventually complete marketing conversion so that customers agree to buy.
74
+ Step 1: You will inform customers of the purpose of the call and broadcast brief product information, and ask customers about their intentions;
75
+ Step 2: You will answer customers' questions, including product information, discount information, etc.;
76
+ Step 3: Finally, you need to ask users about their intentions;
77
+
78
+
79
+ <Task>
80
+ For the above task settings, please rate the last one **Assistant Response** and give reasons for your rating.
81
+ (1) Task Related. The Assistant's response is consistent with the task purpose, only contain the intended message to the client, without any `Note`, `Tips`, with a total score of 1.5 points.
82
+ (2) Client Reply Related. Whether the Assistant's response reflects the client's words well. Total score: 1.5 points.
83
+ (3) Success Related. Whether the Assistant's Response push the ultimate success goal of the task. Score: 1.0 point.
84
+ (4) Brevity. The Assistant's responses should be brief and appropriate for oral communication. Total score: 1.0 points.
85
+
86
+
87
+ <Example>
88
+
89
+ ## Conversation:
90
+ client: 你好
91
+ Assistant: 您好,您之前购买的【专抹顽固斑点】祛斑精华油全新升级了,新版特别添加【复配烟碱醯胺+熊果苷】双成分,祛斑率提升至百分之九十八点二,作为VIP客户,现在购买立享买一送一,499两瓶,要帮您同步升级到新版吗?
92
+ client: 喂。
93
+ client: 不不,不必要,不必要。
94
+ Assistant:
95
+
96
+ ## Assistant Response:
97
+ 我明白了,不过这款精华油现在有限时促销,499两瓶,效果真的很好哦!您有没有考虑过改善肌肤问题呢?
98
+
99
+ ## Score:
100
+ {
101
+ "Task Related": 1.5,
102
+ "Task Related - Explain": "The Assistant's response is consistent with the task purpose, which is to promote the product and inform the client about the ongoing promotion. The Assistant effectively mentions the limited-time offer and emphasizes the product's benefits.",
103
+ "Client Reply Related": 0.9,
104
+ "Client Reply Related - Explain": "The response does not fully reflect the client's words. The client expressed disinterest ('不不,不必要,不必要'), but the Assistant did not acknowledge the client's reluctance adequately before continuing with persuasion.",
105
+ "Success Related": 0.9,
106
+ "Success Related - Explain": "The Assistant's response attempts to push towards the success goal by mentioning the promotion and benefits, but it could be more effective if it engaged with the client's concerns or disinterest more directly.",
107
+ "Brevity": 0.9,
108
+ "Brevity - Explain": "The length was moderate and led to the next step of the conversation."
109
+ }
110
+
111
+ <Finish This Task>
112
+
113
+ ## Conversation:
114
+ client: 你好
115
+ Assistant: 您好。您之前购买的【专抹顽固斑点】祛斑精华油全新升级了,新版特别添加【复配烟碱醯胺+熊果苷】双成分,祛斑率提升至百分之九十八点二。作为VIP客户,现在购买立享买一送一,499两瓶,要帮您同步升级到新版吗?
116
+ client: 我上次购买的都没有效,可以退吗?
117
+ Assistant:
118
+
119
+ ## Assistant Response:
120
+ 了解您的顾虑,您可以跟我们客服详细讨论退货事宜。不过,新的升级版效果更佳,您可以试试看,限时499两瓶,真的很划算哦!
121
+
122
+ ## Score:
123
+ """.strip()
124
+
125
+ # <Example>
126
+ def main():
127
+ args = get_args()
128
+
129
+ service_params = environment.get(args.service, dtype=json.loads)
130
+ client = AzureOpenAI(
131
+ **service_params,
132
+ )
133
+
134
+ llm_response = client.chat.completions.create(
135
+ model=args.model_name,
136
+ messages=[{"role": "user", "content": prompt}],
137
+ stream=False,
138
+ # max_tokens=1,
139
+ top_p=0.95,
140
+ temperature=0.6,
141
+ # logit_bias={
142
+ # 32: 100,
143
+ # 33: 100,
144
+ # 34: 100,
145
+ # 35: 100,
146
+ # 36: 100,
147
+ # 37: 100,
148
+ # }
149
+ )
150
+
151
+ prediction = llm_response.choices[0].message.content
152
+
153
+ print(prediction)
154
+ return
155
+
156
+
157
+ if __name__ == "__main__":
158
+ main()
examples/tokenization/byteplus/step_1_get_by_api.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://www.volcengine.com/docs/82379/1528728
5
+ """
6
+ import argparse
7
+ from datetime import datetime
8
+ import json
9
+ import os
10
+ from pathlib import Path
11
+ import sys
12
+ import time
13
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
14
+
15
+ import requests
16
+
17
+ pwd = os.path.abspath(os.path.dirname(__file__))
18
+ sys.path.append(os.path.join(pwd, "../"))
19
+
20
+ from openai import OpenAI
21
+
22
+ from project_settings import environment, project_path
23
+
24
+
25
+ def get_args():
26
+ parser = argparse.ArgumentParser()
27
+ parser.add_argument(
28
+ "--model_name",
29
+ # default="seed-1-6-250615",
30
+ default="seed-1-6-flash-250615",
31
+ # default="deepseek-v3-250324",
32
+ type=str
33
+ )
34
+ parser.add_argument(
35
+ "--eval_dataset_dir",
36
+ default=(project_path / "data/dataset").as_posix(),
37
+ type=str
38
+ )
39
+ parser.add_argument(
40
+ "--eval_data_dir",
41
+ default=(project_path / "data/eval_data").as_posix(),
42
+ type=str
43
+ )
44
+ parser.add_argument(
45
+ "--client",
46
+ default="shenzhen_sase",
47
+ type=str
48
+ )
49
+ parser.add_argument(
50
+ "--service",
51
+ default="byteplus_api_key",
52
+ type=str
53
+ )
54
+ args = parser.parse_args()
55
+ return args
56
+
57
+
58
+ def main():
59
+ args = get_args()
60
+
61
+ api_key = environment.get(args.service, dtype=str)
62
+
63
+ url = "https://ark.ap-southeast.bytepluses.com/api/v3/tokenization"
64
+ # url = "https://ark.cn-beijing.volces.com/api/v3/tokenization"
65
+ headers = {
66
+ "Content-Type": "application/json",
67
+ "Authorization": f"Bearer {api_key}"
68
+ }
69
+ data = {
70
+ # "model": args.model_name,
71
+ # "model": "doubao-pro-32k-241215",
72
+ # "model": "seed-1-6-250615",
73
+ # "model": "seed-1-6-flash-250615",
74
+ "model": "seed-1-6-flash-250715",
75
+
76
+ "text": [
77
+ "天空为什么这么蓝",
78
+ "花儿为什么这么香"
79
+ ]
80
+ }
81
+
82
+ resp = requests.post(
83
+ url=url,
84
+ headers=headers,
85
+ data=json.dumps(data)
86
+ )
87
+ print(resp.status_code)
88
+ print(resp.text)
89
+
90
+ return
91
+
92
+
93
+ if __name__ == "__main__":
94
+ main()
llm_eval_script/azure_openai_chat.py CHANGED
@@ -1,6 +1,7 @@
1
  #!/usr/bin/python3
2
  # -*- coding: utf-8 -*-
3
  import argparse
 
4
  from datetime import datetime
5
  import json
6
  import os
@@ -43,7 +44,8 @@ python3 azure_openai.py --model_name gpt-4o-mini \
43
  parser.add_argument(
44
  "--eval_dataset_name",
45
  # default="agent-lingoace-zh-80-chat.jsonl",
46
- default="agent-bingoplus-ph-200-chat.jsonl",
 
47
  type=str
48
  )
49
  parser.add_argument(
@@ -69,7 +71,7 @@ python3 azure_openai.py --model_name gpt-4o-mini \
69
  parser.add_argument(
70
  "--create_time_str",
71
  # default="null",
72
- default="20250729-interval-5",
73
  type=str
74
  )
75
  parser.add_argument(
@@ -127,20 +129,52 @@ def main():
127
  for row in fin:
128
  row = json.loads(row)
129
  idx = row["idx"]
130
- prompt = row["prompt"]
131
  response = row["response"]
132
 
133
  if idx in finished_idx_set:
134
  continue
135
  finished_idx_set.add(idx)
136
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  try:
138
  time.sleep(args.interval)
139
  print(f"sleep: {args.interval}")
140
  time_begin = time.time()
141
  llm_response = client.chat.completions.create(
142
  model=args.model_name,
143
- messages=[{"role": "user", "content": prompt}],
144
  stream=False,
145
  # max_tokens=1,
146
  top_p=0.95,
@@ -177,6 +211,7 @@ def main():
177
  }
178
  row_ = json.dumps(row_, ensure_ascii=False)
179
  fout.write(f"{row_}\n")
 
180
 
181
  return
182
 
 
1
  #!/usr/bin/python3
2
  # -*- coding: utf-8 -*-
3
  import argparse
4
+ import re
5
  from datetime import datetime
6
  import json
7
  import os
 
44
  parser.add_argument(
45
  "--eval_dataset_name",
46
  # default="agent-lingoace-zh-80-chat.jsonl",
47
+ # default="agent-bingoplus-ph-200-chat.jsonl",
48
+ default="agent-cod-zh-70-chat.jsonl",
49
  type=str
50
  )
51
  parser.add_argument(
 
71
  parser.add_argument(
72
  "--create_time_str",
73
  # default="null",
74
+ default="20250806_114802",
75
  type=str
76
  )
77
  parser.add_argument(
 
129
  for row in fin:
130
  row = json.loads(row)
131
  idx = row["idx"]
132
+ prompt: str = row["prompt"]
133
  response = row["response"]
134
 
135
  if idx in finished_idx_set:
136
  continue
137
  finished_idx_set.add(idx)
138
 
139
+ # prompt
140
+ splits = prompt[::-1].split("\n\n", maxsplit=1)
141
+ conversation = splits[0]
142
+ system_prompt = splits[1]
143
+ conversation = conversation[::-1].strip()
144
+ system_prompt = system_prompt[::-1].strip()
145
+
146
+ pattern = "^(Client|Assistant): (.*?)(?=\n(?:Client|Assistant):)"
147
+ match = re.findall(pattern=pattern, string=conversation, flags=re.I|re.DOTALL|re.MULTILINE)
148
+
149
+ messages_ = list()
150
+ for m in match:
151
+ role = m[0].lower()
152
+ content = m[1]
153
+ if role in ("client", "Client"):
154
+ role = "user"
155
+ elif role in ("assistant", "Assistant"):
156
+ role = "assistant"
157
+ else:
158
+ raise AssertionError
159
+ messages_.append({
160
+ "role": role,
161
+ "content": content
162
+ })
163
+
164
+ messages = [
165
+ {"role": "system", "content": system_prompt},
166
+ *messages_
167
+ ]
168
+ # print(json.dumps(messages, ensure_ascii=False, indent=4))
169
+ # exit(0)
170
+
171
  try:
172
  time.sleep(args.interval)
173
  print(f"sleep: {args.interval}")
174
  time_begin = time.time()
175
  llm_response = client.chat.completions.create(
176
  model=args.model_name,
177
+ messages=messages,
178
  stream=False,
179
  # max_tokens=1,
180
  top_p=0.95,
 
211
  }
212
  row_ = json.dumps(row_, ensure_ascii=False)
213
  fout.write(f"{row_}\n")
214
+ fout.flush()
215
 
216
  return
217
 
llm_eval_script/azure_openai_summary.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import re
5
+ from datetime import datetime
6
+ import json
7
+ import os
8
+ from pathlib import Path
9
+ import sys
10
+ import time
11
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
12
+
13
+ pwd = os.path.abspath(os.path.dirname(__file__))
14
+ sys.path.append(os.path.join(pwd, "../"))
15
+
16
+ import openai
17
+ from openai import AzureOpenAI
18
+
19
+ from project_settings import environment, project_path
20
+
21
+
22
+ def get_args():
23
+ parser = argparse.ArgumentParser()
24
+ parser.add_argument(
25
+ "--model_name",
26
+ # default="gpt-4o",
27
+ default="gpt-4o-mini",
28
+ type=str
29
+ )
30
+ parser.add_argument(
31
+ "--eval_dataset_name",
32
+ default="agent-bingoplus-ph-25-summary.jsonl",
33
+ type=str
34
+ )
35
+ parser.add_argument(
36
+ "--eval_dataset_dir",
37
+ default=(project_path / "data/dataset").as_posix(),
38
+ type=str
39
+ )
40
+ parser.add_argument(
41
+ "--eval_data_dir",
42
+ default=(project_path / "data/eval_data").as_posix(),
43
+ type=str
44
+ )
45
+ parser.add_argument(
46
+ "--client",
47
+ default="shenzhen_sase",
48
+ type=str
49
+ )
50
+ parser.add_argument(
51
+ "--service",
52
+ default="west_us_chatgpt_openai_azure_com",
53
+ type=str
54
+ )
55
+ parser.add_argument(
56
+ "--create_time_str",
57
+ default="null",
58
+ # default="20250806_114802",
59
+ type=str
60
+ )
61
+ parser.add_argument(
62
+ "--interval",
63
+ default=5,
64
+ type=int
65
+ )
66
+ args = parser.parse_args()
67
+ return args
68
+
69
+
70
+ def main():
71
+ args = get_args()
72
+
73
+ eval_dataset_dir = Path(args.eval_dataset_dir)
74
+ eval_dataset_dir.mkdir(parents=True, exist_ok=True)
75
+ eval_data_dir = Path(args.eval_data_dir)
76
+ eval_data_dir.mkdir(parents=True, exist_ok=True)
77
+
78
+ if args.create_time_str == "null":
79
+ tz = ZoneInfo("Asia/Shanghai")
80
+ now = datetime.now(tz)
81
+ create_time_str = now.strftime("%Y%m%d_%H%M%S")
82
+ # create_time_str = "20250729-interval-5"
83
+ else:
84
+ create_time_str = args.create_time_str
85
+
86
+ eval_dataset = eval_dataset_dir / args.eval_dataset_name
87
+
88
+ output_file = eval_data_dir / f"azure_openai/azure/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
89
+ output_file.parent.mkdir(parents=True, exist_ok=True)
90
+
91
+ service_params = environment.get(args.service, dtype=json.loads)
92
+ client = AzureOpenAI(
93
+ **service_params,
94
+ # api_key="Dqt75blRABmhgrwhfcupd1rq44YqNuEgku8FcFFDrEljMq6gltf0JQQJ99BCACYeBjFXJ3w3AAABACOG2njW",
95
+ # api_version="2025-01-01-preview",
96
+ # azure_endpoint="https://west-us-chatgpt.openai.azure.com"
97
+ )
98
+
99
+ total = 0
100
+ total_score = 0
101
+
102
+ # finished
103
+ finished_idx_set = set()
104
+ if os.path.exists(output_file.as_posix()):
105
+ with open(output_file.as_posix(), "r", encoding="utf-8") as f:
106
+ for row in f:
107
+ row = json.loads(row)
108
+ idx = row["idx"]
109
+ total = row["total"]
110
+ total_score = row["total_score"]
111
+ finished_idx_set.add(idx)
112
+ print(f"finished count: {len(finished_idx_set)}")
113
+
114
+ with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
115
+ for row in fin:
116
+ row = json.loads(row)
117
+ idx = row["idx"]
118
+ system_prompt: str = row["system_prompt"]
119
+ user_prompt: str = row["user_prompt"]
120
+ response = row["response"]
121
+
122
+ if idx in finished_idx_set:
123
+ continue
124
+ finished_idx_set.add(idx)
125
+
126
+ messages = [
127
+ {
128
+ "role": "system",
129
+ "content": system_prompt
130
+ },
131
+ {
132
+ "role": "user",
133
+ "content": user_prompt
134
+ },
135
+ ]
136
+
137
+ try:
138
+ time.sleep(args.interval)
139
+ print(f"sleep: {args.interval}")
140
+ time_begin = time.time()
141
+ llm_response = client.chat.completions.create(
142
+ model=args.model_name,
143
+ messages=messages,
144
+ stream=False,
145
+ # max_tokens=1,
146
+ top_p=0.95,
147
+ temperature=0.6,
148
+ # logit_bias={
149
+ # 32: 100,
150
+ # 33: 100,
151
+ # 34: 100,
152
+ # 35: 100,
153
+ # 36: 100,
154
+ # 37: 100,
155
+ # }
156
+ )
157
+ time_cost = time.time() - time_begin
158
+ print(f"time_cost: {time_cost}")
159
+ except openai.BadRequestError as e:
160
+ print(f"request failed, error type: {type(e)}, error text: {str(e)}")
161
+ continue
162
+ except openai.InternalServerError as e:
163
+ print(f"request failed, error type: {type(e)}, error text: {str(e)}")
164
+ continue
165
+
166
+ prediction = llm_response.choices[0].message.content
167
+
168
+ response_ = json.loads(response)
169
+ response_tag_name_list = response_["tag_name_list"]
170
+ # print(response_tag_name_list)
171
+
172
+ if prediction.startswith("```json") and prediction.endswith("```"):
173
+ prediction_ = prediction[7:-3]
174
+ else:
175
+ prediction_ = prediction
176
+
177
+ prediction_tag_name_list = list()
178
+ try:
179
+ prediction_ = json.loads(prediction_)
180
+ prediction_tag_name_list = prediction_["tag_name_list"]
181
+ except json.JSONDecodeError:
182
+ pass
183
+ # print(prediction_tag_name_list)
184
+
185
+ # recall
186
+ recall_count = 0
187
+ for tag in response_tag_name_list:
188
+ if tag in prediction_tag_name_list:
189
+ recall_count += 1
190
+ recall = recall_count / (len(response_tag_name_list) + 1e-7)
191
+
192
+ # precision
193
+ precision_count = 0
194
+ for tag in prediction_tag_name_list:
195
+ if tag in response_tag_name_list:
196
+ precision_count += 1
197
+ precision = precision_count / (len(prediction_tag_name_list) + 1e-7)
198
+
199
+ # f1
200
+ f1 = 2 * (recall * precision) / (recall + precision + 1e-7)
201
+
202
+ total += 1
203
+ total_score += f1
204
+ score = total_score / total
205
+
206
+ row_ = {
207
+ "idx": idx,
208
+ "system_prompt": system_prompt,
209
+ "user_prompt": user_prompt,
210
+ "response": response,
211
+ "prediction": prediction,
212
+ "recall": recall,
213
+ "precision": precision,
214
+ "f1": f1,
215
+ "total": total,
216
+ "total_score": total_score,
217
+ "score": score,
218
+ "time_cost": time_cost,
219
+ }
220
+ row_ = json.dumps(row_, ensure_ascii=False)
221
+ fout.write(f"{row_}\n")
222
+ fout.flush()
223
+
224
+ return
225
+
226
+
227
+ if __name__ == "__main__":
228
+ main()
llm_eval_script/byteplus.py CHANGED
@@ -76,8 +76,8 @@ def get_args():
76
  )
77
  parser.add_argument(
78
  "--create_time_str",
79
- # default="null",
80
- default="20250728_113641",
81
  type=str
82
  )
83
  parser.add_argument(
@@ -161,7 +161,18 @@ def main():
161
  "type": "disabled",
162
  # "type": "enabled",
163
  }
164
- }
 
 
 
 
 
 
 
 
 
 
 
165
  )
166
  time_cost = time.time() - time_begin
167
  print(f"time_cost: {time_cost}")
 
76
  )
77
  parser.add_argument(
78
  "--create_time_str",
79
+ default="null",
80
+ # default="20250728_113641",
81
  type=str
82
  )
83
  parser.add_argument(
 
161
  "type": "disabled",
162
  # "type": "enabled",
163
  }
164
+ },
165
+ # logprobs=True,
166
+ # logit_bias={
167
+ # 32: 100,
168
+ # 33: 100,
169
+ # 34: 100,
170
+ # 35: 100,
171
+ # 36: 100,
172
+ # 37: 100,
173
+ # 38: 100,
174
+ # 39: 100,
175
+ # }
176
  )
177
  time_cost = time.time() - time_begin
178
  print(f"time_cost: {time_cost}")
llm_eval_script/byteplus_chat.py CHANGED
@@ -13,6 +13,7 @@ from datetime import datetime
13
  import json
14
  import os
15
  from pathlib import Path
 
16
  import sys
17
  import time
18
  from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
@@ -50,7 +51,8 @@ def get_args():
50
  parser.add_argument(
51
  "--eval_dataset_name",
52
  # default="agent-lingoace-zh-80-chat.jsonl",
53
- default="agent-bingoplus-ph-200-chat.jsonl",
 
54
  type=str
55
  )
56
  parser.add_argument(
@@ -140,6 +142,38 @@ def main():
140
  continue
141
  finished_idx_set.add(idx)
142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  try:
144
  time.sleep(args.interval)
145
  print(f"sleep: {args.interval}")
@@ -148,9 +182,7 @@ def main():
148
  # https://docs.byteplus.com/en/docs/ModelArk/1449737
149
  llm_response = client.chat.completions.create(
150
  model=args.model_name,
151
- messages=[
152
- {"role": "user", "content": prompt},
153
- ],
154
  stream=False,
155
  max_tokens=4096,
156
  extra_body={
 
13
  import json
14
  import os
15
  from pathlib import Path
16
+ import re
17
  import sys
18
  import time
19
  from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
 
51
  parser.add_argument(
52
  "--eval_dataset_name",
53
  # default="agent-lingoace-zh-80-chat.jsonl",
54
+ # default="agent-bingoplus-ph-200-chat.jsonl",
55
+ default="agent-cod-zh-70-chat.jsonl",
56
  type=str
57
  )
58
  parser.add_argument(
 
142
  continue
143
  finished_idx_set.add(idx)
144
 
145
+ # prompt
146
+ splits = prompt[::-1].split("\n\n", maxsplit=1)
147
+ conversation = splits[0]
148
+ system_prompt = splits[1]
149
+ conversation = conversation[::-1].strip()
150
+ system_prompt = system_prompt[::-1].strip()
151
+
152
+ pattern = "^(Client|Assistant): (.*?)(?=\n(?:Client|Assistant):)"
153
+ match = re.findall(pattern=pattern, string=conversation, flags=re.I|re.DOTALL|re.MULTILINE)
154
+
155
+ messages_ = list()
156
+ for m in match:
157
+ role = m[0].lower()
158
+ content = m[1]
159
+ if role in ("client", "Client"):
160
+ role = "user"
161
+ elif role in ("assistant", "Assistant"):
162
+ role = "assistant"
163
+ else:
164
+ raise AssertionError
165
+ messages_.append({
166
+ "role": role,
167
+ "content": content
168
+ })
169
+
170
+ messages = [
171
+ {"role": "system", "content": system_prompt},
172
+ *messages_
173
+ ]
174
+ # print(json.dumps(messages, ensure_ascii=False, indent=4))
175
+ # exit(0)
176
+
177
  try:
178
  time.sleep(args.interval)
179
  print(f"sleep: {args.interval}")
 
182
  # https://docs.byteplus.com/en/docs/ModelArk/1449737
183
  llm_response = client.chat.completions.create(
184
  model=args.model_name,
185
+ messages=messages,
 
 
186
  stream=False,
187
  max_tokens=4096,
188
  extra_body={
llm_eval_script/byteplus_summary.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://docs.byteplus.com/en/docs/ModelArk/1099455
5
+
6
+ model list
7
+ https://docs.byteplus.com/en/docs/ModelArk/1330310
8
+
9
+ https://docs.byteplus.com/en/docs/ModelArk/Chat
10
+ """
11
+ import argparse
12
+ from datetime import datetime
13
+ import json
14
+ import os
15
+ from pathlib import Path
16
+ import re
17
+ import sys
18
+ import time
19
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
20
+
21
+ pwd = os.path.abspath(os.path.dirname(__file__))
22
+ sys.path.append(os.path.join(pwd, "../"))
23
+
24
+ from openai import OpenAI
25
+
26
+ from project_settings import environment, project_path
27
+
28
+
29
+ def get_args():
30
+ """
31
+ model list:
32
+ https://docs.byteplus.com/en/docs/ModelArk/1330310
33
+
34
+ bytedance-seed-1.6
35
+ seed-1-6-250615
36
+
37
+ bytedance-seed-1.6-flash
38
+ seed-1-6-flash-250615
39
+
40
+ deepseek-v3
41
+ deepseek-v3-250324
42
+ """
43
+ parser = argparse.ArgumentParser()
44
+ parser.add_argument(
45
+ "--model_name",
46
+ # default="seed-1-6-250615",
47
+ default="seed-1-6-flash-250615",
48
+ # default="deepseek-v3-250324",
49
+ type=str
50
+ )
51
+ parser.add_argument(
52
+ "--eval_dataset_name",
53
+ default="agent-bingoplus-ph-25-summary.jsonl",
54
+ type=str
55
+ )
56
+ parser.add_argument(
57
+ "--eval_dataset_dir",
58
+ default=(project_path / "data/dataset").as_posix(),
59
+ type=str
60
+ )
61
+ parser.add_argument(
62
+ "--eval_data_dir",
63
+ default=(project_path / "data/eval_data").as_posix(),
64
+ type=str
65
+ )
66
+ parser.add_argument(
67
+ "--client",
68
+ default="shenzhen_sase",
69
+ type=str
70
+ )
71
+ parser.add_argument(
72
+ "--service",
73
+ default="byteplus_api_key",
74
+ type=str
75
+ )
76
+ parser.add_argument(
77
+ "--create_time_str",
78
+ default="null",
79
+ # default="20250728_113641",
80
+ type=str
81
+ )
82
+ parser.add_argument(
83
+ "--interval",
84
+ default=1,
85
+ type=int
86
+ )
87
+ args = parser.parse_args()
88
+ return args
89
+
90
+
91
+ def main():
92
+ args = get_args()
93
+
94
+ eval_dataset_dir = Path(args.eval_dataset_dir)
95
+ eval_dataset_dir.mkdir(parents=True, exist_ok=True)
96
+ eval_data_dir = Path(args.eval_data_dir)
97
+ eval_data_dir.mkdir(parents=True, exist_ok=True)
98
+
99
+ if args.create_time_str == "null":
100
+ tz = ZoneInfo("Asia/Shanghai")
101
+ now = datetime.now(tz)
102
+ create_time_str = now.strftime("%Y%m%d_%H%M%S")
103
+ # create_time_str = "20250724_090615"
104
+ else:
105
+ create_time_str = args.create_time_str
106
+
107
+ eval_dataset = eval_dataset_dir / args.eval_dataset_name
108
+
109
+ output_file = eval_data_dir / f"byteplus/byteplus/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
110
+ output_file.parent.mkdir(parents=True, exist_ok=True)
111
+
112
+ api_key = environment.get(args.service, dtype=str)
113
+ client = OpenAI(
114
+ base_url="https://ark.ap-southeast.bytepluses.com/api/v3/",
115
+ # Read your Ark API Key from the environment variable.
116
+ api_key=api_key
117
+ )
118
+
119
+ total = 0
120
+ total_score = 0
121
+
122
+ # finished
123
+ finished_idx_set = set()
124
+ if os.path.exists(output_file.as_posix()):
125
+ with open(output_file.as_posix(), "r", encoding="utf-8") as f:
126
+ for row in f:
127
+ row = json.loads(row)
128
+ idx = row["idx"]
129
+ total = row["total"]
130
+ total_score = row["total_score"]
131
+ finished_idx_set.add(idx)
132
+ print(f"finished count: {len(finished_idx_set)}")
133
+
134
+ with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
135
+ for row in fin:
136
+ row = json.loads(row)
137
+ idx = row["idx"]
138
+ system_prompt: str = row["system_prompt"]
139
+ user_prompt: str = row["user_prompt"]
140
+ response = row["response"]
141
+
142
+ if idx in finished_idx_set:
143
+ continue
144
+ finished_idx_set.add(idx)
145
+
146
+ try:
147
+ time.sleep(args.interval)
148
+ print(f"sleep: {args.interval}")
149
+ time_begin = time.time()
150
+
151
+ # https://docs.byteplus.com/en/docs/ModelArk/1449737
152
+ llm_response = client.chat.completions.create(
153
+ model=args.model_name,
154
+ messages=[
155
+ {"role": "system", "content": system_prompt},
156
+ {"role": "user", "content": user_prompt},
157
+ ],
158
+ stream=False,
159
+ max_tokens=4096,
160
+ extra_body={
161
+ "thinking": {
162
+ "type": "disabled",
163
+ # "type": "enabled",
164
+ }
165
+ }
166
+ )
167
+ time_cost = time.time() - time_begin
168
+ print(f"time_cost: {time_cost}")
169
+ except Exception as e:
170
+ print(f"request failed, error type: {type(e)}, error text: {str(e)}")
171
+ continue
172
+
173
+ prediction = llm_response.choices[0].message.content
174
+
175
+ response_ = json.loads(response)
176
+ response_tag_name_list = response_["tag_name_list"]
177
+ # print(response_tag_name_list)
178
+
179
+ if prediction.startswith("```json") and prediction.endswith("```"):
180
+ prediction_ = prediction[7:-3]
181
+ else:
182
+ prediction_ = prediction
183
+
184
+ prediction_tag_name_list = list()
185
+ try:
186
+ prediction_ = json.loads(prediction_)
187
+ prediction_tag_name_list = prediction_["tag_name_list"]
188
+ except json.JSONDecodeError:
189
+ pass
190
+ # print(prediction_tag_name_list)
191
+
192
+ # recall
193
+ recall_count = 0
194
+ for tag in response_tag_name_list:
195
+ if tag in prediction_tag_name_list:
196
+ recall_count += 1
197
+ recall = recall_count / (len(response_tag_name_list) + 1e-7)
198
+
199
+ # precision
200
+ precision_count = 0
201
+ for tag in prediction_tag_name_list:
202
+ if tag in response_tag_name_list:
203
+ precision_count += 1
204
+ precision = precision_count / (len(prediction_tag_name_list) + 1e-7)
205
+
206
+ # f1
207
+ f1 = 2 * (recall * precision) / (recall + precision + 1e-7)
208
+
209
+ total += 1
210
+ total_score += f1
211
+ score = total_score / total
212
+
213
+ row_ = {
214
+ "idx": idx,
215
+ "system_prompt": system_prompt,
216
+ "user_prompt": user_prompt,
217
+ "response": response,
218
+ "prediction": prediction,
219
+ "recall": recall,
220
+ "precision": precision,
221
+ "f1": f1,
222
+ "total": total,
223
+ "total_score": total_score,
224
+ "score": score,
225
+ "time_cost": time_cost,
226
+ }
227
+ row_ = json.dumps(row_, ensure_ascii=False)
228
+ fout.write(f"{row_}\n")
229
+ fout.flush()
230
+
231
+ return
232
+
233
+
234
+ if __name__ == "__main__":
235
+ main()
llm_eval_script/gemini_google.py CHANGED
@@ -18,6 +18,12 @@ llama-4-scout-17b-16e-instruct-maas
18
 
19
 
20
 
 
 
 
 
 
 
21
 
22
  """
23
  import argparse
@@ -44,10 +50,8 @@ def get_args():
44
  parser.add_argument(
45
  "--model_name",
46
  # default="gemini-2.5-pro", # The model does not support setting thinking_budget to 0.
47
- # default="gemini-2.5-flash",
48
  # default="gemini-2.5-flash-lite-preview-06-17",
49
- # default="claude-opus-4@20250514",
50
- default="claude-sonnet-4@20250514",
51
  # default="llama-4-maverick-17b-128e-instruct-maas",
52
  # default="llama-4-scout-17b-16e-instruct-maas",
53
  type=str
@@ -131,8 +135,8 @@ def main():
131
  client = genai.Client(
132
  vertexai=True,
133
  project=project_id,
134
- # location="global",
135
- location="us-east5",
136
  )
137
  generate_content_config = types.GenerateContentConfig(
138
  top_p=0.95,
 
18
 
19
 
20
 
21
+ Model Name
22
+
23
+ gemini-2.5-pro
24
+ The model does not support setting thinking_budget to 0.
25
+ Unable to submit request because thinking_budget is out of range; supported values are integers from 128 to 32768.
26
+
27
 
28
  """
29
  import argparse
 
50
  parser.add_argument(
51
  "--model_name",
52
  # default="gemini-2.5-pro", # The model does not support setting thinking_budget to 0.
53
+ default="gemini-2.5-flash",
54
  # default="gemini-2.5-flash-lite-preview-06-17",
 
 
55
  # default="llama-4-maverick-17b-128e-instruct-maas",
56
  # default="llama-4-scout-17b-16e-instruct-maas",
57
  type=str
 
135
  client = genai.Client(
136
  vertexai=True,
137
  project=project_id,
138
+ location="global",
139
+ # location="us-east5",
140
  )
141
  generate_content_config = types.GenerateContentConfig(
142
  top_p=0.95,
llm_eval_script/gemini_google_chat.py CHANGED
@@ -5,6 +5,7 @@ from datetime import datetime
5
  import json
6
  import os
7
  from pathlib import Path
 
8
  import sys
9
  import time
10
  import tempfile
@@ -24,16 +25,17 @@ def get_args():
24
  parser.add_argument(
25
  "--model_name",
26
  # default="gemini-2.5-pro", # The model does not support setting thinking_budget to 0.
27
- # default="gemini-2.5-flash",
28
  # default="gemini-2.5-flash-lite-preview-06-17",
29
  # default="llama-4-maverick-17b-128e-instruct-maas",
30
- default="llama-4-scout-17b-16e-instruct-maas",
31
  type=str
32
  )
33
  parser.add_argument(
34
  "--eval_dataset_name",
35
  # default="agent-lingoace-zh-80-chat.jsonl",
36
- default="agent-bingoplus-ph-200-chat.jsonl",
 
37
  type=str
38
  )
39
  parser.add_argument(
@@ -58,8 +60,8 @@ def get_args():
58
  )
59
  parser.add_argument(
60
  "--create_time_str",
61
- # default="null",
62
- default="20250731_162116",
63
  type=str
64
  )
65
  parser.add_argument(
@@ -145,13 +147,46 @@ def main():
145
  continue
146
  finished_idx_set.add(idx)
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  contents = [
149
  types.Content(
150
- role="user",
151
  parts=[
152
- types.Part.from_text(text=prompt)
153
  ]
154
  )
 
155
  ]
156
  time.sleep(args.interval)
157
  print(f"sleep: {args.interval}")
@@ -181,6 +216,7 @@ def main():
181
  }
182
  row_ = json.dumps(row_, ensure_ascii=False)
183
  fout.write(f"{row_}\n")
 
184
 
185
  return
186
 
 
5
  import json
6
  import os
7
  from pathlib import Path
8
+ import re
9
  import sys
10
  import time
11
  import tempfile
 
25
  parser.add_argument(
26
  "--model_name",
27
  # default="gemini-2.5-pro", # The model does not support setting thinking_budget to 0.
28
+ default="gemini-2.5-flash",
29
  # default="gemini-2.5-flash-lite-preview-06-17",
30
  # default="llama-4-maverick-17b-128e-instruct-maas",
31
+ # default="llama-4-scout-17b-16e-instruct-maas",
32
  type=str
33
  )
34
  parser.add_argument(
35
  "--eval_dataset_name",
36
  # default="agent-lingoace-zh-80-chat.jsonl",
37
+ # default="agent-bingoplus-ph-200-chat.jsonl",
38
+ default="agent-cod-zh-70-chat.jsonl",
39
  type=str
40
  )
41
  parser.add_argument(
 
60
  )
61
  parser.add_argument(
62
  "--create_time_str",
63
+ default="null",
64
+ # default="20250731_162116",
65
  type=str
66
  )
67
  parser.add_argument(
 
147
  continue
148
  finished_idx_set.add(idx)
149
 
150
+ # prompt
151
+ splits = prompt[::-1].split("\n\n", maxsplit=1)
152
+ conversation = splits[0]
153
+ system_prompt = splits[1]
154
+ conversation = conversation[::-1].strip()
155
+ system_prompt = system_prompt[::-1].strip()
156
+
157
+ pattern = "^(Client|Assistant): (.*?)(?=\n(?:Client|Assistant):)"
158
+ match = re.findall(pattern=pattern, string=conversation, flags=re.I|re.DOTALL|re.MULTILINE)
159
+
160
+ messages_ = list()
161
+ for m in match:
162
+ role = m[0].lower()
163
+ content = m[1]
164
+ if role in ("client", "Client"):
165
+ role = "user"
166
+ elif role in ("assistant", "Assistant"):
167
+ role = "assistant"
168
+ else:
169
+ raise AssertionError
170
+ messages_.append({
171
+ "role": role,
172
+ "content": content
173
+ })
174
+
175
+ messages = [
176
+ {"role": "system", "content": system_prompt},
177
+ *messages_
178
+ ]
179
+ # print(json.dumps(messages, ensure_ascii=False, indent=4))
180
+ # exit(0)
181
+
182
  contents = [
183
  types.Content(
184
+ role="user" if m["role"] == "user" else "model",
185
  parts=[
186
+ types.Part.from_text(text=m["content"])
187
  ]
188
  )
189
+ for m in messages
190
  ]
191
  time.sleep(args.interval)
192
  print(f"sleep: {args.interval}")
 
216
  }
217
  row_ = json.dumps(row_, ensure_ascii=False)
218
  fout.write(f"{row_}\n")
219
+ fout.flush()
220
 
221
  return
222
 
llm_eval_script/gemini_google_summary.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ from datetime import datetime
5
+ import json
6
+ import os
7
+ from pathlib import Path
8
+ import re
9
+ import sys
10
+ import time
11
+ import tempfile
12
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
13
+
14
+ pwd = os.path.abspath(os.path.dirname(__file__))
15
+ sys.path.append(os.path.join(pwd, "../"))
16
+
17
+ from google import genai
18
+ from google.genai import types
19
+
20
+ from project_settings import environment, project_path
21
+
22
+
23
+ def get_args():
24
+ parser = argparse.ArgumentParser()
25
+ parser.add_argument(
26
+ "--model_name",
27
+ # default="gemini-2.5-pro", # The model does not support setting thinking_budget to 0.
28
+ default="gemini-2.5-flash",
29
+ # default="gemini-2.5-flash-lite-preview-06-17",
30
+ # default="llama-4-maverick-17b-128e-instruct-maas",
31
+ # default="llama-4-scout-17b-16e-instruct-maas",
32
+ type=str
33
+ )
34
+ parser.add_argument(
35
+ "--eval_dataset_name",
36
+ default="agent-bingoplus-ph-25-summary.jsonl",
37
+ type=str
38
+ )
39
+ parser.add_argument(
40
+ "--eval_dataset_dir",
41
+ default=(project_path / "data/dataset").as_posix(),
42
+ type=str
43
+ )
44
+ parser.add_argument(
45
+ "--eval_data_dir",
46
+ default=(project_path / "data/eval_data").as_posix(),
47
+ type=str
48
+ )
49
+ parser.add_argument(
50
+ "--client",
51
+ default="shenzhen_sase",
52
+ type=str
53
+ )
54
+ parser.add_argument(
55
+ "--service",
56
+ default="google_potent_veld_462405_t3",
57
+ type=str
58
+ )
59
+ parser.add_argument(
60
+ "--create_time_str",
61
+ default="null",
62
+ # default="20250731_162116",
63
+ type=str
64
+ )
65
+ parser.add_argument(
66
+ "--interval",
67
+ default=1,
68
+ type=int
69
+ )
70
+ args = parser.parse_args()
71
+ return args
72
+
73
+
74
def main():
    """Run the tag-summary evaluation against a Gemini model on Vertex AI.

    Each dataset row supplies ``idx`` / ``system_prompt`` / ``user_prompt`` /
    ``response`` (the reference answer: a JSON object with a "tag_name_list").
    The model prediction is parsed the same way and scored with recall /
    precision / F1; the raw texts plus a running average are appended to the
    output jsonl.  Rows whose ``idx`` already appears in the output file are
    skipped, so an interrupted run can be resumed via --create_time_str.
    """
    args = get_args()

    service = environment.get(args.service, dtype=json.loads)
    project_id = service["project_id"]

    # Vertex AI authenticates through a service-account JSON file referenced
    # by GOOGLE_APPLICATION_CREDENTIALS; materialize the credentials to a
    # temp file first.
    google_application_credentials = Path(tempfile.gettempdir()) / f"llm_eval_system/{project_id}.json"
    google_application_credentials.parent.mkdir(parents=True, exist_ok=True)

    with open(google_application_credentials.as_posix(), "w", encoding="utf-8") as f:
        content = json.dumps(service, ensure_ascii=False, indent=4)
        f.write(f"{content}\n")

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_application_credentials.as_posix()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
        # create_time_str = "20250729-interval-5"
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    output_file = eval_data_dir / f"gemini_google/google/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    client = genai.Client(
        vertexai=True,
        project=project_id,
        # location="global",
        location="us-east5",
    )
    generate_content_config = types.GenerateContentConfig(
        top_p=0.95,
        temperature=0.6,
        # max_output_tokens=1,
        response_modalities=["TEXT"],
        # Disable thinking (gemini-2.5-pro rejects thinking_budget=0).
        thinking_config=types.ThinkingConfig(
            thinking_budget=0
        )
    )

    total = 0
    total_score = 0

    # Restore the running totals and the set of already-scored rows so the
    # run is resumable.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                total_score = row["total_score"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            system_prompt: str = row["system_prompt"]
            user_prompt: str = row["user_prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            # NOTE(review): the system prompt is sent as a "model" turn since
            # `contents` only carries user/model roles; consider
            # GenerateContentConfig(system_instruction=...) instead — confirm
            # against the google-genai SDK docs.
            contents = [
                types.Content(
                    role="model",
                    parts=[
                        types.Part.from_text(text=system_prompt)
                    ]
                ),
                types.Content(
                    role="user",
                    parts=[
                        types.Part.from_text(text=user_prompt)
                    ]
                )
            ]
            time.sleep(args.interval)
            print(f"sleep: {args.interval}")
            time_begin = time.time()
            try:
                llm_response: types.GenerateContentResponse = client.models.generate_content(
                    model=args.model_name,
                    contents=contents,
                    config=generate_content_config,
                )
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")
                # candidates/content/parts may be missing on blocked or failed
                # responses; treated as a failed request below.
                prediction = llm_response.candidates[0].content.parts[0].text
            except Exception as e:
                # Skip transient API failures instead of aborting the whole
                # run (consistent with the other eval scripts in this repo).
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue
            if prediction is None:
                # An empty text part would otherwise crash on .startswith below.
                prediction = ""

            response_ = json.loads(response)
            response_tag_name_list = response_["tag_name_list"]
            # print(response_tag_name_list)

            # Strip an optional ```json ... ``` fence around the prediction.
            if prediction.startswith("```json") and prediction.endswith("```"):
                prediction_ = prediction[7:-3]
            else:
                prediction_ = prediction

            prediction_tag_name_list = list()
            try:
                prediction_ = json.loads(prediction_)
                prediction_tag_name_list = prediction_["tag_name_list"]
            except json.JSONDecodeError:
                # An unparseable prediction counts as zero predicted tags.
                pass
            # print(prediction_tag_name_list)

            # recall: fraction of reference tags recovered by the prediction.
            recall_count = 0
            for tag in response_tag_name_list:
                if tag in prediction_tag_name_list:
                    recall_count += 1
            recall = recall_count / (len(response_tag_name_list) + 1e-7)

            # precision: fraction of predicted tags present in the reference.
            precision_count = 0
            for tag in prediction_tag_name_list:
                if tag in response_tag_name_list:
                    precision_count += 1
            precision = precision_count / (len(prediction_tag_name_list) + 1e-7)

            # f1 (epsilon keeps the division defined when both terms are zero).
            f1 = 2 * (recall * precision) / (recall + precision + 1e-7)

            total += 1
            total_score += f1
            score = total_score / total

            row_ = {
                "idx": idx,
                "system_prompt": system_prompt,
                "user_prompt": user_prompt,
                "response": response,
                "prediction": prediction,
                "recall": recall,
                "precision": precision,
                "f1": f1,
                "total": total,
                "total_score": total_score,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            # Flush per row so a killed run can be resumed without losing rows.
            fout.flush()

    return


if __name__ == "__main__":
    main()
llm_eval_script/siliconflow_summary.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://cloud.siliconflow.cn/sft-d1rosn8o8n4s73ftpa1g/playground/chat/17885302852
5
+
6
+ https://docs.siliconflow.cn/cn/userguide/capabilities/reasoning
7
+
8
+ Model Name:
9
+ Pro/deepseek-ai/DeepSeek-R1
10
+ Tips:
11
+ (1)thinking_budget: Must be greater than or equal to 1
12
+ (2)The selected model requires paid balance. Your paid balance is insufficient. Please top up and try again.
13
+
14
+ Model Name:
15
+ tencent/Hunyuan-A13B-Instruct
16
+ Tips:
17
+ (1)它在回答时总是会先思考,最后给出答案.这适合知识问答,但不符合我们Agent的需求. 后来强制其只能输出 A-E 中的一个字符(max_tokens=4),以完成评估.
18
+ max_tokens=4,
19
+ logit_bias={
20
+ 32: 100,
21
+ 33: 100,
22
+ 34: 100,
23
+ 35: 100,
24
+ 36: 100,
25
+ 37: 100,
26
+ },
27
+
28
+ Model Name:
29
+ deepseek-ai/DeepSeek-R1
30
+ Tips:
31
+ (1)为了让它只输出一个字符,设置 max_tokens=3
32
+
33
+
34
+ Model Name:
35
+ Qwen/Qwen3-8B
36
+ deepseek-ai/DeepSeek-R1-0528-Qwen3-8B
37
+ deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
38
+ Tips:
39
+ (1)为了让它只输出一个字符,设置 max_tokens=1
40
+
41
+
42
+
43
+ Model Name:
44
+ baidu/ERNIE-4.5-300B-A47B
45
+ Tips:
46
+ (1)它可能使用的是bpe 分词, logit_bias 注释掉。
47
+
48
+ """
49
+ import argparse
50
+ from datetime import datetime
51
+ import json
52
+ import os
53
+ from pathlib import Path
54
+ import sys
55
+ import time
56
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
57
+
58
+ pwd = os.path.abspath(os.path.dirname(__file__))
59
+ sys.path.append(os.path.join(pwd, "../"))
60
+
61
+ from openai import OpenAI
62
+
63
+ from project_settings import environment, project_path
64
+
65
+
66
def get_args():
    """Parse command-line options for the SiliconFlow tag-summary evaluation run."""
    parser = argparse.ArgumentParser()
    # Alternative models kept for reference (see the module docstring for the
    # per-model quirks: max_tokens / logit_bias / thinking workarounds):
    #   Pro/deepseek-ai/DeepSeek-R1
    #   tencent/Hunyuan-A13B-Instruct
    #   Qwen/Qwen3-8B
    #   deepseek-ai/DeepSeek-R1
    #   deepseek-ai/DeepSeek-R1-0528-Qwen3-8B
    #   deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
    #   baidu/ERNIE-4.5-300B-A47B
    parser.add_argument("--model_name", default="deepseek-ai/DeepSeek-V3", type=str)
    parser.add_argument("--eval_dataset_name", default="agent-bingoplus-ph-25-summary.jsonl", type=str)
    parser.add_argument("--eval_dataset_dir", default=(project_path / "data/dataset").as_posix(), type=str)
    parser.add_argument("--eval_data_dir", default=(project_path / "data/eval_data").as_posix(), type=str)
    parser.add_argument("--client", default="shenzhen_sase", type=str)
    parser.add_argument("--service", default="siliconflow_api_key", type=str)
    # Set to a concrete timestamp (e.g. "20250728_113641") to resume a previous
    # run; "null" means: generate a fresh timestamp now.
    parser.add_argument("--create_time_str", default="null", type=str)
    # Seconds to sleep between requests (simple rate limiting).
    parser.add_argument("--interval", default=1, type=int)
    return parser.parse_args()
118
+
119
+
120
def main():
    """Run the tag-summary evaluation against a SiliconFlow-hosted model.

    Each dataset row supplies ``idx`` / ``system_prompt`` / ``user_prompt`` /
    ``response`` (the reference answer: a JSON object with a "tag_name_list").
    The model prediction is parsed the same way and scored with recall /
    precision / F1; the raw texts plus a running average are appended to the
    output jsonl.  Rows whose ``idx`` already appears in the output file are
    skipped, so an interrupted run can be resumed via --create_time_str.
    """
    args = get_args()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
        # create_time_str = "20250724_090615"
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    # "/" in model names (e.g. "deepseek-ai/DeepSeek-V3") would create extra
    # path segments; replace it so the model maps to a single directory name.
    model_name_ = args.model_name.replace("/", "#")
    output_file = eval_data_dir / f"siliconflow/siliconflow/{model_name_}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    api_key = environment.get(args.service, dtype=str)
    client = OpenAI(
        base_url="https://api.siliconflow.cn/v1/",
        # Read your Ark API Key from the environment variable.
        api_key=api_key
    )

    total = 0
    total_score = 0

    # Restore the running totals and the set of already-scored rows so the
    # run is resumable.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                total_score = row["total_score"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            system_prompt: str = row["system_prompt"]
            user_prompt: str = row["user_prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            try:
                time.sleep(args.interval)
                print(f"sleep: {args.interval}")
                time_begin = time.time()
                completion = client.chat.completions.create(
                    model=args.model_name,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_prompt},
                    ],
                    stream=False,
                    max_tokens=4096,
                    # max_tokens=1,
                    temperature=0.6,
                    top_p=0.95,
                    # Per-model logit_bias workaround — see the module docstring.
                    # logit_bias={
                    #     32: 100,
                    #     33: 100,
                    #     34: 100,
                    #     35: 100,
                    #     36: 100,
                    #     37: 100,
                    #     38: 100,
                    #     39: 100,
                    # },
                    # thinking_budget must be >= 1 on this endpoint.
                    extra_body={
                        "thinking_budget": 1
                    }
                )
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")
            except Exception as e:
                # Skip transient API failures instead of aborting the whole run.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            # message.content can be None (e.g. refusal / empty completion);
            # treat that as an empty prediction instead of crashing below.
            prediction = completion.choices[0].message.content
            if prediction is None:
                prediction = ""

            response_ = json.loads(response)
            response_tag_name_list = response_["tag_name_list"]
            # print(response_tag_name_list)

            # Strip an optional ```json ... ``` fence around the prediction.
            if prediction.startswith("```json") and prediction.endswith("```"):
                prediction_ = prediction[7:-3]
            else:
                prediction_ = prediction

            prediction_tag_name_list = list()
            try:
                prediction_ = json.loads(prediction_)
                prediction_tag_name_list = prediction_["tag_name_list"]
            except json.JSONDecodeError:
                # An unparseable prediction counts as zero predicted tags.
                pass
            # print(prediction_tag_name_list)

            # recall: fraction of reference tags recovered by the prediction.
            recall_count = 0
            for tag in response_tag_name_list:
                if tag in prediction_tag_name_list:
                    recall_count += 1
            recall = recall_count / (len(response_tag_name_list) + 1e-7)

            # precision: fraction of predicted tags present in the reference.
            precision_count = 0
            for tag in prediction_tag_name_list:
                if tag in response_tag_name_list:
                    precision_count += 1
            precision = precision_count / (len(prediction_tag_name_list) + 1e-7)

            # f1 (epsilon keeps the division defined when both terms are zero).
            f1 = 2 * (recall * precision) / (recall + precision + 1e-7)

            total += 1
            total_score += f1
            score = total_score / total

            row_ = {
                "idx": idx,
                "system_prompt": system_prompt,
                "user_prompt": user_prompt,
                "response": response,
                "prediction": prediction,
                "recall": recall,
                "precision": precision,
                "f1": f1,
                "total": total,
                "total_score": total_score,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            # Flush per row so a killed run can be resumed without losing rows.
            fout.flush()

    return


if __name__ == "__main__":
    main()
main.py CHANGED
@@ -132,6 +132,8 @@ def load_board():
132
  score_ = row["correct"]
133
  elif name.endswith("-chat"):
134
  score_ = row["score"]
 
 
135
  else:
136
  raise AssertionError
137
 
@@ -296,7 +298,8 @@ def main():
296
  value=llm_ranking_board,
297
  max_height=800, min_width=160,
298
  label="board",
299
- interactive=True, show_search="search"
 
300
  )
301
 
302
  board_button.click(
 
132
  score_ = row["correct"]
133
  elif name.endswith("-chat"):
134
  score_ = row["score"]
135
+ elif name.endswith("-summary"):
136
+ score_ = row["score"]
137
  else:
138
  raise AssertionError
139
 
 
298
  value=llm_ranking_board,
299
  max_height=800, min_width=160,
300
  label="board",
301
+ # interactive=True,
302
+ show_search="search"
303
  )
304
 
305
  board_button.click(