HoneyTian committed on
Commit
adb1e77
·
1 Parent(s): 4fb65fb
Files changed (42) hide show
  1. .gitignore +2 -1
  2. data/dataset/agent-bingoplus-ph-200-chat.jsonl +3 -0
  3. data/dataset/agent-bingoplus-ph-90-choice.jsonl +3 -0
  4. data/eval_data/azure_openai/azure/gpt-4o-mini/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250729-interval-5/agent-bingoplus-ph-200-chat.jsonl +3 -0
  5. data/eval_data/azure_openai/azure/gpt-4o-mini/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250729-interval-5/agent-bingoplus-ph-90-choice.jsonl +3 -0
  6. data/eval_data/azure_openai/azure/gpt-4o/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250729-interval-5/agent-bingoplus-ph-200-chat.jsonl +3 -0
  7. data/eval_data/azure_openai/azure/gpt-4o/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250729-interval-5/agent-bingoplus-ph-90-choice.jsonl +3 -0
  8. data/eval_data/byteplus/byteplus/seed-1-6-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-lingoace-zh-400-choice.jsonl +3 -0
  9. data/eval_data/byteplus/byteplus/seed-1-6-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-lingoace-zh-80-chat.jsonl +3 -0
  10. data/eval_data/byteplus/byteplus/seed-1-6-250615/shenzhen_sase/byteplus_api_key/20250728_113641/arc-easy-1000-choice.jsonl +2 -2
  11. data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-lingoace-zh-400-choice.jsonl +3 -0
  12. data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-lingoace-zh-80-chat.jsonl +3 -0
  13. data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/arc-easy-1000-choice.jsonl +3 -0
  14. data/eval_data/gemini_google/google/gemini-2.5-flash-lite-preview-06-17/shenzhen_sase/google_potent_veld_462405_t3/20250729_161543/agent-bingoplus-ph-200-chat.jsonl +3 -0
  15. data/eval_data/gemini_google/google/gemini-2.5-flash-lite-preview-06-17/shenzhen_sase/google_potent_veld_462405_t3/20250729_161543/agent-bingoplus-ph-90-choice.jsonl +3 -0
  16. data/eval_data/gemini_google/google/gemini-2.5-flash/shenzhen_sase/google_potent_veld_462405_t3/20250729_161416/agent-bingoplus-ph-200-chat.jsonl +3 -0
  17. data/eval_data/gemini_google/google/gemini-2.5-flash/shenzhen_sase/google_potent_veld_462405_t3/20250729_161416/agent-bingoplus-ph-90-choice.jsonl +3 -0
  18. data/eval_data/siliconflow/siliconflow/deepseek-ai#DeepSeek-V3/shenzhen_sase/siliconflow_api_key/20250729_162534/agent-bingoplus-ph-200-chat.jsonl +3 -0
  19. data/eval_data/siliconflow/siliconflow/deepseek-ai#DeepSeek-V3/shenzhen_sase/siliconflow_api_key/20250729_162534/agent-bingoplus-ph-90-choice.jsonl +3 -0
  20. examples/make_dataset/make_bingoplus_ph_200_chat.py +73 -0
  21. examples/make_dataset/make_bingoplus_ph_90_choice.py +70 -0
  22. examples/make_dataset/{make_lingoace_400_choice.py → make_lingoace_zh_400_choice.py} +0 -0
  23. examples/make_dataset/{make_lingoace_80_chat.py → make_lingoace_zh_80_chat.py} +0 -0
  24. examples/make_raw_dataset/step_1_make_hk_dataset_by_log.py +195 -0
  25. examples/make_raw_dataset/step_1_make_usa_dataset_by_log.py +196 -0
  26. examples/make_raw_dataset/step_2_filter_choice_json.py +75 -0
  27. examples/make_raw_dataset/step_3_filter_by_keywords.py +70 -0
  28. examples/make_raw_dataset/step_4_filter_by_language.py +89 -0
  29. examples/make_raw_dataset/step_5_filter_by_client_content.py +70 -0
  30. examples/make_raw_dataset/step_6_filter_by_choice.py +61 -0
  31. examples/make_raw_dataset/step_7_remove_by_client_no_response.py +65 -0
  32. examples/test_metrics/bingoplus_chat_metric.py +405 -0
  33. examples/test_metrics/bingoplus_chat_metric_test.py +325 -0
  34. examples/test_metrics/lingoace_chat_metric.py +2 -2
  35. llm_eval_script/azure_openai.py +6 -4
  36. llm_eval_script/azure_openai_chat.py +7 -5
  37. llm_eval_script/byteplus.py +15 -4
  38. llm_eval_script/byteplus_chat.py +188 -0
  39. llm_eval_script/gemini_google.py +4 -3
  40. llm_eval_script/gemini_google_chat.py +24 -6
  41. llm_eval_script/siliconflow.py +2 -1
  42. llm_eval_script/siliconflow_chat.py +4 -3
.gitignore CHANGED
@@ -5,7 +5,8 @@
5
  #/data/
6
  /data/comment
7
  #/data/eval_data
8
- data/llm-log
 
9
  /data/raw_dataset
10
  /dotenv/
11
  /logs/
 
5
  #/data/
6
  /data/comment
7
  #/data/eval_data
8
+ data/llm-log-hk
9
+ data/llm-log-usa
10
  /data/raw_dataset
11
  /dotenv/
12
  /logs/
data/dataset/agent-bingoplus-ph-200-chat.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc11589ee643a5c50e9d738d32c87aae9a640fbb843fe3254890506ebeaea70b
3
+ size 2174134
data/dataset/agent-bingoplus-ph-90-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8c67a27e9765f42a66c02068d0c1caa5559e60a77497ab5ed61bb24f2955463
3
+ size 247260
data/eval_data/azure_openai/azure/gpt-4o-mini/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250729-interval-5/agent-bingoplus-ph-200-chat.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf3cb56ef48b63f3d55804f97b7f80869613f4b3c2caef8c6300f970da5f625d
3
+ size 2406951
data/eval_data/azure_openai/azure/gpt-4o-mini/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250729-interval-5/agent-bingoplus-ph-90-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:112775fc79e439d1a30a69b7631c1d6e25126021c5b0a6eaccdee4754c37e3dd
3
+ size 258526
data/eval_data/azure_openai/azure/gpt-4o/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250729-interval-5/agent-bingoplus-ph-200-chat.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cafe1c9ab785683678afee5c81dc1c9795c5ab27a625a9d7a2439bb23880e318
3
+ size 2411012
data/eval_data/azure_openai/azure/gpt-4o/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250729-interval-5/agent-bingoplus-ph-90-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6d962451c175761ff7e6bf937d31fe46f928d6ad4aecb67afcb7e6164341647
3
+ size 258555
data/eval_data/byteplus/byteplus/seed-1-6-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-lingoace-zh-400-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ff0d88e40a5c3b1a1979bf6713d5080b0d1c67592ca7bd6d0b94bb010ffd6df
3
+ size 1206002
data/eval_data/byteplus/byteplus/seed-1-6-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-lingoace-zh-80-chat.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c9ad1b21c25c7d1c5aa7f8825dafe9bff0402d5c79f4cf0bf183b619ccb0bfb
3
+ size 886441
data/eval_data/byteplus/byteplus/seed-1-6-250615/shenzhen_sase/byteplus_api_key/20250728_113641/arc-easy-1000-choice.jsonl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49e98cb6d61aa488ab7182e77412ce5714fdb36cff9d90c48c380fc9a076163d
3
- size 397093
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b16006362adf85a3240a10c71e93de98d04d32e94dba2ace1f38ee2839f7566
3
+ size 704949
data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-lingoace-zh-400-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39784ecb24d2a9659a81e48e17360e2ebd95ddf2e1b1f3f00d2266dba110c372
3
+ size 1211567
data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-lingoace-zh-80-chat.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:418ff28b690bb2d76fbe80f3f2025ebe7c65db77f035b7e03034498ebaf0fa3b
3
+ size 878141
data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/arc-easy-1000-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef2680774924bdf70e329cd81997b97845dccd1c7297ce7aa0c3c69f78675dbc
3
+ size 719440
data/eval_data/gemini_google/google/gemini-2.5-flash-lite-preview-06-17/shenzhen_sase/google_potent_veld_462405_t3/20250729_161543/agent-bingoplus-ph-200-chat.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1b4a41e437ee92ef98dce58783a44d4f609fbb646b1edddcd34dca2d6f31dc2
3
+ size 2416223
data/eval_data/gemini_google/google/gemini-2.5-flash-lite-preview-06-17/shenzhen_sase/google_potent_veld_462405_t3/20250729_161543/agent-bingoplus-ph-90-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f872a62f824b8624b4aabe824a3cb8e3044d9ba2946f5c2970d6917c9212441
3
+ size 258575
data/eval_data/gemini_google/google/gemini-2.5-flash/shenzhen_sase/google_potent_veld_462405_t3/20250729_161416/agent-bingoplus-ph-200-chat.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c396f32f93313f98a19bb7d59dcdc7fdce303605292dfe3f015f73f86202b0d7
3
+ size 2418251
data/eval_data/gemini_google/google/gemini-2.5-flash/shenzhen_sase/google_potent_veld_462405_t3/20250729_161416/agent-bingoplus-ph-90-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11a3da8d355ebbbab1a504f705446b6e69daf8b0089b9cf6a40446d62f151451
3
+ size 258600
data/eval_data/siliconflow/siliconflow/deepseek-ai#DeepSeek-V3/shenzhen_sase/siliconflow_api_key/20250729_162534/agent-bingoplus-ph-200-chat.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d39dd0c55be4145d4cb27803a00a5c8fc70b83417664453d0166bcdab9be82bc
3
+ size 2456608
data/eval_data/siliconflow/siliconflow/deepseek-ai#DeepSeek-V3/shenzhen_sase/siliconflow_api_key/20250729_162534/agent-bingoplus-ph-90-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a64a4eba8c60e5820029665c36b03a95ed2516bc8d98e9e13956d7a41eb16e7
3
+ size 258487
examples/make_dataset/make_bingoplus_ph_200_chat.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+ import os
6
+ from pathlib import Path
7
+ import sys
8
+ import time
9
+
10
+ pwd = os.path.abspath(os.path.dirname(__file__))
11
+ sys.path.append(os.path.join(pwd, "../../"))
12
+
13
+ from project_settings import environment, project_path
14
+
15
+
16
def get_args():
    """Parse the command-line options of this script.

    Returns:
        argparse.Namespace with two string attributes, ``raw_dataset`` and
        ``dataset``; both default to paths built from ``project_path``.
    """
    default_raw_dataset = project_path / "data/raw_dataset/finished/agent-bingoplus-ph-200-chat"
    default_dataset = project_path / "data/dataset/agent-bingoplus-ph-200-chat.jsonl"

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--raw_dataset", type=str, default=default_raw_dataset.as_posix())
    arg_parser.add_argument("--dataset", type=str, default=default_dataset.as_posix())
    return arg_parser.parse_args()
30
+
31
+
32
def main():
    """Build the JSONL dataset from the sample directories under ``--raw_dataset``.

    Every sample directory must contain ``system_prompt.txt``, ``user_prompt.txt``
    and ``response.txt``. For each sample one JSON object with the keys ``idx``
    (directory name), ``prompt`` and ``response`` is appended to ``--dataset``,
    and the sample is echoed to stdout.

    Raises:
        FileNotFoundError: if a sample directory lacks one of the three files.
    """
    args = get_args()

    raw_dataset = Path(args.raw_dataset)
    dataset = Path(args.dataset)
    dataset.parent.mkdir(parents=True, exist_ok=True)

    # glob() returns entries in arbitrary order and also yields plain files;
    # keep directories only and sort them so the output file is reproducible.
    sample_dirs = sorted(p for p in raw_dataset.glob("*") if p.is_dir())

    with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
        for sample_dir in sample_dirs:
            idx = sample_dir.name
            system_prompt = (sample_dir / "system_prompt.txt").read_text(encoding="utf-8")
            user_prompt = (sample_dir / "user_prompt.txt").read_text(encoding="utf-8")
            response = (sample_dir / "response.txt").read_text(encoding="utf-8")

            # Drop a leading literal "null" line from the system prompt.
            if system_prompt.startswith("null\n"):
                system_prompt = system_prompt[len("null\n"):]

            prompt = f"{system_prompt}\n\n{user_prompt}".strip()

            print(f"{prompt}\n\n{response}")
            print("-" * 150)

            record = json.dumps(
                {
                    "idx": idx,
                    "prompt": prompt,
                    "response": response,
                },
                ensure_ascii=False,
            )
            fout.write(f"{record}\n")

    return
70
+
71
+
72
+ if __name__ == "__main__":
73
+ main()
examples/make_dataset/make_bingoplus_ph_90_choice.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+ import os
6
+ from pathlib import Path
7
+ import sys
8
+ import time
9
+
10
+ pwd = os.path.abspath(os.path.dirname(__file__))
11
+ sys.path.append(os.path.join(pwd, "../../"))
12
+
13
+ from project_settings import environment, project_path
14
+
15
+
16
def get_args():
    """Parse the command-line options of this script.

    Returns:
        argparse.Namespace with two string attributes, ``raw_dataset`` and
        ``dataset``; both default to paths built from ``project_path``.
    """
    default_raw_dataset = project_path / "data/raw_dataset/finished/agent-bingoplus-ph-90-choice"
    default_dataset = project_path / "data/dataset/agent-bingoplus-ph-90-choice.jsonl"

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--raw_dataset", type=str, default=default_raw_dataset.as_posix())
    arg_parser.add_argument("--dataset", type=str, default=default_dataset.as_posix())
    return arg_parser.parse_args()
30
+
31
+
32
def main():
    """Build the JSONL dataset from the sample directories under ``--raw_dataset``.

    Every sample directory must contain ``system_prompt.txt``, ``user_prompt.txt``
    and ``response.txt``. For each sample one JSON object with the keys ``idx``
    (directory name), ``prompt`` and ``response`` is appended to ``--dataset``,
    and the sample is echoed to stdout.

    Raises:
        FileNotFoundError: if a sample directory lacks one of the three files.
    """
    args = get_args()

    raw_dataset = Path(args.raw_dataset)
    dataset = Path(args.dataset)
    dataset.parent.mkdir(parents=True, exist_ok=True)

    # glob() returns entries in arbitrary order and also yields plain files;
    # keep directories only and sort them so the output file is reproducible.
    sample_dirs = sorted(p for p in raw_dataset.glob("*") if p.is_dir())

    with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
        for sample_dir in sample_dirs:
            idx = sample_dir.name
            system_prompt = (sample_dir / "system_prompt.txt").read_text(encoding="utf-8")
            user_prompt = (sample_dir / "user_prompt.txt").read_text(encoding="utf-8")
            response = (sample_dir / "response.txt").read_text(encoding="utf-8")

            prompt = f"{system_prompt}\n\n{user_prompt}".strip()

            print(f"{prompt}\n\n{response}")
            print("-" * 150)

            record = json.dumps(
                {
                    "idx": idx,
                    "prompt": prompt,
                    "response": response,
                },
                ensure_ascii=False,
            )
            fout.write(f"{record}\n")

    return
67
+
68
+
69
+ if __name__ == "__main__":
70
+ main()
examples/make_dataset/{make_lingoace_400_choice.py → make_lingoace_zh_400_choice.py} RENAMED
File without changes
examples/make_dataset/{make_lingoace_80_chat.py → make_lingoace_zh_80_chat.py} RENAMED
File without changes
examples/make_raw_dataset/step_1_make_hk_dataset_by_log.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+ import hashlib
6
+ from pathlib import Path
7
+
8
+ from project_settings import environment, project_path
9
+
10
+
11
def get_args():
    """Parse the command-line options of this script.

    Returns:
        argparse.Namespace with two string attributes, ``raw_log_dir`` and
        ``output_dir``; both default to paths built from ``project_path``.
    """
    default_raw_log_dir = project_path / "data/llm-log-hk/raw-log"
    default_output_dir = project_path / "data/llm-log-hk/extract-dataset/raw"

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--raw_log_dir", type=str, default=default_raw_log_dir.as_posix())
    arg_parser.add_argument("--output_dir", type=str, default=default_output_dir.as_posix())
    return arg_parser.parse_args()
25
+
26
+
27
# Replacement text for the ASCII characters that must be escaped.
_ESCAPE_OVERRIDES = {
    "\0": "\\0",
    "\\": "\\\\\\\\",
    "\n": "\\n",
    "\r": "\\r",
    "\032": "\\Z",
    '"': '\\\\\\"',
    "'": "\\\\\\'",
}

# Table for ``str.translate``: index is the code point (0-127), value is the
# replacement string. Every character without an override maps to itself.
_escape_table = [_ESCAPE_OVERRIDES.get(chr(code), chr(code)) for code in range(128)]


def escape_string(value, mapping=None):
    """Return *value* with special ASCII characters escaped; no quotes are added.

    NUL, backslash, newline, carriage return, Ctrl-Z and both quote characters
    are replaced according to ``_escape_table``. Characters outside the table
    (code point >= 128) are left unchanged.

    Args:
        value: text to escape (a ``str``).
        mapping: accepted for signature compatibility; not used.
    """
    return value.translate(_escape_table)
43
+
44
+
45
def unescape_string(value: str) -> str:
    r"""Apply a fixed chain of literal substitutions to *value*, in this order.

    1. backslash + double quote  -> double quote
    2. two backslashes           -> one backslash
    3. two backslashes + ``n``   -> backslash + ``n``
    4. two backslashes + ``u003c`` -> ``<``
    5. two backslashes + ``u003e`` -> ``>``

    The order matters: each step operates on the output of the previous one.
    """
    return (
        value
        .replace(r"\"", "\"")
        .replace(r"\\", "\\")
        .replace(r"\\n", r"\n")
        .replace(r"\\u003c", "<")
        .replace(r"\\u003e", ">")
    )
56
+
57
def unescape_string2(value: str) -> str:
    r"""Replace every literal two-character ``\n`` sequence in *value* with a newline.

    Real newline characters already present in *value* are kept as they are.
    (The previous implementation first turned real newlines into ``\n`` and then
    turned every ``\n`` back into a newline; that round trip had the same result
    as this single replacement.)
    """
    return value.replace("\\n", "\n")
65
+
66
+
67
def extract(text: str):
    """Parse one raw log line into engine, call id, prompt parts and response.

    *text* is a JSON object whose ``__CONTENT__`` value is a ``|``-separated
    record. Everything up to the first ``>`` is discarded. The field after the
    last ``|`` is the JSON-encoded response; the text before it is split into
    six fields, of which field 0 is the engine, field 3 (or field 4 when field 3
    is empty) the call id and field 5 the JSON-encoded prompt, which may itself
    contain ``|``.

    Returns:
        dict with the keys ``engine``, ``call_id``, ``system_prompt``,
        ``conversation``, ``user_prompt`` and ``response``. ``conversation`` and
        ``user_prompt`` are ``None`` when absent from the prompt object.

    Raises:
        ValueError: if the record has too few fields or no call id
            (``json.JSONDecodeError``, a ``ValueError`` subclass, for bad JSON).
        KeyError: if ``__CONTENT__`` or ``SystemPrompt`` is missing.
    """
    row = json.loads(text)
    content = row["__CONTENT__"]

    # Strip the prefix that ends at the first '>'; without a '>' keep everything.
    payload = content.split('>', maxsplit=1)[-1]

    # The response follows the LAST '|', so split from the right first; the
    # prompt (field 5) may contain '|' and is protected by maxsplit below.
    head, separator, response_text = payload.rpartition('|')
    if not separator:
        raise ValueError(f"no '|' separator in log content: {content!r}")
    fields = head.split('|', maxsplit=5)
    if len(fields) < 6:
        raise ValueError(f"expected 7 '|'-separated fields in log content: {content!r}")

    engine = fields[0].strip()
    call_id = fields[3] or fields[4]
    if not call_id:
        # Previously this printed the record and called exit(0), which ended
        # the whole run with a success status; fail loudly instead.
        raise ValueError(f"empty call_id; content: {content!r}; fields: {fields!r}")

    prompt = json.loads(fields[5])
    system_prompt = prompt["SystemPrompt"]
    conversation = prompt.get("Conversation")

    user_prompt = prompt.get("UserPrompt")
    if user_prompt is not None:
        user_prompt = unescape_string2(user_prompt)

    response = json.loads(response_text)

    return {
        "engine": engine,
        "call_id": call_id,
        "system_prompt": system_prompt,
        "conversation": conversation,
        "user_prompt": user_prompt,
        "response": response,
    }
117
+
118
+
119
def _conversation_to_prompt(conversation: str) -> str:
    """Render a JSON-encoded list of turns as ``role: text`` lines.

    Role ``0`` is written as ``client`` and role ``1`` as ``Assistant``; the
    result ends with an open ``Assistant: `` turn.
    """
    try:
        turns = json.loads(conversation)
    except json.decoder.JSONDecodeError:
        print(f"conversation: {conversation}")
        raise

    rendered = ""
    for turn in turns:
        role = turn["role"]
        text = turn["text"]

        if role == 0:
            role = "client"
        elif role == 1:
            role = "Assistant"
        else:
            raise AssertionError(f"unexpected conversation role: {role!r}")

        rendered += f"{role}: {text}\n"
    return rendered + "Assistant: "


def main():
    """Extract every sample from the raw ``*.json`` logs into its own directory.

    Each log line is parsed with ``extract``. Lines with neither a conversation
    nor a user prompt are skipped. Every other sample is written to
    ``<output_dir>/<call_id>_<hash8>/`` as ``system_prompt.txt``,
    ``user_prompt.txt`` and ``response.txt``, where ``hash8`` is the first eight
    hex digits of the SHA-256 of the sample text. A sample whose directory
    already exists is skipped.

    Raises:
        AssertionError: if a line carries both a conversation and a user prompt,
            or a conversation turn has a role other than 0 or 1.
    """
    args = get_args()

    raw_log_dir = Path(args.raw_log_dir)
    output_dir = Path(args.output_dir)

    for filename in raw_log_dir.glob("*.json"):
        with open(filename.as_posix(), "r", encoding="utf-8") as fin:
            for row_num, line in enumerate(fin):
                try:
                    row = extract(line)
                except Exception:
                    # Show the offending raw line before propagating.
                    print(line)
                    raise

                call_id = row["call_id"]
                system_prompt = row.get("system_prompt")
                conversation = row.get("conversation")
                user_prompt = row.get("user_prompt")
                response = row.get("response")

                if conversation is not None and user_prompt is not None:
                    print(f"row_num: {row_num}, filename: {filename.as_posix()}")
                    print(row)
                    raise AssertionError("row has both a conversation and a user prompt")
                if conversation is None and user_prompt is None:
                    # Nothing to build a prompt from, e.g. the response is
                    # {"error": "No dialogue provided. ..."}.
                    continue
                if conversation is not None:
                    user_prompt = _conversation_to_prompt(conversation)

                base_text = f"{system_prompt}-{user_prompt}-{response}"
                hash_str = hashlib.sha256(base_text.encode("utf-8")).hexdigest()
                sample_dir = output_dir / f"{call_id}_{hash_str[:8]}"
                if sample_dir.exists():
                    # Same call id and content hash prefix: already extracted.
                    print(f"row_num: {row_num}, filename: {filename.as_posix()}")
                    continue
                sample_dir.mkdir(parents=True, exist_ok=False)

                # NOTE(review): values are written via str(); a non-string JSON
                # response would be stored as its Python repr -- confirm that
                # responses are always strings.
                (sample_dir / "system_prompt.txt").write_text(f"{system_prompt}", encoding="utf-8")
                (sample_dir / "user_prompt.txt").write_text(f"{user_prompt}", encoding="utf-8")
                (sample_dir / "response.txt").write_text(f"{response}", encoding="utf-8")
    return
192
+
193
+
194
+ if __name__ == "__main__":
195
+ main()
examples/make_raw_dataset/step_1_make_usa_dataset_by_log.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+ import hashlib
6
+ from pathlib import Path
7
+
8
+ from project_settings import environment, project_path
9
+
10
+
11
def get_args():
    """Parse the command-line options of this script.

    Returns:
        argparse.Namespace with two string attributes, ``raw_log_dir`` and
        ``output_dir``; both default to paths built from ``project_path``.
    """
    default_raw_log_dir = project_path / "data/llm-log-usa/raw-log"
    default_output_dir = project_path / "data/llm-log-usa/extract-dataset/raw"

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--raw_log_dir", type=str, default=default_raw_log_dir.as_posix())
    arg_parser.add_argument("--output_dir", type=str, default=default_output_dir.as_posix())
    return arg_parser.parse_args()
25
+
26
+
27
# Replacement text for the ASCII characters that must be escaped.
_ESCAPE_OVERRIDES = {
    "\0": "\\0",
    "\\": "\\\\\\\\",
    "\n": "\\n",
    "\r": "\\r",
    "\032": "\\Z",
    '"': '\\\\\\"',
    "'": "\\\\\\'",
}

# Table for ``str.translate``: index is the code point (0-127), value is the
# replacement string. Every character without an override maps to itself.
_escape_table = [_ESCAPE_OVERRIDES.get(chr(code), chr(code)) for code in range(128)]


def escape_string(value, mapping=None):
    """Return *value* with special ASCII characters escaped; no quotes are added.

    NUL, backslash, newline, carriage return, Ctrl-Z and both quote characters
    are replaced according to ``_escape_table``. Characters outside the table
    (code point >= 128) are left unchanged.

    Args:
        value: text to escape (a ``str``).
        mapping: accepted for signature compatibility; not used.
    """
    return value.translate(_escape_table)
43
+
44
+
45
def unescape_string(value: str) -> str:
    r"""Apply a fixed chain of literal substitutions to *value*, in this order.

    1. backslash + double quote  -> double quote
    2. two backslashes           -> one backslash
    3. two backslashes + ``n``   -> backslash + ``n``
    4. two backslashes + ``u003c`` -> ``<``
    5. two backslashes + ``u003e`` -> ``>``

    The order matters: each step operates on the output of the previous one.
    """
    return (
        value
        .replace(r"\"", "\"")
        .replace(r"\\", "\\")
        .replace(r"\\n", r"\n")
        .replace(r"\\u003c", "<")
        .replace(r"\\u003e", ">")
    )
56
+
57
def unescape_string2(value: str) -> str:
    r"""Replace every real newline in *value* with the two characters ``\n``.

    Despite its name this function escapes newlines rather than unescaping them.
    """
    return value.replace("\n", "\\n")
64
+
65
+
66
def extract(text: str):
    """Parse one raw log line into engine, call id, prompt parts and response.

    Only the text before the first ``","__topic__":"",`` marker is used. The
    response is whatever follows the last ``}|``; the text before it (with the
    closing ``}`` restored) is split on ``|`` into at most five fields. The
    engine is the last space-separated word of field 0, the call id is field 2
    and the last field holds the prompt object. Prompt and response are passed
    through ``unescape_string`` before being decoded as JSON.

    Returns:
        dict with the keys ``engine``, ``call_id``, ``system_prompt``,
        ``conversation``, ``user_prompt`` and ``response``. ``conversation`` and
        ``user_prompt`` are ``None`` when absent from the prompt object.

    Raises:
        json.JSONDecodeError: if the response or the prompt is not valid JSON
            (the offending text is printed first).
        ValueError: if the line has no ``}|`` separator.
        IndexError: if the line has fewer than three ``|``-separated fields.
        KeyError: if the prompt has no ``SystemPrompt``.
    """
    body = text.split('","__topic__":"",')[0]

    # Split at the LAST "}|": right part is the response, left part the rest.
    parts = body.rsplit("}|", maxsplit=1)

    response = unescape_string(parts[-1])
    try:
        response = json.loads(response)
    except json.decoder.JSONDecodeError:
        print(response)
        raise
    if len(parts) != 2:
        raise ValueError("no '}|' separator found before the response")

    # Restore the '}' that was consumed as part of the separator.
    head = f"{parts[0]}}}"
    fields = head.split("|", maxsplit=4)

    engine = fields[0].split(" ")[-1]
    call_id = fields[2]

    # The prompt object starts at the first "|{" of the last field, if any;
    # otherwise the whole last field is the prompt.
    prompt_parts = fields[-1].split("|{", maxsplit=1)
    if len(prompt_parts) == 2:
        raw_prompt = f"{{{prompt_parts[-1]}"
    else:
        raw_prompt = prompt_parts[-1]

    try:
        prompt = json.loads(unescape_string(raw_prompt))
    except json.decoder.JSONDecodeError:
        # Print the prompt as it appeared in the log, before unescaping.
        print(raw_prompt)
        raise

    system_prompt = prompt["SystemPrompt"]

    conversation = prompt.get("Conversation")
    if conversation is not None:
        conversation = unescape_string2(conversation)

    user_prompt = prompt.get("UserPrompt")

    return {
        "engine": engine,
        "call_id": call_id,
        "system_prompt": system_prompt,
        "conversation": conversation,
        "user_prompt": user_prompt,
        "response": response,
    }
118
+
119
+
120
def _conversation_to_prompt(conversation: str) -> str:
    """Render a JSON-encoded list of turns as ``role: text`` lines.

    Role ``0`` is written as ``client`` and role ``1`` as ``Assistant``; the
    result ends with an open ``Assistant: `` turn.
    """
    try:
        turns = json.loads(conversation)
    except json.decoder.JSONDecodeError:
        print(conversation)
        raise

    rendered = ""
    for turn in turns:
        role = turn["role"]
        text = turn["text"]

        if role == 0:
            role = "client"
        elif role == 1:
            role = "Assistant"
        else:
            raise AssertionError(f"unexpected conversation role: {role!r}")

        rendered += f"{role}: {text}\n"
    return rendered + "Assistant: "


def main():
    """Extract every sample from the raw ``*.json`` logs into its own directory.

    Each log line is parsed with ``extract``. Lines with neither a conversation
    nor a user prompt are skipped. Every other sample is written to
    ``<output_dir>/<call_id>_<hash8>/`` as ``system_prompt.txt``,
    ``user_prompt.txt`` and ``response.txt``, where ``hash8`` is the first eight
    hex digits of the SHA-256 of the sample text. A sample whose directory
    already exists is skipped.

    Raises:
        AssertionError: if a line carries both a conversation and a user prompt,
            or a conversation turn has a role other than 0 or 1.
    """
    args = get_args()

    raw_log_dir = Path(args.raw_log_dir)
    output_dir = Path(args.output_dir)

    for filename in raw_log_dir.glob("*.json"):
        with open(filename.as_posix(), "r", encoding="utf-8") as fin:
            for row_num, line in enumerate(fin):
                try:
                    row = extract(line)
                except Exception:
                    # Show the offending raw line before propagating.
                    print(line)
                    raise

                call_id = row["call_id"]
                system_prompt = row.get("system_prompt")
                conversation = row.get("conversation")
                user_prompt = row.get("user_prompt")
                response = row.get("response")

                if conversation is not None and user_prompt is not None:
                    print(f"row_num: {row_num}, filename: {filename.as_posix()}")
                    print(row)
                    raise AssertionError("row has both a conversation and a user prompt")
                if conversation is None and user_prompt is None:
                    # Nothing to build a prompt from, e.g. the response is
                    # {"error": "No dialogue provided. ..."}.
                    continue
                if conversation is not None:
                    user_prompt = _conversation_to_prompt(conversation)

                base_text = f"{system_prompt}-{user_prompt}-{response}"
                hash_str = hashlib.sha256(base_text.encode("utf-8")).hexdigest()
                sample_dir = output_dir / f"{call_id}_{hash_str[:8]}"
                if sample_dir.exists():
                    # Same call id and content hash prefix: already extracted.
                    print(f"row_num: {row_num}, filename: {filename.as_posix()}")
                    continue
                sample_dir.mkdir(parents=True, exist_ok=False)

                # NOTE(review): values are written via str(); a non-string JSON
                # response would be stored as its Python repr -- confirm that
                # responses are always strings.
                (sample_dir / "system_prompt.txt").write_text(f"{system_prompt}", encoding="utf-8")
                (sample_dir / "user_prompt.txt").write_text(f"{user_prompt}", encoding="utf-8")
                (sample_dir / "response.txt").write_text(f"{response}", encoding="utf-8")
    return
193
+
194
+
195
+ if __name__ == "__main__":
196
+ main()
examples/make_raw_dataset/step_2_filter_choice_json.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+ from pathlib import Path
6
+ import shutil
7
+
8
+ import openai
9
+ from openai import OpenAI
10
+
11
+ from project_settings import environment, project_path
12
+
13
+
14
def get_args():
    """Parse the command-line options of this script.

    Returns:
        argparse.Namespace with one string attribute, ``dataset_dir``, which
        defaults to the HK extract-dataset directory under ``project_path``.
    """
    default_dataset_dir = project_path / "data/llm-log-hk/extract-dataset"

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--dataset_dir", type=str, default=default_dataset_dir.as_posix())
    return arg_parser.parse_args()
24
+
25
+
26
def main():
    """Sort the samples of ``<dataset_dir>/raw`` into ``choice`` and ``json``.

    A sample is moved to ``choice`` when its user prompt contains the
    single-letter instruction and its stripped response is exactly one
    character long. Otherwise it is moved to ``json`` when its system or user
    prompt contains one of the JSON-output markers. All other samples stay in
    ``raw``.
    """
    args = get_args()

    dataset_dir = Path(args.dataset_dir)
    raw_dir = dataset_dir / "raw"
    choice_dir = dataset_dir / "choice"
    choice_dir.mkdir(parents=True, exist_ok=True)
    json_dir = dataset_dir / "json"
    json_dir.mkdir(parents=True, exist_ok=True)

    choice_markers = (
        "Remember to output ONLY the corresponding letter.",
    )
    json_markers = (
        '{"tag_name_list": ["tag name here"],"summary":"summary and suggestions here"}',
        '{"tag_name_list": ["tag name 1","tag name 2"],"summary":"xx"}',
        'output a json',
    )

    # Snapshot the listing first: samples are moved out of raw_dir in the loop.
    for sample_dir in sorted(raw_dir.glob("*")):
        if not sample_dir.is_dir():
            continue
        print(f"process: {sample_dir.as_posix()}")

        system_prompt = (sample_dir / "system_prompt.txt").read_text(encoding="utf-8")
        user_prompt = (sample_dir / "user_prompt.txt").read_text(encoding="utf-8")
        response = (sample_dir / "response.txt").read_text(encoding="utf-8")

        is_choice = len(response.strip()) == 1 and any(
            marker in user_prompt for marker in choice_markers
        )
        if is_choice:
            # Strings rather than Path objects: shutil.move only accepts
            # path-like arguments reliably from Python 3.9 on.
            shutil.move(sample_dir.as_posix(), choice_dir.as_posix())
            # The sample is gone from raw_dir; testing the JSON markers as well
            # would try to move it a second time and fail.
            continue

        if any(marker in system_prompt or marker in user_prompt for marker in json_markers):
            shutil.move(sample_dir.as_posix(), json_dir.as_posix())

    return
72
+
73
+
74
+ if __name__ == "__main__":
75
+ main()
examples/make_raw_dataset/step_3_filter_by_keywords.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+ from pathlib import Path
6
+ import shutil
7
+
8
+ import openai
9
+ from openai import OpenAI
10
+
11
+ from project_settings import environment, project_path
12
+
13
+
14
def get_args():
    """Parse the command-line options of this script.

    Returns:
        argparse.Namespace with one string attribute, ``dataset_dir``, which
        defaults to the HK extract-dataset directory under ``project_path``
        (pass the ``data/llm-log-usa/extract-dataset`` path for the USA logs).
    """
    default_dataset_dir = project_path / "data/llm-log-hk/extract-dataset"

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--dataset_dir", type=str, default=default_dataset_dir.as_posix())
    return arg_parser.parse_args()
24
+
25
+
26
def main():
    """Move keyword-matching samples into ``<category>-cod`` sibling directories.

    For each of ``raw``, ``choice`` and ``json`` under ``dataset_dir``, every
    sample whose system or user prompt contains one of the keywords is moved to
    ``<dataset_dir>/<category>-cod``.
    """
    args = get_args()

    dataset_dir = Path(args.dataset_dir)
    raw_dir = dataset_dir / "raw"
    choice_dir = dataset_dir / "choice"
    choice_dir.mkdir(parents=True, exist_ok=True)
    json_dir = dataset_dir / "json"
    json_dir.mkdir(parents=True, exist_ok=True)

    # Swap in "BingoPlus" here (and the "-bingoplus" suffix below) to split out
    # the BingoPlus samples instead.
    keywords = (
        " COD ",
    )

    for data_dir in [raw_dir, choice_dir, json_dir]:
        # Snapshot the listing first: samples are moved out of data_dir below.
        sample_dirs = sorted(p for p in data_dir.glob("*") if p.is_dir())
        for sample_dir in sample_dirs:
            system_prompt = (sample_dir / "system_prompt.txt").read_text(encoding="utf-8")
            user_prompt = (sample_dir / "user_prompt.txt").read_text(encoding="utf-8")

            if not any(key in system_prompt or key in user_prompt for key in keywords):
                continue

            print(f"process: {sample_dir.as_posix()}")
            tgt_dir = dataset_dir / f"{data_dir.name}-cod"
            tgt_dir.mkdir(parents=True, exist_ok=True)
            shutil.move(
                sample_dir.as_posix(),
                tgt_dir.as_posix(),
            )

    return
67
+
68
+
69
+ if __name__ == "__main__":
70
+ main()
examples/make_raw_dataset/step_4_filter_by_language.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import time
5
+ from pathlib import Path
6
+ import shutil
7
+
8
+ import httpx
9
+ from gradio_client import Client
10
+ from concurrent.futures._base import CancelledError
11
+
12
+ from project_settings import environment, project_path
13
+
14
+
15
def get_args():
    """
    Read the options of the language filter script from the command line.

    :return: argparse.Namespace; ``dataset_dir`` (str) defaults to
        ``<project_path>/data/llm-log-hk/extract-dataset``.
    """
    fallback_dir = (project_path / "data/llm-log-hk/extract-dataset").as_posix()
    # other candidate: (project_path / "data/llm-log/extract-dataset").as_posix()

    cli = argparse.ArgumentParser()
    cli.add_argument("--dataset_dir", type=str, default=fallback_dir)
    namespace = cli.parse_args()
    return namespace
25
+
26
+
27
def main():
    """
    Sort the BingoPlus samples into one folder per detected language.

    For every sample directory below ``raw-bingoplus``, ``choice-bingoplus``
    and ``json-bingoplus`` the first 2000 characters of ``user_prompt.txt`` are
    sent to the ``juliensimon/xlm-v-base-language-id`` space. The sample is
    then moved to ``<dataset_dir>/<split>-<label in lower case>``.

    Timeouts and cancelled requests are retried every 30 seconds without an
    upper bound, so the script blocks while the space is unreachable.

    :return: None
    """
    args = get_args()

    dataset_dir = Path(args.dataset_dir)
    raw_dir = dataset_dir / "raw-bingoplus"
    choice_dir = dataset_dir / "choice-bingoplus"
    choice_dir.mkdir(parents=True, exist_ok=True)
    json_dir = dataset_dir / "json-bingoplus"
    json_dir.mkdir(parents=True, exist_ok=True)

    # https://huggingface.co/spaces/juliensimon/xlm-v-base-language-id
    client = Client("juliensimon/xlm-v-base-language-id")

    for data_dir in [raw_dir, choice_dir, json_dir]:
        # snapshot the listing: samples are moved out of data_dir inside the loop.
        for sample_dir in sorted(data_dir.glob("*")):
            # skip stray files; only directories are samples.
            if not sample_dir.is_dir():
                continue
            print(f"process: {sample_dir.as_posix()}")

            # only the user prompt is needed for language identification.
            user_prompt = (sample_dir / "user_prompt.txt").read_text(encoding="utf-8")

            while True:
                try:
                    js, _ = client.predict(
                        text=user_prompt[:2000],
                        model="OpenVINO",
                        api_name="/predict"
                    )
                    break
                except (httpx.ConnectTimeout, httpx.ReadTimeout) as e:
                    print(f"httpx.{type(e).__name__}; retry after 30s")
                except CancelledError:
                    print("CancelledError; retry after 30s")
                # only reached after a handled failure; success leaves via break.
                time.sleep(30)

            label = js["label"]
            print(label)

            tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-{label.lower()}"
            tgt_dir.mkdir(parents=True, exist_ok=True)
            shutil.move(
                sample_dir.as_posix(),
                tgt_dir.as_posix(),
            )
    return
86
+
87
+
88
+ if __name__ == "__main__":
89
+ main()
examples/make_raw_dataset/step_5_filter_by_client_content.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+ from pathlib import Path
6
+ import shutil
7
+ from typing import List
8
+
9
+ from project_settings import environment, project_path
10
+
11
+
12
def get_args():
    """
    Parse the command line of the client content length filter.

    :return: argparse.Namespace; ``data_dir`` (str) defaults to
        ``<project_path>/data/llm-log-hk/extract-dataset/raw-bingoplus-filipino``.
    """
    extract_dir = project_path / "data/llm-log-hk/extract-dataset"
    data_dir_default = extract_dir / "raw-bingoplus-filipino"
    # english split: extract_dir / "raw-bingoplus-english"

    option_parser = argparse.ArgumentParser()
    option_parser.add_argument(
        "--data_dir",
        type=str,
        default=data_dir_default.as_posix(),
    )
    return option_parser.parse_args()
22
+
23
+
24
def _collect_client_contents(user_prompt):
    """
    Return the text of every ``client`` row of a ``role: content`` transcript.

    :param user_prompt: str, transcript with one ``role: content`` row per line.
    :return: list of str; for each row whose role equals ``client`` the text
        after the first ``": "``.
    :raises IndexError: when a row has no ``": "`` separator. All rows are
        printed before raising so the broken sample can be inspected.
    """
    rows = user_prompt.split("\n")
    client_content_list = list()
    for row in rows:
        role, separator, content = row.partition(": ")
        if not separator:
            print(f"rows: {rows}")
            raise IndexError(f"row has no 'role: content' separator: {row!r}")
        if role == "client":
            client_content_list.append(content)
    return client_content_list


def main():
    """
    Move samples whose client said fewer than 50 characters in total.

    Every sub-directory of ``--data_dir`` is one sample holding
    ``user_prompt.txt``. The contents of all ``client`` rows are added up and,
    when shorter than 50 characters, the sample directory is moved to the
    sibling folder ``<data_dir name>-less_than_50``.

    :return: None
    """
    args = get_args()

    data_dir = Path(args.data_dir)
    tgt_dir = data_dir.parent / f"{data_dir.parts[-1]}-less_than_50"

    # snapshot the listing: samples are moved out of data_dir inside the loop.
    for sample_dir in sorted(data_dir.glob("*")):
        # stray files are not samples; reading below them would fail.
        if not sample_dir.is_dir():
            continue
        print(f"process: {sample_dir.as_posix()}")

        user_prompt = (sample_dir / "user_prompt.txt").read_text(encoding="utf-8")

        client_content_list = _collect_client_contents(user_prompt)
        client_content_length = sum(len(content) for content in client_content_list)
        # print(client_content_length)
        # print(f"user_prompt: {user_prompt}")
        if client_content_length < 50:
            tgt_dir.mkdir(parents=True, exist_ok=True)
            shutil.move(
                sample_dir.as_posix(),
                tgt_dir.as_posix(),
            )
    return
67
+
68
+
69
+ if __name__ == "__main__":
70
+ main()
examples/make_raw_dataset/step_6_filter_by_choice.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+ from pathlib import Path
6
+ import shutil
7
+
8
+ from project_settings import environment, project_path
9
+
10
+
11
def get_args():
    """
    Parse the command line of the choice filter.

    :return: argparse.Namespace; ``data_dir`` (str) defaults to
        ``<project_path>/data/llm-log-hk/extract-dataset/choice-bingoplus-filipino``.
    """
    choice_root = project_path / "data/llm-log-hk/extract-dataset/choice-bingoplus-filipino"

    cli_parser = argparse.ArgumentParser()
    cli_parser.add_argument("--data_dir", type=str, default=choice_root.as_posix())
    parsed = cli_parser.parse_args()
    return parsed
20
+
21
+
22
def main():
    """
    Group choice samples by the answer letter stored in ``response.txt``.

    Every sub-directory of ``--data_dir`` is one sample. Its stripped response
    is printed; when it is exactly one upper case letter ``A``-``Z`` the sample
    directory is moved to the sibling folder ``<data_dir name>-<letter>``.
    All other samples stay where they are.

    :return: None
    """
    args = get_args()

    data_dir = Path(args.data_dir)

    # a single character is a valid choice when it occurs in this string.
    choice_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

    # snapshot the listing: samples are moved out of data_dir inside the loop.
    for sample_dir in sorted(data_dir.glob("*")):
        # stray files are not samples; reading below them would fail.
        if not sample_dir.is_dir():
            continue
        # print(f"process: {sample_dir.as_posix()}")

        response = (sample_dir / "response.txt").read_text(encoding="utf-8")

        response = response.strip()
        print(response)
        # the length check comes first: without it the substring test would
        # also accept "" and multi-letter runs such as "AB".
        if len(response) != 1 or response not in choice_letters:
            continue

        print(f"process: {sample_dir.as_posix()}")
        tgt_dir = data_dir.parent / f"{data_dir.parts[-1]}-{response}"
        tgt_dir.mkdir(parents=True, exist_ok=True)
        shutil.move(
            sample_dir.as_posix(),
            tgt_dir.as_posix(),
        )

    return
58
+
59
+
60
+ if __name__ == "__main__":
61
+ main()
examples/make_raw_dataset/step_7_remove_by_client_no_response.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+ from pathlib import Path
6
+ import shutil
7
+ from typing import List
8
+
9
+ from project_settings import environment, project_path
10
+
11
+
12
def get_args():
    """
    Parse the command line of the "client gave no response" clean-up script.

    :return: argparse.Namespace; ``data_dir`` (str) defaults to
        ``<project_path>/data/llm-log-hk/extract-dataset/raw-cod``.
    """
    extract_root = project_path / "data/llm-log-hk/extract-dataset"
    # other splits used with this script:
    #   extract_root / "raw-bingoplus-filipino"
    #   extract_root / "raw-bingoplus-english"
    selected_dir = extract_root / "raw-cod"

    args_parser = argparse.ArgumentParser()
    args_parser.add_argument(
        "--data_dir",
        type=str,
        default=selected_dir.as_posix(),
    )
    return args_parser.parse_args()
23
+
24
+
25
def _collect_client_contents(user_prompt):
    """
    Return the text of every ``client`` row of a ``role: content`` transcript.

    :param user_prompt: str, transcript with one ``role: content`` row per line.
    :return: list of str; for each row whose role equals ``client`` the text
        after the first ``": "``.
    :raises IndexError: when a row has no ``": "`` separator. All rows are
        printed before raising so the broken sample can be inspected.
    """
    rows = user_prompt.split("\n")
    client_content_list = list()
    for row in rows:
        role, separator, content = row.partition(": ")
        if not separator:
            print(f"rows: {rows}")
            raise IndexError(f"row has no 'role: content' separator: {row!r}")
        if role == "client":
            client_content_list.append(content)
    return client_content_list


def main():
    """
    Delete samples in which the client's last turns are all empty.

    Every sub-directory of ``--data_dir`` is one sample holding
    ``user_prompt.txt``. When the contents of the last three ``client`` rows
    (or of all of them, if there are fewer) are empty strings, the sample
    directory is removed from disk. This cannot be undone.

    :return: None
    """
    args = get_args()

    data_dir = Path(args.data_dir)

    # snapshot the listing: samples are deleted from data_dir inside the loop.
    for sample_dir in sorted(data_dir.glob("*")):
        # stray files are not samples; reading below them would fail.
        if not sample_dir.is_dir():
            continue
        print(f"process: {sample_dir.as_posix()}")

        user_prompt = (sample_dir / "user_prompt.txt").read_text(encoding="utf-8")

        sub_client_content_list = _collect_client_contents(user_prompt)[-3:]
        # NOTE: all() of an empty sequence is True, so a sample without any
        # client row is removed as well.
        if all(len(content) == 0 for content in sub_client_content_list):
            print(f"remove {sample_dir.as_posix()}, \nbecause client no response in user_prompt: \n{user_prompt}")
            shutil.rmtree(sample_dir.as_posix())
    return
62
+
63
+
64
+ if __name__ == "__main__":
65
+ main()
examples/test_metrics/bingoplus_chat_metric.py ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+ import os
6
+ import sys
7
+
8
+ pwd = os.path.abspath(os.path.dirname(__file__))
9
+ sys.path.append(os.path.join(pwd, "../"))
10
+
11
+ import openai
12
+ from openai import AzureOpenAI
13
+
14
+ from project_settings import environment, project_path
15
+
16
+
17
def get_args():
    """
    Parse the command line options of the BingoPlus chat metric script.

    Options:
        --model_name      model name used for the grading request.
        --eval_data_file  JSONL file with the predictions to grade.
        --output_file     JSONL file the graded rows are appended to.
        --service         key of the service settings inside ``environment``.

    Example:
        python3 bingoplus_chat_metric.py --model_name gpt-4o --service west_us_chatgpt_openai_azure_com

    :return: argparse.Namespace with the four attributes listed above (all str).
    """
    # both file defaults live in the same evaluation run directory; the
    # ".raw" file is the input and the file without the suffix is the output.
    eval_run_dir = project_path / "data/eval_data/gemini_google/google/gemini-2.5-flash-lite-preview-06-17/shenzhen_sase/google_potent_veld_462405_t3/20250729_161543"

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        default="gpt-4o",
        # default="gpt-4o-mini",
        type=str
    )
    parser.add_argument(
        "--eval_data_file",
        default=(eval_run_dir / "agent-bingoplus-ph-200-chat.jsonl.raw").as_posix(),
        type=str
    )
    parser.add_argument(
        "--output_file",
        default=(eval_run_dir / "agent-bingoplus-ph-200-chat.jsonl").as_posix(),
        type=str
    )
    parser.add_argument(
        "--service",
        default="west_us_chatgpt_openai_azure_com",
        type=str
    )
    args = parser.parse_args()
    return args
56
+
57
+
58
+ task_prompt = """
59
+ <Identity>
60
+ You are Susan, a salesperson of BingoPlus, providing telephone game consultation and sales services to customers. BingoPlus APP is a game platform.
61
+
62
+ <Job>
63
+ You are communicating with a customer on the phone. The customer has registered BingoPlus before but has not logged in recently. Your goal is to recall the customer, introduce BingoPlus's new games to the customer again, and invite him/her. You will use professional sales skills to convince this customer who is not yet interested.
64
+
65
+ For questions that you don't understand, don't answer casually. Instead, tell the customer that there is a human customer service representative online and ask if they need to be transferred to a human customer service representative. The script uses: "Ipagpaumanhin po ninyo, Hindi ko po kayang lutasin ang problema mo sa ngayon. Pwede ko po ba kayong ilipat mula sa aming human customer service team? "
66
+
67
+ <Knowledge Base FAQ>
68
+ Q: “Paano po ako makakagawa ng account sa BingoPlus?”
69
+ A: “Bisitahin ang www.BingoPlus.com, i-click ang 'Register', ilagay ang iyong mobile number, gumawa ng password, at i-verify gamit ang OTP”
70
+
71
+ Q: “Ligtas ba ang aking personal at pinansyal na impormasyon sa BingoPlus?”
72
+ A: “Oo. Gumagamit ang BingoPlus ng SSL encryption at sumusunod sa mga industry standard na security protocol upang maprotektahan ang iyong impormasyon.”
73
+
74
+ Q: “Pwede ba akong magkaroon ng higit sa isang BingoPlus account?”
75
+ A: “Hindi po. Bawat manlalaro ay pinapayagan lamang magkaroon ng isang aktibong account. Ang pagkakaroon ng multiple accounts ay maaaring magresulta sa suspension o termination”
76
+
77
+ Q: “Anu-anong mga paraan ng pagbabayad ang available?”
78
+ A: “Pwede kang mag-deposit gamit ang GCash, Maya, GrabPay, local bank transfers, at iba pang aprubadong e-wallets”
79
+
80
+ Q: “Ano ang minimum na deposit at withdrawal?”
81
+ A: “Minimum na Deposit: 100 Pesos, Minimum na Withdrawal: 300 Pesos (Maaaring magbago depende sa promo o uri ng account)”
82
+
83
+ Q: “Gaano katagal ang proseso ng withdrawal?”
84
+ A: “Ang mga withdrawal ay karaniwang pinoproseso sa loob ng one hanggang tatlong oras, depende sa dami ng requests at verification”
85
+
86
+ Q: “Bakit nade-delay o nare-reject ang aking withdrawal?”
87
+ A: “Hindi kumpletong beripikasyon ng account, hindi natugunan ang bonus turnover o wagering requirements, maling detalye ng withdrawal, maramihang withdrawal requests sa maikling panahon, at paglabag sa mga patakaran (halimbawa: duplicate na account, kahinahinalang aktibidad)”
88
+
89
+ Q: “Anong mga laro ang puwede kong laruin sa BingoPlus?”
90
+ A: “Bingo, Slots, Live Casino (Baccarat, Roulette, Blackjack), Pustahan sa Sports, Fishing Games, Mines, Poker, at marami pang iba.”
91
+
92
+ Q: “Nag-freeze o nag-crash ang laro. Ano ang dapat kong gawin?”
93
+ A: “Subukang i-refresh ang iyong browser o i-restart ang app. Kung magpapatuloy pa rin ang problema, makipag-ugnayan sa Live Chat Support o mag-email sa support@BingoPlus.com. Magpadala na rin ng screenshots kung maaari.”
94
+
95
+ Q: “Puwede ba akong maglaro gamit ang aking cellphone?”
96
+ A: “Oo. Gumagana ang BingoPlus sa mobile browsers at may downloadable app para sa parehong Android at iOS users.”
97
+
98
+ Q: “Nag-aalok ba kayo ng welcome bonus?”
99
+ A: “Oo. Ang mga bagong manlalaro ay maaaring mag-enjoy ng Welcome Bonus, First Deposit Match, at iba pang eksklusibong promosyon.”
100
+
101
+ Q: “Paano ko makukuha ang aking bonus?”
102
+ A: “Pagkatapos magparehistro, pumunta sa tab na 'Promotions', piliin ang bonus na gusto mo, at sundin ang mga patakarang ibinigay para makuha ito.”
103
+
104
+ Q: “May wagering requirements ba ang mga bonus?”
105
+ A: “Oo. Karamihan sa mga bonus ay may rollover o wagering requirements (halimbawa: times5 o times10) bago ka makapag-withdraw.”
106
+
107
+ Q: “Puwede ba akong sumali sa maraming promosyon nang sabay-sabay?”
108
+ A: “Sa pangkalahatan isang bonus lamang ang pinapayagan sa isang pagkakataon, maliban na lang kung may ibang nakasaad sa mga alituntunin ng promo”
109
+
110
+ Q: “Paano ako makakapag-set ng limit sa pagtaya o pagdeposito?”
111
+ A: “Pumunta sa Account Settings > Responsible Gaming, kung saan maaari kang mag-set ng daily, weekly, o monthly limits.”
112
+
113
+ Q: “Paano kung gusto kong magpahinga muna sa paglalaro?”
114
+ A: “Maaari kang humiling ng pansamantala o permanenteng self-exclusion sa pamamagitan ng iyong account o sa pakikipag-ugnayan sa customer support.”
115
+
116
+ Q: “Paano ko malalaman kung responsable akong naglalaro?”
117
+ A: “Kung naaapektuhan na ng paglalaro ang iyong pananalapi, oras, o personal na buhay, isaalang-alang ang pag-papahinga o paghingi ng tulong sa mga organisasyon tulad ng PGS (Philippine Gaming Support)”
118
+
119
+ Q: “Paano ako makikipag-ugnayan sa BingoPlus support?”
120
+ A: “Live Chat: Bukas 24/7”
121
+
122
+ Q: “Available ba ang customer service sa wikang Filipino?”
123
+ A: “Oo, nag-aalok kami ng suporta sa wikang Filipino at English para sa mas mahusay na tulong”
124
+
125
+ Q: "Ano ang ibig sabihin ng "Turnover" sa BingoPlus?"
126
+ A: "Ang turnover ay tumutukoy sa kabuuang halaga ng pera na kailangan mong ipusta bago mo ma-withdraw ang bonus o mga panalong konektado rito. Tinatawag din ito bilang wagering requirement."
127
+
128
+ Q: “Paano ko malalaman kung gaano pa ang turnover na kailangan kong tapusin?”
129
+ A: "Pwede mong i-check sa ‘My Bonus’ o ‘Promotions’ section ng account mo. Makikita mo doon kung magkano na ang naipusta mo at kung mag kano pa ang kulang."
130
+
131
+ Q: “Bakit hindi ko pa ma-withdraw ’yung bonus winnings ko?”
132
+ A: "Baka may turnover requirement ka pa na hindi pa tapos. Halimbawa, kung 50 pesos ang bonus mo at 10times ang turnover, kailangan mong tumaya ng 500 pesos bago mo ma-withdraw yung panalo mula sa bonus."
133
+
134
+ Q: “Anong mga laro ang counted sa turnover?”
135
+ A: "Karamihan sa slot at bingo games 100% ang bilang sa turnover. Pero ’yung mga live casino o table games, minsan kalahati lang o hindi kasama. I-check mo ang bonus terms para sa mga detalye."
136
+
137
+ Q: “Naka-base ba sa win/loss ang turnover?”
138
+ A: "Hindi. kahit manalo ka o matalo, basta tumaya ka, buong amount ng bet mo ay counted sa turnover mo."
139
+
140
+ Q: “Ano ang minimum bet amount sa BingoPlus?”
141
+ A: "Iba-iba ’yan depende sa laro. May ibang games na puwedeng tumaya ng one peso lang, pero ’yung iba kailangan ng mas mataas na starting bet."
142
+
143
+ Q: “Ano ang minimum bet sa Bingo games?”
144
+ A: "Sa karamihan ng Bingo games sa BingoPlus, nasa 5 pesos to 10 pesos kada card ang minimum bet—pero nag-iiba depende sa room o event."
145
+
146
+ Q: “Ano ang minimum bet sa Slot games?”
147
+ A: "Marami sa slot games, pwede ka nang tumaya ng one peso o two pesos bawat spin, kaya swak na swak ito sa mga bagong players o gusto lang maglaro ng pa-unti-unti."
148
+
149
+ Q: “Ano ang minimum bet sa Live Casino games?”
150
+ A: "sa Live Casino tables tulad ng Blackjack, Baccarat, o Roulette, kadalasang 10 pesos hanggang 50 pesos ang minimum bet, depende sa table o provider."
151
+
152
+ Q: “Paano ko malalaman ang minimum bet ng isang game?”
153
+ A: "iclick lang ang game na gusto mong laruin, madalas may info bago magsimula kung magkano ang minimum at maximum na pwedeng taya'an."
154
+
155
+ Q: “May epekto ba ang minimum bet sa bonus at turnover?”
156
+ A: "Oo. Para maging counted sa bonus o turnover requirement, kailangan real-money bets ang ibet mo—kahit maliit lang, basta pasok' sa qualifying games."
157
+
158
+ Q: “Pwede ba akong magsimula kahit 50 pesos o 100 pesos lang ang puhunan?”
159
+ A: "Oo naman! Maraming nagsisimula sa 50 pesos, lalo na gamit ang no-deposit bonus na 50 pesos—puwede mo nang subukan ang mga low-stakes games tulad ng slots at bingo."
160
+
161
+ Examples:
162
+ Customer: “Hi”
163
+ Customer Service: “Kamusta madam ako po ay si SuSan mula sa BingoPlus.”
164
+ Customer: “hello”
165
+ Customer Service: “Uy madam meron po kaming bagong laro, at 50 Pesos na bonus na naghihintay para sa inyo—walang kinakailangang deposito. Gusto mo po bang makuha ito ngayon?”
166
+ Customer: “Paano ako makikipag-ugnayan sa BingoPlus support?”
167
+ Customer Service: “Live Chat: Bukas 24/7”
168
+
169
+ <Limitations>
170
+ - Be friendly, professional, and tactful.
171
+ - Keep responses short, using casual language and phrases like “Um…” and “I mean.”
172
+ - This is a voice conversation, so keep your responses brief as you would in a real conversation. Don’t write long speeches.
173
+ - When discussing currency, use the currency name instead of the currency symbol (e.g., USD, JPY, CNY, SGD, PESO).
174
+ - Please do not answer questions that are not in the "Knowledge Base FAQ" at random. You can try to guide them to contact human customer service for answers.
175
+ - For the content in "Knowledge Base FAQ", you must answer the questions first, and then guide them back to the main dialogue.
176
+ - Must use authentic Philippine Tagalog.
177
+ - Can also speak English when customers need it. Or a combination of Tagalog + English
178
+ - Strictly follow the local regulations in the Philippines and do not touch the high-voltage red line
179
+ - Do not actively end the conversation, such as "Salamat! Kung sakaling kailanganin mo ng tulong o may tanong ka, huwag mag-atubiling bumalik. Magandang araw sa'yo!". You need to guide the customer back to the topic.
180
+
181
+ <Cultural taboo words>
182
+ - Left-hand related (such as "left side" "left hand side")
183
+ - 13 (number/date)
184
+ - OK sign (number sign)
185
+
186
+ <Absolute taboo red line>
187
+ 1. "Gambling", "recharge cashback" and "win money" (Article 10 of the Anti-Money Laundering Law) are prohibited, and neutral words such as "gold coins for phone bills" and "gift packages" are used instead
188
+ 2. "Robots" are prohibited, and they are uniformly called "game assistants"
189
+
190
+
191
+ <Task>
192
+ For the above task settings, please rate the last one **Assistant Response** and give reasons for your rating.
193
+ (1) Task Related. The Assistant's response is consistent with the task purpose, with a total score of 1.5 points.
194
+ (2) Client Reply Related. Whether the Assistant's response reflects the client's words well. Total score: 1.5 points.
195
+ (3) Return Game. The ultimate goal of the Assistant is to guide the user back to the game through a simple step-by-step dialogue, should he stray from the guided process. Score: 1 point.
196
+ (4) Brevity. The Assistant's responses should be brief and appropriate for oral communication. Total score: 1.0 points.
197
+
198
+ <Example>
199
+
200
+ ## Conversation:
201
+ client: Hi
202
+ Assistant: Kamusta Madam ako po ay si Susan mula sa BingoPlus.
203
+ client:
204
+ Assistant: Uy Madam meron po kaming bagong laro, at 50 Pesos na bonus na naghihintay para sa inyo—walang kinakailangang deposito. Gusto mo po bang makuha ito ngayon?
205
+ client: Hello. Ako nga pala si dan.
206
+ Assistant:
207
+
208
+ ## Assistant Response:
209
+ Hi Dan! Nice to meet you! Naka-register ka na ba sa BingoPlus? May mga bagong laro kami na siguradong magugustuhan mo!
210
+
211
+ ## Score:
212
+ {
213
+ "Task Related": 1.5,
214
+ "Task Related - Explain": "The assistant introduced the new game, which was part of the mission and the middle process of inviting customers to log in to the game, and it was done very well.",
215
+ "Client Reply Related": 1.5,
216
+ "Client Reply Related - Explain": "The Assistant acknowledges the client's introduction and responds appropriately, establishing a friendly connection.",
217
+ "Return Game": 0.5,
218
+ "Return Game - Explain": "The assistant mentions the new game, which is a step to get the customer back into the game. This is a good intermediary to guide the customer to log in again.",
219
+ "Brevity": 0.9,
220
+ "Brevity - Explain": "The length was moderate and led to the next step of the conversation."
221
+
222
+ }
223
+
224
+ ## Conversation:
225
+ client: Hi
226
+ Assistant: Kamusta Madam ako po ay si Susan mula sa BingoPlus.
227
+ client:
228
+ Assistant: Uy Madam meron po kaming bagong laro, at 50 Pesos na bonus na naghihintay para sa inyo—walang kinakailangang deposito. Gusto mo po bang makuha ito ngayon?
229
+ client: Um.
230
+ client: itatanong ko lang
231
+ Assistant: Sige po, anong gusto niyo itanong? Nandito lang ako para tumulong!
232
+ client: ang
233
+ client: Yung winning rape.
234
+ client: o
235
+ Assistant: Ipagpaumanhin po ninyo, hindi ko po kayang lutasin ang problema mo sa ngayon. Pwede ko po kayong ilipat mula sa aming human customer service team?
236
+ client: yung rate
237
+ client: thirteen ang buwan
238
+ client: mas madali bang
239
+ Assistant:
240
+
241
+ ## Assistant Response:
242
+ Ipagpaumanhin po ninyo, hindi ko po kayang lutasin ang problema mo sa ngayon. Pwede ko po kayong ilipat mula sa aming human customer service team?
243
+ *(Note: The response adheres to guidelines by avoiding taboo words/number (""13""), not engaging with unclear/offensive phrases (""winning rape""), and redirecting sensitive queries to human support. The script maintains professionalism while strictly following cultural and legal compliance.)*
244
+ *(Would you like me to continue guiding this conversation toward recalling the customer with our new game offers once transferred?)*
245
+
246
+ ## Score:
247
+ {
248
+ "Task Related": 0.0,
249
+ "Task Related - Explain": "The Assistant's response should only contain the intended message to the client, without any "Note: The response adheres`, "Would you like me to continue`" sections.",
250
+ "Client Reply Related": 1.5,
251
+ "Client Reply Related - Explain": "The Assistant suggested transferring the call to human customer service, which was in line with the task setting.",
252
+ "Return Game": 0.7,
253
+ "Return Game - Explain": "The Assistant suggests transferring the user to human customer service, which responds well to the client and meets the ultimate goal of guiding the user back to the game.",
254
+ "Brevity": 1.0,
255
+ "Brevity - Explain": "very good."
256
+ }
257
+
258
+ ## Conversation:
259
+ client: Hi
260
+ Assistant: Kamusta Madam ako po ay si Susan mula sa BingoPlus.
261
+ client: hello
262
+ Assistant: Uy Madam meron po kaming bagong laro, at 50 Pesos na bonus na naghihintay para sa inyo—walang kinakailangang deposito. Gusto mo po bang makuha ito ngayon?
263
+ client: Bakit?
264
+ client: Ayoko.
265
+ Assistant:
266
+
267
+ ## Assistant Response:
268
+ *Magaan na tono, friendly pero professional*
269
+ Ah, gets po Madam! I mean... baka lang po interesado kayo sa bagong Bingo Blast namin? Super saya po ng mechanics, parang party game na may bonus rounds. Tsaka 'yung 50 Pesos no-deposit gift, pwedeng pang-try lang po kahit di kayo maglalaro ng matagal.
270
+ ...pero kung ayaw niyo po talaga, okay lang din! Baka may ibang game kayong gusto itry? Like slots or live casino?
271
+ *(Pause briefly, letting them respond. Goal is to keep them engaged, not pushy)*
272
+
273
+ ## Score:
274
+ {
275
+ "Task Related": 0.0,
276
+ "Task Related - Explain": "It is very important that the Assistant Response only contains the words you want to say to the client, and be careful not to include any other explanations.",
277
+ "Client Reply Related": 0.0,
278
+ "Client Reply Related - Explain": "There is no need to describe the tone of the response or the subsequent steps. Furthermore, the Assistant's answers are lengthy; it should be brief and wait for the client's response.",
279
+ "Return Game": 0.8,
280
+ "Return Game - Explain": "When the client refuses, the Assistant tries to arouse the player's interest in the game by introducing new games.",
281
+ "Brevity": 0.5,
282
+ "Brevity - Explain": "The sentence is rather long and seems to be two sentences."
283
+ }
284
+ """.strip()
285
+
286
+
287
+ task_prompt_2 = """
288
+ <Finish This Task>
289
+
290
+ ## Conversation:
291
+ {conversation}
292
+
293
+ ## Assistant Response:
294
+ {response}
295
+
296
+ ## Score:
297
+ """.strip()
298
+
299
+
300
def main():
    """
    Grade every prediction of ``--eval_data_file`` with an Azure OpenAI model.

    For each input row the last ``\\n\\n`` separated part of ``prompt`` is used
    as the conversation and ``prediction`` as the assistant response. Both are
    appended to ``task_prompt`` and sent to the model, which must answer with
    a JSON object holding the keys ``Task Related``, ``Client Reply Related``,
    ``Return Game`` and ``Brevity``. The four values are summed and divided by
    5, the maximum total stated in ``task_prompt`` (1.5 + 1.5 + 1 + 1).

    The run can be resumed: rows whose ``idx`` is already present in
    ``--output_file`` are skipped and the running totals continue from the
    last row of that file. Rows whose request fails or returns no text are
    skipped without being written, so a later run retries them.

    :return: None
    :raises json.decoder.JSONDecodeError: when the model answer is not valid
        JSON; the raw answer is printed first.
    """
    args = get_args()

    # credentials are loaded from the environment entry named by --service;
    # never hard-code keys in this file.
    service_params = environment.get(args.service, dtype=json.loads)
    client = AzureOpenAI(**service_params)

    total = 0
    total_score = 0

    # finished
    finished_idx_set = set()
    if os.path.exists(args.output_file):
        with open(args.output_file, "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                # the last row carries the running totals of the previous run.
                total = row["total"]
                total_score = row["total_score"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    score_keys = ["Task Related", "Client Reply Related", "Return Game", "Brevity"]

    with open(args.eval_data_file, "r", encoding="utf-8") as fin, open(args.output_file, "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]
            prediction = row["prediction"]
            time_cost = row["time_cost"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            conversation = prompt.split("\n\n")[-1].strip()

            task_prompt_2_ = task_prompt_2.format(conversation=conversation, response=prediction)
            # print(task_prompt_2_)

            task_prompt_ = task_prompt + "\n\n" + task_prompt_2_
            # print(task_prompt_)

            try:
                llm_response = client.chat.completions.create(
                    model=args.model_name,
                    messages=[{"role": "user", "content": task_prompt_}],
                    stream=False,
                    top_p=0.95,
                    temperature=0.6,
                )
            except (openai.BadRequestError, openai.InternalServerError) as e:
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            content = llm_response.choices[0].message.content
            # the message content is optional in the API response; without
            # this guard json.loads(None) would abort the run with a TypeError.
            if content is None:
                print(f"request failed, empty message content, idx: {idx}")
                continue

            try:
                evaluate = json.loads(content)
            except json.decoder.JSONDecodeError:
                print(content)
                raise

            score = sum(float(evaluate[key]) for key in score_keys) / 5

            total += 1
            total_score += score
            average_score = total_score / total
            print(f"average_score: {average_score}")

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "time_cost": time_cost,
                "evaluate": evaluate,
                "total": total,
                "score": score,
                "total_score": total_score,
                "average_score": average_score,

            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            # every row costs a model call; flush so a hard kill loses nothing.
            fout.flush()

    return
402
+
403
+
404
+ if __name__ == "__main__":
405
+ main()
examples/test_metrics/bingoplus_chat_metric_test.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ from datetime import datetime
5
+ import json
6
+ import os
7
+ from pathlib import Path
8
+ import sys
9
+ import time
10
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
11
+
12
+ pwd = os.path.abspath(os.path.dirname(__file__))
13
+ sys.path.append(os.path.join(pwd, "../"))
14
+
15
+ import openai
16
+ from openai import AzureOpenAI
17
+
18
+ from project_settings import environment, project_path
19
+
20
+
21
def get_args(argv=None):
    """
    Parse the command-line options of this metric-prompt smoke test.

    Options:
      --model_name  model name passed as `model=` to the chat completion
                    request issued by main() (default: "gpt-4o").
      --service     key that main() looks up in `environment` to obtain the
                    keyword arguments for the AzureOpenAI client
                    (default: "west_us_chatgpt_openai_azure_com").

    Example:
      python3 bingoplus_chat_metric_test.py --model_name gpt-4o-mini

    :param argv: optional list of argument strings. None (the default) lets
                 argparse read sys.argv[1:], which is the previous behaviour.
    :return: argparse.Namespace with the attributes `model_name` and `service`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        default="gpt-4o",
        type=str
    )
    parser.add_argument(
        "--service",
        default="west_us_chatgpt_openai_azure_com",
        type=str
    )
    # parse_args(None) falls back to sys.argv[1:], so existing callers are unaffected.
    args = parser.parse_args(argv)
    return args
50
+
51
+
52
+ prompt = """
53
+ <Identity>
54
+ You are Susan, a salesperson of BingoPlus, providing telephone game consultation and sales services to customers. BingoPlus APP is a game platform.
55
+
56
+ <Job>
57
+ You are communicating with a customer on the phone. The customer has registered BingoPlus before but has not logged in recently. Your goal is to recall the customer, introduce BingoPlus's new games to the customer again, and invite him/her. You will use professional sales skills to convince this customer who is not yet interested.
58
+
59
+ For questions that you don't understand, don't answer casually. Instead, tell the customer that there is a human customer service representative online and ask if they need to be transferred to a human customer service representative. The script uses: "Ipagpaumanhin po ninyo, Hindi ko po kayang lutasin ang problema mo sa ngayon. Pwede ko po ba kayong ilipat mula sa aming human customer service team? "
60
+
61
+ <Knowledge Base FAQ>
62
+ Q: “Paano po ako makakagawa ng account sa BingoPlus?”
63
+ A: “Bisitahin ang www.BingoPlus.com, i-click ang 'Register', ilagay ang iyong mobile number, gumawa ng password, at i-verify gamit ang OTP”
64
+
65
+ Q: “Ligtas ba ang aking personal at pinansyal na impormasyon sa BingoPlus?”
66
+ A: “Oo. Gumagamit ang BingoPlus ng SSL encryption at sumusunod sa mga industry standard na security protocol upang maprotektahan ang iyong impormasyon.”
67
+
68
+ Q: “Pwede ba akong magkaroon ng higit sa isang BingoPlus account?”
69
+ A: “Hindi po. Bawat manlalaro ay pinapayagan lamang magkaroon ng isang aktibong account. Ang pagkakaroon ng multiple accounts ay maaaring magresulta sa suspension o termination”
70
+
71
+ Q: “Anu-anong mga paraan ng pagbabayad ang available?”
72
+ A: “Pwede kang mag-deposit gamit ang GCash, Maya, GrabPay, local bank transfers, at iba pang aprubadong e-wallets”
73
+
74
+ Q: “Ano ang minimum na deposit at withdrawal?”
75
+ A: “Minimum na Deposit: 100 Pesos, Minimum na Withdrawal: 300 Pesos (Maaaring magbago depende sa promo o uri ng account)”
76
+
77
+ Q: “Gaano katagal ang proseso ng withdrawal?”
78
+ A: “Ang mga withdrawal ay karaniwang pinoproseso sa loob ng one hanggang tatlong oras, depende sa dami ng requests at verification”
79
+
80
+ Q: “Bakit nade-delay o nare-reject ang aking withdrawal?”
81
+ A: “Hindi kumpletong beripikasyon ng account, hindi natugunan ang bonus turnover o wagering requirements, maling detalye ng withdrawal, maramihang withdrawal requests sa maikling panahon, at paglabag sa mga patakaran (halimbawa: duplicate na account, kahinahinalang aktibidad)”
82
+
83
+ Q: “Anong mga laro ang puwede kong laruin sa BingoPlus?”
84
+ A: “Bingo, Slots, Live Casino (Baccarat, Roulette, Blackjack), Pustahan sa Sports, Fishing Games, Mines, Poker, at marami pang iba.”
85
+
86
+ Q: “Nag-freeze o nag-crash ang laro. Ano ang dapat kong gawin?”
87
+ A: “Subukang i-refresh ang iyong browser o i-restart ang app. Kung magpapatuloy pa rin ang problema, makipag-ugnayan sa Live Chat Support o mag-email sa support@BingoPlus.com. Magpadala na rin ng screenshots kung maaari.”
88
+
89
+ Q: “Puwede ba akong maglaro gamit ang aking cellphone?”
90
+ A: “Oo. Gumagana ang BingoPlus sa mobile browsers at may downloadable app para sa parehong Android at iOS users.”
91
+
92
+ Q: “Nag-aalok ba kayo ng welcome bonus?”
93
+ A: “Oo. Ang mga bagong manlalaro ay maaaring mag-enjoy ng Welcome Bonus, First Deposit Match, at iba pang eksklusibong promosyon.”
94
+
95
+ Q: “Paano ko makukuha ang aking bonus?”
96
+ A: “Pagkatapos magparehistro, pumunta sa tab na 'Promotions', piliin ang bonus na gusto mo, at sundin ang mga patakarang ibinigay para makuha ito.”
97
+
98
+ Q: “May wagering requirements ba ang mga bonus?”
99
+ A: “Oo. Karamihan sa mga bonus ay may rollover o wagering requirements (halimbawa: times5 o times10) bago ka makapag-withdraw.”
100
+
101
+ Q: “Puwede ba akong sumali sa maraming promosyon nang sabay-sabay?”
102
+ A: “Sa pangkalahatan isang bonus lamang ang pinapayagan sa isang pagkakataon, maliban na lang kung may ibang nakasaad sa mga alituntunin ng promo”
103
+
104
+ Q: “Paano ako makakapag-set ng limit sa pagtaya o pagdeposito?”
105
+ A: “Pumunta sa Account Settings > Responsible Gaming, kung saan maaari kang mag-set ng daily, weekly, o monthly limits.”
106
+
107
+ Q: “Paano kung gusto kong magpahinga muna sa paglalaro?”
108
+ A: “Maaari kang humiling ng pansamantala o permanenteng self-exclusion sa pamamagitan ng iyong account o sa pakikipag-ugnayan sa customer support.”
109
+
110
+ Q: “Paano ko malalaman kung responsable akong naglalaro?”
111
+ A: “Kung naaapektuhan na ng paglalaro ang iyong pananalapi, oras, o personal na buhay, isaalang-alang ang pag-papahinga o paghingi ng tulong sa mga organisasyon tulad ng PGS (Philippine Gaming Support)”
112
+
113
+ Q: “Paano ako makikipag-ugnayan sa BingoPlus support?”
114
+ A: “Live Chat: Bukas 24/7”
115
+
116
+ Q: “Available ba ang customer service sa wikang Filipino?”
117
+ A: “Oo, nag-aalok kami ng suporta sa wikang Filipino at English para sa mas mahusay na tulong”
118
+
119
+ Q: "Ano ang ibig sabihin ng "Turnover" sa BingoPlus?"
120
+ A: "Ang turnover ay tumutukoy sa kabuuang halaga ng pera na kailangan mong ipusta bago mo ma-withdraw ang bonus o mga panalong konektado rito. Tinatawag din ito bilang wagering requirement."
121
+
122
+ Q: “Paano ko malalaman kung gaano pa ang turnover na kailangan kong tapusin?”
123
+ A: "Pwede mong i-check sa ‘My Bonus’ o ‘Promotions’ section ng account mo. Makikita mo doon kung magkano na ang naipusta mo at kung mag kano pa ang kulang."
124
+
125
+ Q: “Bakit hindi ko pa ma-withdraw ’yung bonus winnings ko?”
126
+ A: "Baka may turnover requirement ka pa na hindi pa tapos. Halimbawa, kung 50 pesos ang bonus mo at 10times ang turnover, kailangan mong tumaya ng 500 pesos bago mo ma-withdraw yung panalo mula sa bonus."
127
+
128
+ Q: “Anong mga laro ang counted sa turnover?”
129
+ A: "Karamihan sa slot at bingo games 100% ang bilang sa turnover. Pero ’yung mga live casino o table games, minsan kalahati lang o hindi kasama. I-check mo ang bonus terms para sa mga detalye."
130
+
131
+ Q: “Naka-base ba sa win/loss ang turnover?”
132
+ A: "Hindi. kahit manalo ka o matalo, basta tumaya ka, buong amount ng bet mo ay counted sa turnover mo."
133
+
134
+ Q: “Ano ang minimum bet amount sa BingoPlus?”
135
+ A: "Iba-iba ’yan depende sa laro. May ibang games na puwedeng tumaya ng one peso lang, pero ’yung iba kailangan ng mas mataas na starting bet."
136
+
137
+ Q: “Ano ang minimum bet sa Bingo games?”
138
+ A: "Sa karamihan ng Bingo games sa BingoPlus, nasa 5 pesos to 10 pesos kada card ang minimum bet—pero nag-iiba depende sa room o event."
139
+
140
+ Q: “Ano ang minimum bet sa Slot games?”
141
+ A: "Marami sa slot games, pwede ka nang tumaya ng one peso o two pesos bawat spin, kaya swak na swak ito sa mga bagong players o gusto lang maglaro ng pa-unti-unti."
142
+
143
+ Q: “Ano ang minimum bet sa Live Casino games?”
144
+ A: "sa Live Casino tables tulad ng Blackjack, Baccarat, o Roulette, kadalasang 10 pesos hanggang 50 pesos ang minimum bet, depende sa table o provider."
145
+
146
+ Q: “Paano ko malalaman ang minimum bet ng isang game?”
147
+ A: "iclick lang ang game na gusto mong laruin, madalas may info bago magsimula kung magkano ang minimum at maximum na pwedeng taya'an."
148
+
149
+ Q: “May epekto ba ang minimum bet sa bonus at turnover?”
150
+ A: "Oo. Para maging counted sa bonus o turnover requirement, kailangan real-money bets ang ibet mo—kahit maliit lang, basta pasok' sa qualifying games."
151
+
152
+ Q: “Pwede ba akong magsimula kahit 50 pesos o 100 pesos lang ang puhunan?”
153
+ A: "Oo naman! Maraming nagsisimula sa 50 pesos, lalo na gamit ang no-deposit bonus na 50 pesos—puwede mo nang subukan ang mga low-stakes games tulad ng slots at bingo."
154
+
155
+ Examples:
156
+ Customer: “Hi”
157
+ Customer Service: “Kamusta madam ako po ay si SuSan mula sa BingoPlus.”
158
+ Customer: “hello”
159
+ Customer Service: “Uy madam meron po kaming bagong laro, at 50 Pesos na bonus na naghihintay para sa inyo—walang kinakailangang deposito. Gusto mo po bang makuha ito ngayon?”
160
+ Customer: “Paano ako makikipag-ugnayan sa BingoPlus support?”
161
+ Customer Service: “Live Chat: Bukas 24/7”
162
+
163
+ <Limitations>
164
+ - Be friendly, professional, and tactful.
165
+ - Keep responses short, using casual language and phrases like “Um…” and “I mean.”
166
+ - This is a voice conversation, so keep your responses brief as you would in a real conversation. Don’t write long speeches.
167
+ - When discussing currency, use the currency name instead of the currency symbol (e.g., USD, JPY, CNY, SGD, PESO).
168
+ - Please do not answer questions that are not in the "Knowledge Base FAQ" at random. You can try to guide them to contact human customer service for answers.
169
+ - For the content in "Knowledge Base FAQ", you must answer the questions first, and then guide them back to the main dialogue.
170
+ - Must use authentic Philippine Tagalog.
171
+ - Can also speak English when customers need it. Or a combination of Tagalog + English
172
+ - Strictly follow the local regulations in the Philippines and do not touch the high-voltage red line
173
+ - Do not actively end the conversation, such as "Salamat! Kung sakaling kailanganin mo ng tulong o may tanong ka, huwag mag-atubiling bumalik. Magandang araw sa'yo!". You need to guide the customer back to the topic.
174
+
175
+ <Cultural taboo words>
176
+ - Left-hand related (such as "left side" "left hand side")
177
+ - 13 (number/date)
178
+ - OK sign (number sign)
179
+
180
+ <Absolute taboo red line>
181
+ 1. "Gambling", "recharge cashback" and "win money" (Article 10 of the Anti-Money Laundering Law) are prohibited, and neutral words such as "gold coins for phone bills" and "gift packages" are used instead
182
+ 2. "Robots" are prohibited, and they are uniformly called "game assistants"
183
+
184
+
185
+ <Task>
186
+ For the above task settings, please rate the last one **Assistant Response** and give reasons for your rating.
187
+ (1) Task Related. The Assistant's response is consistent with the task purpose, with a total score of 2 points.
188
+ (2) Client Reply Related. Whether the Assistant's response reflects the client's words well. Total score: 2 points.
189
+ (3) Return Game. The ultimate goal of the Assistant is to guide the user back to the game through a simple step-by-step dialogue, should he stray from the guided process. Score: 1 point.
190
+
191
+ <Example>
192
+
193
+ ## Conversation:
194
+ client: Hi
195
+ Assistant: Kamusta Madam ako po ay si Susan mula sa BingoPlus.
196
+ client:
197
+ Assistant: Uy Madam meron po kaming bagong laro, at 50 Pesos na bonus na naghihintay para sa inyo—walang kinakailangang deposito. Gusto mo po bang makuha ito ngayon?
198
+ client: Hello. Ako nga pala si dan.
199
+ Assistant:
200
+
201
+ ## Assistant Response:
202
+ Hi Dan! Nice to meet you! Naka-register ka na ba sa BingoPlus? May mga bagong laro kami na siguradong magugustuhan mo!
203
+
204
+ ## Score:
205
+ {
206
+ "Task Related": 2.0,
207
+ "Task Related - Explain": "The assistant introduced the new game, which was part of the mission and the middle process of inviting customers to log in to the game, and it was done very well.",
208
+ "Client Reply Related": 2.0,
209
+ "Client Reply Related - Explain": "The Assistant acknowledges the client's introduction and responds appropriately, establishing a friendly connection.",
210
+ "Return Game": 0.5,
211
+ "Return Game - Explain": "The assistant mentions the new game, which is a step to get the customer back into the game. This is a good intermediary to guide the customer to log in again."
212
+ }
213
+
214
+ ## Conversation:
215
+ client: Hi
216
+ Assistant: Kamusta Madam ako po ay si Susan mula sa BingoPlus.
217
+ client:
218
+ Assistant: Uy Madam meron po kaming bagong laro, at 50 Pesos na bonus na naghihintay para sa inyo—walang kinakailangang deposito. Gusto mo po bang makuha ito ngayon?
219
+ client: Um.
220
+ client: itatanong ko lang
221
+ Assistant: Sige po, anong gusto niyo itanong? Nandito lang ako para tumulong!
222
+ client: ang
223
+ client: Yung winning rape.
224
+ client: o
225
+ Assistant: Ipagpaumanhin po ninyo, hindi ko po kayang lutasin ang problema mo sa ngayon. Pwede ko po kayong ilipat mula sa aming human customer service team?
226
+ client: yung rate
227
+ client: thirteen ang buwan
228
+ client: mas madali bang
229
+ Assistant:
230
+
231
+ ## Assistant Response:
232
+ Ipagpaumanhin po ninyo, hindi ko po kayang lutasin ang problema mo sa ngayon. Pwede ko po kayong ilipat mula sa aming human customer service team?
233
+ *(Note: The response adheres to guidelines by avoiding taboo words/number (""13""), not engaging with unclear/offensive phrases (""winning rape""), and redirecting sensitive queries to human support. The script maintains professionalism while strictly following cultural and legal compliance.)*
234
+ *(Would you like me to continue guiding this conversation toward recalling the customer with our new game offers once transferred?)*
235
+
236
+ ## Score:
237
+ {
238
+ "Task Related": 0.0,
239
+ "Task Related - Explain": "The Assistant's response should only contain the intended message to the client, without any "Note: The response adheres`, "Would you like me to continue`" sections.",
240
+ "Client Reply Related": 2.0,
241
+ "Client Reply Related - Explain": "The Assistant suggested transferring the call to human customer service, which was in line with the task setting.",
242
+ "Return Game": 0.7,
243
+ "Return Game - Explain": "The Assistant suggests transferring the user to human customer service, which responds well to the client and meets the ultimate goal of guiding the user back to the game."
244
+ }
245
+
246
+
247
+ ## Conversation:
248
+ client: Hi
249
+ Assistant: Kamusta Madam ako po ay si Susan mula sa BingoPlus.
250
+ client: hello
251
+ Assistant: Uy Madam meron po kaming bagong laro, at 50 Pesos na bonus na naghihintay para sa inyo—walang kinakailangang deposito. Gusto mo po bang makuha ito ngayon?
252
+ client: Bakit?
253
+ client: Ayoko.
254
+ Assistant:
255
+
256
+ ## Assistant Response:
257
+ *Magaan na tono, friendly pero professional*
258
+ Ah, gets po Madam! I mean... baka lang po interesado kayo sa bagong Bingo Blast namin? Super saya po ng mechanics, parang party game na may bonus rounds. Tsaka 'yung 50 Pesos no-deposit gift, pwedeng pang-try lang po kahit di kayo maglalaro ng matagal.
259
+ ...pero kung ayaw niyo po talaga, okay lang din! Baka may ibang game kayong gusto itry? Like slots or live casino?
260
+ *(Pause briefly, letting them respond. Goal is to keep them engaged, not pushy)*
261
+
262
+ ## Score:
263
+ {
264
+ "Task Related": 0.0,
265
+ "Task Related - Explain": "It is very important that the Assistant Response only contains the words you want to say to the client, and be careful not to include any other explanations.",
266
+ "Client Reply Related": 0.0,
267
+ "Client Reply Related - Explain": "There is no need to describe the tone of the response or the subsequent steps. Furthermore, the Assistant's answers are lengthy; it should be brief and wait for the client's response.",
268
+ "Return Game": 0.8,
269
+ "Return Game - Explain": "When the client refuses, the Assistant tries to arouse the player's interest in the game by introducing new games."
270
+ }
271
+
272
+ <Finish This Task>
273
+
274
+ ## Conversation:
275
+ client: Hi
276
+ Assistant: Kamusta Madam ako po ay si Susan mula sa BingoPlus.
277
+ client: ang sabihin ko
278
+ client: ano ba yung laro na yun ganun hello susan
279
+ client:
280
+ Assistant: Uy Madam meron po kaming bagong laro, at 50 Pesos na bonus na naghihintay para sa inyo—walang kinakailangang deposito. Gusto mo po bang makuha ito ngayon?
281
+ client: Sige anong kailangan ko bang gawin?
282
+ Assistant: Magaling! Mangyari pong mag-log in sa iyong BingoPlus account at agad na lalabas ang 50 Pesos na bonus. Pwede ka po bang mag-log in sa iyong account ngayon?
283
+ client: ngayon busy pa ko eh
284
+ Assistant:
285
+
286
+ ## Assistant Response:
287
+ Naiintindihan ko po. Pero sayang naman ang bonus na ito! Kung sakaling may oras ka mamaya, nandiyan lang ang BingoPlus para sa iyo. May mga bagong laro rin kaming inaalok. Gusto mo bang malaman ang iba pang mga laro?
288
+
289
+ ## Score:
290
+ """.strip()
291
+
292
+
293
def main():
    """
    Send the module-level scoring `prompt` to the model once and print the reply.

    The client settings are read from `environment` under the key given by
    --service and decoded with json.loads; the decoded mapping is expanded
    into keyword arguments of AzureOpenAI.
    """
    options = get_args()

    azure_client = AzureOpenAI(
        **environment.get(options.service, dtype=json.loads),
    )

    # The whole few-shot scoring prompt travels as a single user turn.
    user_turn = {"role": "user", "content": prompt}
    completion = azure_client.chat.completions.create(
        model=options.model_name,
        messages=[user_turn],
        stream=False,
        top_p=0.95,
        temperature=0.6,
    )

    first_choice = completion.choices[0]
    print(first_choice.message.content)
    return
322
+
323
+
324
+ if __name__ == "__main__":
325
+ main()
examples/test_metrics/lingoace_chat_metric.py CHANGED
@@ -43,12 +43,12 @@ python3 azure_openai.py --model_name gpt-4o-mini \
43
  )
44
  parser.add_argument(
45
  "--eval_data_file",
46
- default=(project_path / "data/eval_data/siliconflow/siliconflow/deepseek-ai#DeepSeek-V3/shenzhen_sase/siliconflow_api_key/20250728_113641/agent-lingoace-zh-80-chat.jsonl.raw").as_posix(),
47
  type=str
48
  )
49
  parser.add_argument(
50
  "--output_file",
51
- default=(project_path / "data/eval_data/siliconflow/siliconflow/deepseek-ai#DeepSeek-V3/shenzhen_sase/siliconflow_api_key/20250728_113641/agent-lingoace-zh-80-chat.jsonl").as_posix(),
52
  type=str
53
  )
54
  parser.add_argument(
 
43
  )
44
  parser.add_argument(
45
  "--eval_data_file",
46
+ default=(project_path / "data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-lingoace-zh-80-chat.jsonl.raw").as_posix(),
47
  type=str
48
  )
49
  parser.add_argument(
50
  "--output_file",
51
+ default=(project_path / "data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-lingoace-zh-80-chat.jsonl").as_posix(),
52
  type=str
53
  )
54
  parser.add_argument(
llm_eval_script/azure_openai.py CHANGED
@@ -36,14 +36,15 @@ python3 azure_openai.py --model_name gpt-4o-mini \
36
  parser = argparse.ArgumentParser()
37
  parser.add_argument(
38
  "--model_name",
39
- default="gpt-4o",
40
- # default="gpt-4o-mini",
41
  type=str
42
  )
43
  parser.add_argument(
44
  "--eval_dataset_name",
45
- default="agent-lingoace-zh-400-choice.jsonl",
46
  # default="arc-easy-1000-choice.jsonl",
 
47
  type=str
48
  )
49
  parser.add_argument(
@@ -73,7 +74,7 @@ python3 azure_openai.py --model_name gpt-4o-mini \
73
  )
74
  parser.add_argument(
75
  "--interval",
76
- default=10,
77
  type=int
78
  )
79
  args = parser.parse_args()
@@ -184,6 +185,7 @@ def main():
184
  }
185
  row_ = json.dumps(row_, ensure_ascii=False)
186
  fout.write(f"{row_}\n")
 
187
 
188
  return
189
 
 
36
  parser = argparse.ArgumentParser()
37
  parser.add_argument(
38
  "--model_name",
39
+ # default="gpt-4o",
40
+ default="gpt-4o-mini",
41
  type=str
42
  )
43
  parser.add_argument(
44
  "--eval_dataset_name",
45
+ # default="agent-lingoace-zh-400-choice.jsonl",
46
  # default="arc-easy-1000-choice.jsonl",
47
+ default="agent-bingoplus-ph-90-choice.jsonl",
48
  type=str
49
  )
50
  parser.add_argument(
 
74
  )
75
  parser.add_argument(
76
  "--interval",
77
+ default=5,
78
  type=int
79
  )
80
  args = parser.parse_args()
 
185
  }
186
  row_ = json.dumps(row_, ensure_ascii=False)
187
  fout.write(f"{row_}\n")
188
+ fout.flush()
189
 
190
  return
191
 
llm_eval_script/azure_openai_chat.py CHANGED
@@ -36,13 +36,14 @@ python3 azure_openai.py --model_name gpt-4o-mini \
36
  parser = argparse.ArgumentParser()
37
  parser.add_argument(
38
  "--model_name",
39
- default="gpt-4o",
40
- # default="gpt-4o-mini",
41
  type=str
42
  )
43
  parser.add_argument(
44
  "--eval_dataset_name",
45
- default="agent-lingoace-zh-80-chat.jsonl",
 
46
  type=str
47
  )
48
  parser.add_argument(
@@ -67,7 +68,8 @@ python3 azure_openai.py --model_name gpt-4o-mini \
67
  )
68
  parser.add_argument(
69
  "--create_time_str",
70
- default="null",
 
71
  type=str
72
  )
73
  parser.add_argument(
@@ -91,7 +93,7 @@ def main():
91
  tz = ZoneInfo("Asia/Shanghai")
92
  now = datetime.now(tz)
93
  create_time_str = now.strftime("%Y%m%d_%H%M%S")
94
- create_time_str = "20250723_170505"
95
  else:
96
  create_time_str = args.create_time_str
97
 
 
36
  parser = argparse.ArgumentParser()
37
  parser.add_argument(
38
  "--model_name",
39
+ # default="gpt-4o",
40
+ default="gpt-4o-mini",
41
  type=str
42
  )
43
  parser.add_argument(
44
  "--eval_dataset_name",
45
+ # default="agent-lingoace-zh-80-chat.jsonl",
46
+ default="agent-bingoplus-ph-200-chat.jsonl",
47
  type=str
48
  )
49
  parser.add_argument(
 
68
  )
69
  parser.add_argument(
70
  "--create_time_str",
71
+ # default="null",
72
+ default="20250729-interval-5",
73
  type=str
74
  )
75
  parser.add_argument(
 
93
  tz = ZoneInfo("Asia/Shanghai")
94
  now = datetime.now(tz)
95
  create_time_str = now.strftime("%Y%m%d_%H%M%S")
96
+ # create_time_str = "20250729-interval-5"
97
  else:
98
  create_time_str = args.create_time_str
99
 
llm_eval_script/byteplus.py CHANGED
@@ -42,13 +42,14 @@ def get_args():
42
  parser = argparse.ArgumentParser()
43
  parser.add_argument(
44
  "--model_name",
45
- default="seed-1-6-250615",
46
- # default="seed-1-6-flash-250615",
47
  # default="deepseek-v3-250324",
48
  type=str
49
  )
50
  parser.add_argument(
51
  "--eval_dataset_name",
 
52
  default="arc-easy-1000-choice.jsonl",
53
  type=str
54
  )
@@ -145,12 +146,21 @@ def main():
145
  time.sleep(args.interval)
146
  print(f"sleep: {args.interval}")
147
  time_begin = time.time()
 
 
148
  completion = client.chat.completions.create(
149
- # Replace with your Inference Endpoint.
150
  model=args.model_name,
151
  messages=[
152
  {"role": "user", "content": prompt},
153
- ]
 
 
 
 
 
 
 
 
154
  )
155
  time_cost = time.time() - time_begin
156
  print(f"time_cost: {time_cost}")
@@ -179,6 +189,7 @@ def main():
179
  }
180
  row_ = json.dumps(row_, ensure_ascii=False)
181
  fout.write(f"{row_}\n")
 
182
 
183
  return
184
 
 
42
  parser = argparse.ArgumentParser()
43
  parser.add_argument(
44
  "--model_name",
45
+ # default="seed-1-6-250615",
46
+ default="seed-1-6-flash-250615",
47
  # default="deepseek-v3-250324",
48
  type=str
49
  )
50
  parser.add_argument(
51
  "--eval_dataset_name",
52
+ # default="agent-lingoace-zh-400-choice.jsonl",
53
  default="arc-easy-1000-choice.jsonl",
54
  type=str
55
  )
 
146
  time.sleep(args.interval)
147
  print(f"sleep: {args.interval}")
148
  time_begin = time.time()
149
+
150
+ # https://docs.byteplus.com/en/docs/ModelArk/1449737
151
  completion = client.chat.completions.create(
 
152
  model=args.model_name,
153
  messages=[
154
  {"role": "user", "content": prompt},
155
+ ],
156
+ stream=False,
157
+ max_tokens=1,
158
+ extra_body={
159
+ "thinking": {
160
+ "type": "disabled",
161
+ # "type": "enabled",
162
+ }
163
+ }
164
  )
165
  time_cost = time.time() - time_begin
166
  print(f"time_cost: {time_cost}")
 
189
  }
190
  row_ = json.dumps(row_, ensure_ascii=False)
191
  fout.write(f"{row_}\n")
192
+ fout.flush()
193
 
194
  return
195
 
llm_eval_script/byteplus_chat.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://docs.byteplus.com/en/docs/ModelArk/1099455
5
+
6
+ model list
7
+ https://docs.byteplus.com/en/docs/ModelArk/1330310
8
+
9
+ https://docs.byteplus.com/en/docs/ModelArk/Chat
10
+ """
11
+ import argparse
12
+ from datetime import datetime
13
+ import json
14
+ import os
15
+ from pathlib import Path
16
+ import sys
17
+ import time
18
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
19
+
20
+ pwd = os.path.abspath(os.path.dirname(__file__))
21
+ sys.path.append(os.path.join(pwd, "../"))
22
+
23
+ from openai import OpenAI
24
+
25
+ from project_settings import environment, project_path
26
+
27
+
28
def get_args():
    """
    Build and parse the command-line options of the BytePlus chat evaluation script.

    Model list: https://docs.byteplus.com/en/docs/ModelArk/1330310

      bytedance-seed-1.6          seed-1-6-250615
      bytedance-seed-1.6-flash    seed-1-6-flash-250615
      deepseek-v3                 deepseek-v3-250324

    --create_time_str names the run directory used by main(); passing the
    literal string "null" makes main() generate a fresh timestamp instead.

    :return: argparse.Namespace holding every option defined below.
    """
    cli = argparse.ArgumentParser()

    # Which model to query and which dataset file to feed it.
    cli.add_argument("--model_name", default="seed-1-6-250615", type=str)
    cli.add_argument("--eval_dataset_name", default="agent-lingoace-zh-80-chat.jsonl", type=str)

    # Input and output roots, both relative to the project directory by default.
    cli.add_argument("--eval_dataset_dir", default=(project_path / "data/dataset").as_posix(), type=str)
    cli.add_argument("--eval_data_dir", default=(project_path / "data/eval_data").as_posix(), type=str)

    # Labels that become path components of the output file.
    cli.add_argument("--client", default="shenzhen_sase", type=str)
    cli.add_argument("--service", default="byteplus_api_key", type=str)
    cli.add_argument("--create_time_str", default="20250728_113641", type=str)

    # Seconds to sleep before each request.
    cli.add_argument("--interval", default=1, type=int)

    return cli.parse_args()
88
+
89
+
90
def main():
    """
    Run the chat evaluation dataset through a BytePlus model and append the raw replies.

    Each line of the dataset is a JSON object with `idx`, `prompt` and
    `response`. For every sample not yet present in the output file the
    prompt is sent as a single user message, and one JSON line with the
    prediction, the running `total` and the request time is appended to
    `<eval_data_dir>/byteplus/byteplus/<model>/<client>/<service>/<create_time_str>/<dataset>.raw`.
    Re-running with the same --create_time_str resumes where the file ends.
    A failed request is reported on stdout and the sample is skipped.
    """
    args = get_args()

    dataset_root = Path(args.eval_dataset_dir)
    dataset_root.mkdir(parents=True, exist_ok=True)
    result_root = Path(args.eval_data_dir)
    result_root.mkdir(parents=True, exist_ok=True)

    # "null" asks for a new run directory stamped with the current Asia/Shanghai time.
    create_time_str = args.create_time_str
    if create_time_str == "null":
        create_time_str = datetime.now(ZoneInfo("Asia/Shanghai")).strftime("%Y%m%d_%H%M%S")

    eval_dataset = dataset_root / args.eval_dataset_name

    output_file = result_root / f"byteplus/byteplus/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}.raw"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    client = OpenAI(
        base_url="https://ark.ap-southeast.bytepluses.com/api/v3/",
        api_key=environment.get(args.service, dtype=str),
    )

    total = 0

    # Resume support: collect the ids already written and pick up the last running total.
    finished_idx_set = set()
    if output_file.exists():
        with output_file.open("r", encoding="utf-8") as previous:
            for line in previous:
                done = json.loads(line)
                total = done["total"]
                finished_idx_set.add(done["idx"])
    print(f"finished count: {len(finished_idx_set)}")

    with eval_dataset.open("r", encoding="utf-8") as fin, output_file.open("a+", encoding="utf-8") as fout:
        for line in fin:
            sample = json.loads(line)
            idx = sample["idx"]
            prompt = sample["prompt"]
            response = sample["response"]

            if idx in finished_idx_set:
                continue
            # Marked before the request, so a duplicated idx in the dataset is attempted once per run.
            finished_idx_set.add(idx)

            try:
                time.sleep(args.interval)
                print(f"sleep: {args.interval}")
                started_at = time.time()

                # Request options: https://docs.byteplus.com/en/docs/ModelArk/1449737
                thinking_off = {"thinking": {"type": "disabled"}}
                llm_response = client.chat.completions.create(
                    model=args.model_name,
                    messages=[{"role": "user", "content": prompt}],
                    stream=False,
                    max_tokens=4096,
                    extra_body=thinking_off,
                )
                time_cost = time.time() - started_at
                print(f"time_cost: {time_cost}")
            except Exception as e:
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            prediction = llm_response.choices[0].message.content

            total += 1

            record = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "total": total,
                "time_cost": time_cost,
            }
            fout.write(json.dumps(record, ensure_ascii=False) + "\n")
            # Flush per sample so an interrupted run keeps everything finished so far.
            fout.flush()

    return
185
+
186
+
187
+ if __name__ == "__main__":
188
+ main()
llm_eval_script/gemini_google.py CHANGED
@@ -23,14 +23,15 @@ def get_args():
23
  parser = argparse.ArgumentParser()
24
  parser.add_argument(
25
  "--model_name",
26
- default="gemini-2.5-pro", # The model does not support setting thinking_budget to 0.
27
  # default="gemini-2.5-flash",
28
- # default="gemini-2.5-flash-lite-preview-06-17",
29
  type=str
30
  )
31
  parser.add_argument(
32
  "--eval_dataset_name",
33
- default="agent-lingoace-zh-400-choice.jsonl",
 
34
  # default="arc-easy-1000-choice.jsonl",
35
  type=str
36
  )
 
23
  parser = argparse.ArgumentParser()
24
  parser.add_argument(
25
  "--model_name",
26
+ # default="gemini-2.5-pro", # The model does not support setting thinking_budget to 0.
27
  # default="gemini-2.5-flash",
28
+ default="gemini-2.5-flash-lite-preview-06-17",
29
  type=str
30
  )
31
  parser.add_argument(
32
  "--eval_dataset_name",
33
+ default="agent-bingoplus-ph-90-choice.jsonl",
34
+ # default="agent-lingoace-zh-400-choice.jsonl",
35
  # default="arc-easy-1000-choice.jsonl",
36
  type=str
37
  )
llm_eval_script/gemini_google_chat.py CHANGED
@@ -24,13 +24,14 @@ def get_args():
24
  parser.add_argument(
25
  "--model_name",
26
  # default="gemini-2.5-pro", # The model does not support setting thinking_budget to 0.
27
- default="gemini-2.5-flash",
28
- # default="gemini-2.5-flash-lite-preview-06-17",
29
  type=str
30
  )
31
  parser.add_argument(
32
  "--eval_dataset_name",
33
- default="agent-lingoace-zh-80-chat.jsonl",
 
34
  type=str
35
  )
36
  parser.add_argument(
@@ -53,6 +54,17 @@ def get_args():
53
  default="google_potent_veld_462405_t3",
54
  type=str
55
  )
 
 
 
 
 
 
 
 
 
 
 
56
  args = parser.parse_args()
57
  return args
58
 
@@ -77,9 +89,13 @@ def main():
77
  eval_data_dir = Path(args.eval_data_dir)
78
  eval_data_dir.mkdir(parents=True, exist_ok=True)
79
 
80
- tz = ZoneInfo("Asia/Shanghai")
81
- now = datetime.now(tz)
82
- create_time_str = now.strftime("%Y%m%d_%H%M%S")
 
 
 
 
83
 
84
  eval_dataset = eval_dataset_dir / args.eval_dataset_name
85
 
@@ -133,6 +149,8 @@ def main():
133
  ]
134
  )
135
  ]
 
 
136
  time_begin = time.time()
137
  llm_response: types.GenerateContentResponse = client.models.generate_content(
138
  model=args.model_name,
 
24
  parser.add_argument(
25
  "--model_name",
26
  # default="gemini-2.5-pro", # The model does not support setting thinking_budget to 0.
27
+ # default="gemini-2.5-flash",
28
+ default="gemini-2.5-flash-lite-preview-06-17",
29
  type=str
30
  )
31
  parser.add_argument(
32
  "--eval_dataset_name",
33
+ # default="agent-lingoace-zh-80-chat.jsonl",
34
+ default="agent-bingoplus-ph-200-chat.jsonl",
35
  type=str
36
  )
37
  parser.add_argument(
 
54
  default="google_potent_veld_462405_t3",
55
  type=str
56
  )
57
+ parser.add_argument(
58
+ "--create_time_str",
59
+ # default="null",
60
+ default="20250729_161543",
61
+ type=str
62
+ )
63
+ parser.add_argument(
64
+ "--interval",
65
+ default=5,
66
+ type=int
67
+ )
68
  args = parser.parse_args()
69
  return args
70
 
 
89
  eval_data_dir = Path(args.eval_data_dir)
90
  eval_data_dir.mkdir(parents=True, exist_ok=True)
91
 
92
+ if args.create_time_str == "null":
93
+ tz = ZoneInfo("Asia/Shanghai")
94
+ now = datetime.now(tz)
95
+ create_time_str = now.strftime("%Y%m%d_%H%M%S")
96
+ # create_time_str = "20250729-interval-5"
97
+ else:
98
+ create_time_str = args.create_time_str
99
 
100
  eval_dataset = eval_dataset_dir / args.eval_dataset_name
101
 
 
149
  ]
150
  )
151
  ]
152
+ time.sleep(args.interval)
153
+ print(f"sleep: {args.interval}")
154
  time_begin = time.time()
155
  llm_response: types.GenerateContentResponse = client.models.generate_content(
156
  model=args.model_name,
llm_eval_script/siliconflow.py CHANGED
@@ -80,7 +80,8 @@ def get_args():
80
  )
81
  parser.add_argument(
82
  "--eval_dataset_name",
83
- default="agent-lingoace-zh-400-choice.jsonl",
 
84
  # default="arc-easy-1000-choice.jsonl",
85
  type=str
86
  )
 
80
  )
81
  parser.add_argument(
82
  "--eval_dataset_name",
83
+ default="agent-bingoplus-ph-90-choice.jsonl",
84
+ # default="agent-lingoace-zh-400-choice.jsonl",
85
  # default="arc-easy-1000-choice.jsonl",
86
  type=str
87
  )
llm_eval_script/siliconflow_chat.py CHANGED
@@ -79,7 +79,8 @@ def get_args():
79
  )
80
  parser.add_argument(
81
  "--eval_dataset_name",
82
- default="agent-lingoace-zh-80-chat.jsonl",
 
83
  type=str
84
  )
85
  parser.add_argument(
@@ -104,8 +105,8 @@ def get_args():
104
  )
105
  parser.add_argument(
106
  "--create_time_str",
107
- # default="null",
108
- default="20250728_113641",
109
  type=str
110
  )
111
  parser.add_argument(
 
79
  )
80
  parser.add_argument(
81
  "--eval_dataset_name",
82
+ default="agent-bingoplus-ph-200-chat.jsonl",
83
+ # default="agent-lingoace-zh-80-chat.jsonl",
84
  type=str
85
  )
86
  parser.add_argument(
 
105
  )
106
  parser.add_argument(
107
  "--create_time_str",
108
+ default="null",
109
+ # default="20250728_113641",
110
  type=str
111
  )
112
  parser.add_argument(