HoneyTian commited on
Commit
dbd1ddd
·
1 Parent(s): c95dc2f
Files changed (32) hide show
  1. .gitignore +3 -2
  2. data/dataset/agent-bingoplus-ph-25-summary.jsonl +3 -0
  3. data/dataset/agent-cod-zh-70-chat.jsonl +3 -0
  4. data/eval_data/azure_openai/azure/gpt-4o-mini/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250806_114802/agent-cod-zh-70-chat.jsonl +3 -0
  5. data/eval_data/azure_openai/azure/gpt-4o-mini/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250808_165545/agent-bingoplus-ph-25-summary.jsonl +3 -0
  6. data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-cod-zh-70-chat.jsonl +3 -0
  7. data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250808_171736/agent-bingoplus-ph-25-summary.jsonl +3 -0
  8. data/eval_data/gemini_google/google/gemini-2.5-flash/shenzhen_sase/google_potent_veld_462405_t3/20250808_160530/agent-bingoplus-ph-25-summary.jsonl +3 -0
  9. data/eval_data/gemini_google/google/gemini-2.5-flash/shenzhen_sase/google_potent_veld_462405_t3/20250808_160530/agent-cod-zh-70-chat.jsonl +3 -0
  10. data/eval_data/siliconflow/siliconflow/deepseek-ai#DeepSeek-V3/shenzhen_sase/siliconflow_api_key/20250808_171156/agent-bingoplus-ph-25-summary.jsonl +3 -0
  11. examples/api_test/aws/get_token.py +54 -0
  12. examples/get_logit_bias/byteplus/logit_bias.jsonl +3 -0
  13. examples/get_logit_bias/byteplus/step_1_search_token_id_map.py +140 -0
  14. examples/get_logit_bias/byteplus/step_2_make_vocab.py +6 -0
  15. examples/get_logit_bias/byteplus/step_3_check_token.py +123 -0
  16. examples/make_dataset/make_bingoplus_ph_25_summary.py +66 -0
  17. examples/make_dataset/make_cod_zh_70_chat.py +70 -0
  18. examples/make_raw_dataset/step_1_make_hk_dataset_by_log.py +6 -3
  19. examples/make_raw_dataset/step_3_filter_by_keywords.py +5 -5
  20. examples/test_metrics/cod_chat_metric.py +295 -0
  21. examples/test_metrics/cod_chat_metric_test.py +158 -0
  22. examples/tokenization/byteplus/step_1_get_by_api.py +94 -0
  23. llm_eval_script/azure_openai_chat.py +39 -4
  24. llm_eval_script/azure_openai_summary.py +228 -0
  25. llm_eval_script/byteplus.py +14 -3
  26. llm_eval_script/byteplus_chat.py +36 -4
  27. llm_eval_script/byteplus_summary.py +235 -0
  28. llm_eval_script/gemini_google.py +9 -5
  29. llm_eval_script/gemini_google_chat.py +43 -7
  30. llm_eval_script/gemini_google_summary.py +241 -0
  31. llm_eval_script/siliconflow_summary.py +273 -0
  32. main.py +4 -1
.gitignore CHANGED
@@ -5,9 +5,10 @@
5
  #/data/
6
  /data/comment
7
  #/data/eval_data
8
- data/llm-log-hk
9
- data/llm-log-usa
10
  /data/raw_dataset
 
11
  /dotenv/
12
  /logs/
13
  /trained_models
 
5
  #/data/
6
  /data/comment
7
  #/data/eval_data
8
+ /data/llm-log-hk
9
+ /data/llm-log-usa
10
  /data/raw_dataset
11
+ /data/vocab
12
  /dotenv/
13
  /logs/
14
  /trained_models
data/dataset/agent-bingoplus-ph-25-summary.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd1b5d8eb56bcf2b5e5dac2ce0facc16d0db3ac05dfe295767d079904bb05269
3
+ size 162326
data/dataset/agent-cod-zh-70-chat.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80bfbe82ccc318699fb05ee8bbdcdf6d732796b397941b8893016cdc7cbd3f1c
3
+ size 225948
data/eval_data/azure_openai/azure/gpt-4o-mini/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250806_114802/agent-cod-zh-70-chat.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d30e889c785e03b73e3e9297e7e9067adcf57d74361e3db4b7717c7de29405c
3
+ size 310079
data/eval_data/azure_openai/azure/gpt-4o-mini/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250808_165545/agent-bingoplus-ph-25-summary.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1011bdc2edbdf1795c3ef00862508bede4173b928fbccd048434f8032572c268
3
+ size 175784
data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-cod-zh-70-chat.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bf4881e718d3143a8d7ed4e763769c4b0efa9d671c89bda9a7e8b0b8f2e986c
3
+ size 318277
data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250808_171736/agent-bingoplus-ph-25-summary.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee1f21aa1a4a399fd2b2bda23f8e2324b41cf7c4c913805be8f5473c50dcd89a
3
+ size 177666
data/eval_data/gemini_google/google/gemini-2.5-flash/shenzhen_sase/google_potent_veld_462405_t3/20250808_160530/agent-bingoplus-ph-25-summary.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf285dac3a86a334678e0e99e12fb3602fc60f67d6ab5fe7864d063b3bb0bd1d
3
+ size 183490
data/eval_data/gemini_google/google/gemini-2.5-flash/shenzhen_sase/google_potent_veld_462405_t3/20250808_160530/agent-cod-zh-70-chat.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4413e43f8c64197e6edd5d481b14f5054d27edf5b33d13a5ad8bde77dc8fa312
3
+ size 316541
data/eval_data/siliconflow/siliconflow/deepseek-ai#DeepSeek-V3/shenzhen_sase/siliconflow_api_key/20250808_171156/agent-bingoplus-ph-25-summary.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:920dd916684f7939730745a673038714d2280e8891f14eafce5b0f349da08330
3
+ size 181529
examples/api_test/aws/get_token.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ from datetime import datetime
5
+ import json
6
+ import os
7
+ from pathlib import Path
8
+ import sys
9
+ import time
10
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
11
+
12
+ pwd = os.path.abspath(os.path.dirname(__file__))
13
+ sys.path.append(os.path.join(pwd, "../"))
14
+
15
+ import boto3
16
+
17
+ from project_settings import environment, project_path
18
+
19
+
20
+ def get_args():
21
+ parser = argparse.ArgumentParser()
22
+ parser.add_argument(
23
+ "--service",
24
+ default="aws_us_east",
25
+ type=str
26
+ )
27
+ args = parser.parse_args()
28
+ return args
29
+
30
+
31
+ def main():
32
+ args = get_args()
33
+
34
+ service = environment.get(key=args.service, dtype=json.loads)
35
+ print(service)
36
+ aws_access_key_id = service["AWS_ACCESS_KEY_ID"]
37
+ aws_secret_access_key = service["AWS_SECRET_ACCESS_KEY"]
38
+ aws_default_region = service["AWS_DEFAULT_REGION"]
39
+
40
+ os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
41
+ os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
42
+ os.environ["AWS_DEFAULT_REGION"] = aws_default_region
43
+
44
+ sts_client = boto3.client('sts')
45
+
46
+ response = sts_client.get_session_token(
47
+ DurationSeconds=3600
48
+ )
49
+
50
+ return
51
+
52
+
53
+ if __name__ == "__main__":
54
+ main()
examples/get_logit_bias/byteplus/logit_bias.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69e09dc0129041e4b530e7567b44f586d64ee37ea9a5aacca3150a601a9cb826
3
+ size 110372
examples/get_logit_bias/byteplus/step_1_search_token_id_map.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://docs.byteplus.com/en/docs/ModelArk/1099455
5
+
6
+ model list
7
+ https://docs.byteplus.com/en/docs/ModelArk/1330310
8
+
9
+ https://docs.byteplus.com/en/docs/ModelArk/Chat
10
+ """
11
+ import argparse
12
+ from datetime import datetime
13
+ import json
14
+ import os
15
+ from pathlib import Path
16
+ import sys
17
+ import string
18
+ import time
19
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
20
+
21
+ pwd = os.path.abspath(os.path.dirname(__file__))
22
+ sys.path.append(os.path.join(pwd, "../"))
23
+
24
+ from openai import OpenAI
25
+ from tqdm import tqdm
26
+
27
+ from project_settings import environment, project_path
28
+
29
+
30
+ def get_args():
31
+ """
32
+ model list:
33
+ https://docs.byteplus.com/en/docs/ModelArk/1330310
34
+
35
+ bytedance-seed-1.6
36
+ seed-1-6-250615
37
+
38
+ bytedance-seed-1.6-flash
39
+ seed-1-6-flash-250615
40
+
41
+ deepseek-v3
42
+ deepseek-v3-250324
43
+ """
44
+ parser = argparse.ArgumentParser()
45
+ parser.add_argument(
46
+ "--model_name",
47
+ # default="seed-1-6-250615",
48
+ default="seed-1-6-flash-250615",
49
+ # default="deepseek-v3-250324",
50
+ type=str
51
+ )
52
+ parser.add_argument(
53
+ "--client",
54
+ default="shenzhen_sase",
55
+ type=str
56
+ )
57
+ parser.add_argument(
58
+ "--service",
59
+ default="byteplus_api_key",
60
+ type=str
61
+ )
62
+ parser.add_argument(
63
+ "--output_file",
64
+ default="logit_bias.jsonl",
65
+ type=str
66
+ )
67
+ args = parser.parse_args()
68
+ return args
69
+
70
+
71
+ def main():
72
+ args = get_args()
73
+
74
+ candidate = list()
75
+
76
+ alphas = string.ascii_uppercase
77
+ alphas = list(alphas)
78
+ candidate += alphas
79
+
80
+ number_in_hundred = list(range(101))
81
+ number_in_hundred = [str(n) for n in number_in_hundred]
82
+ candidate += number_in_hundred
83
+
84
+ print(f"candidate: {candidate}")
85
+
86
+ api_key = environment.get(args.service, dtype=str)
87
+ client = OpenAI(
88
+ base_url="https://ark.ap-southeast.bytepluses.com/api/v3/",
89
+ # Read your Ark API Key from the environment variable.
90
+ api_key=api_key
91
+ )
92
+
93
+ output_file = Path(args.output_file)
94
+
95
+ last_token_id = -1
96
+ if output_file.exists():
97
+ with open(output_file.as_posix(), "r", encoding="utf-8") as f:
98
+ for row in f:
99
+ row = json.loads(row)
100
+ last_token_id = row["token_id"]
101
+
102
+ with open(args.output_file, "a+", encoding="utf-8") as f:
103
+ for token_id in tqdm(range(10000000)):
104
+ if token_id < last_token_id:
105
+ continue
106
+ completion = client.chat.completions.create(
107
+ model=args.model_name,
108
+ messages=[
109
+ {"role": "system", "content": "you are a helpful assistant"},
110
+ ],
111
+ stream=False,
112
+ max_tokens=1,
113
+ extra_body={
114
+ "thinking": {
115
+ "type": "disabled",
116
+ # "type": "enabled",
117
+ }
118
+ },
119
+ logit_bias={
120
+ token_id: 100,
121
+ }
122
+ )
123
+ prediction = completion.choices[0].message.content
124
+
125
+ row = {
126
+ "token_id": token_id,
127
+ "token": prediction,
128
+ }
129
+ row = json.dumps(row, ensure_ascii=False)
130
+ f.write(f"{row}\n")
131
+ f.flush()
132
+
133
+ if prediction in candidate:
134
+ print(f"token id: {token_id}, token: {prediction}")
135
+
136
+ return
137
+
138
+
139
+ if __name__ == "__main__":
140
+ main()
examples/get_logit_bias/byteplus/step_2_make_vocab.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ if __name__ == "__main__":
6
+ pass
examples/get_logit_bias/byteplus/step_3_check_token.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://docs.byteplus.com/en/docs/ModelArk/1099455
5
+
6
+ model list
7
+ https://docs.byteplus.com/en/docs/ModelArk/1330310
8
+
9
+ https://docs.byteplus.com/en/docs/ModelArk/Chat
10
+ """
11
+ import argparse
12
+ from datetime import datetime
13
+ import json
14
+ import os
15
+ from pathlib import Path
16
+ import sys
17
+ import string
18
+ import time
19
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
20
+
21
+ pwd = os.path.abspath(os.path.dirname(__file__))
22
+ sys.path.append(os.path.join(pwd, "../"))
23
+
24
+ from openai import OpenAI
25
+ from tqdm import tqdm
26
+
27
+ from project_settings import environment, project_path
28
+
29
+
30
+ def get_args():
31
+ """
32
+ model list:
33
+ https://docs.byteplus.com/en/docs/ModelArk/1330310
34
+
35
+ bytedance-seed-1.6
36
+ seed-1-6-250615
37
+
38
+ bytedance-seed-1.6-flash
39
+ seed-1-6-flash-250615
40
+
41
+ deepseek-v3
42
+ deepseek-v3-250324
43
+ """
44
+ parser = argparse.ArgumentParser()
45
+ parser.add_argument(
46
+ "--model_name",
47
+ # default="seed-1-6-250615",
48
+ default="seed-1-6-flash-250615",
49
+ # default="deepseek-v3-250324",
50
+ type=str
51
+ )
52
+ parser.add_argument(
53
+ "--client",
54
+ default="shenzhen_sase",
55
+ type=str
56
+ )
57
+ parser.add_argument(
58
+ "--service",
59
+ default="byteplus_api_key",
60
+ type=str
61
+ )
62
+ parser.add_argument(
63
+ "--output_file",
64
+ default="logit_bias.jsonl",
65
+ type=str
66
+ )
67
+ args = parser.parse_args()
68
+ return args
69
+
70
+
71
+ def main():
72
+ args = get_args()
73
+
74
+ candidate = list()
75
+
76
+ alphas = string.ascii_uppercase
77
+ alphas = list(alphas)
78
+ candidate += alphas
79
+
80
+ number_in_hundred = list(range(101))
81
+ number_in_hundred = [str(n) for n in number_in_hundred]
82
+ candidate += number_in_hundred
83
+
84
+ print(f"candidate: {candidate}")
85
+
86
+ api_key = environment.get(args.service, dtype=str)
87
+ client = OpenAI(
88
+ base_url="https://ark.ap-southeast.bytepluses.com/api/v3/",
89
+ # Read your Ark API Key from the environment variable.
90
+ api_key=api_key
91
+ )
92
+
93
+ output_file = Path(args.output_file)
94
+
95
+ with open(args.output_file, "a+", encoding="utf-8") as f:
96
+ for text in candidate:
97
+
98
+ completion = client.chat.completions.create(
99
+ model=args.model_name,
100
+ messages=[
101
+ {"role": "system", "content": "you are a helpful assistant"},
102
+ {"role": "user", "content": f"output text `{text}`."},
103
+
104
+ ],
105
+ stream=False,
106
+ # max_tokens=1,
107
+ extra_body={
108
+ "thinking": {
109
+ "type": "disabled",
110
+ # "type": "enabled",
111
+ }
112
+ },
113
+ )
114
+ # print(completion)
115
+ print(completion.usage)
116
+ # ChatCompletion(id='021754463990988d1193807ba8cfd6b4cea712d12b3282957b0eb',
117
+ # choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='F', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None))], created=1754463991, model='seed-1-6-flash-250615', object='chat.completion', service_tier='default', system_fingerprint=None, usage=CompletionUsage(completion_tokens=1, prompt_tokens=22, total_tokens=23, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=None, audio_tokens=None, reasoning_tokens=0, rejected_prediction_tokens=None), prompt_tokens_details=PromptTokensDetails(audio_tokens=None, cached_tokens=0)))
118
+
119
+ return
120
+
121
+
122
+ if __name__ == "__main__":
123
+ main()
examples/make_dataset/make_bingoplus_ph_25_summary.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+ import os
6
+ from pathlib import Path
7
+ import sys
8
+ import time
9
+
10
+ pwd = os.path.abspath(os.path.dirname(__file__))
11
+ sys.path.append(os.path.join(pwd, "../../"))
12
+
13
+ from project_settings import environment, project_path
14
+
15
+
16
+ def get_args():
17
+ parser = argparse.ArgumentParser()
18
+ parser.add_argument(
19
+ "--raw_dataset",
20
+ default=(project_path / "data/raw_dataset/agent-bingoplus-ph-25-summary").as_posix(),
21
+ type=str
22
+ )
23
+ parser.add_argument(
24
+ "--dataset",
25
+ default=(project_path / "data/dataset/agent-bingoplus-ph-25-summary.jsonl").as_posix(),
26
+ type=str
27
+ )
28
+ args = parser.parse_args()
29
+ return args
30
+
31
+
32
+ def main():
33
+ args = get_args()
34
+
35
+ raw_dataset = Path(args.raw_dataset)
36
+ dataset = Path(args.dataset)
37
+ dataset.parent.mkdir(parents=True, exist_ok=True)
38
+
39
+ with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
40
+ for sample_dir in raw_dataset.glob("*"):
41
+ idx = sample_dir.parts[-1]
42
+ system_prompt_file = sample_dir / "system_prompt.txt"
43
+ user_prompt_file = sample_dir / "user_prompt.txt"
44
+ response_file = sample_dir / "response.txt"
45
+
46
+ with open(system_prompt_file.as_posix(), "r", encoding="utf-8") as f:
47
+ system_prompt = f.read()
48
+ with open(user_prompt_file.as_posix(), "r", encoding="utf-8") as f:
49
+ user_prompt = f.read()
50
+ with open(response_file.as_posix(), "r", encoding="utf-8") as f:
51
+ response = f.read()
52
+
53
+ row_ = {
54
+ "idx": idx,
55
+ "system_prompt": system_prompt,
56
+ "user_prompt": user_prompt,
57
+ "response": response,
58
+ }
59
+ row_ = json.dumps(row_, ensure_ascii=False)
60
+ fout.write(f"{row_}\n")
61
+
62
+ return
63
+
64
+
65
+ if __name__ == "__main__":
66
+ main()
examples/make_dataset/make_cod_zh_70_chat.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+ import os
6
+ from pathlib import Path
7
+ import sys
8
+ import time
9
+
10
+ pwd = os.path.abspath(os.path.dirname(__file__))
11
+ sys.path.append(os.path.join(pwd, "../../"))
12
+
13
+ from project_settings import environment, project_path
14
+
15
+
16
+ def get_args():
17
+ parser = argparse.ArgumentParser()
18
+ parser.add_argument(
19
+ "--raw_dataset",
20
+ default=(project_path / "data/raw_dataset/agent-cod-zh-70-chat").as_posix(),
21
+ type=str
22
+ )
23
+ parser.add_argument(
24
+ "--dataset",
25
+ default=(project_path / "data/dataset/agent-cod-zh-70-chat.jsonl").as_posix(),
26
+ type=str
27
+ )
28
+ args = parser.parse_args()
29
+ return args
30
+
31
+
32
+ def main():
33
+ args = get_args()
34
+
35
+ raw_dataset = Path(args.raw_dataset)
36
+ dataset = Path(args.dataset)
37
+ dataset.parent.mkdir(parents=True, exist_ok=True)
38
+
39
+ with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
40
+ for sample_dir in raw_dataset.glob("*"):
41
+ idx = sample_dir.parts[-1]
42
+ system_prompt_file = sample_dir / "system_prompt.txt"
43
+ user_prompt_file = sample_dir / "user_prompt.txt"
44
+ response_file = sample_dir / "response.txt"
45
+
46
+ with open(system_prompt_file.as_posix(), "r", encoding="utf-8") as f:
47
+ system_prompt = f.read()
48
+ with open(user_prompt_file.as_posix(), "r", encoding="utf-8") as f:
49
+ user_prompt = f.read()
50
+ with open(response_file.as_posix(), "r", encoding="utf-8") as f:
51
+ response = f.read()
52
+
53
+ prompt = f"""{system_prompt}\n\n{user_prompt}""".strip()
54
+
55
+ print(f"{prompt}\n\n{response}")
56
+ print("-" * 150)
57
+
58
+ row_ = {
59
+ "idx": idx,
60
+ "prompt": prompt,
61
+ "response": response,
62
+ }
63
+ row_ = json.dumps(row_, ensure_ascii=False)
64
+ fout.write(f"{row_}\n")
65
+
66
+ return
67
+
68
+
69
+ if __name__ == "__main__":
70
+ main()
examples/make_raw_dataset/step_1_make_hk_dataset_by_log.py CHANGED
@@ -58,6 +58,9 @@ def unescape_string2(value: str) -> str:
58
  _unescape_map = [
59
  ("\n", r"\n"),
60
  ("\\n", "\n"),
 
 
 
61
  ]
62
  for k, v in _unescape_map:
63
  value = value.replace(k, v)
@@ -79,13 +82,12 @@ def extract(text: str):
79
  len_of_splits = len(splits)
80
 
81
  engine = splits[0].strip()
82
- call_id = splits[3]
83
- if len(call_id) == 0:
84
- call_id = splits[4]
85
  prompt = splits[5]
86
  prompt = json.loads(prompt)
87
 
88
  system_prompt = prompt["SystemPrompt"]
 
89
 
90
  conversation = prompt.get("Conversation")
91
  if conversation is not None:
@@ -148,6 +150,7 @@ def main():
148
  for item in conversation:
149
  role = item["role"]
150
  text = item["text"]
 
151
 
152
  if role == 0:
153
  role = "client"
 
58
  _unescape_map = [
59
  ("\n", r"\n"),
60
  ("\\n", "\n"),
61
+ ("​", ""),
62
+ ("", ""),
63
+ ("null\n<Identity>", "<Identity>"),
64
  ]
65
  for k, v in _unescape_map:
66
  value = value.replace(k, v)
 
82
  len_of_splits = len(splits)
83
 
84
  engine = splits[0].strip()
85
+ call_id = splits[2]
 
 
86
  prompt = splits[5]
87
  prompt = json.loads(prompt)
88
 
89
  system_prompt = prompt["SystemPrompt"]
90
+ system_prompt = unescape_string2(system_prompt)
91
 
92
  conversation = prompt.get("Conversation")
93
  if conversation is not None:
 
150
  for item in conversation:
151
  role = item["role"]
152
  text = item["text"]
153
+ text = unescape_string2(text)
154
 
155
  if role == 0:
156
  role = "client"
examples/make_raw_dataset/step_3_filter_by_keywords.py CHANGED
@@ -50,21 +50,21 @@ def main():
50
 
51
  for key_str in [
52
  # "BingoPlus",
53
- # " COD ",
54
  # "NXPay",
55
  # "NX Money",
56
  # "Exodus Telecom",
57
  # "Exodus Retail",
58
- "Exodus Automotive",
59
  # "kta kilat", "KTA KILAT",
60
  # "NXCloud",
61
  # "作为VIP客户",
62
- "FedEx",
63
  ]:
64
  if system_prompt.__contains__(key_str) or user_prompt.__contains__(key_str):
65
  print(f"process: {sample_dir.as_posix()}")
66
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-bingoplus"
67
- # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-cod"
68
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-nxpay"
69
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-nxmoney"
70
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-exodus-retail"
@@ -72,7 +72,7 @@ def main():
72
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-kta"
73
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-nxcloud"
74
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-vip"
75
- tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-fedex"
76
  tgt_dir.mkdir(parents=True, exist_ok=True)
77
  shutil.move(
78
  sample_dir.as_posix(),
 
50
 
51
  for key_str in [
52
  # "BingoPlus",
53
+ " COD ",
54
  # "NXPay",
55
  # "NX Money",
56
  # "Exodus Telecom",
57
  # "Exodus Retail",
58
+ # "Exodus Automotive",
59
  # "kta kilat", "KTA KILAT",
60
  # "NXCloud",
61
  # "作为VIP客户",
62
+ # "FedEx",
63
  ]:
64
  if system_prompt.__contains__(key_str) or user_prompt.__contains__(key_str):
65
  print(f"process: {sample_dir.as_posix()}")
66
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-bingoplus"
67
+ tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-cod"
68
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-nxpay"
69
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-nxmoney"
70
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-exodus-retail"
 
72
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-kta"
73
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-nxcloud"
74
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-vip"
75
+ # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-fedex"
76
  tgt_dir.mkdir(parents=True, exist_ok=True)
77
  shutil.move(
78
  sample_dir.as_posix(),
examples/test_metrics/cod_chat_metric.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+ import os
6
+ import sys
7
+
8
+ pwd = os.path.abspath(os.path.dirname(__file__))
9
+ sys.path.append(os.path.join(pwd, "../"))
10
+
11
+ import openai
12
+ from openai import AzureOpenAI
13
+
14
+ from project_settings import environment, project_path
15
+
16
+
17
+ def get_args():
18
+ """
19
+ python3 azure_openai.py --model_name gpt-4o-mini \
20
+ --eval_dataset_name agent-lingoace-zh-400-choice.jsonl \
21
+ --client "us_west(47.88.76.239)" \
22
+ --create_time_str 20250723_095001 \
23
+ --interval 10
24
+
25
+ python3 azure_openai.py --model_name gpt-4o-mini \
26
+ --eval_dataset_name arc-easy-1000-choice.jsonl \
27
+ --client "us_west(47.88.76.239)" \
28
+ --create_time_str 20250723_111000 \
29
+ --interval 10
30
+
31
+ """
32
+ parser = argparse.ArgumentParser()
33
+ parser.add_argument(
34
+ "--model_name",
35
+ default="gpt-4o",
36
+ # default="gpt-4o-mini",
37
+ type=str
38
+ )
39
+ parser.add_argument(
40
+ "--eval_data_file",
41
+ default=(project_path / "data/eval_data/azure_openai/azure/gpt-4o-mini/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250806_114802/agent-cod-zh-70-chat.jsonl.raw").as_posix(),
42
+ # default=(project_path / "data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-cod-zh-70-chat.jsonl.raw").as_posix(),
43
+ # default=(project_path / "data/eval_data/gemini_google/google/gemini-2.5-flash/shenzhen_sase/google_potent_veld_462405_t3/20250808_160530/agent-cod-zh-70-chat.jsonl.raw").as_posix(),
44
+ type=str
45
+ )
46
+ parser.add_argument(
47
+ "--output_file",
48
+ default=(project_path / "data/eval_data/azure_openai/azure/gpt-4o-mini/shenzhen_sase/west_us_chatgpt_openai_azure_com/20250806_114802/agent-cod-zh-70-chat.jsonl").as_posix(),
49
+ # default=(project_path / "data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-cod-zh-70-chat.jsonl").as_posix(),
50
+ # default=(project_path / "data/eval_data/gemini_google/google/gemini-2.5-flash/shenzhen_sase/google_potent_veld_462405_t3/20250808_160530/agent-cod-zh-70-chat.jsonl").as_posix(),
51
+ type=str
52
+ )
53
+ parser.add_argument(
54
+ "--service",
55
+ default="west_us_chatgpt_openai_azure_com",
56
+ type=str
57
+ )
58
+ args = parser.parse_args()
59
+ return args
60
+
61
+
62
+ task_prompt = """
63
+ <Identity>
64
+ You are a telephone salesperson for a COD e-commerce company. You are good at communicating with users over the phone and conducting product marketing and promotion.
65
+
66
+ <Job>
67
+ Your task is to stimulate customer interest in products and eventually complete marketing conversion so that customers agree to buy. For customers who have no intention, you will collect reasons and conduct secondary persuasion to achieve the goal of successful conversion.
68
+
69
+ <Background>
70
+ (1) Customer background: The customer has browsed or added a 【专抹顽固斑点】祛斑精华油 to the shopping cart, but has not completed the final payment order;
71
+ (2) Product information: 【专抹顽固斑点】升級版精华油,不止祛斑还能美白,改善肌肤泛红,令肌肤稳定光泽透亮,限时促销,原价799一瓶,限时499两瓶;
72
+ (3) Discount information: 1. Order now and get a discount of 499两瓶 yuan; 2. Promotion valid for 3 days; 3. Lightning delivery; 4. 24-hour VIP service;
73
+
74
+ <Constraint>
75
+ - Communicate in a cordial, friendly, professional and witty manner;
76
+ - Communicate as concisely and effectively as possible (about 20 words) to avoid clients having no patience for communication;
77
+ - Imitate real-person communication in a telephone scenario, such as using informal verbal expressions, such as "Well...", "Then...";
78
+ - For questions you don't know, answer with words such as "You can discuss this with us in detail later", and don't make assumptions or promises without authorization.
79
+ - If the client does not have a positive or meaningful answer, just continue to persuade, and don't ask the client, "You say", "Please speak" and other words;
80
+ - Clients generally do not actively communicate, you need to guide the communication;
81
+ - Please use the telephone communication expressions that conform to the cultural habits in Taiwan.
82
+ <Current Job>
83
+ Your goal is to stimulate customer interest in products and eventually complete marketing conversion so that customers agree to buy.
84
+ Step 1: You will inform customers of the purpose of the call and broadcast brief product information, and ask customers about their intentions;
85
+ Step 2: You will answer customers' questions, including product information, discount information, etc.;
86
+ Step 3: Finally, you need to ask users about their intentions;
87
+
88
+
89
+ <Task>
90
+ For the above task settings, please rate the last one **Assistant Response** and give reasons for your rating.
91
+ (1) Task Related. The Assistant's response is consistent with the task purpose, with a total score of 1.5 points.
92
+ (2) Client Reply Related. Whether the Assistant's response reflects the client's words well. Total score: 1.5 points.
93
+ (3) Success Related. Whether the Assistant's Response push the ultimate success goal of the task. Score: 1.0 point.
94
+ (4) Brevity. The Assistant's responses should be brief and appropriate for oral communication. Total score: 1.0 points.
95
+
96
+ Note:
97
+ (1) rating only focus on the `Assistant Response`.
98
+ (2) The dialogue in `Conversation` should be considered correct.
99
+
100
+ <Example>
101
+
102
+ ## Conversation:
103
+ client: 你好
104
+ Assistant: 您好,您之前购买的【专抹顽固斑点】祛斑精华油全新升级了,新版特别添加【复配烟碱醯胺+熊果苷】双成分,祛斑率提升至百分之九十八点二,作为VIP客户,现在购买立享买一送一,499两瓶,要帮您同步升级到新版吗?
105
+ client: 喂。
106
+ client: 不不,不必要,不必要。
107
+ Assistant:
108
+
109
+ ## Assistant Response:
110
+ 我明白了,不过这款精华油现在有限时促销,499两瓶,效果真的很好哦!您有没有考虑过改善肌肤问题呢?
111
+
112
+ ## Score:
113
+ {
114
+ "Task Related": 1.5,
115
+ "Task Related - Explain": "The Assistant's response is consistent with the task purpose, which is to promote the product and inform the client about the ongoing promotion. The Assistant effectively mentions the limited-time offer and emphasizes the product's benefits.",
116
+ "Client Reply Related": 0.9,
117
+ "Client Reply Related - Explain": "The response does not fully reflect the client's words. The client expressed disinterest (\"不不,不必要,不必要\"), but the Assistant did not acknowledge the client's reluctance adequately before continuing with persuasion.",
118
+ "Success Related": 0.9,
119
+ "Success Related - Explain": "The Assistant's response attempts to push towards the success goal by mentioning the promotion and benefits, but it could be more effective if it engaged with the client's concerns or disinterest more directly.",
120
+ "Brevity": 0.9,
121
+ "Brevity - Explain": "The length was moderate and led to the next step of the conversation."
122
+ }
123
+
124
+ ## Conversation:
125
+ client: 你好
126
+ Assistant: 您好,您之前购买的【专抹顽固斑点】祛斑精华油全新升级了,新版特别添加【复配烟碱醯胺+熊果苷】双成分,祛斑率提升至百分之九十八点二,作为VIP客户,现在购买立享买一送一,499两瓶,要帮您同步升级到新版吗?
127
+ client: 喂。
128
+ client: 不不,不必要,不必要。
129
+ Assistant: 感谢您的反馈!为了给您提供更精准的服务,方便告诉我您对产品不感兴趣的具体原因吗?比如是效果、价格、还是其他方面的考虑呢?您的反馈对我们非常重要
130
+ client: 不用升级。
131
+ client: 哇。
132
+ client: 啊。
133
+ client: 我们。
134
+ client: 我是我。
135
+ Assistant:
136
+
137
+ ## Assistant Response:
138
+ (轻声笑)看来您对祛斑精华油暂时没想法,没关系,我先简单说两句让您了解下。这升级款祛斑更准,还能美白提亮,之前老客户用了都反馈皮肤稳定了不少。现在限时买一送一,两瓶才499,算下来比单瓶买便宜一半呢!您要是担心效果,我可以给您发之前客户的真实反馈视频,或者您可以先拍一瓶试试?就当给皮肤一个变透亮的机会,您看呢?
139
+
140
+ ## Score:
141
+ {
142
+ "Task Related": 0.9,
143
+ "Task Related - Explain": "The Assistant's Response is directly to the client, should not include the `(轻声笑)`.",
144
+ "Client Reply Related": 1.0,
145
+ "Client Reply Related - Explain": "The Assistant acknowledges the client's lack of interest and responds in a friendly, understanding manner. It gently guides the conversation without pushing too hard, reflecting the client's initial reluctance.",
146
+ "Success Related": 1.0,
147
+ "Success Related - Explain": "The Assistant effectively attempts to drive towards the success goal by reiterating the product's benefits, offering social proof, and suggesting a trial purchase, which could entice the client to reconsider.",
148
+ "Brevity": 0.7,
149
+ "Brevity - Explain": "The response is slightly longer than ideal but remains concise enough to maintain the client's attention and engagement."
150
+ }
151
+
152
+ ## Conversation:
153
+ client: 你好
154
+ Assistant: 您好,您之前购买的【专抹顽固斑点】祛斑精华油全新升级了,新版特别添加【复配烟碱醯胺+熊果苷】双成分,祛斑率提升至百分之九十八点二,作为VIP客户,现在购买立享买一送一,499两瓶,要帮您同步升级到新版吗?
155
+ client: 喂。
156
+ client: 不不,不必要,不必要。
157
+ Assistant:
158
+
159
+ ## Assistant Response:
160
+ Well, I get it—maybe you're still a bit hesitant about the spots on your face? But let me tell you, the new upgraded version is really powerful! The 499 two-bottle deal is only valid for 3 days, and the lightning delivery means you can get it in two days. Plus, we have 24-hour VIP service to help with any skin care questions. Don't let this chance slip by, okay? Are you going to grab this deal today or wait?
161
+
162
+ ## Score:
163
+ {
164
+ "Task Related": 0.7,
165
+ "Task Related - Explain": "The Assistant's Response is directly to the client, however, English is used instead of Chinese, which is inconsistent with the client's language.",
166
+ "Client Reply Related": 1.3,
167
+ "Client Reply Related - Explain": "The response acknowledges the client's hesitation by mentioning it directly, but it could have addressed the client's disinterest more sensitively. The Assistant should have shown more empathy or understanding of the client's initial reluctance.",
168
+ "Success Related": 0.9,
169
+ "Success Related - Explain": "The Assistant's response pushes towards the success goal by emphasizing the benefits, the urgency of the limited-time offer, and additional support services, encouraging the client to make a purchase decision.",
170
+ "Brevity": 0.6,
171
+ "Brevity - Explain": "The response is slightly lengthy for a phone conversation and could be more concise. However, it covers essential points and leads to a call to action."
172
+ }
173
+ """.strip()
174
+
175
+
176
+ task_prompt_2 = """
177
+ <Finish This Task>
178
+
179
+ ## Conversation:
180
+ {conversation}
181
+
182
+ ## Assistant Response:
183
+ {response}
184
+
185
+ ## Score:
186
+ """.strip()
187
+
188
+
189
+ def main():
190
+ args = get_args()
191
+
192
+ service_params = environment.get(args.service, dtype=json.loads)
193
+ client = AzureOpenAI(
194
+ **service_params,
195
+ # api_key="Dqt75blRABmhgrwhfcupd1rq44YqNuEgku8FcFFDrEljMq6gltf0JQQJ99BCACYeBjFXJ3w3AAABACOG2njW",
196
+ # api_version="2025-01-01-preview",
197
+ # azure_endpoint="https://west-us-chatgpt.openai.azure.com"
198
+ )
199
+
200
+ total = 0
201
+ total_score = 0
202
+
203
+ # finished
204
+ finished_idx_set = set()
205
+ if os.path.exists(args.output_file):
206
+ with open(args.output_file, "r", encoding="utf-8") as f:
207
+ for row in f:
208
+ row = json.loads(row)
209
+ idx = row["idx"]
210
+ total = row["total"]
211
+ total_score = row["total_score"]
212
+ finished_idx_set.add(idx)
213
+ print(f"finished count: {len(finished_idx_set)}")
214
+
215
+ with open(args.eval_data_file, "r", encoding="utf-8") as fin, open(args.output_file, "a+", encoding="utf-8") as fout:
216
+ for row in fin:
217
+ row = json.loads(row)
218
+ idx = row["idx"]
219
+ prompt = row["prompt"]
220
+ response = row["response"]
221
+ prediction = row["prediction"]
222
+ time_cost = row["time_cost"]
223
+
224
+ if idx in finished_idx_set:
225
+ continue
226
+ finished_idx_set.add(idx)
227
+
228
+ conversation = prompt.split("\n\n")[-1].strip()
229
+
230
+ task_prompt_2_ = task_prompt_2.format(conversation=conversation, response=prediction)
231
+ # print(task_prompt_2_)
232
+
233
+ task_prompt_ = task_prompt + "\n\n" + task_prompt_2_
234
+ # print(task_prompt_)
235
+
236
+ try:
237
+ llm_response = client.chat.completions.create(
238
+ model=args.model_name,
239
+ messages=[{"role": "user", "content": task_prompt_}],
240
+ stream=False,
241
+ top_p=0.95,
242
+ temperature=0.6,
243
+ )
244
+ except openai.BadRequestError as e:
245
+ print(f"request failed, error type: {type(e)}, error text: {str(e)}")
246
+ continue
247
+ except openai.InternalServerError as e:
248
+ print(f"request failed, error type: {type(e)}, error text: {str(e)}")
249
+ continue
250
+
251
+ content = llm_response.choices[0].message.content
252
+ try:
253
+ evaluate = json.loads(content)
254
+ except json.decoder.JSONDecodeError as e:
255
+ print(content)
256
+ raise e
257
+ score1 = evaluate["Task Related"]
258
+ score2 = evaluate["Client Reply Related"]
259
+ score3 = evaluate["Success Related"]
260
+ score4 = evaluate["Brevity"]
261
+
262
+ score1 = float(score1)
263
+ score2 = float(score2)
264
+ score3 = float(score3)
265
+ score4 = float(score4)
266
+
267
+ score = (score1 + score2 + score3 + score4) / 5
268
+
269
+ total += 1
270
+ total_score += score
271
+ average_score = total_score / total
272
+ print(f"average_score: {average_score}")
273
+
274
+ row_ = {
275
+ "idx": idx,
276
+ "prompt": prompt,
277
+ "response": response,
278
+ "prediction": prediction,
279
+ "time_cost": time_cost,
280
+ "evaluate": evaluate,
281
+ "total": total,
282
+ "score": score,
283
+ "total_score": total_score,
284
+ "average_score": average_score,
285
+
286
+ }
287
+ row_ = json.dumps(row_, ensure_ascii=False)
288
+ fout.write(f"{row_}\n")
289
+ fout.flush()
290
+
291
+ return
292
+
293
+
294
+ if __name__ == "__main__":
295
+ main()
examples/test_metrics/cod_chat_metric_test.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ from datetime import datetime
5
+ import json
6
+ import os
7
+ from pathlib import Path
8
+ import sys
9
+ import time
10
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
11
+
12
+ pwd = os.path.abspath(os.path.dirname(__file__))
13
+ sys.path.append(os.path.join(pwd, "../"))
14
+
15
+ import openai
16
+ from openai import AzureOpenAI
17
+
18
+ from project_settings import environment, project_path
19
+
20
+
21
+ def get_args():
22
+ """
23
+ python3 azure_openai.py --model_name gpt-4o-mini \
24
+ --eval_dataset_name agent-lingoace-zh-400-choice.jsonl \
25
+ --client "us_west(47.88.76.239)" \
26
+ --create_time_str 20250723_095001 \
27
+ --interval 10
28
+
29
+ python3 azure_openai.py --model_name gpt-4o-mini \
30
+ --eval_dataset_name arc-easy-1000-choice.jsonl \
31
+ --client "us_west(47.88.76.239)" \
32
+ --create_time_str 20250723_111000 \
33
+ --interval 10
34
+
35
+ """
36
+ parser = argparse.ArgumentParser()
37
+ parser.add_argument(
38
+ "--model_name",
39
+ default="gpt-4o",
40
+ # default="gpt-4o-mini",
41
+ type=str
42
+ )
43
+ parser.add_argument(
44
+ "--service",
45
+ default="west_us_chatgpt_openai_azure_com",
46
+ type=str
47
+ )
48
+ args = parser.parse_args()
49
+ return args
50
+
51
+
52
+ prompt = """
53
+ <Identity>
54
+ You are a telephone salesperson for a COD e-commerce company. You are good at communicating with users over the phone and conducting product marketing and promotion.
55
+
56
+ <Job>
57
+ Your task is to stimulate customer interest in products and eventually complete marketing conversion so that customers agree to buy. For customers who have no intention, you will collect reasons and conduct secondary persuasion to achieve the goal of successful conversion.
58
+
59
+ <Background>
60
+ (1) Customer background: The customer has browsed or added a 【专抹顽固斑点】祛斑精华油 to the shopping cart, but has not completed the final payment order;
61
+ (2) Product information: 【专抹顽固斑点】升級版精华油,不止祛斑还能美白,改善肌肤泛红,令肌肤稳定光泽透亮,限时促销,原价799一瓶,限时499两瓶;
62
+ (3) Discount information: 1. Order now and get a discount of 499两瓶 yuan; 2. Promotion valid for 3 days; 3. Lightning delivery; 4. 24-hour VIP service;
63
+
64
+ <Constraint>
65
+ - Communicate in a cordial, friendly, professional and witty manner;
66
+ - Communicate as concisely and effectively as possible (about 20 words) to avoid clients having no patience for communication;
67
+ - Imitate real-person communication in a telephone scenario, such as using informal verbal expressions, such as "Well...", "Then...";
68
+ - For questions you don't know, answer with words such as "You can discuss this with us in detail later", and don't make assumptions or promises without authorization.
69
+ - If the client does not have a positive or meaningful answer, just continue to persuade, and don't ask the client, "You say", "Please speak" and other words;
70
+ - Clients generally do not actively communicate, you need to guide the communication;
71
+ - Please use the telephone communication expressions that conform to the cultural habits in Taiwan.
72
+ <Current Job>
73
+ Your goal is to stimulate customer interest in products and eventually complete marketing conversion so that customers agree to buy.
74
+ Step 1: You will inform customers of the purpose of the call and broadcast brief product information, and ask customers about their intentions;
75
+ Step 2: You will answer customers' questions, including product information, discount information, etc.;
76
+ Step 3: Finally, you need to ask users about their intentions;
77
+
78
+
79
+ <Task>
80
+ For the above task settings, please rate the last one **Assistant Response** and give reasons for your rating.
81
+ (1) Task Related. The Assistant's response is consistent with the task purpose, only contain the intended message to the client, without any `Note`, `Tips`, with a total score of 1.5 points.
82
+ (2) Client Reply Related. Whether the Assistant's response reflects the client's words well. Total score: 1.5 points.
83
+ (3) Success Related. Whether the Assistant's Response push the ultimate success goal of the task. Score: 1.0 point.
84
+ (4) Brevity. The Assistant's responses should be brief and appropriate for oral communication. Total score: 1.0 points.
85
+
86
+
87
+ <Example>
88
+
89
+ ## Conversation:
90
+ client: 你好
91
+ Assistant: 您好,您之前购买的【专抹顽固斑点】祛斑精华油全新升级了,新版特别添加【复配烟碱醯胺+熊果苷】双成分,祛斑率提升至百分之九十八点二,作为VIP客户,现在购买立享买一送一,499两瓶,要帮您同步升级到新版吗?
92
+ client: 喂。
93
+ client: 不不,不必要,不必要。
94
+ Assistant:
95
+
96
+ ## Assistant Response:
97
+ 我明白了,不过这款精华油现在有限时促销,499两瓶,效果真的很好哦!您有没有考虑过改善肌肤问题呢?
98
+
99
+ ## Score:
100
+ {
101
+ "Task Related": 1.5,
102
+ "Task Related - Explain": "The Assistant's response is consistent with the task purpose, which is to promote the product and inform the client about the ongoing promotion. The Assistant effectively mentions the limited-time offer and emphasizes the product's benefits.",
103
+ "Client Reply Related": 0.9,
104
+ "Client Reply Related - Explain": "The response does not fully reflect the client's words. The client expressed disinterest ('不不,不必要,不必要'), but the Assistant did not acknowledge the client's reluctance adequately before continuing with persuasion.",
105
+ "Success Related": 0.9,
106
+ "Success Related - Explain": "The Assistant's response attempts to push towards the success goal by mentioning the promotion and benefits, but it could be more effective if it engaged with the client's concerns or disinterest more directly.",
107
+ "Brevity": 0.9,
108
+ "Brevity - Explain": "The length was moderate and led to the next step of the conversation."
109
+ }
110
+
111
+ <Finish This Task>
112
+
113
+ ## Conversation:
114
+ client: 你好
115
+ Assistant: 您好。您之前购买的【专抹顽固斑点】祛斑精华油全新升级了,新版特别添加【复配烟碱醯胺+熊果苷】双成分,祛斑率提升至百分之九十八点二。作为VIP客户,现在购买立享买一送一,499两瓶,要帮您同步升级到新版吗?
116
+ client: 我上次购买的都没有效,可以退吗?
117
+ Assistant:
118
+
119
+ ## Assistant Response:
120
+ 了解您的顾虑,您可以跟我们客服详细讨论退货事宜。不过,新的升级版效果更佳,您可以试试看,限时499两瓶,真的很划算哦!
121
+
122
+ ## Score:
123
+ """.strip()
124
+
125
+ # <Example>
126
+ def main():
127
+ args = get_args()
128
+
129
+ service_params = environment.get(args.service, dtype=json.loads)
130
+ client = AzureOpenAI(
131
+ **service_params,
132
+ )
133
+
134
+ llm_response = client.chat.completions.create(
135
+ model=args.model_name,
136
+ messages=[{"role": "user", "content": prompt}],
137
+ stream=False,
138
+ # max_tokens=1,
139
+ top_p=0.95,
140
+ temperature=0.6,
141
+ # logit_bias={
142
+ # 32: 100,
143
+ # 33: 100,
144
+ # 34: 100,
145
+ # 35: 100,
146
+ # 36: 100,
147
+ # 37: 100,
148
+ # }
149
+ )
150
+
151
+ prediction = llm_response.choices[0].message.content
152
+
153
+ print(prediction)
154
+ return
155
+
156
+
157
+ if __name__ == "__main__":
158
+ main()
examples/tokenization/byteplus/step_1_get_by_api.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://www.volcengine.com/docs/82379/1528728
5
+ """
6
+ import argparse
7
+ from datetime import datetime
8
+ import json
9
+ import os
10
+ from pathlib import Path
11
+ import sys
12
+ import time
13
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
14
+
15
+ import requests
16
+
17
+ pwd = os.path.abspath(os.path.dirname(__file__))
18
+ sys.path.append(os.path.join(pwd, "../"))
19
+
20
+ from openai import OpenAI
21
+
22
+ from project_settings import environment, project_path
23
+
24
+
25
+ def get_args():
26
+ parser = argparse.ArgumentParser()
27
+ parser.add_argument(
28
+ "--model_name",
29
+ # default="seed-1-6-250615",
30
+ default="seed-1-6-flash-250615",
31
+ # default="deepseek-v3-250324",
32
+ type=str
33
+ )
34
+ parser.add_argument(
35
+ "--eval_dataset_dir",
36
+ default=(project_path / "data/dataset").as_posix(),
37
+ type=str
38
+ )
39
+ parser.add_argument(
40
+ "--eval_data_dir",
41
+ default=(project_path / "data/eval_data").as_posix(),
42
+ type=str
43
+ )
44
+ parser.add_argument(
45
+ "--client",
46
+ default="shenzhen_sase",
47
+ type=str
48
+ )
49
+ parser.add_argument(
50
+ "--service",
51
+ default="byteplus_api_key",
52
+ type=str
53
+ )
54
+ args = parser.parse_args()
55
+ return args
56
+
57
+
58
+ def main():
59
+ args = get_args()
60
+
61
+ api_key = environment.get(args.service, dtype=str)
62
+
63
+ url = "https://ark.ap-southeast.bytepluses.com/api/v3/tokenization"
64
+ # url = "https://ark.cn-beijing.volces.com/api/v3/tokenization"
65
+ headers = {
66
+ "Content-Type": "application/json",
67
+ "Authorization": f"Bearer {api_key}"
68
+ }
69
+ data = {
70
+ # "model": args.model_name,
71
+ # "model": "doubao-pro-32k-241215",
72
+ # "model": "seed-1-6-250615",
73
+ # "model": "seed-1-6-flash-250615",
74
+ "model": "seed-1-6-flash-250715",
75
+
76
+ "text": [
77
+ "天空为什么这么蓝",
78
+ "花儿为什么这么香"
79
+ ]
80
+ }
81
+
82
+ resp = requests.post(
83
+ url=url,
84
+ headers=headers,
85
+ data=json.dumps(data)
86
+ )
87
+ print(resp.status_code)
88
+ print(resp.text)
89
+
90
+ return
91
+
92
+
93
+ if __name__ == "__main__":
94
+ main()
llm_eval_script/azure_openai_chat.py CHANGED
@@ -1,6 +1,7 @@
1
  #!/usr/bin/python3
2
  # -*- coding: utf-8 -*-
3
  import argparse
 
4
  from datetime import datetime
5
  import json
6
  import os
@@ -43,7 +44,8 @@ python3 azure_openai.py --model_name gpt-4o-mini \
43
  parser.add_argument(
44
  "--eval_dataset_name",
45
  # default="agent-lingoace-zh-80-chat.jsonl",
46
- default="agent-bingoplus-ph-200-chat.jsonl",
 
47
  type=str
48
  )
49
  parser.add_argument(
@@ -69,7 +71,7 @@ python3 azure_openai.py --model_name gpt-4o-mini \
69
  parser.add_argument(
70
  "--create_time_str",
71
  # default="null",
72
- default="20250729-interval-5",
73
  type=str
74
  )
75
  parser.add_argument(
@@ -127,20 +129,52 @@ def main():
127
  for row in fin:
128
  row = json.loads(row)
129
  idx = row["idx"]
130
- prompt = row["prompt"]
131
  response = row["response"]
132
 
133
  if idx in finished_idx_set:
134
  continue
135
  finished_idx_set.add(idx)
136
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  try:
138
  time.sleep(args.interval)
139
  print(f"sleep: {args.interval}")
140
  time_begin = time.time()
141
  llm_response = client.chat.completions.create(
142
  model=args.model_name,
143
- messages=[{"role": "user", "content": prompt}],
144
  stream=False,
145
  # max_tokens=1,
146
  top_p=0.95,
@@ -177,6 +211,7 @@ def main():
177
  }
178
  row_ = json.dumps(row_, ensure_ascii=False)
179
  fout.write(f"{row_}\n")
 
180
 
181
  return
182
 
 
1
  #!/usr/bin/python3
2
  # -*- coding: utf-8 -*-
3
  import argparse
4
+ import re
5
  from datetime import datetime
6
  import json
7
  import os
 
44
  parser.add_argument(
45
  "--eval_dataset_name",
46
  # default="agent-lingoace-zh-80-chat.jsonl",
47
+ # default="agent-bingoplus-ph-200-chat.jsonl",
48
+ default="agent-cod-zh-70-chat.jsonl",
49
  type=str
50
  )
51
  parser.add_argument(
 
71
  parser.add_argument(
72
  "--create_time_str",
73
  # default="null",
74
+ default="20250806_114802",
75
  type=str
76
  )
77
  parser.add_argument(
 
129
  for row in fin:
130
  row = json.loads(row)
131
  idx = row["idx"]
132
+ prompt: str = row["prompt"]
133
  response = row["response"]
134
 
135
  if idx in finished_idx_set:
136
  continue
137
  finished_idx_set.add(idx)
138
 
139
+ # prompt
140
+ splits = prompt[::-1].split("\n\n", maxsplit=1)
141
+ conversation = splits[0]
142
+ system_prompt = splits[1]
143
+ conversation = conversation[::-1].strip()
144
+ system_prompt = system_prompt[::-1].strip()
145
+
146
+ pattern = "^(Client|Assistant): (.*?)(?=\n(?:Client|Assistant):)"
147
+ match = re.findall(pattern=pattern, string=conversation, flags=re.I|re.DOTALL|re.MULTILINE)
148
+
149
+ messages_ = list()
150
+ for m in match:
151
+ role = m[0].lower()
152
+ content = m[1]
153
+ if role in ("client", "Client"):
154
+ role = "user"
155
+ elif role in ("assistant", "Assistant"):
156
+ role = "assistant"
157
+ else:
158
+ raise AssertionError
159
+ messages_.append({
160
+ "role": role,
161
+ "content": content
162
+ })
163
+
164
+ messages = [
165
+ {"role": "system", "content": system_prompt},
166
+ *messages_
167
+ ]
168
+ # print(json.dumps(messages, ensure_ascii=False, indent=4))
169
+ # exit(0)
170
+
171
  try:
172
  time.sleep(args.interval)
173
  print(f"sleep: {args.interval}")
174
  time_begin = time.time()
175
  llm_response = client.chat.completions.create(
176
  model=args.model_name,
177
+ messages=messages,
178
  stream=False,
179
  # max_tokens=1,
180
  top_p=0.95,
 
211
  }
212
  row_ = json.dumps(row_, ensure_ascii=False)
213
  fout.write(f"{row_}\n")
214
+ fout.flush()
215
 
216
  return
217
 
llm_eval_script/azure_openai_summary.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import re
5
+ from datetime import datetime
6
+ import json
7
+ import os
8
+ from pathlib import Path
9
+ import sys
10
+ import time
11
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
12
+
13
+ pwd = os.path.abspath(os.path.dirname(__file__))
14
+ sys.path.append(os.path.join(pwd, "../"))
15
+
16
+ import openai
17
+ from openai import AzureOpenAI
18
+
19
+ from project_settings import environment, project_path
20
+
21
+
22
+ def get_args():
23
+ parser = argparse.ArgumentParser()
24
+ parser.add_argument(
25
+ "--model_name",
26
+ # default="gpt-4o",
27
+ default="gpt-4o-mini",
28
+ type=str
29
+ )
30
+ parser.add_argument(
31
+ "--eval_dataset_name",
32
+ default="agent-bingoplus-ph-25-summary.jsonl",
33
+ type=str
34
+ )
35
+ parser.add_argument(
36
+ "--eval_dataset_dir",
37
+ default=(project_path / "data/dataset").as_posix(),
38
+ type=str
39
+ )
40
+ parser.add_argument(
41
+ "--eval_data_dir",
42
+ default=(project_path / "data/eval_data").as_posix(),
43
+ type=str
44
+ )
45
+ parser.add_argument(
46
+ "--client",
47
+ default="shenzhen_sase",
48
+ type=str
49
+ )
50
+ parser.add_argument(
51
+ "--service",
52
+ default="west_us_chatgpt_openai_azure_com",
53
+ type=str
54
+ )
55
+ parser.add_argument(
56
+ "--create_time_str",
57
+ default="null",
58
+ # default="20250806_114802",
59
+ type=str
60
+ )
61
+ parser.add_argument(
62
+ "--interval",
63
+ default=5,
64
+ type=int
65
+ )
66
+ args = parser.parse_args()
67
+ return args
68
+
69
+
70
+ def main():
71
+ args = get_args()
72
+
73
+ eval_dataset_dir = Path(args.eval_dataset_dir)
74
+ eval_dataset_dir.mkdir(parents=True, exist_ok=True)
75
+ eval_data_dir = Path(args.eval_data_dir)
76
+ eval_data_dir.mkdir(parents=True, exist_ok=True)
77
+
78
+ if args.create_time_str == "null":
79
+ tz = ZoneInfo("Asia/Shanghai")
80
+ now = datetime.now(tz)
81
+ create_time_str = now.strftime("%Y%m%d_%H%M%S")
82
+ # create_time_str = "20250729-interval-5"
83
+ else:
84
+ create_time_str = args.create_time_str
85
+
86
+ eval_dataset = eval_dataset_dir / args.eval_dataset_name
87
+
88
+ output_file = eval_data_dir / f"azure_openai/azure/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
89
+ output_file.parent.mkdir(parents=True, exist_ok=True)
90
+
91
+ service_params = environment.get(args.service, dtype=json.loads)
92
+ client = AzureOpenAI(
93
+ **service_params,
94
+ # api_key="Dqt75blRABmhgrwhfcupd1rq44YqNuEgku8FcFFDrEljMq6gltf0JQQJ99BCACYeBjFXJ3w3AAABACOG2njW",
95
+ # api_version="2025-01-01-preview",
96
+ # azure_endpoint="https://west-us-chatgpt.openai.azure.com"
97
+ )
98
+
99
+ total = 0
100
+ total_score = 0
101
+
102
+ # finished
103
+ finished_idx_set = set()
104
+ if os.path.exists(output_file.as_posix()):
105
+ with open(output_file.as_posix(), "r", encoding="utf-8") as f:
106
+ for row in f:
107
+ row = json.loads(row)
108
+ idx = row["idx"]
109
+ total = row["total"]
110
+ total_score = row["total_score"]
111
+ finished_idx_set.add(idx)
112
+ print(f"finished count: {len(finished_idx_set)}")
113
+
114
+ with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
115
+ for row in fin:
116
+ row = json.loads(row)
117
+ idx = row["idx"]
118
+ system_prompt: str = row["system_prompt"]
119
+ user_prompt: str = row["user_prompt"]
120
+ response = row["response"]
121
+
122
+ if idx in finished_idx_set:
123
+ continue
124
+ finished_idx_set.add(idx)
125
+
126
+ messages = [
127
+ {
128
+ "role": "system",
129
+ "content": system_prompt
130
+ },
131
+ {
132
+ "role": "user",
133
+ "content": user_prompt
134
+ },
135
+ ]
136
+
137
+ try:
138
+ time.sleep(args.interval)
139
+ print(f"sleep: {args.interval}")
140
+ time_begin = time.time()
141
+ llm_response = client.chat.completions.create(
142
+ model=args.model_name,
143
+ messages=messages,
144
+ stream=False,
145
+ # max_tokens=1,
146
+ top_p=0.95,
147
+ temperature=0.6,
148
+ # logit_bias={
149
+ # 32: 100,
150
+ # 33: 100,
151
+ # 34: 100,
152
+ # 35: 100,
153
+ # 36: 100,
154
+ # 37: 100,
155
+ # }
156
+ )
157
+ time_cost = time.time() - time_begin
158
+ print(f"time_cost: {time_cost}")
159
+ except openai.BadRequestError as e:
160
+ print(f"request failed, error type: {type(e)}, error text: {str(e)}")
161
+ continue
162
+ except openai.InternalServerError as e:
163
+ print(f"request failed, error type: {type(e)}, error text: {str(e)}")
164
+ continue
165
+
166
+ prediction = llm_response.choices[0].message.content
167
+
168
+ response_ = json.loads(response)
169
+ response_tag_name_list = response_["tag_name_list"]
170
+ # print(response_tag_name_list)
171
+
172
+ if prediction.startswith("```json") and prediction.endswith("```"):
173
+ prediction_ = prediction[7:-3]
174
+ else:
175
+ prediction_ = prediction
176
+
177
+ prediction_tag_name_list = list()
178
+ try:
179
+ prediction_ = json.loads(prediction_)
180
+ prediction_tag_name_list = prediction_["tag_name_list"]
181
+ except json.JSONDecodeError:
182
+ pass
183
+ # print(prediction_tag_name_list)
184
+
185
+ # recall
186
+ recall_count = 0
187
+ for tag in response_tag_name_list:
188
+ if tag in prediction_tag_name_list:
189
+ recall_count += 1
190
+ recall = recall_count / (len(response_tag_name_list) + 1e-7)
191
+
192
+ # precision
193
+ precision_count = 0
194
+ for tag in prediction_tag_name_list:
195
+ if tag in response_tag_name_list:
196
+ precision_count += 1
197
+ precision = precision_count / (len(prediction_tag_name_list) + 1e-7)
198
+
199
+ # f1
200
+ f1 = 2 * (recall * precision) / (recall + precision + 1e-7)
201
+
202
+ total += 1
203
+ total_score += f1
204
+ score = total_score / total
205
+
206
+ row_ = {
207
+ "idx": idx,
208
+ "system_prompt": system_prompt,
209
+ "user_prompt": user_prompt,
210
+ "response": response,
211
+ "prediction": prediction,
212
+ "recall": recall,
213
+ "precision": precision,
214
+ "f1": f1,
215
+ "total": total,
216
+ "total_score": total_score,
217
+ "score": score,
218
+ "time_cost": time_cost,
219
+ }
220
+ row_ = json.dumps(row_, ensure_ascii=False)
221
+ fout.write(f"{row_}\n")
222
+ fout.flush()
223
+
224
+ return
225
+
226
+
227
+ if __name__ == "__main__":
228
+ main()
llm_eval_script/byteplus.py CHANGED
@@ -76,8 +76,8 @@ def get_args():
76
  )
77
  parser.add_argument(
78
  "--create_time_str",
79
- # default="null",
80
- default="20250728_113641",
81
  type=str
82
  )
83
  parser.add_argument(
@@ -161,7 +161,18 @@ def main():
161
  "type": "disabled",
162
  # "type": "enabled",
163
  }
164
- }
 
 
 
 
 
 
 
 
 
 
 
165
  )
166
  time_cost = time.time() - time_begin
167
  print(f"time_cost: {time_cost}")
 
76
  )
77
  parser.add_argument(
78
  "--create_time_str",
79
+ default="null",
80
+ # default="20250728_113641",
81
  type=str
82
  )
83
  parser.add_argument(
 
161
  "type": "disabled",
162
  # "type": "enabled",
163
  }
164
+ },
165
+ # logprobs=True,
166
+ # logit_bias={
167
+ # 32: 100,
168
+ # 33: 100,
169
+ # 34: 100,
170
+ # 35: 100,
171
+ # 36: 100,
172
+ # 37: 100,
173
+ # 38: 100,
174
+ # 39: 100,
175
+ # }
176
  )
177
  time_cost = time.time() - time_begin
178
  print(f"time_cost: {time_cost}")
llm_eval_script/byteplus_chat.py CHANGED
@@ -13,6 +13,7 @@ from datetime import datetime
13
  import json
14
  import os
15
  from pathlib import Path
 
16
  import sys
17
  import time
18
  from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
@@ -50,7 +51,8 @@ def get_args():
50
  parser.add_argument(
51
  "--eval_dataset_name",
52
  # default="agent-lingoace-zh-80-chat.jsonl",
53
- default="agent-bingoplus-ph-200-chat.jsonl",
 
54
  type=str
55
  )
56
  parser.add_argument(
@@ -140,6 +142,38 @@ def main():
140
  continue
141
  finished_idx_set.add(idx)
142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  try:
144
  time.sleep(args.interval)
145
  print(f"sleep: {args.interval}")
@@ -148,9 +182,7 @@ def main():
148
  # https://docs.byteplus.com/en/docs/ModelArk/1449737
149
  llm_response = client.chat.completions.create(
150
  model=args.model_name,
151
- messages=[
152
- {"role": "user", "content": prompt},
153
- ],
154
  stream=False,
155
  max_tokens=4096,
156
  extra_body={
 
13
  import json
14
  import os
15
  from pathlib import Path
16
+ import re
17
  import sys
18
  import time
19
  from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
 
51
  parser.add_argument(
52
  "--eval_dataset_name",
53
  # default="agent-lingoace-zh-80-chat.jsonl",
54
+ # default="agent-bingoplus-ph-200-chat.jsonl",
55
+ default="agent-cod-zh-70-chat.jsonl",
56
  type=str
57
  )
58
  parser.add_argument(
 
142
  continue
143
  finished_idx_set.add(idx)
144
 
145
+ # prompt
146
+ splits = prompt[::-1].split("\n\n", maxsplit=1)
147
+ conversation = splits[0]
148
+ system_prompt = splits[1]
149
+ conversation = conversation[::-1].strip()
150
+ system_prompt = system_prompt[::-1].strip()
151
+
152
+ pattern = "^(Client|Assistant): (.*?)(?=\n(?:Client|Assistant):)"
153
+ match = re.findall(pattern=pattern, string=conversation, flags=re.I|re.DOTALL|re.MULTILINE)
154
+
155
+ messages_ = list()
156
+ for m in match:
157
+ role = m[0].lower()
158
+ content = m[1]
159
+ if role in ("client", "Client"):
160
+ role = "user"
161
+ elif role in ("assistant", "Assistant"):
162
+ role = "assistant"
163
+ else:
164
+ raise AssertionError
165
+ messages_.append({
166
+ "role": role,
167
+ "content": content
168
+ })
169
+
170
+ messages = [
171
+ {"role": "system", "content": system_prompt},
172
+ *messages_
173
+ ]
174
+ # print(json.dumps(messages, ensure_ascii=False, indent=4))
175
+ # exit(0)
176
+
177
  try:
178
  time.sleep(args.interval)
179
  print(f"sleep: {args.interval}")
 
182
  # https://docs.byteplus.com/en/docs/ModelArk/1449737
183
  llm_response = client.chat.completions.create(
184
  model=args.model_name,
185
+ messages=messages,
 
 
186
  stream=False,
187
  max_tokens=4096,
188
  extra_body={
llm_eval_script/byteplus_summary.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://docs.byteplus.com/en/docs/ModelArk/1099455
5
+
6
+ model list
7
+ https://docs.byteplus.com/en/docs/ModelArk/1330310
8
+
9
+ https://docs.byteplus.com/en/docs/ModelArk/Chat
10
+ """
11
+ import argparse
12
+ from datetime import datetime
13
+ import json
14
+ import os
15
+ from pathlib import Path
16
+ import re
17
+ import sys
18
+ import time
19
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
20
+
21
+ pwd = os.path.abspath(os.path.dirname(__file__))
22
+ sys.path.append(os.path.join(pwd, "../"))
23
+
24
+ from openai import OpenAI
25
+
26
+ from project_settings import environment, project_path
27
+
28
+
29
+ def get_args():
30
+ """
31
+ model list:
32
+ https://docs.byteplus.com/en/docs/ModelArk/1330310
33
+
34
+ bytedance-seed-1.6
35
+ seed-1-6-250615
36
+
37
+ bytedance-seed-1.6-flash
38
+ seed-1-6-flash-250615
39
+
40
+ deepseek-v3
41
+ deepseek-v3-250324
42
+ """
43
+ parser = argparse.ArgumentParser()
44
+ parser.add_argument(
45
+ "--model_name",
46
+ # default="seed-1-6-250615",
47
+ default="seed-1-6-flash-250615",
48
+ # default="deepseek-v3-250324",
49
+ type=str
50
+ )
51
+ parser.add_argument(
52
+ "--eval_dataset_name",
53
+ default="agent-bingoplus-ph-25-summary.jsonl",
54
+ type=str
55
+ )
56
+ parser.add_argument(
57
+ "--eval_dataset_dir",
58
+ default=(project_path / "data/dataset").as_posix(),
59
+ type=str
60
+ )
61
+ parser.add_argument(
62
+ "--eval_data_dir",
63
+ default=(project_path / "data/eval_data").as_posix(),
64
+ type=str
65
+ )
66
+ parser.add_argument(
67
+ "--client",
68
+ default="shenzhen_sase",
69
+ type=str
70
+ )
71
+ parser.add_argument(
72
+ "--service",
73
+ default="byteplus_api_key",
74
+ type=str
75
+ )
76
+ parser.add_argument(
77
+ "--create_time_str",
78
+ default="null",
79
+ # default="20250728_113641",
80
+ type=str
81
+ )
82
+ parser.add_argument(
83
+ "--interval",
84
+ default=1,
85
+ type=int
86
+ )
87
+ args = parser.parse_args()
88
+ return args
89
+
90
+
91
+ def main():
92
+ args = get_args()
93
+
94
+ eval_dataset_dir = Path(args.eval_dataset_dir)
95
+ eval_dataset_dir.mkdir(parents=True, exist_ok=True)
96
+ eval_data_dir = Path(args.eval_data_dir)
97
+ eval_data_dir.mkdir(parents=True, exist_ok=True)
98
+
99
+ if args.create_time_str == "null":
100
+ tz = ZoneInfo("Asia/Shanghai")
101
+ now = datetime.now(tz)
102
+ create_time_str = now.strftime("%Y%m%d_%H%M%S")
103
+ # create_time_str = "20250724_090615"
104
+ else:
105
+ create_time_str = args.create_time_str
106
+
107
+ eval_dataset = eval_dataset_dir / args.eval_dataset_name
108
+
109
+ output_file = eval_data_dir / f"byteplus/byteplus/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
110
+ output_file.parent.mkdir(parents=True, exist_ok=True)
111
+
112
+ api_key = environment.get(args.service, dtype=str)
113
+ client = OpenAI(
114
+ base_url="https://ark.ap-southeast.bytepluses.com/api/v3/",
115
+ # Read your Ark API Key from the environment variable.
116
+ api_key=api_key
117
+ )
118
+
119
+ total = 0
120
+ total_score = 0
121
+
122
+ # finished
123
+ finished_idx_set = set()
124
+ if os.path.exists(output_file.as_posix()):
125
+ with open(output_file.as_posix(), "r", encoding="utf-8") as f:
126
+ for row in f:
127
+ row = json.loads(row)
128
+ idx = row["idx"]
129
+ total = row["total"]
130
+ total_score = row["total_score"]
131
+ finished_idx_set.add(idx)
132
+ print(f"finished count: {len(finished_idx_set)}")
133
+
134
+ with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
135
+ for row in fin:
136
+ row = json.loads(row)
137
+ idx = row["idx"]
138
+ system_prompt: str = row["system_prompt"]
139
+ user_prompt: str = row["user_prompt"]
140
+ response = row["response"]
141
+
142
+ if idx in finished_idx_set:
143
+ continue
144
+ finished_idx_set.add(idx)
145
+
146
+ try:
147
+ time.sleep(args.interval)
148
+ print(f"sleep: {args.interval}")
149
+ time_begin = time.time()
150
+
151
+ # https://docs.byteplus.com/en/docs/ModelArk/1449737
152
+ llm_response = client.chat.completions.create(
153
+ model=args.model_name,
154
+ messages=[
155
+ {"role": "system", "content": system_prompt},
156
+ {"role": "user", "content": user_prompt},
157
+ ],
158
+ stream=False,
159
+ max_tokens=4096,
160
+ extra_body={
161
+ "thinking": {
162
+ "type": "disabled",
163
+ # "type": "enabled",
164
+ }
165
+ }
166
+ )
167
+ time_cost = time.time() - time_begin
168
+ print(f"time_cost: {time_cost}")
169
+ except Exception as e:
170
+ print(f"request failed, error type: {type(e)}, error text: {str(e)}")
171
+ continue
172
+
173
+ prediction = llm_response.choices[0].message.content
174
+
175
+ response_ = json.loads(response)
176
+ response_tag_name_list = response_["tag_name_list"]
177
+ # print(response_tag_name_list)
178
+
179
+ if prediction.startswith("```json") and prediction.endswith("```"):
180
+ prediction_ = prediction[7:-3]
181
+ else:
182
+ prediction_ = prediction
183
+
184
+ prediction_tag_name_list = list()
185
+ try:
186
+ prediction_ = json.loads(prediction_)
187
+ prediction_tag_name_list = prediction_["tag_name_list"]
188
+ except json.JSONDecodeError:
189
+ pass
190
+ # print(prediction_tag_name_list)
191
+
192
+ # recall
193
+ recall_count = 0
194
+ for tag in response_tag_name_list:
195
+ if tag in prediction_tag_name_list:
196
+ recall_count += 1
197
+ recall = recall_count / (len(response_tag_name_list) + 1e-7)
198
+
199
+ # precision
200
+ precision_count = 0
201
+ for tag in prediction_tag_name_list:
202
+ if tag in response_tag_name_list:
203
+ precision_count += 1
204
+ precision = precision_count / (len(prediction_tag_name_list) + 1e-7)
205
+
206
+ # f1
207
+ f1 = 2 * (recall * precision) / (recall + precision + 1e-7)
208
+
209
+ total += 1
210
+ total_score += f1
211
+ score = total_score / total
212
+
213
+ row_ = {
214
+ "idx": idx,
215
+ "system_prompt": system_prompt,
216
+ "user_prompt": user_prompt,
217
+ "response": response,
218
+ "prediction": prediction,
219
+ "recall": recall,
220
+ "precision": precision,
221
+ "f1": f1,
222
+ "total": total,
223
+ "total_score": total_score,
224
+ "score": score,
225
+ "time_cost": time_cost,
226
+ }
227
+ row_ = json.dumps(row_, ensure_ascii=False)
228
+ fout.write(f"{row_}\n")
229
+ fout.flush()
230
+
231
+ return
232
+
233
+
234
+ if __name__ == "__main__":
235
+ main()
llm_eval_script/gemini_google.py CHANGED
@@ -18,6 +18,12 @@ llama-4-scout-17b-16e-instruct-maas
18
 
19
 
20
 
 
 
 
 
 
 
21
 
22
  """
23
  import argparse
@@ -44,10 +50,8 @@ def get_args():
44
  parser.add_argument(
45
  "--model_name",
46
  # default="gemini-2.5-pro", # The model does not support setting thinking_budget to 0.
47
- # default="gemini-2.5-flash",
48
  # default="gemini-2.5-flash-lite-preview-06-17",
49
- # default="claude-opus-4@20250514",
50
- default="claude-sonnet-4@20250514",
51
  # default="llama-4-maverick-17b-128e-instruct-maas",
52
  # default="llama-4-scout-17b-16e-instruct-maas",
53
  type=str
@@ -131,8 +135,8 @@ def main():
131
  client = genai.Client(
132
  vertexai=True,
133
  project=project_id,
134
- # location="global",
135
- location="us-east5",
136
  )
137
  generate_content_config = types.GenerateContentConfig(
138
  top_p=0.95,
 
18
 
19
 
20
 
21
+ Model Name
22
+
23
+ gemini-2.5-pro
24
+ The model does not support setting thinking_budget to 0.
25
+ Unable to submit request because thinking_budget is out of range; supported values are integers from 128 to 32768.
26
+
27
 
28
  """
29
  import argparse
 
50
  parser.add_argument(
51
  "--model_name",
52
  # default="gemini-2.5-pro", # The model does not support setting thinking_budget to 0.
53
+ default="gemini-2.5-flash",
54
  # default="gemini-2.5-flash-lite-preview-06-17",
 
 
55
  # default="llama-4-maverick-17b-128e-instruct-maas",
56
  # default="llama-4-scout-17b-16e-instruct-maas",
57
  type=str
 
135
  client = genai.Client(
136
  vertexai=True,
137
  project=project_id,
138
+ location="global",
139
+ # location="us-east5",
140
  )
141
  generate_content_config = types.GenerateContentConfig(
142
  top_p=0.95,
llm_eval_script/gemini_google_chat.py CHANGED
@@ -5,6 +5,7 @@ from datetime import datetime
5
  import json
6
  import os
7
  from pathlib import Path
 
8
  import sys
9
  import time
10
  import tempfile
@@ -24,16 +25,17 @@ def get_args():
24
  parser.add_argument(
25
  "--model_name",
26
  # default="gemini-2.5-pro", # The model does not support setting thinking_budget to 0.
27
- # default="gemini-2.5-flash",
28
  # default="gemini-2.5-flash-lite-preview-06-17",
29
  # default="llama-4-maverick-17b-128e-instruct-maas",
30
- default="llama-4-scout-17b-16e-instruct-maas",
31
  type=str
32
  )
33
  parser.add_argument(
34
  "--eval_dataset_name",
35
  # default="agent-lingoace-zh-80-chat.jsonl",
36
- default="agent-bingoplus-ph-200-chat.jsonl",
 
37
  type=str
38
  )
39
  parser.add_argument(
@@ -58,8 +60,8 @@ def get_args():
58
  )
59
  parser.add_argument(
60
  "--create_time_str",
61
- # default="null",
62
- default="20250731_162116",
63
  type=str
64
  )
65
  parser.add_argument(
@@ -145,13 +147,46 @@ def main():
145
  continue
146
  finished_idx_set.add(idx)
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  contents = [
149
  types.Content(
150
- role="user",
151
  parts=[
152
- types.Part.from_text(text=prompt)
153
  ]
154
  )
 
155
  ]
156
  time.sleep(args.interval)
157
  print(f"sleep: {args.interval}")
@@ -181,6 +216,7 @@ def main():
181
  }
182
  row_ = json.dumps(row_, ensure_ascii=False)
183
  fout.write(f"{row_}\n")
 
184
 
185
  return
186
 
 
5
  import json
6
  import os
7
  from pathlib import Path
8
+ import re
9
  import sys
10
  import time
11
  import tempfile
 
25
  parser.add_argument(
26
  "--model_name",
27
  # default="gemini-2.5-pro", # The model does not support setting thinking_budget to 0.
28
+ default="gemini-2.5-flash",
29
  # default="gemini-2.5-flash-lite-preview-06-17",
30
  # default="llama-4-maverick-17b-128e-instruct-maas",
31
+ # default="llama-4-scout-17b-16e-instruct-maas",
32
  type=str
33
  )
34
  parser.add_argument(
35
  "--eval_dataset_name",
36
  # default="agent-lingoace-zh-80-chat.jsonl",
37
+ # default="agent-bingoplus-ph-200-chat.jsonl",
38
+ default="agent-cod-zh-70-chat.jsonl",
39
  type=str
40
  )
41
  parser.add_argument(
 
60
  )
61
  parser.add_argument(
62
  "--create_time_str",
63
+ default="null",
64
+ # default="20250731_162116",
65
  type=str
66
  )
67
  parser.add_argument(
 
147
  continue
148
  finished_idx_set.add(idx)
149
 
150
+ # prompt
151
+ splits = prompt[::-1].split("\n\n", maxsplit=1)
152
+ conversation = splits[0]
153
+ system_prompt = splits[1]
154
+ conversation = conversation[::-1].strip()
155
+ system_prompt = system_prompt[::-1].strip()
156
+
157
+ pattern = "^(Client|Assistant): (.*?)(?=\n(?:Client|Assistant):)"
158
+ match = re.findall(pattern=pattern, string=conversation, flags=re.I|re.DOTALL|re.MULTILINE)
159
+
160
+ messages_ = list()
161
+ for m in match:
162
+ role = m[0].lower()
163
+ content = m[1]
164
+ if role in ("client", "Client"):
165
+ role = "user"
166
+ elif role in ("assistant", "Assistant"):
167
+ role = "assistant"
168
+ else:
169
+ raise AssertionError
170
+ messages_.append({
171
+ "role": role,
172
+ "content": content
173
+ })
174
+
175
+ messages = [
176
+ {"role": "system", "content": system_prompt},
177
+ *messages_
178
+ ]
179
+ # print(json.dumps(messages, ensure_ascii=False, indent=4))
180
+ # exit(0)
181
+
182
  contents = [
183
  types.Content(
184
+ role="user" if m["role"] == "user" else "model",
185
  parts=[
186
+ types.Part.from_text(text=m["content"])
187
  ]
188
  )
189
+ for m in messages
190
  ]
191
  time.sleep(args.interval)
192
  print(f"sleep: {args.interval}")
 
216
  }
217
  row_ = json.dumps(row_, ensure_ascii=False)
218
  fout.write(f"{row_}\n")
219
+ fout.flush()
220
 
221
  return
222
 
llm_eval_script/gemini_google_summary.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ from datetime import datetime
5
+ import json
6
+ import os
7
+ from pathlib import Path
8
+ import re
9
+ import sys
10
+ import time
11
+ import tempfile
12
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
13
+
14
+ pwd = os.path.abspath(os.path.dirname(__file__))
15
+ sys.path.append(os.path.join(pwd, "../"))
16
+
17
+ from google import genai
18
+ from google.genai import types
19
+
20
+ from project_settings import environment, project_path
21
+
22
+
23
+ def get_args():
24
+ parser = argparse.ArgumentParser()
25
+ parser.add_argument(
26
+ "--model_name",
27
+ # default="gemini-2.5-pro", # The model does not support setting thinking_budget to 0.
28
+ default="gemini-2.5-flash",
29
+ # default="gemini-2.5-flash-lite-preview-06-17",
30
+ # default="llama-4-maverick-17b-128e-instruct-maas",
31
+ # default="llama-4-scout-17b-16e-instruct-maas",
32
+ type=str
33
+ )
34
+ parser.add_argument(
35
+ "--eval_dataset_name",
36
+ default="agent-bingoplus-ph-25-summary.jsonl",
37
+ type=str
38
+ )
39
+ parser.add_argument(
40
+ "--eval_dataset_dir",
41
+ default=(project_path / "data/dataset").as_posix(),
42
+ type=str
43
+ )
44
+ parser.add_argument(
45
+ "--eval_data_dir",
46
+ default=(project_path / "data/eval_data").as_posix(),
47
+ type=str
48
+ )
49
+ parser.add_argument(
50
+ "--client",
51
+ default="shenzhen_sase",
52
+ type=str
53
+ )
54
+ parser.add_argument(
55
+ "--service",
56
+ default="google_potent_veld_462405_t3",
57
+ type=str
58
+ )
59
+ parser.add_argument(
60
+ "--create_time_str",
61
+ default="null",
62
+ # default="20250731_162116",
63
+ type=str
64
+ )
65
+ parser.add_argument(
66
+ "--interval",
67
+ default=1,
68
+ type=int
69
+ )
70
+ args = parser.parse_args()
71
+ return args
72
+
73
+
74
def main():
    """Run the tag-summary evaluation against a Gemini model on Vertex AI.

    Each dataset row supplies ``idx`` / ``system_prompt`` / ``user_prompt`` /
    ``response`` (the reference answer: a JSON object with a "tag_name_list").
    The model prediction is parsed the same way and scored with recall /
    precision / F1; the raw texts plus a running average are appended to the
    output jsonl.  Rows whose ``idx`` already appears in the output file are
    skipped, so an interrupted run can be resumed via --create_time_str.
    """
    args = get_args()

    service = environment.get(args.service, dtype=json.loads)
    project_id = service["project_id"]

    # Vertex AI authenticates through a service-account JSON file referenced
    # by GOOGLE_APPLICATION_CREDENTIALS; materialize the credentials to a
    # temp file first.
    google_application_credentials = Path(tempfile.gettempdir()) / f"llm_eval_system/{project_id}.json"
    google_application_credentials.parent.mkdir(parents=True, exist_ok=True)

    with open(google_application_credentials.as_posix(), "w", encoding="utf-8") as f:
        content = json.dumps(service, ensure_ascii=False, indent=4)
        f.write(f"{content}\n")

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_application_credentials.as_posix()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
        # create_time_str = "20250729-interval-5"
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    output_file = eval_data_dir / f"gemini_google/google/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    client = genai.Client(
        vertexai=True,
        project=project_id,
        # location="global",
        location="us-east5",
    )
    generate_content_config = types.GenerateContentConfig(
        top_p=0.95,
        temperature=0.6,
        # max_output_tokens=1,
        response_modalities=["TEXT"],
        # Disable thinking (gemini-2.5-pro rejects thinking_budget=0).
        thinking_config=types.ThinkingConfig(
            thinking_budget=0
        )
    )

    total = 0
    total_score = 0

    # Restore the running totals and the set of already-scored rows so the
    # run is resumable.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                total_score = row["total_score"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            system_prompt: str = row["system_prompt"]
            user_prompt: str = row["user_prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            # NOTE(review): the system prompt is sent as a "model" turn since
            # `contents` only carries user/model roles; consider
            # GenerateContentConfig(system_instruction=...) instead — confirm
            # against the google-genai SDK docs.
            contents = [
                types.Content(
                    role="model",
                    parts=[
                        types.Part.from_text(text=system_prompt)
                    ]
                ),
                types.Content(
                    role="user",
                    parts=[
                        types.Part.from_text(text=user_prompt)
                    ]
                )
            ]
            time.sleep(args.interval)
            print(f"sleep: {args.interval}")
            time_begin = time.time()
            try:
                llm_response: types.GenerateContentResponse = client.models.generate_content(
                    model=args.model_name,
                    contents=contents,
                    config=generate_content_config,
                )
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")
                # candidates/content/parts may be missing on blocked or failed
                # responses; treated as a failed request below.
                prediction = llm_response.candidates[0].content.parts[0].text
            except Exception as e:
                # Skip transient API failures instead of aborting the whole
                # run (consistent with the other eval scripts in this repo).
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue
            if prediction is None:
                # An empty text part would otherwise crash on .startswith below.
                prediction = ""

            response_ = json.loads(response)
            response_tag_name_list = response_["tag_name_list"]
            # print(response_tag_name_list)

            # Strip an optional ```json ... ``` fence around the prediction.
            if prediction.startswith("```json") and prediction.endswith("```"):
                prediction_ = prediction[7:-3]
            else:
                prediction_ = prediction

            prediction_tag_name_list = list()
            try:
                prediction_ = json.loads(prediction_)
                prediction_tag_name_list = prediction_["tag_name_list"]
            except json.JSONDecodeError:
                # An unparseable prediction counts as zero predicted tags.
                pass
            # print(prediction_tag_name_list)

            # recall: fraction of reference tags recovered by the prediction.
            recall_count = 0
            for tag in response_tag_name_list:
                if tag in prediction_tag_name_list:
                    recall_count += 1
            recall = recall_count / (len(response_tag_name_list) + 1e-7)

            # precision: fraction of predicted tags present in the reference.
            precision_count = 0
            for tag in prediction_tag_name_list:
                if tag in response_tag_name_list:
                    precision_count += 1
            precision = precision_count / (len(prediction_tag_name_list) + 1e-7)

            # f1 (epsilon keeps the division defined when both terms are zero).
            f1 = 2 * (recall * precision) / (recall + precision + 1e-7)

            total += 1
            total_score += f1
            score = total_score / total

            row_ = {
                "idx": idx,
                "system_prompt": system_prompt,
                "user_prompt": user_prompt,
                "response": response,
                "prediction": prediction,
                "recall": recall,
                "precision": precision,
                "f1": f1,
                "total": total,
                "total_score": total_score,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            # Flush per row so a killed run can be resumed without losing rows.
            fout.flush()

    return


if __name__ == "__main__":
    main()
llm_eval_script/siliconflow_summary.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://cloud.siliconflow.cn/sft-d1rosn8o8n4s73ftpa1g/playground/chat/17885302852
5
+
6
+ https://docs.siliconflow.cn/cn/userguide/capabilities/reasoning
7
+
8
+ Model Name:
9
+ Pro/deepseek-ai/DeepSeek-R1
10
+ Tips:
11
+ (1)thinking_budget: Must be greater than or equal to 1
12
+ (2)The selected model requires paid balance. Your paid balance is insufficient. Please top up and try again.
13
+
14
+ Model Name:
15
+ tencent/Hunyuan-A13B-Instruct
16
+ Tips:
17
+ (1)它在回答时总是会先思考,最后给出答案.这适合知识问答,但不符合我们Agent的需求. 后来强制其只能输出 A-E 中的一个字符(max_tokens=4),以完成评估.
18
+ max_tokens=4,
19
+ logit_bias={
20
+ 32: 100,
21
+ 33: 100,
22
+ 34: 100,
23
+ 35: 100,
24
+ 36: 100,
25
+ 37: 100,
26
+ },
27
+
28
+ Model Name:
29
+ deepseek-ai/DeepSeek-R1
30
+ Tips:
31
+ (1)为了让它只输出一个字符,设置 max_tokens=3
32
+
33
+
34
+ Model Name:
35
+ Qwen/Qwen3-8B
36
+ deepseek-ai/DeepSeek-R1-0528-Qwen3-8B
37
+ deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
38
+ Tips:
39
+ (1)为了让它只输出一个字符,设置 max_tokens=1
40
+
41
+
42
+
43
+ Model Name:
44
+ baidu/ERNIE-4.5-300B-A47B
45
+ Tips:
46
+ (1)它可能使用的是bpe 分词, logit_bias 注释掉。
47
+
48
+ """
49
+ import argparse
50
+ from datetime import datetime
51
+ import json
52
+ import os
53
+ from pathlib import Path
54
+ import sys
55
+ import time
56
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
57
+
58
+ pwd = os.path.abspath(os.path.dirname(__file__))
59
+ sys.path.append(os.path.join(pwd, "../"))
60
+
61
+ from openai import OpenAI
62
+
63
+ from project_settings import environment, project_path
64
+
65
+
66
def get_args():
    """Parse command-line options for the SiliconFlow tag-summary evaluation run."""
    parser = argparse.ArgumentParser()
    # Alternative models kept for reference (see the module docstring for the
    # per-model quirks: max_tokens / logit_bias / thinking workarounds):
    #   Pro/deepseek-ai/DeepSeek-R1
    #   tencent/Hunyuan-A13B-Instruct
    #   Qwen/Qwen3-8B
    #   deepseek-ai/DeepSeek-R1
    #   deepseek-ai/DeepSeek-R1-0528-Qwen3-8B
    #   deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
    #   baidu/ERNIE-4.5-300B-A47B
    parser.add_argument("--model_name", default="deepseek-ai/DeepSeek-V3", type=str)
    parser.add_argument("--eval_dataset_name", default="agent-bingoplus-ph-25-summary.jsonl", type=str)
    parser.add_argument("--eval_dataset_dir", default=(project_path / "data/dataset").as_posix(), type=str)
    parser.add_argument("--eval_data_dir", default=(project_path / "data/eval_data").as_posix(), type=str)
    parser.add_argument("--client", default="shenzhen_sase", type=str)
    parser.add_argument("--service", default="siliconflow_api_key", type=str)
    # Set to a concrete timestamp (e.g. "20250728_113641") to resume a previous
    # run; "null" means: generate a fresh timestamp now.
    parser.add_argument("--create_time_str", default="null", type=str)
    # Seconds to sleep between requests (simple rate limiting).
    parser.add_argument("--interval", default=1, type=int)
    return parser.parse_args()
118
+
119
+
120
def main():
    """Run the tag-summary evaluation against a SiliconFlow-hosted model.

    Each dataset row supplies ``idx`` / ``system_prompt`` / ``user_prompt`` /
    ``response`` (the reference answer: a JSON object with a "tag_name_list").
    The model prediction is parsed the same way and scored with recall /
    precision / F1; the raw texts plus a running average are appended to the
    output jsonl.  Rows whose ``idx`` already appears in the output file are
    skipped, so an interrupted run can be resumed via --create_time_str.
    """
    args = get_args()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
        # create_time_str = "20250724_090615"
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    # "/" in model names (e.g. "deepseek-ai/DeepSeek-V3") would create extra
    # path segments; replace it so the model maps to a single directory name.
    model_name_ = args.model_name.replace("/", "#")
    output_file = eval_data_dir / f"siliconflow/siliconflow/{model_name_}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    api_key = environment.get(args.service, dtype=str)
    client = OpenAI(
        base_url="https://api.siliconflow.cn/v1/",
        # Read your Ark API Key from the environment variable.
        api_key=api_key
    )

    total = 0
    total_score = 0

    # Restore the running totals and the set of already-scored rows so the
    # run is resumable.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                total_score = row["total_score"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            system_prompt: str = row["system_prompt"]
            user_prompt: str = row["user_prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            try:
                time.sleep(args.interval)
                print(f"sleep: {args.interval}")
                time_begin = time.time()
                completion = client.chat.completions.create(
                    model=args.model_name,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_prompt},
                    ],
                    stream=False,
                    max_tokens=4096,
                    # max_tokens=1,
                    temperature=0.6,
                    top_p=0.95,
                    # Per-model logit_bias workaround — see the module docstring.
                    # logit_bias={
                    #     32: 100,
                    #     33: 100,
                    #     34: 100,
                    #     35: 100,
                    #     36: 100,
                    #     37: 100,
                    #     38: 100,
                    #     39: 100,
                    # },
                    # thinking_budget must be >= 1 on this endpoint.
                    extra_body={
                        "thinking_budget": 1
                    }
                )
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")
            except Exception as e:
                # Skip transient API failures instead of aborting the whole run.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            # message.content can be None (e.g. refusal / empty completion);
            # treat that as an empty prediction instead of crashing below.
            prediction = completion.choices[0].message.content
            if prediction is None:
                prediction = ""

            response_ = json.loads(response)
            response_tag_name_list = response_["tag_name_list"]
            # print(response_tag_name_list)

            # Strip an optional ```json ... ``` fence around the prediction.
            if prediction.startswith("```json") and prediction.endswith("```"):
                prediction_ = prediction[7:-3]
            else:
                prediction_ = prediction

            prediction_tag_name_list = list()
            try:
                prediction_ = json.loads(prediction_)
                prediction_tag_name_list = prediction_["tag_name_list"]
            except json.JSONDecodeError:
                # An unparseable prediction counts as zero predicted tags.
                pass
            # print(prediction_tag_name_list)

            # recall: fraction of reference tags recovered by the prediction.
            recall_count = 0
            for tag in response_tag_name_list:
                if tag in prediction_tag_name_list:
                    recall_count += 1
            recall = recall_count / (len(response_tag_name_list) + 1e-7)

            # precision: fraction of predicted tags present in the reference.
            precision_count = 0
            for tag in prediction_tag_name_list:
                if tag in response_tag_name_list:
                    precision_count += 1
            precision = precision_count / (len(prediction_tag_name_list) + 1e-7)

            # f1 (epsilon keeps the division defined when both terms are zero).
            f1 = 2 * (recall * precision) / (recall + precision + 1e-7)

            total += 1
            total_score += f1
            score = total_score / total

            row_ = {
                "idx": idx,
                "system_prompt": system_prompt,
                "user_prompt": user_prompt,
                "response": response,
                "prediction": prediction,
                "recall": recall,
                "precision": precision,
                "f1": f1,
                "total": total,
                "total_score": total_score,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            # Flush per row so a killed run can be resumed without losing rows.
            fout.flush()

    return


if __name__ == "__main__":
    main()
main.py CHANGED
@@ -132,6 +132,8 @@ def load_board():
132
  score_ = row["correct"]
133
  elif name.endswith("-chat"):
134
  score_ = row["score"]
 
 
135
  else:
136
  raise AssertionError
137
 
@@ -296,7 +298,8 @@ def main():
296
  value=llm_ranking_board,
297
  max_height=800, min_width=160,
298
  label="board",
299
- interactive=True, show_search="search"
 
300
  )
301
 
302
  board_button.click(
 
132
  score_ = row["correct"]
133
  elif name.endswith("-chat"):
134
  score_ = row["score"]
135
+ elif name.endswith("-summary"):
136
+ score_ = row["score"]
137
  else:
138
  raise AssertionError
139
 
 
298
  value=llm_ranking_board,
299
  max_height=800, min_width=160,
300
  label="board",
301
+ # interactive=True,
302
+ show_search="search"
303
  )
304
 
305
  board_button.click(