#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/claude?hl=zh-cn
https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/claude/use-claude?hl=zh-cn
Llama
https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/llama/use-llama?hl=zh-cn
https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/llama/use-llama?hl=zh-cn#regions-quotas
Model Name
llama-4-maverick-17b-128e-instruct-maas
llama-4-scout-17b-16e-instruct-maas
区域选择 us-east5
Model Name
gemini-2.5-pro
The model does not support setting thinking_budget to 0.
Unable to submit request because thinking_budget is out of range; supported values are integers from 128 to 32768.
"""
import argparse
from datetime import datetime
import json
import os
from pathlib import Path
import sys
import time
import tempfile
from zoneinfo import ZoneInfo  # bundled with Python 3.9+; no extra install needed
# Make sibling modules (e.g. project_settings) importable when run as a script.
pwd = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(pwd, "../"))
from google import genai
from google.genai import types
from project_settings import environment, project_path
def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        # default="gemini-2.5-pro",  # The model does not support setting thinking_budget to 0.
        default="gemini-2.5-flash",
        # default="gemini-2.5-flash-lite-preview-06-17",
        # default="llama-4-maverick-17b-128e-instruct-maas",
        # default="llama-4-scout-17b-16e-instruct-maas",
        type=str
    )
    parser.add_argument(
        "--eval_dataset_name",
        # default="agent-bingoplus-ph-90-choice.jsonl",
        default="agent-lingoace-zh-400-choice.jsonl",
        # default="arc-easy-1000-choice.jsonl",
        type=str
    )
    parser.add_argument(
        "--eval_dataset_dir",
        default=(project_path / "data/dataset").as_posix(),
        type=str
    )
    parser.add_argument(
        "--eval_data_dir",
        default=(project_path / "data/eval_data").as_posix(),
        type=str
    )
    parser.add_argument(
        "--client",
        default="shenzhen_sase",
        type=str
    )
    parser.add_argument(
        "--service",
        # default="google_potent_veld_462405_t3",
        default="google_nxcloud_312303",
        type=str
    )
    parser.add_argument(
        "--create_time_str",
        default="null",
        # default="20250731_162116",
        type=str
    )
    parser.add_argument(
        "--interval",
        default=1,
        type=int
    )
    args = parser.parse_args()
    return args
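# The value stored under --service in `environment` is assumed to be a GCP
# service-account JSON (it is written out verbatim as application
# credentials), so it must contain "project_id" plus the usual key fields.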
def main():
    args = get_args()

    service = environment.get(args.service, dtype=json.loads)
    project_id = service["project_id"]

    # Write the credentials to a temp file and point
    # GOOGLE_APPLICATION_CREDENTIALS at it so the genai client can authenticate.
    google_application_credentials = Path(tempfile.gettempdir()) / f"llm_eval_system/{project_id}.json"
    google_application_credentials.parent.mkdir(parents=True, exist_ok=True)
    with open(google_application_credentials.as_posix(), "w", encoding="utf-8") as f:
        content = json.dumps(service, ensure_ascii=False, indent=4)
        f.write(f"{content}\n")
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_application_credentials.as_posix()
    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    # Either reuse an existing run directory (--create_time_str) or start a
    # new one stamped with the current Asia/Shanghai time.
    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
        # create_time_str = "20250729-interval-5"
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name
    output_file = eval_data_dir / f"gemini_google/google/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)
    client = genai.Client(
        vertexai=True,
        project=project_id,
        location="global",
        # location="us-east5",
    )
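    # NOTE: per the regions-quotas doc linked in the docstring, the Llama MaaS
    # models are served from us-east5; switch `location` accordingly for them.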
    generate_content_config = types.GenerateContentConfig(
        top_p=0.95,
        temperature=0.6,
        max_output_tokens=1,  # the *-choice datasets are scored on a single-token answer
        response_modalities=["TEXT"],
        thinking_config=types.ThinkingConfig(
            thinking_budget=0  # disable thinking; see docstring for gemini-2.5-pro limits
        )
    )
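    # gemini-2.5-pro rejects thinking_budget=0 (see docstring); if you switch
    # --model_name to it, use a budget in [128, 32768] instead, e.g.:
    #     thinking_config=types.ThinkingConfig(thinking_budget=128)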
    total = 0
    total_correct = 0

    # Resume support: collect the idx of every finished sample and restore the
    # running totals from the last record in the output file.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                total_correct = row["total_correct"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")
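    # Each dataset row is one JSON object per line with at least:
    #     {"idx": ..., "prompt": "...", "response": "..."}
    # where `response` holds the expected answer.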
    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, \
            open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            contents = [
                types.Content(
                    role="user",
                    parts=[
                        types.Part.from_text(text=prompt)
                    ]
                )
            ]
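            # Simple client-side throttle: sleep --interval seconds per request.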
            time.sleep(args.interval)
            print(f"sleep: {args.interval}")
            time_begin = time.time()
            llm_response: types.GenerateContentResponse = client.models.generate_content(
                model=args.model_name,
                contents=contents,
                config=generate_content_config,
            )
            time_cost = time.time() - time_begin
            print(f"time_cost: {time_cost}")

            try:
                prediction = llm_response.candidates[0].content.parts[0].text
            except TypeError as e:
                # Blocked or empty responses leave `content.parts` as None, so
                # indexing it raises TypeError; skip the sample and continue.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            correct = 1 if prediction == response else 0
            total += 1
            total_correct += correct
            score = total_correct / total
            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
    return
if __name__ == "__main__":
    main()