import argparse
import json
import warnings
from pathlib import Path
from typing import Dict

import numpy as np
from PIL import Image
from tqdm import tqdm

from benchmark import create_benchmark
from benchmark.metrics import create_metric

warnings.filterwarnings("ignore", category=FutureWarning)


def evaluate_benchmark(
    benchmark_type: str, api_type: str, images_dir: Path = Path("images")
) -> Dict:
    """
    Evaluate a benchmark's images using its specific metrics.

    Args:
        benchmark_type (str): Type of benchmark to evaluate
        api_type (str): Type of API used to generate images
        images_dir (Path): Base directory containing generated images

    Returns:
        Dict containing evaluation results
    """
    benchmark = create_benchmark(benchmark_type)
    benchmark_dir = images_dir / api_type / benchmark_type

    metadata_file = benchmark_dir / "metadata.jsonl"
    if not metadata_file.exists():
        raise FileNotFoundError(
            f"No metadata file found for {api_type}/{benchmark_type}. Please run sample.py first."
        )

    # Each line of metadata.jsonl describes one generated image
    # (at least "filepath", "prompt", and "inference_time").
    metadata = []
    with open(metadata_file, "r") as f:
        for line in f:
            metadata.append(json.loads(line))

    # Instantiate one metric object per metric type declared by the benchmark.
    metrics = {
        metric_type: create_metric(metric_type) for metric_type in benchmark.metrics
    }

    results = {
        "api": api_type,
        "benchmark": benchmark_type,
        "metrics": {metric: 0.0 for metric in benchmark.metrics},
        "total_images": len(metadata),
    }

    inference_times = []
    for entry in tqdm(metadata):
        image_path = benchmark_dir / entry["filepath"]
        if not image_path.exists():
            continue

        for metric_type, metric in metrics.items():
            try:
                # The VQA metric takes the image path directly; the other
                # metrics expect a loaded PIL image.
                if metric_type == "vqa":
                    score = metric.compute_score(image_path, entry["prompt"])
                else:
                    image = Image.open(image_path)
                    score = metric.compute_score(image, entry["prompt"])
                results["metrics"][metric_type] += score[metric_type]
            except Exception as e:
                print(f"Error computing {metric_type} for {image_path}: {str(e)}")

        inference_times.append(entry["inference_time"])

    # Average the accumulated scores over the number of metadata entries.
    for metric in results["metrics"]:
        results["metrics"][metric] /= len(metadata)

    results["median_inference_time"] = np.median(inference_times).item()

    return results


def main():
parser = argparse.ArgumentParser(
description="Evaluate generated images using benchmark-specific metrics"
)
parser.add_argument("api_type", help="Type of API to evaluate")
parser.add_argument(
"benchmarks", nargs="+", help="List of benchmark types to evaluate"
)
args = parser.parse_args()
results_dir = Path("evaluation_results")
results_dir.mkdir(exist_ok=True)
results_file = results_dir / f"{args.api_type}.jsonl"
existing_results = set()
if results_file.exists():
with open(results_file, "r") as f:
for line in f:
result = json.loads(line)
existing_results.add(result["benchmark"])
for benchmark_type in args.benchmarks:
if benchmark_type in existing_results:
print(f"Skipping {args.api_type}/{benchmark_type} - already evaluated")
continue
try:
print(f"Evaluating {args.api_type}/{benchmark_type}")
results = evaluate_benchmark(benchmark_type, args.api_type)
# Append results to file
with open(results_file, "a") as f:
f.write(json.dumps(results) + "\n")
except Exception as e:
print(f"Error evaluating {args.api_type}/{benchmark_type}: {str(e)}")
if __name__ == "__main__":
main()
|