import argparse
import json
import warnings
from pathlib import Path
from typing import Dict

import numpy as np
from PIL import Image
from tqdm import tqdm

from benchmark import create_benchmark
from benchmark.metrics import create_metric

warnings.filterwarnings("ignore", category=FutureWarning)


def evaluate_benchmark(
    benchmark_type: str, api_type: str, images_dir: Path = Path("images")
) -> Dict:
    """
    Evaluate a benchmark's images using its specific metrics.

    Args:
        benchmark_type (str): Type of benchmark to evaluate
        api_type (str): Type of API used to generate images
        images_dir (Path): Base directory containing generated images

    Returns:
        Dict containing evaluation results
    """
    benchmark = create_benchmark(benchmark_type)
    benchmark_dir = images_dir / api_type / benchmark_type
    metadata_file = benchmark_dir / "metadata.jsonl"

    if not metadata_file.exists():
        raise FileNotFoundError(
            f"No metadata file found for {api_type}/{benchmark_type}. Please run sample.py first."
        )

    # Each line of metadata.jsonl describes one generated image.
    metadata = []
    with open(metadata_file, "r") as f:
        for line in f:
            metadata.append(json.loads(line))

    # Instantiate every metric declared by this benchmark.
    metrics = {
        metric_type: create_metric(metric_type) for metric_type in benchmark.metrics
    }

    results = {
        "api": api_type,
        "benchmark": benchmark_type,
        "metrics": {metric: 0.0 for metric in benchmark.metrics},
        "total_images": len(metadata),
    }

    inference_times = []
    for entry in tqdm(metadata):
        image_path = benchmark_dir / entry["filepath"]
        if not image_path.exists():
            # Missing images are skipped; they contribute zero to every metric.
            continue

        for metric_type, metric in metrics.items():
            try:
                if metric_type == "vqa":
                    # The VQA metric loads the image itself from the path.
                    score = metric.compute_score(image_path, entry["prompt"])
                else:
                    image = Image.open(image_path)
                    score = metric.compute_score(image, entry["prompt"])
                results["metrics"][metric_type] += score[metric_type]
            except Exception as e:
                print(f"Error computing {metric_type} for {image_path}: {str(e)}")

        inference_times.append(entry["inference_time"])

    # Average each metric over all metadata entries
    # (missing or failed images count as zero toward the mean).
    for metric in results["metrics"]:
        results["metrics"][metric] /= len(metadata)

    results["median_inference_time"] = np.median(inference_times).item()

    return results


def main():
    parser = argparse.ArgumentParser(
        description="Evaluate generated images using benchmark-specific metrics"
    )
    parser.add_argument("api_type", help="Type of API to evaluate")
    parser.add_argument(
        "benchmarks", nargs="+", help="List of benchmark types to evaluate"
    )
    args = parser.parse_args()

    results_dir = Path("evaluation_results")
    results_dir.mkdir(exist_ok=True)

    results_file = results_dir / f"{args.api_type}.jsonl"

    # Benchmarks already recorded in the results file are not re-evaluated.
    existing_results = set()
    if results_file.exists():
        with open(results_file, "r") as f:
            for line in f:
                result = json.loads(line)
                existing_results.add(result["benchmark"])

    for benchmark_type in args.benchmarks:
        if benchmark_type in existing_results:
            print(f"Skipping {args.api_type}/{benchmark_type} - already evaluated")
            continue

        try:
            print(f"Evaluating {args.api_type}/{benchmark_type}")
            results = evaluate_benchmark(benchmark_type, args.api_type)

            # Append results to file
            with open(results_file, "a") as f:
                f.write(json.dumps(results) + "\n")
        except Exception as e:
            print(f"Error evaluating {args.api_type}/{benchmark_type}: {str(e)}")


if __name__ == "__main__":
    main()
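# Example usage (a sketch; the file name, API name, and benchmark names below
# are hypothetical placeholders, not values defined by this script):
#
#   python evaluate.py some_api benchmark_a benchmark_b
#
# Each line of images/<api_type>/<benchmark_type>/metadata.jsonl produced by
# sample.py is expected to carry at least the fields read above
# ("filepath", "prompt", "inference_time"), e.g. (illustrative values only):
#
#   {"filepath": "00001.png", "prompt": "a red cube on a table", "inference_time": 1.42}
#
# Per-benchmark results are appended to evaluation_results/<api_type>.jsonl,
# one JSON object per line.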