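"""Evaluate generated images with each benchmark's specific metrics.

Takes an API type and one or more benchmark types on the command line,
scores the images recorded in images/<api_type>/<benchmark_type>/metadata.jsonl
(produced by sample.py), and appends one JSON result per benchmark to
evaluation_results/<api_type>.jsonl. Benchmarks already present in that file
are skipped.
"""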
import argparse
import json
import warnings
from pathlib import Path
from typing import Dict

import numpy as np
from PIL import Image
from tqdm import tqdm

from benchmark import create_benchmark
from benchmark.metrics import create_metric

warnings.filterwarnings("ignore", category=FutureWarning)


def evaluate_benchmark(
    benchmark_type: str, api_type: str, images_dir: Path = Path("images")
) -> Dict:
    """
    Evaluate a benchmark's images using its specific metrics.

    Args:
        benchmark_type (str): Type of benchmark to evaluate
        api_type (str): Type of API used to generate images
        images_dir (Path): Base directory containing generated images

    Returns:
        Dict containing evaluation results
    """
    benchmark = create_benchmark(benchmark_type)

    benchmark_dir = images_dir / api_type / benchmark_type
    metadata_file = benchmark_dir / "metadata.jsonl"

    if not metadata_file.exists():
        raise FileNotFoundError(
            f"No metadata file found for {api_type}/{benchmark_type}. Please run sample.py first."
        )

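    # Read the metadata written by sample.py: one JSON record per generated image.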
    metadata = []
    with open(metadata_file, "r") as f:
        for line in f:
            metadata.append(json.loads(line))
    if not metadata:
        raise ValueError(f"Metadata file {metadata_file} is empty.")

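    # Instantiate each metric type declared by the benchmark.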
    metrics = {
        metric_type: create_metric(metric_type) for metric_type in benchmark.metrics
    }

    results = {
        "api": api_type,
        "benchmark": benchmark_type,
        "metrics": {metric: 0.0 for metric in benchmark.metrics},
        "total_images": len(metadata),
    }
    inference_times = []

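    # Sum per-metric scores across generated images, skipping any missing on disk.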
    for entry in tqdm(metadata):
        image_path = benchmark_dir / entry["filepath"]
        if not image_path.exists():
            continue

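        # The VQA metric expects an image path; the other metrics expect a loaded PIL image.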
        for metric_type, metric in metrics.items():
            try:
                if metric_type == "vqa":
                    score = metric.compute_score(image_path, entry["prompt"])
                else:
                    with Image.open(image_path) as image:
                        score = metric.compute_score(image, entry["prompt"])
                results["metrics"][metric_type] += score[metric_type]
            except Exception as e:
                print(f"Error computing {metric_type} for {image_path}: {str(e)}")

        inference_times.append(entry["inference_time"])

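    # Average over all metadata entries; images that were missing or failed scoring contribute zero.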
    for metric in results["metrics"]:
        results["metrics"][metric] /= len(metadata)
    results["median_inference_time"] = np.median(inference_times).item()

    return results


def main():
    parser = argparse.ArgumentParser(
        description="Evaluate generated images using benchmark-specific metrics"
    )
    parser.add_argument("api_type", help="Type of API to evaluate")
    parser.add_argument(
        "benchmarks", nargs="+", help="List of benchmark types to evaluate"
    )

    args = parser.parse_args()

    results_dir = Path("evaluation_results")
    results_dir.mkdir(exist_ok=True)

    results_file = results_dir / f"{args.api_type}.jsonl"
    existing_results = set()

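    # Collect benchmarks already evaluated for this API so they can be skipped below.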
    if results_file.exists():
        with open(results_file, "r") as f:
            for line in f:
                result = json.loads(line)
                existing_results.add(result["benchmark"])

    for benchmark_type in args.benchmarks:
        if benchmark_type in existing_results:
            print(f"Skipping {args.api_type}/{benchmark_type} - already evaluated")
            continue

        try:
            print(f"Evaluating {args.api_type}/{benchmark_type}")
            results = evaluate_benchmark(benchmark_type, args.api_type)

            # Append results to file
            with open(results_file, "a") as f:
                f.write(json.dumps(results) + "\n")

        except Exception as e:
            print(f"Error evaluating {args.api_type}/{benchmark_type}: {str(e)}")


if __name__ == "__main__":
    main()