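"""Evaluate generated images with each benchmark's specific metrics.

Takes an API type and one or more benchmark types on the command line,
scores the images recorded in images/<api_type>/<benchmark_type>/metadata.jsonl
(produced by sample.py), and appends one JSON result per benchmark to
evaluation_results/<api_type>.jsonl. Benchmarks already present in that file
are skipped.
"""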
import argparse
import json
import warnings
from pathlib import Path
from typing import Dict

import numpy as np
from PIL import Image
from tqdm import tqdm

from benchmark import create_benchmark
from benchmark.metrics import create_metric

warnings.filterwarnings("ignore", category=FutureWarning)


def evaluate_benchmark(
    benchmark_type: str, api_type: str, images_dir: Path = Path("images")
) -> Dict:
    """
    Evaluate a benchmark's images using its specific metrics.

    Args:
        benchmark_type (str): Type of benchmark to evaluate
        api_type (str): Type of API used to generate images
        images_dir (Path): Base directory containing generated images

    Returns:
        Dict containing evaluation results
    """
    benchmark = create_benchmark(benchmark_type)

    benchmark_dir = images_dir / api_type / benchmark_type
    metadata_file = benchmark_dir / "metadata.jsonl"

    if not metadata_file.exists():
        raise FileNotFoundError(
            f"No metadata file found for {api_type}/{benchmark_type}. Please run sample.py first."
        )

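    # Read the metadata written by sample.py: one JSON record per generated image.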
    metadata = []
    with open(metadata_file, "r") as f:
        for line in f:
            metadata.append(json.loads(line))
    if not metadata:
        raise ValueError(f"Metadata file {metadata_file} is empty.")

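    # Instantiate each metric type declared by the benchmark.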
    metrics = {
        metric_type: create_metric(metric_type) for metric_type in benchmark.metrics
    }

    results = {
        "api": api_type,
        "benchmark": benchmark_type,
        "metrics": {metric: 0.0 for metric in benchmark.metrics},
        "total_images": len(metadata),
    }
    inference_times = []

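    # Sum per-metric scores across generated images, skipping any missing on disk.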
    for entry in tqdm(metadata):
        image_path = benchmark_dir / entry["filepath"]
        if not image_path.exists():
            continue

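        # The VQA metric expects an image path; the other metrics expect a loaded PIL image.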
        for metric_type, metric in metrics.items():
            try:
                if metric_type == "vqa":
                    score = metric.compute_score(image_path, entry["prompt"])
                else:
                    with Image.open(image_path) as image:
                        score = metric.compute_score(image, entry["prompt"])
                results["metrics"][metric_type] += score[metric_type]
            except Exception as e:
                print(f"Error computing {metric_type} for {image_path}: {str(e)}")

        inference_times.append(entry["inference_time"])

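    # Average over all metadata entries; images that were missing or failed scoring contribute zero.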
    for metric in results["metrics"]:
        results["metrics"][metric] /= len(metadata)
    results["median_inference_time"] = np.median(inference_times).item()

    return results


def main():
    parser = argparse.ArgumentParser(
        description="Evaluate generated images using benchmark-specific metrics"
    )
    parser.add_argument("api_type", help="Type of API to evaluate")
    parser.add_argument(
        "benchmarks", nargs="+", help="List of benchmark types to evaluate"
    )

    args = parser.parse_args()

    results_dir = Path("evaluation_results")
    results_dir.mkdir(exist_ok=True)

    results_file = results_dir / f"{args.api_type}.jsonl"
    existing_results = set()

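    # Collect benchmarks already evaluated for this API so they can be skipped below.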
    if results_file.exists():
        with open(results_file, "r") as f:
            for line in f:
                result = json.loads(line)
                existing_results.add(result["benchmark"])

    for benchmark_type in args.benchmarks:
        if benchmark_type in existing_results:
            print(f"Skipping {args.api_type}/{benchmark_type} - already evaluated")
            continue

        try:
            print(f"Evaluating {args.api_type}/{benchmark_type}")
            results = evaluate_benchmark(benchmark_type, args.api_type)

            # Append results to file
            with open(results_file, "a") as f:
                f.write(json.dumps(results) + "\n")

        except Exception as e:
            print(f"Error evaluating {args.api_type}/{benchmark_type}: {str(e)}")


if __name__ == "__main__":
    main()