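"""Evaluate generated images with reference-free quality metrics.

For every image under ``img_root`` the script looks up its caption in the
annotation file selected by ``gender`` and computes the metrics listed in
``evaluation_df`` (by default only the aesthetic score; ImageReward,
HPS v2.1 and CLIP similarity are also provided by ``MetricsCalculator``).
Per-image and averaged results are written to ``./evaluation/``.
"""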
import math
import os
from urllib.request import urlretrieve

import numpy as np
import pandas as pd
import torch
from PIL import Image

import hpsv2
import ImageReward as RM
import open_clip
from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
from torchmetrics.multimodal import CLIPScore
def rle2mask(mask_rle, shape):  # shape: (height, width)
    """Decode a run-length-encoded mask into a binary numpy array."""
    starts, lengths = [np.asarray(x, dtype=int) for x in (mask_rle[0:][::2], mask_rle[1:][::2])]
    starts -= 1
    ends = starts + lengths
    binary_mask = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for lo, hi in zip(starts, ends):
        binary_mask[lo:hi] = 1
    return binary_mask.reshape(shape)
class MetricsCalculator:
    def __init__(self, device, ckpt_path="../../data/ckpt") -> None:
        self.device = device
        # CLIP score (text-image similarity)
        self.clip_metric_calculator = CLIPScore(model_name_or_path="openai/clip-vit-large-patch14").to(device)
        # LPIPS (perceptual similarity)
        self.lpips_metric_calculator = LearnedPerceptualImagePatchSimilarity(net_type='squeeze').to(device)
        # aesthetic model: a linear head on top of CLIP ViT-L/14 image features
        self.aesthetic_model = torch.nn.Linear(768, 1)
        aesthetic_model_url = (
            "https://github.com/LAION-AI/aesthetic-predictor/blob/main/sa_0_4_vit_l_14_linear.pth?raw=true"
        )
        aesthetic_model_ckpt_path = os.path.join(ckpt_path, "sa_0_4_vit_l_14_linear.pth")
        # download the aesthetic head only if it is not already cached
        if not os.path.exists(aesthetic_model_ckpt_path):
            os.makedirs(ckpt_path, exist_ok=True)
            urlretrieve(aesthetic_model_url, aesthetic_model_ckpt_path)
        self.aesthetic_model.load_state_dict(torch.load(aesthetic_model_ckpt_path))
        self.aesthetic_model.eval()
        self.clip_model, _, self.clip_preprocess = open_clip.create_model_and_transforms('ViT-L-14', pretrained='openai')
        # image reward model
        self.imagereward_model = RM.load("ImageReward-v1.0")
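    # per-metric helpers: each takes a PIL image (and a text prompt where relevant)
    # and returns a plain Python float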
    def calculate_image_reward(self, image, prompt):
        reward = self.imagereward_model.score(prompt, [image])
        return reward

    def calculate_hpsv21_score(self, image, prompt):
        result = hpsv2.score(image, prompt, hps_version="v2.1")[0]
        return result.item()

    def calculate_aesthetic_score(self, img):
        image = self.clip_preprocess(img).unsqueeze(0)
        with torch.no_grad():
            image_features = self.clip_model.encode_image(image)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            prediction = self.aesthetic_model(image_features)
        return prediction.cpu().item()

    def calculate_clip_similarity(self, img, txt):
        img = np.array(img)
        img_tensor = torch.tensor(img).permute(2, 0, 1).to(self.device)
        score = self.clip_metric_calculator(img_tensor, txt)
        score = score.cpu().item()
        return score
    def calculate_psnr(self, img_pred, img_gt, mask=None):
        img_pred = np.array(img_pred).astype(np.float32) / 255.
        img_gt = np.array(img_gt).astype(np.float32) / 255.
        assert img_pred.shape == img_gt.shape, "Image shapes should be the same."
        if mask is not None:
            mask = np.array(mask).astype(np.float32)
            img_pred = img_pred * mask
            img_gt = img_gt * mask
        difference = img_pred - img_gt
        difference_square = difference ** 2
        difference_square_sum = difference_square.sum()
        # normalise by the masked area if a mask is given, otherwise by the full image size
        difference_size = mask.sum() if mask is not None else difference.size
        mse = difference_square_sum / difference_size
        if mse < 1.0e-10:
            return 1000
        PIXEL_MAX = 1
        return 20 * math.log10(PIXEL_MAX / math.sqrt(mse))
    def calculate_lpips(self, img_gt, img_pred, mask=None):
        img_pred = np.array(img_pred).astype(np.float32) / 255
        img_gt = np.array(img_gt).astype(np.float32) / 255
        assert img_pred.shape == img_gt.shape, "Image shapes should be the same."
        if mask is not None:
            mask = np.array(mask).astype(np.float32)
            img_pred = img_pred * mask
            img_gt = img_gt * mask
        img_pred_tensor = torch.tensor(img_pred).permute(2, 0, 1).unsqueeze(0).to(self.device)
        img_gt_tensor = torch.tensor(img_gt).permute(2, 0, 1).unsqueeze(0).to(self.device)
        score = self.lpips_metric_calculator(img_pred_tensor * 2 - 1, img_gt_tensor * 2 - 1)
        score = score.cpu().item()
        return score
    def calculate_mse(self, img_pred, img_gt, mask=None):
        img_pred = np.array(img_pred).astype(np.float32) / 255.
        img_gt = np.array(img_gt).astype(np.float32) / 255.
        assert img_pred.shape == img_gt.shape, "Image shapes should be the same."
        if mask is not None:
            mask = np.array(mask).astype(np.float32)
            img_pred = img_pred * mask
            img_gt = img_gt * mask
        difference = img_pred - img_gt
        difference_square = difference ** 2
        difference_square_sum = difference_square.sum()
        # normalise by the masked area if a mask is given, otherwise by the full image size
        difference_size = mask.sum() if mask is not None else difference.size
        mse = difference_square_sum / difference_size
        return float(mse)
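# script configuration: device, dataset paths, annotations and the result dataframe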
device = "cuda" if torch.cuda.is_available() else "cpu"
cc3m_base_dir = "/home/kis/datasets/cc3m_attempt12"
cc3m_full_regen = "/home/kis/Downloads/imgs"
# annotations = pd.read_csv("annotations/annotations_synthetic_100percent.txt", sep='\t', header=None, dtype=str)
cc3m_annotations_full = pd.read_csv("../../../../datasets/cc3m/Train_GCC-training.tsv", sep='\t', header=0)
# evaluation
# evaluation_df = pd.DataFrame(columns=['Image ID','Image Reward', 'HPS V2.1', 'Aesthetic Score'])#, 'CLIP Similarity'])
evaluation_df = pd.DataFrame(columns=['Image ID', 'Aesthetic Score'])#, 'CLIP Similarity'])
metrics_calculator=MetricsCalculator(device)
mask_root = "../../../SemanticSegmentation/mask_skin"
prev = None
# "gender" selects the evaluation subset: "man"/"woman" (gender-swapped synthetic images),
# "real", "cvpr", "syn" (fully synthetic) or "coco" (COCO counterfactuals)
gender = "real"
root = "PATH_TO_THE_ROOT"
if gender == "man":
img_root = root + "/foreground_syn_men/"
elif gender == "woman":
img_root = root + "/foreground_syn_women/"
elif gender == "real":
img_root = root + "/foreground/"
annotations = cc3m_annotations_full
elif gender == "cvpr":
img_root = root + "/foreground_cvpr_images/"
annotations = pd.read_csv("/home/kis/Downloads/annotations_cvpr.csv")
elif gender == "syn":
img_root = root + "/foreground_fully_synthetic/"
# annotations = pd.read_csv("/home/kis/Downloads/annotations_cvpr.csv")
elif gender == "coco":
img_root = root + "/foreground_coco_counterfactuals/"
annotations = pd.read_json(path_or_buf=root + "/coco_counterfactuals.jsonl", lines=True)
annotations = annotations.set_index("id")
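# iterate over every generated image, look up its caption and score it with each metric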
for fname in os.listdir(img_root):
    image_name = fname.split(".")[0]
    # skip auxiliary files (masks and original source images)
    if "mask" in image_name or "original" in image_name:
        continue
    image_name = fname.split("_")[0]
    print(f"evaluating image {image_name} ...")
    image_id = str(image_name).zfill(9)
    # look up the caption for this image depending on the evaluated subset
    if gender in ["man", "woman"]:
        caption = annotations[(annotations[0] == image_name) & (annotations[1] == gender)][2].item()
    elif gender == "real":
        caption = annotations.iloc[int(image_id), 0]
    elif gender == "cvpr":
        caption = annotations.iloc[int(image_name), 1]
    elif gender == "syn":
        caption = annotations.iloc[int(image_id), 0]
    elif gender == "coco":
        image_id = f'{fname.split("_")[0]}{fname.split("_")[1]}'
        n = int(fname.split(".")[0][-1])  # trailing 0 or 1 distinguishes the two counterfactuals
        caption = annotations.loc[int(image_id)][n]
    image_path = f"{img_root}/{fname}"
    prompt = caption
    try:
        src_image_path = image_path
        src_image = Image.open(src_image_path).resize((512, 512))
        evaluation_result = [str(image_id) + "_" + str(gender)]
    except FileNotFoundError:
        continue
    success = True
    for metric in evaluation_df.columns.values.tolist()[1:]:
        print(f"evaluating metric: {metric}")
        try:
            if metric == 'Image Reward':
                metric_result = metrics_calculator.calculate_image_reward(src_image, prompt)
            elif metric == 'HPS V2.1':
                metric_result = metrics_calculator.calculate_hpsv21_score(src_image, prompt)
            elif metric == 'Aesthetic Score':
                metric_result = metrics_calculator.calculate_aesthetic_score(src_image)
            elif metric == 'CLIP Similarity':
                metric_result = metrics_calculator.calculate_clip_similarity(src_image, prompt)
            evaluation_result.append(metric_result)
        except RuntimeError:
            success = False
            break
    if success:
        evaluation_df.loc[len(evaluation_df.index)] = evaluation_result
        # checkpoint the per-image results after every image
        evaluation_df.to_csv(f"evaluation/evaluation_result_{gender}.csv")
print("The averaged evaluation result:")
averaged_results=evaluation_df.mean(numeric_only=True)
print(averaged_results)
averaged_results.to_csv(f"evaluation/evaluation_result_{gender}.csv")
evaluation_df.to_csv(f"evaluation/evaluation_result_{gender}.csv")
print(f"The generated images and evaluation results is saved in ./evaluation/") |