"""Evaluate generated images with image-quality and image-text metrics.

MetricsCalculator wraps ImageReward, HPS v2.1, the LAION aesthetic predictor,
CLIP similarity, and masked PSNR / LPIPS / MSE. The script below scores every
image under `img_root` and writes the results to ./evaluation/.
"""
import argparse
import json
import math
import os
from urllib.request import urlretrieve

import cv2
import hpsv2
import ImageReward as RM
import numpy as np
import open_clip
import pandas as pd
import torch
import torch.nn.functional as F
from diffusers import StableDiffusionBrushNetPipeline, BrushNetModel, UniPCMultistepScheduler
from PIL import Image
from torchmetrics.image import PeakSignalNoiseRatio, StructuralSimilarityIndexMeasure
from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
from torchmetrics.multimodal import CLIPScore
from torchmetrics.regression import MeanSquaredError
from torchvision import transforms
from torchvision.transforms import Resize
from transformers import AutoProcessor, AutoModel


def rle2mask(mask_rle, shape):
    """Decode a run-length encoding (alternating 1-based starts and run lengths)
    into a binary mask of the given (height, width) shape."""
    starts, lengths = [np.asarray(x, dtype=int) for x in (mask_rle[0:][::2], mask_rle[1:][::2])]
    starts -= 1
    ends = starts + lengths
    binary_mask = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for lo, hi in zip(starts, ends):
        binary_mask[lo:hi] = 1
    return binary_mask.reshape(shape)


class MetricsCalculator:
    def __init__(self, device, ckpt_path="../../data/ckpt") -> None:
        self.device = device
        # CLIP text-image similarity
        self.clip_metric_calculator = CLIPScore(model_name_or_path="openai/clip-vit-large-patch14").to(device)
        # LPIPS perceptual distance
        self.lpips_metric_calculator = LearnedPerceptualImagePatchSimilarity(net_type='squeeze').to(device)
        # LAION aesthetic predictor: a linear head on top of CLIP ViT-L/14 image features
        self.aesthetic_model = torch.nn.Linear(768, 1)
        aesthetic_model_url = (
            "https://github.com/LAION-AI/aesthetic-predictor/blob/main/sa_0_4_vit_l_14_linear.pth?raw=true"
        )
        os.makedirs(ckpt_path, exist_ok=True)
        aesthetic_model_ckpt_path = os.path.join(ckpt_path, "sa_0_4_vit_l_14_linear.pth")
        if not os.path.exists(aesthetic_model_ckpt_path):
            urlretrieve(aesthetic_model_url, aesthetic_model_ckpt_path)
        self.aesthetic_model.load_state_dict(torch.load(aesthetic_model_ckpt_path))
        self.aesthetic_model.eval()
        self.clip_model, _, self.clip_preprocess = open_clip.create_model_and_transforms('ViT-L-14', pretrained='openai')
        # ImageReward model
        self.imagereward_model = RM.load("ImageReward-v1.0")

    def calculate_image_reward(self, image, prompt):
        reward = self.imagereward_model.score(prompt, [image])
        return reward

    def calculate_hpsv21_score(self, image, prompt):
        result = hpsv2.score(image, prompt, hps_version="v2.1")[0]
        return result.item()

    def calculate_aesthetic_score(self, img):
        image = self.clip_preprocess(img).unsqueeze(0)
        with torch.no_grad():
            image_features = self.clip_model.encode_image(image)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        prediction = self.aesthetic_model(image_features)
        return prediction.cpu().item()

    def calculate_clip_similarity(self, img, txt):
        img = np.array(img)
        img_tensor = torch.tensor(img).permute(2, 0, 1).to(self.device)
        score = self.clip_metric_calculator(img_tensor, txt)
        return score.cpu().item()

    def calculate_psnr(self, img_pred, img_gt, mask=None):
        img_pred = np.array(img_pred).astype(np.float32) / 255.
        img_gt = np.array(img_gt).astype(np.float32) / 255.
        assert img_pred.shape == img_gt.shape, "Image shapes should be the same."
        if mask is not None:
            mask = np.array(mask).astype(np.float32)
            img_pred = img_pred * mask
            img_gt = img_gt * mask

        difference = img_pred - img_gt
        difference_square = difference ** 2
        difference_square_sum = difference_square.sum()
        # Average over the masked area when a mask is given, otherwise over all pixels.
        difference_size = mask.sum() if mask is not None else difference.size
        mse = difference_square_sum / difference_size
        if mse < 1.0e-10:
            return 1000
        PIXEL_MAX = 1
        return 20 * math.log10(PIXEL_MAX / math.sqrt(mse))

    def calculate_lpips(self, img_gt, img_pred, mask=None):
        img_pred = np.array(img_pred).astype(np.float32) / 255.
        img_gt = np.array(img_gt).astype(np.float32) / 255.
        assert img_pred.shape == img_gt.shape, "Image shapes should be the same."

        if mask is not None:
            mask = np.array(mask).astype(np.float32)
            img_pred = img_pred * mask
            img_gt = img_gt * mask

        img_pred_tensor = torch.tensor(img_pred).permute(2, 0, 1).unsqueeze(0).to(self.device)
        img_gt_tensor = torch.tensor(img_gt).permute(2, 0, 1).unsqueeze(0).to(self.device)
        # LPIPS expects inputs scaled to [-1, 1]
        score = self.lpips_metric_calculator(img_pred_tensor * 2 - 1, img_gt_tensor * 2 - 1)
        return score.cpu().item()

    def calculate_mse(self, img_pred, img_gt, mask=None):
        img_pred = np.array(img_pred).astype(np.float32) / 255.
        img_gt = np.array(img_gt).astype(np.float32) / 255.
        assert img_pred.shape == img_gt.shape, "Image shapes should be the same."

        if mask is not None:
            mask = np.array(mask).astype(np.float32)
            img_pred = img_pred * mask
            img_gt = img_gt * mask

        difference = img_pred - img_gt
        difference_square = difference ** 2
        difference_square_sum = difference_square.sum()
        # Average over the masked area when a mask is given, otherwise over all pixels.
        difference_size = mask.sum() if mask is not None else difference.size
        mse = difference_square_sum / difference_size
        return mse.item()


device = "cuda" if torch.cuda.is_available() else "cpu"

cc3m_base_dir = "/home/kis/datasets/cc3m_attempt12"
cc3m_full_regen = "/home/kis/Downloads/imgs"
# annotations = pd.read_csv("annotations/annotations_synthetic_100percent.txt", sep='\t', header=None, dtype=str)
cc3m_annotations_full = pd.read_csv("../../../../datasets/cc3m/Train_GCC-training.tsv", sep='\t', header=0)

# evaluation
# evaluation_df = pd.DataFrame(columns=['Image ID', 'Image Reward', 'HPS V2.1', 'Aesthetic Score'])  # , 'CLIP Similarity'
evaluation_df = pd.DataFrame(columns=['Image ID', 'Aesthetic Score'])  # , 'CLIP Similarity'

metrics_calculator = MetricsCalculator(device)

mask_root = "../../../SemanticSegmentation/mask_skin"
prev = None

# Which image set to evaluate; "real" scores the real foregrounds against the CC3M captions.
gender = "real"
root = "PATH_TO_THE_ROOT"

if gender == "man":
    img_root = root + "/foreground_syn_men/"
elif gender == "woman":
    img_root = root + "/foreground_syn_women/"
elif gender == "real":
    img_root = root + "/foreground/"
    annotations = cc3m_annotations_full
elif gender == "cvpr":
    img_root = root + "/foreground_cvpr_images/"
    annotations = pd.read_csv("/home/kis/Downloads/annotations_cvpr.csv")
elif gender == "syn":
    img_root = root + "/foreground_fully_synthetic/"
    # annotations = pd.read_csv("/home/kis/Downloads/annotations_cvpr.csv")
elif gender == "coco":
    img_root = root + "/foreground_coco_counterfactuals/"
    annotations = pd.read_json(path_or_buf=root + "/coco_counterfactuals.jsonl", lines=True)
    annotations = annotations.set_index("id")

for fname in os.listdir(img_root):
    image_name = fname.split(".")[0]
    if "mask" in image_name or "original" in image_name:
        continue
    image_name = fname.split("_")[0]
    print(f"evaluating image {image_name} ...")
    image_id = str(image_name).zfill(9)

    if gender in ["man", "woman"]:
        caption = annotations[(annotations[0] == image_name) & (annotations[1] == gender)][2].item()
    elif gender == "real":
        caption = annotations.iloc[int(image_id), 0]
    elif gender == "cvpr":
        caption = annotations.iloc[int(image_name), 1]
    elif gender == "syn":
        caption = annotations.iloc[int(image_id), 0]
    elif gender == "coco":
        image_id = f'{fname.split("_")[0]}{fname.split("_")[1]}'
        n = int(fname.split(".")[0][-1])  # 0 or 1 at the end of the filename - the two counterfactual captions
        caption = annotations.loc[int(image_id)][n]

    image_path = f"{img_root}/{fname}"
    prompt = caption

    try:
        src_image_path = image_path
        src_image = Image.open(src_image_path).resize((512, 512))
        evaluation_result = [str(image_id) + "_" + str(gender)]
    except FileNotFoundError:
        continue

    success = True
    for metric in evaluation_df.columns.values.tolist()[1:]:
        print(f"evaluating metric: {metric}")
        try:
            if metric == 'Image Reward':
                metric_result = metrics_calculator.calculate_image_reward(src_image, prompt)
            elif metric == 'HPS V2.1':
                metric_result = metrics_calculator.calculate_hpsv21_score(src_image, prompt)
            elif metric == 'Aesthetic Score':
                metric_result = metrics_calculator.calculate_aesthetic_score(src_image)
            elif metric == 'CLIP Similarity':
                metric_result = metrics_calculator.calculate_clip_similarity(src_image, prompt)
            evaluation_result.append(metric_result)
        except RuntimeError:
            success = False
            break

    if success:
        evaluation_df.loc[len(evaluation_df.index)] = evaluation_result
        evaluation_df.to_csv(f"evaluation/evaluation_result_{gender}.csv")

print("The averaged evaluation result:")
averaged_results = evaluation_df.mean(numeric_only=True)
print(averaged_results)
# Write the averaged scores and the per-image table to separate files so the
# mean is not overwritten by the full table.
averaged_results.to_csv(f"evaluation/evaluation_result_{gender}_mean.csv")
evaluation_df.to_csv(f"evaluation/evaluation_result_{gender}.csv")

print("The generated images and evaluation results are saved in ./evaluation/")
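
# Illustrative sketch (not part of the original evaluation loop): how the masked
# reference metrics and rle2mask could be exercised on a single prediction /
# ground-truth pair. "pred.png" and "gt.png" are placeholder paths, not files
# produced by this project; uncomment and point them at real images to try it.
#
# example_pred = Image.open("pred.png").convert("RGB").resize((512, 512))
# example_gt = Image.open("gt.png").convert("RGB").resize((512, 512))
# # A full-image mask encoded as alternating 1-based starts and run lengths.
# example_mask = rle2mask([1, 512 * 512], (512, 512)).astype(np.float32)[:, :, np.newaxis]
# print("PSNR :", metrics_calculator.calculate_psnr(example_pred, example_gt, example_mask))
# print("MSE  :", metrics_calculator.calculate_mse(example_pred, example_gt, example_mask))
# print("LPIPS:", metrics_calculator.calculate_lpips(example_gt, example_pred, example_mask))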