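"""Evaluate generated images with reference-free quality metrics.

For every image under ``img_root`` the script looks up its caption in the
annotation file selected by ``gender`` and computes the metrics listed in
``evaluation_df`` (by default only the aesthetic score; ImageReward,
HPS v2.1 and CLIP similarity are also provided by ``MetricsCalculator``).
Per-image and averaged results are written to ``./evaluation/``.
"""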
import math
import os
from urllib.request import urlretrieve

import numpy as np
import pandas as pd
import torch
from PIL import Image

import hpsv2
import ImageReward as RM
import open_clip
from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
from torchmetrics.multimodal import CLIPScore
def rle2mask(mask_rle, shape):  # shape: (height, width)
    """Decode a run-length-encoded mask into a binary numpy array."""
    starts, lengths = [np.asarray(x, dtype=int) for x in (mask_rle[0:][::2], mask_rle[1:][::2])]
    starts -= 1
    ends = starts + lengths
    binary_mask = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for lo, hi in zip(starts, ends):
        binary_mask[lo:hi] = 1
    return binary_mask.reshape(shape)
class MetricsCalculator:
    def __init__(self, device, ckpt_path="../../data/ckpt") -> None:
        self.device = device
        # CLIP score (text-image similarity)
        self.clip_metric_calculator = CLIPScore(model_name_or_path="openai/clip-vit-large-patch14").to(device)
        # LPIPS (perceptual similarity)
        self.lpips_metric_calculator = LearnedPerceptualImagePatchSimilarity(net_type='squeeze').to(device)
        # aesthetic model: a linear head on top of CLIP ViT-L/14 image features
        self.aesthetic_model = torch.nn.Linear(768, 1)
        aesthetic_model_url = (
            "https://github.com/LAION-AI/aesthetic-predictor/blob/main/sa_0_4_vit_l_14_linear.pth?raw=true"
        )
        aesthetic_model_ckpt_path = os.path.join(ckpt_path, "sa_0_4_vit_l_14_linear.pth")
        # download the aesthetic head only if it is not already cached
        if not os.path.exists(aesthetic_model_ckpt_path):
            os.makedirs(ckpt_path, exist_ok=True)
            urlretrieve(aesthetic_model_url, aesthetic_model_ckpt_path)
        self.aesthetic_model.load_state_dict(torch.load(aesthetic_model_ckpt_path))
        self.aesthetic_model.eval()
        self.clip_model, _, self.clip_preprocess = open_clip.create_model_and_transforms('ViT-L-14', pretrained='openai')
        # image reward model
        self.imagereward_model = RM.load("ImageReward-v1.0")
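    # per-metric helpers: each takes a PIL image (and a text prompt where relevant)
    # and returns a plain Python float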
    def calculate_image_reward(self, image, prompt):
        reward = self.imagereward_model.score(prompt, [image])
        return reward

    def calculate_hpsv21_score(self, image, prompt):
        result = hpsv2.score(image, prompt, hps_version="v2.1")[0]
        return result.item()

    def calculate_aesthetic_score(self, img):
        image = self.clip_preprocess(img).unsqueeze(0)
        with torch.no_grad():
            image_features = self.clip_model.encode_image(image)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            prediction = self.aesthetic_model(image_features)
        return prediction.cpu().item()

    def calculate_clip_similarity(self, img, txt):
        img = np.array(img)
        img_tensor = torch.tensor(img).permute(2, 0, 1).to(self.device)
        score = self.clip_metric_calculator(img_tensor, txt)
        score = score.cpu().item()
        return score
    def calculate_psnr(self, img_pred, img_gt, mask=None):
        img_pred = np.array(img_pred).astype(np.float32) / 255.
        img_gt = np.array(img_gt).astype(np.float32) / 255.
        assert img_pred.shape == img_gt.shape, "Image shapes should be the same."
        if mask is not None:
            mask = np.array(mask).astype(np.float32)
            img_pred = img_pred * mask
            img_gt = img_gt * mask
        difference = img_pred - img_gt
        difference_square = difference ** 2
        difference_square_sum = difference_square.sum()
        # normalise by the masked area if a mask is given, otherwise by the full image size
        difference_size = mask.sum() if mask is not None else difference.size
        mse = difference_square_sum / difference_size
        if mse < 1.0e-10:
            return 1000
        PIXEL_MAX = 1
        return 20 * math.log10(PIXEL_MAX / math.sqrt(mse))
    def calculate_lpips(self, img_gt, img_pred, mask=None):
        img_pred = np.array(img_pred).astype(np.float32) / 255
        img_gt = np.array(img_gt).astype(np.float32) / 255
        assert img_pred.shape == img_gt.shape, "Image shapes should be the same."
        if mask is not None:
            mask = np.array(mask).astype(np.float32)
            img_pred = img_pred * mask
            img_gt = img_gt * mask
        img_pred_tensor = torch.tensor(img_pred).permute(2, 0, 1).unsqueeze(0).to(self.device)
        img_gt_tensor = torch.tensor(img_gt).permute(2, 0, 1).unsqueeze(0).to(self.device)
        score = self.lpips_metric_calculator(img_pred_tensor * 2 - 1, img_gt_tensor * 2 - 1)
        score = score.cpu().item()
        return score
    def calculate_mse(self, img_pred, img_gt, mask=None):
        img_pred = np.array(img_pred).astype(np.float32) / 255.
        img_gt = np.array(img_gt).astype(np.float32) / 255.
        assert img_pred.shape == img_gt.shape, "Image shapes should be the same."
        if mask is not None:
            mask = np.array(mask).astype(np.float32)
            img_pred = img_pred * mask
            img_gt = img_gt * mask
        difference = img_pred - img_gt
        difference_square = difference ** 2
        difference_square_sum = difference_square.sum()
        # normalise by the masked area if a mask is given, otherwise by the full image size
        difference_size = mask.sum() if mask is not None else difference.size
        mse = difference_square_sum / difference_size
        return float(mse)
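# script configuration: device, dataset paths, annotations and the result dataframe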
device = "cuda" if torch.cuda.is_available() else "cpu"
cc3m_base_dir = "/home/kis/datasets/cc3m_attempt12"
cc3m_full_regen = "/home/kis/Downloads/imgs"
# annotations = pd.read_csv("annotations/annotations_synthetic_100percent.txt", sep='\t', header=None, dtype=str)
cc3m_annotations_full = pd.read_csv("../../../../datasets/cc3m/Train_GCC-training.tsv", sep='\t', header=0)
# evaluation
# evaluation_df = pd.DataFrame(columns=['Image ID','Image Reward', 'HPS V2.1', 'Aesthetic Score'])#, 'CLIP Similarity'])
evaluation_df = pd.DataFrame(columns=['Image ID', 'Aesthetic Score'])#, 'CLIP Similarity'])
metrics_calculator=MetricsCalculator(device)
mask_root = "../../../SemanticSegmentation/mask_skin"
prev = None
# "gender" selects the evaluation subset: "man"/"woman" (gender-swapped synthetic images),
# "real", "cvpr", "syn" (fully synthetic) or "coco" (COCO counterfactuals)
gender = "real"
root = "PATH_TO_THE_ROOT"
if gender == "man":
img_root = root + "/foreground_syn_men/"
elif gender == "woman":
img_root = root + "/foreground_syn_women/"
elif gender == "real":
img_root = root + "/foreground/"
annotations = cc3m_annotations_full
elif gender == "cvpr":
img_root = root + "/foreground_cvpr_images/"
annotations = pd.read_csv("/home/kis/Downloads/annotations_cvpr.csv")
elif gender == "syn":
img_root = root + "/foreground_fully_synthetic/"
# annotations = pd.read_csv("/home/kis/Downloads/annotations_cvpr.csv")
elif gender == "coco":
img_root = root + "/foreground_coco_counterfactuals/"
annotations = pd.read_json(path_or_buf=root + "/coco_counterfactuals.jsonl", lines=True)
annotations = annotations.set_index("id")
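# iterate over every generated image, look up its caption and score it with each metric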
for fname in os.listdir(img_root):
    image_name = fname.split(".")[0]
    # skip auxiliary files (masks and original source images)
    if "mask" in image_name or "original" in image_name:
        continue
    image_name = fname.split("_")[0]
    print(f"evaluating image {image_name} ...")
    image_id = str(image_name).zfill(9)
    # look up the caption for this image depending on the evaluated subset
    if gender in ["man", "woman"]:
        caption = annotations[(annotations[0] == image_name) & (annotations[1] == gender)][2].item()
    elif gender == "real":
        caption = annotations.iloc[int(image_id), 0]
    elif gender == "cvpr":
        caption = annotations.iloc[int(image_name), 1]
    elif gender == "syn":
        caption = annotations.iloc[int(image_id), 0]
    elif gender == "coco":
        image_id = f'{fname.split("_")[0]}{fname.split("_")[1]}'
        n = int(fname.split(".")[0][-1])  # trailing 0 or 1 distinguishes the two counterfactuals
        caption = annotations.loc[int(image_id)][n]
    image_path = f"{img_root}/{fname}"
    prompt = caption
    try:
        src_image_path = image_path
        src_image = Image.open(src_image_path).resize((512, 512))
        evaluation_result = [str(image_id) + "_" + str(gender)]
    except FileNotFoundError:
        continue
    success = True
    for metric in evaluation_df.columns.values.tolist()[1:]:
        print(f"evaluating metric: {metric}")
        try:
            if metric == 'Image Reward':
                metric_result = metrics_calculator.calculate_image_reward(src_image, prompt)
            elif metric == 'HPS V2.1':
                metric_result = metrics_calculator.calculate_hpsv21_score(src_image, prompt)
            elif metric == 'Aesthetic Score':
                metric_result = metrics_calculator.calculate_aesthetic_score(src_image)
            elif metric == 'CLIP Similarity':
                metric_result = metrics_calculator.calculate_clip_similarity(src_image, prompt)
            evaluation_result.append(metric_result)
        except RuntimeError:
            success = False
            break
    if success:
        evaluation_df.loc[len(evaluation_df.index)] = evaluation_result
        # checkpoint the per-image results after every image
        evaluation_df.to_csv(f"evaluation/evaluation_result_{gender}.csv")
print("The averaged evaluation result:")
averaged_results=evaluation_df.mean(numeric_only=True)
print(averaged_results)
averaged_results.to_csv(f"evaluation/evaluation_result_{gender}.csv")
evaluation_df.to_csv(f"evaluation/evaluation_result_{gender}.csv")
print(f"The generated images and evaluation results is saved in ./evaluation/") |