|
import os |
|
import tqdm |
|
import argparse |
|
import os.path as osp |
|
import numpy as np |
|
from PIL import Image |
|
from transformers import (AutoTokenizer, AutoProcessor, |
|
CLIPVisionModelWithProjection, |
|
CLIPTextModelWithProjection) |
|
|
|
if __name__ == "__main__": |
|
|
|
parser = argparse.ArgumentParser() |
|
parser.add_argument( |
|
'--model', |
|
type=str, |
|
default='../pretrained_models/open-ai-clip-vit-base-patch32') |
|
parser.add_argument('--image-dir', type=str, default='data/samples.txt') |
|
parser.add_argument('--out-dir', type=str, default='') |
|
parser.add_argument('--out-file', type=str) |
|
|
|
args = parser.parse_args() |
|
|
|
tokenizer = AutoTokenizer.from_pretrained(args.model) |
|
vision_model = CLIPVisionModelWithProjection.from_pretrained(args.model) |
|
text_model = CLIPTextModelWithProjection.from_pretrained(args.model) |
|
processor = AutoProcessor.from_pretrained(args.model) |
|
|
|
|
|
device = 'cuda:0' |
|
text_model.to(device) |
|
texts = tokenizer(text=[' '], return_tensors='pt', padding=True) |
|
texts = texts.to(device) |
|
text_outputs = text_model(**texts) |
|
txt_feats = text_outputs.text_embeds |
|
txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True) |
|
txt_feats = txt_feats.reshape(-1, txt_feats.shape[-1]).cpu().data.numpy() |
|
|
|
images = os.listdir(args.image_dir) |
|
category_embeds = [] |
|
|
|
def _forward_vision_model(image_name): |
|
image_path = osp.join(args.image_dir, image_name) |
|
|
|
image = Image.open(image_path).convert("RGB") |
|
inputs = processor(images=image, return_tensors="pt", padding=True) |
|
image_outputs = vision_model(**inputs) |
|
img_feats = image_outputs.image_embeds |
|
|
|
img_feats = img_feats / img_feats.norm(p=2, dim=-1, keepdim=True) |
|
img_feats = img_feats.reshape( |
|
-1, img_feats.shape[-1])[0].cpu().data.numpy() |
|
category_embeds.append(img_feats) |
|
|
|
for image_ in tqdm.tqdm(images): |
|
_forward_vision_model(image_) |
|
category_embeds.append(txt_feats) |
|
category_embeds = np.stack(category_embeds) |
|
np.save(osp.join(args.out_dir, args.out_file), category_embeds) |
|
|