import argparse
import json

import numpy as np
import torch
from transformers import AutoTokenizer, CLIPTextModelWithProjection

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model',
        type=str,
        default='./pretrained_models/clip-vit-base-patch32-projection')
    parser.add_argument('--text',
                        type=str,
                        default='data/captions/coco_class_captions.json')
    parser.add_argument('--out', type=str, default='output.npy')
    args = parser.parse_args()

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = CLIPTextModelWithProjection.from_pretrained(args.model)

    # Each entry in the captions JSON is expected to be a list (or tuple)
    # whose first element is the caption string.
    with open(args.text) as f:
        data = json.load(f)
    captions = [x[0] for x in data]

    # Fall back to CPU when no GPU is available instead of hard-coding CUDA.
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    model.eval()

    inputs = tokenizer(text=captions, return_tensors='pt', padding=True)
    inputs = inputs.to(device)

    # Encode the captions; no_grad avoids building an autograd graph
    # during pure inference.
    with torch.no_grad():
        text_outputs = model(**inputs)

    # L2-normalize the projected text embeddings and flatten to (N, D).
    txt_feats = text_outputs.text_embeds
    txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
    txt_feats = txt_feats.reshape(-1, txt_feats.shape[-1])

    np.save(args.out, txt_feats.cpu().numpy())
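
# Usage sketch (the script name below is illustrative; adjust paths to
# your setup):
#
#   python build_text_embeddings.py \
#       --model ./pretrained_models/clip-vit-base-patch32-projection \
#       --text data/captions/coco_class_captions.json \
#       --out output.npy
#
# The saved array can then be loaded for downstream use, e.g.:
#
#   feats = np.load('output.npy')  # shape: (num_captions, embed_dim)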