from torchvision import transforms
from PIL import Image


class MinMaxResize:
    def __init__(self, shorter=800, longer=1333):
        self.min = shorter
        self.max = longer

    def __call__(self, x):
        w, h = x.size
        # Scale so that the shorter side becomes self.min.
        scale = self.min / min(w, h)
        if h < w:
            newh, neww = self.min, scale * w
        else:
            newh, neww = scale * h, self.min

        # If the longer side would exceed self.max, rescale to fit it instead.
        if max(newh, neww) > self.max:
            scale = self.max / max(newh, neww)
            newh = newh * scale
            neww = neww * scale

        # Round to the nearest integer, then floor to a multiple of 32.
        newh, neww = int(newh + 0.5), int(neww + 0.5)
        newh, neww = newh // 32 * 32, neww // 32 * 32

        return x.resize((neww, newh), resample=Image.BICUBIC)
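
# Usage sketch (the sizes below are illustrative, not from the original repo):
# a 600x400 image is scaled so its shorter side reaches 800 (-> 1200x800);
# the longer side, 1200, is within 1333, and both sides are then floored to
# multiples of 32:
#
#     resize = MinMaxResize(shorter=800, longer=1333)
#     out = resize(Image.new("RGB", (600, 400)))
#     out.size  # -> (1184, 800)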


class UnNormalize(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): Normalized tensor image of size (C, H, W).
        Returns:
            Tensor: The un-normalized image (modified in place).
        """
        for t, m, s in zip(tensor, self.mean, self.std):
            # Invert transforms.Normalize, which computes t.sub_(m).div_(s).
            t.mul_(s).add_(m)
        return tensor
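
# Usage sketch (the mean/std values mirror the Inception stats defined below;
# any stats matching the earlier Normalize would do). UnNormalize mutates its
# input in place, e.g. to recover displayable pixel values before visualizing:
#
#     import torch
#     unnorm = UnNormalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
#     img = torch.zeros(3, 224, 224)   # a normalized image
#     unnorm(img)                      # values are now back in [0, 1]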


# This is the simple maximum-entropy normalization used in the Inception paper.
inception_normalize = transforms.Compose(
    [transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])]
)

# ViT uses this same simple, unbiased Inception normalization:
# https://github.com/google-research/vision_transformer/blob/master/vit_jax/input_pipeline.py#L132
inception_unnormalize = transforms.Compose(
    [UnNormalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])]
)

# CLIP's standard image-normalization statistics.
cn_clip_normalize = transforms.Compose(
    [transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])]
)
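

# A minimal round-trip sanity check (a sketch, not part of the original
# pipeline): normalizing and then un-normalizing with the Inception stats
# should recover the input up to floating-point error. Note that
# inception_unnormalize mutates the tensor it is given, so we pass a clone.
if __name__ == "__main__":
    import torch

    x = torch.rand(3, 224, 224)  # an image tensor with values in [0, 1]
    y = inception_unnormalize(inception_normalize(x.clone()))
    assert torch.allclose(x, y, atol=1e-6)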