"""
The file Contains important functions, variables and definitions for the inference
Contains sequence generators, image loader , preprocessing, models and everything else
"""
import pickle
import numpy as np
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.applications.inception_v3 import InceptionV3
import tensorflow as tf
caption_model = load_model('models/caption_model.keras')
max_caption_length = 37  # changed from 34 after reviewing the notebook (a dimension error surfaced on the first run)
cnn_output_dim = 2048
# Preprocessing functions
def preprocess_image(image_path):
    """Load an image, resize it to 299x299, and apply InceptionV3 preprocessing."""
    img = load_img(image_path, target_size=(299, 299))
    img = img_to_array(img)
    img = np.expand_dims(img, axis=0)  # add batch dimension -> (1, 299, 299, 3)
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    return img
def extract_image_features(model, image_path):
    """Run the CNN feature extractor on a single image and return a flat feature vector."""
    img = preprocess_image(image_path)
    features = model.predict(img, verbose=0)
    features = features.flatten()  # (1, 2048) -> (2048,)
    return features
# InceptionV3 CNN feature extractor: drop the final classification layer and use
# the 2048-dim global-average-pooling ('avg_pool') output as the image embedding.
inception_v3_model = InceptionV3(weights='imagenet', input_shape=(299, 299, 3))
inception_v3_model = Model(inputs=inception_v3_model.inputs,
                           outputs=inception_v3_model.layers[-2].output)
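# Quick sanity check (a minimal sketch, assuming the penultimate layer of the full
# InceptionV3 model is its 'avg_pool' global-average-pooling layer): the truncated
# extractor should emit one 2048-dim vector per image, matching cnn_output_dim.
assert inception_v3_model.output_shape == (None, cnn_output_dim)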
# Tokenizer fitted on the training captions
with open('models/preprocessing/tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)
# Sequence Generators
def beam_search_generator(image_features, K_beams=3, log=False):
    """Generate a caption with beam search, keeping the K_beams best partial captions at each step."""
    start = [tokenizer.word_index['start']]
    start_word = [[start, 0.0]]  # list of [token_sequence, score] candidates
    for _ in range(max_caption_length):
        temp = []
        for s in start_word:
            sequence = pad_sequences([s[0]], maxlen=max_caption_length, padding='post').reshape((1, max_caption_length)).astype('float32')  # cast to float32
            preds = caption_model.predict([image_features.reshape(1, cnn_output_dim), sequence], verbose=0)
            word_preds = np.argsort(preds[0])[-K_beams:]  # indices of the K_beams most probable next words
            for w in word_preds:
                next_cap, prob = s[0][:], s[1]
                next_cap.append(w)
                if log:
                    prob += np.log(preds[0][w])  # accumulate log-probabilities (numerically safer)
                else:
                    prob += preds[0][w]  # accumulate raw probabilities
                temp.append([next_cap, prob])
        start_word = temp
        # keep only the K_beams highest-scoring candidates
        start_word = sorted(start_word, key=lambda l: l[1])
        start_word = start_word[-K_beams:]
    start_word = start_word[-1][0]  # best-scoring sequence
    captions_ = [tokenizer.index_word[i] for i in start_word]
    final_caption = []
    for i in captions_:
        if i != 'end':
            final_caption.append(i)
        else:
            break
    final_caption = ' '.join(final_caption[1:])  # drop the leading 'start' token
    return final_caption
def greedy_generator(image_features):
    """Generate a caption greedily, picking the single most probable word at each step."""
    in_text = 'start '
    for _ in range(max_caption_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_caption_length, padding='post').reshape((1, max_caption_length)).astype('float32')  # cast to float32
        prediction = caption_model.predict([image_features.reshape(1, cnn_output_dim), sequence], verbose=0)
        idx = np.argmax(prediction)
        word = tokenizer.index_word[idx]
        in_text += ' ' + word
        if word == 'end':
            break
    # strip the 'start' and 'end' markers before returning
    in_text = in_text.replace('start ', '')
    in_text = in_text.replace(' end', '')
    return in_text
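

# Minimal usage sketch ('sample.jpg' is a hypothetical placeholder path, not shipped
# with the repo; swap in any local RGB image). It extracts CNN features once and then
# decodes a caption with both generators defined above.
if __name__ == '__main__':
    sample_image_path = 'sample.jpg'  # hypothetical example image path
    features = extract_image_features(inception_v3_model, sample_image_path)
    print('Greedy caption:', greedy_generator(features))
    print('Beam caption  :', beam_search_generator(features, K_beams=3, log=True))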