"""
The file Contains important functions, variables and definitions for the inference
Contains sequence generators, image loader , preprocessing, models and everything else
"""
import pickle
import numpy as np
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.applications.inception_v3 import InceptionV3
import tensorflow as tf
caption_model = load_model('models/caption_model.keras')
max_caption_length = 37  # changed from 34 after reviewing the notebook (a dimension error surfaced on the first run)
cnn_output_dim = 2048
# Preprocessing functions
def preprocess_image(image_path):
    """Load an image, resize it to 299x299, and apply InceptionV3 preprocessing."""
    img = load_img(image_path, target_size=(299, 299))
    img = img_to_array(img)
    img = np.expand_dims(img, axis=0)  # add batch dimension -> (1, 299, 299, 3)
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    return img
def extract_image_features(model, image_path):
    """Run the CNN feature extractor on a single image and return a flat feature vector."""
    img = preprocess_image(image_path)
    features = model.predict(img, verbose=0)
    features = features.flatten()  # (1, 2048) -> (2048,)
    return features
# InceptionV3 CNN feature extractor: drop the final classification layer and use
# the 2048-dim global-average-pooling ('avg_pool') output as the image embedding.
inception_v3_model = InceptionV3(weights='imagenet', input_shape=(299, 299, 3))
inception_v3_model = Model(inputs=inception_v3_model.inputs,
                           outputs=inception_v3_model.layers[-2].output)
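# Quick sanity check (a minimal sketch, assuming the penultimate layer of the full
# InceptionV3 model is its 'avg_pool' global-average-pooling layer): the truncated
# extractor should emit one 2048-dim vector per image, matching cnn_output_dim.
assert inception_v3_model.output_shape == (None, cnn_output_dim)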
# Tokenizer fitted on the training captions
with open('models/preprocessing/tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)
# Sequence Generators
def beam_search_generator(image_features, K_beams=3, log=False):
    """Generate a caption with beam search, keeping the K_beams best partial captions at each step."""
    start = [tokenizer.word_index['start']]
    start_word = [[start, 0.0]]  # list of [token_sequence, score] candidates
    for _ in range(max_caption_length):
        temp = []
        for s in start_word:
            sequence = pad_sequences([s[0]], maxlen=max_caption_length, padding='post').reshape((1, max_caption_length)).astype('float32')  # cast to float32
            preds = caption_model.predict([image_features.reshape(1, cnn_output_dim), sequence], verbose=0)
            word_preds = np.argsort(preds[0])[-K_beams:]  # indices of the K_beams most probable next words
            for w in word_preds:
                next_cap, prob = s[0][:], s[1]
                next_cap.append(w)
                if log:
                    prob += np.log(preds[0][w])  # accumulate log-probabilities (numerically safer)
                else:
                    prob += preds[0][w]  # accumulate raw probabilities
                temp.append([next_cap, prob])
        start_word = temp
        # keep only the K_beams highest-scoring candidates
        start_word = sorted(start_word, key=lambda l: l[1])
        start_word = start_word[-K_beams:]
    start_word = start_word[-1][0]  # best-scoring sequence
    captions_ = [tokenizer.index_word[i] for i in start_word]
    final_caption = []
    for i in captions_:
        if i != 'end':
            final_caption.append(i)
        else:
            break
    final_caption = ' '.join(final_caption[1:])  # drop the leading 'start' token
    return final_caption
def greedy_generator(image_features):
    """Generate a caption greedily, picking the single most probable word at each step."""
    in_text = 'start '
    for _ in range(max_caption_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_caption_length, padding='post').reshape((1, max_caption_length)).astype('float32')  # cast to float32
        prediction = caption_model.predict([image_features.reshape(1, cnn_output_dim), sequence], verbose=0)
        idx = np.argmax(prediction)
        word = tokenizer.index_word[idx]
        in_text += ' ' + word
        if word == 'end':
            break
    # strip the 'start' and 'end' markers before returning
    in_text = in_text.replace('start ', '')
    in_text = in_text.replace(' end', '')
    return in_text
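

# Minimal usage sketch ('sample.jpg' is a hypothetical placeholder path, not shipped
# with the repo; swap in any local RGB image). It extracts CNN features once and then
# decodes a caption with both generators defined above.
if __name__ == '__main__':
    sample_image_path = 'sample.jpg'  # hypothetical example image path
    features = extract_image_features(inception_v3_model, sample_image_path)
    print('Greedy caption:', greedy_generator(features))
    print('Beam caption  :', beam_search_generator(features, K_beams=3, log=True))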