|
""" |
|
This file contains the important functions, variables, and definitions used for inference.

It contains the sequence generators, image loader, preprocessing, models, and everything else.
|
""" |
|
|
|
import pickle |
|
import numpy as np |
|
from tensorflow.keras.preprocessing.image import img_to_array, load_img |
|
from tensorflow.keras.preprocessing.sequence import pad_sequences |
|
from tensorflow.keras.models import Model, load_model |
|
from tensorflow.keras.applications.inception_v3 import InceptionV3 |
|
import tensorflow as tf |
|
|
|
|
|
# Sequence length captions are padded to before being fed to the caption
# model; the generators in this file also use it as their step limit.
max_caption_length = 37

# Number of values in the image feature vector given to the caption model.
cnn_output_dim = 2048

# Trained captioning network, loaded once when this module is imported.
caption_model = load_model('models/caption_model.keras')
|
|
|
|
|
def preprocess_image(image_path):
    """Load an image from disk and prepare it as a one-image InceptionV3 batch.

    Args:
        image_path: Location of the image file to read.

    Returns:
        The image resized to 299x299, converted to an array, given a leading
        batch axis, and passed through InceptionV3's ``preprocess_input``.
    """
    pil_image = load_img(image_path, target_size=(299, 299))
    pixels = img_to_array(pil_image)
    # The network expects a batch, so wrap the single image in a new axis 0.
    batch = np.expand_dims(pixels, axis=0)
    return tf.keras.applications.inception_v3.preprocess_input(batch)
|
|
|
def extract_image_features(model, image_path):
    """Run ``model`` on one image file and return its output as a flat vector.

    Args:
        model: Object exposing ``predict``; it is called on the batch built by
            ``preprocess_image``.
        image_path: Location of the image file to read.

    Returns:
        The prediction for the image, flattened to one dimension.
    """
    batch = preprocess_image(image_path)
    prediction = model.predict(batch, verbose=0)
    return prediction.flatten()
|
|
|
|
|
# Image feature extractor: an ImageNet-pretrained InceptionV3 that is re-wired
# below to output the activations of ``layers[-2]``.
inception_v3_model = InceptionV3(
    weights='imagenet',
    input_shape=(299, 299, 3),
)
# NOTE(review): in current Keras releases ``Model.layers`` appears to return a
# fresh list on every access, which would make this pop a no-op -- confirm
# before removing or relying on it.
inception_v3_model.layers.pop()
inception_v3_model = Model(
    inputs=inception_v3_model.inputs,
    outputs=inception_v3_model.layers[-2].output,
)
|
|
|
|
|
# Restore the fitted tokenizer that maps caption words to integer ids and back.
# NOTE: unpickling can execute arbitrary code, so this file must come from a
# trusted source.
with open('models/preprocessing/tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)
|
|
|
|
|
|
|
def beam_search_generator(image_features, K_beams=3, log=False):
    """Generate a caption for one image with beam search over ``caption_model``.

    Args:
        image_features: Array holding ``cnn_output_dim`` values for one image;
            it is reshaped to ``(1, cnn_output_dim)`` before being fed to the
            model.
        K_beams: Number of candidate sequences kept after every step, and the
            number of top-scoring next tokens tried for each candidate. Must
            be at least 1.
        log: When true, a candidate's score is the sum of the logarithms of
            the predicted values of its tokens; otherwise the predicted values
            themselves are summed.

    Returns:
        The words of the best-scoring candidate joined by single spaces,
        without the leading ``start`` token and cut at the first ``end`` token.

    Raises:
        ValueError: If ``K_beams`` is smaller than 1.
    """
    if K_beams < 1:
        # A non-positive value turns the ``[-K_beams:]`` slices below into
        # "keep (almost) everything", so the beam would grow without bound.
        raise ValueError(f'K_beams must be at least 1, got {K_beams}')

    # Loop-invariant model input, computed once instead of on every step.
    features = image_features.reshape(1, cnn_output_dim)
    end_id = tokenizer.word_index.get('end')

    # Each beam is [token ids so far, accumulated score].
    beams = [[[tokenizer.word_index['start']], 0.0]]
    for _ in range(max_caption_length):
        # Everything after the first 'end' is thrown away when the caption is
        # assembled, and every later candidate extends one of these beams.
        # Once all of them contain 'end' the result is already one of their
        # prefixes, so further steps would only spend predictions and let
        # discarded tokens influence the ranking.
        if end_id is not None and all(end_id in tokens for tokens, _ in beams):
            break

        candidates = []
        for tokens, score in beams:
            sequence = pad_sequences(
                [tokens], maxlen=max_caption_length, padding='post'
            ).reshape((1, max_caption_length)).astype('float32')
            preds = caption_model.predict([features, sequence], verbose=0)[0]
            # argsort is ascending, so the tail holds the K best next tokens.
            for token_id in np.argsort(preds)[-K_beams:]:
                gain = np.log(preds[token_id]) if log else preds[token_id]
                candidates.append([tokens + [int(token_id)], score + gain])

        # Ascending sort keeps the best candidate last, as the final pick expects.
        beams = sorted(candidates, key=lambda cand: cand[1])[-K_beams:]

    best_tokens = beams[-1][0]

    words = []
    # Skip position 0, which is always the 'start' token seeded above.
    for token_id in best_tokens[1:]:
        word = tokenizer.index_word.get(token_id)
        if word == 'end':
            break
        # Ids without a word are skipped rather than raising KeyError.
        # NOTE(review): presumably only the padding id 0 can be unmapped,
        # assuming a Keras Tokenizer -- confirm.
        if word is not None:
            words.append(word)

    return ' '.join(words)
|
|
|
def greedy_generator(image_features):
    """Generate a caption by repeatedly taking the model's top-scoring word.

    Args:
        image_features: Array holding ``cnn_output_dim`` values for one image;
            it is reshaped to ``(1, cnn_output_dim)`` before being fed to the
            model.

    Returns:
        The generated words joined by single spaces. The ``start`` and ``end``
        markers are not included and there is no leading or trailing
        whitespace.
    """
    # Loop-invariant model input, computed once instead of on every step.
    features = image_features.reshape(1, cnn_output_dim)

    # Text fed back to the tokenizer on every step; built exactly as before so
    # the model sees the same input sequences.
    in_text = 'start '
    # The caption is collected as a word list. Removing the markers afterwards
    # with substring replacement also cut pieces out of real words
    # (" ending" -> "ing", "restart " -> "re") and left a leading space.
    words = []

    for _ in range(max_caption_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences(
            [sequence], maxlen=max_caption_length, padding='post'
        ).reshape((1, max_caption_length)).astype('float32')
        prediction = caption_model.predict([features, sequence], verbose=0)

        word = tokenizer.index_word.get(int(np.argmax(prediction)))
        # Stop at the end marker. Also stop on an id with no word instead of
        # raising KeyError.
        # NOTE(review): presumably only the padding id 0 can be unmapped,
        # assuming a Keras Tokenizer -- confirm.
        if word is None or word == 'end':
            break

        words.append(word)
        in_text += ' ' + word

    return ' '.join(words)