"""
Inference utilities for the image-captioning model.

Contains the image loader and preprocessing helpers, the InceptionV3 feature
extractor, the tokenizer, and the caption sequence generators (beam search
and greedy).

Importing this module loads the caption model, builds the InceptionV3
feature extractor and unpickles the tokenizer.
"""
import pickle

import numpy as np
import tensorflow as tf
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras.preprocessing.sequence import pad_sequences

caption_model = load_model('models/caption_model.keras')

# Changed from 34 after reviewing the notebook and hitting a dimension error
# in the first run.
max_caption_length = 37
cnn_output_dim = 2048


# Preprocessing functions
def preprocess_image(image_path):
    """Load an image file and turn it into a one-item InceptionV3 input batch.

    Args:
        image_path: Path of the image, passed to ``load_img``.

    Returns:
        The image resized to 299x299, converted to an array, given a leading
        batch axis and run through InceptionV3's ``preprocess_input``.
    """
    img = load_img(image_path, target_size=(299, 299))
    batch = np.expand_dims(img_to_array(img), axis=0)
    return tf.keras.applications.inception_v3.preprocess_input(batch)


def extract_image_features(model, image_path):
    """Run ``model`` on the image at ``image_path`` and flatten the output.

    Args:
        model: Object exposing ``predict``; the module-level
            ``inception_v3_model`` is the presumable intended argument.
        image_path: Path of the image to encode.

    Returns:
        1-D array holding the flattened prediction of ``model``.
    """
    features = model.predict(preprocess_image(image_path), verbose=0)
    return features.flatten()


# Inception Model - CNN
inception_v3_model = InceptionV3(weights='imagenet', input_shape=(299, 299, 3))
# NOTE(review): on recent Keras versions ``layers`` appears to return a fresh
# list, which would make this ``pop()`` a no-op - confirm before removing it.
inception_v3_model.layers.pop()
# Expose the output of the second-to-last layer instead of the final layer.
inception_v3_model = Model(inputs=inception_v3_model.inputs,
                           outputs=inception_v3_model.layers[-2].output)

# Tokenizer
# NOTE: pickle.load executes arbitrary code from the file; only ever point
# this at a trusted tokenizer file shipped with the project.
with open('models/preprocessing/tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)


# Sequence Generators
def _predict_next_word_probs(features, token_ids):
    """Return the caption model's scores for the word following ``token_ids``.

    Args:
        features: Image features already reshaped to ``(1, cnn_output_dim)``.
        token_ids: List of token ids generated so far.

    Returns:
        First row of ``caption_model.predict`` for the padded sequence.
    """
    sequence = pad_sequences([token_ids], maxlen=max_caption_length, padding='post')
    # Cast to float32, as the model input is fed floats.
    sequence = sequence.reshape((1, max_caption_length)).astype('float32')
    return caption_model.predict([features, sequence], verbose=0)[0]


def beam_search_generator(image_features, K_beams = 3, log = False):
    """Generate a caption with beam search.

    Args:
        image_features: Array reshapeable to ``(1, cnn_output_dim)``.
        K_beams: Number of top-scoring words expanded per beam and number of
            beams kept after every step.
        log: When true, beam scores accumulate ``log`` of the predicted
            values; otherwise the raw predicted values are summed.

    Returns:
        The words of the best-scoring beam joined by single spaces, without
        the leading ``start`` token and cut off at the first ``end`` token.
    """
    features = image_features.reshape(1, cnn_output_dim)  # loop-invariant
    beams = [([tokenizer.word_index['start']], 0.0)]

    for _ in range(max_caption_length):
        candidates = []
        for tokens, score in beams:
            probs = _predict_next_word_probs(features, tokens)
            # Expand this beam with its K highest-scoring next words.
            for w in np.argsort(probs)[-K_beams:]:
                step = np.log(probs[w]) if log else probs[w]
                candidates.append((tokens + [w], score + step))
        # Ascending sort, so the best K candidates sit at the end.
        beams = sorted(candidates, key=lambda cand: cand[1])[-K_beams:]

    best_tokens = beams[-1][0]

    # Decode lazily and stop at 'end', so ids produced after the end token
    # are never looked up in the vocabulary.
    words = []
    for token in best_tokens:
        word = tokenizer.index_word[token]
        if word == 'end':
            break
        words.append(word)

    return ' '.join(words[1:])  # drop the leading 'start'


def greedy_generator(image_features):
    """Generate a caption by always taking the highest-scoring next word.

    The caption is tracked as a list of words rather than patched up with
    ``str.replace`` afterwards, so words that merely contain ``start`` or
    ``end`` (for example "ends") are left intact and the result has no
    leading space.

    Args:
        image_features: Array reshapeable to ``(1, cnn_output_dim)``.

    Returns:
        The generated words joined by single spaces, without the ``start``
        and ``end`` marker tokens.
    """
    features = image_features.reshape(1, cnn_output_dim)  # loop-invariant
    words = ['start']

    for _ in range(max_caption_length):
        sequence = tokenizer.texts_to_sequences([' '.join(words)])[0]
        probs = _predict_next_word_probs(features, sequence)
        word = tokenizer.index_word.get(int(np.argmax(probs)))
        # Stop on the end marker, or when the predicted index has no word in
        # the vocabulary (e.g. the padding index).
        if word is None or word == 'end':
            break
        words.append(word)

    return ' '.join(words[1:])  # drop the leading 'start'