Spaces:
Sleeping
Sleeping
import gradio as gr | |
from PIL import Image | |
import numpy as np | |
import tensorflow as tf | |
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input | |
from tensorflow.keras.preprocessing.image import img_to_array | |
from tensorflow.keras.preprocessing.sequence import pad_sequences | |
import pickle | |
import os | |
# Check if required files exist
def check_required_files():
    """Report whether the model and tokenizer files are present.

    Looks for ``caption_model.h5`` and ``tokenizer.pkl`` relative to the
    current working directory.  Each file that is found is printed with
    its size in bytes; the names of any absent files are printed together.

    Returns:
        bool: True when both files exist, False otherwise.
    """
    absent = []
    for path in ("caption_model.h5", "tokenizer.pkl"):
        if os.path.exists(path):
            print(f"β Found {path} ({os.path.getsize(path)} bytes)")
            continue
        absent.append(path)

    if not absent:
        return True
    print(f"β Missing files: {absent}")
    return False
# Probe for the model artefacts once, at import time; the result decides
# whether the loading code further down is attempted at all.
print("Checking required files...")
files_exist = check_required_files()
# Custom function to handle attention mechanism
def attention_function(inputs):
    """Apply attention weights to a sequence of feature vectors.

    Args:
        inputs: A pair ``(attention_weights, features)``.  The shapes this
            was written for are ``(None, 34, 34)`` for the weights and
            ``(None, 34, 512)`` for the features.

    Returns:
        The attended features.  When the weights have the same rank as the
        features, this is the batched matrix product ``weights @ features``
        (``(None, 34, 512)`` for the shapes above).  When the weights have
        a lower rank, they are given a trailing axis and multiplied
        element-wise with broadcasting.
    """
    attention_weights, features = inputs
    if len(attention_weights.shape) == len(features.shape):
        # Square (position, position) weights: adding a trailing axis and
        # multiplying would broadcast to a rank-4 tensor rather than the
        # feature shape, so contract over the shared axis instead.
        return tf.matmul(attention_weights, features)
    # Per-position weights: add a trailing axis so they broadcast over the
    # feature dimension.
    attention_weights = tf.expand_dims(attention_weights, axis=-1)
    return attention_weights * features
def attention_output_shape(input_shapes):
    """Report the output shape of the attention step.

    The output takes the shape of the second input (the features),
    e.g. ``(None, 34, 512)``.

    Args:
        input_shapes: Indexable collection of input shapes; the entry at
            index 1 is the feature shape.

    Returns:
        The entry at index 1 of ``input_shapes``, unchanged.
    """
    feature_shape = input_shapes[1]
    return feature_shape
# Alternative attention functions to try
def attention_function_v2(inputs):
    """Alternative attention mechanism - softmax-normalised weighted sum.

    Args:
        inputs: A pair ``(attention_weights, features)``.

    Returns:
        The attended features.  The weights are first normalised with a
        softmax over their last axis.  When they have the same rank as the
        features, the result is the batched matrix product
        ``softmax(weights) @ features`` (a weighted sum of the feature
        vectors).  When they have a lower rank, they are given a trailing
        axis and multiplied element-wise with broadcasting.
    """
    attention_weights, features = inputs
    # Normalize attention weights
    attention_weights = tf.nn.softmax(attention_weights, axis=-1)
    if len(attention_weights.shape) == len(features.shape):
        # Same-rank weights cannot be broadcast against the features after
        # gaining an extra axis; the weighted sum is a matrix product.
        return tf.matmul(attention_weights, features)
    attention_weights = tf.expand_dims(attention_weights, axis=-1)
    return attention_weights * features
def attention_function_v3(inputs):
    """Another alternative - scale the features by summed attention weights.

    Args:
        inputs: A pair ``(attention_weights, features)``.

    Returns:
        ``features`` multiplied by the attention weights summed over their
        last axis, broadcast across the remaining feature axes.
    """
    attention_weights, features = inputs
    # Sum along the last dimension of attention weights; keepdims leaves a
    # size-1 axis in its place.
    summed_weights = tf.reduce_sum(attention_weights, axis=-1, keepdims=True)
    # Only add another trailing axis when the summed weights are still of
    # lower rank than the features.  Expanding unconditionally turned
    # rank-3 weights into a rank-4 tensor that no longer lines up with the
    # rank-3 features.
    if len(summed_weights.shape) < len(features.shape):
        summed_weights = tf.expand_dims(summed_weights, axis=-1)
    return summed_weights * features
# Custom Lambda layer class
class AttentionLambda(tf.keras.layers.Lambda):
    """Lambda layer whose output shape can be supplied by a callable.

    Args:
        function: The function wrapped by the layer; passed to
            ``tf.keras.layers.Lambda``.
        output_shape_func: Optional callable mapping the input shape to
            the output shape.
        **kwargs: Forwarded to ``tf.keras.layers.Lambda``.
    """

    def __init__(self, function, output_shape_func=None, **kwargs):
        super().__init__(function, **kwargs)
        self.output_shape_func = output_shape_func

    def compute_output_shape(self, input_shape):
        """Return the layer's output shape for ``input_shape``.

        Uses ``output_shape_func`` when one was given.  Otherwise a list
        of at least two shapes yields its second entry (the features),
        and anything else is returned unchanged.
        """
        shape_fn = self.output_shape_func
        if shape_fn:
            return shape_fn(input_shape)
        has_two_inputs = isinstance(input_shape, list) and len(input_shape) >= 2
        return input_shape[1] if has_two_inputs else input_shape
# Define multiple custom objects to try different attention mechanisms
def get_custom_objects(attention_func, output_shape_func):
    """Build a ``custom_objects`` mapping that overrides ``'Lambda'``.

    Args:
        attention_func: Function used by the layer when the factory is
            called without a ``function`` (or with ``function=None``).
        output_shape_func: Output-shape callable handed to every
            ``AttentionLambda`` the factory creates.

    Returns:
        dict: A single-entry mapping ``{'Lambda': factory}`` where
        ``factory(function=None, **kwargs)`` returns an
        ``AttentionLambda``.
    """
    def build_lambda(function=None, **kwargs):
        chosen = function if function is not None else attention_func
        return AttentionLambda(chosen, output_shape_func, **kwargs)

    return {'Lambda': build_lambda}
# Multiple loading strategies with different attention mechanisms
def _second_input_shape(input_shape):
    """Return the second entry of a list of shapes, else the shape itself."""
    return input_shape[1] if isinstance(input_shape, list) else input_shape


def _custom_attention(inputs):
    """Multiply attention weights with features when given a pair of inputs."""
    # Check the container type before calling len(): on a single tensor,
    # len() may raise or report the batch size, so a lone input could fail
    # or be mistaken for a pair.
    if isinstance(inputs, (list, tuple)) and len(inputs) == 2:
        attention_weights, features = inputs
        if len(attention_weights.shape) == 3 and len(features.shape) == 3:
            # NOTE(review): expanding rank-3 weights makes them rank 4 while
            # the features stay rank 3 - confirm this broadcast is what the
            # saved model's Lambda actually computed.
            attention_weights = tf.expand_dims(attention_weights, axis=-1)
        return tf.multiply(attention_weights, features)
    return inputs[0] if isinstance(inputs, list) else inputs


def _features_passthrough(x):
    """Return the second element of a two-item list, else the input itself."""
    if isinstance(x, list) and len(x) == 2:
        # For attention mechanism, return the second input (features)
        return x[1]
    return x


def _lambda_factory(fallback_function):
    """Build a 'Lambda' replacement that falls back to ``fallback_function``."""
    def build(function=None, output_shape=None, **kwargs):
        # The incoming output_shape is deliberately ignored in favour of
        # "same shape as the second input".
        return tf.keras.layers.Lambda(
            fallback_function if function is None else function,
            output_shape=_second_input_shape,
            **kwargs,
        )

    return build


def load_model_safely():
    """Load ``caption_model.h5``, trying several strategies in order.

    The strategies are, in order: a custom attention ``Lambda``
    replacement, loading with ``compile=False``, a pass-through ``Lambda``
    replacement, and the stock ``tf.keras.layers.Lambda`` registered as a
    custom object.  Progress and (truncated) failure reasons are printed.

    Returns:
        The model returned by the first strategy that succeeds.

    Raises:
        RuntimeError: If every strategy fails; chained to the error raised
            by the last strategy.
    """
    print("Starting model loading process...")

    strategies = [
        (
            "Loading with custom attention Lambda...",
            {"custom_objects": {'Lambda': _lambda_factory(_custom_attention)}},
        ),
        (
            "Loading without compilation...",
            {"compile": False},
        ),
        (
            "Loading with default Lambda handling...",
            {"custom_objects": {'Lambda': _lambda_factory(_features_passthrough)}},
        ),
        (
            "Loading with minimal custom objects...",
            {"custom_objects": {'Lambda': tf.keras.layers.Lambda}},
        ),
    ]

    last_error = None
    for number, (label, load_kwargs) in enumerate(strategies, start=1):
        try:
            print(f"Strategy {number}: {label}")
            model = tf.keras.models.load_model("caption_model.h5", **load_kwargs)
            print(f"β Strategy {number} successful!")
            return model
        except Exception as e:
            # Best-effort by design: report the failure and fall through
            # to the next strategy.
            print(f"β Strategy {number} failed: {str(e)[:200]}...")
            last_error = e

    print("All strategies failed. Model could not be loaded.")
    raise RuntimeError(
        "All model loading strategies failed. The model file may be corrupted or incompatible."
    ) from last_error
# Load your pre-trained model and tokenizer
# Both names start as None so later code can test for a failed load.
model = None
tokenizer = None

if files_exist:
    # Load tokenizer first
    # NOTE: pickle.load can execute arbitrary code while unpickling; only
    # ship a tokenizer.pkl that comes from a trusted source.
    try:
        with open("tokenizer.pkl", "rb") as handle:
            tokenizer = pickle.load(handle)
        print("β Tokenizer loaded successfully")
    except Exception as e:
        print(f"β Failed to load tokenizer: {e}")
        tokenizer = None

    # Load model
    try:
        model = load_model_safely()
        print("β Model loaded successfully and ready for inference!")
    except Exception as e:
        print(f"β Failed to load model: {e}")
        print("The app will not work without a properly loaded model.")
        model = None
else:
    print("Cannot proceed without required files.")
# Image feature extractor model
# Start from the full VGG16 network, then re-wire it so the output is taken
# from the second-to-last layer instead of the final one.
feature_extractor = VGG16()
feature_extractor = tf.keras.Model(
    feature_extractor.input,
    feature_extractor.layers[-2].output,
)
# Description generation function
def generate_caption(image):
    """Generate a caption for an image with the loaded captioning model.

    The image is resized to 224x224, run through ``feature_extractor``,
    and the caption is then decoded greedily one word at a time, starting
    from ``startseq`` and stopping at ``endseq``, at an unknown token, or
    after ``max_length`` steps.

    Args:
        image: PIL image from the Gradio ``Image`` input, or None when
            nothing was uploaded.

    Returns:
        str: The generated caption with a status prefix, or a message
        describing why no caption could be produced.  Errors are caught
        and reported in the returned string rather than raised.
    """
    try:
        if model is None:
            return "β Model failed to load. Please check the model file and console output for details."
        if tokenizer is None:
            return "β Tokenizer failed to load. Please check the tokenizer.pkl file."
        if image is None:
            # Nothing was uploaded; say so instead of surfacing an
            # AttributeError from the preprocessing below.
            return "No image provided. Please upload an image."

        # Preprocess the image
        # Force three channels first - assumes the extractor takes RGB
        # input, so grayscale / RGBA uploads would otherwise not fit.
        image = image.convert("RGB").resize((224, 224))
        pixels = img_to_array(image)
        pixels = np.expand_dims(pixels, axis=0)
        pixels = preprocess_input(pixels)

        # Extract features
        print("Extracting image features...")
        feature = feature_extractor.predict(pixels, verbose=0)
        print(f"Features extracted, shape: {feature.shape}")

        # Build the index -> word table once instead of scanning the whole
        # vocabulary on every decoding step.  setdefault keeps the first
        # word seen for an index, matching a front-to-back scan.
        index_to_word = {}
        for known_word, index in tokenizer.word_index.items():
            index_to_word.setdefault(index, known_word)

        # Generate caption
        input_text = 'startseq'
        max_length = 34  # set this to your model's max_length
        print("Starting caption generation...")
        for step in range(1, max_length + 1):
            sequence = tokenizer.texts_to_sequences([input_text])[0]
            sequence = pad_sequences([sequence], maxlen=max_length)
            try:
                print(f"Prediction step {step}: input_text = '{input_text}'")
                yhat = model.predict([feature, sequence], verbose=0)
                yhat = int(np.argmax(yhat))
                print(f"Predicted token index: {yhat}")
            except Exception as e:
                print(f"Prediction error at step {step}: {e}")
                return f"β Error during prediction: {str(e)}"

            # An index with no vocabulary entry maps to '' and ends decoding.
            word = index_to_word.get(yhat, '')
            print(f"Predicted word: '{word}'")
            if word == 'endseq' or word == '':
                break
            input_text += ' ' + word

        caption = input_text.replace('startseq', '').strip()
        print(f"Final caption: '{caption}'")
        return f"β {caption}" if caption else "β Unable to generate caption"
    except Exception as e:
        # Top-level boundary for the UI callback: report, don't crash.
        error_msg = f"β Error processing image: {str(e)}"
        print(error_msg)
        return error_msg
# Gradio Interface
# The title's leading emoji had been mis-decoded into "πΈ"; it is restored
# to the camera emoji those bytes encode.
title = "📸 Image Caption Generator"
description = "Upload an image and let the AI generate a descriptive caption for it."
theme = "soft"

# One PIL image in, one text box out; flagging is disabled.
iface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(label="Generated Caption"),
    title=title,
    description=description,
    theme=theme,
    allow_flagging="never",
)

if __name__ == "__main__":
    iface.launch()