Spaces:

pr0ximaCent
/

caption

Sleeping

App Files Files Community

caption / app.py

pr0ximaCent

Update app.py

de58874 verified 4 months ago

raw

history blame contribute delete

10.5 kB

	import gradio as gr
	from PIL import Image
	import numpy as np
	import tensorflow as tf
	from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
	from tensorflow.keras.preprocessing.image import img_to_array
	from tensorflow.keras.preprocessing.sequence import pad_sequences
	import pickle
	import os

	# Check if required files exist
	def check_required_files():
	required_files = ["caption_model.h5", "tokenizer.pkl"]
	missing_files = []

	for file in required_files:
	if not os.path.exists(file):
	missing_files.append(file)
	else:
	size = os.path.getsize(file)
	print(f"✓ Found {file} ({size} bytes)")

	if missing_files:
	print(f"✗ Missing files: {missing_files}")
	return False
	return True

	print("Checking required files...")
	files_exist = check_required_files()

	# Custom function to handle attention mechanism
	def attention_function(inputs):
	"""
	Custom attention function that likely combines two inputs
	Input 1: (None, 34, 34) - attention weights
	Input 2: (None, 34, 512) - feature vectors
	Output: (None, 34, 512) - attended features
	"""
	attention_weights, features = inputs
	# Expand attention weights to match feature dimensions
	attention_weights = tf.expand_dims(attention_weights, axis=-1)
	# Apply attention weights to features
	attended_features = attention_weights * features
	return attended_features

	def attention_output_shape(input_shapes):
	"""Define the output shape for attention mechanism"""
	# Return the shape of the feature input (second input)
	return input_shapes[1] # (None, 34, 512)

	# Alternative attention functions to try
	def attention_function_v2(inputs):
	"""Alternative attention mechanism - weighted sum"""
	attention_weights, features = inputs
	# Normalize attention weights
	attention_weights = tf.nn.softmax(attention_weights, axis=-1)
	attention_weights = tf.expand_dims(attention_weights, axis=-1)
	return attention_weights * features

	def attention_function_v3(inputs):
	"""Another alternative - dot product attention"""
	attention_weights, features = inputs
	# Sum along the second dimension of attention weights
	attention_weights = tf.reduce_sum(attention_weights, axis=-1, keepdims=True)
	attention_weights = tf.expand_dims(attention_weights, axis=-1)
	return attention_weights * features

	# Custom Lambda layer class
	class AttentionLambda(tf.keras.layers.Lambda):
	def __init__(self, function, output_shape_func=None, **kwargs):
	super().__init__(function, **kwargs)
	self.output_shape_func = output_shape_func

	def compute_output_shape(self, input_shape):
	if self.output_shape_func:
	return self.output_shape_func(input_shape)
	# Default: return the shape of the second input (features)
	if isinstance(input_shape, list) and len(input_shape) >= 2:
	return input_shape[1]
	return input_shape

	# Define multiple custom objects to try different attention mechanisms
	def get_custom_objects(attention_func, output_shape_func):
	return {
	'Lambda': lambda function=None, **kwargs: AttentionLambda(
	attention_func if function is None else function,
	output_shape_func,
	**kwargs
	)
	}

	# Multiple loading strategies with different attention mechanisms
	def load_model_safely():
	print("Starting model loading process...")

	# Strategy 1: Try with custom Lambda that handles the attention operation
	try:
	print("Strategy 1: Loading with custom attention Lambda...")

	def custom_attention(inputs):
	"""Handle attention mechanism between two inputs"""
	if len(inputs) == 2:
	attention_weights, features = inputs
	# Simple attention: multiply attention weights with features
	# Expand attention weights to match feature dimensions
	if len(attention_weights.shape) == 3 and len(features.shape) == 3:
	attention_weights = tf.expand_dims(attention_weights, axis=-1)
	return tf.multiply(attention_weights, features)
	return inputs[0] if isinstance(inputs, list) else inputs

	custom_objects = {
	'Lambda': lambda function=None, output_shape=None, **kwargs:
	tf.keras.layers.Lambda(
	custom_attention if function is None else function,
	output_shape=lambda input_shape: input_shape[1] if isinstance(input_shape, list) else input_shape,
	**kwargs
	)
	}

	model = tf.keras.models.load_model("caption_model.h5", custom_objects=custom_objects)
	print("✓ Strategy 1 successful!")
	return model

	except Exception as e:
	print(f"✗ Strategy 1 failed: {str(e)[:200]}...")

	# Strategy 2: Load with compile=False and try to fix compilation later
	try:
	print("Strategy 2: Loading without compilation...")
	model = tf.keras.models.load_model("caption_model.h5", compile=False)
	print("✓ Strategy 2 successful!")
	return model

	except Exception as e:
	print(f"✗ Strategy 2 failed: {str(e)[:200]}...")

	# Strategy 3: Try loading with TensorFlow's built-in Lambda handling
	try:
	print("Strategy 3: Loading with default Lambda handling...")

	def identity_function(x):
	if isinstance(x, list) and len(x) == 2:
	# For attention mechanism, return the second input (features)
	return x[1]
	return x

	custom_objects = {
	'Lambda': lambda function=identity_function, output_shape=None, **kwargs:
	tf.keras.layers.Lambda(
	function,
	output_shape=lambda input_shape: input_shape[1] if isinstance(input_shape, list) else input_shape,
	**kwargs
	)
	}

	model = tf.keras.models.load_model("caption_model.h5", custom_objects=custom_objects)
	print("✓ Strategy 3 successful!")
	return model

	except Exception as e:
	print(f"✗ Strategy 3 failed: {str(e)[:200]}...")

	# Strategy 4: Try with minimal custom objects
	try:
	print("Strategy 4: Loading with minimal custom objects...")
	model = tf.keras.models.load_model("caption_model.h5", custom_objects={'Lambda': tf.keras.layers.Lambda})
	print("✓ Strategy 4 successful!")
	return model

	except Exception as e:
	print(f"✗ Strategy 4 failed: {str(e)[:200]}...")

	print("All strategies failed. Model could not be loaded.")
	raise Exception("All model loading strategies failed. The model file may be corrupted or incompatible.")

	# Load your pre-trained model and tokenizer
	if not files_exist:
	print("Cannot proceed without required files.")
	model = None
	tokenizer = None
	else:
	# Load tokenizer first
	try:
	with open("tokenizer.pkl", "rb") as handle:
	tokenizer = pickle.load(handle)
	print("✓ Tokenizer loaded successfully")
	except Exception as e:
	print(f"✗ Failed to load tokenizer: {e}")
	tokenizer = None

	# Load model
	try:
	model = load_model_safely()
	print("✓ Model loaded successfully and ready for inference!")
	except Exception as e:
	print(f"✗ Failed to load model: {e}")
	print("The app will not work without a properly loaded model.")
	model = None

	# Image feature extractor model
	feature_extractor = VGG16()
	feature_extractor = tf.keras.Model(feature_extractor.input, feature_extractor.layers[-2].output)

	# Description generation function
	def generate_caption(image):
	try:
	if model is None:
	return "❌ Model failed to load. Please check the model file and console output for details."

	if tokenizer is None:
	return "❌ Tokenizer failed to load. Please check the tokenizer.pkl file."

	# Preprocess the image
	image = image.resize((224, 224))
	image = img_to_array(image)
	image = np.expand_dims(image, axis=0)
	image = preprocess_input(image)

	# Extract features
	print("Extracting image features...")
	feature = feature_extractor.predict(image, verbose=0)
	print(f"Features extracted, shape: {feature.shape}")

	# Generate caption
	input_text = 'startseq'
	max_length = 34 # set this to your model's max_length

	print("Starting caption generation...")
	for i in range(max_length):
	sequence = tokenizer.texts_to_sequences([input_text])[0]
	sequence = pad_sequences([sequence], maxlen=max_length)

	try:
	print(f"Prediction step {i+1}: input_text = '{input_text}'")
	yhat = model.predict([feature, sequence], verbose=0)
	yhat = np.argmax(yhat)
	print(f"Predicted token index: {yhat}")
	except Exception as e:
	print(f"Prediction error at step {i+1}: {e}")
	return f"❌ Error during prediction: {str(e)}"

	word = ''
	for w, i in tokenizer.word_index.items():
	if i == yhat:
	word = w
	break

	print(f"Predicted word: '{word}'")
	if word == 'endseq' or word == '':
	break
	input_text += ' ' + word

	caption = input_text.replace('startseq', '').strip()
	print(f"Final caption: '{caption}'")
	return f"✅ {caption}" if caption else "❌ Unable to generate caption"

	except Exception as e:
	error_msg = f"❌ Error processing image: {str(e)}"
	print(error_msg)
	return error_msg

	# Gradio Interface
	title = "📸 Image Caption Generator"
	description = "Upload an image and let the AI generate a descriptive caption for it."
	theme = "soft"

	iface = gr.Interface(
	fn=generate_caption,
	inputs=gr.Image(type="pil"),
	outputs=gr.Textbox(label="Generated Caption"),
	title=title,
	description=description,
	theme=theme,
	allow_flagging="never"
	)

	if __name__ == "__main__":
	iface.launch()