Spaces:

pr0ximaCent
/

caption

Sleeping

File size: 10,491 Bytes

import gradio as gr
from PIL import Image
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import os

# Check if required files exist
def check_required_files():
    """Report whether the model and tokenizer files exist in the working directory.

    Prints one line (including the size in bytes) for every required file
    that is present and, when any are absent, one line listing them.

    Returns:
        bool: True when every required file exists, False otherwise.
    """
    expected = ["caption_model.h5", "tokenizer.pkl"]
    absent = []

    for path in expected:
        if os.path.exists(path):
            byte_count = os.path.getsize(path)
            print(f"✓ Found {path} ({byte_count} bytes)")
            continue
        absent.append(path)

    # Guard clause: nothing missing means the check passed
    if not absent:
        return True

    print(f"✗ Missing files: {absent}")
    return False

# Run the file check once at import time. The boolean result is stored in
# `files_exist`, which the loading section further down uses to decide
# whether to attempt loading the tokenizer and model at all.
print("Checking required files...")
files_exist = check_required_files()

# Custom function to handle attention mechanism
def attention_function(inputs):
    """Scale feature vectors element-wise by attention weights.

    Args:
        inputs: A pair ``(attention_weights, features)``. The original
            author's guess at the shapes is (None, 34, 34) for the weights
            and (None, 34, 512) for the features.

    Returns:
        The product of the features and the weights after a trailing
        singleton axis has been appended to the weights.

    NOTE(review): with the shapes guessed above, the weights become rank 4
    after the axis is appended, so the broadcast product would not be
    (None, 34, 512) -- confirm the real input shapes before relying on this.
    """
    weights, feature_map = inputs
    # A trailing axis of size 1 lets the weights broadcast over the features
    return tf.expand_dims(weights, axis=-1) * feature_map

def attention_output_shape(input_shapes):
    """Return the output shape used for the attention layer.

    Args:
        input_shapes: Indexable collection of input shapes; the entry at
            index 1 is treated as the shape of the features input.

    Returns:
        The entry at index 1 of ``input_shapes``, unchanged.
    """
    features_shape = input_shapes[1]
    return features_shape

# Alternative attention functions to try
def attention_function_v2(inputs):
    """Scale feature vectors by softmax-normalised attention weights.

    Args:
        inputs: A pair ``(attention_weights, features)``.

    Returns:
        The features multiplied element-wise by the weights, after the
        weights have been passed through a softmax over their last axis and
        given a trailing singleton axis.
    """
    weights, feature_map = inputs
    normalised = tf.nn.softmax(weights, axis=-1)
    # A trailing axis of size 1 lets the weights broadcast over the features
    return tf.expand_dims(normalised, axis=-1) * feature_map

def attention_function_v3(inputs):
    """Scale feature vectors by attention weights summed over their last axis.

    Args:
        inputs: A pair ``(attention_weights, features)``.

    Returns:
        The features multiplied element-wise by the summed weights. The sum
        keeps its reduced axis (size 1) and a further trailing singleton
        axis is appended before the multiplication.

    NOTE(review): the original label called this "dot product attention",
    but the code sums and multiplies element-wise. The summed weights gain
    two singleton axes in total (keepdims plus expand_dims) -- confirm that
    this matches the rank of the features actually passed in.
    """
    weights, feature_map = inputs
    summed = tf.reduce_sum(weights, axis=-1, keepdims=True)
    return tf.expand_dims(summed, axis=-1) * feature_map

# Custom Lambda layer class
class AttentionLambda(tf.keras.layers.Lambda):
    """Lambda layer whose output shape may be supplied by a separate callable.

    Args:
        function: Callable forwarded to the base ``Lambda`` constructor.
        output_shape_func: Optional callable that maps the layer's input
            shape to its output shape.
        **kwargs: Forwarded unchanged to the base ``Lambda`` constructor.
    """

    def __init__(self, function, output_shape_func=None, **kwargs):
        super().__init__(function, **kwargs)
        # Stored after base initialisation; consulted in compute_output_shape
        self.output_shape_func = output_shape_func

    def compute_output_shape(self, input_shape):
        """Return the output shape for ``input_shape``.

        Uses ``output_shape_func`` when one was given. Otherwise a list of
        at least two shapes yields its second entry, and anything else is
        returned unchanged.
        """
        shape_fn = self.output_shape_func
        if shape_fn:
            return shape_fn(input_shape)

        has_two_inputs = isinstance(input_shape, list) and len(input_shape) >= 2
        return input_shape[1] if has_two_inputs else input_shape

# Define multiple custom objects to try different attention mechanisms
def get_custom_objects(attention_func, output_shape_func):
    """Build a ``custom_objects`` mapping that overrides the 'Lambda' entry.

    Args:
        attention_func: Function used by the created layer whenever the
            factory is called without an explicit ``function``.
        output_shape_func: Output-shape callable handed to every created
            ``AttentionLambda``.

    Returns:
        dict: A single-entry mapping from 'Lambda' to a factory that
        creates ``AttentionLambda`` layers.
    """
    def build_attention_lambda(function=None, **kwargs):
        # Fall back to the supplied attention function when none is given
        chosen = attention_func if function is None else function
        return AttentionLambda(chosen, output_shape_func, **kwargs)

    return {'Lambda': build_attention_lambda}

# Multiple loading strategies with different attention mechanisms
def load_model_safely():
    """Load ``caption_model.h5``, trying four loading strategies in order.

    The strategies differ only in the keyword arguments passed to
    ``tf.keras.models.load_model``:

    1. a 'Lambda' custom object whose default function multiplies attention
       weights with features,
    2. ``compile=False`` and no custom objects,
    3. a 'Lambda' custom object whose default function returns the second of
       two inputs,
    4. the stock ``tf.keras.layers.Lambda`` class as the 'Lambda' custom
       object.

    Progress and (truncated) failure messages are printed for each attempt.

    Returns:
        The model returned by the first attempt that succeeds.

    Raises:
        Exception: If every strategy fails.
    """
    print("Starting model loading process...")

    def custom_attention(inputs):
        """Handle attention mechanism between two inputs"""
        if len(inputs) == 2:
            weights, feats = inputs
            # Simple attention: only applied when both tensors are rank 3
            if len(weights.shape) == 3 and len(feats.shape) == 3:
                return tf.multiply(tf.expand_dims(weights, axis=-1), feats)
        return inputs[0] if isinstance(inputs, list) else inputs

    def identity_function(x):
        # For a two-input list, return the second input (features)
        if isinstance(x, list) and len(x) == 2:
            return x[1]
        return x

    def second_shape_or_same(input_shape):
        # Output shape rule shared by the custom Lambda factories
        return input_shape[1] if isinstance(input_shape, list) else input_shape

    def attention_lambda(function=None, output_shape=None, **kwargs):
        return tf.keras.layers.Lambda(
            custom_attention if function is None else function,
            output_shape=second_shape_or_same,
            **kwargs
        )

    def passthrough_lambda(function=identity_function, output_shape=None, **kwargs):
        return tf.keras.layers.Lambda(
            function,
            output_shape=second_shape_or_same,
            **kwargs
        )

    # Each entry: (announcement, callable producing load_model keyword args).
    # The keyword args are built lazily so that construction happens inside
    # the try block of the attempt they belong to.
    strategies = (
        ("Strategy 1: Loading with custom attention Lambda...",
         lambda: {"custom_objects": {'Lambda': attention_lambda}}),
        ("Strategy 2: Loading without compilation...",
         lambda: {"compile": False}),
        ("Strategy 3: Loading with default Lambda handling...",
         lambda: {"custom_objects": {'Lambda': passthrough_lambda}}),
        ("Strategy 4: Loading with minimal custom objects...",
         lambda: {"custom_objects": {'Lambda': tf.keras.layers.Lambda}}),
    )

    for number, (announcement, build_kwargs) in enumerate(strategies, start=1):
        try:
            print(announcement)
            model = tf.keras.models.load_model("caption_model.h5", **build_kwargs())
            print(f"✓ Strategy {number} successful!")
            return model
        except Exception as e:
            # Keep the console readable: only the first 200 characters are shown
            print(f"✗ Strategy {number} failed: {str(e)[:200]}...")

    print("All strategies failed. Model could not be loaded.")
    raise Exception("All model loading strategies failed. The model file may be corrupted or incompatible.")

# Load the pre-trained model and tokenizer (only when both files were found).
# Either name is left as None on failure so the app can report the problem
# at request time instead of crashing at import.
if files_exist:
    # Tokenizer is loaded before the model.
    # NOTE(security): pickle.load can execute arbitrary code from the file.
    # This assumes tokenizer.pkl is a trusted artifact shipped with the app;
    # never point it at an untrusted file.
    try:
        with open("tokenizer.pkl", "rb") as handle:
            tokenizer = pickle.load(handle)
        print("✓ Tokenizer loaded successfully")
    except Exception as e:
        print(f"✗ Failed to load tokenizer: {e}")
        tokenizer = None

    # Model loading goes through the multi-strategy loader
    try:
        model = load_model_safely()
        print("✓ Model loaded successfully and ready for inference!")
    except Exception as e:
        print(f"✗ Failed to load model: {e}")
        print("The app will not work without a properly loaded model.")
        model = None
else:
    print("Cannot proceed without required files.")
    model = None
    tokenizer = None

# Image feature extractor model.
# VGG16 is instantiated with its default arguments and then re-wrapped so
# that the output of its second-to-last layer (layers[-2]) becomes the model
# output; generate_caption feeds that output to the caption model.
# NOTE(review): with Keras defaults this presumably fetches ImageNet weights
# on first run and yields a 4096-d vector per image -- confirm.
# This runs at import time regardless of whether the caption model loaded.
feature_extractor = VGG16()
feature_extractor = tf.keras.Model(feature_extractor.input, feature_extractor.layers[-2].output)

# Description generation function
def generate_caption(image):
    """Generate a caption for an image using the loaded model and tokenizer.

    The image is resized to 224x224, passed through ``feature_extractor``,
    and the caption is decoded greedily one word at a time, starting from
    'startseq' and stopping at 'endseq', at an unknown token index, or after
    ``max_length`` steps.

    Args:
        image: PIL image supplied by the Gradio input, or None when the
            request was submitted without an image.

    Returns:
        str: "✅ <caption>" on success; otherwise a message starting with
        "❌" that describes what went wrong. Exceptions are reported in the
        returned string rather than raised.
    """
    # A request submitted without an upload arrives as None; report that
    # directly instead of surfacing an AttributeError from image.resize.
    if image is None:
        return "❌ No image provided. Please upload an image."

    try:
        if model is None:
            return "❌ Model failed to load. Please check the model file and console output for details."

        if tokenizer is None:
            return "❌ Tokenizer failed to load. Please check the tokenizer.pkl file."

        # Preprocess the image. The feature extractor needs three channels,
        # so grayscale / RGBA / palette images are converted first.
        if image.mode != "RGB":
            image = image.convert("RGB")
        image = image.resize((224, 224))
        pixels = img_to_array(image)
        pixels = np.expand_dims(pixels, axis=0)
        pixels = preprocess_input(pixels)

        # Extract features
        print("Extracting image features...")
        feature = feature_extractor.predict(pixels, verbose=0)
        print(f"Features extracted, shape: {feature.shape}")

        # Build the index -> word lookup once instead of scanning the whole
        # vocabulary on every decoding step. setdefault keeps the first word
        # seen for an index, matching a first-match linear scan.
        index_to_word = {}
        for token, token_index in tokenizer.word_index.items():
            index_to_word.setdefault(token_index, token)

        # Generate caption
        input_text = 'startseq'
        max_length = 34  # set this to your model's max_length

        print("Starting caption generation...")
        for step in range(max_length):
            sequence = tokenizer.texts_to_sequences([input_text])[0]
            sequence = pad_sequences([sequence], maxlen=max_length)

            try:
                print(f"Prediction step {step+1}: input_text = '{input_text}'")
                yhat = model.predict([feature, sequence], verbose=0)
                yhat = int(np.argmax(yhat))
                print(f"Predicted token index: {yhat}")
            except Exception as e:
                print(f"Prediction error at step {step+1}: {e}")
                return f"❌ Error during prediction: {str(e)}"

            # An index with no vocabulary entry maps to '' and ends decoding
            word = index_to_word.get(yhat, '')

            print(f"Predicted word: '{word}'")
            if word == 'endseq' or word == '':
                break
            input_text += ' ' + word

        caption = input_text.replace('startseq', '').strip()
        print(f"Final caption: '{caption}'")
        return f"✅ {caption}" if caption else "❌ Unable to generate caption"

    except Exception as e:
        # Top-level boundary for the UI callback: report instead of raising
        error_msg = f"❌ Error processing image: {str(e)}"
        print(error_msg)
        return error_msg

# Gradio Interface
# Text shown in the UI and the theme name passed to gr.Interface below.
title = "📸 Image Caption Generator"
description = "Upload an image and let the AI generate a descriptive caption for it."
theme = "soft"

# Single-input, single-output interface: the uploaded image is handed to
# generate_caption as a PIL image (type="pil") and the returned string is
# shown in a textbox.
# NOTE(review): newer Gradio releases may rename `allow_flagging` -- confirm
# against the Gradio version this Space pins.
iface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(label="Generated Caption"),
    title=title,
    description=description,
    theme=theme,
    allow_flagging="never"
)

# Only start the web server when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()