# Hugging Face Space: therealsaed/Image_Captioning_model
# Status: Runtime error
# File size: 7,256 Bytes
# Commit: 24b38ad

"""
Hugging Face Spaces App - Image Captioning
Deploy this to HF Spaces for free hosting
"""

import gradio as gr
import torch
from PIL import Image
import time

def load_models():
    """Load the BLIP and GIT captioning models, tolerating failures.

    Each model family is loaded independently inside its own ``try`` so that
    a failure in one (``transformers`` not importable, checkpoint cannot be
    fetched, ...) does not prevent the other from being used.

    Returns:
        dict: For each family prefix ``blip`` and ``git`` the dict holds
        either ``<prefix>_processor`` and ``<prefix>_model`` (always stored
        together, only after both loaded) or ``<prefix>_error`` with the
        failure message as a string.
    """
    # The transformers imports stay lazy and per-family so an import problem
    # is reported as that family's error instead of crashing the module.
    def _blip_classes():
        from transformers import BlipProcessor, BlipForConditionalGeneration
        return BlipProcessor, BlipForConditionalGeneration

    def _git_classes():
        from transformers import AutoProcessor, AutoModelForCausalLM
        return AutoProcessor, AutoModelForCausalLM

    # (display label, key prefix, checkpoint id, class resolver)
    families = (
        ("BLIP", "blip", "Salesforce/blip-image-captioning-base", _blip_classes),
        ("GIT", "git", "microsoft/git-base", _git_classes),
    )

    loaded = {}
    for label, prefix, checkpoint, resolve_classes in families:
        try:
            processor_cls, model_cls = resolve_classes()
            print(f"Loading {label} model...")
            processor = processor_cls.from_pretrained(checkpoint)
            model = model_cls.from_pretrained(checkpoint)
        except Exception as e:
            # Deliberate best-effort boundary: record the reason and move on
            # so the app can still start with whichever model is available.
            print(f"❌ {label} failed to load: {e}")
            loaded[f"{prefix}_error"] = str(e)
        else:
            # Commit both halves together so the dict never contains a
            # processor whose model failed to load.
            loaded[f"{prefix}_processor"] = processor
            loaded[f"{prefix}_model"] = model
            print(f"✅ {label} loaded successfully")

    return loaded

# Populate the module-level model registry once, at import time, so every
# request handled by generate_captions reuses the same loaded models.
print("🚀 Loading AI models...")
models = load_models()
print("📦 Models loading completed")

def _blip_caption(image):
    """Return the BLIP caption for *image* (beam search, max_length=50)."""
    processor = models['blip_processor']
    inputs = processor(image, return_tensors="pt")
    out = models['blip_model'].generate(**inputs, max_length=50, num_beams=5)
    return processor.decode(out[0], skip_special_tokens=True)


def _git_caption(image):
    """Return the GIT caption for *image* (beam search, max_length=50)."""
    processor = models['git_processor']
    inputs = processor(images=image, return_tensors="pt")
    generated_ids = models['git_model'].generate(
        pixel_values=inputs.pixel_values,
        max_length=50,
        num_beams=5,
    )
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]


def _model_section(heading, prefix, captioner, image):
    """Return the report lines for one model family.

    Produces the timed caption when ``<prefix>_model`` is loaded, the
    inference error if captioning raises, the recorded load error when only
    ``<prefix>_error`` is present, and no lines at all otherwise.
    """
    if f"{prefix}_model" in models:
        try:
            started = time.perf_counter()
            caption = captioner(image)
            elapsed = time.perf_counter() - started
        except Exception as e:
            # Keep the report going: one model failing must not hide the
            # other model's caption from the user.
            return [f"{heading} Error - {str(e)}", ""]
        return [f"{heading} ({elapsed:.2f}s)", f"{caption}", ""]

    error_key = f"{prefix}_error"
    if error_key in models:
        return [f"{heading} Not available - {models[error_key]}", ""]
    return []


def generate_captions(image, true_caption=""):
    """Generate a text report with captions from every available model.

    Args:
        image: Image object exposing ``mode`` and ``convert`` (the UI passes
            a PIL image), or ``None`` when nothing has been uploaded.
        true_caption: Optional reference caption to show above the model
            output. ``None`` and whitespace-only values are treated as
            "not provided".

    Returns:
        str: Newline-joined report lines, or a prompt to upload an image
        when *image* is ``None``.
    """
    if image is None:
        return "❌ Please upload an image first."

    # Ensure image is in RGB format
    if image.mode != 'RGB':
        image = image.convert('RGB')

    results = []
    start_time = time.perf_counter()

    # Add true caption if provided; `or ""` guards against a None value,
    # which would otherwise raise AttributeError on .strip().
    reference = (true_caption or "").strip()
    if reference:
        results.extend(["**🎯 True Caption:**", reference, ""])

    results.extend(_model_section("**🤖 BLIP Model:**", "blip", _blip_caption, image))
    results.extend(_model_section("**🧠 GIT Model:**", "git", _git_caption, image))

    total_time = time.perf_counter() - start_time
    results.extend([
        "---",
        f"**⏱️ Total Processing Time:** {total_time:.2f} seconds",
        "",
        "**📊 About the Models:**",
        "• **BLIP**: Salesforce's Bootstrapping Language-Image Pre-training",
        "• **GIT**: Microsoft's Generative Image-to-text Transformer",
    ])

    return "\n".join(results)

# Assemble the Gradio UI: inputs in the left column, caption report on the right
with gr.Blocks(
    title="AI Image Captioning",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1200px !important;
    }
    """,
) as demo:

    # Page header
    gr.Markdown("""
    # 🤖 AI Image Captioning
    
    Upload an image and get captions from multiple state-of-the-art AI models!
    
    **Available Models:**
    - 🤖 **BLIP** (Salesforce): Fast and accurate image captioning
    - 🧠 **GIT** (Microsoft): Advanced generative image-to-text model
    
    *Simply upload an image or try one of the examples below!*
    """)

    with gr.Row():
        # Left column: image, optional reference caption, trigger button
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="📸 Upload Your Image", height=400)

            true_caption_input = gr.Textbox(
                label="🎯 True Caption (Optional)",
                placeholder="Enter the correct caption to compare with AI predictions...",
                lines=2,
            )

            generate_btn = gr.Button("✨ Generate Captions", variant="primary", size="lg")

        # Right column: the text report returned by generate_captions
        with gr.Column(scale=1):
            output = gr.Textbox(
                label="🤖 AI Generated Captions",
                lines=20,
                max_lines=25,
                show_copy_button=True,
            )

    # Clickable sample rows: [image URL, reference caption]
    gr.Markdown("### 📋 Try These Examples:")

    example_images = [
        [url, caption]
        for url, caption in (
            ("https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat.jpg", "A cat sitting on a surface"),
            ("https://huggingface.co/datasets/mishig/sample_images/resolve/main/dog.jpg", "A dog in a field"),
            ("https://images.unsplash.com/photo-1506905925346-21bda4d32df4?w=500", "A mountain landscape with snow"),
            ("https://images.unsplash.com/photo-1549298916-b41d501d3772?w=500", "A red sports car"),
            ("https://images.unsplash.com/photo-1551963831-b3b1ca40c98e?w=500", "A breakfast with coffee and pastries"),
        )
    ]

    gr.Examples(
        examples=example_images,
        inputs=[image_input, true_caption_input],
        outputs=output,
        fn=generate_captions,
        cache_examples=False,
    )

    # Run captioning on button click and again whenever the image value
    # changes; both listeners use the same inputs and output.
    for trigger in (generate_btn.click, image_input.change):
        trigger(
            fn=generate_captions,
            inputs=[image_input, true_caption_input],
            outputs=output,
        )

    # Page footer
    gr.Markdown("""
    ---
    
    **🔧 Technical Details:**
    - Models run on Hugging Face's infrastructure
    - Processing time varies based on image size and complexity
    - All models are open-source and publicly available
    
    **📝 Tips:**
    - Try different types of images (people, objects, landscapes, etc.)
    - Compare the AI captions with your own description
    - Larger images may take longer to process
    """)

# Start the app only when this file is executed as a script
if __name__ == "__main__":
    demo.launch()