""" Hugging Face Spaces App - Image Captioning Deploy this to HF Spaces for free hosting """ import gradio as gr import torch from PIL import Image import time def load_models(): """Load models with error handling""" models = {} try: from transformers import BlipProcessor, BlipForConditionalGeneration print("Loading BLIP model...") models['blip_processor'] = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") models['blip_model'] = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") print("✅ BLIP loaded successfully") except Exception as e: print(f"❌ BLIP failed to load: {e}") models['blip_error'] = str(e) try: from transformers import AutoProcessor, AutoModelForCausalLM print("Loading GIT model...") models['git_processor'] = AutoProcessor.from_pretrained("microsoft/git-base") models['git_model'] = AutoModelForCausalLM.from_pretrained("microsoft/git-base") print("✅ GIT loaded successfully") except Exception as e: print(f"❌ GIT failed to load: {e}") models['git_error'] = str(e) return models # Load models at startup print("🚀 Loading AI models...") models = load_models() print(f"📦 Models loading completed") def generate_captions(image, true_caption=""): """Generate captions using available models""" if image is None: return "❌ Please upload an image first." # Ensure image is in RGB format if image.mode != 'RGB': image = image.convert('RGB') results = [] start_time = time.time() # Add true caption if provided if true_caption.strip(): results.append(f"**🎯 True Caption:**") results.append(f"{true_caption.strip()}") results.append("") # BLIP model if 'blip_model' in models: try: blip_start = time.time() inputs = models['blip_processor'](image, return_tensors="pt") out = models['blip_model'].generate(**inputs, max_length=50, num_beams=5) blip_caption = models['blip_processor'].decode(out[0], skip_special_tokens=True) blip_time = time.time() - blip_start results.append(f"**🤖 BLIP Model:** ({blip_time:.2f}s)") results.append(f"{blip_caption}") results.append("") except Exception as e: results.append(f"**🤖 BLIP Model:** Error - {str(e)}") results.append("") elif 'blip_error' in models: results.append(f"**🤖 BLIP Model:** Not available - {models['blip_error']}") results.append("") # GIT model if 'git_model' in models: try: git_start = time.time() inputs = models['git_processor'](images=image, return_tensors="pt") generated_ids = models['git_model'].generate( pixel_values=inputs.pixel_values, max_length=50, num_beams=5 ) git_caption = models['git_processor'].batch_decode(generated_ids, skip_special_tokens=True)[0] git_time = time.time() - git_start results.append(f"**🧠 GIT Model:** ({git_time:.2f}s)") results.append(f"{git_caption}") results.append("") except Exception as e: results.append(f"**🧠 GIT Model:** Error - {str(e)}") results.append("") elif 'git_error' in models: results.append(f"**🧠 GIT Model:** Not available - {models['git_error']}") results.append("") total_time = time.time() - start_time results.append("---") results.append(f"**⏱️ Total Processing Time:** {total_time:.2f} seconds") results.append("") results.append("**📊 About the Models:**") results.append("• **BLIP**: Salesforce's Bootstrapping Language-Image Pre-training") results.append("• **GIT**: Microsoft's Generative Image-to-text Transformer") return "\n".join(results) # Create Gradio interface with gr.Blocks( title="AI Image Captioning", theme=gr.themes.Soft(), css=""" .gradio-container { max-width: 1200px !important; } """ ) as demo: gr.Markdown(""" # 🤖 AI Image Captioning 
    Upload an image and get captions from multiple state-of-the-art AI models!

    **Available Models:**
    - 🤖 **BLIP** (Salesforce): Fast and accurate image captioning
    - 🧠 **GIT** (Microsoft): Advanced generative image-to-text model

    *Simply upload an image or try one of the examples below!*
    """)

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(
                type="pil",
                label="📸 Upload Your Image",
                height=400
            )
            true_caption_input = gr.Textbox(
                label="🎯 True Caption (Optional)",
                placeholder="Enter the correct caption to compare with AI predictions...",
                lines=2
            )
            generate_btn = gr.Button(
                "✨ Generate Captions",
                variant="primary",
                size="lg"
            )

        with gr.Column(scale=1):
            output = gr.Textbox(
                label="🤖 AI Generated Captions",
                lines=20,
                max_lines=25,
                show_copy_button=True
            )

    # Example images
    gr.Markdown("### 📋 Try These Examples:")
    example_images = [
        ["https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat.jpg", "A cat sitting on a surface"],
        ["https://huggingface.co/datasets/mishig/sample_images/resolve/main/dog.jpg", "A dog in a field"],
        ["https://images.unsplash.com/photo-1506905925346-21bda4d32df4?w=500", "A mountain landscape with snow"],
        ["https://images.unsplash.com/photo-1549298916-b41d501d3772?w=500", "A red sports car"],
        ["https://images.unsplash.com/photo-1551963831-b3b1ca40c98e?w=500", "A breakfast with coffee and pastries"],
    ]
    gr.Examples(
        examples=example_images,
        inputs=[image_input, true_caption_input],
        outputs=output,
        fn=generate_captions,
        cache_examples=False
    )

    # Event handlers
    generate_btn.click(
        fn=generate_captions,
        inputs=[image_input, true_caption_input],
        outputs=output
    )

    # Auto-generate when an image is uploaded
    image_input.change(
        fn=generate_captions,
        inputs=[image_input, true_caption_input],
        outputs=output
    )

    gr.Markdown("""
    ---
    **🔧 Technical Details:**
    - Models run on Hugging Face's infrastructure
    - Processing time varies based on image size and complexity
    - All models are open-source and publicly available

    **📝 Tips:**
    - Try different types of images (people, objects, landscapes, etc.)
    - Compare the AI captions with your own description
    - Larger images may take longer to process
    """)

# Launch the app
if __name__ == "__main__":
    demo.launch()
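
# ---------------------------------------------------------------------------
# Deployment note: Hugging Face Spaces installs dependencies from a
# requirements.txt placed next to this file. An assumed minimal set for this
# app (versions unpinned; adjust as needed for your Space):
#
#   gradio
#   transformers
#   torch     # required by the PyTorch model classes used above
#   Pillow
# ---------------------------------------------------------------------------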