Spaces:

SaadNadeem07
/

BLIP_2-VLM-Finetuning

Running

App Files Files Community

Saadi07 committed on May 1

Commit

1f19f01

1 Parent(s): c76cc1b

f

Browse files

Files changed (3) hide show

README.md +9 -14
app.py +38 -51
requirements.txt +4 -7

README.md CHANGED Viewed

@@ -9,29 +9,24 @@ app_file: app.py
 pinned: false
 ---
-# Fine-tuned BLIP2 Image Caption Generator
-This Hugging Face Space hosts a BLIP2 model that has been fine-tuned on the Flickr8k dataset using Low-Rank Adaptation (LoRA).
 ## Model Details
-- Base model: `Salesforce/blip2-opt-2.7b` (with fallback to `Salesforce/blip2-opt-560m` for CPU environments)
-- Fine-tuning technique: LoRA (Low-Rank Adaptation)
-- Training dataset: Flickr8k
-- LoRA configuration:
-  - Rank (r): 16
-  - Alpha: 32
-  - Dropout: 0.05
-  - Target modules: q_proj, k_proj
 ## Usage
-Upload an image to generate a caption. The model will process the image and return a descriptive caption based on its fine-tuned knowledge.
 ## Notes
-- The app will automatically detect if CUDA is available
-- If running on CPU, it will use a smaller model version to maintain performance
-- The app includes fallback mechanisms to ensure it works in various environments
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 pinned: false
 ---
+# Image Caption Generator
+This Hugging Face Space hosts a lightweight BLIP model for image captioning, optimized for CPU environments.
 ## Model Details
+- Base model: `Salesforce/blip-image-captioning-base`
+- Optimized for CPU environments with low memory requirements
+- No GPU required
 ## Usage
+Upload an image to generate a caption. The model will process the image and return a descriptive caption.
 ## Notes
+- This is a simplified version of the model to ensure it runs reliably on Hugging Face Spaces
+- The model is optimized for CPU usage and low memory consumption
+- For best results, use clear images with well-defined subjects
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,71 +1,58 @@
 import gradio as gr
 from PIL import Image
 import torch
-from transformers import AutoProcessor, Blip2ForConditionalGeneration
-from peft import PeftModel, LoraConfig
 import os
-# LoRA configuration used during training:
-# config = LoraConfig(
-#     r=16,
-#     lora_alpha=32,
-#     lora_dropout=0.05,
-#     bias="none",
-#     target_modules=["q_proj", "k_proj"]
-# )
 # Check if we're running on CPU or GPU
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 print(f"Using device: {device}")
 # Load processor first
-processor = AutoProcessor.from_pretrained("./processor")
-# Load base model without 8-bit quantization for CPU compatibility
 try:
-    # Try loading with device_map for better memory usage if available
-    base_model = Blip2ForConditionalGeneration.from_pretrained(
-        "Salesforce/blip2-opt-2.7b",
-        device_map="auto" if torch.cuda.is_available() else None,
-        load_in_8bit=torch.cuda.is_available()  # Only use 8-bit if CUDA is available
-    )
 except Exception as e:
-    print(f"Error loading full model: {e}")
-    print("Falling back to smaller model...")
-    # Fall back to a smaller model if the large one fails
-    base_model = Blip2ForConditionalGeneration.from_pretrained(
-        "Salesforce/blip2-opt-560m",
-        device_map=None
-    )
-# Load the fine-tuned LoRA weights
 try:
-    model = PeftModel.from_pretrained(base_model, "./model")
-    print("Successfully loaded fine-tuned LoRA weights")
 except Exception as e:
-    print(f"Error loading LoRA weights: {e}")
-    print("Continuing with base model only")
-    model = base_model
-# Move model to device if not using device_map
-if not hasattr(model, "hf_device_map"):
-    model = model.to(device)
-# Define the function to generate caption - exactly as in colab
 def generate_caption(image):
-    # Convert image to RGB if needed
-    image = image.convert('RGB') if image.mode != 'RGB' else image
-    # Process the image exactly as in colab.py
-    inputs = processor(images=image, return_tensors="pt").to(device)
-    # Use fp32 instead of fp16 for CPU compatibility
-    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-    pixel_values = inputs.pixel_values.to(dtype)
     try:
-        # Generate caption with the same parameters
-        generated_ids = model.generate(pixel_values=pixel_values, max_length=25)
         # Decode the caption
         caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
@@ -78,8 +65,8 @@ iface = gr.Interface(
     fn=generate_caption,
     inputs=gr.Image(type="pil"),
     outputs="text",
-    title="Fine-tuned BLIP2 Image Caption Generator",
-    description="Upload an image to generate a caption using BLIP2 fine-tuned on Flickr8k with LoRA (r=16, alpha=32).",
     examples=["https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png"]
 )

 import gradio as gr
 from PIL import Image
 import torch
+from transformers import AutoProcessor, AutoModelForCausalLM, BlipProcessor
 import os
 # Check if we're running on CPU or GPU
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 print(f"Using device: {device}")
 # Load processor first
 try:
+    # Try to load the custom processor
+    processor = AutoProcessor.from_pretrained("./processor")
+    print("Loaded custom processor")
 except Exception as e:
+    print(f"Failed to load custom processor: {e}")
+    # Fall back to the processor published with Salesforce/blip-image-captioning-base
+    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+    print("Using fallback processor")
+# Load base model - use the smallest possible model for CPU
 try:
+    # Try loading the base BLIP image-captioning checkpoint
+    model = AutoModelForCausalLM.from_pretrained(
+        "Salesforce/blip-image-captioning-base",
+        device_map=None,
+        torch_dtype=torch.float32  # Use float32 for CPU compatibility
+    )
+    print("Loaded base BLIP model")
 except Exception as e:
+    print(f"Error loading model: {e}")
+    # If that fails, retry the same checkpoint with low_cpu_mem_usage=True
+    model = AutoModelForCausalLM.from_pretrained(
+        "Salesforce/blip-image-captioning-base",
+        device_map=None,
+        low_cpu_mem_usage=True
+    )
+    print("Loaded fallback model")
+# Move model to device if needed
+model = model.to(device)
+print("Model loaded and ready")
+# Define the function to generate caption
 def generate_caption(image):
     try:
+        # Convert image to RGB if needed
+        image = image.convert('RGB') if image.mode != 'RGB' else image
+        # Process the image
+        inputs = processor(images=image, return_tensors="pt").to(device)
+        # Generate caption
+        generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
         # Decode the caption
         caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
     fn=generate_caption,
     inputs=gr.Image(type="pil"),
     outputs="text",
+    title="Image Caption Generator",
+    description="Upload an image to generate a caption.",
     examples=["https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png"]
 )

requirements.txt CHANGED Viewed

@@ -1,7 +1,4 @@
-torch>=2.0.0
-transformers>=4.31.0
-gradio>=3.40.0
-Pillow
-peft>=0.5.0
-safetensors
-accelerate>=0.25.0

+torch>=1.10.0
+transformers>=4.25.0
+gradio>=3.20.0
+Pillow