Spaces:

SaadNadeem07
/

BLIP_2-VLM-Finetuning

Running

App Files Community

Saadi07 committed on May 1

Commit

c76cc1b

1 Parent(s): 95e1523

f

Browse files

Files changed (3) hide show

README.md +5 -3
app.py +49 -18
requirements.txt +6 -6

README.md CHANGED Viewed

@@ -9,13 +9,13 @@ app_file: app.py
 pinned: false
 ---
-# Fine-tuned BLIP2 Image Captioning
 This Hugging Face Space hosts a BLIP2 model that has been fine-tuned on the Flickr8k dataset using Low-Rank Adaptation (LoRA).
 ## Model Details
-- Base model: `ybelkada/blip2-opt-2.7b-fp16-sharded`
 - Fine-tuning technique: LoRA (Low-Rank Adaptation)
 - Training dataset: Flickr8k
 - LoRA configuration:
@@ -30,6 +30,8 @@ Upload an image to generate a caption. The model will process the image and retu
 ## Notes
-The model uses 8-bit quantization to reduce memory usage while maintaining performance.
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 pinned: false
 ---
+# Fine-tuned BLIP2 Image Caption Generator
 This Hugging Face Space hosts a BLIP2 model that has been fine-tuned on the Flickr8k dataset using Low-Rank Adaptation (LoRA).
 ## Model Details
+- Base model: `Salesforce/blip2-opt-2.7b` (with fallback to `Salesforce/blip2-opt-560m` for CPU environments)
 - Fine-tuning technique: LoRA (Low-Rank Adaptation)
 - Training dataset: Flickr8k
 - LoRA configuration:
 ## Notes
+- The app will automatically detect if CUDA is available
+- If the full model fails to load (for example, on a memory-constrained CPU host), the app falls back to a smaller model version
+- The app includes fallback mechanisms to ensure it works in various environments
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ from PIL import Image
 import torch
 from transformers import AutoProcessor, Blip2ForConditionalGeneration
 from peft import PeftModel, LoraConfig
 # LoRA configuration used during training:
 # config = LoraConfig(
@@ -13,20 +14,42 @@ from peft import PeftModel, LoraConfig
 #     target_modules=["q_proj", "k_proj"]
 # )
-# Load base model with the same configuration as in training
-base_model = Blip2ForConditionalGeneration.from_pretrained(
-    "ybelkada/blip2-opt-2.7b-fp16-sharded",
-    device_map="auto",
-    load_in_8bit=True
-)
-# Load the fine-tuned LoRA weights
-model = PeftModel.from_pretrained(base_model, "./model")
-# Load processor - use the same one as training
 processor = AutoProcessor.from_pretrained("./processor")
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 # Define the function to generate caption - exactly as in colab
 def generate_caption(image):
@@ -34,14 +57,21 @@ def generate_caption(image):
     image = image.convert('RGB') if image.mode != 'RGB' else image
     # Process the image exactly as in colab.py
-    inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
-    # Generate caption with the same parameters
-    generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
-    # Decode the caption
-    caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    return caption
 # Create Gradio interface
 iface = gr.Interface(
@@ -49,7 +79,8 @@ iface = gr.Interface(
     inputs=gr.Image(type="pil"),
     outputs="text",
     title="Fine-tuned BLIP2 Image Caption Generator",
-    description="Upload an image to generate a caption using BLIP2 fine-tuned on Flickr8k with LoRA (r=16, alpha=32)."
 )
 # Launch

 import torch
 from transformers import AutoProcessor, Blip2ForConditionalGeneration
 from peft import PeftModel, LoraConfig
+import os
 # LoRA configuration used during training:
 # config = LoraConfig(
 #     target_modules=["q_proj", "k_proj"]
 # )
+# Check if we're running on CPU or GPU
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+print(f"Using device: {device}")
+# Load processor first
 processor = AutoProcessor.from_pretrained("./processor")
+# Load base model without 8-bit quantization for CPU compatibility
+try:
+    # Try loading with device_map for better memory usage if available
+    base_model = Blip2ForConditionalGeneration.from_pretrained(
+        "Salesforce/blip2-opt-2.7b",
+        device_map="auto" if torch.cuda.is_available() else None,
+        load_in_8bit=torch.cuda.is_available()  # Only use 8-bit if CUDA is available
+    )
+except Exception as e:
+    print(f"Error loading full model: {e}")
+    print("Falling back to smaller model...")
+    # Fall back to a smaller model if the large one fails
+    base_model = Blip2ForConditionalGeneration.from_pretrained(
+        "Salesforce/blip2-opt-560m",
+        device_map=None
+    )
+# Load the fine-tuned LoRA weights
+try:
+    model = PeftModel.from_pretrained(base_model, "./model")
+    print("Successfully loaded fine-tuned LoRA weights")
+except Exception as e:
+    print(f"Error loading LoRA weights: {e}")
+    print("Continuing with base model only")
+    model = base_model
+# Move model to device if not using device_map
+if not hasattr(model, "hf_device_map"):
+    model = model.to(device)
 # Define the function to generate caption - exactly as in colab
 def generate_caption(image):
     image = image.convert('RGB') if image.mode != 'RGB' else image
     # Process the image exactly as in colab.py
+    inputs = processor(images=image, return_tensors="pt").to(device)
+    # Use fp32 instead of fp16 for CPU compatibility
+    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+    pixel_values = inputs.pixel_values.to(dtype)
+    try:
+        # Generate caption with the same parameters
+        generated_ids = model.generate(pixel_values=pixel_values, max_length=25)
+        # Decode the caption
+        caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        return caption
+    except Exception as e:
+        return f"Error generating caption: {str(e)}"
 # Create Gradio interface
 iface = gr.Interface(
     inputs=gr.Image(type="pil"),
     outputs="text",
     title="Fine-tuned BLIP2 Image Caption Generator",
+    description="Upload an image to generate a caption using BLIP2 fine-tuned on Flickr8k with LoRA (r=16, alpha=32).",
+    examples=["https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png"]
 )
 # Launch

requirements.txt CHANGED Viewed

@@ -1,7 +1,7 @@
-torch
-transformers>=4.30.0
-gradio
 Pillow
-peft
-bitsandbytes
-accelerate

+torch>=2.0.0
+transformers>=4.31.0
+gradio>=3.40.0
 Pillow
+peft>=0.5.0
+safetensors
+accelerate>=0.25.0