Saadi07 committed
Commit 1f19f01 · 1 Parent(s): c76cc1b
Files changed (3)
  1. README.md +9 -14
  2. app.py +38 -51
  3. requirements.txt +4 -7
README.md CHANGED
@@ -9,29 +9,24 @@ app_file: app.py
  pinned: false
  ---

- # Fine-tuned BLIP2 Image Caption Generator
+ # Image Caption Generator

- This Hugging Face Space hosts a BLIP2 model that has been fine-tuned on the Flickr8k dataset using Low-Rank Adaptation (LoRA).
+ This Hugging Face Space hosts a lightweight BLIP model for image captioning, optimized for CPU environments.

  ## Model Details

- - Base model: `Salesforce/blip2-opt-2.7b` (with fallback to `Salesforce/blip2-opt-560m` for CPU environments)
- - Fine-tuning technique: LoRA (Low-Rank Adaptation)
- - Training dataset: Flickr8k
- - LoRA configuration:
-   - Rank (r): 16
-   - Alpha: 32
-   - Dropout: 0.05
-   - Target modules: q_proj, k_proj
+ - Base model: `Salesforce/blip-image-captioning-base`
+ - Optimized for CPU environments with low memory requirements
+ - No GPU required

  ## Usage

- Upload an image to generate a caption. The model will process the image and return a descriptive caption based on its fine-tuned knowledge.
+ Upload an image to generate a caption. The model will process the image and return a descriptive caption.

  ## Notes

- - The app will automatically detect if CUDA is available
- - If running on CPU, it will use a smaller model version to maintain performance
- - The app includes fallback mechanisms to ensure it works in various environments
+ - This is a simplified, CPU-only version of the app to ensure it runs reliably on Hugging Face Spaces
+ - The model is optimized for CPU usage and low memory consumption
+ - For best results, use clear images with well-defined subjects

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
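
For reference, the usage described in the updated README can be reproduced outside the Space with a few lines of Python. This is a minimal sketch, not part of the commit; it assumes the standard `transformers` BLIP classes for this checkpoint and uses a placeholder image path.

```python
# Minimal captioning sketch (not part of this commit). Assumes the standard
# transformers BLIP API for Salesforce/blip-image-captioning-base;
# "example.jpg" is a placeholder path.
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

image = Image.open("example.jpg").convert("RGB")
inputs = processor(images=image, return_tensors="pt")
generated_ids = model.generate(**inputs, max_length=25)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```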
app.py CHANGED
@@ -1,71 +1,58 @@
  import gradio as gr
  from PIL import Image
  import torch
- from transformers import AutoProcessor, Blip2ForConditionalGeneration
- from peft import PeftModel, LoraConfig
+ from transformers import AutoProcessor, BlipProcessor, BlipForConditionalGeneration
  import os

- # LoRA configuration used during training:
- # config = LoraConfig(
- #     r=16,
- #     lora_alpha=32,
- #     lora_dropout=0.05,
- #     bias="none",
- #     target_modules=["q_proj", "k_proj"]
- # )
-
  # Check if we're running on CPU or GPU
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  print(f"Using device: {device}")

  # Load processor first
- processor = AutoProcessor.from_pretrained("./processor")
-
- # Load base model without 8-bit quantization for CPU compatibility
  try:
-     # Try loading with device_map for better memory usage if available
-     base_model = Blip2ForConditionalGeneration.from_pretrained(
-         "Salesforce/blip2-opt-2.7b",
-         device_map="auto" if torch.cuda.is_available() else None,
-         load_in_8bit=torch.cuda.is_available()  # Only use 8-bit if CUDA is available
-     )
+     # Try to load the custom processor
+     processor = AutoProcessor.from_pretrained("./processor")
+     print("Loaded custom processor")
  except Exception as e:
-     print(f"Error loading full model: {e}")
-     print("Falling back to smaller model...")
-     # Fall back to a smaller model if the large one fails
-     base_model = Blip2ForConditionalGeneration.from_pretrained(
-         "Salesforce/blip2-opt-560m",
-         device_map=None
-     )
+     print(f"Failed to load custom processor: {e}")
+     # Fall back to a smaller processor
+     processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+     print("Using fallback processor")

- # Load the fine-tuned LoRA weights
+ # Load base model - use the smallest possible model for CPU
  try:
-     model = PeftModel.from_pretrained(base_model, "./model")
-     print("Successfully loaded fine-tuned LoRA weights")
+     # Try loading the smallest BLIP model
+     model = BlipForConditionalGeneration.from_pretrained(
+         "Salesforce/blip-image-captioning-base",
+         device_map=None,
+         torch_dtype=torch.float32  # Use float32 for CPU compatibility
+     )
+     print("Loaded base BLIP model")
  except Exception as e:
-     print(f"Error loading LoRA weights: {e}")
-     print("Continuing with base model only")
-     model = base_model
+     print(f"Error loading model: {e}")
+     # If that fails, retry the same checkpoint with reduced peak memory usage
+     model = BlipForConditionalGeneration.from_pretrained(
+         "Salesforce/blip-image-captioning-base",
+         device_map=None,
+         low_cpu_mem_usage=True
+     )
+     print("Loaded fallback model")

- # Move model to device if not using device_map
- if not hasattr(model, "hf_device_map"):
-     model = model.to(device)
+ # Move model to device
+ model = model.to(device)
+ print("Model loaded and ready")

- # Define the function to generate caption - exactly as in colab
+ # Define the function to generate caption
  def generate_caption(image):
-     # Convert image to RGB if needed
-     image = image.convert('RGB') if image.mode != 'RGB' else image
-
-     # Process the image exactly as in colab.py
-     inputs = processor(images=image, return_tensors="pt").to(device)
-
-     # Use fp32 instead of fp16 for CPU compatibility
-     dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-     pixel_values = inputs.pixel_values.to(dtype)
-
      try:
-         # Generate caption with the same parameters
-         generated_ids = model.generate(pixel_values=pixel_values, max_length=25)
+         # Convert image to RGB if needed
+         image = image.convert('RGB') if image.mode != 'RGB' else image
+
+         # Process the image
+         inputs = processor(images=image, return_tensors="pt").to(device)
+
+         # Generate caption
+         generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)

          # Decode the caption
          caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
@@ -78,8 +65,8 @@ iface = gr.Interface(
      fn=generate_caption,
      inputs=gr.Image(type="pil"),
      outputs="text",
-     title="Fine-tuned BLIP2 Image Caption Generator",
-     description="Upload an image to generate a caption using BLIP2 fine-tuned on Flickr8k with LoRA (r=16, alpha=32).",
+     title="Image Caption Generator",
+     description="Upload an image to generate a caption.",
      examples=["https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png"]
  )
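
Since the new app.py swaps both the model class and the preprocessing path, a quick local smoke test can help confirm the caption pipeline before serving the interface. The snippet below is a hypothetical addition, not part of the commit: it would be appended to app.py after the model setup (so `Image` and `generate_caption` are in scope) and assumes network access to the Gradio example image.

```python
# Hypothetical smoke test (not in this commit): append to app.py after the
# model setup, before launching the interface, to exercise generate_caption
# once. Assumes network access; reuses Image and generate_caption from app.py.
import io
import urllib.request

EXAMPLE_URL = (
    "https://huggingface.co/datasets/huggingface/documentation-images/"
    "resolve/main/beignets-task-guide.png"
)

with urllib.request.urlopen(EXAMPLE_URL) as resp:
    test_image = Image.open(io.BytesIO(resp.read()))

print("Smoke-test caption:", generate_caption(test_image))
```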
requirements.txt CHANGED
@@ -1,7 +1,4 @@
- torch>=2.0.0
- transformers>=4.31.0
- gradio>=3.40.0
- Pillow
- peft>=0.5.0
- safetensors
- accelerate>=0.25.0
+ torch>=1.10.0
+ transformers>=4.25.0
+ gradio>=3.20.0
+ Pillow
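
The trimmed requirements drop `peft`, `safetensors`, and `accelerate`, which the new app.py no longer needs. A small hypothetical check, not part of the commit, can confirm the remaining packages are installed in the Space's environment (assumes Python 3.8+ for `importlib.metadata`):

```python
# Hypothetical dependency check (not in this commit). Prints installed versions
# of the packages kept in requirements.txt; assumes Python 3.8+.
from importlib.metadata import PackageNotFoundError, version

for pkg in ("torch", "transformers", "gradio", "Pillow"):
    try:
        print(f"{pkg}: {version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed")
```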