README.md
CHANGED
@@ -9,4 +9,27 @@ app_file: app.py
 pinned: false
 ---
 
+# Fine-tuned BLIP2 Image Captioning
+
+This Hugging Face Space hosts a BLIP2 model that has been fine-tuned on the Flickr8k dataset using Low-Rank Adaptation (LoRA).
+
+## Model Details
+
+- Base model: `ybelkada/blip2-opt-2.7b-fp16-sharded`
+- Fine-tuning technique: LoRA (Low-Rank Adaptation)
+- Training dataset: Flickr8k
+- LoRA configuration:
+  - Rank (r): 16
+  - Alpha: 32
+  - Dropout: 0.05
+  - Target modules: q_proj, k_proj
+
+## Usage
+
+Upload an image to generate a caption. The model will process the image and return a descriptive caption based on its fine-tuned knowledge.
+
+## Notes
+
+The model uses 8-bit quantization to reduce memory usage while maintaining performance.
+
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
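
The LoRA configuration listed in the README above corresponds one-to-one to a `peft.LoraConfig`; the following is a minimal sketch using the same values that appear later in colab.py (the variable name `lora_config` is only illustrative):

from peft import LoraConfig

# LoRA settings described in the README and used in colab.py
lora_config = LoraConfig(
    r=16,                                 # rank of the low-rank update matrices
    lora_alpha=32,                        # scaling factor applied to the update
    lora_dropout=0.05,                    # dropout on the LoRA layers during training
    bias="none",                          # leave bias parameters untouched
    target_modules=["q_proj", "k_proj"],  # attention projections that receive adapters
)
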
app.py
CHANGED
@@ -1,19 +1,45 @@
 import gradio as gr
 from PIL import Image
 import torch
-from transformers import
+from transformers import AutoProcessor, Blip2ForConditionalGeneration
+from peft import PeftModel, LoraConfig
 
-#
-
-
+# LoRA configuration used during training:
+# config = LoraConfig(
+#     r=16,
+#     lora_alpha=32,
+#     lora_dropout=0.05,
+#     bias="none",
+#     target_modules=["q_proj", "k_proj"]
+# )
+
+# Load base model with the same configuration as in training
+base_model = Blip2ForConditionalGeneration.from_pretrained(
+    "ybelkada/blip2-opt-2.7b-fp16-sharded",
+    device_map="auto",
+    load_in_8bit=True
+)
+
+# Load the fine-tuned LoRA weights
+model = PeftModel.from_pretrained(base_model, "./model")
+
+# Load processor - use the same one as training
+processor = AutoProcessor.from_pretrained("./processor")
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-model = model.to(device)
 
-# Define the function to generate caption
+# Define the function to generate caption - exactly as in colab
 def generate_caption(image):
-
+    # Convert image to RGB if needed
+    image = image.convert('RGB') if image.mode != 'RGB' else image
+
+    # Process the image exactly as in colab.py
+    inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
+
+    # Generate caption with the same parameters
     generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
+
+    # Decode the caption
     caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
     return caption
 
@@ -22,8 +48,8 @@ iface = gr.Interface(
     fn=generate_caption,
     inputs=gr.Image(type="pil"),
     outputs="text",
-    title="Image Caption Generator",
-    description="Upload an image to generate a caption."
+    title="Fine-tuned BLIP2 Image Caption Generator",
+    description="Upload an image to generate a caption using BLIP2 fine-tuned on Flickr8k with LoRA (r=16, alpha=32)."
 )
 
 # Launch
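
A note on the loading code above: `load_in_8bit=True` relies on bitsandbytes, which requires a CUDA GPU, and the inputs are cast to `torch.float16`. If the Space could also be scheduled on CPU-only hardware, a more defensive variant of the same logic might look like this sketch (the `use_cuda` flag and the float32 fallback are assumptions, not part of the diff; the model, adapter, and processor paths mirror app.py):

import torch
from transformers import AutoProcessor, Blip2ForConditionalGeneration
from peft import PeftModel

use_cuda = torch.cuda.is_available()

# 8-bit quantization (bitsandbytes) needs a CUDA GPU; fall back to full precision on CPU.
base_model = Blip2ForConditionalGeneration.from_pretrained(
    "ybelkada/blip2-opt-2.7b-fp16-sharded",
    device_map="auto" if use_cuda else None,
    load_in_8bit=use_cuda,
)
model = PeftModel.from_pretrained(base_model, "./model")
processor = AutoProcessor.from_pretrained("./processor")

def generate_caption(image):
    # Match the input dtype/device to how the model was loaded.
    device = "cuda" if use_cuda else "cpu"
    dtype = torch.float16 if use_cuda else torch.float32
    inputs = processor(images=image.convert("RGB"), return_tensors="pt").to(device, dtype)
    generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
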
colab.py
ADDED
@@ -0,0 +1,299 @@
+# Create image # Install libraries
+# !pip install -q git+https://github.com/huggingface/peft.git transformers bitsandbytes datasets
+
+# # Fix fsspec version mismatch (if needed)
+# !pip install fsspec==2025.3.0
+
+# from google.colab import files
+# files.upload()  # Upload kaggle.json here
+# # Move kaggle.json to the right location
+# !mkdir -p ~/.kaggle
+# !cp kaggle.json ~/.kaggle/
+# !chmod 600 ~/.kaggle/kaggle.json
+
+
+# # Download the dataset
+# !kaggle datasets download -d adityajn105/flickr8k --force
+
+# # Unzip it
+# !unzip -q flickr8k.zip -d flickr8k
+
+# DATASET_PATH = '/content/flickr8k'
+# CAPTIONS_FILE = os.path.join(DATASET_PATH, 'captions.txt')
+# IMAGES_PATH = os.path.join(DATASET_PATH, 'Images/')
+# import os
+# import pandas as pd
+# from PIL import Image
+# from torch.utils.data import Dataset, DataLoader
+# import torch
+# # Load and process captions
+# df = pd.read_csv(CAPTIONS_FILE, sep=',', names=["image", "caption"])
+
+# df["caption"] = df["caption"][1:]
+# df["caption"]
+# df = df.dropna().reset_index(drop=True)
+# df
+# df = df[:8000]
+
+# from transformers import AutoProcessor
+# from PIL import Image
+
+# processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
+
+# class Flickr8kDataset(Dataset):
+#     def __init__(self, dataframe, image_dir, processor):
+#         self.dataframe = dataframe
+#         self.image_dir = image_dir
+#         self.processor = processor
+
+#     def __len__(self):
+#         return len(self.dataframe)
+
+#     def __getitem__(self, idx):
+#         row = self.dataframe.iloc[idx]
+#         image_path = os.path.join(self.image_dir, row["image"])
+#         caption = row["caption"]
+
+#         # Load image
+#         image = Image.open(image_path).convert('RGB')
+
+#         # Process image
+#         encoding = self.processor(images=image, return_tensors="pt")
+#         encoding = {k: v.squeeze() for k, v in encoding.items()}
+#         encoding["text"] = caption
+
+#         return encoding
+
+# def collate_fn(batch):
+#     processed_batch = {}
+#     for key in batch[0].keys():
+#         if key != "text":
+#             processed_batch[key] = torch.stack([example[key] for example in batch])
+#         else:
+#             text_inputs = processor.tokenizer(
+#                 [example["text"] for example in batch], padding=True, return_tensors="pt"
+#             )
+#             processed_batch["input_ids"] = text_inputs["input_ids"]
+#             processed_batch["attention_mask"] = text_inputs["attention_mask"]
+#     return processed_batch
+
+# from transformers import Blip2ForConditionalGeneration
+# from peft import LoraConfig, get_peft_model
+
+# model = Blip2ForConditionalGeneration.from_pretrained(
+#     "ybelkada/blip2-opt-2.7b-fp16-sharded",
+#     device_map="auto",
+#     load_in_8bit=True
+# )
+
+# # Apply LoRA
+# config = LoraConfig(
+#     r=16,
+#     lora_alpha=32,
+#     lora_dropout=0.05,
+#     bias="none",
+#     target_modules=["q_proj", "k_proj"]
+# )
+# model = get_peft_model(model, config)
+# model.print_trainable_parameters()
+
+# # Load dataset
+# train_dataset = Flickr8kDataset(df, IMAGES_PATH, processor)
+# train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=3, collate_fn=collate_fn)
+
+# # Set up optimizer
+# optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
+# device = "cuda" if torch.cuda.is_available() else "cpu"
+# model.train()
+
+# # Training loop
+# for epoch in range(1):  # Use small epochs for testing, increase later
+#     print(f"Epoch: {epoch}")
+#     for idx, batch in enumerate(train_dataloader):
+#         input_ids = batch.pop("input_ids").to(device)
+#         pixel_values = batch.pop("pixel_values").to(device, torch.float16)
+
+#         outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
+#         loss = outputs.loss
+
+#         print(f"Batch {idx} Loss: {loss.item():.4f}")
+
+#         loss.backward()
+#         optimizer.step()
+#         optimizer.zero_grad()
+
+# # Example prediction
+# sample_image = Image.open(os.path.join(IMAGES_PATH, df.iloc[0]["image"])).convert('RGB')
+# inputs = processor(images=sample_image, return_tensors="pt").to(device, torch.float16)
+# generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
+# caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+# print("Generated caption:", caption)
+
+# import matplotlib.pyplot as plt
+
+# # Show the sample image with the generated caption
+# plt.figure(figsize=(6,6))
+# plt.imshow(sample_image)
+# plt.axis("off")
+# plt.title(f"Generated caption:\n{caption}", fontsize=12)
+# plt.show()
+
+
+
+
+# # Load a sample image
+# sample_image = Image.open(os.path.join(IMAGES_PATH, df.iloc[15]["image"])).convert('RGB')
+
+# # Prepare inputs
+# inputs = processor(images=sample_image, return_tensors="pt").to(device, torch.float16)
+# generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
+
+# # Decode caption
+# caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+# print("Generated caption:", caption)
+
+# # Show image with caption
+# import matplotlib.pyplot as plt
+
+# plt.figure(figsize=(6,6))
+# plt.imshow(sample_image)
+# plt.axis("off")
+# plt.title(f"Generated caption:\n{caption}", fontsize=12)
+# plt.show()
+
+# from PIL import Image
+# import matplotlib.pyplot as plt
+# import torch
+# import io
+# from google.colab import files  # Only for Colab
+
+# # Upload image
+# uploaded = files.upload()
+
+# # Get the uploaded file
+# for filename in uploaded.keys():
+#     image_path = filename
+
+# # Load the image
+# sample_image = Image.open(image_path).convert('RGB')
+
+# # Prepare inputs
+# inputs = processor(images=sample_image, return_tensors="pt").to(device, torch.float16)
+
+# # Generate caption
+# generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
+
+# # Decode caption
+# caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+# print("Generated caption:", caption)
+
+# # Show image with caption
+# plt.figure(figsize=(6,6))
+# plt.imshow(sample_image)
+# plt.axis("off")
+# plt.title(f"Generated caption:\n{caption}", fontsize=12)
+# plt.show()
+
+# !pip install evaluate pycocoevalcap --quiet
+
+# import evaluate
+# from tqdm import tqdm
+# from PIL import Image
+# import torch
+# import os
+
+# # Load metrics
+# bleu = evaluate.load("bleu")
+
+
+
+# df = pd.read_csv(CAPTIONS_FILE, sep=',', names=["image", "caption"])
+
+# # Subset of data
+# subset_df = df[8001:8092].reset_index(drop=True)
+
+# # Prepare references and predictions
+# references = {}
+# predictions = []
+
+# for idx in tqdm(range(len(subset_df))):
+#     image_name = subset_df.iloc[idx]['image']
+
+#     # Load image
+#     image_path = os.path.join(IMAGES_PATH, image_name)
+#     image = Image.open(image_path).convert('RGB')
+
+#     # Generate caption
+#     inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
+#     generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
+#     predicted_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+#     # Save prediction
+#     predictions.append(predicted_caption)
+
+#     # Prepare ground-truth references
+#     if image_name not in references:
+#         gt = df[df['image'] == image_name]['caption'].tolist()
+#         references[image_name] = gt
+
+# # Build reference and prediction lists for scoring
+# gt_list = [references[name] for name in subset_df['image']]
+# pred_list = predictions
+
+# import evaluate
+# from tqdm import tqdm
+# from PIL import Image
+# from pycocoevalcap.cider.cider import Cider
+# from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
+
+# bleu = evaluate.load("bleu")
+# df = pd.read_csv(CAPTIONS_FILE, sep=',', names=["image", "caption"])
+
+# # Get subset
+# subset_df = df[8001:8092].reset_index(drop=True)
+
+# # Collect predictions and references
+# predictions = []
+# references = {}
+
+# for idx in tqdm(range(len(subset_df))):
+#     row = subset_df.iloc[idx]
+#     image_name = row['image']
+
+#     image_path = os.path.join(IMAGES_PATH, image_name)
+#     image = Image.open(image_path).convert('RGB')
+
+#     inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
+#     generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
+#     caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+#     predictions.append(caption)
+
+#     if image_name not in references:
+#         refs = df[df['image'] == image_name]['caption'].tolist()
+#         references[image_name] = refs
+
+# # Prepare for BLEU
+# gt_list = [references[name] for name in subset_df["image"]]
+# pred_list = predictions
+
+# bleu_score = bleu.compute(predictions=pred_list, references=gt_list)
+# print("BLEU:", bleu_score)
+
+# # Prepare COCO-style input
+# gts = {}
+# res = {}
+
+# for i, img in enumerate(subset_df["image"]):
+#     gts[str(i)] = [{"caption": cap} for cap in references[img]]
+#     res[str(i)] = [{"caption": predictions[i]}]
+
+# # Tokenize
+# tokenizer = PTBTokenizer()
+# gts_tokenized = tokenizer.tokenize(gts)
+# res_tokenized = tokenizer.tokenize(res)
+
+# # Compute CIDEr
+# cider_scorer = Cider()
+# cider_score, _ = cider_scorer.compute_score(gts_tokenized, res_tokenized)
+# print("CIDEr:", cider_score)
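
colab.py ends after the BLEU/CIDEr evaluation, while app.py loads the LoRA adapter from ./model and the processor from ./processor. The save step does not appear anywhere in this diff; presumably it looked roughly like the sketch below (the paths simply mirror what app.py expects, and `model`/`processor` are the objects created in the notebook cells above):

# Assumed follow-up to the notebook: persist the artifacts that app.py loads.
# `model` is the PEFT-wrapped BLIP2 model, `processor` the AutoProcessor from colab.py.
model.save_pretrained("./model")          # writes the LoRA adapter weights and adapter_config.json
processor.save_pretrained("./processor")  # writes the image processor / tokenizer files
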
requirements.txt
CHANGED
@@ -1,4 +1,7 @@
 torch
-transformers
+transformers>=4.30.0
 gradio
-Pillow
+Pillow
+peft
+bitsandbytes
+accelerate