# # BLIP-2 image captioning: LoRA fine-tuning on Flickr8k
# # Install libraries
# !pip install -q git+https://github.com/huggingface/peft.git transformers accelerate bitsandbytes datasets
# # Fix fsspec version mismatch (if needed)
# !pip install fsspec==2025.3.0
# from google.colab import files
# files.upload() # Upload kaggle.json here
# # Move kaggle.json to the right location
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json
# # Download the dataset
# !kaggle datasets download -d adityajn105/flickr8k --force
# # Unzip it
# !unzip -q flickr8k.zip -d flickr8k
# import os
# import pandas as pd
# from PIL import Image
# from torch.utils.data import Dataset, DataLoader
# import torch
# # Dataset locations inside the Colab runtime
# DATASET_PATH = '/content/flickr8k'
# CAPTIONS_FILE = os.path.join(DATASET_PATH, 'captions.txt')
# IMAGES_PATH = os.path.join(DATASET_PATH, 'Images/')
# # Load the captions file (it ships with a header row: image,caption)
# df = pd.read_csv(CAPTIONS_FILE)
# df = df.dropna().reset_index(drop=True)
# # Keep the first 8000 caption rows for training
# df = df[:8000]
# from transformers import AutoProcessor
# from PIL import Image
# processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
# class Flickr8kDataset(Dataset):
#     def __init__(self, dataframe, image_dir, processor):
#         self.dataframe = dataframe
#         self.image_dir = image_dir
#         self.processor = processor
#     def __len__(self):
#         return len(self.dataframe)
#     def __getitem__(self, idx):
#         row = self.dataframe.iloc[idx]
#         image_path = os.path.join(self.image_dir, row["image"])
#         caption = row["caption"]
#         # Load image
#         image = Image.open(image_path).convert('RGB')
#         # Process image
#         encoding = self.processor(images=image, return_tensors="pt")
#         encoding = {k: v.squeeze() for k, v in encoding.items()}
#         encoding["text"] = caption
#         return encoding
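# # Quick sanity check of the dataset class (an illustrative sketch, not from the original notebook;
# # the variable names below are made up): build one item and inspect what __getitem__ returns.
# sample_ds = Flickr8kDataset(df, IMAGES_PATH, processor)
# item = sample_ds[0]
# print(item["pixel_values"].shape)  # processed image tensor, e.g. (3, 224, 224) for the BLIP-2 image processor
# print(item["text"])                # the raw caption string, tokenized later in collate_fn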
# def collate_fn(batch):
#     processed_batch = {}
#     for key in batch[0].keys():
#         if key != "text":
#             processed_batch[key] = torch.stack([example[key] for example in batch])
#         else:
#             text_inputs = processor.tokenizer(
#                 [example["text"] for example in batch], padding=True, return_tensors="pt"
#             )
#             processed_batch["input_ids"] = text_inputs["input_ids"]
#             processed_batch["attention_mask"] = text_inputs["attention_mask"]
#     return processed_batch
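# # Note on the collate function: image tensors are stacked as-is, while the raw caption strings are
# # tokenized per batch with padding=True, so each batch is padded only to its own longest caption
# # (dynamic padding) rather than to a fixed global length.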
# from transformers import Blip2ForConditionalGeneration
# from peft import LoraConfig, get_peft_model
# model = Blip2ForConditionalGeneration.from_pretrained(
#     "ybelkada/blip2-opt-2.7b-fp16-sharded",
#     device_map="auto",
#     load_in_8bit=True
# )
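# # Optional preparation step, a hedged sketch rather than part of the original run: the PEFT docs
# # recommend prepare_model_for_kbit_training() before attaching adapters to an 8-bit model, and
# # newer transformers releases prefer quantization_config=BitsAndBytesConfig(load_in_8bit=True)
# # over the bare load_in_8bit flag.
# from peft import prepare_model_for_kbit_training
# model = prepare_model_for_kbit_training(model)  # e.g. casts layer norms to fp32 and enables input gradients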
# # Apply LoRA
# config = LoraConfig(
#     r=16,
#     lora_alpha=32,
#     lora_dropout=0.05,
#     bias="none",
#     target_modules=["q_proj", "k_proj"]
# )
# model = get_peft_model(model, config)
# model.print_trainable_parameters()
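# # With target_modules limited to q_proj and k_proj, the adapters attach only to the OPT language
# # model's attention projections, so print_trainable_parameters() should report a small fraction
# # (well under 1%) of the ~2.7B weights as trainable; exact counts depend on the installed peft version.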
# # Load dataset
# train_dataset = Flickr8kDataset(df, IMAGES_PATH, processor)
# train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=3, collate_fn=collate_fn)
# # Set up optimizer
# optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.train()
# # Training loop
# for epoch in range(1):  # Use small epochs for testing, increase later
#     print(f"Epoch: {epoch}")
#     for idx, batch in enumerate(train_dataloader):
#         input_ids = batch.pop("input_ids").to(device)
#         pixel_values = batch.pop("pixel_values").to(device, torch.float16)
#         # The caption tokens serve as both input and labels; the model computes the LM loss internally
#         outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
#         loss = outputs.loss
#         print(f"Batch {idx} Loss: {loss.item():.4f}")
#         loss.backward()
#         optimizer.step()
#         optimizer.zero_grad()
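# # After training, the LoRA weights can be persisted; a minimal sketch (the output directory name
# # "blip2-flickr8k-lora" is made up here, not from the original run):
# model.save_pretrained("blip2-flickr8k-lora")  # writes only the small adapter weights, not the 2.7B base model
# # To reuse them later, reload the quantized base checkpoint and attach the adapter with
# # peft.PeftModel.from_pretrained(base_model, "blip2-flickr8k-lora").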
# # Example prediction
# sample_image = Image.open(os.path.join(IMAGES_PATH, df.iloc[0]["image"])).convert('RGB')
# inputs = processor(images=sample_image, return_tensors="pt").to(device, torch.float16)
# generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
# caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
# print("Generated caption:", caption)
# import matplotlib.pyplot as plt
# # Show the sample image with the generated caption
# plt.figure(figsize=(6,6))
# plt.imshow(sample_image)
# plt.axis("off")
# plt.title(f"Generated caption:\n{caption}", fontsize=12)
# plt.show()
# # Load a sample image
# sample_image = Image.open(os.path.join(IMAGES_PATH, df.iloc[15]["image"])).convert('RGB')
# # Prepare inputs
# inputs = processor(images=sample_image, return_tensors="pt").to(device, torch.float16)
# generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
# # Decode caption
# caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
# print("Generated caption:", caption)
# # Show image with caption
# import matplotlib.pyplot as plt
# plt.figure(figsize=(6,6))
# plt.imshow(sample_image)
# plt.axis("off")
# plt.title(f"Generated caption:\n{caption}", fontsize=12)
# plt.show()
# from PIL import Image
# import matplotlib.pyplot as plt
# import torch
# import io
# from google.colab import files # Only for Colab
# # Upload image
# uploaded = files.upload()
# # Get the uploaded file
# for filename in uploaded.keys():
#     image_path = filename
# # Load the image
# sample_image = Image.open(image_path).convert('RGB')
# # Prepare inputs
# inputs = processor(images=sample_image, return_tensors="pt").to(device, torch.float16)
# # Generate caption
# generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
# # Decode caption
# caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
# print("Generated caption:", caption)
# # Show image with caption
# plt.figure(figsize=(6,6))
# plt.imshow(sample_image)
# plt.axis("off")
# plt.title(f"Generated caption:\n{caption}", fontsize=12)
# plt.show()
# !pip install evaluate pycocoevalcap --quiet
# import evaluate
# from tqdm import tqdm
# from PIL import Image
# import torch
# import os
# from pycocoevalcap.cider.cider import Cider
# from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
# # Load the BLEU metric and reload the full caption file (header-based read, all caption rows)
# bleu = evaluate.load("bleu")
# df = pd.read_csv(CAPTIONS_FILE)
# # Held-out subset: 91 caption rows immediately after the 8000 used for training
# subset_df = df[8000:8091].reset_index(drop=True)
# # Collect predictions and references
# predictions = []
# references = {}
# for idx in tqdm(range(len(subset_df))):
#     row = subset_df.iloc[idx]
#     image_name = row['image']
#     image_path = os.path.join(IMAGES_PATH, image_name)
#     image = Image.open(image_path).convert('RGB')
#     inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
#     generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
#     caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
#     predictions.append(caption)
#     if image_name not in references:
#         refs = df[df['image'] == image_name]['caption'].tolist()
#         references[image_name] = refs
# # Prepare for BLEU
# gt_list = [references[name] for name in subset_df["image"]]
# pred_list = predictions
# bleu_score = bleu.compute(predictions=pred_list, references=gt_list)
# print("BLEU:", bleu_score)
# # Prepare COCO-style input
# gts = {}
# res = {}
# for i, img in enumerate(subset_df["image"]):
#     gts[str(i)] = [{"caption": cap} for cap in references[img]]
#     res[str(i)] = [{"caption": predictions[i]}]
# # Tokenize
# tokenizer = PTBTokenizer()
# gts_tokenized = tokenizer.tokenize(gts)
# res_tokenized = tokenizer.tokenize(res)
# # Compute CIDEr
# cider_scorer = Cider()
# cider_score, _ = cider_scorer.compute_score(gts_tokenized, res_tokenized)
# print("CIDEr:", cider_score)