# Create image captions: fine-tune BLIP-2 with LoRA on Flickr8k

# Install libraries
!pip install -q git+https://github.com/huggingface/peft.git transformers bitsandbytes datasets

# Fix fsspec version mismatch (if needed)
!pip install fsspec==2025.3.0

# Upload kaggle.json here
from google.colab import files
files.upload()

# Move kaggle.json to the right location
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the dataset
!kaggle datasets download -d adityajn105/flickr8k --force

# Unzip it
!unzip -q flickr8k.zip -d flickr8k

import os
import pandas as pd
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader

# Dataset locations (os must be imported before these paths are built)
DATASET_PATH = '/content/flickr8k'
CAPTIONS_FILE = os.path.join(DATASET_PATH, 'captions.txt')
IMAGES_PATH = os.path.join(DATASET_PATH, 'Images/')

# Load and process captions
df = pd.read_csv(CAPTIONS_FILE, sep=',', names=["image", "caption"])
df["caption"] = df["caption"][1:]        # the "image,caption" header row was read as data; blank it out
df = df.dropna().reset_index(drop=True)  # then drop it
df = df[:8000]                           # keep the first 8,000 caption rows for training

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")

class Flickr8kDataset(Dataset):
    def __init__(self, dataframe, image_dir, processor):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.processor = processor

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        image_path = os.path.join(self.image_dir, row["image"])
        caption = row["caption"]
        # Load image
        image = Image.open(image_path).convert('RGB')
        # Process image
        encoding = self.processor(images=image, return_tensors="pt")
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        encoding["text"] = caption
        return encoding

def collate_fn(batch):
    processed_batch = {}
    for key in batch[0].keys():
        if key != "text":
            processed_batch[key] = torch.stack([example[key] for example in batch])
        else:
            text_inputs = processor.tokenizer(
                [example["text"] for example in batch], padding=True, return_tensors="pt"
            )
            processed_batch["input_ids"] = text_inputs["input_ids"]
            processed_batch["attention_mask"] = text_inputs["attention_mask"]
    return processed_batch

from transformers import Blip2ForConditionalGeneration
from peft import LoraConfig, get_peft_model

model = Blip2ForConditionalGeneration.from_pretrained(
    "ybelkada/blip2-opt-2.7b-fp16-sharded",
    device_map="auto",
    load_in_8bit=True
)

# Apply LoRA
config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj"]
)
model = get_peft_model(model, config)
model.print_trainable_parameters()

# Load dataset
train_dataset = Flickr8kDataset(df, IMAGES_PATH, processor)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=3, collate_fn=collate_fn)

# Set up optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.train()

# Training loop
for epoch in range(1):  # use a small number of epochs for testing, increase later
    print(f"Epoch: {epoch}")
    for idx, batch in enumerate(train_dataloader):
        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device, torch.float16)
        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
        loss = outputs.loss
        print(f"Batch {idx} Loss: {loss.item():.4f}")
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
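# --- Optional additions (not in the original notebook) ---
# After training it is usually worth persisting the LoRA adapter and switching the model to
# inference mode before generating captions. This is a minimal sketch using the standard PEFT
# API; ADAPTER_DIR is a hypothetical path, adjust as needed.
ADAPTER_DIR = "blip2-flickr8k-lora"
model.save_pretrained(ADAPTER_DIR)  # stores only the small adapter weights, not the 2.7B base model
model.eval()                        # disable LoRA dropout for the generation cells below

# To reuse the adapter in a fresh session, reload the base model and attach the saved weights:
# from peft import PeftModel
# base = Blip2ForConditionalGeneration.from_pretrained(
#     "ybelkada/blip2-opt-2.7b-fp16-sharded", device_map="auto", load_in_8bit=True
# )
# model = PeftModel.from_pretrained(base, ADAPTER_DIR)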
# Example prediction
sample_image = Image.open(os.path.join(IMAGES_PATH, df.iloc[0]["image"])).convert('RGB')
inputs = processor(images=sample_image, return_tensors="pt").to(device, torch.float16)
generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("Generated caption:", caption)

import matplotlib.pyplot as plt

# Show the sample image with the generated caption
plt.figure(figsize=(6, 6))
plt.imshow(sample_image)
plt.axis("off")
plt.title(f"Generated caption:\n{caption}", fontsize=12)
plt.show()

# Load another sample image
sample_image = Image.open(os.path.join(IMAGES_PATH, df.iloc[15]["image"])).convert('RGB')

# Prepare inputs
inputs = processor(images=sample_image, return_tensors="pt").to(device, torch.float16)
generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)

# Decode caption
caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("Generated caption:", caption)

# Show image with caption
plt.figure(figsize=(6, 6))
plt.imshow(sample_image)
plt.axis("off")
plt.title(f"Generated caption:\n{caption}", fontsize=12)
plt.show()

# Caption an uploaded image
from PIL import Image
import matplotlib.pyplot as plt
import torch
from google.colab import files  # only for Colab

# Upload image
uploaded = files.upload()

# Get the uploaded file (if several were uploaded, the last one is used)
for filename in uploaded.keys():
    image_path = filename

# Load the image
sample_image = Image.open(image_path).convert('RGB')

# Prepare inputs
inputs = processor(images=sample_image, return_tensors="pt").to(device, torch.float16)

# Generate caption
generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)

# Decode caption
caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("Generated caption:", caption)

# Show image with caption
plt.figure(figsize=(6, 6))
plt.imshow(sample_image)
plt.axis("off")
plt.title(f"Generated caption:\n{caption}", fontsize=12)
plt.show()

# Evaluation: generate captions for a held-out subset
!pip install evaluate pycocoevalcap --quiet

import evaluate
from tqdm import tqdm
from PIL import Image
import torch
import os

# Load metrics
bleu = evaluate.load("bleu")

# Reload the full caption file (the training dataframe was truncated to 8,000 rows)
df = pd.read_csv(CAPTIONS_FILE, sep=',', names=["image", "caption"])

# Subset of caption rows not used for training
subset_df = df[8001:8092].reset_index(drop=True)

# Prepare references and predictions
references = {}
predictions = []

for idx in tqdm(range(len(subset_df))):
    image_name = subset_df.iloc[idx]['image']

    # Load image
    image_path = os.path.join(IMAGES_PATH, image_name)
    image = Image.open(image_path).convert('RGB')

    # Generate caption
    inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
    generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
    predicted_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Save prediction
    predictions.append(predicted_caption)

    # Prepare ground-truth references
    if image_name not in references:
        gt = df[df['image'] == image_name]['caption'].tolist()
        references[image_name] = gt

# Build reference and prediction lists for scoring
gt_list = [references[name] for name in subset_df['image']]
pred_list = predictions
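# --- Optional sanity check (not in the original): eyeball a few predictions before scoring ---
# Printing a handful of generated captions next to their ground-truth references makes it
# easier to judge whether the corpus-level metrics computed below are plausible.
for i in range(min(3, len(pred_list))):
    print(f"Image      : {subset_df.iloc[i]['image']}")
    print(f"Prediction : {pred_list[i]}")
    print(f"References : {gt_list[i]}")
    print("-" * 60)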
# Evaluation: score the held-out subset with BLEU and CIDEr
# (this cell re-runs the captioning pass from scratch, then computes the metrics)
import evaluate
from tqdm import tqdm
from PIL import Image
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

bleu = evaluate.load("bleu")
df = pd.read_csv(CAPTIONS_FILE, sep=',', names=["image", "caption"])

# Get subset
subset_df = df[8001:8092].reset_index(drop=True)

# Collect predictions and references
predictions = []
references = {}

for idx in tqdm(range(len(subset_df))):
    row = subset_df.iloc[idx]
    image_name = row['image']

    image_path = os.path.join(IMAGES_PATH, image_name)
    image = Image.open(image_path).convert('RGB')

    inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
    generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
    caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    predictions.append(caption)

    if image_name not in references:
        refs = df[df['image'] == image_name]['caption'].tolist()
        references[image_name] = refs

# Prepare for BLEU (each prediction is scored against all references for its image)
gt_list = [references[name] for name in subset_df["image"]]
pred_list = predictions

bleu_score = bleu.compute(predictions=pred_list, references=gt_list)
print("BLEU:", bleu_score)

# Prepare COCO-style input
gts = {}
res = {}
for i, img in enumerate(subset_df["image"]):
    gts[str(i)] = [{"caption": cap} for cap in references[img]]
    res[str(i)] = [{"caption": predictions[i]}]

# Tokenize
tokenizer = PTBTokenizer()
gts_tokenized = tokenizer.tokenize(gts)
res_tokenized = tokenizer.tokenize(res)

# Compute CIDEr
cider_scorer = Cider()
cider_score, _ = cider_scorer.compute_score(gts_tokenized, res_tokenized)
print("CIDEr:", cider_score)
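# --- Optional (not in the original): persist the evaluation results ---
# A minimal sketch that writes the corpus-level scores and per-image predictions to a JSON
# file so different fine-tuning runs can be compared later; the output filename is arbitrary.
import json

results = {
    "bleu": bleu_score,           # full dict returned by the evaluate BLEU metric
    "cider": float(cider_score),  # corpus-level CIDEr score
    "samples": [
        {"image": img, "prediction": pred, "references": references[img]}
        for img, pred in zip(subset_df["image"].tolist(), predictions)
    ],
}
with open("flickr8k_blip2_lora_eval.json", "w") as f:
    json.dump(results, f, indent=2)
print("Saved evaluation results to flickr8k_blip2_lora_eval.json")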