# Install libraries
!pip install -q git+https://github.com/huggingface/peft.git transformers bitsandbytes datasets

# Fix fsspec version mismatch (if needed)
!pip install fsspec==2025.3.0

# Upload Kaggle API credentials
from google.colab import files
files.upload()  # Upload kaggle.json here

# Move kaggle.json to the location the Kaggle CLI expects
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download and unzip the Flickr8k dataset
!kaggle datasets download -d adityajn105/flickr8k --force
!unzip -q flickr8k.zip -d flickr8k

import os
import pandas as pd
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader

DATASET_PATH = '/content/flickr8k'
CAPTIONS_FILE = os.path.join(DATASET_PATH, 'captions.txt')
IMAGES_PATH = os.path.join(DATASET_PATH, 'Images/')

# Load and process captions (captions.txt already has an "image,caption" header row)
df = pd.read_csv(CAPTIONS_FILE)
df = df.dropna().reset_index(drop=True)

# Use the first 8000 caption rows for training
df = df[:8000]
from transformers import AutoProcessor

# The BLIP-2 processor handles both image preprocessing and caption tokenization
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
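# Quick check (an addition to this walkthrough, not in the original notebook):
# run one image through the processor to see what it produces. For BLIP-2's ViT
# backbone this is typically a (1, 3, 224, 224) pixel_values tensor, but the exact
# size comes from the processor config rather than being guaranteed here.
example_image = Image.open(os.path.join(IMAGES_PATH, df.iloc[0]["image"])).convert('RGB')
example_encoding = processor(images=example_image, return_tensors="pt")
print({k: tuple(v.shape) for k, v in example_encoding.items()})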
class Flickr8kDataset(Dataset):
    """Pairs each Flickr8k image with one of its reference captions."""

    def __init__(self, dataframe, image_dir, processor):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.processor = processor

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        image_path = os.path.join(self.image_dir, row["image"])
        caption = row["caption"]
        # Load and preprocess the image
        image = Image.open(image_path).convert('RGB')
        encoding = self.processor(images=image, return_tensors="pt")
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        # Keep the raw caption string; tokenization happens in collate_fn
        encoding["text"] = caption
        return encoding
def collate_fn(batch):
    processed_batch = {}
    for key in batch[0].keys():
        if key != "text":
            # Stack image tensors (pixel_values) into a single batch tensor
            processed_batch[key] = torch.stack([example[key] for example in batch])
        else:
            # Tokenize the captions together so they are padded to one length
            text_inputs = processor.tokenizer(
                [example["text"] for example in batch], padding=True, return_tensors="pt"
            )
            processed_batch["input_ids"] = text_inputs["input_ids"]
            processed_batch["attention_mask"] = text_inputs["attention_mask"]
    return processed_batch
from transformers import Blip2ForConditionalGeneration
from peft import LoraConfig, get_peft_model

# Load BLIP-2 (OPT-2.7B) in 8-bit so it fits on a single Colab GPU.
# Note: recent transformers releases prefer quantization_config=BitsAndBytesConfig(load_in_8bit=True).
model = Blip2ForConditionalGeneration.from_pretrained(
    "ybelkada/blip2-opt-2.7b-fp16-sharded",
    device_map="auto",
    load_in_8bit=True
)

# Apply LoRA to the attention query and key projections
config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj"]
)
model = get_peft_model(model, config)
model.print_trainable_parameters()
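# Optional sanity check (an addition, not from the original notebook): confirm that
# LoRA adapters were actually injected into the targeted q_proj/k_proj layers.
lora_layers = [name for name, _ in model.named_modules() if "lora_A" in name]
print(f"{len(lora_layers)} LoRA modules injected, e.g.: {lora_layers[:3]}")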
# Build the training dataset and dataloader
train_dataset = Flickr8kDataset(df, IMAGES_PATH, processor)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=3, collate_fn=collate_fn)

# Set up the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
device = "cuda" if torch.cuda.is_available() else "cpu"
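# Optional sanity check (an addition, not from the original notebook): pull one batch
# from the dataloader and confirm the expected keys and tensor shapes before
# committing to a long training run.
sample_batch = next(iter(train_dataloader))
for k, v in sample_batch.items():
    print(k, tuple(v.shape))  # expected keys: pixel_values, input_ids, attention_mask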
model.train()

# Training loop
for epoch in range(1):  # Use a small number of epochs for testing, increase later
    print(f"Epoch: {epoch}")
    for idx, batch in enumerate(train_dataloader):
        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device, torch.float16)

        # The caption tokens double as labels (causal language-modeling loss)
        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
        loss = outputs.loss
        print(f"Batch {idx} Loss: {loss.item():.4f}")

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
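# Optional (an addition, not from the original notebook): persist the LoRA adapter so
# the fine-tuning does not have to be repeated. The directory name is arbitrary, and
# PEFT's save_pretrained writes only the small adapter weights, not the full model.
model.save_pretrained("blip2-lora-flickr8k")

# The adapter can later be re-attached to a freshly loaded base model, roughly:
# from peft import PeftModel
# base = Blip2ForConditionalGeneration.from_pretrained(
#     "ybelkada/blip2-opt-2.7b-fp16-sharded", device_map="auto", load_in_8bit=True
# )
# model = PeftModel.from_pretrained(base, "blip2-lora-flickr8k")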
# Example prediction on a training image
sample_image = Image.open(os.path.join(IMAGES_PATH, df.iloc[0]["image"])).convert('RGB')
inputs = processor(images=sample_image, return_tensors="pt").to(device, torch.float16)
generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("Generated caption:", caption)

import matplotlib.pyplot as plt

# Show the sample image with the generated caption
plt.figure(figsize=(6, 6))
plt.imshow(sample_image)
plt.axis("off")
plt.title(f"Generated caption:\n{caption}", fontsize=12)
plt.show()
# Repeat on another sample image
sample_image = Image.open(os.path.join(IMAGES_PATH, df.iloc[15]["image"])).convert('RGB')
inputs = processor(images=sample_image, return_tensors="pt").to(device, torch.float16)
generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("Generated caption:", caption)

plt.figure(figsize=(6, 6))
plt.imshow(sample_image)
plt.axis("off")
plt.title(f"Generated caption:\n{caption}", fontsize=12)
plt.show()
# Caption a user-uploaded image (Colab only)
from google.colab import files

uploaded = files.upload()

# Take the name of the uploaded file
for filename in uploaded.keys():
    image_path = filename

# Load the image and generate a caption
sample_image = Image.open(image_path).convert('RGB')
inputs = processor(images=sample_image, return_tensors="pt").to(device, torch.float16)
generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("Generated caption:", caption)

# Show the image with its caption
plt.figure(figsize=(6, 6))
plt.imshow(sample_image)
plt.axis("off")
plt.title(f"Generated caption:\n{caption}", fontsize=12)
plt.show()
!pip install evaluate pycocoevalcap --quiet
import evaluate
from tqdm import tqdm
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

# Load the BLEU metric
bleu = evaluate.load("bleu")

# Reload the full caption file and evaluate on rows that were not used for training
df = pd.read_csv(CAPTIONS_FILE)
subset_df = df[8001:8092].reset_index(drop=True)

# Collect model predictions and ground-truth references
predictions = []
references = {}

for idx in tqdm(range(len(subset_df))):
    row = subset_df.iloc[idx]
    image_name = row['image']

    # Generate a caption for this image
    image_path = os.path.join(IMAGES_PATH, image_name)
    image = Image.open(image_path).convert('RGB')
    inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
    generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
    caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    predictions.append(caption)

    # Collect all reference captions for this image (Flickr8k provides five per image)
    if image_name not in references:
        refs = df[df['image'] == image_name]['caption'].tolist()
        references[image_name] = refs

# Build reference and prediction lists for scoring
gt_list = [references[name] for name in subset_df["image"]]
pred_list = predictions
bleu_score = bleu.compute(predictions=pred_list, references=gt_list)
print("BLEU:", bleu_score)

# Prepare COCO-style input for the CIDEr scorer
gts = {}
res = {}
for i, img in enumerate(subset_df["image"]):
    gts[str(i)] = [{"caption": cap} for cap in references[img]]
    res[str(i)] = [{"caption": predictions[i]}]

# Tokenize with the PTB tokenizer that pycocoevalcap expects
tokenizer = PTBTokenizer()
gts_tokenized = tokenizer.tokenize(gts)
res_tokenized = tokenizer.tokenize(res)

# Compute CIDEr
cider_scorer = Cider()
cider_score, _ = cider_scorer.compute_score(gts_tokenized, res_tokenized)
print("CIDEr:", cider_score)