In [1]:
!pip install pillow



In [2]:
#colab 
# import kagglehub

# # Download latest version
# path = kagglehub.dataset_download("awsaf49/coco-2017-dataset")

# print("Path to dataset files:", path)

In [3]:
#kaggle
path = '/kaggle/input/coco-2017-dataset'

In [4]:
from torchvision.datasets import CocoCaptions

In [67]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models import resnet152, ResNet152_Weights
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from PIL import Image
import json
import os


CFG = {
    "image_size": 224,
    "max_seq_len": 64,
    "embed_size": 768,
    "prefix_length": 10,
    "batch_size": 32,
    "num_epochs": 40,
    "learning_rate": 1e-4
}

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

In [68]:
# Dataset Class
class CocoDataset(Dataset):
    def __init__(self, image_dir, ann_path, tokenizer, transform=None, subset_size=None):
        with open(ann_path) as f:
            annotations = json.load(f)['annotations']
        # Load a subset of annotations if subset_size is specified
        if subset_size is not None:
            self.annotations = annotations[:subset_size]
        else:
            self.annotations = annotations

        self.image_dir = image_dir
        self.tokenizer = tokenizer
        self.transform = transform

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        ann = self.annotations[idx]
        image_id_str = str(ann['image_id']).zfill(12)
        image_path = os.path.join(self.image_dir, image_id_str + '.jpg')
        image = Image.open(image_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        caption = ann['caption']
        inputs = self.tokenizer(
            caption,
            max_length=CFG['max_seq_len'],
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return image, inputs.input_ids.squeeze(0), inputs.attention_mask.squeeze(0)

In [69]:
class ImageCaptionModel(nn.Module):
    def __init__(self, prefix_length=10):
        super().__init__()
        self.prefix_length = prefix_length
        self.cnn = nn.Sequential(*list(resnet152(weights=ResNet152_Weights.IMAGENET1K_V2).children())[:-1])
        for param in self.cnn.parameters():
            param.requires_grad = False

        # Project 2048-d image features to a sequence of embeddings
        self.projection = nn.Sequential(
            nn.Linear(2048, 768 * prefix_length),
            nn.ReLU(),
        )

        self.llm = GPT2LMHeadModel.from_pretrained('gpt2')
        self.llm.resize_token_embeddings(len(tokenizer))

    def forward(self, images, input_ids, attention_mask):
        with torch.no_grad():
            features = self.cnn(images).view(images.size(0), -1)

        # Project to sequence of embeddings
        img_seq = self.projection(features)  # (B, 768 * prefix_len)
        img_seq = img_seq.view(-1, self.prefix_length, 768)  # (B, prefix_len, 768)

        # Text embeddings (excluding last token for training)
        txt_embeds = self.llm.transformer.wte(input_ids[:, :-1])  # (B, T-1, 768)

        # Concatenate: [image embeddings | text embeddings]
        inputs_embeds = torch.cat([img_seq, txt_embeds], dim=1)

        # Adjust attention mask
        prefix_mask = torch.ones(input_ids.size(0), self.prefix_length).to(images.device)
        attention_mask = torch.cat([prefix_mask, attention_mask[:, :-1]], dim=1)

        # Forward
        outputs = self.llm(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask
        )

        return outputs.logits


In [70]:

# Training Setup
def train():
    # Transforms
    transform = transforms.Compose([
        transforms.Resize((CFG['image_size'], CFG['image_size'])),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    # Tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token

    # Dataset & Loader
    # Using CocoDataset with a subset size of 8000
    dataset = CocoDataset(
        image_dir=f'{path}/coco2017/train2017',
        ann_path=f'{path}/coco2017/annotations/captions_train2017.json',
        tokenizer=tokenizer,
        transform=transform,
        subset_size=8000 # Load only 8000 entries
    )
    loader = DataLoader(dataset, batch_size=CFG['batch_size'], shuffle=True)

    # Model
    model = ImageCaptionModel().cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=CFG['learning_rate'])
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
    
    for epoch in range(CFG['num_epochs']):
        model.train()
        for images, input_ids, attn_mask in loader:
            images = images.cuda()
            input_ids = input_ids.cuda()
            attn_mask = attn_mask.cuda()
    
            optimizer.zero_grad()
            logits = model(images, input_ids, attn_mask)
    
            # Shift logits and labels
            logits = logits[:, model.prefix_length:, :]  # Ignore prefix outputs
            labels = input_ids[:, 1:]  # Shift labels for teacher forcing
    
            loss = criterion(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
            loss.backward()
            optimizer.step()
    
        

        print(f"Epoch: {epoch+1}, Loss: {loss.item():.4f}")
    save_path = "/kaggle/working/clip_caption_model.pth"
    torch.save(model.state_dict(), save_path)
    print(f"Model saved to {save_path}")
    

In [71]:

# Usage
if __name__ == "__main__":
    # Train the model
    train()


Epoch: 1, Loss: 2.7203
Epoch: 2, Loss: 2.5815
Epoch: 3, Loss: 2.1797
Epoch: 4, Loss: 2.2764
Epoch: 5, Loss: 1.8374
Epoch: 6, Loss: 1.5654
Epoch: 7, Loss: 1.4040
Epoch: 8, Loss: 1.3104
Epoch: 9, Loss: 1.0020
Epoch: 10, Loss: 0.9107
Epoch: 11, Loss: 0.7906
Epoch: 12, Loss: 0.7477
Epoch: 13, Loss: 0.6430
Epoch: 14, Loss: 0.5511
Epoch: 15, Loss: 0.5291
Epoch: 16, Loss: 0.6169
Epoch: 17, Loss: 0.6268
Epoch: 18, Loss: 0.5461
Epoch: 19, Loss: 0.4207
Epoch: 20, Loss: 0.3933
Epoch: 21, Loss: 0.3791
Epoch: 22, Loss: 0.3602
Epoch: 23, Loss: 0.3226
Epoch: 24, Loss: 0.2650
Epoch: 25, Loss: 0.3203
Epoch: 26, Loss: 0.3200
Epoch: 27, Loss: 0.3070
Epoch: 28, Loss: 0.2853
Epoch: 29, Loss: 0.2931
Epoch: 30, Loss: 0.3709
Epoch: 31, Loss: 0.2857
Epoch: 32, Loss: 0.2847
Epoch: 33, Loss: 0.3104
Epoch: 34, Loss: 0.2917
Epoch: 35, Loss: 0.2460
Epoch: 36, Loss: 0.2131
Epoch: 37, Loss: 0.2229
Epoch: 38, Loss: 0.2643
Epoch: 39, Loss: 0.2817
Epoch: 40, Loss: 0.2538
Model saved to /kaggle/working/clip_caption_model

In [72]:

# Inference
def generate_caption(image_path, model, tokenizer, device='cuda'):
    transform = transforms.Compose([
        transforms.Resize((CFG['image_size'], CFG['image_size'])),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)

    with torch.no_grad():
        features = model.cnn(image)
    features = features.view(features.size(0), -1)
    img_embed = model.projection(features).unsqueeze(1)

    generated = model.llm.generate(
        inputs_embeds=img_embed,
        max_length=CFG['max_seq_len'],
        num_beams=5,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id
    )

    return tokenizer.decode(generated[0], skip_special_tokens=True)


In [73]:
import requests

# URL of the image
url = "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRQrNtVwHLPW2MKg7qW4mkom5RN9HdfHgCgMw&s"
# Path to save the image
save_path = "/kaggle/working/downloaded_image.jpg"

# Download and save the image
response = requests.get(url)
if response.status_code == 200:
    with open(save_path, "wb") as f:
        f.write(response.content)
    print(f"Image saved to {save_path}")
else:
    print(f"Failed to download image. Status code: {response.status_code}")


Image saved to /kaggle/working/downloaded_image.jpg


In [60]:
def generate_caption(image_path, model, tokenizer, device='cuda'):
    transform = transforms.Compose([
        transforms.Resize((CFG['image_size'], CFG['image_size'])),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)

    with torch.no_grad():
        features = model.cnn(image)
    features = features.view(features.size(0), -1)
    img_embed = model.projection(features).unsqueeze(1)

    # Add prompt tokens
    prompt = "A photo of"
    prompt_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
    prompt_embeds = model.llm.transformer.wte(prompt_ids)

    inputs_embeds = torch.cat([img_embed, prompt_embeds], dim=1)
    attention_mask = torch.ones(inputs_embeds.shape[:-1], dtype=torch.long).to(device)

    # Sampling-based generation
    generated = model.llm.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=attention_mask,
        max_length=CFG['max_seq_len'],
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=1.0,
        pad_token_id=tokenizer.pad_token_id
    )

    return tokenizer.decode(generated[0], skip_special_tokens=True)


In [80]:

    # Example inference
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = ImageCaptionModel().cuda().eval()
    caption = generate_caption(save_path, model, tokenizer)
    print(f"Generated Caption: {caption}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Caption: 

I, the man, the man, the man, the man, the man, the man, the man, the man, the man, the man, the man, the man, the man, the man, the man, the man, the man


In [81]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import requests
import torch

# Load model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cuda")

# Load image
img = Image.open("/kaggle/working/downloaded_image.jpg").convert('RGB')

# Preprocess
inputs = processor(img, return_tensors="pt").to("cuda")

# Generate caption
out = model.generate(**inputs, max_new_tokens=64)
caption = processor.decode(out[0], skip_special_tokens=True)
print("BLIP Caption:", caption)


BLIP Caption: a small dog wearing a red sweater running through the grass


In [74]:
def generate_caption(image_path, model, tokenizer, device='cuda'):
    transform = transforms.Compose([
        transforms.Resize((CFG['image_size'], CFG['image_size'])),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)

    with torch.no_grad():
        features = model.cnn(image).view(1, -1)
        img_seq = model.projection(features).view(1, model.prefix_length, 768)

    # Generate from image embeddings only
    attention_mask = torch.ones((1, model.prefix_length), device=device)

    generated = model.llm.generate(
        inputs_embeds=img_seq,
        attention_mask=attention_mask,
        max_length=CFG['max_seq_len'],
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=1.0,
        pad_token_id=tokenizer.pad_token_id
    )

    return tokenizer.decode(generated[0], skip_special_tokens=True)
