File size: 1,677 Bytes
5ca9963
2f4647f
2319467
 
2f4647f
5ca9963
2f4647f
 
 
699fe26
2f4647f
 
5dda5d9
2f4647f
2319467
 
2f4647f
2319467
73d58c2
2f4647f
e5e2e79
73d58c2
5a6e90f
73d58c2
2f4647f
5a6e90f
73d58c2
 
5a6e90f
f580378
 
 
 
73d58c2
 
 
 
 
 
 
2f4647f
5a6e90f
5ca9963
2f4647f
 
2319467
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from PIL import Image
import requests
from io import BytesIO


def main() -> None:
    """Download a demo image and describe it with Qwen2-VL-2B-Instruct.

    Loads the model and processor, fetches the demo image over HTTP,
    builds a chat-template prompt containing the image placeholder tokens,
    runs generation, and prints the decoded answer.
    """
    # Initialize the model and processor.
    model_name = "Qwen/Qwen2-VL-2B-Instruct"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = Qwen2VLForConditionalGeneration.from_pretrained(model_name).to(device)
    processor = AutoProcessor.from_pretrained(model_name)

    # Fetch the demo image; fail loudly on HTTP errors instead of handing an
    # error page to PIL, and never hang forever on a dead connection.
    image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
    response = requests.get(image_url, timeout=30)
    response.raise_for_status()
    img = Image.open(BytesIO(response.content))

    # Qwen2-VL requires the image placeholder tokens to appear in the text
    # prompt. Build the prompt with the model's chat template rather than
    # passing raw text — raw text causes an image-token/feature count
    # mismatch inside the model.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

    # The processor handles resizing, normalization, and tokenization.
    inputs = processor(
        images=img,
        text=prompt,
        return_tensors="pt",
        padding=True,  # Automatically pad to match model input size
    )

    # Check the number of tokens generated by the processor and input shapes.
    print("Input tokens:", inputs.input_ids.shape)
    print("Image features shape:", inputs.pixel_values.shape)

    # Validate with real exceptions: `assert` is stripped under `python -O`.
    if inputs.input_ids.shape[1] == 0:
        raise ValueError("No tokens generated for text input!")
    if inputs.pixel_values.shape[0] == 0:
        raise ValueError("No features generated for the image!")

    # Move inputs to the device (either GPU or CPU).
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Inference only — no gradients needed; saves memory and time.
    with torch.inference_mode():
        generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Strip the echoed prompt so only the newly generated answer is decoded.
    new_tokens = generated_ids[:, inputs["input_ids"].shape[1]:]
    output_text = processor.batch_decode(new_tokens, skip_special_tokens=True)
    print(output_text)


if __name__ == "__main__":
    main()