# Qwen2-VL-2B / app.py
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from PIL import Image
import requests
from io import BytesIO
# Initialize the model and processor
model_name = "Qwen/Qwen2-VL-2B-Instruct"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Qwen2VLForConditionalGeneration.from_pretrained(model_name).to(device)
processor = AutoProcessor.from_pretrained(model_name)
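# Optional alternative load (assumption: the accelerate package is installed):
# torch_dtype="auto" keeps the checkpoint's native dtype and device_map="auto"
# places the weights automatically, which uses less memory than the default
# float32 load above. Uncomment to replace the from_pretrained(...).to(device)
# call above.
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     model_name, torch_dtype="auto", device_map="auto"
# )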
# Load the image from URL
image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
response = requests.get(image_url)
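# Fail fast with a clear HTTP error if the image download did not succeed
response.raise_for_status()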
img = Image.open(BytesIO(response.content))
# Build the prompt with the chat template so the processor inserts the image
# placeholder tokens that Qwen2-VL expects alongside the pixel values
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe this image."},
    ]}
]
text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# The processor automatically handles resizing, normalization, padding, and tokenization
inputs = processor(
    text=[text_input],
    images=[img],
    return_tensors="pt",
    padding=True,
)
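# Note: the processor returns a BatchFeature holding input_ids, attention_mask,
# pixel_values, and image_grid_thw (the per-image patch grid Qwen2-VL uses)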
# Check the number of tokens generated by the processor and the shape of inputs
print("Input tokens:", inputs.input_ids.shape)
print("Image features shape:", inputs.pixel_values.shape)
# Ensure image and text are properly tokenized and features align
assert inputs.input_ids.shape[1] > 0, "No tokens generated for text input!"
assert inputs.pixel_values.shape[0] > 0, "No features generated for the image!"
# Move inputs to the same device as the model (GPU or CPU)
inputs = inputs.to(device)
# Inference
generated_ids = model.generate(**inputs, max_new_tokens=128)
# Decode only the newly generated tokens, dropping the prompt from the output
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True)
print(output_text)