import torch
from PIL import Image
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    VisionEncoderDecoderModel,
    ViTImageProcessor,
)

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Load image captioning model
def load_image_captioning_model():
    model = VisionEncoderDecoderModel.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    ).to(device)
    feature_extractor = ViTImageProcessor.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    )
    tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
    return model, feature_extractor, tokenizer


# Generate a caption for an image
def generate_caption(image_path, model, feature_extractor, tokenizer):
    max_length = 16
    num_beams = 4
    image = Image.open(image_path).convert("RGB")
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(
        device
    )
    with torch.no_grad():
        output_ids = model.generate(
            pixel_values, max_length=max_length, num_beams=num_beams
        )
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return preds[0].strip()


# Load visual question answering model
def load_vqa_model():
    # For simplicity, we reuse the image captioning model.
    # In a real application, you would use a dedicated VQA model.
    model = VisionEncoderDecoderModel.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    ).to(device)
    feature_extractor = ViTImageProcessor.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    )
    tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
    return model, feature_extractor, tokenizer


# Answer a question about an image
def answer_question(image_path, question, model, feature_extractor, tokenizer):
    # This is a simplified version - in a real app, you'd use a proper VQA model.
    # Here we just generate a caption and wrap it in an answer template.
    caption = generate_caption(image_path, model, feature_extractor, tokenizer)
    return f"Based on the image, which shows {caption}, I would say: {caption}"


# Load sentiment analysis model
def load_sentiment_model():
    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer


# Analyze sentiment of text
def analyze_sentiment(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(
        device
    )
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.nn.functional.softmax(outputs.logits, dim=1)
    scores = scores.cpu().numpy()[0]
    # DistilBERT-SST2 has 2 labels: negative (0) and positive (1)
    sentiment = "positive" if scores[1] > scores[0] else "negative"
    confidence = float(max(scores))
    return sentiment, confidence
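

# The helpers above are only defined, never called. Below is a minimal usage
# sketch (not part of the original script): "example.jpg" and the question /
# review strings are placeholders, and the first run will download the model
# checkpoints from the Hugging Face Hub.
if __name__ == "__main__":
    cap_model, cap_fe, cap_tok = load_image_captioning_model()
    print("Caption:", generate_caption("example.jpg", cap_model, cap_fe, cap_tok))

    vqa_model, vqa_fe, vqa_tok = load_vqa_model()
    print(
        "Answer:",
        answer_question(
            "example.jpg", "What is in the picture?", vqa_model, vqa_fe, vqa_tok
        ),
    )

    sent_model, sent_tok = load_sentiment_model()
    sentiment, confidence = analyze_sentiment("I love this photo!", sent_model, sent_tok)
    print(f"Sentiment: {sentiment} ({confidence:.2f})")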