# gaia-enhanced-agent/agents/enhanced_rtl_multimodal_agent.py
# GAIA Agent Deployment
# Deploy Complete Enhanced GAIA Agent with Phase 1-6 Improvements
# 9a6a4dc
"""
Enhanced RTL (Rotated Text Layout) Multimodal Agent
This module enhances the existing multimodal capabilities with improved support for:
- Text in various orientations (0°, 90°, 180°, 270°)
- Multi-directional text detection
- Enhanced OCR prompting for rotated text
- Better text extraction regardless of orientation
"""
import os
import logging
import base64
import io
from typing import Dict, Any, List, Optional, Union
from pathlib import Path
import requests
from PIL import Image, ImageOps
import numpy as np
# Import the base multimodal tools
from .mistral_multimodal_agent import OpenSourceMultimodalTools
# Module-level logger, named after this module so records can be filtered per module.
logger = logging.getLogger(__name__)
class EnhancedRTLMultimodalTools(OpenSourceMultimodalTools):
    """
    Multimodal tools with improved recognition of rotated text.

    Key enhancements over the base tools:
    1. Multi-orientation text analysis (0°, 90°, 180°, 270°)
    2. Enhanced prompting for rotated text
    3. Image preparation (rotation, mode normalisation) before analysis
    4. Keyword-based routing of text-related questions to the enhanced path

    The class reads ``self.mistral_client`` and ``self.vision_pipeline``;
    neither is assigned here, so both are expected to come from the base
    class initialiser.
    """

    # Seconds to wait for a remote image before giving up, so a stalled
    # server cannot block the agent indefinitely.
    _IMAGE_FETCH_TIMEOUT_SECONDS = 30

    # Image modes that can be written as PNG without conversion; anything
    # else (e.g. CMYK) is converted to RGB before encoding.
    _PNG_SAFE_MODES = frozenset({"1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"})

    def __init__(self):
        """Initialize the enhanced RTL multimodal agent."""
        super().__init__()
        logger.info("🔄 Enhanced RTL Multimodal Tools initialized")

    def analyze_image(
        self,
        image_input: Union[str, bytes, Image.Image, dict],
        question: Optional[str] = None,
    ) -> str:
        """
        Analyze an image, using the enhanced path for text-related questions.

        Args:
            image_input: Image file path or URL, raw bytes, PIL Image, or a
                dict with a ``file_path`` key.
            question: Optional specific question about the image.

        Returns:
            The analysis result, or a string starting with ``"Error"`` when
            the input could not be processed.
        """
        try:
            image = self._convert_to_pil_image(image_input)

            if isinstance(image, str):
                # Any string here is a failure message, never an image.
                if image.startswith("Error:"):
                    return image
                # Conversion raised ("Error converting image: ..."). The
                # enhanced path needs a real PIL image, so hand the original
                # input to the base implementation instead of passing the
                # error text on as if it were an image.
                logger.warning(
                    "Image conversion failed, deferring to base analysis: %s",
                    image,
                )
                return super().analyze_image(image_input, question)

            if question and self._is_text_related_question(question):
                return self._analyze_with_enhanced_text_recognition(image, question)

            # Non-text questions use the standard analysis.
            return super().analyze_image(image_input, question)
        except Exception as e:
            logger.error("Enhanced image analysis failed: %s", e)
            return f"Error: {e}"

    def _convert_to_pil_image(
        self, image_input: Union[str, bytes, Image.Image, dict]
    ) -> Union[Image.Image, str]:
        """
        Convert the supported input types to a PIL Image.

        Args:
            image_input: dict with ``file_path``, path or URL string, raw
                bytes, or an existing PIL Image.

        Returns:
            A PIL Image on success, otherwise an error message string.
            Messages for known bad input start with ``"Error:"``; failures
            raised while loading start with ``"Error converting image:"``.
        """
        try:
            if isinstance(image_input, dict):
                if 'file_path' not in image_input:
                    return "Error: Dictionary input must contain 'file_path' key"
                image_path = image_input['file_path']
                if os.path.exists(image_path):
                    return Image.open(image_path)
                return f"Error: Image file not found: {image_path}"

            if isinstance(image_input, str):
                if os.path.exists(image_input):
                    return Image.open(image_input)
                # Not an existing path: assume it's a URL. The timeout bounds
                # the wait, and the status check stops an HTTP error page
                # from being parsed as image data.
                response = requests.get(
                    image_input, timeout=self._IMAGE_FETCH_TIMEOUT_SECONDS
                )
                response.raise_for_status()
                return Image.open(io.BytesIO(response.content))

            if isinstance(image_input, bytes):
                return Image.open(io.BytesIO(image_input))

            if isinstance(image_input, Image.Image):
                return image_input

            return "Error: Unsupported image input format"
        except Exception as e:
            return f"Error converting image: {e}"

    def _is_text_related_question(self, question: str) -> bool:
        """
        Determine if the question is asking about text content.

        Matching is a case-insensitive substring test against a fixed
        keyword list. An empty or ``None`` question is never text-related.
        """
        if not question:
            return False
        text_keywords = [
            'text', 'read', 'words', 'letters', 'numbers', 'digits',
            'writing', 'written', 'says', 'message', 'content',
            'characters', 'alphabet', 'numeric', 'string', 'label',
            'title', 'caption', 'sign', 'document', 'page'
        ]
        question_lower = question.lower()
        return any(keyword in question_lower for keyword in text_keywords)

    def _analyze_with_enhanced_text_recognition(self, image: Image.Image, question: str) -> str:
        """
        Perform enhanced text recognition, preferring Mistral Vision.

        Args:
            image: PIL Image object.
            question: Question about text in the image.

        Returns:
            The Mistral Vision answer when a client is configured and the
            call yields a non-error result; otherwise the result of the
            multi-orientation analysis.
        """
        try:
            if getattr(self, 'mistral_client', None):
                result = self._analyze_with_enhanced_mistral_vision(image, question)
                if result and not result.startswith("Error"):
                    return result
            # Fallback to multi-orientation analysis
            return self._multi_orientation_text_analysis(image, question)
        except Exception as e:
            logger.error("Enhanced text recognition failed: %s", e)
            return f"Error in enhanced text recognition: {e}"

    def _rtl_chat_completion(self, model: str, messages: List[Any]) -> Any:
        """
        Send a chat request through whichever Mistral client flavour is loaded.

        The "new" client exposes ``chat.complete(...)``; otherwise ``chat``
        itself is called. Exceptions propagate to the caller.
        """
        from .mistral_multimodal_agent import MISTRAL_CLIENT_TYPE
        if MISTRAL_CLIENT_TYPE == "new":
            return self.mistral_client.chat.complete(model=model, messages=messages)
        return self.mistral_client.chat(model=model, messages=messages)

    def _analyze_with_enhanced_mistral_vision(self, image: Image.Image, question: str) -> Optional[str]:
        """
        Analyze the image with Mistral Vision using the rotated-text prompt.

        Args:
            image: PIL Image object.
            question: Question about the image.

        Returns:
            The model's answer, or ``None`` if no client is configured or
            the request failed.
        """
        try:
            # Check the client first so nothing is encoded without a use for it.
            if not getattr(self, 'mistral_client', None):
                return None

            # Convert modes PNG cannot store so the save below does not fail.
            png_ready = image if image.mode in self._PNG_SAFE_MODES else image.convert("RGB")
            buffer = io.BytesIO()
            png_ready.save(buffer, format='PNG')
            image_b64 = base64.b64encode(buffer.getvalue()).decode()

            enhanced_prompt = self._create_enhanced_text_prompt(question)

            from mistralai import UserMessage
            messages = [
                UserMessage(
                    content=[
                        {
                            "type": "text",
                            "text": enhanced_prompt
                        },
                        {
                            "type": "image_url",
                            "image_url": f"data:image/png;base64,{image_b64}"
                        }
                    ]
                )
            ]

            response = self._rtl_chat_completion("pixtral-12b-2409", messages)
            return response.choices[0].message.content
        except Exception as e:
            logger.warning("Enhanced Mistral Vision analysis failed: %s", e)
            return None

    def _create_enhanced_text_prompt(self, original_question: str) -> str:
        """
        Build the prompt used for rotated-text recognition.

        Args:
            original_question: Original question about the image.

        Returns:
            The question followed by instructions to read text in every
            orientation.
        """
        # The prompt text is kept flush-left so that no source indentation
        # leaks into the string sent to the model.
        enhanced_prompt = f"""
{original_question}
IMPORTANT INSTRUCTIONS FOR TEXT RECOGNITION:
- Look carefully for text in ALL orientations: normal (0°), rotated 90°, upside down (180°), and rotated 270°
- Text may appear in any direction - horizontal, vertical, or rotated
- Pay special attention to text that might be rotated or oriented differently than normal reading direction
- If you see text that appears sideways, upside down, or at an angle, please read it and include it in your response
- Look for numbers, letters, words, and any written content regardless of orientation
- Scan the entire image systematically for text in all possible orientations
- If text appears rotated, mentally rotate it to read it correctly
- Include ALL text you can identify, even if it's in an unusual orientation
Please provide a comprehensive reading of all text visible in the image, regardless of its orientation or direction.
"""
        return enhanced_prompt

    def _multi_orientation_text_analysis(self, image: Image.Image, question: str) -> str:
        """
        Analyze text by running the vision pipeline on four orientations.

        Args:
            image: PIL Image object.
            question: Question about text in the image.

        Returns:
            A Mistral synthesis of the per-orientation captions when a client
            is available and the call succeeds, the raw combined captions
            otherwise, or a fixed "no text" message when nothing was
            produced.
        """
        try:
            orientations = (
                ("normal", 0),
                ("rotated_90", 90),
                ("rotated_180", 180),
                ("rotated_270", 270),
            )
            all_results = []

            # Without a pipeline there is nothing to run, so skip the rotations.
            vision_pipeline = getattr(self, 'vision_pipeline', None)
            if vision_pipeline:
                for orientation_name, rotation in orientations:
                    try:
                        if rotation == 0:
                            rotated_image = image
                        else:
                            # Pillow angles are counter-clockwise, so the
                            # negative angle turns the image clockwise;
                            # expand=True keeps the whole image in frame.
                            rotated_image = image.rotate(-rotation, expand=True, fillcolor='white')

                        caption_result = vision_pipeline(rotated_image)
                        caption = caption_result[0]['generated_text'] if caption_result else ""
                        if caption and caption.strip():
                            all_results.append(f"{orientation_name}: {caption}")
                    except Exception as e:
                        # One bad orientation must not abort the others.
                        logger.warning("Failed to analyze %s orientation: %s", orientation_name, e)
                        continue

            if not all_results:
                return "No text could be detected in any orientation"

            combined_result = "Text found in different orientations:\n" + "\n".join(all_results)

            if not getattr(self, 'mistral_client', None):
                return combined_result

            # Use Mistral to synthesize the results (prompt kept flush-left
            # so that no source indentation leaks into it).
            synthesis_prompt = f"""
Based on the following text recognition results from an image analyzed in different orientations,
please provide a comprehensive answer to the question: "{question}"
Recognition results:
{combined_result}
Please synthesize this information and provide the most accurate and complete answer possible.
Focus on extracting all readable text regardless of its original orientation in the image.
"""
            try:
                from mistralai import UserMessage
                response = self._rtl_chat_completion(
                    "mistral-large-latest",
                    [UserMessage(content=synthesis_prompt)],
                )
                return response.choices[0].message.content
            except Exception as e:
                # Synthesis is best-effort; fall back to the raw captions.
                logger.warning("Failed to synthesize results: %s", e)
                return combined_result
        except Exception as e:
            logger.error("Multi-orientation analysis failed: %s", e)
            return f"Error in multi-orientation analysis: {e}"

    def get_enhanced_capabilities_status(self) -> Dict[str, Any]:
        """
        Get the base capability status extended with the enhanced flags.

        The four enhanced flags are hard-coded to ``True``; they are not
        probed at runtime.
        """
        base_status = super().get_capabilities_status()
        enhanced_status = {
            **base_status,
            'enhanced_text_recognition': True,
            'multi_orientation_analysis': True,
            'rotated_text_support': True,
            'text_direction_detection': True
        }
        return enhanced_status