Spaces:

JoachimVC
/

gaia-enhanced-agent

Running

gaia-enhanced-agent / agents / enhanced_rtl_multimodal_agent.py

GAIA Agent Deployment

Deploy Complete Enhanced GAIA Agent with Phase 1-6 Improvements

9a6a4dc 9 days ago

13.2 kB

	"""
	Enhanced RTL (Rotated Text Layout) Multimodal Agent

	This module enhances the existing multimodal capabilities with improved support for:
	- Text in various orientations (0°, 90°, 180°, 270°)
	- Multi-directional text detection
	- Enhanced OCR prompting for rotated text
	- Better text extraction regardless of orientation
	"""

	import os
	import logging
	import base64
	import io
	from typing import Dict, Any, List, Optional, Union
	from pathlib import Path
	import requests
	from PIL import Image, ImageOps
	import numpy as np

	# Import the base multimodal tools
	from .mistral_multimodal_agent import OpenSourceMultimodalTools

	logger = logging.getLogger(__name__)

	class EnhancedRTLMultimodalTools(OpenSourceMultimodalTools):
	"""
	Enhanced multimodal tools with improved rotated text recognition.

	Key enhancements:
	1. Multi-orientation text analysis
	2. Enhanced prompting for rotated text
	3. Image preprocessing for better OCR
	4. Text direction detection and processing
	"""

	def __init__(self):
	    """
	    Build the enhanced tools on top of the base multimodal tools.

	    All setup is delegated to the parent initializer; afterwards a single
	    info-level log line records that the enhanced layer is ready.
	    """
	    super().__init__()
	    logger.info("🔄 Enhanced RTL Multimodal Tools initialized")

	def analyze_image(self, image_input: Union[str, bytes, Image.Image, dict], question: Optional[str] = None) -> str:
	    """
	    Analyze an image, using rotated-text-aware analysis for text questions.

	    The input is first converted to a PIL image. If the question looks
	    text-related, the enhanced text-recognition path is used; otherwise
	    the call is forwarded to the parent implementation with the original
	    ``image_input``.

	    Args:
	        image_input: Image file path, bytes, PIL Image, or dict with file_path
	        question: Optional specific question about the image

	    Returns:
	        The analysis text, or a string starting with "Error" when the
	        image could not be loaded or the analysis raised.
	    """
	    try:
	        image = self._convert_to_pil_image(image_input)

	        # The converter reports every failure as a string, so any string
	        # result is an error message to hand back to the caller -- not
	        # only the ones that happen to start with "Error:". Letting such
	        # a string through would treat the message itself as an image.
	        if isinstance(image, str):
	            return image

	        # Enhanced analysis for text-related questions
	        if question and self._is_text_related_question(question):
	            return self._analyze_with_enhanced_text_recognition(image, question)

	        # Fall back to standard analysis for non-text questions.
	        # NOTE(review): the parent receives the raw input and presumably
	        # loads it again; passing `image` could avoid a second load if the
	        # parent accepts PIL images -- confirm before changing.
	        return super().analyze_image(image_input, question)

	    except Exception as e:
	        logger.error(f"Enhanced image analysis failed: {e}")
	        return f"Error: {e}"

	def _convert_to_pil_image(self, image_input: Union[str, bytes, Image.Image, dict]) -> Union[Image.Image, str]:
	    """
	    Convert the supported input types to a PIL Image.

	    Args:
	        image_input: One of
	            - a PIL Image (returned unchanged),
	            - raw image bytes,
	            - a dict with a 'file_path' key pointing to a local file,
	            - a string that is either an existing local path or an
	              http(s) URL.

	    Returns:
	        The opened PIL Image, or a string starting with "Error:" that
	        describes why the input could not be converted. This method does
	        not raise; every failure is reported through the return value.
	    """
	    try:
	        if isinstance(image_input, Image.Image):
	            return image_input

	        if isinstance(image_input, bytes):
	            return Image.open(io.BytesIO(image_input))

	        if isinstance(image_input, dict):
	            if 'file_path' not in image_input:
	                return "Error: Dictionary input must contain 'file_path' key"
	            image_path = image_input['file_path']
	            if os.path.exists(image_path):
	                return Image.open(image_path)
	            return f"Error: Image file not found: {image_path}"

	        if isinstance(image_input, str):
	            if os.path.exists(image_input):
	                return Image.open(image_input)

	            # Only treat the string as a URL when it actually looks like
	            # one; a mistyped local path should be reported as a missing
	            # file rather than as an obscure HTTP client failure.
	            if not image_input.lower().startswith(("http://", "https://")):
	                return f"Error: Image file not found: {image_input}"

	            # A timeout keeps a stalled server from hanging the agent, and
	            # raise_for_status() stops an HTML error page from being fed
	            # to the image decoder.
	            response = requests.get(image_input, timeout=30)
	            response.raise_for_status()
	            return Image.open(io.BytesIO(response.content))

	        return "Error: Unsupported image input format"
	    except Exception as e:
	        # Same "Error:" prefix as every other failure above, so callers
	        # can detect all failures with a single prefix check.
	        return f"Error: Failed to convert image: {e}"

	def _is_text_related_question(self, question: str) -> bool:
	"""Determine if the question is asking about text content."""
	text_keywords = [
	'text', 'read', 'words', 'letters', 'numbers', 'digits',
	'writing', 'written', 'says', 'message', 'content',
	'characters', 'alphabet', 'numeric', 'string', 'label',
	'title', 'caption', 'sign', 'document', 'page'
	]

	question_lower = question.lower()
	return any(keyword in question_lower for keyword in text_keywords)

	def _analyze_with_enhanced_text_recognition(self, image: Image.Image, question: str) -> str:
	    """
	    Answer a text question, preferring Mistral Vision over the fallback.

	    When a Mistral client is configured, the enhanced-prompt vision call
	    is tried first and its answer is returned if it is non-empty and does
	    not start with "Error". In every other case the multi-orientation
	    analysis provides the answer.

	    Args:
	        image: PIL Image object
	        question: Question about text in the image

	    Returns:
	        The analysis text, or an "Error in enhanced text recognition"
	        message if an exception was raised along the way
	    """
	    try:
	        vision_answer = (
	            self._analyze_with_enhanced_mistral_vision(image, question)
	            if self.mistral_client
	            else None
	        )
	        usable = bool(vision_answer) and not vision_answer.startswith("Error")
	        if usable:
	            return vision_answer

	        # No client, no answer, or an error-looking answer: use the fallback.
	        return self._multi_orientation_text_analysis(image, question)

	    except Exception as e:
	        logger.error(f"Enhanced text recognition failed: {e}")
	        return f"Error in enhanced text recognition: {e}"

	def _analyze_with_enhanced_mistral_vision(self, image: Image.Image, question: str) -> Optional[str]:
	    """
	    Analyze image using Mistral Vision with enhanced prompting for rotated text.

	    The image is PNG-encoded, embedded as a base64 data URL, and sent with
	    the enhanced text prompt to the "pixtral-12b-2409" model.

	    Args:
	        image: PIL Image object
	        question: Question about the image

	    Returns:
	        The model's answer, or None when no Mistral client is configured
	        or when any step (encoding, import, request) fails
	    """
	    # Bail out before the PNG/base64 work when there is no client to call.
	    if not getattr(self, 'mistral_client', None):
	        return None

	    try:
	        # The PNG writer rejects some image modes (for example CMYK, as
	        # found in many JPEGs). Convert those first so such images do not
	        # silently disable the vision path.
	        png_ready = image
	        if png_ready.mode not in ("1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"):
	            png_ready = png_ready.convert("RGB")

	        # Convert image to base64
	        buffer = io.BytesIO()
	        png_ready.save(buffer, format='PNG')
	        image_b64 = base64.b64encode(buffer.getvalue()).decode()

	        # Enhanced prompt for rotated text recognition
	        enhanced_prompt = self._create_enhanced_text_prompt(question)

	        # Imported lazily so a missing or incompatible SDK only disables
	        # this path (the failure is caught below) instead of the module.
	        from mistralai import UserMessage
	        from .mistral_multimodal_agent import MISTRAL_CLIENT_TYPE

	        messages = [
	            UserMessage(
	                content=[
	                    {
	                        "type": "text",
	                        "text": enhanced_prompt
	                    },
	                    {
	                        "type": "image_url",
	                        "image_url": f"data:image/png;base64,{image_b64}"
	                    }
	                ]
	            )
	        ]

	        # The two supported client generations expose different call shapes.
	        if MISTRAL_CLIENT_TYPE == "new":
	            response = self.mistral_client.chat.complete(
	                model="pixtral-12b-2409",
	                messages=messages
	            )
	        else:
	            response = self.mistral_client.chat(
	                model="pixtral-12b-2409",
	                messages=messages
	            )

	        return response.choices[0].message.content

	    except Exception as e:
	        logger.warning(f"Enhanced Mistral Vision analysis failed: {e}")
	        return None

	def _create_enhanced_text_prompt(self, original_question: str) -> str:
	"""
	Create an enhanced prompt specifically designed for rotated text recognition.

	Args:
	original_question: Original question about the image

	Returns:
	Enhanced prompt for better text recognition
	"""
	enhanced_prompt = f"""
	{original_question}

	IMPORTANT INSTRUCTIONS FOR TEXT RECOGNITION:
	- Look carefully for text in ALL orientations: normal (0°), rotated 90°, upside down (180°), and rotated 270°
	- Text may appear in any direction - horizontal, vertical, or rotated
	- Pay special attention to text that might be rotated or oriented differently than normal reading direction
	- If you see text that appears sideways, upside down, or at an angle, please read it and include it in your response
	- Look for numbers, letters, words, and any written content regardless of orientation
	- Scan the entire image systematically for text in all possible orientations
	- If text appears rotated, mentally rotate it to read it correctly
	- Include ALL text you can identify, even if it's in an unusual orientation

	Please provide a comprehensive reading of all text visible in the image, regardless of its orientation or direction.
	"""
	return enhanced_prompt

	def _multi_orientation_text_analysis(self, image: Image.Image, question: str) -> str:
	    """
	    Analyze text by trying multiple image orientations.

	    The image is captioned at 0°, 90°, 180° and 270°. If a Mistral client
	    is configured, the per-orientation captions are then synthesized into
	    one answer to the question; if that synthesis fails or comes back
	    empty, the raw combined captions are returned instead.

	    Args:
	        image: PIL Image object
	        question: Question about text in the image

	    Returns:
	        The synthesized answer, the combined per-orientation captions, a
	        "No text could be detected" message when nothing was captioned,
	        or an "Error in multi-orientation analysis" message on failure
	    """
	    try:
	        captions = self._caption_all_orientations(image)
	        if not captions:
	            return "No text could be detected in any orientation"

	        combined_result = "Text found in different orientations:\n" + "\n".join(captions)

	        # Use Mistral to synthesize the results if available
	        if self.mistral_client:
	            synthesized = self._synthesize_orientation_results(combined_result, question)
	            # An empty/None synthesis is not an answer; keep the captions.
	            if synthesized:
	                return synthesized

	        return combined_result

	    except Exception as e:
	        logger.error(f"Multi-orientation analysis failed: {e}")
	        return f"Error in multi-orientation analysis: {e}"

	def _caption_all_orientations(self, image: Image.Image) -> List[str]:
	    """Caption the image at four orientations; return "name: caption" lines."""
	    pipeline = getattr(self, 'vision_pipeline', None)
	    if not pipeline:
	        # Without a captioning pipeline there is nothing to run, so skip
	        # the three rotations entirely.
	        return []

	    orientations = (
	        ("normal", 0),
	        ("rotated_90", 90),
	        ("rotated_180", 180),
	        ("rotated_270", 270),
	    )

	    captions = []
	    for orientation_name, rotation in orientations:
	        try:
	            if rotation == 0:
	                rotated_image = image
	            else:
	                # expand=True keeps the whole picture after a quarter turn.
	                rotated_image = image.rotate(-rotation, expand=True, fillcolor='white')

	            caption_result = pipeline(rotated_image)
	            caption = caption_result[0]['generated_text'] if caption_result else ""

	            if caption and caption.strip():
	                captions.append(f"{orientation_name}: {caption}")

	        except Exception as e:
	            # One bad orientation must not discard the others.
	            logger.warning(f"Failed to analyze {orientation_name} orientation: {e}")

	    return captions

	def _synthesize_orientation_results(self, combined_result: str, question: str) -> Optional[str]:
	    """Ask Mistral to merge the per-orientation captions; None on failure."""
	    synthesis_prompt = f"""
	Based on the following text recognition results from an image analyzed in different orientations,
	please provide a comprehensive answer to the question: "{question}"

	Recognition results:
	{combined_result}

	Please synthesize this information and provide the most accurate and complete answer possible.
	Focus on extracting all readable text regardless of its original orientation in the image.
	"""

	    try:
	        # Imported lazily so a missing SDK only disables synthesis.
	        from mistralai import UserMessage
	        from .mistral_multimodal_agent import MISTRAL_CLIENT_TYPE

	        messages = [UserMessage(content=synthesis_prompt)]

	        # The two supported client generations expose different call shapes.
	        if MISTRAL_CLIENT_TYPE == "new":
	            response = self.mistral_client.chat.complete(
	                model="mistral-large-latest",
	                messages=messages
	            )
	        else:
	            response = self.mistral_client.chat(
	                model="mistral-large-latest",
	                messages=messages
	            )

	        return response.choices[0].message.content
	    except Exception as e:
	        logger.warning(f"Failed to synthesize results: {e}")
	        return None

	def get_enhanced_capabilities_status(self) -> Dict[str, Any]:
	    """
	    Report the parent's capability status plus the enhanced feature flags.

	    Returns:
	        A new dict holding every entry from the parent's
	        get_capabilities_status(), with the four enhanced flags set to
	        True (overriding any parent entry with the same key)
	    """
	    status = {**super().get_capabilities_status()}
	    for flag in (
	        'enhanced_text_recognition',
	        'multi_orientation_analysis',
	        'rotated_text_support',
	        'text_direction_detection',
	    ):
	        status[flag] = True
	    return status