# Hugging Face Space: JoachimVC/gaia-enhanced-agent (status: Running)
# File size: 13,161 bytes
# Revision: 9a6a4dc

"""
Enhanced RTL (Rotated Text Layout) Multimodal Agent

This module enhances the existing multimodal capabilities with improved support for:
- Text in various orientations (0°, 90°, 180°, 270°)
- Multi-directional text detection
- Enhanced OCR prompting for rotated text
- Better text extraction regardless of orientation
"""

import os
import logging
import base64
import io
from typing import Dict, Any, List, Optional, Union
from pathlib import Path
import requests
from PIL import Image, ImageOps
import numpy as np

# Import the base multimodal tools
from .mistral_multimodal_agent import OpenSourceMultimodalTools

logger = logging.getLogger(__name__)

class EnhancedRTLMultimodalTools(OpenSourceMultimodalTools):
    """
    Enhanced multimodal tools with improved rotated text recognition.

    Questions that look text-related are answered with an orientation-aware
    Mistral Vision prompt first and, if that is unavailable or fails, with a
    rotate-and-caption fallback. Every other question is delegated to the
    parent class.

    Key enhancements:
    1. Multi-orientation text analysis
    2. Enhanced prompting for rotated text
    3. Keyword-based routing of text-related questions
    """

    # Substrings that mark a question as asking about text in the image.
    _TEXT_KEYWORDS = (
        'text', 'read', 'words', 'letters', 'numbers', 'digits',
        'writing', 'written', 'says', 'message', 'content',
        'characters', 'alphabet', 'numeric', 'string', 'label',
        'title', 'caption', 'sign', 'document', 'page',
    )

    # (label, degrees) pairs tried by the fallback analysis; the degrees are
    # negated before being handed to ``Image.rotate``.
    _ORIENTATIONS = (
        ("normal", 0),
        ("rotated_90", 90),
        ("rotated_180", 180),
        ("rotated_270", 270),
    )

    # Image modes that can be written as PNG without conversion.
    _PNG_MODES = frozenset({"1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"})

    # Upper bound for downloading an image from a URL, so a stalled server
    # cannot block the agent forever.
    _URL_TIMEOUT_SECONDS = 30

    _VISION_MODEL = "pixtral-12b-2409"
    _SYNTHESIS_MODEL = "mistral-large-latest"

    def __init__(self):
        """Initialize the enhanced RTL multimodal agent."""
        super().__init__()
        logger.info("🔄 Enhanced RTL Multimodal Tools initialized")

    def analyze_image(self, image_input: Union[str, bytes, Image.Image, dict], question: Optional[str] = None) -> str:
        """
        Enhanced image analysis with improved rotated text recognition.

        Args:
            image_input: Image file path, URL, bytes, PIL Image, or dict with file_path
            question: Optional specific question about the image

        Returns:
            Analysis result, or an error message string if the image could
            not be loaded or the analysis raised.
        """
        try:
            image = self._convert_to_pil_image(image_input)
            # The converter reports every failure as a string, and not all of
            # those strings begin with "Error:" - so test the type, not the
            # prefix, to avoid passing an error message on as an image.
            if isinstance(image, str):
                return image

            # Enhanced analysis for text-related questions
            if question and self._is_text_related_question(question):
                return self._analyze_with_enhanced_text_recognition(image, question)

            # Fall back to standard analysis for non-text questions
            return super().analyze_image(image_input, question)

        except Exception as e:
            logger.error(f"Enhanced image analysis failed: {e}")
            return f"Error: {e}"

    def _convert_to_pil_image(self, image_input: Union[str, bytes, Image.Image, dict]) -> Union[Image.Image, str]:
        """
        Convert various input types to PIL Image.

        Args:
            image_input: A dict with a 'file_path' key, a local path or URL
                string, raw image bytes, or an already-loaded PIL Image.

        Returns:
            The PIL Image on success; otherwise a string describing the error.
        """
        try:
            if isinstance(image_input, dict):
                if 'file_path' not in image_input:
                    return "Error: Dictionary input must contain 'file_path' key"
                image_path = image_input['file_path']
                if not os.path.exists(image_path):
                    return f"Error: Image file not found: {image_path}"
                return Image.open(image_path)

            if isinstance(image_input, str):
                if os.path.exists(image_input):
                    return Image.open(image_input)
                # Not a local file, so assume it's a URL. Bound the wait and
                # reject HTTP error responses instead of decoding an error page.
                response = requests.get(image_input, timeout=self._URL_TIMEOUT_SECONDS)
                response.raise_for_status()
                return Image.open(io.BytesIO(response.content))

            if isinstance(image_input, (bytes, bytearray)):
                return Image.open(io.BytesIO(image_input))

            if isinstance(image_input, Image.Image):
                return image_input

            return "Error: Unsupported image input format"
        except Exception as e:
            return f"Error converting image: {e}"

    def _is_text_related_question(self, question: str) -> bool:
        """
        Determine if the question is asking about text content.

        The check is a case-insensitive substring match against a fixed
        keyword list; an empty or missing question is never text-related.
        """
        if not question:
            return False
        question_lower = question.lower()
        return any(keyword in question_lower for keyword in self._TEXT_KEYWORDS)

    def _analyze_with_enhanced_text_recognition(self, image: Image.Image, question: str) -> str:
        """
        Perform enhanced text recognition analysis with multiple orientations.

        Args:
            image: PIL Image object
            question: Question about text in the image

        Returns:
            The Mistral Vision answer when one is produced and does not start
            with "Error"; otherwise the multi-orientation fallback result.
        """
        try:
            # Try Mistral Vision with enhanced prompting first
            if getattr(self, 'mistral_client', None):
                result = self._analyze_with_enhanced_mistral_vision(image, question)
                if result and not result.startswith("Error"):
                    return result

            # Fallback to multi-orientation analysis
            return self._multi_orientation_text_analysis(image, question)

        except Exception as e:
            logger.error(f"Enhanced text recognition failed: {e}")
            return f"Error in enhanced text recognition: {e}"

    def _rtl_chat_completion(self, model: str, messages: list) -> str:
        """
        Send ``messages`` to ``model`` through whichever Mistral client is installed.

        Returns:
            The content of the first choice in the response.
        """
        from .mistral_multimodal_agent import MISTRAL_CLIENT_TYPE

        # The two client generations expose the chat endpoint differently.
        if MISTRAL_CLIENT_TYPE == "new":
            response = self.mistral_client.chat.complete(model=model, messages=messages)
        else:
            response = self.mistral_client.chat(model=model, messages=messages)

        return response.choices[0].message.content

    def _analyze_with_enhanced_mistral_vision(self, image: Image.Image, question: str) -> Optional[str]:
        """
        Analyze image using Mistral Vision with enhanced prompting for rotated text.

        Args:
            image: PIL Image object
            question: Question about the image

        Returns:
            Analysis result, or None if no client is configured or the call failed
        """
        # Nothing to do without a client; check before paying for the encode.
        if not getattr(self, 'mistral_client', None):
            return None

        try:
            # Modes such as CMYK cannot be written as PNG; normalise them
            # first so the image can still be sent.
            if image.mode not in self._PNG_MODES:
                image = image.convert("RGB")

            # Convert image to base64
            buffer = io.BytesIO()
            image.save(buffer, format='PNG')
            image_b64 = base64.b64encode(buffer.getvalue()).decode()

            # Create message with enhanced prompt
            from mistralai import UserMessage
            messages = [
                UserMessage(
                    content=[
                        {
                            "type": "text",
                            "text": self._create_enhanced_text_prompt(question)
                        },
                        {
                            "type": "image_url",
                            "image_url": f"data:image/png;base64,{image_b64}"
                        }
                    ]
                )
            ]

            return self._rtl_chat_completion(self._VISION_MODEL, messages)

        except Exception as e:
            logger.warning(f"Enhanced Mistral Vision analysis failed: {e}")
            return None

    def _create_enhanced_text_prompt(self, original_question: str) -> str:
        """
        Create an enhanced prompt specifically designed for rotated text recognition.

        Args:
            original_question: Original question about the image

        Returns:
            The original question followed by instructions to read text in
            every orientation.
        """
        enhanced_prompt = f"""
{original_question}

IMPORTANT INSTRUCTIONS FOR TEXT RECOGNITION:
- Look carefully for text in ALL orientations: normal (0°), rotated 90°, upside down (180°), and rotated 270°
- Text may appear in any direction - horizontal, vertical, or rotated
- Pay special attention to text that might be rotated or oriented differently than normal reading direction
- If you see text that appears sideways, upside down, or at an angle, please read it and include it in your response
- Look for numbers, letters, words, and any written content regardless of orientation
- Scan the entire image systematically for text in all possible orientations
- If text appears rotated, mentally rotate it to read it correctly
- Include ALL text you can identify, even if it's in an unusual orientation

Please provide a comprehensive reading of all text visible in the image, regardless of its orientation or direction.
"""
        return enhanced_prompt

    def _caption_all_orientations(self, image: Image.Image) -> List[str]:
        """
        Caption the image at each configured rotation with the vision pipeline.

        Returns:
            One "<orientation>: <caption>" entry per rotation that produced a
            non-blank caption; empty when no pipeline is available.
        """
        pipeline = getattr(self, 'vision_pipeline', None)
        if not pipeline:
            return []

        results = []
        for orientation_name, rotation in self._ORIENTATIONS:
            # A failure at one rotation must not discard the others.
            try:
                if rotation == 0:
                    rotated_image = image
                else:
                    rotated_image = image.rotate(-rotation, expand=True, fillcolor='white')

                caption_result = pipeline(rotated_image)
                caption = caption_result[0]['generated_text'] if caption_result else ""

                if caption and caption.strip():
                    results.append(f"{orientation_name}: {caption}")

            except Exception as e:
                logger.warning(f"Failed to analyze {orientation_name} orientation: {e}")

        return results

    def _multi_orientation_text_analysis(self, image: Image.Image, question: str) -> str:
        """
        Analyze text by trying multiple image orientations.

        Args:
            image: PIL Image object
            question: Question about text in the image

        Returns:
            A Mistral-synthesized answer when a client is available and the
            call succeeds; otherwise the raw per-orientation captions, or a
            fixed message when no orientation produced a caption.
        """
        try:
            all_results = self._caption_all_orientations(image)
            if not all_results:
                return "No text could be detected in any orientation"

            combined_result = "Text found in different orientations:\n" + "\n".join(all_results)

            # Use Mistral to synthesize the results if available
            if getattr(self, 'mistral_client', None):
                synthesis_prompt = f"""
                    Based on the following text recognition results from an image analyzed in different orientations, 
                    please provide a comprehensive answer to the question: "{question}"
                    
                    Recognition results:
                    {combined_result}
                    
                    Please synthesize this information and provide the most accurate and complete answer possible.
                    Focus on extracting all readable text regardless of its original orientation in the image.
                    """

                try:
                    from mistralai import UserMessage

                    return self._rtl_chat_completion(
                        self._SYNTHESIS_MODEL,
                        [UserMessage(content=synthesis_prompt)],
                    )
                except Exception as e:
                    # Synthesis is best-effort; fall through to the raw captions.
                    logger.warning(f"Failed to synthesize results: {e}")

            return combined_result

        except Exception as e:
            logger.error(f"Multi-orientation analysis failed: {e}")
            return f"Error in multi-orientation analysis: {e}"

    def get_enhanced_capabilities_status(self) -> Dict[str, Any]:
        """
        Get status of enhanced capabilities.

        Returns:
            The parent's capability status extended with the enhanced
            text-recognition flags, all set to True.
        """
        base_status = super().get_capabilities_status()

        enhanced_status = {
            **base_status,
            'enhanced_text_recognition': True,
            'multi_orientation_analysis': True,
            'rotated_text_support': True,
            'text_direction_detection': True
        }

        return enhanced_status