# NOTE: residue from the web page this file was copied from ("Spaces: Running / Running").
""" | |
Enhanced RTL (Rotated Text Layout) Multimodal Agent | |
This module enhances the existing multimodal capabilities with improved support for: | |
- Text in various orientations (0°, 90°, 180°, 270°) | |
- Multi-directional text detection | |
- Enhanced OCR prompting for rotated text | |
- Better text extraction regardless of orientation | |
""" | |
import base64
import io
import logging
import os
from pathlib import Path
from typing import Dict, Any, List, Optional, Union
from urllib.parse import urlparse

import numpy as np
import requests
from PIL import Image, ImageOps

# Import the base multimodal tools
from .mistral_multimodal_agent import OpenSourceMultimodalTools

logger = logging.getLogger(__name__)
class EnhancedRTLMultimodalTools(OpenSourceMultimodalTools):
    """
    Multimodal tools with extra handling for rotated text in images.

    ``analyze_image`` routes text-related questions through two stages:

    1. A Mistral vision request whose prompt explicitly asks the model to
       read text in every orientation (0°, 90°, 180°, 270°).
    2. If no Mistral client is configured or that request fails, the image
       is captioned at four rotations with ``self.vision_pipeline`` and the
       captions are optionally synthesized by a Mistral text model.

    Every other question is delegated to the parent implementation.
    """

    # Model identifiers used for the vision request and the caption synthesis.
    RTL_VISION_MODEL = "pixtral-12b-2409"
    RTL_SYNTHESIS_MODEL = "mistral-large-latest"

    # Seconds to wait when downloading an image from a URL.
    RTL_DOWNLOAD_TIMEOUT = 30

    # Image modes that can be written as PNG without conversion; anything
    # else (e.g. CMYK) is converted to RGB before encoding.
    _RTL_PNG_MODES = frozenset(
        {"1", "L", "LA", "I", "I;16", "I;16B", "P", "RGB", "RGBA"}
    )

    # Substrings that mark a question as being about text in the image.
    _RTL_TEXT_KEYWORDS = (
        'text', 'read', 'words', 'letters', 'numbers', 'digits',
        'writing', 'written', 'says', 'message', 'content',
        'characters', 'alphabet', 'numeric', 'string', 'label',
        'title', 'caption', 'sign', 'document', 'page',
    )

    # (label, clockwise rotation in degrees) tried by the fallback analysis.
    _RTL_ORIENTATIONS = (
        ("normal", 0),
        ("rotated_90", 90),
        ("rotated_180", 180),
        ("rotated_270", 270),
    )

    # Instruction lines appended to the user's question for the vision model.
    _RTL_PROMPT_INSTRUCTIONS = (
        "IMPORTANT INSTRUCTIONS FOR TEXT RECOGNITION:",
        "- Look carefully for text in ALL orientations: normal (0°), rotated 90°, upside down (180°), and rotated 270°",
        "- Text may appear in any direction - horizontal, vertical, or rotated",
        "- Pay special attention to text that might be rotated or oriented differently than normal reading direction",
        "- If you see text that appears sideways, upside down, or at an angle, please read it and include it in your response",
        "- Look for numbers, letters, words, and any written content regardless of orientation",
        "- Scan the entire image systematically for text in all possible orientations",
        "- If text appears rotated, mentally rotate it to read it correctly",
        "- Include ALL text you can identify, even if it's in an unusual orientation",
        "Please provide a comprehensive reading of all text visible in the image, regardless of its orientation or direction.",
    )

    def __init__(self):
        """Initialize the parent tools and log that the RTL variant is active."""
        super().__init__()
        logger.info("🔄 Enhanced RTL Multimodal Tools initialized")

    def analyze_image(
        self,
        image_input: Union[str, bytes, Image.Image, dict],
        question: Optional[str] = None,
    ) -> str:
        """
        Analyze an image, using rotated-text handling for text questions.

        Args:
            image_input: Image file path or URL, raw bytes, PIL Image, or a
                dict with a ``'file_path'`` key.
            question: Optional specific question about the image.

        Returns:
            The analysis text, or an error message string. Exceptions are
            caught and reported as ``"Error: ..."`` rather than raised.
        """
        try:
            image = self._convert_to_pil_image(image_input)
            # The converter signals every failure by returning a message
            # string. Not all of them start with "Error:" (for example
            # "Error converting image: ..."), so test the type, not a prefix.
            if isinstance(image, str):
                return image

            if question and self._is_text_related_question(question):
                return self._analyze_with_enhanced_text_recognition(image, question)

            # Non-text questions: hand the original input to the parent,
            # which resolves it on its own.
            return super().analyze_image(image_input, question)
        except Exception as e:
            logger.error("Enhanced image analysis failed: %s", e)
            return f"Error: {e}"

    def _convert_to_pil_image(
        self, image_input: Union[str, bytes, Image.Image, dict]
    ) -> Union[Image.Image, str]:
        """
        Convert the supported input types to a PIL Image.

        Args:
            image_input: PIL Image, bytes, a dict with ``'file_path'``, or a
                string holding either an existing local path or an
                http(s) URL.

        Returns:
            A PIL Image on success, otherwise an error message string
            (callers must treat any ``str`` result as a failure).
        """
        try:
            if isinstance(image_input, Image.Image):
                return image_input

            if isinstance(image_input, (bytes, bytearray)):
                return Image.open(io.BytesIO(image_input))

            if isinstance(image_input, dict):
                if 'file_path' not in image_input:
                    return "Error: Dictionary input must contain 'file_path' key"
                image_path = image_input['file_path']
                if os.path.exists(image_path):
                    return Image.open(image_path)
                return f"Error: Image file not found: {image_path}"

            if isinstance(image_input, str):
                if os.path.exists(image_input):
                    return Image.open(image_input)
                # Only attempt a download for real http(s) URLs; a mistyped
                # local path should be reported as a missing file.
                if urlparse(image_input).scheme in ("http", "https"):
                    response = requests.get(
                        image_input, timeout=self.RTL_DOWNLOAD_TIMEOUT
                    )
                    # Fail on 4xx/5xx instead of trying to decode an error page.
                    response.raise_for_status()
                    return Image.open(io.BytesIO(response.content))
                return f"Error: Image file not found: {image_input}"

            return "Error: Unsupported image input format"
        except Exception as e:
            return f"Error converting image: {e}"

    def _is_text_related_question(self, question: str) -> bool:
        """Return True if the question contains any text-related keyword.

        Matching is a case-insensitive substring test; an empty or ``None``
        question is never text-related.
        """
        if not question:
            return False
        question_lower = question.lower()
        return any(keyword in question_lower for keyword in self._RTL_TEXT_KEYWORDS)

    def _analyze_with_enhanced_text_recognition(self, image: Image.Image, question: str) -> str:
        """
        Answer a text question, preferring Mistral vision over the fallback.

        Args:
            image: PIL Image object.
            question: Question about text in the image.

        Returns:
            The vision model's answer when it is non-empty and does not start
            with "Error"; otherwise the multi-orientation analysis result.
        """
        try:
            if getattr(self, "mistral_client", None):
                result = self._analyze_with_enhanced_mistral_vision(image, question)
                if result and not result.startswith("Error"):
                    return result
            return self._multi_orientation_text_analysis(image, question)
        except Exception as e:
            logger.error("Enhanced text recognition failed: %s", e)
            return f"Error in enhanced text recognition: {e}"

    def _analyze_with_enhanced_mistral_vision(self, image: Image.Image, question: str) -> Optional[str]:
        """
        Ask the Mistral vision model about the image with a rotated-text prompt.

        Args:
            image: PIL Image object.
            question: Question about the image.

        Returns:
            The model's reply, or None when no client is configured or the
            request fails for any reason (the failure is logged as a warning).
        """
        # Skip the encoding work entirely when there is nothing to call.
        if not getattr(self, "mistral_client", None):
            return None
        try:
            image_b64 = self._rtl_encode_png_base64(image)
            enhanced_prompt = self._create_enhanced_text_prompt(question)

            # Imported lazily so the module loads without mistralai installed.
            from mistralai import UserMessage
            messages = [
                UserMessage(
                    content=[
                        {"type": "text", "text": enhanced_prompt},
                        {
                            "type": "image_url",
                            "image_url": f"data:image/png;base64,{image_b64}",
                        },
                    ]
                )
            ]
            return self._rtl_chat_completion(self.RTL_VISION_MODEL, messages)
        except Exception as e:
            logger.warning("Enhanced Mistral Vision analysis failed: %s", e)
            return None

    def _rtl_encode_png_base64(self, image: Image.Image) -> str:
        """Return the image as base64-encoded PNG data.

        Modes that cannot be written as PNG are converted to RGB first.
        """
        if image.mode not in self._RTL_PNG_MODES:
            image = image.convert("RGB")
        buffer = io.BytesIO()
        image.save(buffer, format='PNG')
        return base64.b64encode(buffer.getvalue()).decode()

    def _rtl_chat_completion(self, model: str, messages: List[Any]) -> Any:
        """Send messages to ``self.mistral_client`` and return the first reply's content.

        Uses ``chat.complete`` when ``MISTRAL_CLIENT_TYPE`` is "new" and the
        callable ``chat`` otherwise. Errors propagate to the caller.
        """
        from .mistral_multimodal_agent import MISTRAL_CLIENT_TYPE
        if MISTRAL_CLIENT_TYPE == "new":
            response = self.mistral_client.chat.complete(model=model, messages=messages)
        else:
            response = self.mistral_client.chat(model=model, messages=messages)
        return response.choices[0].message.content

    def _create_enhanced_text_prompt(self, original_question: str) -> str:
        """
        Build the vision prompt: the question followed by rotated-text instructions.

        Args:
            original_question: Original question about the image.

        Returns:
            A newline-delimited prompt that starts and ends with a newline.
        """
        return "\n".join(
            ("", f"{original_question}", *self._RTL_PROMPT_INSTRUCTIONS, "")
        )

    def _multi_orientation_text_analysis(self, image: Image.Image, question: str) -> str:
        """
        Caption the image at four rotations and combine the results.

        Args:
            image: PIL Image object.
            question: Question about text in the image.

        Returns:
            A Mistral synthesis of the captions when a client is configured
            and the request succeeds; otherwise the raw combined captions, a
            "no text" message when nothing was captioned, or an error message.
        """
        try:
            captions = self._rtl_caption_orientations(image)
            if not captions:
                return "No text could be detected in any orientation"

            combined_result = "Text found in different orientations:\n" + "\n".join(captions)
            if not getattr(self, "mistral_client", None):
                return combined_result

            synthesis_prompt = "\n".join((
                "",
                "Based on the following text recognition results from an image analyzed in different orientations,",
                f'please provide a comprehensive answer to the question: "{question}"',
                "Recognition results:",
                combined_result,
                "Please synthesize this information and provide the most accurate and complete answer possible.",
                "Focus on extracting all readable text regardless of its original orientation in the image.",
                "",
            ))
            try:
                from mistralai import UserMessage
                return self._rtl_chat_completion(
                    self.RTL_SYNTHESIS_MODEL,
                    [UserMessage(content=synthesis_prompt)],
                )
            except Exception as e:
                # Synthesis is best-effort; the raw captions are still useful.
                logger.warning("Failed to synthesize results: %s", e)
                return combined_result
        except Exception as e:
            logger.error("Multi-orientation analysis failed: %s", e)
            return f"Error in multi-orientation analysis: {e}"

    def _rtl_caption_orientations(self, image: Image.Image) -> List[str]:
        """Return ``"<orientation>: <caption>"`` lines for each rotation that yields a caption.

        Returns an empty list when no ``vision_pipeline`` is available. A
        failure in one orientation is logged and does not stop the others.
        """
        pipeline = getattr(self, "vision_pipeline", None)
        if not pipeline:
            # Without a pipeline no orientation can produce a caption, so
            # avoid rotating the image at all.
            return []

        captions: List[str] = []
        for orientation_name, rotation in self._RTL_ORIENTATIONS:
            try:
                if rotation == 0:
                    rotated_image = image
                else:
                    # Negative angle: PIL rotates counter-clockwise.
                    rotated_image = image.rotate(-rotation, expand=True, fillcolor='white')
                caption_result = pipeline(rotated_image)
                caption = caption_result[0]['generated_text'] if caption_result else ""
                if caption and caption.strip():
                    captions.append(f"{orientation_name}: {caption}")
            except Exception as e:
                logger.warning("Failed to analyze %s orientation: %s", orientation_name, e)
        return captions

    def get_enhanced_capabilities_status(self) -> Dict[str, Any]:
        """Return the parent's capability status plus the enhanced-feature flags.

        The four enhanced flags are constant ``True`` values; they are not
        derived from whether a client or pipeline is actually available.
        """
        return {
            **super().get_capabilities_status(),
            'enhanced_text_recognition': True,
            'multi_orientation_analysis': True,
            'rotated_text_support': True,
            'text_direction_detection': True,
        }