# File size: 13,161 Bytes
# 9a6a4dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
"""
Enhanced RTL (Rotated Text Layout) Multimodal Agent

This module enhances the existing multimodal capabilities with improved support for:
- Text in various orientations (0°, 90°, 180°, 270°)
- Multi-directional text detection
- Enhanced OCR prompting for rotated text
- Better text extraction regardless of orientation
"""

import os
import logging
import base64
import io
from typing import Dict, Any, List, Optional, Union
from pathlib import Path
import requests
from PIL import Image, ImageOps
import numpy as np

# Import the base multimodal tools
from .mistral_multimodal_agent import OpenSourceMultimodalTools

logger = logging.getLogger(__name__)

class EnhancedRTLMultimodalTools(OpenSourceMultimodalTools):
    """
    Multimodal tools with improved rotated text recognition.

    Questions that look text-related are answered by a Mistral Vision call
    that uses a rotation-aware prompt. When no Mistral client is configured,
    or that call fails, the image is captioned at 0°, 90°, 180° and 270° with
    the vision pipeline and the captions are combined. Every other question
    is delegated to the parent class.
    """

    # Substrings that mark a question as being about text in the image.
    # Matching is a plain case-insensitive substring test.
    _TEXT_KEYWORDS = (
        'text', 'read', 'words', 'letters', 'numbers', 'digits',
        'writing', 'written', 'says', 'message', 'content',
        'characters', 'alphabet', 'numeric', 'string', 'label',
        'title', 'caption', 'sign', 'document', 'page',
    )

    # (label, clockwise rotation in degrees) tried by the fallback analysis.
    _TEXT_ORIENTATIONS = (
        ("normal", 0),
        ("rotated_90", 90),
        ("rotated_180", 180),
        ("rotated_270", 270),
    )

    # Image modes Pillow's PNG writer accepts; anything else (e.g. CMYK)
    # is converted to RGB before encoding.
    _PNG_WRITABLE_MODES = ("1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA")

    # Upper bound, in seconds, for downloading an image from a URL.
    _URL_TIMEOUT_SECONDS = 30

    def __init__(self):
        """Initialize the parent tools and log that the enhanced variant is active."""
        super().__init__()
        logger.info("🔄 Enhanced RTL Multimodal Tools initialized")

    def analyze_image(self, image_input: Union[str, bytes, Image.Image, dict], question: Optional[str] = None) -> str:
        """
        Analyze an image, using rotation-aware text recognition when relevant.

        Args:
            image_input: Image file path or URL, bytes, PIL Image, or dict with file_path
            question: Optional specific question about the image

        Returns:
            The analysis result, or a string starting with "Error" on failure
        """
        try:
            image = self._convert_to_pil_image(image_input)

            if isinstance(image, str):
                # The converter reports every failure as a string.
                if image.startswith("Error:"):
                    return image
                # Failures raised inside the converter are prefixed
                # "Error converting image" and must never be treated as an
                # image; let the parent deal with the original input instead.
                logger.warning("%s; using standard analysis", image)
            elif question and self._is_text_related_question(question):
                return self._analyze_with_enhanced_text_recognition(image, question)

            # Standard analysis for non-text questions (and unconvertible input)
            return super().analyze_image(image_input, question)

        except Exception as e:
            logger.error("Enhanced image analysis failed: %s", e)
            return f"Error: {e}"

    def _convert_to_pil_image(self, image_input: Union[str, bytes, Image.Image, dict]) -> Union[Image.Image, str]:
        """
        Convert the supported input types to a PIL Image.

        Args:
            image_input: dict with a 'file_path' key, a local path or URL
                string, raw image bytes, or a PIL Image

        Returns:
            The PIL Image, or an error string ("Error: ..." for rejected
            input, "Error converting image: ..." when loading raised)
        """
        try:
            if isinstance(image_input, dict):
                if 'file_path' not in image_input:
                    return "Error: Dictionary input must contain 'file_path' key"
                image_path = image_input['file_path']
                if not os.path.exists(image_path):
                    return f"Error: Image file not found: {image_path}"
                return Image.open(image_path)

            if isinstance(image_input, str):
                if os.path.exists(image_input):
                    return Image.open(image_input)
                # Not a local file: assume it's a URL. The timeout keeps a
                # stalled server from blocking forever, and raise_for_status
                # surfaces HTTP errors instead of decoding an error page.
                response = requests.get(image_input, timeout=self._URL_TIMEOUT_SECONDS)
                response.raise_for_status()
                return Image.open(io.BytesIO(response.content))

            if isinstance(image_input, bytes):
                return Image.open(io.BytesIO(image_input))

            if isinstance(image_input, Image.Image):
                return image_input

            return "Error: Unsupported image input format"
        except Exception as e:
            return f"Error converting image: {e}"

    def _is_text_related_question(self, question: str) -> bool:
        """Return True when the question contains any text-related keyword."""
        if not question:
            return False
        question_lower = question.lower()
        return any(keyword in question_lower for keyword in self._TEXT_KEYWORDS)

    def _analyze_with_enhanced_text_recognition(self, image: Image.Image, question: str) -> str:
        """
        Answer a text question, preferring Mistral Vision over the fallback.

        Args:
            image: PIL Image object
            question: Question about text in the image

        Returns:
            The Mistral Vision answer when one is produced, otherwise the
            multi-orientation analysis result
        """
        try:
            # Try Mistral Vision with enhanced prompting first
            if getattr(self, 'mistral_client', None):
                result = self._analyze_with_enhanced_mistral_vision(image, question)
                if result and not result.startswith("Error"):
                    return result

            # Fallback to multi-orientation analysis
            return self._multi_orientation_text_analysis(image, question)

        except Exception as e:
            logger.error("Enhanced text recognition failed: %s", e)
            return f"Error in enhanced text recognition: {e}"

    def _request_mistral_completion(self, model: str, messages: list) -> Any:
        """
        Send messages to the Mistral client and return the first choice's content.

        Picks `chat.complete` or `chat` according to MISTRAL_CLIENT_TYPE from
        the base module. Exceptions propagate to the caller.
        """
        from .mistral_multimodal_agent import MISTRAL_CLIENT_TYPE

        if MISTRAL_CLIENT_TYPE == "new":
            response = self.mistral_client.chat.complete(model=model, messages=messages)
        else:
            response = self.mistral_client.chat(model=model, messages=messages)

        return response.choices[0].message.content

    def _analyze_with_enhanced_mistral_vision(self, image: Image.Image, question: str) -> Optional[str]:
        """
        Analyze image using Mistral Vision with enhanced prompting for rotated text.

        Args:
            image: PIL Image object
            question: Question about the image

        Returns:
            Analysis result, or None if no client is configured or the call failed
        """
        try:
            if not getattr(self, 'mistral_client', None):
                return None

            # The PNG writer rejects some modes (CMYK, YCbCr, ...); convert a
            # copy so the caller's image is left untouched.
            encodable = image
            if image.mode not in self._PNG_WRITABLE_MODES:
                encodable = image.convert("RGB")

            # Convert image to base64
            buffer = io.BytesIO()
            encodable.save(buffer, format='PNG')
            image_b64 = base64.b64encode(buffer.getvalue()).decode()

            from mistralai import UserMessage
            messages = [
                UserMessage(
                    content=[
                        {
                            "type": "text",
                            "text": self._create_enhanced_text_prompt(question)
                        },
                        {
                            "type": "image_url",
                            "image_url": f"data:image/png;base64,{image_b64}"
                        }
                    ]
                )
            ]

            return self._request_mistral_completion("pixtral-12b-2409", messages)

        except Exception as e:
            logger.warning("Enhanced Mistral Vision analysis failed: %s", e)
            return None

    def _create_enhanced_text_prompt(self, original_question: str) -> str:
        """
        Build the rotation-aware prompt sent along with the image.

        Args:
            original_question: Original question about the image

        Returns:
            The question followed by instructions to read text in every orientation
        """
        return f"""
{original_question}

IMPORTANT INSTRUCTIONS FOR TEXT RECOGNITION:
- Look carefully for text in ALL orientations: normal (0°), rotated 90°, upside down (180°), and rotated 270°
- Text may appear in any direction - horizontal, vertical, or rotated
- Pay special attention to text that might be rotated or oriented differently than normal reading direction
- If you see text that appears sideways, upside down, or at an angle, please read it and include it in your response
- Look for numbers, letters, words, and any written content regardless of orientation
- Scan the entire image systematically for text in all possible orientations
- If text appears rotated, mentally rotate it to read it correctly
- Include ALL text you can identify, even if it's in an unusual orientation

Please provide a comprehensive reading of all text visible in the image, regardless of its orientation or direction.
"""

    def _multi_orientation_text_analysis(self, image: Image.Image, question: str) -> str:
        """
        Caption the image at several rotations and combine the results.

        Args:
            image: PIL Image object
            question: Question about text in the image

        Returns:
            A Mistral synthesis of the captions when a client is available
            and answers, otherwise the captions listed per orientation, or a
            fixed message when nothing was produced
        """
        try:
            # Without a pipeline no orientation can yield a caption, so skip
            # the rotations entirely.
            pipeline = getattr(self, 'vision_pipeline', None)
            if not pipeline:
                return "No text could be detected in any orientation"

            all_results = []
            for orientation_name, rotation in self._TEXT_ORIENTATIONS:
                try:
                    if rotation == 0:
                        rotated_image = image
                    else:
                        # Negative angle: PIL rotates counter-clockwise
                        rotated_image = image.rotate(-rotation, expand=True, fillcolor='white')

                    caption_result = pipeline(rotated_image)
                    caption = caption_result[0]['generated_text'] if caption_result else ""

                    if caption and caption.strip():
                        all_results.append(f"{orientation_name}: {caption}")

                except Exception as e:
                    # One bad orientation must not discard the others
                    logger.warning("Failed to analyze %s orientation: %s", orientation_name, e)

            if not all_results:
                return "No text could be detected in any orientation"

            combined_result = "Text found in different orientations:\n" + "\n".join(all_results)

            # Use Mistral to synthesize the results if available
            if getattr(self, 'mistral_client', None):
                synthesis_prompt = f"""
                    Based on the following text recognition results from an image analyzed in different orientations, 
                    please provide a comprehensive answer to the question: "{question}"
                    
                    Recognition results:
                    {combined_result}
                    
                    Please synthesize this information and provide the most accurate and complete answer possible.
                    Focus on extracting all readable text regardless of its original orientation in the image.
                    """

                try:
                    from mistralai import UserMessage

                    synthesized = self._request_mistral_completion(
                        "mistral-large-latest",
                        [UserMessage(content=synthesis_prompt)],
                    )
                    # An empty answer is less useful than the raw captions
                    if synthesized:
                        return synthesized
                except Exception as e:
                    logger.warning("Failed to synthesize results: %s", e)

            return combined_result

        except Exception as e:
            logger.error("Multi-orientation analysis failed: %s", e)
            return f"Error in multi-orientation analysis: {e}"

    def get_enhanced_capabilities_status(self) -> Dict[str, Any]:
        """Return the parent's capability status extended with the enhanced flags."""
        base_status = super().get_capabilities_status()

        return {
            **base_status,
            'enhanced_text_recognition': True,
            'multi_orientation_analysis': True,
            'rotated_text_support': True,
            'text_direction_detection': True,
        }