# gaia-enhanced-agent/tools/enhanced_ocr_engine.py
# GAIA Agent Deployment
# Deploy Complete Enhanced GAIA Agent with Phase 1-6 Improvements
# Commit: 9a6a4dc
"""
Enhanced OCR Engine for GAIA Agent - Phase 6
Handles multi-orientation text recognition, rotated/distorted text, and advanced OCR
"""
import logging
import os
import re
import tempfile
from pathlib import Path
from typing import Dict, Any, List, Optional, Tuple

import numpy as np

# Image processing
try:
    from PIL import Image, ImageEnhance, ImageFilter, ImageOps
    PIL_AVAILABLE = True
except ImportError:
    PIL_AVAILABLE = False

# OCR engine
try:
    import pytesseract
    PYTESSERACT_AVAILABLE = True
except ImportError:
    PYTESSERACT_AVAILABLE = False

# Computer vision for advanced processing
try:
    import cv2
    CV2_AVAILABLE = True
except ImportError:
    CV2_AVAILABLE = False
logger = logging.getLogger(__name__)
class EnhancedOCREngine:
    """
    Enhanced OCR engine for complex text recognition scenarios.

    Features:
    - Multi-orientation text recognition (0°, 90°, 180°, 270°)
    - Rotated and distorted text handling
    - Multi-language OCR support
    - Text quality enhancement and preprocessing
    - Confidence scoring for OCR results
    - Advanced text extraction from complex layouts

    The engine degrades gracefully: when Pillow, pytesseract or the Tesseract
    binary is missing, ``self.available`` is False and the OCR entry points
    return empty results instead of raising.
    """

    # Tesseract command-line presets keyed by layout scenario.
    # Tesseract page segmentation modes: 7 = single text line, 8 = single word.
    OCR_CONFIGS = {
        'default': '--oem 3 --psm 6',
        'single_line': '--oem 3 --psm 7',
        'single_word': '--oem 3 --psm 8',
        'sparse_text': '--oem 3 --psm 11',
        'single_char': '--oem 3 --psm 10',
        'vertical_text': '--oem 3 --psm 5',
        'uniform_block': '--oem 3 --psm 6',
    }

    # Rotation angles (degrees) tried when searching for upright text.
    ORIENTATIONS = (0, 90, 180, 270)

    # Tesseract language codes advertised for multi-language support.
    SUPPORTED_LANGUAGES = (
        'eng', 'ara', 'chi_sim', 'chi_tra', 'fra', 'deu', 'spa', 'rus',
        'jpn', 'kor', 'hin', 'tha', 'vie', 'heb', 'tur', 'pol', 'nld',
        'ita', 'por', 'swe', 'dan', 'nor', 'fin', 'ces', 'hun', 'ron',
    )

    # Digit -> (uppercase letter, lowercase letter) for digits OCR commonly
    # confuses with letters.
    _CONFUSABLE_DIGITS = {
        '0': ('O', 'o'),
        '1': ('I', 'l'),
        '5': ('S', 's'),
        '8': ('B', 'b'),
    }

    # A run of confusable digits with a letter directly on each side
    # ([^\W\d_] matches any Unicode letter).
    _EMBEDDED_DIGITS_RE = re.compile(r'(?<=[^\W\d_])[0158]+(?=[^\W\d_])')

    # Whitespace before , ! ? or before a '.' that does not start a decimal
    # such as ".5".
    _SPACE_BEFORE_PUNCT_RE = re.compile(r'\s+([,!?]|\.(?!\d))')

    def __init__(self):
        """Initialize the enhanced OCR engine and probe its dependencies."""
        self.name = "enhanced_ocr_engine"
        self.description = "Enhanced OCR for multi-orientation text, rotated/distorted text, and complex layouts"

        # Set configuration before any early return so these attributes exist
        # even on an unavailable engine. Copies keep per-instance edits from
        # leaking into the class-level defaults.
        self.ocr_configs = dict(self.OCR_CONFIGS)
        self.orientations = list(self.ORIENTATIONS)
        self.supported_languages = list(self.SUPPORTED_LANGUAGES)

        # Check dependencies
        self.available = PIL_AVAILABLE and PYTESSERACT_AVAILABLE
        if not self.available:
            missing = []
            if not PIL_AVAILABLE:
                missing.append("PIL/Pillow")
            if not PYTESSERACT_AVAILABLE:
                missing.append("pytesseract")
            logger.warning(f"⚠️ Enhanced OCR Engine not available - missing: {', '.join(missing)}")
            return

        # Test tesseract installation
        try:
            pytesseract.get_tesseract_version()
            logger.info("✅ Tesseract OCR engine detected")
        except Exception as e:
            logger.warning(f"⚠️ Tesseract not properly installed: {e}")
            self.available = False
            return

        logger.info("✅ Enhanced OCR Engine initialized")

    @staticmethod
    def _parse_confidence(raw: Any) -> int:
        """
        Convert one Tesseract confidence cell to an int.

        Args:
            raw: Confidence value as found in the OCR data (int, float or string)

        Returns:
            The truncated integer confidence, or -1 when the value cannot be
            parsed (-1 is never counted as recognized text by the callers).
        """
        try:
            # Go through float first so strings such as '95.43' are accepted.
            return int(float(raw))
        except (TypeError, ValueError, OverflowError):
            return -1

    def preprocess_image(self, image: "Image.Image", enhancement_level: str = 'medium') -> "Image.Image":
        """
        Preprocess image for better OCR results.

        Args:
            image: PIL Image object
            enhancement_level: 'light', 'medium', 'heavy'; any other value is
                treated like 'light'

        Returns:
            Preprocessed (grayscale) PIL Image. The input is returned as-is
            when it is not a PIL image or Pillow is unavailable; on failure
            the image is returned in whatever state it had reached.
        """
        # Checking PIL_AVAILABLE first avoids a NameError on `Image` when
        # Pillow could not be imported.
        if not PIL_AVAILABLE or not isinstance(image, Image.Image):
            return image

        try:
            # Convert to RGB if necessary
            if image.mode != 'RGB':
                image = image.convert('RGB')

            # Apply enhancements based on level
            if enhancement_level in ('medium', 'heavy'):
                image = ImageEnhance.Contrast(image).enhance(1.2)
                image = ImageEnhance.Sharpness(image).enhance(1.1)

            if enhancement_level == 'heavy':
                # Reduce noise, then lift brightness slightly
                image = image.filter(ImageFilter.MedianFilter(size=3))
                image = ImageEnhance.Brightness(image).enhance(1.05)

            # Convert to grayscale for better OCR
            image = ImageOps.grayscale(image)

            # Increase contrast for text
            image = ImageEnhance.Contrast(image).enhance(1.3)
            return image
        except Exception as e:
            logger.warning(f"Image preprocessing failed: {e}")
            return image

    def rotate_image(self, image: "Image.Image", angle: int) -> "Image.Image":
        """
        Rotate image by specified angle.

        Args:
            image: PIL Image object
            angle: Rotation angle in degrees

        Returns:
            Rotated PIL Image with a white background fill, or the original
            image when the angle is 0 or rotation fails
        """
        try:
            if angle == 0:
                return image
            # The angle is negated before being handed to PIL; expand=True
            # grows the canvas so nothing is cropped.
            return image.rotate(-angle, expand=True, fillcolor='white')
        except Exception as e:
            logger.warning(f"Image rotation failed: {e}")
            return image

    def detect_text_orientation(self, image: "Image.Image") -> Dict[str, Any]:
        """
        Detect the orientation of text in the image.

        Each candidate angle is OCR'd and the angle with the highest mean
        word confidence wins.

        Args:
            image: PIL Image object

        Returns:
            Dictionary with orientation detection results
            ('best_orientation', 'confidence', 'orientations_tested', 'method')
        """
        result = {
            'best_orientation': 0,
            'confidence': 0.0,
            'orientations_tested': [],
            'method': 'ocr_confidence'
        }
        if not self.available:
            return result

        try:
            best_confidence = 0
            best_orientation = 0
            orientation_results = []

            # Test each orientation
            for angle in self.orientations:
                rotated_image = self.rotate_image(image, angle)
                preprocessed = self.preprocess_image(rotated_image, 'light')

                # Get OCR data with confidence
                try:
                    data = pytesseract.image_to_data(
                        preprocessed,
                        config=self.ocr_configs['default'],
                        output_type=pytesseract.Output.DICT
                    )
                    # Only strictly positive confidences count as detected text
                    confidences = [
                        conf
                        for conf in map(self._parse_confidence, data['conf'])
                        if conf > 0
                    ]
                    avg_confidence = sum(confidences) / len(confidences) if confidences else 0
                    orientation_results.append({
                        'angle': angle,
                        'confidence': avg_confidence,
                        'text_blocks': len(confidences)
                    })
                    if avg_confidence > best_confidence:
                        best_confidence = avg_confidence
                        best_orientation = angle
                except Exception as e:
                    # A failure at one angle must not abort the other angles
                    logger.warning(f"OCR failed for orientation {angle}: {e}")
                    orientation_results.append({
                        'angle': angle,
                        'confidence': 0,
                        'text_blocks': 0
                    })

            result['best_orientation'] = best_orientation
            result['confidence'] = best_confidence
            result['orientations_tested'] = orientation_results
        except Exception as e:
            logger.warning(f"Orientation detection failed: {e}")
        return result

    def extract_text_with_confidence(self, image: "Image.Image", config: str = 'default',
                                     languages: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Extract text from image with confidence scores.

        Args:
            image: PIL Image object
            config: OCR configuration key; unknown keys fall back to 'default'
            languages: List of language codes to use (defaults to ['eng'])

        Returns:
            Dictionary with text extraction results ('text', 'confidence',
            'word_confidences', 'bounding_boxes', 'languages_used'). Empty
            defaults are returned when the engine is unavailable or OCR fails.
        """
        result = {
            'text': '',
            'confidence': 0.0,
            'word_confidences': [],
            'bounding_boxes': [],
            'languages_used': languages or ['eng']
        }
        if not self.available:
            return result

        try:
            # Prepare language string ("eng+fra" style)
            lang_string = '+'.join(languages) if languages else 'eng'

            # Get OCR configuration
            ocr_config = self.ocr_configs.get(config, self.ocr_configs['default'])

            # Extract text with detailed data; the language goes through
            # pytesseract's dedicated `lang` argument rather than being
            # spliced into the config string.
            data = pytesseract.image_to_data(
                image,
                lang=lang_string,
                config=ocr_config,
                output_type=pytesseract.Output.DICT
            )

            # Process results
            words = []
            confidences = []
            boxes = []
            columns = zip(data['text'], data['conf'], data['left'],
                          data['top'], data['width'], data['height'])
            for raw_text, raw_conf, left, top, width, height in columns:
                # str() guards against numeric tokens that were not left as strings
                text = str(raw_text).strip()
                conf = self._parse_confidence(raw_conf)
                if text and conf > 0:
                    words.append(text)
                    confidences.append(conf)
                    boxes.append({
                        'x': left,
                        'y': top,
                        'width': width,
                        'height': height,
                        'text': text,
                        'confidence': conf
                    })

            # Combine results
            result['text'] = ' '.join(words)
            result['confidence'] = sum(confidences) / len(confidences) if confidences else 0
            result['word_confidences'] = confidences
            result['bounding_boxes'] = boxes
        except Exception as e:
            logger.warning(f"Text extraction failed: {e}")
        return result

    def process_multi_orientation_ocr(self, image: "Image.Image",
                                      auto_detect_orientation: bool = True) -> Dict[str, Any]:
        """
        Process OCR with multiple orientations and return best result.

        Args:
            image: PIL Image object
            auto_detect_orientation: When True, pick the orientation via
                detect_text_orientation and run full extraction once; when
                False, run full extraction at every orientation and keep the
                most confident result

        Returns:
            Dictionary with best OCR results
        """
        result = {
            'text': '',
            'confidence': 0.0,
            'best_orientation': 0,
            'orientation_results': [],
            'preprocessing_applied': True
        }
        if not self.available:
            return result

        try:
            # Preprocess image
            preprocessed = self.preprocess_image(image, 'medium')

            if auto_detect_orientation:
                # Detect best orientation first
                orientation_info = self.detect_text_orientation(preprocessed)
                best_angle = orientation_info['best_orientation']

                # Process with best orientation
                rotated = self.rotate_image(preprocessed, best_angle)
                result.update(self.extract_text_with_confidence(rotated))
                result['best_orientation'] = best_angle
                result['orientation_results'] = orientation_info['orientations_tested']
            else:
                # Try all orientations and pick best
                best_confidence = 0
                best_result = None
                best_angle = 0
                orientation_results = []
                for angle in self.orientations:
                    rotated = self.rotate_image(preprocessed, angle)
                    ocr_result = self.extract_text_with_confidence(rotated)
                    orientation_results.append({
                        'angle': angle,
                        'confidence': ocr_result['confidence'],
                        'text_length': len(ocr_result['text']),
                        'word_count': len(ocr_result['text'].split())
                    })
                    if ocr_result['confidence'] > best_confidence:
                        best_confidence = ocr_result['confidence']
                        best_result = ocr_result
                        best_angle = angle

                if best_result:
                    result.update(best_result)
                result['best_orientation'] = best_angle
                result['orientation_results'] = orientation_results
        except Exception as e:
            logger.error(f"Multi-orientation OCR failed: {e}")
        return result

    def process_image_file(self, image_path: str, **kwargs) -> Dict[str, Any]:
        """
        Process an image file with enhanced OCR.

        Args:
            image_path: Path to image file
            **kwargs: Additional arguments forwarded to
                process_multi_orientation_ocr

        Returns:
            Dictionary with OCR results; 'success' is False and 'error' is set
            when the engine is unavailable or the file cannot be processed
        """
        result = {
            'success': False,
            'error': '',
            'text': '',
            'confidence': 0.0
        }
        if not self.available:
            result['error'] = 'OCR engine not available'
            return result

        try:
            # The context manager closes the underlying file handle once OCR
            # is done, including on error.
            with Image.open(image_path) as image:
                ocr_result = self.process_multi_orientation_ocr(image, **kwargs)
            result['success'] = True
            result.update(ocr_result)
        except Exception as e:
            result['error'] = str(e)
            logger.error(f"Image file processing failed: {e}")
        return result

    def enhance_text_quality(self, text: str) -> str:
        """
        Enhance OCR text quality by fixing common errors.

        Corrections applied:
        - Runs of the digits 0/1/5/8 with a letter directly on both sides are
          read as the letters O/I-or-l/S/B ("w0rd" -> "word"). Lowercase is
          used when both neighbours are lowercase, uppercase otherwise.
          Standalone numbers ("2018", "555 0123") are left untouched.
        - Whitespace before , ! ? and before a '.' that does not start a
          decimal is removed.
        - Whitespace runs are collapsed to single spaces.

        Multi-letter confusions such as "rn"/"m" or "cl"/"d" are deliberately
        not rewritten: without a dictionary they cannot be told apart from
        valid spellings ("corner", "clock"). This is a heuristic and may
        still alter mixed alphanumeric codes such as "A1B".

        Args:
            text: Raw OCR text

        Returns:
            Enhanced text (falsy input is returned unchanged)
        """
        if not text:
            return text

        digit_table = self._CONFUSABLE_DIGITS

        def _digits_to_letters(match):
            # The lookarounds guarantee a letter exists on each side.
            source = match.string
            use_lower = source[match.start() - 1].islower() and source[match.end()].islower()
            column = 1 if use_lower else 0
            return ''.join(digit_table[digit][column] for digit in match.group())

        enhanced = self._EMBEDDED_DIGITS_RE.sub(_digits_to_letters, text)
        enhanced = self._SPACE_BEFORE_PUNCT_RE.sub(r'\1', enhanced)

        # Clean up extra spaces
        return ' '.join(enhanced.split())
def get_enhanced_ocr_tools() -> List[EnhancedOCREngine]:
    """
    Build the list of enhanced OCR tools.

    Returns:
        A one-element list holding a ready EnhancedOCREngine, or an empty
        list when the engine reports itself unavailable or cannot be created.
    """
    tools = []
    try:
        engine = EnhancedOCREngine()
        if not engine.available:
            logger.warning("⚠️ Enhanced OCR engine not available")
        else:
            tools.append(engine)
    except Exception as e:
        logger.error(f"❌ Failed to create enhanced OCR engine: {e}")
        return []
    return tools