# Source: Hugging Face Space "JoachimVC/gaia-enhanced-agent" (status: Running)
# File size: 17,344 bytes; revision 9a6a4dc

"""
Linguistic Analysis Tool for GAIA Agent - Phase 6
Advanced text pattern recognition, semantic understanding, and linguistic analysis
"""

import re
import logging
from typing import Dict, Any, List, Optional, Tuple, Set
from collections import Counter
import string

# Natural language processing
try:
    from textblob import TextBlob
    TEXTBLOB_AVAILABLE = True
except ImportError:
    TEXTBLOB_AVAILABLE = False

# Advanced regex patterns
try:
    import regex
    REGEX_AVAILABLE = True
except ImportError:
    import re as regex
    REGEX_AVAILABLE = False

logger = logging.getLogger(__name__)


class LinguisticAnalyzer:
    """
    Rule-based linguistic analysis tool for text pattern recognition.

    All analysis is driven by the regular expressions and word lists that
    ``__init__`` stores on the instance; no external NLP model is consulted
    by the methods of this class.

    Features:
    - Pattern extraction (emails, URLs, phone numbers, dates, ...)
    - Structural metrics (word/sentence/paragraph counts, averages)
    - Function-word counts for English and character-reversed English
    - Semantic word categories and opposite-pair detection
    - Detection of character-reversed or word-order-reversed text
    - Answer extraction for "opposite of <word>" style questions
    """

    # Words whose presence (as whole tokens) makes a text look English.
    _COMMON_ENGLISH_WORDS = frozenset({
        'the', 'and', 'or', 'if', 'you', 'understand', 'this', 'sentence',
        'write', 'opposite', 'of', 'word', 'as', 'answer', 'is', 'are',
        'was', 'were', 'have', 'has', 'had', 'do', 'does', 'did',
    })

    # A reversed candidate must beat the original score by these factors.
    _CHAR_REVERSAL_FACTOR = 1.5
    _WORD_REVERSAL_FACTOR = 1.2

    # Minimum transformation confidence before the transformed text is re-analyzed.
    _MIN_TRANSFORMATION_CONFIDENCE = 0.7

    # Captures the word named right after "opposite of [the] [word]",
    # optionally wrapped in a straight or curly opening quote.
    _OPPOSITE_TARGET_RE = re.compile(
        r'opposite\s+of\s+(?:the\s+)?(?:word\s+)?["\'\u201c\u2018]?(\w+)'
    )

    def __init__(self):
        """Initialize the linguistic analyzer and its pattern/word tables."""
        self.name = "linguistic_analyzer"
        self.description = "Advanced linguistic analysis for pattern recognition and semantic understanding"

        # Initialize text processing capabilities
        self.available = True

        # Common text patterns (matched case-insensitively by extract_patterns)
        self.patterns = {
            # TLD class is [A-Za-z]; the former [A-Z|a-z] also accepted a literal '|'.
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            'url': r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            'phone': r'(\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})',
            'date': r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',
            'time': r'\b\d{1,2}:\d{2}(?::\d{2})?(?:\s?[AaPp][Mm])?\b',
            'number': r'-?\d+(?:\.\d+)?',
            'currency': r'\$\d+(?:\.\d{2})?|\d+(?:\.\d{2})?\s?(?:USD|EUR|GBP|JPY)',
            'percentage': r'\d+(?:\.\d+)?%',
            'hashtag': r'#\w+',
            'mention': r'@\w+',
            'word': r'\b\w+\b',
            'sentence': r'[.!?]+',
            'question': r'\?',
            'exclamation': r'!',
        }

        # Language-specific patterns (applied to lower-cased text)
        self.language_patterns = {
            'english': {
                'articles': r'\b(the|a|an)\b',
                'pronouns': r'\b(i|you|he|she|it|we|they|me|him|her|us|them)\b',
                'prepositions': r'\b(in|on|at|by|for|with|to|from|of|about)\b',
                'conjunctions': r'\b(and|or|but|so|yet|for|nor)\b',
                'common_words': r'\b(is|are|was|were|have|has|had|do|does|did|will|would|could|should)\b'
            },
            'reversed_english': {
                'reversed_articles': r'\b(eht|a|na)\b',
                'reversed_common': r'\b(si|era|saw|erew|evah|sah|dah|od|seod|did|lliw|dluow|dluoc|dluohs)\b'
            }
        }

        # Semantic categories
        self.semantic_categories = {
            'direction': ['left', 'right', 'up', 'down', 'north', 'south', 'east', 'west'],
            'color': ['red', 'blue', 'green', 'yellow', 'black', 'white', 'purple', 'orange'],
            'size': ['big', 'small', 'large', 'tiny', 'huge', 'massive', 'little', 'giant'],
            'emotion': ['happy', 'sad', 'angry', 'excited', 'calm', 'nervous', 'joyful', 'depressed'],
            'time': ['morning', 'afternoon', 'evening', 'night', 'today', 'tomorrow', 'yesterday'],
            'number': ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten']
        }

        # Opposite word pairs (word -> its opposite)
        self.opposites = {
            'left': 'right', 'right': 'left',
            'up': 'down', 'down': 'up',
            'big': 'small', 'small': 'big',
            'large': 'small', 'tiny': 'huge',
            'hot': 'cold', 'cold': 'hot',
            'fast': 'slow', 'slow': 'fast',
            'good': 'bad', 'bad': 'good',
            'yes': 'no', 'no': 'yes',
            'true': 'false', 'false': 'true',
            'on': 'off', 'off': 'on',
            'in': 'out', 'out': 'in',
            'open': 'closed', 'closed': 'open',
            'start': 'end', 'end': 'start',
            'first': 'last', 'last': 'first'
        }

        logger.info("✅ Linguistic Analyzer initialized")

    def extract_patterns(self, text: str, pattern_types: Optional[List[str]] = None) -> Dict[str, List[str]]:
        """
        Extract various patterns from text.

        Args:
            text: Input text to analyze
            pattern_types: Names of patterns in ``self.patterns`` to extract
                (default: all). Unknown names are ignored.

        Returns:
            Dictionary mapping each requested, known pattern name to the list
            of full matched substrings (empty dict for empty text)
        """
        if not text:
            return {}

        if pattern_types is None:
            pattern_types = list(self.patterns)

        results = {}
        for pattern_type in pattern_types:
            if pattern_type not in self.patterns:
                continue
            # finditer + group(0) yields the whole match; re.findall would
            # return tuples of capture groups for multi-group patterns such
            # as 'phone'.
            results[pattern_type] = [
                match.group(0)
                for match in re.finditer(self.patterns[pattern_type], text, re.IGNORECASE)
            ]

        return results

    def analyze_text_structure(self, text: str) -> Dict[str, Any]:
        """
        Analyze the structural properties of text.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary with character/word/sentence/paragraph/line counts,
            average word and sentence lengths, and counts of ASCII
            punctuation, uppercase, lowercase and digit characters
            (empty dict for empty text)
        """
        if not text:
            return {}

        words = text.split()
        # Sentences are the non-empty fragments between runs of . ! ?
        # Counting fragments (not terminators) keeps sentence_count consistent
        # with average_sentence_length, e.g. for text lacking a final period.
        sentences = [
            fragment.strip()
            for fragment in re.split(r'[.!?]+', text)
            if fragment.strip()
        ]
        punctuation = set(string.punctuation)

        analysis = {
            'character_count': len(text),
            'word_count': len(words),
            'sentence_count': len(sentences),
            'paragraph_count': len([p for p in text.split('\n\n') if p.strip()]),
            'line_count': len(text.split('\n')),
            'average_word_length': 0,
            'average_sentence_length': 0,
            'punctuation_count': sum(1 for char in text if char in punctuation),
            'uppercase_count': sum(1 for char in text if char.isupper()),
            'lowercase_count': sum(1 for char in text if char.islower()),
            'digit_count': sum(1 for char in text if char.isdigit())
        }

        if words:
            analysis['average_word_length'] = sum(len(word) for word in words) / len(words)

        if sentences:
            analysis['average_sentence_length'] = (
                sum(len(sentence.split()) for sentence in sentences) / len(sentences)
            )

        return analysis

    def detect_language_features(self, text: str) -> Dict[str, Any]:
        """
        Detect language-specific features in text.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary keyed by language name from ``self.language_patterns``;
            each value maps a feature name to its match ``count`` and the
            first 10 ``matches`` found in the lower-cased text
            (empty dict for empty text)
        """
        if not text:
            return {}

        text_lower = text.lower()
        features = {}

        for language, patterns in self.language_patterns.items():
            lang_features = {}
            for feature_type, pattern in patterns.items():
                matches = re.findall(pattern, text_lower)
                lang_features[feature_type] = {
                    'count': len(matches),
                    'matches': matches[:10]  # Limit to first 10 matches
                }
            features[language] = lang_features

        return features

    def analyze_semantic_content(self, text: str) -> Dict[str, Any]:
        """
        Analyze semantic content and categorize words.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary with total/unique word counts, the 20 most frequent
            words, per-category hits and detected opposite pairs. Word lists
            are reported in order of first appearance in the text
            (empty dict for empty text)
        """
        if not text:
            return {}

        words = re.findall(r'\b\w+\b', text.lower())
        vocabulary = set(words)

        semantic_analysis = {
            'total_words': len(words),
            'unique_words': len(vocabulary),
            'word_frequency': dict(Counter(words).most_common(20)),
            'semantic_categories': {},
            'detected_opposites': []
        }

        # Categorize words by semantic meaning
        for category, category_words in self.semantic_categories.items():
            members = set(category_words)
            found_words = [word for word in words if word in members]
            if found_words:
                semantic_analysis['semantic_categories'][category] = {
                    'count': len(found_words),
                    # dict.fromkeys de-duplicates while keeping text order,
                    # so the output is stable from run to run.
                    'words': list(dict.fromkeys(found_words))
                }

        # Find opposite word pairs; each direction of a pair is reported.
        for word in dict.fromkeys(words):
            opposite = self.opposites.get(word)
            if opposite is not None and opposite in vocabulary:
                semantic_analysis['detected_opposites'].append({
                    'word': word,
                    'opposite': opposite,
                    'both_present': True
                })

        return semantic_analysis

    def find_text_transformations(self, text: str) -> Dict[str, Any]:
        """
        Identify possible text transformations (character or word-order reversal).

        Args:
            text: Input text to analyze

        Returns:
            Dictionary with the original text, its character-reversed,
            word-reversed and case-swapped forms, and a
            ``transformations_detected`` list. Each detection carries a
            ``type``, the ``transformed_text`` and a ``confidence`` that is
            the ratio ``candidate_score / (baseline + 1)`` (not capped at 1)
            (empty dict for empty text)
        """
        if not text:
            return {}

        reversed_text = text[::-1]
        word_reversed = ' '.join(reversed(text.split()))

        transformations = {
            'original': text,
            'reversed': reversed_text,
            'word_reversed': word_reversed,
            'case_swapped': text.swapcase(),
            'transformations_detected': []
        }

        original_score = self._calculate_english_score(text)
        # The score can be -1.0 (symbol penalty, no common words). Clamping at
        # zero keeps the threshold comparison meaningful and prevents a
        # ZeroDivisionError in the confidence ratio below.
        baseline = max(original_score, 0.0)

        candidates = (
            ('character_reversal', reversed_text, self._CHAR_REVERSAL_FACTOR),
            ('word_order_reversal', word_reversed, self._WORD_REVERSAL_FACTOR),
        )
        for transformation_type, candidate, factor in candidates:
            candidate_score = self._calculate_english_score(candidate)
            if candidate_score > baseline * factor:  # Significant improvement
                transformations['transformations_detected'].append({
                    'type': transformation_type,
                    'confidence': candidate_score / (baseline + 1),
                    'transformed_text': candidate
                })

        return transformations

    def _calculate_english_score(self, text: str) -> float:
        """
        Calculate how English-like a text appears.

        Scoring: +1 per distinct common English word present as a whole
        token, +2 if an article is followed by a word, +2 if a form of
        "to be" sits between two words, -1 if the text contains any
        character outside word characters, whitespace and basic punctuation.

        Args:
            text: Text to score

        Returns:
            The heuristic score (0.0 for empty text; may be negative)
        """
        if not text:
            return 0.0

        text_lower = text.lower()

        # Whole-token matching: a substring test would count 'is' inside
        # 'this' or 'or' inside 'word'.
        tokens = set(re.findall(r'\b\w+\b', text_lower))
        score = float(len(tokens & self._COMMON_ENGLISH_WORDS))

        # Check for English-like patterns
        if re.search(r'\b(the|a|an)\s+\w+', text_lower):
            score += 2.0

        if re.search(r'\w+\s+(is|are|was|were)\s+\w+', text_lower):
            score += 2.0

        # Penalize non-English character patterns
        if re.search(r'[^\w\s\.,!?;:\'"()-]', text):
            score -= 1.0

        return score

    def extract_answer_from_question(self, question: str) -> Dict[str, Any]:
        """
        Extract answer from a question using linguistic analysis.

        The question is first checked for an "opposite" request; if a
        reversal transformation is detected with sufficient confidence, the
        transformed text is analyzed recursively and its answer (if any)
        takes precedence.

        Args:
            question: Question text to analyze

        Returns:
            Dictionary with the ``question``, the extracted ``answer``
            ('' if none), a ``confidence``, the ``method`` used and the
            intermediate ``analysis``
        """
        result = {
            'question': question,
            'answer': '',
            'confidence': 0.0,
            'method': 'linguistic_analysis',
            'analysis': {}
        }

        if not question:
            return result

        # Analyze transformations
        transformations = self.find_text_transformations(question)
        result['analysis']['transformations'] = transformations

        # Check for specific patterns
        if 'opposite' in question.lower():
            opposite_analysis = self._analyze_opposite_question(question)
            result['analysis']['opposite_analysis'] = opposite_analysis

            if opposite_analysis['answer']:
                result['answer'] = opposite_analysis['answer']
                result['confidence'] = opposite_analysis['confidence']
                result['method'] = 'opposite_detection'

        # Check for reversed text patterns
        detected = transformations['transformations_detected']
        if detected:
            best_transformation = max(detected, key=lambda item: item['confidence'])

            if best_transformation['confidence'] > self._MIN_TRANSFORMATION_CONFIDENCE:
                # Re-analyze the transformed text. A transformation is only
                # detected when it scores strictly higher than its source and
                # scores are bounded, so this recursion terminates.
                transformed_result = self.extract_answer_from_question(
                    best_transformation['transformed_text']
                )

                if transformed_result['answer']:
                    result['answer'] = transformed_result['answer']
                    result['confidence'] = best_transformation['confidence']
                    result['method'] = f"transformation_{best_transformation['type']}"

        return result

    def _analyze_opposite_question(self, question: str) -> Dict[str, Any]:
        """
        Analyze questions asking for opposite words.

        The word named directly after "opposite of [the] [word]" is tried
        first; if it has no known opposite, the first word anywhere in the
        question that appears in ``self.opposites`` is used instead.

        Args:
            question: Question text to analyze

        Returns:
            Dictionary with the ``answer`` ('' if none), ``confidence``,
            the ``target_word`` and an ``opposite_found`` flag
        """
        result = {
            'answer': '',
            'confidence': 0.0,
            'target_word': '',
            'opposite_found': False
        }

        question_lower = question.lower()

        # Explicitly named targets take priority over incidental words such
        # as 'in' or 'on' that also happen to have opposites.
        explicit_targets = self._OPPOSITE_TARGET_RE.findall(question_lower)
        all_words = re.findall(r'\b\w+\b', question_lower)

        for word in explicit_targets + all_words:
            if word in self.opposites:
                result['target_word'] = word
                result['answer'] = self.opposites[word]
                result['opposite_found'] = True
                result['confidence'] = 0.9
                break

        return result

    def process_complex_text_query(self, query: str, context: str = '') -> Dict[str, Any]:
        """
        Process complex text queries with comprehensive analysis.

        Runs the structural, semantic, pattern, transformation and
        answer-extraction analyses on ``query``. Any exception raised by an
        analysis is logged and reported under an ``error`` key instead of
        propagating.

        Args:
            query: Text query to process
            context: Additional context (echoed back in the result; not analyzed)

        Returns:
            Dictionary with comprehensive analysis results, the
            ``final_answer`` ('' if none) and its ``confidence``
        """
        result = {
            'query': query,
            'context': context,
            'structural_analysis': {},
            'semantic_analysis': {},
            'pattern_analysis': {},
            'transformation_analysis': {},
            'answer_extraction': {},
            'final_answer': '',
            'confidence': 0.0
        }

        if not query:
            return result

        try:
            # Perform comprehensive analysis
            result['structural_analysis'] = self.analyze_text_structure(query)
            result['semantic_analysis'] = self.analyze_semantic_content(query)
            result['pattern_analysis'] = self.extract_patterns(query)
            result['transformation_analysis'] = self.find_text_transformations(query)
            result['answer_extraction'] = self.extract_answer_from_question(query)

            # Determine final answer
            extraction = result['answer_extraction']
            if extraction['answer']:
                result['final_answer'] = extraction['answer']
                result['confidence'] = extraction['confidence']

        except Exception as e:
            # Best-effort boundary: partial results are still returned.
            logger.error(f"Complex text query processing failed: {e}")
            result['error'] = str(e)

        return result


def get_linguistic_analysis_tools() -> List[LinguisticAnalyzer]:
    """
    Build the list of linguistic analysis tools.

    Returns:
        A one-element list holding a fresh ``LinguisticAnalyzer`` when it
        reports itself available; otherwise an empty list. Construction
        failures are logged and also yield an empty list.
    """
    tools: List[LinguisticAnalyzer] = []
    try:
        candidate = LinguisticAnalyzer()
        if not candidate.available:
            logger.warning("⚠️ Linguistic analyzer not available")
        else:
            tools.append(candidate)
    except Exception as e:
        logger.error(f"❌ Failed to create linguistic analyzer: {e}")
        return []
    return tools