"""
Linguistic Analysis Tool for GAIA Agent - Phase 6

Advanced text pattern recognition, semantic understanding, and linguistic
analysis.
"""

import re
import logging
from typing import Dict, Any, List, Optional, Tuple, Set
from collections import Counter
import string

# Natural language processing
try:
    from textblob import TextBlob
    TEXTBLOB_AVAILABLE = True
except ImportError:
    TEXTBLOB_AVAILABLE = False

# Advanced regex patterns
try:
    import regex
    REGEX_AVAILABLE = True
except ImportError:
    import re as regex
    REGEX_AVAILABLE = False

logger = logging.getLogger(__name__)


class LinguisticAnalyzer:
    """
    Advanced linguistic analysis tool for text pattern recognition and understanding.

    Features:
    - Text pattern recognition and analysis
    - Language detection and classification
    - Semantic understanding and interpretation
    - Text transformation and manipulation
    - Grammar and syntax analysis
    - Context-aware text processing
    """

    def __init__(self):
        """Initialize the linguistic analyzer and its lookup tables."""
        self.name = "linguistic_analyzer"
        self.description = "Advanced linguistic analysis for pattern recognition and semantic understanding"

        # Initialize text processing capabilities
        self.available = True

        # Common text patterns (regex source strings, keyed by pattern type)
        self.patterns = {
            # The TLD class is [A-Za-z]; a '|' inside a character class is a
            # literal pipe, not alternation, so it must not appear there.
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            'url': r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            'phone': r'(\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})',
            'date': r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',
            'time': r'\b\d{1,2}:\d{2}(?::\d{2})?(?:\s?[AaPp][Mm])?\b',
            'number': r'-?\d+(?:\.\d+)?',
            'currency': r'\$\d+(?:\.\d{2})?|\d+(?:\.\d{2})?\s?(?:USD|EUR|GBP|JPY)',
            'percentage': r'\d+(?:\.\d+)?%',
            'hashtag': r'#\w+',
            'mention': r'@\w+',
            'word': r'\b\w+\b',
            'sentence': r'[.!?]+',
            'question': r'\?',
            'exclamation': r'!',
        }

        # Language-specific patterns
        self.language_patterns = {
            'english': {
                'articles': r'\b(the|a|an)\b',
                'pronouns': r'\b(i|you|he|she|it|we|they|me|him|her|us|them)\b',
                'prepositions': r'\b(in|on|at|by|for|with|to|from|of|about)\b',
                'conjunctions': r'\b(and|or|but|so|yet|for|nor)\b',
                'common_words': r'\b(is|are|was|were|have|has|had|do|does|did|will|would|could|should)\b'
            },
            'reversed_english': {
                'reversed_articles': r'\b(eht|a|na)\b',
                'reversed_common': r'\b(si|era|saw|erew|evah|sah|dah|od|seod|did|lliw|dluow|dluoc|dluohs)\b'
            }
        }

        # Semantic categories
        self.semantic_categories = {
            'direction': ['left', 'right', 'up', 'down', 'north', 'south', 'east', 'west'],
            'color': ['red', 'blue', 'green', 'yellow', 'black', 'white', 'purple', 'orange'],
            'size': ['big', 'small', 'large', 'tiny', 'huge', 'massive', 'little', 'giant'],
            'emotion': ['happy', 'sad', 'angry', 'excited', 'calm', 'nervous', 'joyful', 'depressed'],
            'time': ['morning', 'afternoon', 'evening', 'night', 'today', 'tomorrow', 'yesterday'],
            'number': ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten']
        }

        # Opposite word pairs
        self.opposites = {
            'left': 'right', 'right': 'left',
            'up': 'down', 'down': 'up',
            'big': 'small', 'small': 'big',
            'large': 'small', 'tiny': 'huge',
            'hot': 'cold', 'cold': 'hot',
            'fast': 'slow', 'slow': 'fast',
            'good': 'bad', 'bad': 'good',
            'yes': 'no', 'no': 'yes',
            'true': 'false', 'false': 'true',
            'on': 'off', 'off': 'on',
            'in': 'out', 'out': 'in',
            'open': 'closed', 'closed': 'open',
            'start': 'end', 'end': 'start',
            'first': 'last', 'last': 'first'
        }

        logger.info("✅ Linguistic Analyzer initialized")

    def extract_patterns(self, text: str, pattern_types: Optional[List[str]] = None) -> Dict[str, List[str]]:
        """
        Extract various patterns from text.

        Args:
            text: Input text to analyze
            pattern_types: List of pattern types to extract (default: all).
                Names that are not keys of ``self.patterns`` are skipped.

        Returns:
            Dictionary mapping each requested, known pattern type to the list
            of matched substrings (empty dict for empty input).
        """
        if not text:
            return {}

        if pattern_types is None:
            pattern_types = list(self.patterns)

        results: Dict[str, List[str]] = {}
        for pattern_type in pattern_types:
            pattern = self.patterns.get(pattern_type)
            if pattern is None:
                continue
            # finditer + group(0) always yields the full matched text.
            # re.findall would return tuples of groups for patterns that
            # contain capture groups (e.g. 'phone'), breaking List[str].
            results[pattern_type] = [
                match.group(0)
                for match in re.finditer(pattern, text, re.IGNORECASE)
            ]

        return results

    def analyze_text_structure(self, text: str) -> Dict[str, Any]:
        """
        Analyze the structural properties of text.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary with character/word/sentence/paragraph/line counts,
            average word and sentence lengths, and per-character-class counts
            (empty dict for empty input).
        """
        if not text:
            return {}

        words = text.split()

        # Basic metrics
        analysis = {
            'character_count': len(text),
            'word_count': len(words),
            # Counts runs of terminal punctuation, not text segments.
            'sentence_count': len(re.findall(r'[.!?]+', text)),
            'paragraph_count': len([p for p in text.split('\n\n') if p.strip()]),
            'line_count': len(text.split('\n')),
            'average_word_length': 0,
            'average_sentence_length': 0,
            'punctuation_count': 0,
            'uppercase_count': 0,
            'lowercase_count': 0,
            'digit_count': 0
        }

        # Calculate averages
        if words:
            analysis['average_word_length'] = sum(len(word) for word in words) / len(words)

        sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
        if sentences:
            analysis['average_sentence_length'] = (
                sum(len(s.split()) for s in sentences) / len(sentences)
            )

        # Character type counts
        for char in text:
            if char in string.punctuation:
                analysis['punctuation_count'] += 1
            elif char.isupper():
                analysis['uppercase_count'] += 1
            elif char.islower():
                analysis['lowercase_count'] += 1
            elif char.isdigit():
                analysis['digit_count'] += 1

        return analysis

    def detect_language_features(self, text: str) -> Dict[str, Any]:
        """
        Detect language-specific features in text.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary keyed by language name, then by feature type, each
            holding the match count and up to the first 10 matches
            (empty dict for empty input).
        """
        if not text:
            return {}

        text_lower = text.lower()
        features = {}

        for language, patterns in self.language_patterns.items():
            lang_features = {}
            for feature_type, pattern in patterns.items():
                matches = re.findall(pattern, text_lower)
                lang_features[feature_type] = {
                    'count': len(matches),
                    'matches': matches[:10]  # Limit to first 10 matches
                }
            features[language] = lang_features

        return features

    def analyze_semantic_content(self, text: str) -> Dict[str, Any]:
        """
        Analyze semantic content and categorize words.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary with total/unique word counts, the 20 most frequent
            words, the semantic categories found, and the opposite pairs whose
            two members both occur in the text. Lists are in order of first
            appearance in the text (empty dict for empty input).
        """
        if not text:
            return {}

        words = re.findall(r'\b\w+\b', text.lower())
        word_set = set(words)

        semantic_analysis = {
            'total_words': len(words),
            'unique_words': len(word_set),
            'word_frequency': dict(Counter(words).most_common(20)),
            'semantic_categories': {},
            'detected_opposites': []
        }

        # Categorize words by semantic meaning
        for category, category_words in self.semantic_categories.items():
            vocabulary = set(category_words)
            found_words = [word for word in words if word in vocabulary]
            if found_words:
                semantic_analysis['semantic_categories'][category] = {
                    'count': len(found_words),
                    # dict.fromkeys de-duplicates while keeping first-seen
                    # order, so the output is stable between runs.
                    'words': list(dict.fromkeys(found_words))
                }

        # Find opposite word pairs (iterate unique words in first-seen order)
        for word in dict.fromkeys(words):
            opposite = self.opposites.get(word)
            if opposite is not None and opposite in word_set:
                semantic_analysis['detected_opposites'].append({
                    'word': word,
                    'opposite': opposite,
                    'both_present': True
                })

        return semantic_analysis

    def find_text_transformations(self, text: str) -> Dict[str, Any]:
        """
        Identify possible text transformations (reversals, rotations, etc.).

        The character-reversed and word-order-reversed variants are scored
        with ``_calculate_english_score``; a variant is reported when it
        scores sufficiently higher than the original text.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary with the original text, its simple variants, and a
            ``transformations_detected`` list of
            ``{'type', 'confidence', 'transformed_text'}`` entries
            (empty dict for empty input).
        """
        if not text:
            return {}

        reversed_text = text[::-1]
        word_reversed = ' '.join(reversed(text.split()))

        transformations = {
            'original': text,
            'reversed': reversed_text,
            'word_reversed': word_reversed,
            'case_swapped': text.swapcase(),
            'transformations_detected': []
        }

        original_score = self._calculate_english_score(text)
        # The score can be -1.0 (penalty with no positive evidence). Clamp the
        # baseline at zero: otherwise "x > original * factor" is satisfied by
        # equally bad candidates and "original + 1" becomes a zero divisor.
        baseline = max(original_score, 0.0)

        # (type, candidate text, required improvement factor)
        candidates = (
            ('character_reversal', reversed_text, 1.5),  # Significant improvement
            ('word_order_reversal', word_reversed, 1.2),
        )
        for transformation_type, candidate, factor in candidates:
            candidate_score = self._calculate_english_score(candidate)
            if candidate_score > baseline * factor:
                transformations['transformations_detected'].append({
                    'type': transformation_type,
                    'confidence': candidate_score / (baseline + 1),
                    'transformed_text': candidate
                })

        return transformations

    def _calculate_english_score(self, text: str) -> float:
        """
        Calculate how English-like a text appears.

        Adds 1 for each listed common word found as a substring of the
        lower-cased text, 2 for an article followed by a word, 2 for a
        "<word> is/are/was/were <word>" construct, and subtracts 1 if any
        character outside word characters, whitespace and basic punctuation
        is present. Returns 0.0 for empty text.
        """
        if not text:
            return 0.0

        text_lower = text.lower()

        # Common English words
        common_words = (
            'the', 'and', 'or', 'if', 'you', 'understand', 'this', 'sentence',
            'write', 'opposite', 'of', 'word', 'as', 'answer', 'is', 'are',
            'was', 'were', 'have', 'has', 'had', 'do', 'does', 'did'
        )

        # Count common English words (substring containment)
        score = float(sum(1 for word in common_words if word in text_lower))

        # Check for English-like patterns
        if re.search(r'\b(the|a|an)\s+\w+', text_lower):
            score += 2.0
        if re.search(r'\w+\s+(is|are|was|were)\s+\w+', text_lower):
            score += 2.0

        # Penalize non-English character patterns
        if re.search(r'[^\w\s\.,!?;:\'"()-]', text):
            score -= 1.0

        return score

    def extract_answer_from_question(self, question: str) -> Dict[str, Any]:
        """
        Extract answer from a question using linguistic analysis.

        Handles "opposite of ..." questions directly. If the question looks
        like transformed (e.g. reversed) text, it re-analyzes the best
        transformed variant recursively. A variant is only followed when its
        English score is strictly higher than the current text's non-negative
        score, so the recursion cannot cycle.

        Args:
            question: Question text to analyze

        Returns:
            Dictionary with the question, answer (empty string if none),
            confidence, method, and the intermediate analysis.
        """
        result = {
            'question': question,
            'answer': '',
            'confidence': 0.0,
            'method': 'linguistic_analysis',
            'analysis': {}
        }

        if not question:
            return result

        # Analyze transformations
        transformations = self.find_text_transformations(question)
        result['analysis']['transformations'] = transformations

        # Check for specific patterns
        if 'opposite' in question.lower():
            # Look for opposite word questions
            opposite_analysis = self._analyze_opposite_question(question)
            result['analysis']['opposite_analysis'] = opposite_analysis
            if opposite_analysis['answer']:
                result['answer'] = opposite_analysis['answer']
                result['confidence'] = opposite_analysis['confidence']
                result['method'] = 'opposite_detection'

        # Check for reversed text patterns
        detected = transformations['transformations_detected']
        if detected:
            best_transformation = max(detected, key=lambda item: item['confidence'])
            if best_transformation['confidence'] > 0.7:
                # Re-analyze the transformed text
                transformed_result = self.extract_answer_from_question(
                    best_transformation['transformed_text']
                )
                if transformed_result['answer']:
                    result['answer'] = transformed_result['answer']
                    result['confidence'] = best_transformation['confidence']
                    result['method'] = f"transformation_{best_transformation['type']}"

        return result

    def _analyze_opposite_question(self, question: str) -> Dict[str, Any]:
        """
        Analyze questions asking for opposite words.

        The word named right after "opposite of [the] [word]" is tried first,
        so prepositions such as "in"/"on" earlier in the question do not
        hijack the answer. If that yields nothing, the first word in the
        question that has a known opposite is used.

        Returns:
            Dictionary with 'answer', 'confidence', 'target_word' and
            'opposite_found'.
        """
        result = {
            'answer': '',
            'confidence': 0.0,
            'target_word': '',
            'opposite_found': False
        }

        question_lower = question.lower()

        # Words explicitly named as the target; one optional non-word,
        # non-space character (e.g. an opening quote) may precede the word.
        explicit_targets = re.findall(
            r'\bopposite\s+of\s+(?:the\s+)?(?:word\s+)?[^\w\s]?(\w+)',
            question_lower
        )
        # Fallback: every word of the question, in order.
        words = re.findall(r'\b\w+\b', question_lower)

        for word in explicit_targets + words:
            if word in self.opposites:
                result['target_word'] = word
                result['answer'] = self.opposites[word]
                result['opposite_found'] = True
                result['confidence'] = 0.9
                break

        return result

    def process_complex_text_query(self, query: str, context: str = '') -> Dict[str, Any]:
        """
        Process complex text queries with comprehensive analysis.

        Args:
            query: Text query to process
            context: Additional context (echoed back in the result; not
                otherwise used by the analysis)

        Returns:
            Dictionary with structural, semantic, pattern and transformation
            analyses, the answer extraction, and the final answer with its
            confidence. On failure an 'error' key holds the exception message.
        """
        result = {
            'query': query,
            'context': context,
            'structural_analysis': {},
            'semantic_analysis': {},
            'pattern_analysis': {},
            'transformation_analysis': {},
            'answer_extraction': {},
            'final_answer': '',
            'confidence': 0.0
        }

        if not query:
            return result

        try:
            # Perform comprehensive analysis
            result['structural_analysis'] = self.analyze_text_structure(query)
            result['semantic_analysis'] = self.analyze_semantic_content(query)
            result['pattern_analysis'] = self.extract_patterns(query)
            result['transformation_analysis'] = self.find_text_transformations(query)
            result['answer_extraction'] = self.extract_answer_from_question(query)

            # Determine final answer
            extraction = result['answer_extraction']
            if extraction['answer']:
                result['final_answer'] = extraction['answer']
                result['confidence'] = extraction['confidence']

        except Exception as e:
            # Best-effort boundary: report the failure in the result instead
            # of propagating it to the caller.
            logger.error("Complex text query processing failed: %s", e)
            result['error'] = str(e)

        return result


def get_linguistic_analysis_tools() -> List[LinguisticAnalyzer]:
    """Get list of linguistic analysis tools (empty if unavailable or construction fails)."""
    try:
        analyzer = LinguisticAnalyzer()
        if analyzer.available:
            return [analyzer]
        logger.warning("⚠️ Linguistic analyzer not available")
        return []
    except Exception as e:
        logger.error("❌ Failed to create linguistic analyzer: %s", e)
        return []