"""
Linguistic Analysis Tool for GAIA Agent - Phase 6
Advanced text pattern recognition, semantic understanding, and linguistic analysis
"""
import re
import logging
from typing import Dict, Any, List, Optional, Tuple, Set
from collections import Counter
import string
# Natural language processing (optional dependency)
try:
    from textblob import TextBlob
    TEXTBLOB_AVAILABLE = True
except ImportError:
    TEXTBLOB_AVAILABLE = False

# Advanced regex patterns; fall back to the standard library's `re` if absent
try:
    import regex
    REGEX_AVAILABLE = True
except ImportError:
    import re as regex
    REGEX_AVAILABLE = False
logger = logging.getLogger(__name__)

class LinguisticAnalyzer:
    """
    Advanced linguistic analysis tool for text pattern recognition and understanding.

    Features:
    - Text pattern recognition and analysis
    - Language detection and classification
    - Semantic understanding and interpretation
    - Text transformation and manipulation
    - Grammar and syntax analysis
    - Context-aware text processing
    """
    def __init__(self):
        """Initialize the linguistic analyzer."""
        self.name = "linguistic_analyzer"
        self.description = "Advanced linguistic analysis for pattern recognition and semantic understanding"

        # Initialize text processing capabilities
        self.available = True

        # Common text patterns
        self.patterns = {
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            'url': r'https?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            'phone': r'(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}',
            'date': r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',
            'time': r'\b\d{1,2}:\d{2}(?::\d{2})?(?:\s?[AaPp][Mm])?\b',
            'number': r'-?\d+(?:\.\d+)?',
            'currency': r'\$\d+(?:\.\d{2})?|\d+(?:\.\d{2})?\s?(?:USD|EUR|GBP|JPY)',
            'percentage': r'\d+(?:\.\d+)?%',
            'hashtag': r'#\w+',
            'mention': r'@\w+',
            'word': r'\b\w+\b',
            'sentence': r'[.!?]+',
            'question': r'\?',
            'exclamation': r'!',
        }

        # Language-specific patterns
        self.language_patterns = {
            'english': {
                'articles': r'\b(the|a|an)\b',
                'pronouns': r'\b(i|you|he|she|it|we|they|me|him|her|us|them)\b',
                'prepositions': r'\b(in|on|at|by|for|with|to|from|of|about)\b',
                'conjunctions': r'\b(and|or|but|so|yet|for|nor)\b',
                'common_words': r'\b(is|are|was|were|have|has|had|do|does|did|will|would|could|should)\b'
            },
            'reversed_english': {
                'reversed_articles': r'\b(eht|a|na)\b',
                'reversed_common': r'\b(si|era|saw|erew|evah|sah|dah|od|seod|did|lliw|dluow|dluoc|dluohs)\b'
            }
        }

        # Semantic categories
        self.semantic_categories = {
            'direction': ['left', 'right', 'up', 'down', 'north', 'south', 'east', 'west'],
            'color': ['red', 'blue', 'green', 'yellow', 'black', 'white', 'purple', 'orange'],
            'size': ['big', 'small', 'large', 'tiny', 'huge', 'massive', 'little', 'giant'],
            'emotion': ['happy', 'sad', 'angry', 'excited', 'calm', 'nervous', 'joyful', 'depressed'],
            'time': ['morning', 'afternoon', 'evening', 'night', 'today', 'tomorrow', 'yesterday'],
            'number': ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten']
        }

        # Opposite word pairs, stored in both directions for O(1) lookup
        self.opposites = {
            'left': 'right', 'right': 'left',
            'up': 'down', 'down': 'up',
            'big': 'small', 'small': 'big',
            'large': 'small', 'tiny': 'huge',
            'hot': 'cold', 'cold': 'hot',
            'fast': 'slow', 'slow': 'fast',
            'good': 'bad', 'bad': 'good',
            'yes': 'no', 'no': 'yes',
            'true': 'false', 'false': 'true',
            'on': 'off', 'off': 'on',
            'in': 'out', 'out': 'in',
            'open': 'closed', 'closed': 'open',
            'start': 'end', 'end': 'start',
            'first': 'last', 'last': 'first'
        }

        logger.info("✅ Linguistic Analyzer initialized")
    def extract_patterns(self, text: str, pattern_types: Optional[List[str]] = None) -> Dict[str, List[str]]:
        """
        Extract various patterns from text.

        Args:
            text: Input text to analyze
            pattern_types: List of pattern types to extract (default: all)

        Returns:
            Dictionary mapping each requested pattern type to its matches
        """
        if not text:
            return {}
        if pattern_types is None:
            pattern_types = list(self.patterns.keys())

        results = {}
        for pattern_type in pattern_types:
            if pattern_type in self.patterns:
                pattern = self.patterns[pattern_type]
                # finditer/group(0) yields the full matched text even for
                # patterns that contain groups (findall would return tuples)
                matches = [m.group(0) for m in re.finditer(pattern, text, re.IGNORECASE)]
                results[pattern_type] = matches
        return results
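
    # A minimal usage sketch (the sample string and expected output are
    # illustrative assumptions, not from the original module):
    #
    #     analyzer = LinguisticAnalyzer()
    #     analyzer.extract_patterns("Mail bob@example.com by 3:30 PM", ["email", "time"])
    #     # -> {'email': ['bob@example.com'], 'time': ['3:30 PM']}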

    def analyze_text_structure(self, text: str) -> Dict[str, Any]:
        """
        Analyze the structural properties of text.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary with structural analysis
        """
        if not text:
            return {}

        # Basic metrics
        analysis = {
            'character_count': len(text),
            'word_count': len(text.split()),
            'sentence_count': len(re.findall(r'[.!?]+', text)),
            'paragraph_count': len([p for p in text.split('\n\n') if p.strip()]),
            'line_count': len(text.split('\n')),
            'average_word_length': 0,
            'average_sentence_length': 0,
            'punctuation_count': 0,
            'uppercase_count': 0,
            'lowercase_count': 0,
            'digit_count': 0
        }

        # Calculate averages
        words = text.split()
        if words:
            analysis['average_word_length'] = sum(len(word) for word in words) / len(words)

        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]
        if sentences:
            analysis['average_sentence_length'] = sum(len(s.split()) for s in sentences) / len(sentences)

        # Character type counts
        for char in text:
            if char in string.punctuation:
                analysis['punctuation_count'] += 1
            elif char.isupper():
                analysis['uppercase_count'] += 1
            elif char.islower():
                analysis['lowercase_count'] += 1
            elif char.isdigit():
                analysis['digit_count'] += 1
        return analysis
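
    # Worked example (values follow from the simple whitespace/regex splits
    # above; the sample string is illustrative):
    #
    #     analyzer.analyze_text_structure("Hi there. All good?")
    #     # -> character_count=19, word_count=4, sentence_count=2,
    #     #    average_word_length=4.0 (trailing punctuation counts toward length)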

    def detect_language_features(self, text: str) -> Dict[str, Any]:
        """
        Detect language-specific features in text.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary with language feature analysis
        """
        if not text:
            return {}

        text_lower = text.lower()
        features = {}
        for language, patterns in self.language_patterns.items():
            lang_features = {}
            for feature_type, pattern in patterns.items():
                matches = re.findall(pattern, text_lower)
                lang_features[feature_type] = {
                    'count': len(matches),
                    'matches': matches[:10]  # Limit to first 10 matches
                }
            features[language] = lang_features
        return features
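
    # Sketch of how the reversed-English cues fire (the input is "the cat is
    # on the mat" with each word character-reversed; an illustrative example):
    #
    #     features = analyzer.detect_language_features("eht tac si no eht tam")
    #     # -> features['reversed_english']['reversed_articles']['count'] == 2  ('eht' twice)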

    def analyze_semantic_content(self, text: str) -> Dict[str, Any]:
        """
        Analyze semantic content and categorize words.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary with semantic analysis
        """
        if not text:
            return {}

        text_lower = text.lower()
        words = re.findall(r'\b\w+\b', text_lower)
        word_set = set(words)

        semantic_analysis = {
            'total_words': len(words),
            'unique_words': len(word_set),
            'word_frequency': dict(Counter(words).most_common(20)),
            'semantic_categories': {},
            'detected_opposites': []
        }

        # Categorize words by semantic meaning
        for category, category_words in self.semantic_categories.items():
            found_words = [word for word in words if word in category_words]
            if found_words:
                semantic_analysis['semantic_categories'][category] = {
                    'count': len(found_words),
                    'words': list(set(found_words))
                }

        # Find opposite word pairs; report each pair once rather than once
        # per direction ('left'/'right' and 'right'/'left')
        reported = set()
        for word in word_set:
            opposite = self.opposites.get(word)
            if opposite in word_set and (opposite, word) not in reported:
                reported.add((word, opposite))
                semantic_analysis['detected_opposites'].append({
                    'word': word,
                    'opposite': opposite,
                    'both_present': True
                })
        return semantic_analysis
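
    # Illustrative call (category and opposite hits depend on the small
    # built-in word lists above):
    #
    #     analyzer.analyze_semantic_content("Turn left, not right, at the big red door")
    #     # -> semantic_categories includes 'direction' (left, right), 'size' (big),
    #     #    'color' (red); detected_opposites reports the left/right pair once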

    def find_text_transformations(self, text: str) -> Dict[str, Any]:
        """
        Identify possible text transformations (reversals, rotations, etc.).

        Args:
            text: Input text to analyze

        Returns:
            Dictionary with transformation analysis
        """
        if not text:
            return {}

        transformations = {
            'original': text,
            'reversed': text[::-1],
            'word_reversed': ' '.join(reversed(text.split())),
            'case_swapped': text.swapcase(),
            'transformations_detected': []
        }

        # Check if reversed text makes more sense
        reversed_text = text[::-1]

        # Analyze both versions for English-like patterns
        original_score = self._calculate_english_score(text)
        reversed_score = self._calculate_english_score(reversed_text)
        if reversed_score > original_score * 1.5:  # Significant improvement
            transformations['transformations_detected'].append({
                'type': 'character_reversal',
                'confidence': reversed_score / (original_score + 1),
                'transformed_text': reversed_text
            })

        # Check word order reversal
        word_reversed = ' '.join(reversed(text.split()))
        word_reversed_score = self._calculate_english_score(word_reversed)
        if word_reversed_score > original_score * 1.2:
            transformations['transformations_detected'].append({
                'type': 'word_order_reversal',
                'confidence': word_reversed_score / (original_score + 1),
                'transformed_text': word_reversed
            })
        return transformations
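
    # Illustrative example: a character-reversed sentence scores much higher
    # once flipped back, so a 'character_reversal' entry is emitted:
    #
    #     analyzer.find_text_transformations(".tfel drow eht fo etisoppo eht etirw")
    #     # -> transformations_detected contains a dict with
    #     #    type='character_reversal' and
    #     #    transformed_text='write the opposite of the word left.'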

    def _calculate_english_score(self, text: str) -> float:
        """Calculate how English-like a text appears."""
        if not text:
            return 0.0

        text_lower = text.lower()
        score = 0.0

        # Common English words
        common_words = [
            'the', 'and', 'or', 'if', 'you', 'understand', 'this', 'sentence',
            'write', 'opposite', 'of', 'word', 'as', 'answer', 'is', 'are',
            'was', 'were', 'have', 'has', 'had', 'do', 'does', 'did'
        ]

        # Count common English words as whole tokens (a plain substring test
        # would credit 'the' inside 'other' or 'as' inside 'last')
        tokens = set(re.findall(r'\b\w+\b', text_lower))
        for word in common_words:
            if word in tokens:
                score += 1.0

        # Check for English-like patterns
        if re.search(r'\b(the|a|an)\s+\w+', text_lower):
            score += 2.0
        if re.search(r'\w+\s+(is|are|was|were)\s+\w+', text_lower):
            score += 2.0

        # Penalize non-English character patterns
        if re.search(r'[^\w\s\.,!?;:\'"()-]', text):
            score -= 1.0
        return score
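
    # Worked example of the heuristic (token hits plus pattern bonuses):
    #
    #     analyzer._calculate_english_score("the answer is left")
    #     # tokens 'the', 'answer', 'is' -> +3.0; "the answer" matches the
    #     # article pattern -> +2.0; "answer is left" matches the verb
    #     # pattern -> +2.0; total 7.0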

    def extract_answer_from_question(self, question: str) -> Dict[str, Any]:
        """
        Extract an answer from a question using linguistic analysis.

        Args:
            question: Question text to analyze

        Returns:
            Dictionary with answer extraction results
        """
        result = {
            'question': question,
            'answer': '',
            'confidence': 0.0,
            'method': 'linguistic_analysis',
            'analysis': {}
        }
        if not question:
            return result

        # Analyze transformations
        transformations = self.find_text_transformations(question)
        result['analysis']['transformations'] = transformations

        # Check for specific patterns
        if 'opposite' in question.lower():
            # Look for opposite-word questions
            opposite_analysis = self._analyze_opposite_question(question)
            result['analysis']['opposite_analysis'] = opposite_analysis
            if opposite_analysis['answer']:
                result['answer'] = opposite_analysis['answer']
                result['confidence'] = opposite_analysis['confidence']
                result['method'] = 'opposite_detection'

        # Check for reversed text patterns
        if transformations['transformations_detected']:
            best_transformation = max(
                transformations['transformations_detected'],
                key=lambda x: x['confidence']
            )
            if best_transformation['confidence'] > 0.7:
                # Re-analyze the transformed text; the recursion terminates
                # because transforming back no longer improves the score
                transformed_result = self.extract_answer_from_question(
                    best_transformation['transformed_text']
                )
                if transformed_result['answer']:
                    result['answer'] = transformed_result['answer']
                    result['confidence'] = best_transformation['confidence']
                    result['method'] = f"transformation_{best_transformation['type']}"
        return result
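
    # End-to-end sketch on a reversed prompt of the kind this module targets
    # (the prompt is illustrative; confidence values depend on the scoring
    # heuristic and are not normalized to [0, 1]):
    #
    #     q = ".rewsna eht sa tfel drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI"
    #     analyzer.extract_answer_from_question(q)
    #     # -> detects character_reversal, recurses on the un-reversed text,
    #     #    finds 'left' via the opposites table, and answers 'right'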

    def _analyze_opposite_question(self, question: str) -> Dict[str, Any]:
        """Analyze questions asking for opposite words."""
        result = {
            'answer': '',
            'confidence': 0.0,
            'target_word': '',
            'opposite_found': False
        }
        question_lower = question.lower()

        # Look for words that have opposites
        words = re.findall(r'\b\w+\b', question_lower)
        for word in words:
            if word in self.opposites:
                result['target_word'] = word
                result['answer'] = self.opposites[word]
                result['opposite_found'] = True
                result['confidence'] = 0.9
                break
        return result
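
    # For instance (the first word found in the opposites table wins):
    #
    #     analyzer._analyze_opposite_question("What is the opposite of up?")
    #     # -> {'answer': 'down', 'target_word': 'up', 'opposite_found': True,
    #     #     'confidence': 0.9}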

    def process_complex_text_query(self, query: str, context: str = '') -> Dict[str, Any]:
        """
        Process complex text queries with comprehensive analysis.

        Args:
            query: Text query to process
            context: Additional context

        Returns:
            Dictionary with comprehensive analysis results
        """
        result = {
            'query': query,
            'context': context,
            'structural_analysis': {},
            'semantic_analysis': {},
            'pattern_analysis': {},
            'transformation_analysis': {},
            'answer_extraction': {},
            'final_answer': '',
            'confidence': 0.0
        }
        if not query:
            return result

        try:
            # Perform comprehensive analysis
            result['structural_analysis'] = self.analyze_text_structure(query)
            result['semantic_analysis'] = self.analyze_semantic_content(query)
            result['pattern_analysis'] = self.extract_patterns(query)
            result['transformation_analysis'] = self.find_text_transformations(query)
            result['answer_extraction'] = self.extract_answer_from_question(query)

            # Determine final answer
            if result['answer_extraction']['answer']:
                result['final_answer'] = result['answer_extraction']['answer']
                result['confidence'] = result['answer_extraction']['confidence']
        except Exception as e:
            logger.error(f"Complex text query processing failed: {e}")
            result['error'] = str(e)
        return result


def get_linguistic_analysis_tools() -> List[LinguisticAnalyzer]:
    """Get list of linguistic analysis tools."""
    try:
        analyzer = LinguisticAnalyzer()
        if analyzer.available:
            return [analyzer]
        else:
            logger.warning("⚠️ Linguistic analyzer not available")
            return []
    except Exception as e:
        logger.error(f"❌ Failed to create linguistic analyzer: {e}")
        return []
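

# A small runnable smoke test (a sketch: the logging setup and sample prompt
# are assumptions for illustration, not part of the deployed agent):
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    tools = get_linguistic_analysis_tools()
    if tools:
        result = tools[0].process_complex_text_query(
            ".rewsna eht sa tfel drow eht fo etisoppo eht etirw"
        )
        print(result['final_answer'], result['confidence'])
        # Expected: 'right', with a confidence derived from the English-score ratio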