"""
Linguistic Analysis Tool for GAIA Agent - Phase 6
Advanced text pattern recognition, semantic understanding, and linguistic analysis
"""
import re
import logging
import string
from collections import Counter
from typing import Dict, Any, List, Optional, Tuple, Set

# Natural language processing
try:
    from textblob import TextBlob
    TEXTBLOB_AVAILABLE = True
except ImportError:
    TEXTBLOB_AVAILABLE = False

# Advanced regex patterns
try:
    import regex
    REGEX_AVAILABLE = True
except ImportError:
    import re as regex
    REGEX_AVAILABLE = False

logger = logging.getLogger(__name__)
class LinguisticAnalyzer:
    """
    Advanced linguistic analysis tool for text pattern recognition and understanding.

    Features:
    - Text pattern recognition and analysis
    - Language detection and classification
    - Semantic understanding and interpretation
    - Text transformation and manipulation
    - Grammar and syntax analysis
    - Context-aware text processing
    """

    def __init__(self):
        """Initialize the linguistic analyzer and its static pattern tables."""
        self.name = "linguistic_analyzer"
        self.description = "Advanced linguistic analysis for pattern recognition and semantic understanding"
        # Initialize text processing capabilities
        self.available = True

        # Common text patterns. Groups inside these regexes exist only for
        # alternation/optional parts; extract_patterns() always reports the
        # full match (group 0), never the group tuple.
        self.patterns = {
            # Fixed: TLD class was [A-Z|a-z], which also matched a literal '|'.
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            'url': r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            'phone': r'(\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})',
            'date': r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',
            'time': r'\b\d{1,2}:\d{2}(?::\d{2})?(?:\s?[AaPp][Mm])?\b',
            'number': r'-?\d+(?:\.\d+)?',
            'currency': r'\$\d+(?:\.\d{2})?|\d+(?:\.\d{2})?\s?(?:USD|EUR|GBP|JPY)',
            'percentage': r'\d+(?:\.\d+)?%',
            'hashtag': r'#\w+',
            'mention': r'@\w+',
            'word': r'\b\w+\b',
            'sentence': r'[.!?]+',
            'question': r'\?',
            'exclamation': r'!',
        }

        # Language-specific patterns ('reversed_english' catches text whose
        # characters were written back-to-front).
        self.language_patterns = {
            'english': {
                'articles': r'\b(the|a|an)\b',
                'pronouns': r'\b(i|you|he|she|it|we|they|me|him|her|us|them)\b',
                'prepositions': r'\b(in|on|at|by|for|with|to|from|of|about)\b',
                'conjunctions': r'\b(and|or|but|so|yet|for|nor)\b',
                'common_words': r'\b(is|are|was|were|have|has|had|do|does|did|will|would|could|should)\b'
            },
            'reversed_english': {
                'reversed_articles': r'\b(eht|a|na)\b',
                'reversed_common': r'\b(si|era|saw|erew|evah|sah|dah|od|seod|did|lliw|dluow|dluoc|dluohs)\b'
            }
        }

        # Semantic categories: lowercase vocabulary per concept group.
        self.semantic_categories = {
            'direction': ['left', 'right', 'up', 'down', 'north', 'south', 'east', 'west'],
            'color': ['red', 'blue', 'green', 'yellow', 'black', 'white', 'purple', 'orange'],
            'size': ['big', 'small', 'large', 'tiny', 'huge', 'massive', 'little', 'giant'],
            'emotion': ['happy', 'sad', 'angry', 'excited', 'calm', 'nervous', 'joyful', 'depressed'],
            'time': ['morning', 'afternoon', 'evening', 'night', 'today', 'tomorrow', 'yesterday'],
            'number': ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten']
        }

        # Opposite word pairs; most entries are listed in both directions so a
        # single dict lookup answers "opposite of X" for either member.
        self.opposites = {
            'left': 'right', 'right': 'left',
            'up': 'down', 'down': 'up',
            'big': 'small', 'small': 'big',
            'large': 'small', 'tiny': 'huge',
            'hot': 'cold', 'cold': 'hot',
            'fast': 'slow', 'slow': 'fast',
            'good': 'bad', 'bad': 'good',
            'yes': 'no', 'no': 'yes',
            'true': 'false', 'false': 'true',
            'on': 'off', 'off': 'on',
            'in': 'out', 'out': 'in',
            'open': 'closed', 'closed': 'open',
            'start': 'end', 'end': 'start',
            'first': 'last', 'last': 'first'
        }

        # Same logger name as the module-level logger; avoids depending on a
        # module global from inside the class.
        logging.getLogger(__name__).info("✅ Linguistic Analyzer initialized")

    def extract_patterns(self, text: str, pattern_types: Optional[List[str]] = None) -> Dict[str, List[str]]:
        """
        Extract various patterns from text.

        Args:
            text: Input text to analyze
            pattern_types: List of pattern types to extract (default: all)

        Returns:
            Dictionary mapping pattern type to the list of full-match strings.
            Unknown pattern types are silently skipped.
        """
        if not text:
            return {}
        if pattern_types is None:
            pattern_types = list(self.patterns.keys())
        results: Dict[str, List[str]] = {}
        for pattern_type in pattern_types:
            if pattern_type in self.patterns:
                pattern = self.patterns[pattern_type]
                # Use finditer + group(0): re.findall returns group tuples for
                # patterns with capturing groups (e.g. 'phone'), which produced
                # tuples instead of the matched text.
                results[pattern_type] = [
                    m.group(0) for m in re.finditer(pattern, text, re.IGNORECASE)
                ]
        return results

    def analyze_text_structure(self, text: str) -> Dict[str, Any]:
        """
        Analyze the structural properties of text.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary with counts (characters, words, sentences, paragraphs,
            lines, character classes) and average word/sentence lengths.
        """
        if not text:
            return {}

        words = text.split()
        # Sentences are delimited by runs of ., ! or ?; empty fragments dropped.
        sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]

        analysis: Dict[str, Any] = {
            'character_count': len(text),
            'word_count': len(words),
            'sentence_count': len(re.findall(r'[.!?]+', text)),
            'paragraph_count': len([p for p in text.split('\n\n') if p.strip()]),
            'line_count': len(text.split('\n')),
            'average_word_length': 0,
            'average_sentence_length': 0,
            # Character-class tallies over the raw text.
            'punctuation_count': sum(1 for ch in text if ch in string.punctuation),
            'uppercase_count': sum(1 for ch in text if ch not in string.punctuation and ch.isupper()),
            'lowercase_count': sum(1 for ch in text if ch not in string.punctuation and ch.islower()),
            'digit_count': sum(1 for ch in text if ch.isdigit()),
        }

        if words:
            analysis['average_word_length'] = sum(len(w) for w in words) / len(words)
        if sentences:
            analysis['average_sentence_length'] = sum(len(s.split()) for s in sentences) / len(sentences)
        return analysis

    def detect_language_features(self, text: str) -> Dict[str, Any]:
        """
        Detect language-specific features in text.

        Args:
            text: Input text to analyze

        Returns:
            Per-language dictionary of feature counts plus up to the first
            10 matches for each feature type.
        """
        if not text:
            return {}
        text_lower = text.lower()
        features: Dict[str, Any] = {}
        for language, patterns in self.language_patterns.items():
            lang_features = {}
            for feature_type, pattern in patterns.items():
                matches = re.findall(pattern, text_lower)
                lang_features[feature_type] = {
                    'count': len(matches),
                    'matches': matches[:10]  # Limit to first 10 matches
                }
            features[language] = lang_features
        return features

    def analyze_semantic_content(self, text: str) -> Dict[str, Any]:
        """
        Analyze semantic content and categorize words.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary with word counts, top-20 word frequencies, matched
            semantic categories, and opposite pairs present in the text.
        """
        if not text:
            return {}
        text_lower = text.lower()
        words = re.findall(r'\b\w+\b', text_lower)
        unique_words = set(words)  # O(1) membership for the lookups below

        semantic_analysis: Dict[str, Any] = {
            'total_words': len(words),
            'unique_words': len(unique_words),
            'word_frequency': dict(Counter(words).most_common(20)),
            'semantic_categories': {},
            'detected_opposites': []
        }

        # Categorize words by semantic meaning.
        for category, category_words in self.semantic_categories.items():
            found_words = [word for word in words if word in category_words]
            if found_words:
                semantic_analysis['semantic_categories'][category] = {
                    'count': len(found_words),
                    'words': list(set(found_words))
                }

        # Find opposite word pairs. Note: when both members occur, each
        # direction is reported (e.g. left/right AND right/left).
        for word in unique_words:
            if word in self.opposites:
                opposite = self.opposites[word]
                if opposite in unique_words:
                    semantic_analysis['detected_opposites'].append({
                        'word': word,
                        'opposite': opposite,
                        'both_present': True
                    })
        return semantic_analysis

    def find_text_transformations(self, text: str) -> Dict[str, Any]:
        """
        Identify possible text transformations (reversals, rotations, etc.).

        Args:
            text: Input text to analyze

        Returns:
            Dictionary with the candidate transformed strings and a list of
            transformations whose English-likeness beats the original's.
        """
        if not text:
            return {}

        reversed_text = text[::-1]
        word_reversed = ' '.join(reversed(text.split()))

        transformations: Dict[str, Any] = {
            'original': text,
            'reversed': reversed_text,
            'word_reversed': word_reversed,
            'case_swapped': text.swapcase(),
            'transformations_detected': []
        }

        # Compare English-likeness of each candidate against the original.
        original_score = self._calculate_english_score(text)
        reversed_score = self._calculate_english_score(reversed_text)
        if reversed_score > original_score * 1.5:  # Significant improvement
            transformations['transformations_detected'].append({
                'type': 'character_reversal',
                # +1 guards against division by zero when original scores 0.
                'confidence': reversed_score / (original_score + 1),
                'transformed_text': reversed_text
            })

        word_reversed_score = self._calculate_english_score(word_reversed)
        if word_reversed_score > original_score * 1.2:
            transformations['transformations_detected'].append({
                'type': 'word_order_reversal',
                'confidence': word_reversed_score / (original_score + 1),
                'transformed_text': word_reversed
            })
        return transformations

    def _calculate_english_score(self, text: str) -> float:
        """Heuristically score how English-like a text appears.

        Counts common English words as whole tokens (the previous substring
        test inflated scores: 'the' matched inside 'weather'), rewards
        article/verb phrase shapes, and penalizes unusual characters.
        """
        if not text:
            return 0.0
        text_lower = text.lower()
        # Tokenize once; whole-word membership instead of substring search.
        tokens = set(re.findall(r'\b\w+\b', text_lower))

        common_words = [
            'the', 'and', 'or', 'if', 'you', 'understand', 'this', 'sentence',
            'write', 'opposite', 'of', 'word', 'as', 'answer', 'is', 'are',
            'was', 'were', 'have', 'has', 'had', 'do', 'does', 'did'
        ]
        score = float(sum(1.0 for word in common_words if word in tokens))

        # Reward English-like phrase patterns.
        if re.search(r'\b(the|a|an)\s+\w+', text_lower):
            score += 2.0
        if re.search(r'\w+\s+(is|are|was|were)\s+\w+', text_lower):
            score += 2.0
        # Penalize characters outside normal English prose punctuation.
        if re.search(r'[^\w\s\.,!?;:\'"()-]', text):
            score -= 1.0
        return score

    def extract_answer_from_question(self, question: str) -> Dict[str, Any]:
        """
        Extract answer from a question using linguistic analysis.

        Args:
            question: Question text to analyze

        Returns:
            Dictionary with 'answer', 'confidence', the 'method' that produced
            the answer, and the intermediate 'analysis' artifacts.
        """
        result: Dict[str, Any] = {
            'question': question,
            'answer': '',
            'confidence': 0.0,
            'method': 'linguistic_analysis',
            'analysis': {}
        }
        if not question:
            return result

        # Analyze transformations first so they are always reported.
        transformations = self.find_text_transformations(question)
        result['analysis']['transformations'] = transformations

        # Direct "opposite of X" style questions.
        if 'opposite' in question.lower():
            opposite_analysis = self._analyze_opposite_question(question)
            result['analysis']['opposite_analysis'] = opposite_analysis
            if opposite_analysis['answer']:
                result['answer'] = opposite_analysis['answer']
                result['confidence'] = opposite_analysis['confidence']
                result['method'] = 'opposite_detection'

        # If a transformation looks convincing, recurse on the transformed
        # text and prefer that answer.
        if transformations['transformations_detected']:
            best_transformation = max(
                transformations['transformations_detected'],
                key=lambda x: x['confidence']
            )
            if best_transformation['confidence'] > 0.7:
                transformed_result = self.extract_answer_from_question(
                    best_transformation['transformed_text']
                )
                if transformed_result['answer']:
                    result['answer'] = transformed_result['answer']
                    result['confidence'] = best_transformation['confidence']
                    result['method'] = f"transformation_{best_transformation['type']}"
        return result

    def _analyze_opposite_question(self, question: str) -> Dict[str, Any]:
        """Analyze questions asking for opposite words.

        Returns the first word (in text order) that has a known opposite.
        """
        result: Dict[str, Any] = {
            'answer': '',
            'confidence': 0.0,
            'target_word': '',
            'opposite_found': False
        }
        question_lower = question.lower()
        for word in re.findall(r'\b\w+\b', question_lower):
            if word in self.opposites:
                result['target_word'] = word
                result['answer'] = self.opposites[word]
                result['opposite_found'] = True
                result['confidence'] = 0.9
                break
        return result

    def process_complex_text_query(self, query: str, context: str = '') -> Dict[str, Any]:
        """
        Process complex text queries with comprehensive analysis.

        Args:
            query: Text query to process
            context: Additional context

        Returns:
            Dictionary aggregating structural, semantic, pattern and
            transformation analyses, plus a 'final_answer' when one was
            extracted. On failure an 'error' key holds the message.
        """
        result: Dict[str, Any] = {
            'query': query,
            'context': context,
            'structural_analysis': {},
            'semantic_analysis': {},
            'pattern_analysis': {},
            'transformation_analysis': {},
            'answer_extraction': {},
            'final_answer': '',
            'confidence': 0.0
        }
        if not query:
            return result
        try:
            # Perform comprehensive analysis.
            result['structural_analysis'] = self.analyze_text_structure(query)
            result['semantic_analysis'] = self.analyze_semantic_content(query)
            result['pattern_analysis'] = self.extract_patterns(query)
            result['transformation_analysis'] = self.find_text_transformations(query)
            result['answer_extraction'] = self.extract_answer_from_question(query)

            # Promote an extracted answer, if any, to the top level.
            if result['answer_extraction']['answer']:
                result['final_answer'] = result['answer_extraction']['answer']
                result['confidence'] = result['answer_extraction']['confidence']
        except Exception as e:
            logging.getLogger(__name__).error(f"Complex text query processing failed: {e}")
            result['error'] = str(e)
        return result
def get_linguistic_analysis_tools() -> List[LinguisticAnalyzer]:
    """Build and return the available linguistic analysis tools.

    Returns:
        A single-element list holding an initialized ``LinguisticAnalyzer``,
        or an empty list when construction fails or the analyzer reports
        itself unavailable.
    """
    try:
        analyzer = LinguisticAnalyzer()
        if not analyzer.available:
            logger.warning("⚠️ Linguistic analyzer not available")
            return []
        return [analyzer]
    except Exception as e:
        logger.error(f"❌ Failed to create linguistic analyzer: {e}")
        return []