"""
Linguistic Analysis Tool for GAIA Agent - Phase 6
Advanced text pattern recognition, semantic understanding, and linguistic analysis
"""
import re
import logging
import string
from collections import Counter
from typing import Dict, Any, List, Optional, Tuple, Set

# Natural language processing
try:
    from textblob import TextBlob
    TEXTBLOB_AVAILABLE = True
except ImportError:
    TEXTBLOB_AVAILABLE = False

# Advanced regex patterns
try:
    import regex
    REGEX_AVAILABLE = True
except ImportError:
    import re as regex
    REGEX_AVAILABLE = False

logger = logging.getLogger(__name__)
class LinguisticAnalyzer:
    """
    Advanced linguistic analysis tool for text pattern recognition and understanding.

    Features:
    - Text pattern recognition and analysis
    - Language detection and classification
    - Semantic understanding and interpretation
    - Text transformation and manipulation
    - Grammar and syntax analysis
    - Context-aware text processing
    """

    def __init__(self):
        """Initialize the linguistic analyzer and its static pattern tables."""
        self.name = "linguistic_analyzer"
        self.description = "Advanced linguistic analysis for pattern recognition and semantic understanding"
        # Initialize text processing capabilities
        self.available = True

        # Common text patterns. Groups inside these regexes exist only for
        # alternation/optional parts; extract_patterns() always reports the
        # full match (group 0), never the group tuple.
        self.patterns = {
            # Fixed: TLD class was [A-Z|a-z], which also matched a literal '|'.
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            'url': r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            'phone': r'(\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})',
            'date': r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',
            'time': r'\b\d{1,2}:\d{2}(?::\d{2})?(?:\s?[AaPp][Mm])?\b',
            'number': r'-?\d+(?:\.\d+)?',
            'currency': r'\$\d+(?:\.\d{2})?|\d+(?:\.\d{2})?\s?(?:USD|EUR|GBP|JPY)',
            'percentage': r'\d+(?:\.\d+)?%',
            'hashtag': r'#\w+',
            'mention': r'@\w+',
            'word': r'\b\w+\b',
            'sentence': r'[.!?]+',
            'question': r'\?',
            'exclamation': r'!',
        }

        # Language-specific patterns ('reversed_english' catches text whose
        # characters were written back-to-front).
        self.language_patterns = {
            'english': {
                'articles': r'\b(the|a|an)\b',
                'pronouns': r'\b(i|you|he|she|it|we|they|me|him|her|us|them)\b',
                'prepositions': r'\b(in|on|at|by|for|with|to|from|of|about)\b',
                'conjunctions': r'\b(and|or|but|so|yet|for|nor)\b',
                'common_words': r'\b(is|are|was|were|have|has|had|do|does|did|will|would|could|should)\b'
            },
            'reversed_english': {
                'reversed_articles': r'\b(eht|a|na)\b',
                'reversed_common': r'\b(si|era|saw|erew|evah|sah|dah|od|seod|did|lliw|dluow|dluoc|dluohs)\b'
            }
        }

        # Semantic categories: lowercase vocabulary per concept group.
        self.semantic_categories = {
            'direction': ['left', 'right', 'up', 'down', 'north', 'south', 'east', 'west'],
            'color': ['red', 'blue', 'green', 'yellow', 'black', 'white', 'purple', 'orange'],
            'size': ['big', 'small', 'large', 'tiny', 'huge', 'massive', 'little', 'giant'],
            'emotion': ['happy', 'sad', 'angry', 'excited', 'calm', 'nervous', 'joyful', 'depressed'],
            'time': ['morning', 'afternoon', 'evening', 'night', 'today', 'tomorrow', 'yesterday'],
            'number': ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten']
        }

        # Opposite word pairs; most entries are listed in both directions so a
        # single dict lookup answers "opposite of X" for either member.
        self.opposites = {
            'left': 'right', 'right': 'left',
            'up': 'down', 'down': 'up',
            'big': 'small', 'small': 'big',
            'large': 'small', 'tiny': 'huge',
            'hot': 'cold', 'cold': 'hot',
            'fast': 'slow', 'slow': 'fast',
            'good': 'bad', 'bad': 'good',
            'yes': 'no', 'no': 'yes',
            'true': 'false', 'false': 'true',
            'on': 'off', 'off': 'on',
            'in': 'out', 'out': 'in',
            'open': 'closed', 'closed': 'open',
            'start': 'end', 'end': 'start',
            'first': 'last', 'last': 'first'
        }

        # Same logger name as the module-level logger; avoids depending on a
        # module global from inside the class.
        logging.getLogger(__name__).info("✅ Linguistic Analyzer initialized")

    def extract_patterns(self, text: str, pattern_types: Optional[List[str]] = None) -> Dict[str, List[str]]:
        """
        Extract various patterns from text.

        Args:
            text: Input text to analyze
            pattern_types: List of pattern types to extract (default: all)

        Returns:
            Dictionary mapping pattern type to the list of full-match strings.
            Unknown pattern types are silently skipped.
        """
        if not text:
            return {}
        if pattern_types is None:
            pattern_types = list(self.patterns.keys())
        results: Dict[str, List[str]] = {}
        for pattern_type in pattern_types:
            if pattern_type in self.patterns:
                pattern = self.patterns[pattern_type]
                # Use finditer + group(0): re.findall returns group tuples for
                # patterns with capturing groups (e.g. 'phone'), which produced
                # tuples instead of the matched text.
                results[pattern_type] = [
                    m.group(0) for m in re.finditer(pattern, text, re.IGNORECASE)
                ]
        return results

    def analyze_text_structure(self, text: str) -> Dict[str, Any]:
        """
        Analyze the structural properties of text.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary with counts (characters, words, sentences, paragraphs,
            lines, character classes) and average word/sentence lengths.
        """
        if not text:
            return {}

        words = text.split()
        # Sentences are delimited by runs of ., ! or ?; empty fragments dropped.
        sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]

        analysis: Dict[str, Any] = {
            'character_count': len(text),
            'word_count': len(words),
            'sentence_count': len(re.findall(r'[.!?]+', text)),
            'paragraph_count': len([p for p in text.split('\n\n') if p.strip()]),
            'line_count': len(text.split('\n')),
            'average_word_length': 0,
            'average_sentence_length': 0,
            # Character-class tallies over the raw text.
            'punctuation_count': sum(1 for ch in text if ch in string.punctuation),
            'uppercase_count': sum(1 for ch in text if ch not in string.punctuation and ch.isupper()),
            'lowercase_count': sum(1 for ch in text if ch not in string.punctuation and ch.islower()),
            'digit_count': sum(1 for ch in text if ch.isdigit()),
        }

        if words:
            analysis['average_word_length'] = sum(len(w) for w in words) / len(words)
        if sentences:
            analysis['average_sentence_length'] = sum(len(s.split()) for s in sentences) / len(sentences)
        return analysis

    def detect_language_features(self, text: str) -> Dict[str, Any]:
        """
        Detect language-specific features in text.

        Args:
            text: Input text to analyze

        Returns:
            Per-language dictionary of feature counts plus up to the first
            10 matches for each feature type.
        """
        if not text:
            return {}
        text_lower = text.lower()
        features: Dict[str, Any] = {}
        for language, patterns in self.language_patterns.items():
            lang_features = {}
            for feature_type, pattern in patterns.items():
                matches = re.findall(pattern, text_lower)
                lang_features[feature_type] = {
                    'count': len(matches),
                    'matches': matches[:10]  # Limit to first 10 matches
                }
            features[language] = lang_features
        return features

    def analyze_semantic_content(self, text: str) -> Dict[str, Any]:
        """
        Analyze semantic content and categorize words.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary with word counts, top-20 word frequencies, matched
            semantic categories, and opposite pairs present in the text.
        """
        if not text:
            return {}
        text_lower = text.lower()
        words = re.findall(r'\b\w+\b', text_lower)
        unique_words = set(words)  # O(1) membership for the lookups below

        semantic_analysis: Dict[str, Any] = {
            'total_words': len(words),
            'unique_words': len(unique_words),
            'word_frequency': dict(Counter(words).most_common(20)),
            'semantic_categories': {},
            'detected_opposites': []
        }

        # Categorize words by semantic meaning.
        for category, category_words in self.semantic_categories.items():
            found_words = [word for word in words if word in category_words]
            if found_words:
                semantic_analysis['semantic_categories'][category] = {
                    'count': len(found_words),
                    'words': list(set(found_words))
                }

        # Find opposite word pairs. Note: when both members occur, each
        # direction is reported (e.g. left/right AND right/left).
        for word in unique_words:
            if word in self.opposites:
                opposite = self.opposites[word]
                if opposite in unique_words:
                    semantic_analysis['detected_opposites'].append({
                        'word': word,
                        'opposite': opposite,
                        'both_present': True
                    })
        return semantic_analysis

    def find_text_transformations(self, text: str) -> Dict[str, Any]:
        """
        Identify possible text transformations (reversals, rotations, etc.).

        Args:
            text: Input text to analyze

        Returns:
            Dictionary with the candidate transformed strings and a list of
            transformations whose English-likeness beats the original's.
        """
        if not text:
            return {}

        reversed_text = text[::-1]
        word_reversed = ' '.join(reversed(text.split()))

        transformations: Dict[str, Any] = {
            'original': text,
            'reversed': reversed_text,
            'word_reversed': word_reversed,
            'case_swapped': text.swapcase(),
            'transformations_detected': []
        }

        # Compare English-likeness of each candidate against the original.
        original_score = self._calculate_english_score(text)
        reversed_score = self._calculate_english_score(reversed_text)
        if reversed_score > original_score * 1.5:  # Significant improvement
            transformations['transformations_detected'].append({
                'type': 'character_reversal',
                # +1 guards against division by zero when original scores 0.
                'confidence': reversed_score / (original_score + 1),
                'transformed_text': reversed_text
            })

        word_reversed_score = self._calculate_english_score(word_reversed)
        if word_reversed_score > original_score * 1.2:
            transformations['transformations_detected'].append({
                'type': 'word_order_reversal',
                'confidence': word_reversed_score / (original_score + 1),
                'transformed_text': word_reversed
            })
        return transformations

    def _calculate_english_score(self, text: str) -> float:
        """Heuristically score how English-like a text appears.

        Counts common English words as whole tokens (the previous substring
        test inflated scores: 'the' matched inside 'weather'), rewards
        article/verb phrase shapes, and penalizes unusual characters.
        """
        if not text:
            return 0.0
        text_lower = text.lower()
        # Tokenize once; whole-word membership instead of substring search.
        tokens = set(re.findall(r'\b\w+\b', text_lower))

        common_words = [
            'the', 'and', 'or', 'if', 'you', 'understand', 'this', 'sentence',
            'write', 'opposite', 'of', 'word', 'as', 'answer', 'is', 'are',
            'was', 'were', 'have', 'has', 'had', 'do', 'does', 'did'
        ]
        score = float(sum(1.0 for word in common_words if word in tokens))

        # Reward English-like phrase patterns.
        if re.search(r'\b(the|a|an)\s+\w+', text_lower):
            score += 2.0
        if re.search(r'\w+\s+(is|are|was|were)\s+\w+', text_lower):
            score += 2.0
        # Penalize characters outside normal English prose punctuation.
        if re.search(r'[^\w\s\.,!?;:\'"()-]', text):
            score -= 1.0
        return score

    def extract_answer_from_question(self, question: str) -> Dict[str, Any]:
        """
        Extract answer from a question using linguistic analysis.

        Args:
            question: Question text to analyze

        Returns:
            Dictionary with 'answer', 'confidence', the 'method' that produced
            the answer, and the intermediate 'analysis' artifacts.
        """
        result: Dict[str, Any] = {
            'question': question,
            'answer': '',
            'confidence': 0.0,
            'method': 'linguistic_analysis',
            'analysis': {}
        }
        if not question:
            return result

        # Analyze transformations first so they are always reported.
        transformations = self.find_text_transformations(question)
        result['analysis']['transformations'] = transformations

        # Direct "opposite of X" style questions.
        if 'opposite' in question.lower():
            opposite_analysis = self._analyze_opposite_question(question)
            result['analysis']['opposite_analysis'] = opposite_analysis
            if opposite_analysis['answer']:
                result['answer'] = opposite_analysis['answer']
                result['confidence'] = opposite_analysis['confidence']
                result['method'] = 'opposite_detection'

        # If a transformation looks convincing, recurse on the transformed
        # text and prefer that answer.
        if transformations['transformations_detected']:
            best_transformation = max(
                transformations['transformations_detected'],
                key=lambda x: x['confidence']
            )
            if best_transformation['confidence'] > 0.7:
                transformed_result = self.extract_answer_from_question(
                    best_transformation['transformed_text']
                )
                if transformed_result['answer']:
                    result['answer'] = transformed_result['answer']
                    result['confidence'] = best_transformation['confidence']
                    result['method'] = f"transformation_{best_transformation['type']}"
        return result

    def _analyze_opposite_question(self, question: str) -> Dict[str, Any]:
        """Analyze questions asking for opposite words.

        Returns the first word (in text order) that has a known opposite.
        """
        result: Dict[str, Any] = {
            'answer': '',
            'confidence': 0.0,
            'target_word': '',
            'opposite_found': False
        }
        question_lower = question.lower()
        for word in re.findall(r'\b\w+\b', question_lower):
            if word in self.opposites:
                result['target_word'] = word
                result['answer'] = self.opposites[word]
                result['opposite_found'] = True
                result['confidence'] = 0.9
                break
        return result

    def process_complex_text_query(self, query: str, context: str = '') -> Dict[str, Any]:
        """
        Process complex text queries with comprehensive analysis.

        Args:
            query: Text query to process
            context: Additional context

        Returns:
            Dictionary aggregating structural, semantic, pattern and
            transformation analyses, plus a 'final_answer' when one was
            extracted. On failure an 'error' key holds the message.
        """
        result: Dict[str, Any] = {
            'query': query,
            'context': context,
            'structural_analysis': {},
            'semantic_analysis': {},
            'pattern_analysis': {},
            'transformation_analysis': {},
            'answer_extraction': {},
            'final_answer': '',
            'confidence': 0.0
        }
        if not query:
            return result
        try:
            # Perform comprehensive analysis.
            result['structural_analysis'] = self.analyze_text_structure(query)
            result['semantic_analysis'] = self.analyze_semantic_content(query)
            result['pattern_analysis'] = self.extract_patterns(query)
            result['transformation_analysis'] = self.find_text_transformations(query)
            result['answer_extraction'] = self.extract_answer_from_question(query)

            # Promote an extracted answer, if any, to the top level.
            if result['answer_extraction']['answer']:
                result['final_answer'] = result['answer_extraction']['answer']
                result['confidence'] = result['answer_extraction']['confidence']
        except Exception as e:
            logging.getLogger(__name__).error(f"Complex text query processing failed: {e}")
            result['error'] = str(e)
        return result
def get_linguistic_analysis_tools() -> List[LinguisticAnalyzer]:
    """Build and return the available linguistic analysis tools.

    Returns:
        A single-element list holding an initialized ``LinguisticAnalyzer``,
        or an empty list when construction fails or the analyzer reports
        itself unavailable.
    """
    try:
        analyzer = LinguisticAnalyzer()
        if not analyzer.available:
            logger.warning("⚠️ Linguistic analyzer not available")
            return []
        return [analyzer]
    except Exception as e:
        logger.error(f"❌ Failed to create linguistic analyzer: {e}")
        return []